def test_compute_loc_using_different_time_bins(self):
    """LOC computed for the same bin boundary must not depend on bin width.

    Runs compute_loc with two different approximate step sizes and asserts
    that whenever both runs produce a value for the same bin end date, the
    two LOC values agree.
    """
    service_mgr = ServiceMgr(self.db_path)
    service_names = service_mgr.list_all_service_names()
    until = datetime(year=2019, month=12, day=31)
    step_size = 182
    for service_name in service_names:
        service = service_mgr.get_service(service_name=service_name)
        loc_by_bin_end = dict()
        for multiplier in [1, 2]:
            approx_step = step_size * multiplier
            time_bins = self.data_mgr.get_time_bins(
                service_names=service_names,
                step_size_aprox=approx_step,
                until=until)
            print(time_bins)
            locs = self.data_mgr.compute_loc(service=service,
                                             time_bins=time_bins)
            # time_bins[0] is the start boundary; LOC values align with
            # the bin *end* timestamps time_bins[1:].
            for bin_end, loc in zip(time_bins[1:], locs):
                loc_by_bin_end.setdefault(bin_end.isoformat(), []).append(loc)
        for bin_key, values in loc_by_bin_end.items():
            # Only bin ends produced by both granularities are comparable.
            if len(values) == 2:
                self.assertEqual(values[0], values[1])
def test_extract_new_data_from_ghe(self):
    """End-to-end check: services described in example.json land in the DB.

    Loads the service description file, runs the extractor, then verifies
    each service (with its start date) and each repository (with start/end
    dates) is retrievable through ServiceMgr.
    """
    f = os.path.join(os.getcwd(), 'microservices_miner', 'example.json')
    api_token = os.getenv('MINING_GHE_PERSONAL_ACCESS_TOKEN')
    # JSON is UTF-8 by spec; don't rely on the platform default encoding.
    with open(f, encoding='utf-8') as file:
        input_data = json.load(file)
    self.ghe_extractor.extract_new_data_from_ghe(service_list=input_data,
                                                 db_path=self.db_path,
                                                 api_token=api_token)
    service_mgr = ServiceMgr(db_path=self.db_path)
    # Loop-invariant after extraction finished: fetch the name list once
    # instead of re-querying it for every input service.
    service_names = service_mgr.list_all_service_names()
    for s in input_data:
        name = s.get('name')
        self.assertIn(name, service_names)
        service = service_mgr.get_service(service_name=name)
        start_dt = datetime.strptime(s.get('start_date'), '%Y-%m-%d').date()
        self.assertEqual(service.start_date, start_dt)
        input_repos = s.get('repositories')
        for input_repo in input_repos:
            found = False
            for repo_data in service.list_repository_data():
                repo = repo_data.get('repository')
                if repo.name == input_repo.get('name'):
                    self.assertEqual(input_repo.get('start_date'),
                                     repo_data.get('start_date'))
                    self.assertEqual(input_repo.get('end_date'),
                                     repo_data.get('end_date'))
                    found = True
            self.assertTrue(found)
def test_compute_time_to_repair(self):
    """compute_repair_time returns a non-empty 3-column DataFrame."""
    service_mgr = ServiceMgr(self.db_path)
    names = service_mgr.list_all_service_names()
    df = self.data_mgr.compute_repair_time(service_names=names)
    self.assertIsInstance(df, pd.DataFrame)
    self.assertEqual(df.shape[1], 3)
    for column in ('closed_at', 'time_to_repair', 'repository'):
        self.assertIn(column, df.columns)
    self.assertGreaterEqual(df.shape[0], 1)
def compute_repair_time(self, service_names):
    """
    Compute the time to repair a bug, i.e., the time between the bug
    report and the bug fix. This time is computed in days.

    The frame is also written to $BASE_DIR/data/metrics/repair_time.csv.

    Parameters
    ----------
    service_names: list of str

    Returns
    -------
    pd.DataFrame
        Columns: closed_at, time_to_repair (in days), repository.
    """
    # Collect the distinct repositories behind the requested services,
    # preserving first-seen order.
    repository_list = list()
    service_mgr = ServiceMgr(db_path=self.db_path)
    issue_mgr = IssueMgr(path_to_db=self.db_path)
    for service_name in service_names:
        print('Getting data from {}'.format(service_name))
        service = service_mgr.get_service(service_name=service_name)
        for repo_data in service.list_repository_data():
            repo = repo_data.get('repository')
            if repo not in repository_list:
                repository_list.append(repo)
    time_to_repair = list()
    closed_at_list = list()
    repositories = list()
    for repo in repository_list:
        issues_aux = issue_mgr.get_issues_by_label(
            repository_id=repo.repository_id)
        for issue in issues_aux:
            if issue.state == 'closed':
                td = issue.closed_at - issue.created_at
                # total_seconds() keeps the sub-second component that the
                # old days*86400 + seconds arithmetic silently dropped.
                ttr_days = td.total_seconds() / (24 * 60 * 60)
                if ttr_days >= 500:
                    # Suspiciously long repair time: print for inspection.
                    print(issue)
                time_to_repair.append(ttr_days)
                closed_at_list.append(issue.closed_at)
                repositories.append(repo.name)
    df = pd.DataFrame(
        data={
            'closed_at': closed_at_list,
            'time_to_repair': time_to_repair,
            'repository': repositories
        })
    filename = os.path.join(os.getenv('BASE_DIR'), 'data', 'metrics',
                            'repair_time.csv')
    df.to_csv(filename)
    return df
class TestServiceMgr(unittest.TestCase):
    """Checks that ServiceMgr rebuilds services consistently with the DB."""

    def setUp(self) -> None:
        db_path = os.getenv('DB_PATH')
        self.service_mgr = ServiceMgr(db_path=db_path)
        self.conn = sqlite3.connect(db_path)

    def test_get_service_by_name(self):
        cursor = self.conn.cursor()
        service_names = self.service_mgr.list_all_service_names()
        filesystem_mgr = FileSystemMgr(db_path=os.getenv('DB_PATH'))
        for service_name in service_names:
            service = self.service_mgr.get_service(service_name=service_name)
            # Repository count straight from the connection table.
            cursor.execute(
                'select count(*) from {} where service_id = {};'.format(
                    ServiceRepositoryConn.TABLE_NAME, service.service_id))
            total_number_of_repositories = cursor.fetchone()[0]
            self.assertIsInstance(service, Service)
            repo_data_list = service.list_repository_data()
            self.assertTrue(
                len(repo_data_list) == total_number_of_repositories > 0,
                msg='Error! Invalid number of services')
            for repo_data in repo_data_list:
                repo = repo_data.get('repository')
                cursor.execute(
                    'select count(*) from {} where repository_id = {};'.format(
                        RepositoryCommitConn.TABLE_NAME, repo.repository_id))
                self.assertIsInstance(repo, Repository)
                for commit in repo.commits:
                    cursor.execute(
                        'select filename from {} where commit_id = {};'.format(
                            FileModificationConn.TABLE_NAME,
                            commit.commit_id))
                    # Count DB filenames that pass the service's filters;
                    # this must match the commit's file_modifications.
                    valid_count = 0
                    for row in cursor.fetchall():
                        if filesystem_mgr.check_filename(
                                filename=row[0],
                                service_id=service.service_id,
                                repository_id=repo.repository_id):
                            valid_count += 1
                    self.assertEqual(
                        valid_count, len(commit.file_modifications),
                        msg='Different number of file modifications')
def test_compute_bug_per_loc_ratio(self):
    """The defect-density frame has the expected shape and columns."""
    until = datetime(year=2019, month=12, day=31)
    service_mgr = ServiceMgr(db_path=self.db_path)
    service_names = service_mgr.list_all_service_names()
    time_bins = self.data_mgr.get_time_bins(service_names=service_names,
                                            step_size_aprox=365,
                                            until=until)
    df = self.data_mgr.compute_bug_per_loc_ratio(service_names=service_names,
                                                 time_bins=time_bins)
    self.assertGreater(df.shape[0], 0)
    self.assertEqual(df.shape[1], 4)
    expected_columns = ['year', 'defect_density', 'bugs', 'loc']
    self.assertTrue(all(c in df.columns for c in expected_columns))
def test_check_filename(self):
    """check_filename must honor the inclusion/exclusion filename patterns
    stored in the filename_pattern table for shared repositories."""
    service_mgr = ServiceMgr(db_path=self.db_path)
    conn = sqlite3.connect(self.db_path)
    cursor = conn.cursor()
    for name in service_mgr.list_all_service_names():
        service = service_mgr.get_service(service_name=name)
        for repo_data in service.list_repository_data():
            repo = repo_data.get('repository')  # type: Repository
            sql = 'select * from {} where repository_id = {};'.format(
                ServiceRepositoryConn.TABLE_NAME, repo.repository_id)
            cursor.execute(sql)
            rows = cursor.fetchall()
            # Patterns only matter for repositories shared by >1 service.
            if len(rows) > 1:
                sql = ('select pattern, type, repository_id from '
                       'filename_pattern where service_id = {};').format(
                           service.service_id)
                cursor.execute(sql)
                patterns = cursor.fetchall()
                # 'pattern_type' instead of 'type': don't shadow the builtin.
                for pattern, pattern_type, repo_id in patterns:
                    if repo.repository_id == repo_id:
                        for c in repo.commits:
                            for fm in c.file_modifications:
                                is_valid = self.fs.check_filename(
                                    filename=fm.filename,
                                    service_id=service.service_id,
                                    repository_id=repo.repository_id)
                                self.assertIsInstance(is_valid, bool)
                                msg = 'service = {} filename={} '.format(
                                    service.name, fm.filename)
                                endswith_py = fm.filename.endswith('.py')
                                if (pattern in fm.filename.lower()
                                        and pattern_type
                                        == FileSystemMgr.INCLUSION):
                                    self.assertTrue(is_valid, msg=msg)
                                    self.assertTrue(endswith_py, msg=msg)
                                elif (pattern in fm.filename
                                      and pattern_type
                                      == FileSystemMgr.EXCLUSION):
                                    self.assertFalse(is_valid, msg=msg)
                        # First matching pattern decides; stop scanning.
                        break
def compute_bug_per_loc_ratio(self, service_names, time_bins):
    """
    Compute the yearly defect density (bugs per KLOC) of the services.

    Parameters
    ----------
    service_names: List[str]
    time_bins: Tuple[datetime]

    Returns
    -------
    pd.DataFrame
        One row per year with 'year', 'bugs', 'loc' and 'defect_density'.
    """
    print(
        'Getting bugs and changes from services: {}'.format(service_names))
    # connecting to the database
    issue_mgr = IssueMgr(path_to_db=self.db_path)
    service_mgr = ServiceMgr(db_path=self.db_path)
    # per-bin accumulators, kept in lockstep
    bugs = list()
    loc_list = list()
    date_list = list()
    service_list = list()
    # for each service
    for service_name in service_names:
        print('Analysing the defect density of {} service'.format(
            service_name))
        service = service_mgr.get_service(service_name=service_name)
        loc_list.extend(self.compute_loc(service, time_bins))
        # walk consecutive bin boundaries [prev_t, cur_t)
        for prev_t, cur_t in zip(time_bins, time_bins[1:]):
            bugs_counter = 0
            for repo_data in service.list_repository_data():
                repo = repo_data.get('repository')  # type: Repository
                issues = issue_mgr.get_issues_by_label(
                    repository_id=repo.repository_id)
                # bug-labelled issues closed inside the bin
                for issue in issues:
                    if (issue.closed_at is not None
                            and prev_t <= issue.closed_at < cur_t):
                        bugs_counter += 1
                # bug-fix commits not already counted via issue keywords
                for c in repo.commits:
                    if prev_t <= c.date < cur_t:
                        if (DataMgr._is_bug_fix(comment=c.comment)
                                and not DataMgr.closed_using_keywords(
                                    line=c.comment)):
                            print('{}: {}'.format(service.name, c.comment))
                            bugs_counter += 1
            bugs.append(bugs_counter)
            service_list.append(service_name)
            date_list.append(cur_t)
    assert len(loc_list) == len(bugs) == len(date_list) == len(service_list)
    temp_df = pd.DataFrame(
        data={
            'service': service_list,
            'date': date_list,
            'bugs': bugs,
            'loc': loc_list
        })
    temp_filename = os.path.join(os.getenv('BASE_DIR'), 'data', 'metrics',
                                 'defect_density.csv')
    temp_df.to_csv(temp_filename, sep=',')
    # drop empty bins, then aggregate per calendar year
    temp_df = temp_df[temp_df['loc'] != 0]
    temp_df.loc[:, 'year'] = temp_df['date'].apply(lambda t: t.year)
    df_group = temp_df.groupby(by=['year'], as_index=False).sum()
    df_group['defect_density'] = df_group['bugs'] / (df_group['loc'] / 1000)
    df_group.to_csv(
        os.path.join(os.getenv('BASE_DIR'), 'data', 'metrics',
                     PlotMgr.DEFECT_DENSITY_CSV_FILE))
    return df_group
def test_compute_loc_per_commit(self):
    """Compare computed LOC per commit against a cloc-based oracle CSV.

    For every commit SHA present in both sources, the relative LOC
    difference must stay under max_relative_error, or else the absolute
    difference must stay under max_absolute_error.
    """
    service_mgr = ServiceMgr(self.db_path)
    repository_mgr = RepositoryMgr(self.db_path)
    max_absolute_error = 1000
    max_relative_error = 0.1
    service_names = service_mgr.list_all_service_names()
    for service_name in sorted(service_names):
        print('**********\n{}'.format(service_name))
        service = service_mgr.get_service(
            service_name=service_name)  # type: Service
        for repo_data in service.list_repository_data():
            temp_repo = repo_data.get('repository')  # type: Repository
            # NOTE(review): repo_sha_set is never populated, so the
            # set-difference diagnostics below always see it empty —
            # confirm whether SHAs were meant to be added in the loop.
            repo_sha_set = set()
            oracle_sha_set = set()
            repository = repository_mgr.get_repository(
                service_id=service.service_id,
                repository_id=temp_repo.repository_id)
            test_oracle = os.path.join(
                os.getenv('BASE_DIR'), 'data', 'oracle',
                'service_id_{}_{}__oracle_cloc.csv'.format(
                    service.service_id, temp_repo.name))
            self.assertTrue(
                os.path.isfile(test_oracle),
                msg='Error! file does not exist: {}'.format(test_oracle))
            loc_list, sha_list, date_list = \
                self.data_mgr.compute_loc_per_repository(
                    repository=repository)
            df = pd.read_csv(test_oracle, index_col=0, sep=';')
            self.assertGreater(df.shape[0], 0)
            for i in df.index:
                oracle_sha_set.add(i)
            self.assertGreater(len(loc_list), 0)
            for loc, sha, dt in zip(loc_list, sha_list, date_list):
                if sha not in df.index:
                    continue
                s = df.loc[sha, ['CLOC']]
                # renamed from 'l': single-letter, easily confused with 1
                cloc_values = s.tolist()
                assert len(cloc_values) == 1, \
                    "Error! {}".format(cloc_values)
                cloc = cloc_values.pop()
                if loc == 0:
                    if cloc != 0:
                        # relative error undefined when loc is 0: skip
                        continue
                    ratio = 1.0
                else:
                    ratio = cloc / loc
                rel_diff = np.abs(ratio - 1.0)
                print('SHA={} LOC={} CLOC={}'.format(sha, loc, cloc))
                self.assertIsInstance(
                    rel_diff, float,
                    msg='Error! delta is {} type={}'.format(
                        rel_diff, type(rel_diff)))
                abs_diff = np.abs(loc - cloc)
                print('{} relative diff = {} absolute diff = {}'.format(
                    dt, rel_diff, abs_diff))
                if rel_diff > max_relative_error:
                    self.assertGreater(
                        max_absolute_error, abs_diff,
                        msg='Error! sha={}'.format(sha))
            print('service={} len oracle = {} len GHE = {}'.format(
                service_name, len(oracle_sha_set), len(repo_sha_set)))
            print('service={} oracle - GHE = {}'.format(
                service_name, oracle_sha_set - repo_sha_set))
            print('service={} GHE - oracle = {}'.format(
                service_name, repo_sha_set - oracle_sha_set))
            print('service={} intersection {}'.format(
                service_name, repo_sha_set.intersection(oracle_sha_set)))
def setUp(self) -> None:
    """Create a ServiceMgr and a raw sqlite connection on the test DB."""
    path = os.getenv('DB_PATH')
    self.service_mgr = ServiceMgr(db_path=path)
    self.conn = sqlite3.connect(path)
def extract_new_data_from_ghe(service_list: List[Dict], db_path: str,
                              api_token: str) -> None:
    """
    Extract commit data for every service in ``service_list`` from
    GitHub Enterprise and persist it in the local database.

    Parameters
    ----------
    service_list: List[Dict]
        Service descriptions: name, start/end dates, programming
        languages and repositories (name, owner, url, dates).
    db_path: str
        Path to the sqlite database file.
    api_token: str
        GHE personal access token.

    Returns
    -------
    None
    """
    extractor = GHEExtractor(db_path=db_path)
    commit_mgr = CommitMgr(path_to_db=db_path)
    repo_mgr = RepositoryMgr(path_to_db=db_path)
    service_mgr = ServiceMgr(db_path=db_path)
    filesystem_mgr = FileSystemMgr(db_path)
    for s in service_list:
        service_name = s.get('name')
        end_date_str = s.get('end_date')
        start_date_str = s.get('start_date')
        programming_languages = s.get('programming_languages')
        logging.info('Extracting data from: {}'.format(service_name))
        service = service_mgr.get_service(service_name=service_name)
        if service is None:
            sid = service_mgr.insert_service(name=service_name,
                                             start_date_str=start_date_str)
            # Validate the new id *before* it is used anywhere (the old
            # code asserted only after querying with it).
            assert isinstance(sid, int)
            service = service_mgr.get_service(service_id=sid)
            filesystem_mgr.insert_extensions(
                service_id=sid,
                programming_languages=programming_languages)
        else:
            sid = service.service_id
        # get list of extensions of this service
        extensions = filesystem_mgr.get_extensions(service_id=sid)
        for rep in s.get('repositories'):
            url = rep.get('url')
            base_url = GHEExtractor._get_base_url(repo_url=url)
            print('Getting data from: {}'.format(url))
            repo_name = rep.get('name')
            owner = rep.get('owner')
            repo = repo_mgr.get_repository(url=url, service_id=sid)
            if repo is None:
                repo_url = '{}/{}/{}'.format(base_url, owner, repo_name)
                repo = repo_mgr.create_repository(name=repo_name,
                                                 url=repo_url)
                repo.repository_id = repo_mgr.insert_repository(
                    repository=repo)
            repo.owner = owner
            start_date_repo = rep.get('start_date')
            end_date_repo = rep.get('end_date')
            initial_loc = rep.get('initial_loc')
            service_mgr.insert_service_repository(
                service_name=service_name,
                repository_id=repo.repository_id,
                start_date=start_date_repo,
                end_date=end_date_repo,
                initial_loc=initial_loc)
            # resume extraction from the most recent stored commit
            last_commit = commit_mgr.get_commit_by_position(
                repository_id=repo.repository_id, pos=-1)
            since = last_commit.date if last_commit is not None else None
            assert repo is not None, "Error! Invalid repo"
            assert isinstance(
                repo, Repository), "Error! repo is not a type of Repository"
            assert repo.repository_id is not None, \
                "Error! repo id is none: {}".format(repo)
            commits = extractor.extract_commits_from_ghe(
                base_url=base_url, owner=owner, repo=repo, since=since,
                api_token=api_token)
            extractor._insert_commit_data_into_database(
                commit_list=commits, repo=repo, owner=owner,
                base_url=base_url, api_token=api_token)
            extractor._find_and_repair_inconsistencies(
                owner=owner, repository=repo, extensions=extensions,
                base_url=base_url)
        # Dates may be absent or malformed in the input: fall back to None.
        try:
            end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
        except (ValueError, TypeError):
            end_date = None
        try:
            start_date = datetime.strptime(start_date_str,
                                           "%Y-%m-%d").date()
        except (ValueError, TypeError):
            start_date = None
        service_mgr.update_dates(service_id=service.service_id,
                                 end_date=end_date, start_date=start_date)