def __init__(self, path_to_repo: str, since: datetime = None, to: datetime = None, from_commit: str = None, to_commit: str = None):
    """
    Configure the underlying PyDriller Repository miner for one repo.

    :param str path_to_repo: path to a single repo
    :param datetime since: starting date
    :param datetime to: ending date
    :param str from_commit: starting commit (only if `since` is None)
    :param str to_commit: ending commit (only if `to` is None)
    :raises TypeError: if no start boundary or no end boundary is given
    """
    if not since and not from_commit:
        raise TypeError('You must pass one between since and from_commit')
    if not to and not to_commit:
        raise TypeError('You must pass one between to and to_commit')

    same_commit = bool(from_commit) and bool(to_commit) and from_commit == to_commit
    if same_commit:
        # Identical boundaries: use 'single' to avoid a PyDriller warning.
        self.repo_miner = Repository(path_to_repo, single=from_commit)
    else:
        self.repo_miner = Repository(path_to_repo=path_to_repo,
                                     since=since,
                                     to=to,
                                     from_commit=from_commit,
                                     to_commit=to_commit,
                                     order='reverse')
def test_badly_formatted_url():
    """Both a malformed URL and a nonexistent local path must raise on traversal."""
    for bad_path in ('https://github.com/ishepard.git/test', 'test'):
        with pytest.raises(Exception):
            list(Repository(path_to_repo=bad_path).traverse_commits())
def test_ignore_add_whitespaces():
    """A whitespace-only addition counts as a modification unless skip_whitespaces is set."""
    sha = "338a74ceae164784e216555d930210371279ba8e"
    with_ws = next(
        Repository('test-repos/whitespace', single=sha).traverse_commits())
    assert len(with_ws.modified_files) == 1
    without_ws = next(
        Repository('test-repos/whitespace',
                   skip_whitespaces=True,
                   single=sha).traverse_commits())
    assert len(without_ws.modified_files) == 0
def test_ignore_deleted_whitespaces():
    """A whitespace-only deletion counts as a modification unless skip_whitespaces is set."""
    sha = "e6e429f6b485e18fb856019d9953370fd5420b20"
    with_ws = next(
        Repository('test-repos/whitespace', single=sha).traverse_commits())
    assert len(with_ws.modified_files) == 1
    without_ws = next(
        Repository('test-repos/whitespace',
                   skip_whitespaces=True,
                   single=sha).traverse_commits())
    assert len(without_ws.modified_files) == 0
def test_ignore_add_whitespaces_and_changed_file():
    """With skip_whitespaces, only the genuinely changed file remains reported."""
    sha = "532068e9d64b8a86e07eea93de3a57bf9e5b4ae0"
    with_ws = next(
        Repository('test-repos/whitespace', single=sha).traverse_commits())
    assert len(with_ws.modified_files) == 2
    without_ws = next(
        Repository('test-repos/whitespace',
                   skip_whitespaces=True,
                   single=sha).traverse_commits())
    assert len(without_ws.modified_files) == 1
def test_clone_repo_to_repeated():
    """Mining twice with the same clone_repo_to dir must work both times (clone is reused)."""
    import tempfile
    clone_dir = tempfile.gettempdir()
    until = datetime(2018, 10, 20)
    url = "https://github.com/ishepard/pydriller.git"
    # Two identical runs: the second hits the already-cloned directory.
    for _ in range(2):
        commits = list(
            Repository(path_to_repo=url,
                       to=until,
                       clone_repo_to=str(clone_dir)).traverse_commits())
        assert len(commits) == 159
        assert os.path.isdir(os.path.join(clone_dir, "pydriller"))
def _exclude_commits_by_change_size(self, commit_hash: str, max_change_size: int = 20) -> Set[str]:
    """
    Walk history backwards from ``commit_hash`` and collect the hashes of
    the consecutive most-recent commits whose number of modified files
    exceeds ``max_change_size``; stop at the first small-enough commit.

    :param str commit_hash: commit to start walking back from (inclusive)
    :param int max_change_size: threshold on modified-file count
    :returns Set[str]: hashes to exclude
    """
    to_exclude: Set[str] = set()
    mined = Repository(self.repository_path,
                       to_commit=commit_hash,
                       order='reverse').traverse_commits()
    for commit in mined:
        try:
            # First commit at or under the threshold ends the scan.
            if len(commit.modified_files) <= max_change_size:
                break
            to_exclude.add(commit.hash)
        except Exception as e:
            log.error(
                f'unable to analyze commit: {self.repository_path} {commit.hash}'
            )
            raise e
    if to_exclude:
        log.info(
            f'count of commits excluded by change size > {max_change_size}: {len(to_exclude)}'
        )
    return to_exclude
def git_log_to_json(pathRepository, commitFrom):
    """
    Export the history of ``pathRepository`` up to commit ``commitFrom`` as a
    tab-separated 'commits.csv' (columns: id, epoch-milliseconds date, author,
    flattened commit message annotated with Added/Deleted/Modified paths).

    Bug fix: the two parameters were previously shadowed by hard-coded local
    values (a Windows path and a fixed hash), so callers' arguments were
    silently ignored. The shadowing assignments are removed.
    """
    commits = []
    for i, commit in enumerate(
            Repository(pathRepository, to_commit=commitFrom).traverse_commits()):
        # Flatten the commit message onto a single line.
        # NOTE(review): the final replace(" ", " ") is a no-op as written;
        # it probably was meant to collapse double spaces — confirm intent.
        comment = commit.msg.replace("\r\n", " ").replace("\n", " ").replace(
            "\r", " ").replace(" ", " ")
        comment = comment + " Modified : None"
        for modified_file in commit.modified_files:
            if modified_file.change_type == ModificationType.ADD:
                comment = comment + " Added : " + modified_file.new_path
            elif modified_file.change_type == ModificationType.DELETE:
                comment = comment + " Deleted : " + modified_file.old_path
            else:
                comment = comment + " Modified : " + modified_file.new_path
        row = {}
        row["id"] = i  # sequential id (commit.hash would be the alternative)
        row["author"] = commit.author.name
        # Committer date as epoch milliseconds, serialized as a string.
        row["date"] = str(int(commit.committer_date.timestamp())) + "000"
        row["comment"] = comment
        commits.append(row)
    with open('commits.csv', 'w', encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter='\t')
        for commit in commits:
            writer.writerow([
                commit["id"], commit["date"], commit["author"],
                commit["comment"]
            ])
def pydriller_szz(git_path, bugsfixes_json, results_path):
    """
    SZZ pass: for every bug-fix commit listed in ``bugsfixes_json``, map each
    modified production .java file to the commits that last touched its
    deleted lines, then dump the mapping as <results_path>.json and .csv.
    """
    g = Git(git_path)
    with open(bugsfixes_json) as f:
        commits = json.loads(f.read())
    bic = {}
    for entry in commits:
        fix_hash = entry['fix_commit_hash']
        bic[fix_hash] = {}
        fix_commit = next(
            Repository(git_path, single=fix_hash).traverse_commits())
        for mod in fix_commit.modified_files:
            if mod.new_path is None:
                continue
            # Keep only production Java files (skip tests and non-.java).
            if '\\test\\' in mod.new_path or not mod.new_path.endswith('.java'):
                continue
            last_touched = g.get_commits_last_modified_lines(fix_commit, mod)
            for f_name, hashes in last_touched.items():
                bic[fix_hash][f_name] = list(hashes)
    with open(results_path + ".json", 'w') as out:
        json.dump(bic, out)
    rows = [[bugfix_commit, f_name, bic_commit]
            for bugfix_commit, per_file in bic.items()
            for f_name, bic_commits in per_file.items()
            for bic_commit in bic_commits]
    df = pd.DataFrame(rows, columns=['bugfix_commit', 'filename', 'bic'])
    df.to_csv(results_path + ".csv", index=False)
def mine(_type):
    """
    Traverse the hadoop test repo recording this process's memory after each
    commit. ``_type`` selects the workload depth: 0 = commit metadata only,
    1 = also force diffs, otherwise also compute complexity for .java files.
    Returns (elapsed timedelta, list of memory samples in MiB).
    """
    proc = psutil.Process(os.getpid())
    window_start, window_end = datetime(2017, 1, 1), datetime(2017, 7, 1)
    memory_samples = []
    started = datetime.now()
    for commit in Repository('test-repos-hadoop/hadoop',
                             since=window_start,
                             to=window_end).traverse_commits():
        # First field of memory_info (RSS on most platforms), in MiB.
        memory_samples.append(proc.memory_info()[0] / (2 ** 20))
        _ = commit.author.name  # noqa: force lazy author resolution
        if _type == 0:
            continue
        for mod in commit.modified_files:
            _ = mod.diff  # noqa: force diff computation
            if _type == 1:
                continue
            if mod.filename.endswith('.java'):
                _ = mod.complexity  # noqa: force complexity metrics
    elapsed = datetime.now() - started
    return elapsed, memory_samples
def test_projectname_multiple_repos():
    """project_name is derived per repo even when the same path is listed multiple times."""
    repos = ['test-repos/files_in_directories'] * 3
    for commit in Repository(path_to_repo=repos).traverse_commits():
        assert commit.project_name == 'files_in_directories'
def test_diff_with_histogram(git_repo):
    # Mine one known commit with git's histogram diff algorithm enabled and
    # spot-check (line-number, content) pairs in the parsed diff. The pairs
    # differ from the default Myers diff (see test_diff_without_histogram).
    # with histogram
    commit = list(
        Repository('test-repos/histogram',
                   single="93df8676e6fab70d9677e94fd0f6b17db095e890",
                   histogram_diff=True).traverse_commits())[0]
    diff = commit.modified_files[0].diff_parsed
    # Expected added lines under the histogram algorithm.
    assert (4, ' {') in diff["added"]
    assert (5, ' log.error("Icon path is null");') in diff["added"]
    assert (6, ' return null;') in diff["added"]
    assert (7, ' }') in diff["added"]
    assert (8, '') in diff["added"]
    assert (11, ' if (imgURL == null)') in diff["added"]
    assert (12, ' {') in diff["added"]
    assert (13, ' log.error("Couldn\'t find icon: " + imgURL);'
            ) in diff["added"]
    assert (14, ' return null;') in diff["added"]
    assert (17, ' return new ImageIcon(imgURL);') in diff["added"]
    # Expected deleted lines under the histogram algorithm.
    assert (6, ' {') in diff["deleted"]
    assert (7, ' return new ImageIcon(imgURL);') in diff["deleted"]
    assert (10, ' {') in diff["deleted"]
    assert (11, ' log.error("Couldn\'t find icon: " + imgURL);'
            ) in diff["deleted"]
    assert (12, ' }') in diff["deleted"]
    assert (13, ' return null;') in diff["deleted"]
def test_diff_without_histogram(git_repo):
    # Mine the same commit as test_diff_with_histogram but with the default
    # (Myers) diff algorithm, and check the full added/deleted line counts
    # plus spot-check pairs; line attribution differs from the histogram run.
    # without histogram
    commit = list(
        Repository('test-repos/histogram',
                   single="93df8676e6fab70d9677e94fd0f6b17db095e890").
        traverse_commits())[0]
    diff = commit.modified_files[0].diff_parsed
    # Expected added lines under the default algorithm.
    assert len(diff['added']) == 11
    assert (3, ' if (path == null)') in diff['added']
    assert (5, ' log.error("Icon path is null");') in diff['added']
    assert (6, ' return null;') in diff['added']
    assert (8, '') in diff['added']
    assert (9, ' java.net.URL imgURL = GuiImporter.class.getResource(path);'
            ) in diff['added']
    assert (10, '') in diff['added']
    assert (11, ' if (imgURL == null)') in diff['added']
    assert (12, ' {') in diff['added']
    assert (14, ' return null;') in diff['added']
    assert (16, ' else') in diff['added']
    assert (17, ' return new ImageIcon(imgURL);') in diff['added']
    # Expected deleted lines under the default algorithm.
    assert len(diff['deleted']) == 7
    assert (3, ' java.net.URL imgURL = GuiImporter.class.getResource(path);'
            ) in diff['deleted']
    assert (4, '') in diff['deleted']
    assert (5, ' if (imgURL != null)') in diff['deleted']
    assert (7, ' return new ImageIcon(imgURL);') in diff['deleted']
    assert (9, ' else') in diff['deleted']
    assert (10, ' {') in diff['deleted']
    assert (13, ' return null;') in diff['deleted']
def test_clone_repo_to(tmp_path):
    """A remote URL is cloned into clone_repo_to and the clone dir survives mining."""
    until = datetime(2018, 10, 20)
    url = "https://github.com/ishepard/pydriller.git"
    commits = list(
        Repository(path_to_repo=url,
                   to=until,
                   clone_repo_to=str(tmp_path)).traverse_commits())
    assert len(commits) == 159
    assert tmp_path.exists() is True
def get_impacted_files(
        self,
        fix_commit_hash: str,
        file_ext_to_parse: List[str] = None,
        only_deleted_lines: bool = True) -> List['ImpactedFile']:
    """
    Parse the diff of given fix commit using PyDriller to obtain a list of
    ImpactedFile with impacted file path and modified line ranges. As default
    behaviour, only deleted lines in the diff are treated as modified lines.

    :param str fix_commit_hash: hash of fix commit to parse
    :param List[str] file_ext_to_parse: parse only the given file extensions
    :param only_deleted_lines: if False, only line numbers that are both
        deleted and added count as modified; by default all deleted lines do
    :returns List[ImpactedFile] impacted_files
    """
    impacted_files = list()
    fix_commit = next(
        Repository(self.repository_path,
                   single=fix_commit_hash).traverse_commits())
    for mod in fix_commit.modified_files:
        # skip newly added files
        if not mod.old_path:
            continue
        # Filter by extension. Bug fix: compare the *last* dot-separated
        # segment, so multi-dot names (e.g. 'Foo.test.java') match their
        # real extension; previously the first segment after the first dot
        # was compared, wrongly skipping such files.
        if file_ext_to_parse:
            parts = mod.filename.rsplit('.', 1)
            if len(parts) < 2 or parts[1] not in file_ext_to_parse:
                log.info(f"skip file: {mod.filename}")
                continue
        # Deleted/renamed files only exist at their old path.
        file_path = mod.new_path
        if mod.change_type == ModificationType.DELETE or mod.change_type == ModificationType.RENAME:
            file_path = mod.old_path
        lines_added = [added[0] for added in mod.diff_parsed['added']]
        lines_deleted = [
            deleted[0] for deleted in mod.diff_parsed['deleted']
        ]
        if only_deleted_lines:
            mod_lines = lines_deleted
        else:
            # Keep only line numbers that appear on both sides of the diff.
            mod_lines = [ld for ld in lines_deleted if ld in lines_added]
        # Skip files with an implausibly large change set.
        if len(mod_lines) > self.max_file_modifications:
            log.warning("File changes too large")
            continue
        if len(mod_lines) > 0:
            impacted_files.append(ImpactedFile(file_path, mod_lines))
    log.info([str(f) for f in impacted_files])
    return impacted_files
def test_ignore_add_whitespaces_and_modified_normal_line(git_repo):
    """skip_whitespaces drops whitespace-only hunks but keeps real line changes."""
    sha = "52716ef1f11e07308b5df1b313aec5496d5e91ce"
    normal = next(
        Repository('test-repos/whitespace', single=sha).traverse_commits())
    assert len(normal.modified_files) == 1
    parsed_normal_diff = normal.modified_files[0].diff_parsed
    no_ws = next(
        Repository('test-repos/whitespace',
                   skip_whitespaces=True,
                   single=sha).traverse_commits())
    assert len(no_ws.modified_files) == 1
    parsed_wo_whitespaces_diff = no_ws.modified_files[0].diff_parsed
    # The whitespace-only added line and deleted line disappear.
    assert len(parsed_normal_diff['added']) == 2
    assert len(parsed_wo_whitespaces_diff['added']) == 1
    assert len(parsed_normal_diff['deleted']) == 1
    assert len(parsed_wo_whitespaces_diff['deleted']) == 0
def mine(self, **kwargs: Any) -> None:
    """Gather data from repository. To be extended in subclasses."""
    for commit in Repository(self.repo, **kwargs).traverse_commits():
        try:
            self.mine_commit(commit)
        except GitCommandError as err:
            # Warn about failing git commands, but continue
            warnings.warn(str(err))
def test_deletion_remotes():
    """Temporary clones of remote repos are removed once traversal completes."""
    repos = ['https://github.com/ishepard/pydriller'] * 2
    clone_paths = {
        commit.project_path
        for commit in Repository(path_to_repo=repos).traverse_commits()
    }
    for path in clone_paths:
        assert os.path.exists(path) is False
def visualizar_parents(self):
    """Print the parent hashes of every commit (two parents == merge commit)."""
    rm = Repository(
        "C:\\Users\\Leandro César\\Documents\\Nova pasta\\junit4")
    for commit in rm.traverse_commits():
        # Bug fix: the original wrote `list[commit.parents]`, which
        # subscripts the `list` builtin (producing a typing GenericAlias)
        # instead of building a list. Call list() instead. Both the
        # if/else branches printed the same thing, so the branch on
        # len(commit.parents) == 2 is dropped as dead weight.
        print(list(commit.parents))
def visualizar_commits(self):
    """Print hash, author, committer and date of every commit, then wait for a key."""
    repo = Repository(
        "C:\\Users\\Leandro César\\Documents\\Nova pasta\\junit4")
    for commit in repo.traverse_commits():
        print('O commit de número {} foi modificado pelo autor {}, '
              'e comitado por {} na data {}'.format(commit.hash,
                                                    commit.author.name,
                                                    commit.committer.name,
                                                    commit.committer_date))
    input("Pressione qualquer botao para sair")
    # ANSI escape: clear the screen and move the cursor home.
    print("\x1b[2J\x1b[1;1H")
def get_merge_commits(self, commit_hash: str) -> Set[str]:
    """
    Return a set containing ``commit_hash`` if that commit is a merge,
    otherwise an empty set. Analysis failures are logged and skipped
    (best-effort, no re-raise).
    """
    merge: Set[str] = set()
    mined = Repository(single=commit_hash,
                       path_to_repo=self.repository_path).traverse_commits()
    for commit in mined:
        try:
            if commit.merge:
                merge.add(commit.hash)
        except Exception:
            log.error(f'unable to analyze commit: {self.repository_path} {commit.hash}')
    if merge:
        log.info(f'merge commits count: {len(merge)}')
    return merge
def determine_commits_in_period(repository, start_date, end_date):
    """Determine the commits that are made in the period from start_date to end_date.

    Returns (total commits, commits touching production code, commits touching tests).
    """
    production_code_commit_count = 0
    test_code_commit_count = 0
    number_of_commits = 0
    for commit in Repository(repository, since=start_date, to=end_date).traverse_commits():
        number_of_commits += 1
        # A commit may count towards both categories.
        if commit_contains_test_code(commit):
            test_code_commit_count += 1
        if commit_contains_production_code(commit):
            production_code_commit_count += 1
    return number_of_commits, production_code_commit_count, test_code_commit_count
def dictionaryWithAllCommmits(self):
    """Map each commit hash to [author name, author date, list of modified filenames]."""
    commits_by_hash = {}
    for commit in Repository(self.repository).traverse_commits():
        modified_names = [
            '{}'.format(modification.filename)
            for modification in commit.modified_files
        ]
        commits_by_hash[commit.hash] = [
            '{}'.format(commit.author.name),
            '{}'.format(commit.author_date),
            modified_names,
        ]
    return commits_by_hash
def save_commits_and_authors_in_json(self, user_id):
    """Persist this repo's commit hashes and de-duplicated author names as JSON files."""
    commit_hashes = []
    author_names = []
    for commit in Repository(self.repository).traverse_commits():
        commit_hashes.append(commit.hash)
        author_names.append(commit.author.name)
    # De-duplicate authors (order is not significant here).
    unique_authors = list(set(author_names))
    Util.save_dictionary_in_json_file(self.name, user_id,
                                      {self.name: commit_hashes}, 'commits')
    Util.save_dictionary_in_json_file(self.name, user_id,
                                      {self.name: unique_authors}, 'authors')
def visualizar_arquivo_especifico(self):
    """Print the hashes of all commits touching a user-supplied file path, then wait."""
    caminho = "C:\\Users\\Leandro César\\Documents\\Nova pasta\\junit4"
    # e.g. "\\src\\test\\java\\junit\\tests\\AllTests.java"
    caminho_do_arquivo = input(
        "Cole o caminho a partir do diretório src: ")
    repo = Repository(caminho, filepath=caminho + caminho_do_arquivo)
    for commit in repo.traverse_commits():
        print(commit.hash)
    input("Pressione qualquer botao para sair")
    # ANSI escape: clear the screen and move the cursor home.
    print("\x1b[2J\x1b[1;1H")
def get_meta_changes(self, commit_hash: str, current_file: str) -> Set[str]:
    """
    Return the set of commit hashes (at most {commit_hash}) that are
    "meta-changes" for ``current_file``: either a git file-mode change or a
    modification whose change type is in ``self.change_types_to_ignore``.

    :param str commit_hash: commit to inspect (mined with single=)
    :param str current_file: path of the file being tracked
    :returns Set[str]: hashes classified as meta-changes
    """
    meta_changes = set()
    repo_mining = Repository(path_to_repo=self.repository_path, single=commit_hash).traverse_commits()
    for commit in repo_mining:
        # Use `git show --summary` output to detect mode-only changes,
        # which PyDriller's modified_files would not surface directly.
        show_str = self.repository.git.show(commit.hash, '--summary').splitlines()
        if show_str and self._is_git_mode_change(show_str, current_file):
            log.info(f'exclude meta-change (file mode change): {current_file} {commit.hash}')
            meta_changes.add(commit.hash)
        else:
            try:
                # Match the file under either its new or old path, and
                # exclude it when the change type is configured as ignorable
                # (e.g. renames/copies, depending on change_types_to_ignore).
                for m in commit.modified_files:
                    if (current_file == m.new_path or current_file == m.old_path) and (m.change_type in self.change_types_to_ignore):
                        log.info(f'exclude meta-change ({m.change_type}): {current_file} {commit.hash}')
                        meta_changes.add(commit.hash)
            except Exception as e:
                # Best-effort: diff parsing can fail on exotic commits;
                # log and continue rather than abort the scan.
                log.error(f'unable to analyze commit: {self.repository_path} {commit.hash}')
    return meta_changes
def get_java_method_metrics(files, project_path, version):
    """
    Count method-level changes in commit ``version`` restricted to the given
    files. Returns [added-method count, deleted-method count, modified-method
    count] where the modified count excludes additions and deletions.
    """
    # Collect every file name of the files under analysis.
    file_names = get_all_name_of_files(files)
    num_of_add_method = 0
    num_of_sub_method = 0
    num_of_modify_method = 0

    # Return the index of the first entry in `names` that contains `name`
    # as a substring, or -1 when none matches.
    def is_in(name, names):
        for index in range(0, len(names)):
            if name in names[index]:
                return index
            return -1

    for commit in Repository(project_path, single=version).traverse_commits():
        for m in commit.modified_files:
            # Check whether the committed file belongs to the tracked files.
            index = is_in(m.filename, file_names)
            if index != -1:
                changed_methods = []
                for method in m.changed_methods:
                    changed_methods.append(method.name)
                # Count distinct changed method names as modifications.
                num_of_modify_method += len(list(set(changed_methods)))
                # Collect all method names after the change.
                all_methods = []
                for method in m.methods:
                    all_methods.append(method.name)
                # Collect all method names before the change.
                all_methods_before = []
                for method in m.methods_before:
                    all_methods_before.append(method.name)
                all_methods, all_methods_before = method_filter(
                    file_names, all_methods, all_methods_before)
                # Classify before/after name differences as added or deleted
                # methods (semantics delegated to judge_add_or_delete_method).
                add, sub = judge_add_or_delete_method(all_methods,
                                                      all_methods_before)
                num_of_add_method += add
                num_of_sub_method += sub
    # Modified count excludes methods counted as added or deleted.
    return [
        num_of_add_method, num_of_sub_method,
        num_of_modify_method - num_of_add_method - num_of_sub_method
    ]
def process(self, force=False) -> Tuple[int, int]:
    """
    Rebuild this source's timeline: delete existing entries, then create one
    Entry per mined commit (optionally filtered to self.author_name).
    Returns (number of entries created, 0).
    """
    mining_filters = {}
    if self.author_name:
        mining_filters['only_authors'] = [
            self.author_name,
        ]
    commits = Repository(self.repo_url, **mining_filters).traverse_commits()
    self.get_entries().delete()
    new_entries = []
    for commit in commits:
        extra = {
            'hash': commit.hash,
            'url': self.get_commit_url(commit),
            'author': {
                'email': commit.author.email,
                'name': commit.author.name,
            },
            'changes': {
                'files': commit.files,
                'insertions': commit.insertions,
                'deletions': commit.deletions,
            },
            'repo': {
                'name': self.get_repo_name() or commit.project_name,
                'url': self.get_repo_url(),
            },
        }
        new_entries.append(
            Entry(title=commit.msg,
                  description=commit.hash,
                  date_on_timeline=commit.committer_date.astimezone(pytz.UTC),
                  schema='commit',
                  source=self.entry_source,
                  extra_attributes=extra))
    Entry.objects.bulk_create(new_entries)
    return len(new_entries), 0
def filter_commits(repo, commits):
    """
    Keep only the SHAs from ``commits`` whose pre-computed file metadata and
    mined modifications both include at least one relevant, non-test Java file.
    """
    relevant_shas = []
    for commit_sha in commits:
        # Cheap pre-filter on the supplied metadata: must touch non-test Java.
        if not any(x.is_java and not x.is_test for x in commits[commit_sha]):
            continue
        mined = next(
            Repository(repo.working_dir,
                       single=commit_sha).traverse_commits())
        committed = [
            CommittedFile(commit_sha, f.new_path, f.added_lines,
                          f.deleted_lines, f)
            for f in mined.modified_files
            if f.language_supported and f.new_path
        ]
        # Confirm on the mined diff: relevant, non-test Java must remain.
        if not any(x.is_java and not x.is_test and x.is_relevant
                   for x in committed):
            continue
        relevant_shas.append(commit_sha)
    return relevant_shas
from pydriller import Repository

# Walk the full history of the local junit4 clone and print one summary
# line per commit (hash, author, committer, committer date).
for commit in Repository(
        'C:\\Users\\Leandro César\\eclipse-workspace\\Analise_Git\\Junit4'
).traverse_commits():
    print(f'The commit {commit.hash} has been modified by {commit.author.name}, '
          f'committed by {commit.committer.name} in date {commit.committer_date}')