from pydriller import RepositoryMining, GitRepository
import datetime
import re

# listaCommits = getLinuxCommits()
pathLinux = '../../journal/repositories/linux'
fileLinuxFeatures = open('output/featuresLinux.csv', 'w')
featuresLinux = []

for commit in RepositoryMining(
        pathLinux,
        only_modifications_with_file_types=['kconfig']).traverse_commits():
    for modification in commit.modifications:
        # change_type.value == 5 corresponds to ModificationType.MODIFY
        if ('kconfig' in modification.filename.lower()
                and modification.change_type.value == 5):
            currentSourceCode = modification.source_code.replace(
                '\t', '').strip().split('\n')
            for line in currentSourceCode:
                res = re.match(r'^config \S+', line)
                if res is not None and line.split()[1] not in featuresLinux:
                    featuresLinux.append(line.split()[1])
                    fileLinuxFeatures.write('{}\n'.format(line.split()[1]))
                    print(line)
def extractBuggyCommits(input_filename, local_repos_directory, output_directory):
    projects = ProjectLoader.getReposPlainName(input_filename)
    print(projects)
    for projectName in projects:
        bug_counter = 0
        bugFixes = []
        print("Analyzing", projectName)
        if projectName == 'pytorch' or projectName == 'react-native':
            continue
        if (os.path.exists(str(output_directory) + "/" + str(projectName) + "_bug_fixing_commits")
                and os.path.isfile(str(output_directory) + "/" + str(projectName) + "_bug_fixing_commits")):
            print(projectName, "already analyzed, skipping...")
            continue
        # 1. iterate over each project
        # 2. find all commits that fixed bugs using syntactic analysis
        # 3. find the commit that caused the bug
        # 4. for the commit that caused the bug, extract how many files were in
        #    the change set, the number of lines changed (added or removed),
        #    the author, how many commits the file has had, the number of
        #    contributors to the file, contributor experience (the percentage
        #    of lines authored by the highest contributor of a file), and the
        #    hunks count. Commit count, contributor count and contributor
        #    experience are process metrics.
        startTime = time.time()
        for commit in RepositoryMining(
                local_repos_directory + "/" + str(projectName),
                only_in_branch='master',
                only_no_merge=True,
                since=datetime.datetime(2019, 6, 1, 0, 0, 0)).traverse_commits():
            commit_msg = commit.msg
            containsBug = 'bug' in commit_msg.casefold()
            containsPatch = 'patch' in commit_msg.casefold()
            containsFix = 'fix' in commit_msg.casefold()
            containsBugIdentifier = bool(re.search(r'#+\d', commit_msg))
            if (containsBug and (containsFix or containsPatch or containsBugIdentifier)) or (
                    containsFix and containsBugIdentifier):
                bug_counter += 1
                # get the list of modified files in the fix
                listFixedFiles = commit.modifications
                numFilesModifiedForFix = 0
                numLinesAddedForFix = 0
                numLinesRemovedForFix = 0
                totalComplexityFixedFiles = 0
                fileComplexityCount = 0
                averageComplexityFixedFiles = -1
                totalLinesOfCodeAllFiles = 0
                changedMethods = 0
                numFilesMoved = 0
                for file in listFixedFiles:
                    sourceCodeLanguage = LanguageDetector.detect(file.filename)
                    if sourceCodeLanguage is None or file.nloc == 0:
                        continue
                    if file.nloc:
                        totalLinesOfCodeAllFiles += file.nloc
                    numFilesModifiedForFix += 1
                    numLinesAddedForFix += file.added
                    numLinesRemovedForFix += file.removed
                    if file.complexity:
                        fileComplexityCount += 1
                        totalComplexityFixedFiles += file.complexity
                    changedMethods += len(file.changed_methods)
                if numFilesModifiedForFix == 0:
                    continue
                if fileComplexityCount != 0:
                    averageComplexityFixedFiles = totalComplexityFixedFiles / fileComplexityCount
                bugFixInfo = {
                    "commit_hash": commit.hash,
                    "author": commit.author.name,
                    "total_complexity": totalComplexityFixedFiles,
                    "average_complexity": averageComplexityFixedFiles,
                    "sum_nloc": totalLinesOfCodeAllFiles,
                    "num_files": numFilesModifiedForFix,
                    "lines_added": numLinesAddedForFix,
                    "lines_removed": numLinesRemovedForFix,
                    "commit_date": commit.author_date,
                    "branches": commit.branches,
                    "num_methods_changed": changedMethods
                }
                bugFixes.append(bugFixInfo)
        tempMap = {projectName: bugFixes}
        IOUtils.writeBugMap(tempMap, output_directory, "_bug_fixing_commits")
        endTime = time.time()
        print("time", endTime - startTime)
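# A compact, reusable version of the keyword heuristic above (a sketch, not
# part of the original code). 'is_bugfix_commit' is a hypothetical helper
# name, chosen to match the undefined reference in the later main() snippet.
import re

def is_bugfix_commit(msg):
    # mirrors the rule above: ("bug" plus fix/patch/issue-id) or ("fix" plus issue-id)
    msg = msg.casefold()
    has_bug = 'bug' in msg
    has_fix = 'fix' in msg
    has_patch = 'patch' in msg
    has_issue_id = bool(re.search(r'#+\d', msg))
    return (has_bug and (has_fix or has_patch or has_issue_id)) or (has_fix and has_issue_id)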
from RepositoryMiner import RepositoryMiner
from dataStructures.Repository_summary import RepositorySummary
from pydriller import RepositoryMining

repo_summary = RepositorySummary()
repoMiner = RepositoryMiner(
    RepositoryMining(path_to_repo="~/Projects/focus-android",
                     from_commit=None,
                     to_commit=None),
    repo_summary)
repoMiner.create_repository_summary(
    "~/Projects/focus-android",
    "2d4dc678ce1260b90d3499ebefcdcaf19549f983", None)
repoMiner.save_table_as_csv("first_try.csv")
data[r][c]["files"][f] = {} if not "changes" in data[r][c]["files"][f]: data[r][c]["files"][f]["changes"] = [] data[r][c]["files"][f]["changes"].append( thischange) changesfromdiff = True changeCommits.append(c) if changesfromdiff: #if any changes in this diff were useful...we get the sourcecode for those files using pydriller print("\n\n" + mode + " mining " + r + " " + str(progress) + "/" + str(len(data))) commitlist = [] try: for commit in RepositoryMining(r).traverse_commits(): commitlist.append(commit.hash) #go through all commits in the repository mining and check if they match with one of the commits that are of interest if not commit.hash in changeCommits: continue for m in commit.modifications: #run through all modifications in the single commit in the repository mining if m.old_path != None and m.source_code_before != None: if not ".py" in m.old_path: continue #ignore files that are too large if len(m.source_code_before) > 30000: continue
def main():
    repo_path = sys.argv[1]
    repo_branch = 'master'
    commits = RepositoryMining(repo_path,
                               only_in_branch=repo_branch).traverse_commits()
    commits = [commit for commit in commits]
    gitRepo = GitRepository(repo_path)
    start_date = commits[0].committer_date + relativedelta(years=3)
    last_date = commits[-1].committer_date - relativedelta(years=3)
    bug_tracker = defaultdict(list)
    bug_tracker_pickle = "data3/{}.pickle".format(
        os.path.basename(os.path.normpath(repo_path)))

    # First index the buggy files
    if os.path.exists(bug_tracker_pickle):
        with open(bug_tracker_pickle, 'rb') as handle:
            bug_tracker = pickle.load(handle)
    else:
        for commit_index, commit in enumerate(commits):
            if not is_bugfix_commit(commit.msg):
                continue
            try:
                for m in commit.modifications:
                    if not valid_source_file(m.filename):
                        continue
                    bug_commit = gitRepo.get_commits_last_modified_lines(
                        commit, m)  # uses SZZ
                    # if bug_commit == {}: continue
                    bug_start_index = 99999999999999999999
                    for _file in bug_commit:
                        for i, _commit in enumerate(commits[:commit_index]):
                            if _commit.hash in bug_commit[_file] \
                                    and i < bug_start_index:
                                bug_start_index = i
                    for _commit in commits[bug_start_index:commit_index]:
                        bug_tracker[_commit.hash].append(m.filename)
            except Exception as e:
                print("[***]", e)
                print(traceback.format_exc())
                print("Continuing for next commits")
        print(len(bug_tracker.keys()))
        with open(bug_tracker_pickle, 'wb') as handle:
            pickle.dump(bug_tracker, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Copy the files
    with open('maj_versions/{}.hash'.format(
            os.path.basename(os.path.normpath(repo_path)))) as f:
        major_releases = []
        for line in f.read().splitlines():
            tag, hash = line.split(',')
            major_releases.append((tag, hash))

    for version, commit in enumerate(commits):
        if commit.hash not in [item[1] for item in major_releases]:
            continue
        if commit.committer_date < start_date or commit.committer_date > last_date:
            continue
        for tag, hash in major_releases:
            if hash == commit.hash:
                break
        print("[*] Doing {}".format(tag))
        gitRepo.checkout(commit.hash)
        base_dir_not_bug = "data3/{}/{}/not_bug".format(
            os.path.basename(os.path.normpath(repo_path)), tag)
        base_dir_bug = "data3/{}/{}/bug".format(
            os.path.basename(os.path.normpath(repo_path)), tag)
        if not os.path.exists(base_dir_bug):
            os.makedirs(base_dir_bug)
        if not os.path.exists(base_dir_not_bug):
            os.makedirs(base_dir_not_bug)
        all_files = gitRepo.files()
        for _file in all_files:
            if not valid_source_file(_file):
                continue
            filename = os.path.basename(os.path.normpath(_file))
            if commit.hash in bug_tracker and filename in bug_tracker[commit.hash]:
                file_path_to_write = os.path.join(base_dir_bug, filename)
            else:
                file_path_to_write = os.path.join(base_dir_not_bug, filename)
            shutil.copyfile(_file, file_path_to_write)
    print("All Done!")
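# Minimal sketch (not from the original) isolating the SZZ step used above:
# GitRepository.get_commits_last_modified_lines maps each touched file of a
# fixing commit to the set of hashes that last modified its deleted lines.
# 'path/to/repo' and the fixing hash are placeholders.
from pydriller import GitRepository

gr = GitRepository('path/to/repo')
fix = gr.get_commit('abc123')  # hypothetical bug-fixing commit hash
for path, inducing_hashes in gr.get_commits_last_modified_lines(fix).items():
    print(path, sorted(inducing_hashes))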
def repo_to(request):
    path, to = request.param
    return list(RepositoryMining(path_to_repo=path, to=to).traverse_commits())
def test_no_url():
    with pytest.raises(Exception):
        list(RepositoryMining().traverse_commits())
from pydriller import RepositoryMining

for commit in RepositoryMining('../test-repos/test1/').traverse_commits():
    print('hash {} authored by {}'.format(commit.hash, commit.author.name))
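# The same traversal can be narrowed with the filters used elsewhere in these
# snippets (since/to, only_in_branch, only_no_merge); a sketch with
# placeholder dates:
from datetime import datetime
from pydriller import RepositoryMining

for commit in RepositoryMining('../test-repos/test1/',
                               since=datetime(2018, 1, 1),
                               to=datetime(2019, 1, 1),
                               only_in_branch='master',
                               only_no_merge=True).traverse_commits():
    print(commit.hash, commit.author.name)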
def whole_evolution_with_try_except_tracking(self, repository, topic):
    print(f"Analysing repo ... {repository} in {topic}")
    commits_with_code_smells_dict = {}
    total_number_of_commits = 0
    try:
        for commit in RepositoryMining(
                f"https://github.com/{repository}.git",
                only_modifications_with_file_types=['.py']).traverse_commits():
            total_number_of_commits += 1
            for modification in commit.modifications:
                if ".py" in str(modification.filename):
                    source_code = modification.source_code
                    # old_path can be None if the file was added
                    if modification.old_path is None:
                        file_path = modification.new_path
                    else:
                        file_path = modification.old_path
                else:
                    continue
                try_excepts = ExceptionHandler().find_exception_handler_patterns(
                    source_code, commit)
                """
                try:
                    a = ast.parse(source_code)
                except SyntaxError as e:
                    continue
                except ValueError as e:
                    continue
                v = TryVisitor()
                v.visit(a)
                """
                if try_excepts is None:
                    continue
                if len(try_excepts) == 0:
                    continue
                # if not "zeeguu/model/user.py" in file:
                #     continue
                # If the file is not tracked yet, the occurrence is added as
                # commits_with_code_smells_dict[file] = [code_smell]
                if commits_with_code_smells_dict.get(file_path) is None:
                    for eh in try_excepts:
                        eh.author = commit.author.name
                        if eh.robustness_exception_handling:
                            eh.robustness_added_or_removed = "added"
                        if eh.any_exception_smell:
                            eh.exception_smell_added_or_removed = "added"
                    commits_with_code_smells_dict[file_path] = [dict(
                        {'date': str(commit.committer_date),
                         'exception_handlers': try_excepts})]
                    continue
                if commits_with_code_smells_dict.get(file_path) is not None:
                    handler_changes = False
                    for eh in try_excepts:
                        eh.author = commit.author.name
                    incomings = []
                    current_exception_handlers = \
                        commits_with_code_smells_dict.get(file_path)[-1]["exception_handlers"]
                    new_current_list_buffer = []
                    if len(try_excepts) > len(current_exception_handlers):
                        # del new_incoming_list[i]
                        # search for nearest "number"
                        # old_list_lines = [x.lineno for x in current_exception_handlers]
                        # new_list_lines = [x.lineno for x in new_incoming_list]
                        commits_with_code_smells_dict.get(file_path).append(dict(
                            {'date': str(commit.committer_date),
                             'exception_handlers': try_excepts}))
                        continue
                    """
                    for file_diff_lineno_key, lineno in modification.diff_parsed.get("added"):
                        for i, current_h in enumerate(new_incoming_list):
                            if current_h.lineno == file_diff_lineno_key:
                                new_incoming_list_buffer.append(current_h)
                    """
                    """
                    for i, new_change in enumerate(new_incoming_list):
                        for old_change in current_exception_handlers:
                            closest_number = old_list_lines[
                                min(range(len(old_list_lines)),
                                    key=lambda i: abs(old_list_lines[i] - new_change.lineno))]
                            if old_change.lineno == closest_number:
                                handler_changes, newest_change = self.process_changes(
                                    old_change, handler_changes, new_change)
                                for i in range(len(new_incoming_list)):
                                    if new_incoming_list[i].lineno == newest_change.lineno:
                                        new_incoming_list[i] = newest_change
                        continue
                    """
                    if len(try_excepts) < len(current_exception_handlers):
                        for file_diff_lineno_key, lineno in modification.diff_parsed.get("deleted"):
                            for current_hnew in current_exception_handlers:
                                if current_hnew.lineno == file_diff_lineno_key:
                                    new_current_list_buffer.append(current_hnew)
                        commits_with_code_smells_dict.get(file_path).append(dict(
                            {'date': str(commit.committer_date),
                             'exception_handlers': try_excepts,
                             'removed': new_current_list_buffer}))
                        continue
                    for (current, incoming) in zip(current_exception_handlers, try_excepts):
                        handler_changes, newest_change = self.process_changes(
                            current, handler_changes, incoming)
                        incomings.append(newest_change)
                    if handler_changes:
                        commits_with_code_smells_dict.get(file_path).append(dict(
                            {'date': str(commit.committer_date),
                             'exception_handlers': incomings}))
    except Exception as e:
        print(e)
    repo_name = repository.replace("/", "_")
    path_to_results = f'topic_analysis_results/{topic}'
    if not os.path.exists(path_to_results):
        os.makedirs(path_to_results)
    filename = f"{path_to_results}/{repo_name}_result.json"
    finaldict = {'repo': repository, 'total_commits': total_number_of_commits}
    finaldict.update(commits_with_code_smells_dict)
    with open(filename, "w") as result_file:
        result_file.write(json.dumps(finaldict, indent=4, sort_keys=False,
                                     default=lambda x: x.__dict__))
def test_projectname_multiple_repos_remote():
    repos = ['https://github.com/ishepard/pydriller',
             'test-repos/pydriller']
    for commit in RepositoryMining(path_to_repo=repos).traverse_commits():
        assert commit.project_name == 'pydriller'
def repo_to(path, to):
    return list(RepositoryMining(path_to_repo=path, to=to).traverse_commits())
                    newline='')
    # with csvfile1:
    #     writer = csv.writer(csvfile1, delimiter=',')
    #     writer.writerow((commit, date_time, elemet_file,
    #                      0, 0, 0, 0, 0, 0, 0, 0, 0))
    # csvfile1.close()
    # wbk.close
    return row


count = 0
os.chdir(pathDirectory)
for commit in RepositoryMining(pathDirectory,
                               from_commit=start_commit).traverse_commits():
    count += 1
    print("Commit :", count)
    cmd_Checkout = "git checkout " + commit.hash + " -f"
    # print(cmd_Checkout)
    # print('CLEAN')
    # os.system("git reset --hard")
    print('Checkout !!!!!')
    subprocess.check_output(cmd_Checkout, shell=True)
    # os.system(cmd_Checkout)
    # json_file = {
    #     'name': "D:/Projects/AnalysteProject/" + ApplicationName + "/Analyse_" + ApplicationName + ".csv",
    #     'row': row,
    #     'commit': commit.hash,
from pydriller import RepositoryMining
import csv
import shutil

filename = "commitData.csv"
csv_writer = csv.writer(open(filename, 'w'))
csv_writer.writerow([
    "projectID", "commitHash", "commitMessage", "author", "authorDate",
    "authorTimezone", "committeer", "committeerDate", "committeerTimezone",
    "branches", "inMainBranch", "merge", "parents"
])

for commit in RepositoryMining('../usr/src').traverse_commits():
    projectName = commit.project_name
    commitHash = commit.hash
    message = commit.msg
    author = commit.author.name
    date = commit.author_date
    timezone = commit.author_timezone
    committeer = commit.committer.name
    committeerDate = commit.committer_date
    committeerTimezone = commit.committer_timezone
    branches = commit.branches
    inMainBranch = commit.in_main_branch
    merge = commit.merge
    parents = commit.parents
    # assumed completion: the snippet breaks off here, but presumably each
    # commit is written as one CSV row matching the header above
    csv_writer.writerow([projectName, commitHash, message, author, date,
                         timezone, committeer, committeerDate,
                         committeerTimezone, branches, inMainBranch, merge,
                         parents])
from pydriller import RepositoryMining
import json
from tqdm import tqdm

author_dict = dict()
# raw string: backslashes in the Windows path must not be treated as escapes
for commit in tqdm(
        RepositoryMining(r'F:\GitHub\elasticsearch').traverse_commits()):
    if commit.author.email not in author_dict:
        author_dict[commit.author.email] = dict()
        author_dict[commit.author.email]['name'] = str(commit.author.name)
        author_dict[commit.author.email]['cmt'] = 0
        author_dict[commit.author.email]['add'] = 0
        author_dict[commit.author.email]['del'] = 0
        author_dict[commit.author.email]['fixes'] = 0
        author_dict[commit.author.email]['files'] = 0
    author_dict[commit.author.email]['cmt'] += 1
    if 'fix' in commit.msg:
        author_dict[commit.author.email]['fixes'] += 1
    for mod in commit.modifications:
        author_dict[commit.author.email]['add'] += mod.added
        author_dict[commit.author.email]['del'] += mod.removed
        author_dict[commit.author.email]['files'] += 1

filename = 'Pydriller.json'
with open(filename, 'w') as file_obj:
    json.dump(author_dict, file_obj)
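# The per-author initialisation block above can be collapsed with a factory
# function (a sketch of an alternative, assuming the same fields):
from collections import defaultdict

def _new_author_stats():
    return {'name': '', 'cmt': 0, 'add': 0, 'del': 0, 'fixes': 0, 'files': 0}

author_stats = defaultdict(_new_author_stats)
# usage, mirroring the loop above:
#     author_stats[commit.author.email]['cmt'] += 1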
def test_2_identical_local_urls():
    urls = ["test-repos/test1", "test-repos/test1"]
    assert 10 == len(
        list(RepositoryMining(path_to_repo=urls).traverse_commits()))
class RepositoryProcessor:
    def __init__(self, repository: str, owner: str):
        self.owner = owner
        self.repository = os.path.split(repository)[-1]
        self.repo = GitRepository(repository)
        self.mining = RepositoryMining(repository)
        self.pairs = []
        random.seed(42)

    def run(self):
        self.get_all_filepairs()
        with open(os.path.join('filepairs', self.repository, 'pairs.txt'), 'w') as f:
            f.write('\n'.join(map(lambda x: f'{x[0]} {x[1]} {x[2]}', self.pairs)))
            f.write('\n')

    def get_all_filepairs(self, file_filter=java_file_filter):
        commits = list(filter(lambda x: not x.merge, self.mining.traverse_commits()))
        for commit in commits:
            for modification in commit.modifications:
                if modification.change_type == ModificationType.MODIFY:
                    if file_filter(modification.filename):
                        self.get_file_pair(commit, modification)

    def get_file_pair(self, commit, modification: Modification):
        parent = commit.parents[0]
        repo = self.repo.project_name
        commit_hash = commit.hash
        filename = modification.filename
        path = os.path.join('filepairs', repo, commit_hash, filename)
        os.makedirs(path, exist_ok=True)
        self.repo.checkout(parent)
        before = os.path.join(self.repository, modification.old_path)
        before_saved = os.path.join(path, 'before_' + commit_hash + '_' + filename)
        copyfile(before, before_saved)
        self.repo.checkout(commit_hash)
        after = os.path.join(self.repository, modification.new_path)
        after_saved = os.path.join(path, 'after__' + commit_hash + '_' + filename)
        copyfile(after, after_saved)
        self.pairs.append(
            (before_saved, after_saved,
             commit_hash + '.' + self.owner + '.' + before.replace('/', '.')))

    def run_random(self, number):
        self.get_random_filepairs(number)
        with open(os.path.join('filepairs', self.repository, 'pairs.txt'), 'w') as f:
            f.write('\n'.join(map(lambda x: f'{x[0]} {x[1]} {x[2]}', self.pairs)))
            f.write('\n')

    def get_random_filepairs(self, number, file_filter=java_file_filter):
        commits = random.choices(
            list(filter(lambda x: not x.merge, self.mining.traverse_commits())),
            k=number)
        for idx, commit in enumerate(commits):
            print(f'Processing commit №{idx}: {commit.hash}')
            for modification in commit.modifications:
                if modification.change_type == ModificationType.MODIFY:
                    if file_filter(modification.filename):
                        self.get_file_pair(commit, modification)
def repo(request):
    return list(RepositoryMining(path_to_repo=request.param).traverse_commits())
from pydriller import RepositoryMining
from pydriller.domain.commit import ModificationType

repo = '/Users/luca/TUProjects/Salerno/jpacman-framework'
start = 'f3178b8'
stop = '51f041d'
files = {}

for commit in RepositoryMining(repo, from_commit=start,
                               to_commit=stop).traverse_commits():
    for mod in commit.modifications:
        if mod.filename.endswith('.java') \
                and mod.change_type is not ModificationType.DELETE:
            process_metrics = {
                'change': mod.change_type,
                'added': mod.added,
                'removed': mod.removed,
                'loc': mod.nloc,
                'comp': mod.complexity
            }
            path = mod.new_path
            if path not in files:
                files[path] = []
            files[path].append(process_metrics)

output = open('output.csv', 'w')
output.write('file,n-changes,added,removed,loc,complexity\n')
for key, value in files.items():
    n_changes = len(value)
    # assumed completion: the snippet is truncated here; sum the churn across
    # changes and keep the latest size/complexity for each file
    added = sum(v['added'] for v in value)
    removed = sum(v['removed'] for v in value)
    loc = value[-1]['loc']
    comp = value[-1]['comp']
    output.write('{},{},{},{},{},{}\n'.format(key, n_changes, added,
                                              removed, loc, comp))
output.close()
def test_clone_repo_to_not_existing():
    with pytest.raises(Exception):
        list(RepositoryMining("https://github.com/ishepard/pydriller",
                              clone_repo_to="NOTEXISTINGDIR").traverse_commits())
def store_commit_data(git_directory_path, devranker_dir, output_file_path,
                      str_from_date, str_to_date):
    # Why 'set_start_method("spawn")'?
    # Because we were getting multiple windows unnecessarily and the window
    # became unresponsive after mining was done.
    # Ref: https://pythonspeed.com/articles/python-multiprocessing/
    mp.set_start_method("spawn")
    # Empty list for collecting commit data
    doclist = []
    # Using a list to update the progress bar because it's thread-safe.
    # NOTE: under "spawn", worker processes do NOT share plain lists; a
    # multiprocessing.Manager().list() would be needed for true sharing.
    completed_commits = []
    # Create a multiprocessing pool to use the full CPU
    # Ref: https://pythonspeed.com/articles/python-multiprocessing/
    pool = mp.Pool(mp.cpu_count())
    global total_commits_count
    # If the repo has just been cloned, the program traverses the whole repo.
    # https://dzone.com/articles/shared-counter-python%E2%80%99s
    if str_from_date == "All":
        commits = RepositoryMining(git_directory_path).traverse_commits()
        # 'more_itertools' is used to find the commit count since 'commits' is
        # an iterable. Note: ilen(commits) consumes the iterable 'commits'.
        total_commits_count = more_itertools.ilen(commits)
        # pass the function and its args separately so the pool actually runs
        # it asynchronously; calling process_commit(...) inline would execute
        # it synchronously in this process
        [pool.apply_async(process_commit, args=(commit, doclist, completed_commits))
         for commit in RepositoryMining(git_directory_path).traverse_commits()]
        # Close the multiprocessing pool
        pool.close()
        pool.join()
    else:
        arr_from_date = str_from_date.split("-")
        arr_to_date = str_to_date.split("-")
        dt_from = datetime(int(arr_from_date[0]), int(arr_from_date[1]),
                           int(arr_from_date[2]), 0, 0, 0)
        dt_to = datetime(int(arr_to_date[0]), int(arr_to_date[1]),
                         int(arr_to_date[2]), 0, 0, 0)
        commits = RepositoryMining(git_directory_path, since=dt_from,
                                   to=dt_to).traverse_commits()
        # ilen consumes the iterable, so the mining is re-created below
        total_commits_count = more_itertools.ilen(commits)
        if total_commits_count == 0:
            dict_callback_start_mining["msg"] = "no_commits"
            print(json.dumps(dict_callback_start_mining))
            return
        [pool.apply_async(process_commit, args=(commit, doclist, completed_commits))
         for commit in RepositoryMining(git_directory_path, since=dt_from,
                                        to=dt_to).traverse_commits()]
        # Close the multiprocessing pool
        pool.close()
        pool.join()

    # We have the data in JSON form but need CSV output. There are many ways
    # to do this, but the easiest is to dump the JSON to a temp file and let
    # pandas read it back. This could be optimised to avoid the extra file.
    temp_file = os.path.join(devranker_dir, 'mod_data.json')
    with open(temp_file, 'w') as temp_out_file:
        # json.dump cannot handle Python datetime objects; convert them to str.
        # https://stackoverflow.com/questions/11875770/how-to-overcome-datetime-datetime-not-json-serializable
        # https://code-maven.com/serialize-datetime-object-as-json-in-python
        json.dump(doclist, temp_out_file, default=str)
    # Use pandas to read the JSON and write the CSV.
    df = pandas.read_json(temp_file)
    df.to_csv(output_file_path)
    # Remove the temp file
    os.remove(temp_file)
    # display_data_file_location_path()
    # Inform the user that mining is complete
    dict_callback_start_mining["msg"] = "Done"
    dict_callback_start_mining["tc"] = 0
    dict_callback_start_mining["cc"] = 0
    print(json.dumps(dict_callback_start_mining))
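# If the workers really must append to a shared list under the "spawn" start
# method, a multiprocessing.Manager can provide one. A minimal sketch with a
# hypothetical worker (plain lists are copied, not shared, across processes):
import multiprocessing as mp

def worker(item, shared):  # hypothetical stand-in for process_commit
    shared.append(item * 2)

if __name__ == '__main__':
    with mp.Manager() as manager:
        shared = manager.list()  # proxy object visible to all workers
        with mp.Pool(2) as pool:
            pool.starmap(worker, [(i, shared) for i in range(4)])
        print(list(shared))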
def test_badly_formatted_repo_url():
    with pytest.raises(Exception):
        list(RepositoryMining(path_to_repo=set('repo')).traverse_commits())
def get_commit_count(project_path):
    result = 0
    for commit in RepositoryMining(project_path).traverse_commits():
        result += 1
    return result
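# Equivalent one-pass count written as a generator expression (a sketch of an
# alternative, not part of the original):
from pydriller import RepositoryMining

def get_commit_count_terse(project_path):
    return sum(1 for _ in RepositoryMining(project_path).traverse_commits())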
#!/usr/bin/env python3
# Takes a commit hash and writes out the code associated with that hash.
# The hash in question is marked with a cyclomatic complexity of 2041, and I
# was curious to see what that looks like.
from pydriller import RepositoryMining

project_url = 'https://github.com/NationalSecurityAgency/ghidra.git'
hash = "2df81f803b99e0900c298f0213dfb7d0911052b1"
count = 0
avgLinesOfCode = 0
CommitList = []

with open("codeSegment.txt", 'w') as myfile:
    for commit in RepositoryMining(project_url).traverse_commits():
        if commit.hash != hash:
            continue
        for m in commit.modifications:
            if m.source_code:  # source_code is None for deleted files
                myfile.write(m.source_code)
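# When only one commit matters, PyDriller can jump straight to it instead of
# traversing the whole history; a sketch reusing project_url and hash from
# above via the 'single' filter:
from pydriller import RepositoryMining

for commit in RepositoryMining(project_url, single=hash).traverse_commits():
    for m in commit.modifications:
        if m.source_code:  # None for deleted files
            print(m.filename, len(m.source_code.splitlines()))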
import sys
import csv
import subprocess
from pydriller import RepositoryMining
from functions import remove_duplicate_commits, dictionary_of_spoon_output

countOfArgs = len(sys.argv)
pathToRepo = None
if countOfArgs == 2:
    pathToRepo = sys.argv[1]
else:
    pathToRepo = '../repository/'

with open('output/pathToRepo.csv', 'w') as myFile:
    myFile.write(pathToRepo)

changes = []
for commit in RepositoryMining(
        pathToRepo,
        only_modifications_with_file_types=['.java']).traverse_commits():
    for modification in commit.modifications:
        if modification.change_type is not None:
            extOfFile = modification.filename[modification.filename.find('.') + 1:]
            # keep .java files that were modified or renamed
            if extOfFile == 'java' and modification.change_type.name in ('MODIFY', 'RENAME'):
                changes.append([commit.parents[0], modification.old_path,
                                commit.hash, modification.new_path])

with open('output/changes.csv', 'w') as myFile:
    wr = csv.writer(myFile)
    wr.writerows(changes)

listOfCommitsToIterate = remove_duplicate_commits(changes)
with open('output/inputForSpoon.csv', 'w') as myFile:
    commitsCount = len(listOfCommitsToIterate)
    position = 1
    for key, value in listOfCommitsToIterate.items():
        if position != commitsCount:
from pydriller import RepositoryMining
import pandas as pd

data = pd.read_excel('../Data/types_algos_occurences.xlsx')
writer = pd.ExcelWriter('../Data/types.xlsx')
df = pd.DataFrame(data)

i = 0
while i < len(df):
    cmp = 0
    string = df.at[i, 'name'].lower()
    for commit in RepositoryMining('../../scikit-learn').traverse_commits():
        if string in commit.msg.lower():
            cmp += 1
    df.at[i, 'nb'] = cmp
    i += 1

df.to_excel(writer)
writer.save()
writer.close()
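# The loop above re-traverses the scikit-learn history once per spreadsheet
# row; a single-pass variant (a sketch, assuming the same substring-count
# rule) scans the history once and counts every name:
from pydriller import RepositoryMining
import pandas as pd

df = pd.read_excel('../Data/types_algos_occurences.xlsx')
names = [str(n).lower() for n in df['name']]
counts = {name: 0 for name in names}
for commit in RepositoryMining('../../scikit-learn').traverse_commits():
    msg = commit.msg.lower()
    for name in names:
        if name in msg:
            counts[name] += 1
df['nb'] = [counts[name] for name in names]
df.to_excel('../Data/types.xlsx')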
def miner():
    repo_path = os.path.abspath(working_path + repo_name)

    # Clone if necessary
    if not os.path.exists(repo_path):
        print("Cloning: {}".format(repo_name))
        for c in RepositoryMining(
                repo_git,
                clone_repo_to=os.path.abspath(working_path)).traverse_commits():
            pass
    else:
        print("{} clone done!".format(repo_name))

    # Extract FIX and BIC
    bic_csv = os.path.abspath(working_path + repo_name + "_all.csv")
    header = ["hash", "path", "size", "developer", "type", "fix",
              "bic_path", "bic_hash", "bic_size"]
    if not os.path.exists(bic_csv):
        print("Extracting FIX and BIC")
        out_file = open(bic_csv, 'w', newline='', encoding="utf-8")
        writer = csv.DictWriter(out_file, delimiter=',', fieldnames=header)
        writer.writeheader()
        to_date = datetime(2017, 12, 1, 12, 0, 0)
        gr = GitRepository(repo_path)
        gr2 = GitRepository(repo_path)
        for commit in RepositoryMining(
                repo_path,
                to=to_date,
                only_no_merge=True,
                only_modifications_with_file_types=extensions,
                reversed_order=True).traverse_commits():
            msg = commit.msg.lower()
            mods = commit.modifications
            if len(mods) < 50 and any(word in msg for word in keywords):
                dout = {"hash": commit.hash, "size": len(mods),
                        "developer": commit.committer.email, "fix": True}
                for mod in mods:
                    dout["type"] = mod.change_type
                    if mod.change_type == ModificationType.DELETE:
                        dout["path"] = mod.old_path
                    else:
                        dout["path"] = mod.new_path
                    bics_per_mod = gr.get_commits_last_modified_lines(commit, mod)
                    for bic_path, bic_commit_hashs in bics_per_mod.items():
                        dout["bic_path"] = bic_path
                        for bic_commit_hash in bic_commit_hashs:
                            bic = gr2.get_commit(bic_commit_hash)
                            dout["bic_hash"] = bic_commit_hash
                            dout["bic_size"] = len(bic.modifications)
                            writer.writerow(dout)
                            out_file.flush()
            else:
                dout = {"hash": commit.hash, "size": len(mods),
                        "developer": commit.committer.email, "fix": False,
                        "bic_path": "---", "bic_hash": "---",
                        "bic_size": "---"}
                for mod in mods:
                    dout["path"] = mod.new_path
                    writer.writerow(dout)
                    out_file.flush()
        out_file.close()
    else:
        print("Extracting FIX and BIC done!")

    # Get unique BIC
    in_file = open(bic_csv, 'r', newline='', encoding="utf-8")
    reader = csv.DictReader(in_file, delimiter=',')
    unique_devs = set()
    unique_commits = set()
    fixes = {}
    unique_bics = set()
    unique_fics = set()
    for row in reader:
        unique_commits.add(row["hash"])
        if row["path"].endswith(tuple(extensions)):
            unique_devs.add(row["developer"])
            unique_bics.add(row["bic_hash"])
            unique_fics.add(row["bic_path"])
            if row["fix"] == "True":
                fixes[row["hash"]] = True
    unique_bics.remove("---")
    unique_fics.remove("---")
    in_file.close()
    print("Developers: {}, Commits: {} Defective: {}".format(
        len(unique_devs), len(unique_commits), len(fixes)))

    # Save list of BIC
    unique_bic_txt = os.path.abspath(working_path + repo_name + "_unique_bic.txt")
    out_file = open(unique_bic_txt, 'w', newline='', encoding="utf-8")
    for bic in unique_bics:
        out_file.write(bic)
        out_file.write("\n")
    out_file.close()

    # Save list of FIX
    unique_fix_txt = os.path.abspath(working_path + repo_name + "_unique_fix.txt")
    out_file = open(unique_fix_txt, 'w', newline='', encoding="utf-8")
    for fix in fixes:
        out_file.write(fix)
        out_file.write("\n")
    out_file.close()

    # Count fully and partially defective commits, and defective files in
    # defective commits
    bic_csv = os.path.abspath(working_path + repo_name + "_bic_metrics.csv")
    header = ["bic_hash", "bic_size", "bic_path", "defective"]
    if not os.path.exists(bic_csv):
        print("Counting partial BIC")
        out_file = open(bic_csv, 'w', newline='', encoding="utf-8")
        writer = csv.DictWriter(out_file, delimiter=',', fieldnames=header)
        writer.writeheader()
        gr = GitRepository(repo_path)
        for bic_hash in unique_bics:
            commit = gr.get_commit(bic_hash)
            diff = count_file = len(commit.modifications)
            dout = {"bic_hash": bic_hash,
                    "bic_size": len(commit.modifications)}
            for mod in commit.modifications:
                if mod.filename.endswith(tuple(extensions)) \
                        and mod.change_type is not ModificationType.DELETE:
                    dout["bic_path"] = mod.new_path
                    if mod.new_path in unique_fics:
                        diff -= 1
                        dout["defective"] = True
                    else:
                        dout["defective"] = False
                    writer.writerow(dout)
                    out_file.flush()
                else:
                    count_file -= 1
                    diff -= 1
        out_file.close()
    else:
        print("Counting partial BIC done!")

    # Calculate partially defective commits
    in_file = open(bic_csv, 'r', newline='', encoding="utf-8")
    reader = csv.DictReader(in_file, delimiter=',')
    bics = {}
    fully_defective = partially_defective = 0
    partially_defective_files = total_defective_files = 0
    for row in reader:
        if row["bic_path"].endswith(tuple(extensions)):
            if row["bic_hash"] in bics:
                bics[row["bic_hash"]].append(row["defective"])
            else:
                bics[row["bic_hash"]] = [row["defective"]]
    for key, value in bics.items():
        count_defective_files = value.count("True")
        if len(value) > 1:
            total_defective_files += count_defective_files
            if len(value) == count_defective_files:
                fully_defective += 1
            else:
                partially_defective += 1
                partially_defective_files += len(value) - count_defective_files
    ratio_defective_files_in_defective_commits = round(
        (partially_defective_files / total_defective_files) * 100, 1)
    ratio_partially_defective_commits = round(
        (partially_defective / (fully_defective + partially_defective)) * 100, 1)
    print("Partially def. commits: {}%. Defective files in partially def. commits: {}%"
          .format(ratio_partially_defective_commits,
                  ratio_defective_files_in_defective_commits))
def findBugCausingCommits(projectMap, local_repos_directory, output_directory):
    bugInducingProjectMap = {}
    for project, commits in projectMap.items():
        print("finding bug causing commits for ",
              str(local_repos_directory) + "/" + project)
        if (os.path.exists(str(output_directory) + "/" + str(project) + "_bug_causing_commits")
                and os.path.isfile(str(output_directory) + "/" + str(project) + "_bug_causing_commits")):
            print(project, "already analyzed, skipping...")
            continue
        repo_path = str(local_repos_directory) + "/" + project
        repo = GitRepository(repo_path)
        startTime = time.time()
        bugInducingCommits = []
        hashes = [x["commit_hash"] for x in commits]
        try:
            # analyze each bug fix for this project
            for bugFix in RepositoryMining(
                    repo_path, only_commits=hashes).traverse_commits():
                # get the commits that last touched the modified lines of the files
                commitsLastTouchedFix = repo.get_commits_last_modified_lines(bugFix)
                bugCausingHashes = set([])
                for filename, fileCommit in commitsLastTouchedFix.items():
                    for fileHash in fileCommit:
                        bugCausingHashes.add(fileHash)
                hashList = [x for x in bugCausingHashes]
                # get average statistics about each of these commits:
                # the number of files modified, lines added, lines removed,
                # and methods changed for the commit, the author, the elapsed
                # time until the bug fix, and the branches
                for bugCausingCommit in RepositoryMining(
                        repo_path, only_commits=hashList).traverse_commits():
                    numModifiedFiles = len(bugCausingCommit.modifications)
                    linesAdded = 0
                    linesRemoved = 0
                    numMethodsChanged = 0
                    sum_nloc = 0
                    numFilesWithComplexity = 0
                    sumComplexity = 0
                    if numModifiedFiles <= 0:
                        continue
                    for modification in bugCausingCommit.modifications:
                        sourceCodeLanguage = LanguageDetector.detect(
                            modification.filename)
                        try:
                            if (sourceCodeLanguage is None
                                    or modification.nloc == 0
                                    or modification.nloc is None):
                                continue
                        except Exception:
                            pass
                        sum_nloc += modification.nloc
                        linesAdded += modification.added
                        linesRemoved += modification.removed
                        numMethodsChanged += len(modification.changed_methods)
                        if modification.complexity:
                            numFilesWithComplexity += 1
                            sumComplexity += modification.complexity
                    averageComplexityFixedFiles = 0
                    if numFilesWithComplexity != 0:
                        averageComplexityFixedFiles = sumComplexity / numFilesWithComplexity
                    bugInducingInfo = {
                        "commit_hash": bugCausingCommit.hash,
                        "author": bugCausingCommit.author.name,
                        "total_complexity": sumComplexity,
                        "average_complexity": averageComplexityFixedFiles,
                        "sum_nloc": sum_nloc,
                        "num_files": numModifiedFiles,
                        "lines_added": linesAdded,
                        "lines_removed": linesRemoved,
                        "commit_date": bugCausingCommit.author_date,
                        "branches": bugCausingCommit.branches,
                        "num_methods_changed": numMethodsChanged,
                        "time_to_fix": bugFix.author_date - bugCausingCommit.author_date
                    }
                    bugInducingCommits.append(bugInducingInfo)
            tempMap = {project: bugInducingCommits}
            IOUtils.writeBugMap(tempMap, output_directory, "_bug_causing_commits")
            endTime = time.time()
            print("time", endTime - startTime)
        except Exception:
            print("FAILED FOR", project)
def test_simple_url():
    assert 5 == len(
        list(RepositoryMining(path_to_repo="test-repos/test1").traverse_commits()))
from pydriller import RepositoryMining

for commit in RepositoryMining('https://github.com/java-native-access/jna').traverse_commits():
    print('hash {} authored by {}'.format(commit.hash, commit.author.name))
# fragment: the tail of an ast.NodeVisitor method that counts function
# definitions (the enclosing class header is truncated in this excerpt)
        for x in node.body:
            if isinstance(x, ast.FunctionDef):
                Contador += 1
        super(MyCustomVisitor, self).generic_visit(node)


class RodarAnalise():
    def __init__(self):
        pass


if __name__ == "__main__":
    numeroDoComit = 0
    for lista in RepositoryMining(
            'https://github.com/WilliamCDL/Testeinicial',
            only_modifications_with_file_types=['.py']).traverse_commits():
        for arquivos in lista.modifications:
            if arquivos.filename.endswith('.py'):
                try:
                    root = ast.parse(arquivos.source_code)
                except (SyntaxError, ValueError) as e1:
                    # logging.error('{}\n\t{}'.format(e1.msg, e1.text))
                    continue
                # unreachable: IndentationError is a subclass of SyntaxError,
                # so the clause above already catches it
                except IndentationError as e2:
                    logging.error(e2.print_file_and_line)
                    continue
                visitor = MyCustomVisitor()
                visitor.visit(root)
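# For reference, a self-contained version of the function-counting visitor the
# fragment above appears to come from (a sketch; the original class body is
# truncated, so names and structure here are assumptions):
import ast

class FunctionCounter(ast.NodeVisitor):
    def __init__(self):
        self.count = 0

    def visit_FunctionDef(self, node):
        # count this function, then keep walking for nested definitions
        self.count += 1
        self.generic_visit(node)

counter = FunctionCounter()
counter.visit(ast.parse("def a():\n    pass\n\ndef b():\n    pass\n"))
print(counter.count)  # 2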