def analyze_syntax_commits(directory, project):
    """
    Link each commit to the numbers found in its message and score every link.

    Only the syntax of the message is inspected -- nothing verifies that a
    number is really a bug number. Merge and revert commits are skipped, and
    so are numbers lower than 1.

    :param directory: directory that holds 'commits.json'
    :param project: not used in the body of this function
    :return: a list of tuples (commit, message, number, syntactic score,
             author email, time)
    """
    number_pattern = re.compile('[# \t][0-9]+|[/ \t][0-9]+|[- \t][0-9]+')
    links = []
    for entry in load_data(path.join(directory, 'commits.json')):
        message = entry['message']
        # Note: merge and revert commits are omitted
        if is_merge(message) or is_revert(message):
            continue
        for candidate in number_pattern.findall(message):
            num = select_number(candidate)
            if int(num) < 1:
                continue
            number_hit = (is_bug_number(message, num, number_pattern)
                          or is_hash_number(message, num, number_pattern))
            message_hit = is_number(message) or is_keyword(message)
            if is_test(message):
                # a message recognised by is_test always resets the score
                syntactic = 0
            else:
                syntactic = (1 if number_hit else 0) + (1 if message_hit else 0)
            links.append((entry['commit'], message, num, syntactic,
                          entry['author_email'], entry['time']))
    if test:
        save_in_file(links, 'test_fix_partial_links.json',
                     get_test_dir(directory))
    return links
def save_bugs(project):
    """
    Determine the bugs for ``project`` and save them as 'bugs.json' in
    data/<project>, creating that directory when it is missing.

    :param project: project name, used as the sub-directory of 'data'
    """
    project_dir = path.join('data', project)
    makedirs(project_dir, exist_ok=True)
    if test:
        # The return value is discarded; presumably the call is made so the
        # test directory exists before determine_bugs runs -- confirm.
        get_test_dir(project_dir)
    save_in_file(determine_bugs(project_dir), 'bugs.json', project_dir)
def choose_fixes(links, directory, project):
    """
    Select the commits regarded as fixes from the scored links.

    A link is accepted when its syntactic score (index 3) is positive and its
    semantic score (index 6) equals 1, or when the semantic score is above 1.
    In test mode the accepted and rejected links are also written to
    'test_fixes.json' in the test directory of ``directory``.

    :param links: iterable of indexable links; indices 0-3 and 6 are read
    :param directory: only used to locate the test directory
    :param project: passed to log_debug
    :return: set holding element 0 of every accepted link
    """
    fixes = set()
    if test:
        test_fixes_positive = []
        test_fixes_negative = []
        test_fixes = {
            "positives": test_fixes_positive,
            "negatives": test_fixes_negative,
            "notes": ""
        }
    for link in links:
        syntax = int(link[3])
        semantics = int(link[6])
        accepted = semantics > 1 or (syntax > 0 and semantics == 1)
        if accepted:
            fixes.add(link[0])
        if test:
            record = (link[0], link[1], link[2], link[3], link[6])
            if accepted:
                test_fixes_positive.append(record)
            else:
                test_fixes_negative.append(record)
    log_debug(project, 'Number of fixes: {0[0]}', (len(fixes), ))
    if test:
        # Note: positives may outnumber fixes because every accepted link is
        # stored in full; that does not indicate duplicate fixes.
        log_debug(project, 'Number of positives: {0[0]}',
                  (len(test_fixes_positive), ))
        log_debug(project, 'Number of negatives: {0[0]}',
                  (len(test_fixes_negative), ))
        test_fixes["fixes"] = len(fixes)
        save_in_file(test_fixes, 'test_fixes.json', get_test_dir(directory))
    return fixes
def determine_bugs(d):
    """
    Return the issues whose labels add up to a positive score.

    Issues and label scores are loaded from ``d``. Every label name found in
    the label scores contributes its 'score'; unknown labels are reported
    through logging.error. In test mode a sample of accepted and rejected
    issues (at most 25 each) is written to 'test_bugs.json'.

    :param d: directory passed to load_issues, load_labels and get_test_dir
    :return: list of dictionaries with the fields copied from each bug issue
    """
    if test:
        counter = 0
        test_bugs_positive = []
        test_bugs_negative = []
        test_bugs = {'positives': test_bugs_positive,
                     'negatives': test_bugs_negative}
        test_dir = get_test_dir(d)
    issues = load_issues(d)
    label_score = load_labels(d)
    kept_fields = ('number', 'labels', 'title', 'assignee', 'state', 'body',
                   'created_at', 'closed_at')
    bugs = []
    for issue in issues:
        score = 0
        for label in issue['labels']:
            if label['name'] not in label_score:
                logging.error('E: extract_bugs: unknown label {}'.format(label['name']))
                continue
            score += label_score[label['name']]['score']
        is_bug = score > 0
        if is_bug:
            bug = {field: issue[field] for field in kept_fields}
            bugs.append(bug)
        if not test:
            continue
        # test mode: keep every 3rd accepted / every 7th rejected issue seen
        counter += 1
        if is_bug:
            if counter % 3 == 0 and len(test_bugs_positive) < 25:
                test_bugs_positive.append(bug)
        elif counter % 7 == 0 and len(test_bugs_negative) < 25:
            test_bugs_negative.append(issue)
    if test:
        print('size of negatives:', len(test_bugs_negative))
        print('size of positives:', len(test_bugs_positive), '\n size of bugs: ', len(bugs))
        save_in_file(test_bugs, 'test_bugs.json', test_dir)
    return bugs
def remove_outliers(authors, project):
    """
    Remove, in place, the authors with the bottom <cutoff_perc> share of the
    distinct line totals.

    Each author's total is the sum of changes['changes'][2] over all of the
    author's days. Authors are grouped by total, the groups are sorted by
    total, and the first ceil(cutoff_perc * number_of_groups) groups are
    deleted from ``authors``.

    :param authors: dictionary {author: {date: changes}}; mutated in place
    :param project: project name, used for logging and for the test dump
    :return: None -- the result is the mutated ``authors`` dictionary
    """
    # Group authors by their total so that authors with equal totals are
    # removed (or kept) together.
    authors_by_total = {}
    for author, days in authors.items():
        total = sum(changes['changes'][2] for changes in days.values())
        authors_by_total.setdefault(total, []).append(author)

    # bottom %
    sorted_sums = sorted(
        ([total, names] for total, names in authors_by_total.items()),
        key=lambda pair: int(pair[0]))
    if test:
        save_in_file(sorted_sums, 'test_sums.json',
                     path.join('data/test', project))

    start = math.ceil(cutoff_perc * len(sorted_sums))
    remove = sorted_sums[:start]
    for _, names in remove:
        for name in names:
            del authors[name]

    # The two entries right after the cut are only reported when they exist.
    # Indexing them unconditionally raised IndexError (after the authors had
    # already been deleted) whenever the cut reached the end of the list.
    at_cut = sorted_sums[start][0] if start < len(sorted_sums) else None
    after_cut = (sorted_sums[start + 1][0]
                 if start + 1 < len(sorted_sums) else None)
    if after_cut is not None:
        assert after_cut > at_cut
    info = 'Removed (line nums) {0[0]} - {0[1]}% from {0[2]} with the lowest lines: {0[3]} and cutoff: {0[4]}'
    log_debug(project, info,
              (len(remove), cutoff_perc, len(sorted_sums), after_cut, at_cut))
def rate_developers(project):
    """
    Annotate the commits of ``project`` with the loaded buggy data and save
    the result as 'commits_changes.json' in data/<project>.

    :param project: project name, used as the sub-directory of 'data'
    """
    started = time()
    directory = path.join('data', project)
    annotated = annotate_commits(project,
                                 load_commits(directory),
                                 load_buggy(directory))
    log_debug(project, 'Commits annotated in {0[0]} seconds\n',
              (time() - started, ))
    save_in_file(annotated, 'commits_changes.json', directory)
def get_counts_years(data):
    """
    Replace, in place, every per-year list of extensions with its counts.

    For each developer and year the list of extensions becomes
    {'extensions': {extension: number of occurrences}, 'sum': list length}.
    In test mode the converted data is also saved as 'test_years_count.json'.

    :param data: {dev: {year: [extension, ...]}}; modified in place
    :return: the same ``data`` object
    """
    from collections import Counter

    for years in data.values():
        for year, extensions in years.items():
            # Counter tallies in one pass and keeps first-occurrence order;
            # calling list.count() per element was quadratic.
            # Rebinding an existing key while iterating is safe because the
            # dictionary does not change size.
            years[year] = {
                'extensions': dict(Counter(extensions)),
                'sum': len(extensions),
            }
    if test:
        save_in_file(data, 'test_years_count.json', 'data/test')
    return data
def report_results(errors, project, name):
    """
    Log the outcome of the check called ``name``.

    With no errors an OK message is logged. With more than 3 errors of the
    'test_buggy_duplicates' check, the errors are saved to
    data/<project>/test/<name>_buggy.json and the log points at that file.
    In every other case the errors themselves are logged.

    :param errors: sized collection of errors
    :param project: project name, used for logging and for the output path
    :param name: name of the check being reported
    """
    good = name + ' -- OK'
    bad = name + ' -- Errors: {0[0]}'
    if len(errors) == 0:
        log_info(project, good, None)
        return
    if not (len(errors) > 3 and name == 'test_buggy_duplicates'):
        log_info(project, bad, (errors, ))
        return
    target_dir = path.join('data', project, 'test')
    file_name = name + '_buggy.json'
    save_in_file(list(errors), file_name, target_dir)
    log_info(project, bad, ('...... See the file for the list to examine', ))
def report_results(errors, project, name):
    """
    Log which fixes of the check called ``name`` need extra verification.

    With no errors an OK message is logged. Up to 10 errors are logged in
    full. Beyond that, all errors are saved to
    data/<project>/test/<name>_fixes.json and only the first three are shown
    in the log together with a pointer to the file.

    :param errors: sized collection of items to verify
    :param project: project name, used for logging and for the output path
    :param name: name of the check being reported
    """
    good = name + ' -- OK\n'
    bad = name + ' -- The following fixes need extra verification: {0[0]}. There are {0[1]} items to verify.\n'
    if len(errors) == 0:
        log_info(project, good, None)
        return
    if len(errors) <= 10:
        log_info(project, bad, (errors, len(errors)))
        return
    target_dir = path.join('data', project, 'test')
    file_name = name + '_fixes.json'
    save_in_file(list(errors), file_name, target_dir)
    preview = str(list(errors)[:3]) + '...... See the file for more'
    log_info(project, bad, (preview, len(errors)))
def get_unique_extensions(dates):
    """Get unique extensions.

    Builds a mapping from each extension to the days on which it occurs.
    ``day[1]`` is iterated as the extensions of a day and a copy of the day
    (``list(day)``) is what gets stored.

    :param dates: iterable of indexable days; index 1 holds the extensions
    :return: dictionary {extension: [list(day), ...]}
    """
    uniques = dict()
    for day in dates:
        # to avoid duplicates, populate seen
        # (seen is rebuilt for every day, so it only tracks the current day)
        seen = dict()
        for ext in day[1]:
            if seen.get(ext) is None:
                seen.update({ext: list()})
            # NOTE(review): the else below is read as pairing with this
            # `uniques` check -- confirm against the original layout.
            if uniques.get(ext) is None:
                # first time this extension is met at all
                uniques.update({ext: [list(day)]})
                seen.get(ext).append(list(day))
            else:
                # known extension: record the day once, however many times
                # the extension repeats within it
                if day not in seen.get(ext):
                    seen.get(ext).append(list(day))
                    uniques.get(ext).append(list(day))
    if test:
        save_in_file(uniques, 'test_uniques.json', 'data/test')
    return uniques
def select_devs():
    """
    Split the authors into 'good' and 'bad' and save the developers of each
    group to data/selected.json.

    The ratio limit given to ``select`` comes from the ``l`` command-line
    argument when it is present and is 0 otherwise. In test mode the full
    selection is additionally saved to data/test/selected_test.json.
    """
    authors = get_authors()
    log_debug(None, ranges(authors), None)
    limit_arg = argparser.parse_args().l
    ratio_limit = 0 if limit_arg is None else float(limit_arg)
    good, bad = select(ratio_limit, authors)
    if test:
        save_in_file({'good': good, 'bad': bad},
                     'selected_test.json', 'data/test')
    good_devs = [entry['dev'] for entry in good]
    bad_devs = [entry['dev'] for entry in bad]
    save_in_file({'good': good_devs, 'bad': bad_devs},
                 'selected.json', 'data')
def get_fix_changes(project):
    """
    Get deleted and inserted lines for each fix.

    The differences of every loaded fix are collected, keyed by
    get_short(<first key of the differences>, project); when two fixes map to
    the same key only the first one is kept. In test mode the collected
    mapping is saved as 'test_buggy_fix_diffs.json' in the test directory.

    :param project: project name, used as the sub-directory of 'data'
    :return: list with the kept differences, one entry per key
    """
    started = time()
    directory = path.join('data', project)
    differences = {}
    for fix in load_fixes(directory):
        diffs = get_commit_differences(fix, project, False)
        if not diffs:
            # None, or an empty mapping that has no commit key to read
            continue
        commit = get_short(next(iter(diffs)), project)
        # the first differences seen for a commit win
        differences.setdefault(commit, diffs)
    # The arguments must be a tuple: '{0[0]}' indexes into them, and with the
    # former plain string only the first character of the time was logged.
    log_debug(project, 'Fix changes done in {0[0]} seconds',
              (time() - started, ))
    if test:
        save_in_file(differences, 'test_buggy_fix_diffs.json',
                     get_test_dir(directory))
    return list(differences.values())
def get_data(authors, g):
    """Get extensions data per period.

    For every developer and date, collects the extensions that
    ``extract_extentions`` returns for the 'new' entries of the files of each
    commit in ``info['commits']``. The per-day data is saved as
    'days_<g>.json' in 'data'; in test mode the yearly data and the flat list
    of all extensions are saved in 'data/test' as well.

    :param authors: mapping {dev: {date: info}}; info['commits'] is iterated
                    as a sequence of {sha: files} mappings
    :param g: group name, used to build the output file names
    :return: dictionary with the keys 'days', 'years' and 'devs'
    """
    days = dict()
    devs = dict()
    test_overall = list()
    for dev, dates in authors.items():
        days[dev] = list()
        for date, info in dates.items():
            exts = list()
            # day[1] is the same list object as exts, so it fills up below
            day = [date, exts]
            for c in info['commits']:
                for sha, files in c.items():
                    fs = [x['new'] for x in files]
                    extensions = extract_extentions(fs)
                    exts.extend(extensions)
                    test_overall.extend(extensions)
            # days without any extension are left out
            if len(day[1]) > 0:
                days[dev].append(day)
            # devs
            # NOTE(review): this section is read as a sibling of the check
            # above, not as part of it -- confirm against the original layout.
            if devs.get(dev) is None:
                devs.update({dev: list(day[1])})
            else:
                devs[dev].extend(day[1])
    years = get_years_data(days)
    nd = 'days_' + g + '.json'
    save_in_file(days, nd, 'data')
    if test:
        ny = 'test_years_' + g + '.json'
        to = 'test_overall_' + g + '.json'
        save_in_file(years, ny, 'data/test')
        save_in_file(test_overall, to, 'data/test')
    return {'days': days, 'years': years, 'devs' : devs}
def devs_table():
    """
    Create a csv file summing up the data of each dev for each repo
    separately - dev email, total commits, total lines, ratio, daily ratio,
    commit ratio.

    For every repo, 'authors_line_ratio.json' and 'authors_commits_ratio.json'
    are read from data/<repo> and one row per author is written to
    data/<repo>/authors.csv. The commit ratio is buggy / good, or 1 when the
    author has no good commits.

    :return: None
    :raises IndexError: when an author has no entry in the commits ratio data
    """
    repos = get_repos()
    # todo: should we do it one for all repos?
    for repo in repos:
        repo_dir = os.path.join('data', repo)
        rows = [[
            'Author', 'Line Ratio', 'Daily Line Ratio', 'Lines',
            'Commit Ratio', 'Commits'
        ]]
        authors = load_data(os.path.join(repo_dir, 'authors_line_ratio.json'))
        commits = load_data(
            os.path.join(repo_dir, 'authors_commits_ratio.json'))
        for author in authors:
            row = [
                author['dev'], author['commits']['ratio'],
                author['commits']['daily_ratio'], author['commits']['sum']
            ]
            matches = [(x['commits']['buggy'], x['commits']['good'])
                       for x in commits if x['dev'] == author['dev']]
            buggy, good = matches[0]  # the first matching entry is used
            row.append(1 if good == 0 else buggy / good)
            row.append(buggy + good)
            rows.append(row)
        # Kept from the original; presumably this makes sure the target
        # exists before it is rewritten below -- confirm with save_in_file.
        save_in_file([], 'authors.csv', repo_dir)
        # newline='' lets the csv module control line endings itself;
        # without it every row gets an extra '\r' on Windows.
        with open(os.path.join(repo_dir, 'authors.csv'), 'w',
                  newline='') as csv_file:
            csv.writer(csv_file).writerows(rows)
        log_debug(repo, 'done', None)
def compare_weeks_daily_sums():
    """
    Compare weekly entropy values with the per-week sums of the daily values
    and save the results.

    Reads data/days_good.json, data/days_bad.json and data/entropy_daily.json;
    writes 'entropy_weekly.json' and 'entropy_weeks_analyzed.json' to 'data'
    (plus 'test_weekly.json' to 'data/test' in test mode).
    """
    good_days = load_data('data/days_good.json')
    bad_days = load_data('data/days_bad.json')
    weeks = {'good': get_weeks(good_days), 'bad': get_weeks(bad_days)}
    if test:
        save_in_file(weeks, 'test_weekly.json', 'data/test')

    weeks_entropy = weekly_entropy(weeks)
    save_in_file(weeks_entropy, 'entropy_weekly.json', 'data')

    weeks_sums_daily = weekly_entropy_sum_days(
        load_data('data/entropy_daily.json'))
    # save_in_file(weeks_sums_daily, 'entropy_weeks_sums_daily.json', 'data')

    # the second value returned by analyze_weekly is not used here
    weeks_analyzed, _ = analyze_weekly(weeks_entropy, weeks_sums_daily)
    save_in_file(weeks_analyzed, 'entropy_weeks_analyzed.json', 'data')
def get_focus():
    """
    Run the focus analysis steps in order and save their results in 'data'.

    The steps are order dependent: each helper is called in the sequence
    below and several results are saved before the next helper runs.
    """
    os.makedirs('data/test', exist_ok=True)
    groups = set_up_groups()
    files_per_extension = get_ext_file_nums(groups)
    save_in_file(groups, 'entropy_groups.json', 'data')
    ratios = load_ratios()
    started = time()

    # focus analysis - daily, weekly, yearly, 4 years
    analyze_extensions(groups, ratios)
    compare_weeks_daily_sums()

    # focus analysis per extension
    ext = extensions_entropies()
    save_in_file(ext, 'entropy_extensions.json', 'data')
    average_ext_number(ext)
    save_in_file(extension_distribution(ext),
                 'entropy_ext_distribution.json', 'data')
    get_ext_ranges(ext)
    save_in_file(compare_most_popular(ext),
                 'entropy_exts_popular.json', 'data')
    get_extremes_avg(ext, files_per_extension)
    # analyze_group_exts()
    log_debug(None, 'Extensions analyzed in {0[0]} seconds',
              (time() - started,))
def save_issues(owner, project, directory):
    """
    Save the result of download_issues(owner, project) as 'issues.json' in
    ``directory``.
    """
    save_in_file(download_issues(owner, project), 'issues.json', directory)
def combine_authors():
    """
    Save the combined authors as 'authors_combined.json' and the ratios
    derived from them as 'authors_ratio.json', both in 'data'.
    """
    combined = combine()
    save_in_file(combined, 'authors_combined.json', 'data')
    save_in_file(get_ratios(combined), 'authors_ratio.json', 'data')
def get_labels(owner, project, directory):
    """
    Rate the labels returned by download_labels(owner, project) and save the
    rated labels as 'labels.json' in ``directory``.
    """
    rated = rate_labels(download_labels(owner, project), project)
    save_in_file(rated, 'labels.json', directory)
def extract_fixes(directory, project):
    """
    Run the syntactic and then the semantic commit analysis and save the
    resulting fixes, as a list, to 'fixes.json' in ``directory``.
    """
    fixes = analyze_semantics_commits(
        analyze_syntax_commits(directory, project), directory, project)
    save_in_file(list(fixes), 'fixes.json', directory)
def save_commits(project, repo, directory):
    """
    Save the result of extract_commits_basic(project, repo) as 'commits.json'
    in ``directory``.
    """
    save_in_file(extract_commits_basic(project, repo),
                 'commits.json', directory)
def analyze_extensions(groups, ratios):
    """Save the focus entropy results for time period.

    For every group in ``groups`` the daily, yearly and per-developer
    entropies are computed with the helper functions and collected in the
    dictionaries below, which are then saved as 'entropy_*.json' files in
    'data'.

    :param groups: mapping {group name: authors}; the result dictionaries are
                   pre-populated with the keys 'good' and 'bad'
    :param ratios: passed through to per_dev_entropy
    :return: None
    """
    entropies = {
        'good' : None,
        'bad' : None
    }
    daily = {
        'good' : list(),
        'bad' : list()
    }
    yearly = {
        'good': list(),
        'bad': list(),
        'all' : list()
    }
    # per_year is never written in this function; it is handed to
    # yearly_entropy and saved afterwards -- presumably filled in there.
    per_year = {
        2014 : {
            'good': list(),
            'bad' : list()
        },
        2015: {
            'good': list(),
            'bad': list()
        },
        2016 : {
            'good': list(),
            'bad' : list()
        },
        2017: {
            'good': list(),
            'bad': list()
        }
    }
    per_dev = {
        'good' : list(),
        'bad' : list()
    }
    ranges_vis = {
        'categories' : list(),
        'good' : list(),
        'bad' : list()
    }
    focus_level = {
        'good' : None,
        'bad' : None
    }
    entr_devs = {
        'good' : None,
        'bad' : None
    }
    for g, authors in groups.items():
        T = time()
        data = get_data(authors, g)
        days = data.get('days')
        years = data.get('years')
        devs = data.get('devs')
        years_counted = get_counts_years(years)
        devs_counted, devs_sums = get_counts_obj(devs)
        log_debug(None, 'Data prepared in {0[0]}', (time()-T,))
        daily[g] = daily_entropy(days, g)
        ye = yearly_entropy(years_counted, g, per_year)
        yearly[g] = ye
        # 'all' accumulates the yearly values of every group
        yearly['all'].extend(ye)
        per_dev[g], pd = per_dev_entropy(devs_counted, devs_sums, ratios)
        # ranges is read by position: [0] -> entropies, [1] -> ranges_vis[g],
        # [2] -> categories, [3] -> focus_level
        ranges = get_ranges(per_dev[g])
        entropies[g] = ranges[0]
        # overwritten on every iteration: the last group's categories are saved
        ranges_vis['categories'] = ranges[2]
        ranges_vis[g] = ranges[1]
        focus_level[g] = ranges[3]
        log_info(None, 'Ranges for {0[0]} devs: {0[1]}', (g, entropies[g]))
        entr_devs[g] = pd
    log_info(None, 'Focus levels are {0[0]}', (focus_level,))
    save_in_file(daily, 'entropy_daily.json', 'data')
    save_in_file(yearly, 'entropy_yearly.json', 'data')
    save_in_file(per_year, 'entropy_per_year.json', 'data')
    save_in_file(per_dev, 'entropy_per_dev.json', 'data')
    save_in_file(entr_devs, 'entropy_all.json', 'data')
    save_in_file(entropies, 'entropy_ranges.json', 'data')
    save_in_file(ranges_vis, 'entropy_ranges_vis.json', 'data')
def prep_data(project):
    """
    Group the commits of ``project`` by author and save the result as
    'authors.json' in data/<project>.
    """
    project_dir = path.join('data', project)
    author_data = by_author(load_commits(project_dir), project)
    save_in_file(author_data, 'authors.json', project_dir)
def get_fix_inducing(fixes, project):
    """
    Get commits and line numbers of the fix-inducing changes.

    fic is a collection of { buggy commit : lines considered buggy } while
    blame_coll contains full buggy lines for troubleshooting and verification.
    ./data/<project>/buggy_changes.json file contain the commit (and the line
    numbers) that introduced a bug.

    For each fix, 'git blame' is run on parents for the files modified and
    lines deleted. Output of blame is parsed to indicate lines inserted in
    each fix-inducing commit corresponding to a fix.

    :param fixes: iterable of {commit: [file objects]} mappings; None entries
                  are skipped. Each file object is read for the keys 'deleted'
                  (deleted line numbers, assumed hashable) and 'old'
    :param project: project name; 'repos/<project>' is the git working
                    directory and 'data/<project>' receives the output files
    :return: dictionary mapping the first token of each relevant blame line
             to a list of (line number, old file path) tuples
    """
    if test:
        blame_coll = []
        test_lines = []
    fic = {}  # fix inducing commits
    started = time()
    repo_dir = path.join('repos', project)
    directory = path.join('data', project)
    for fix in fixes:
        if fix is None:
            continue
        commit = list(fix.keys())[0]
        for fo in fix.get(commit):
            # Note: We may get two parents in case of a merge.
            parents = get_commit_parents(commit, '1', repo_dir)
            lines = fo['deleted']  # lines deleted in a fix
            # Test for None before taking len(); the reversed order raised
            # TypeError on None instead of skipping the file.
            if lines is None or len(lines) == 0:
                continue
            old_path = fo['old']
            # Built once per file for O(1) membership tests below.
            deleted = set(lines)
            for lines_str in get_lines(lines):
                line_start = int(lines_str.split(',')[0])
                for parent in parents[0].split(" "):
                    if old_path is None:
                        continue
                    cmd = [
                        'git', 'blame', '-f', '-L', lines_str, parent, '--',
                        old_path
                    ]
                    completed = subprocess.run(cmd,
                                               stdout=subprocess.PIPE,
                                               universal_newlines=True,
                                               cwd=repo_dir)
                    blames = str(completed.stdout).split('\n')
                    if test:
                        new_blames = []
                        test_l = []
                    # save only the lines we are interested in
                    line = line_start
                    for blame in blames:
                        if line in deleted:
                            buggy_commit = blame.strip().split(' ')[0]
                            if len(buggy_commit) < 2:
                                continue
                            # Note: we may have dup line numbers from
                            # different files, hence the (line, file) pair.
                            entry = (line, old_path)
                            blamed = fic.setdefault(buggy_commit, [])
                            if entry not in blamed:
                                blamed.append(entry)
                            if test:
                                new_blames.append(blame)
                                test_l.append((buggy_commit, line, old_path))
                        line += 1
                    if test:
                        blame_coll.append({
                            commit: new_blames,
                            'lines': lines,
                            'file': old_path,
                            "parent": parent
                        })
                        test_lines.append({commit: test_l})
    log_debug(project, 'Blames done in {0[0]} seconds\n',
              (time() - started, ))
    save_in_file(fic, 'buggy_changes.json', directory)
    if test:
        test_dir = get_test_dir(directory)
        save_in_file(blame_coll, 'test_buggy_lines.json', test_dir)
        save_in_file(test_lines, 'test_buggy_lines_dups.json', test_dir)
    log_debug(project, 'Number of buggy commits: {0[0]}.\n', (len(fic), ))
    return fic