Example #1
def analyze_syntax_commits(directory, project):
    """
    Creates links from the commit to the possible bug number from the commit message together with the syntactic score.
    It checked syntax only -- there is no verification whether the numbers considered to be possible bugs are indeed bug numbers.
    :param directory:
    :return: a list of tuples - links between commits and the possible bug numbers with the syntactic scores
    """
    commits_dir = path.join(directory, 'commits.json')
    # a number preceded by '#', '/', '-', a space, or a tab
    number_pattern = re.compile('[# \t][0-9]+|[/ \t][0-9]+|[- \t][0-9]+')
    commits = load_data(commits_dir)
    links = list()
    for commit in commits:
        # Note: omit merge and revert commits
        if is_merge(commit['message']) or is_revert(commit['message']):
            continue
        for n in number_pattern.findall(commit['message']):
            num = select_number(n)
            if int(num) < 1:
                continue
            syntactic = 0
            if is_bug_number(commit['message'], num,
                             number_pattern) or is_hash_number(
                                 commit['message'], num, number_pattern):
                syntactic += 1
            if is_number(commit['message']) or is_keyword(commit['message']):
                syntactic += 1
            if is_test(commit['message']):
                syntactic = 0
            link = (commit['commit'], commit['message'], num, syntactic,
                    commit['author_email'], commit['time'])
            links.append(link)
    if test:  # 'test' is a module-level flag
        test_dir = get_test_dir(directory)
        save_in_file(links, 'test_fix_partial_links.json', test_dir)
    return links
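A minimal sketch of what the number pattern extracts, using only the standard library; the commit message here is made up, and select_number presumably strips the leading delimiter before the digits:

import re

number_pattern = re.compile('[# \t][0-9]+|[/ \t][0-9]+|[- \t][0-9]+')
message = 'Fix crash in parser, closes #123 and relates to PR-42'
matches = number_pattern.findall(message)
print(matches)                   # ['#123', '-42']
print([m[1:] for m in matches])  # ['123', '42'] -- likely what select_number returns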
Example #2
def save_bugs(project):
  directory = path.join('data', project)
  makedirs(directory, exist_ok=True)
  if test:
    # get_test_dir presumably creates the test subdirectory as a side effect
    get_test_dir(directory)
  bugs = determine_bugs(directory)
  save_in_file(bugs, 'bugs.json', directory)
Example #3
def choose_fixes(links, directory, project):
    fixes = set()
    if test:
        test_fixes_positive = list()
        test_fixes_negative = list()
        test_fixes = {
            "positives": test_fixes_positive,
            "negatives": test_fixes_negative,
            "notes": ""
        }
    for link in links:
        # link[3] is the syntactic score, link[6] the semantic score:
        # keep a fix when syntax > 0 and semantics == 1, or semantics > 1
        if (int(link[3]) > 0 and int(link[6]) == 1) or int(link[6]) > 1:
            fixes.add(link[0])
            if test:
                pos = (link[0], link[1], link[2], link[3], link[6])
                test_fixes_positive.append(pos)
        elif test:
            neg = (link[0], link[1], link[2], link[3], link[6])
            test_fixes_negative.append(neg)
    log_debug(project, 'Number of fixes: {0[0]}', (len(fixes), ))
    if test:
        # Note: There may be more positives than fixes since full data is saved there. It does not mean duplicate fixes.
        log_debug(project, 'Number of positives: {0[0]}',
                  (len(test_fixes_positive), ))
        log_debug(project, 'Number of negatives: {0[0]}',
                  (len(test_fixes_negative), ))
        test_fixes.update({"fixes": len(fixes)})
        test_dir = get_test_dir(directory)
        save_in_file(test_fixes, 'test_fixes.json', test_dir)
    return fixes
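The selection rule can be isolated for testing; a minimal sketch with the two scores passed in directly rather than read from the link tuples:

def is_fix(syntactic, semantic):
    # keep a commit as a fix when weak syntactic evidence is backed by
    # semantic evidence, or when the semantic evidence alone is strong
    return (syntactic > 0 and semantic == 1) or semantic > 1

assert is_fix(1, 1) is True   # syntax + semantics agree
assert is_fix(0, 1) is False  # semantics alone, but weak
assert is_fix(0, 2) is True   # strong semantic evidence
assert is_fix(2, 0) is False  # syntax alone is never enough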
Example #4
def determine_bugs(d):
  if test:
    counter = 0
    test_bugs_positive = list()
    test_bugs_negative = list()
    test_bugs = {'positives' : test_bugs_positive, 'negatives' : test_bugs_negative }
    test_dir = get_test_dir(d)
  issues = load_issues(d)
  label_score = load_labels(d)
  bugs = list()
  for x in issues:
    score = 0
    for label in x['labels']:
      if label['name'] in label_score:
        score += label_score[label['name']]['score']
      else:
        logging.error('E: determine_bugs: unknown label {}'.format(label['name']))
    if score > 0:
      bug = {'number': x['number'], 'labels': x['labels'], 'title': x['title'],
             'assignee': x['assignee'], 'state': x['state'], 'body': x['body'],
             'created_at': x['created_at'], 'closed_at': x['closed_at']}
      bugs.append(bug)
      if test:
        counter += 1
        if counter % 3 == 0 and len(test_bugs_positive) < 25:
          test_bugs_positive.append(bug)
    elif test:
      counter += 1
      if counter % 7 == 0 and len(test_bugs_negative) < 25:
        test_bugs_negative.append(x)
  if test:
    print('size of negatives:', len(test_bugs_negative))
    print('size of positives:', len(test_bugs_positive))
    print('size of bugs:', len(bugs))
    save_in_file(test_bugs, 'test_bugs.json', test_dir)
  return bugs
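A minimal sketch of the scoring step with made-up label scores; the shape of labels.json is inferred from the lookups above:

label_score = {'bug': {'score': 1}, 'question': {'score': -1}}  # hypothetical scores
issue = {'labels': [{'name': 'bug'}, {'name': 'question'}]}
score = sum(label_score[l['name']]['score']
            for l in issue['labels'] if l['name'] in label_score)
print(score)  # 0 -- this issue would not be kept as a bug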
Example #5
def remove_outliers(authors, project):
    """
    Removes the authors with the bottom <cutoff_perc>% of the lines.
    :param authors: dictionary of authors with day commit data
    :return: prolific authors
    """
    sums = dict()
    sum_list = list()
    for a in authors.keys():
        total = 0  # total lines (changes['changes'][2]) for this author
        for date, changes in authors[a].items():
            total += changes['changes'][2]
        if total in sums.keys():
            sums[total].append(a)
        else:
            sums.update({total: [a]})
    for key, value in sums.items():
        temp = [key, value]
        sum_list.append(temp)
    # bottom %
    sorted_sums = sorted(sum_list, key=lambda x: int(x[0]))
    if test:
        test_dir = path.join('data/test', project)
        save_in_file(sorted_sums, 'test_sums.json', test_dir)
    start = math.ceil(cutoff_perc * len(sorted_sums))
    remove = sorted_sums[0:start]
    for s in remove:
        for auth in s[1]:
            del authors[auth]
    assert sorted_sums[start + 1][0] > sorted_sums[start][0]
    info = 'Removed (line nums) {0[0]} - {0[1]}% from {0[2]} with the lowest lines: {0[3]} and cutoff: {0[4]}'
    log_debug(project, info,
              (len(remove), cutoff_perc, len(sorted_sums),
               sorted_sums[start + 1][0], sorted_sums[start][0]))
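The cutoff arithmetic in isolation; cutoff_perc is evidently a fraction, since it is multiplied by the list length directly (the value below is made up):

import math

cutoff_perc = 0.05                                 # hypothetical
sorted_sums = [[t, ['dev']] for t in range(100)]   # 100 distinct line totals
start = math.ceil(cutoff_perc * len(sorted_sums))
print(start)               # 5 -- the authors in the 5 lowest buckets are removed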
Example #6
def rate_developers(project):
    T = time()
    directory = path.join('data', project)
    commits = load_commits(directory)
    buggy = load_buggy(directory)
    annotated = annotate_commits(project, commits, buggy)
    log_debug(project, 'Commits annotated in {0[0]} seconds\n', (time() - T, ))
    save_in_file(annotated, 'commits_changes.json', directory)
Example #7
def get_counts_years(data):
    for dev, values in data.items():
        for y, extensions in values.items():
            ext = dict()
            total = len(extensions)  # number of extension occurrences that year
            for e in extensions:
                if ext.get(e) is None:
                    ext.update({e: extensions.count(e)})
            data[dev][y] = {'extensions': ext, 'sum': total}
    if test:
        save_in_file(data, 'test_years_count.json', 'data/test')
    return data
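The counting loop is equivalent to collections.Counter; a minimal sketch:

from collections import Counter

extensions = ['.py', '.py', '.md']
print(dict(Counter(extensions)))  # {'.py': 2, '.md': 1} -- the 'extensions' field
print(len(extensions))            # 3 -- the 'sum' field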
Example #8
def report_results(errors, project, name):
    good = name + ' -- OK'
    bad = name + ' -- Errors: {0[0]}'
    if len(errors) == 0:
        log_info(project, good, None)
    else:
        if len(errors) > 3 and name == 'test_buggy_duplicates':
            t = path.join('data', project, 'test')
            f = name + '_buggy.json'
            save_in_file(list(errors), f, t)
            mssg = '...... See the file for the list to examine'
            log_info(project, bad, (mssg, ))
        else:
            log_info(project, bad, (errors, ))
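The '{0[0]}' placeholders used throughout suggest the logging helpers apply str.format to a single args tuple; a minimal stand-in for that convention (log_info itself is not shown):

def log_info(project, template, args):  # hypothetical stand-in
    print(template if args is None else template.format(args))

log_info('proj', 'test_buggy_duplicates -- Errors: {0[0]}', (['e1', 'e2'],))
# test_buggy_duplicates -- Errors: ['e1', 'e2']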
Example #9
def report_results(errors, project, name):
    good = name + ' -- OK\n'
    bad = name + ' -- The following fixes need extra verification: {0[0]}. There are {0[1]} items to verify.\n'
    if len(errors) == 0:
        log_info(project, good, None)
    else:
        if len(errors) > 10:
            t = path.join('data', project, 'test')
            f = name + '_fixes.json'
            save_in_file(list(errors), f, t)
            show = list(errors)[:3]
            mssg = str(show) + '...... See the file for more'
            log_info(project, bad, (mssg, len(errors)))
        else:
            log_info(project, bad, (errors, len(errors)))
Example #10
def get_unique_extensions(dates):
    """Get unique extensions."""
    uniques = dict()
    for day in dates:
        # 'seen' is reset per day so a day is recorded at most once per extension
        seen = dict()
        for ext in day[1]:
            if seen.get(ext) is None:
                seen.update({ext: list()})
            if uniques.get(ext) is None:
                uniques.update({ext: [list(day)]})
                seen.get(ext).append(list(day))
            else:
                if day not in seen.get(ext):
                    seen.get(ext).append(list(day))
                    uniques.get(ext).append(list(day))
    if test:
        save_in_file(uniques, 'test_uniques.json', 'data/test')
    return uniques
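A made-up input showing the resulting shape; each day is a [date, extensions] pair:

dates = [['2016-01-01', ['.py', '.py', '.md']],
         ['2016-01-02', ['.py']]]
# get_unique_extensions(dates) would return:
# {'.py': [['2016-01-01', ['.py', '.py', '.md']],
#          ['2016-01-02', ['.py']]],
#  '.md': [['2016-01-01', ['.py', '.py', '.md']]]}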
Example #11
def select_devs():
    authors = get_authors()
    r = ranges(authors)
    log_debug(None, r, None)
    ratio_limit = 0
    args = argparser.parse_args()
    if args.l is not None:
        ratio_limit = float(args.l)
    good, bad = select(ratio_limit, authors)
    if test:
        save_in_file({
            'good': good,
            'bad': bad
        }, 'selected_test.json', 'data/test')
    save_in_file(
        {
            'good': [x['dev'] for x in good],
            'bad': [x['dev'] for x in bad]
        }, 'selected.json', 'data')
Example #12
def get_fix_changes(project):
    """
        Get deleted and inserted lines for each fix
    """
    T = time()
    directory = path.join('data', project)
    differences = dict()
    for fix in load_fixes(directory):
        diffs = get_commit_differences(fix, project, False)
        if diffs is None:
            continue
        commit = get_short(list(diffs.keys())[0], project)
        if differences.get(commit) is not None:
            continue
        differences.update({commit: diffs})
    log_debug(project, 'Fix changes done in {0[0]} seconds',
              (time() - T, ))
    if test:
        save_in_file(differences, 'test_buggy_fix_diffs.json',
                     get_test_dir(directory))
    return list(differences.values())
Example #13
def get_data(authors, g):
    """Get extensions data per period"""
    days = dict()
    devs = dict()
    test_overall = list()
    for dev, dates in authors.items():
        days[dev] = list()
        for date, info in dates.items():
            exts = list()
            day = [date, exts]
            for c in info['commits']:
                for sha, files in c.items():
                    fs = [x['new'] for x in files]
                    extensions = extract_extentions(fs)
                    exts.extend(extensions)
                    test_overall.extend(extensions)
            if len(day[1]) > 0:
                days[dev].append(day)
            # devs
            if devs.get(dev) is None:
                devs.update({dev: list(day[1])})
            else:
                devs[dev].extend(day[1])
    years = get_years_data(days)
    nd = 'days_' + g + '.json'
    save_in_file(days, nd, 'data')
    if test:
        ny = 'test_years_' + g + '.json'
        to = 'test_overall_' + g + '.json'
        save_in_file(years, ny, 'data/test')
        save_in_file(test_overall, to, 'data/test')
    return {'days': days, 'years': years, 'devs' : devs}
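extract_extentions is not shown; a plausible stand-in, assuming it maps file paths to their extensions:

import os

def extract_extentions(filenames):  # hypothetical stand-in for the repo helper
    # keep only paths that actually have an extension
    exts = (os.path.splitext(f)[1] for f in filenames)
    return [e for e in exts if e]

print(extract_extentions(['a/b.py', 'README', 'c.md']))  # ['.py', '.md']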
Example #14
def devs_table():
    """
    Create a csv file summing up the data of each dev for each repo separately - dev email, total commits, total lines, ratio, daily ratio, commit ratio
    :return: None
    """
    repos = get_repos()
    # todo: should we do it one for all repos?
    for repo in repos:
        data = list()
        data.append([
            'Author', 'Line Ratio', 'Daily Line Ratio', 'Lines',
            'Commit Ratio', 'Commits'
        ])
        ratios_dir = os.path.join('data', repo, 'authors_line_ratio.json')
        commit_dir = os.path.join('data', repo, 'authors_commits_ratio.json')
        authors = load_data(ratios_dir)
        commits = load_data(commit_dir)
        for author in authors:
            d = [
                author['dev'], author['commits']['ratio'],
                author['commits']['daily_ratio'], author['commits']['sum']
            ]
            c = [(x['commits']['buggy'], x['commits']['good']) for x in commits
                 if x['dev'] == author['dev']]
            if c[0][1] == 0:
                # avoid division by zero when the dev has no good commits
                d.append(1)
            else:
                d.append(c[0][0] / c[0][1])
            d.append(c[0][0] + c[0][1])
            data.append(d)
        # save_in_file with an empty list presumably ensures the target path exists
        save_in_file([], 'authors.csv', os.path.join('data', repo))
        # newline='' keeps csv.writer from inserting blank rows on Windows
        with open(os.path.join('data', repo, 'authors.csv'), 'w', newline='') as myFile:
            writer = csv.writer(myFile)
            writer.writerows(data)
        log_debug(repo, 'done', None)
Example #15
def compare_weeks_daily_sums():
    """Compare weekly entropy values with the sums of daily per week. Save the results."""
    days = {'good': load_data('data/days_good.json'), 'bad': load_data('data/days_bad.json')}
    weeks = {'good': get_weeks(days['good']), 'bad': get_weeks(days['bad'])}
    if test:
        save_in_file(weeks, 'test_weekly.json', 'data/test')
    weeks_entropy = weekly_entropy(weeks)
    save_in_file(weeks_entropy, 'entropy_weekly.json', 'data')
    daily_entropies = load_data('data/entropy_daily.json')
    weeks_sums_daily = weekly_entropy_sum_days(daily_entropies)
    # save_in_file(weeks_sums_daily, 'entropy_weeks_sums_daily.json', 'data')
    weeks_analyzed, summary = analyze_weekly(weeks_entropy, weeks_sums_daily)  # summary is unused here
    save_in_file(weeks_analyzed, 'entropy_weeks_analyzed.json', 'data')
Example #16
def get_focus():
    os.makedirs('data/test', exist_ok=True)
    groups = set_up_groups()
    fpe = get_ext_file_nums(groups) # files per extension
    save_in_file(groups, 'entropy_groups.json', 'data')
    ratios = load_ratios()
    T1 = time()
    ## focus analysis - daily, weekly, yearly, 4 years
    analyze_extensions(groups, ratios)
    compare_weeks_daily_sums()
    ## focus analysis per extension
    ext = extensions_entropies()
    save_in_file(ext, 'entropy_extensions.json', 'data')
    average_ext_number(ext)
    distribution = extension_distribution(ext)
    save_in_file(distribution, 'entropy_ext_distribution.json', 'data')
    get_ext_ranges(ext)
    popular = compare_most_popular(ext)
    save_in_file(popular, 'entropy_exts_popular.json', 'data')
    get_extremes_avg(ext, fpe)
    # analyze_group_exts()
    log_debug(None, 'Extensions analyzed in {0[0]} seconds', (time() - T1,))
Example #17
def save_issues(owner, project, directory):
    bs = download_issues(owner, project)
    save_in_file(bs, 'issues.json', directory)
Example #18
def combine_authors():
    combined = combine()
    save_in_file(combined, 'authors_combined.json', 'data')
    authors = get_ratios(combined)
    save_in_file(authors, 'authors_ratio.json', 'data')
Example #19
def get_labels(owner, project, directory):
    labels = download_labels(owner, project)
    rated = rate_labels(labels, project)
    save_in_file(rated, 'labels.json', directory)
Example #20
def extract_fixes(directory, project):
    links = analyze_syntax_commits(directory, project)
    fixes = analyze_semantics_commits(links, directory, project)
    save_in_file(list(fixes), 'fixes.json', directory)
Example #21
def save_commits(project, repo, directory):
    commits = extract_commits_basic(project, repo)
    save_in_file(commits, 'commits.json', directory)
Example #22
def analyze_extensions(groups, ratios):
    """Save the focus entropy results for time period."""
    entropies = {
        'good' : None,
        'bad' : None
    }
    daily = {
        'good' : list(),
        'bad' : list()
    }
    yearly = {
        'good': list(),
        'bad': list(),
        'all' : list()
    }
    per_year = {
        year: {'good': list(), 'bad': list()}
        for year in (2014, 2015, 2016, 2017)
    }
    per_dev = {
        'good' : list(),
        'bad' : list()
    }
    ranges_vis = {
        'categories' : list(),
        'good' : list(),
        'bad' : list()
    }
    focus_level = {
        'good' : None,
        'bad' : None
    }
    entr_devs = {
        'good' : None,
        'bad' : None
    }
    for g, authors in groups.items():
        T = time()
        data = get_data(authors, g)
        days = data.get('days')
        years = data.get('years')
        devs = data.get('devs')
        years_counted = get_counts_years(years)
        devs_counted, devs_sums = get_counts_obj(devs)
        log_debug(None, 'Data prepared in {0[0]}', (time()-T,))
        daily[g] = daily_entropy(days, g)
        ye = yearly_entropy(years_counted, g, per_year)
        yearly[g] = ye
        yearly['all'].extend(ye)
        per_dev[g], pd = per_dev_entropy(devs_counted, devs_sums, ratios)
        ranges = get_ranges(per_dev[g])
        # get_ranges evidently returns (range summary, per-range values,
        # range categories, focus level) -- inferred from the uses below
        entropies[g] = ranges[0]
        ranges_vis['categories'] = ranges[2]
        ranges_vis[g] = ranges[1]
        focus_level[g] = ranges[3]
        log_info(None, 'Ranges for {0[0]} devs: {0[1]}', (g, entropies[g]))
        entr_devs[g] = pd
    log_info(None, 'Focus levels are {0[0]}', (focus_level,))
    save_in_file(daily, 'entropy_daily.json', 'data')
    save_in_file(yearly, 'entropy_yearly.json', 'data')
    save_in_file(per_year, 'entropy_per_year.json', 'data')
    save_in_file(per_dev, 'entropy_per_dev.json', 'data')
    save_in_file(entr_devs, 'entropy_all.json', 'data')
    save_in_file(entropies, 'entropy_ranges.json', 'data')
    save_in_file(ranges_vis, 'entropy_ranges_vis.json', 'data')
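The entropy helpers (daily_entropy, yearly_entropy, per_dev_entropy) are not shown; focus entropy over extension counts is presumably the standard Shannon entropy, sketched here:

import math

def shannon_entropy(counts):
    # low entropy = work focused on few extensions, high = spread out
    total = sum(counts)
    return -sum((c / total) * math.log2(c / total) for c in counts if c)

print(shannon_entropy([5, 5]))  # 1.0 -- evenly split between two extensions
print(shannon_entropy([7, 1]))  # ~0.54 -- strongly focused on one extension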
Example #23
def prep_data(project):
    directory = path.join('data', project)
    commits = load_commits(directory)
    author_data = by_author(commits, project)
    save_in_file(author_data, 'authors.json', directory)
Example #24
def get_fix_inducing(fixes, project):
    """
        Get commits and line numbers of the fix-inducing changes.
        fic is a collection of { buggy commit : lines considered buggy }
        while blame_coll contains full buggy lines for troubleshooting and verification.
        ./data/<project>/buggy_changes.json file contain the commit (and the line numbers) that introduced a bug.
        For each fix, 'git blame' is run on parents for the files modified and lines deleted.
        Output of blame is parsed to indicate lines inserted in each fix-inducing commit corresponding to a fix.
    """
    if test:
        blame_coll = list()
        test_lines = list()
    fic = {}  # fix inducing commits
    T = time()
    repo_dir = path.join('repos', project)
    directory = path.join('data', project)
    for fix in fixes:
        if fix is None:
            continue
        commit = list(fix.keys())[0]
        for fo in fix.get(commit):
            # Note: We may get two parents in case of a merge.
            parents = get_commit_parents(commit, '1', repo_dir)
            lines = fo['deleted']  # lines deleted in a fix
            if lines is None or len(lines) == 0:
                continue
            if fo['old'] is None:
                # no pre-image file path to blame against
                continue
            line_groups = get_lines(lines)
            for lines_str in line_groups:
                line_start = int(lines_str.split(',')[0])
                for parent in parents[0].split(" "):
                    cmd = [
                        'git', 'blame', '-f', '-L', lines_str, parent, '--',
                        fo['old']
                    ]
                    completed = subprocess.run(cmd,
                                               stdout=subprocess.PIPE,
                                               universal_newlines=True,
                                               cwd=repo_dir)
                    blames = str(completed.stdout).split('\n')
                    if test:
                        new_blames = list()
                        test_l = list()
                    # save only the lines we are interested in
                    line = line_start
                    for blame in blames:
                        if line in lines:
                            buggy_commit = blame.strip().split(' ')[0]
                            if len(buggy_commit) < 2:
                                continue
                            if fic.get(buggy_commit) is not None:
                                if (line, fo['old']
                                    ) not in fic.get(buggy_commit):
                                    fic.get(buggy_commit).append(
                                        (line, fo['old']))
                            else:
                                # Note: we may have dup line numbers from different files.
                                ls = list()
                                fix_inducing = {buggy_commit: ls}
                                ls.append((line, fo['old']))
                                fic.update(fix_inducing)
                            if test:
                                new_blames.append(blame)
                                test_l.append((buggy_commit, line, fo['old']))
                        line += 1
                    if test:
                        blame_coll.append({
                            commit: new_blames,
                            'lines': lines,
                            'file': fo['old'],
                            "parent": parent
                        })
                        test_lines.append({commit: test_l})
    log_debug(project, 'Blames done in {0[0]} seconds\n', (time() - T, ))
    save_in_file(fic, 'buggy_changes.json', directory)
    if test:
        test_dir = get_test_dir(directory)
        save_in_file(blame_coll, 'test_buggy_lines.json', test_dir)
        save_in_file(test_lines, 'test_buggy_lines_dups.json', test_dir)
        log_debug(project, 'Number of buggy commits: {0[0]}.\n', (len(fic), ))
    return fic
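A sketch of the parsing above with a made-up blame line; 'git blame -f -L <range> <parent> -- <file>' prints the blamed commit as the first token of each output line:

blame_line = 'abc1234 src/parser.py (Jane Doe 2016-03-01 12:00:00 +0000 17) return x'
buggy_commit = blame_line.strip().split(' ')[0]
print(buggy_commit)  # abc1234
# git marks boundary commits with a '^' prefix, which this parsing keeps
# as part of the hash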