Beispiel #1
0
def bootstrap_diff(df, ccp_estimator, rounds, sample_size):
    """Bootstrap the gap between estimated and observed positive rate.

    Each round resamples ``df`` with replacement, builds a confusion
    matrix from the resample, and records the observed positive rate,
    the hit rate, the CCP estimate derived from the hit rate, and the
    difference (estimate minus observed).

    Parameters
    ----------
    df : pandas.DataFrame
        Labeled rows to resample.  NOTE(review): grouping uses the free
        names ``classifier``, ``concept`` and ``count`` — presumably
        module-level column-name constants; confirm they are defined.
    ccp_estimator : object
        Must expose ``estimate_positives(hit_rate)``.
    rounds : int
        Number of bootstrap iterations.
    sample_size : int
        Rows drawn (with replacement) per iteration.

    Returns
    -------
    pandas.DataFrame
        Columns ``['positive_rate', 'hit_rate', 'ccp', 'ccp_diff']``,
        one row per bootstrap round.
    """
    bootstrap_results = []
    for i in range(rounds):
        # Resample and build a confusion matrix on the resample.
        s1 = df.sample(sample_size, replace=True)
        bug_g = s1.groupby([classifier, concept],
                           as_index=False).agg({count: 'count'})
        bug_cm = ConfusionMatrix(g_df=bug_g,
                                 classifier=classifier,
                                 concept=concept,
                                 count=count)

        positive_rate = bug_cm.positive_rate()
        hit_rate = bug_cm.hit_rate()
        ccp = ccp_estimator.estimate_positives(hit_rate)
        ccp_diff = ccp - positive_rate

        bootstrap_results.append([positive_rate, hit_rate, ccp, ccp_diff])

        # Progress heartbeat every 100 rounds.
        if i % 100 == 0:
            print(f"finished {i}", datetime.datetime.now())

    return pd.DataFrame(
        bootstrap_results,
        columns=['positive_rate', 'hit_rate', 'ccp', 'ccp_diff'])
Beispiel #2
0
def quality_and_speed_over_years(commits_per_user_file):
    """Print how year-over-year quality (CCP) and speed changes co-occur.

    Joins per-user commit statistics (years after 2014) with the valid
    repositories, pairs each repository-year with the previous year, and
    prints confusion-matrix summaries relating CCP improvement to speed
    improvement — both for any improvement and for "significant"
    improvements (CCP drop > 0.1, speed gain > 10 commits per user).

    Parameters
    ----------
    commits_per_user_file : str
        Path to a CSV with at least ``repo_name``, ``year``,
        ``corrective_commits_ratio`` and ``commits_per_above11_users``.

    Returns
    -------
    None — output is printed.
    """
    print("over the years ccp and speed change")
    trep = get_valid_repos()
    trep = trep[['repo_name']]
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year > 2014]
    df = pd.merge(users_per_project, trep, on='repo_name')

    df = df[[
        'repo_name', 'year', 'corrective_commits_ratio',
        'commits_per_above11_users'
    ]]
    df = df.dropna()

    # Self-join each repo-year with its previous year: the "cur" copy
    # carries a prev_year key so merging on it pairs adjacent years.
    cur_df = df.copy()
    cur_df['prev_year'] = cur_df.year - 1
    cur_df = cur_df.rename(
        columns={
            'year': 'cur_year',
            'corrective_commits_ratio': 'cur_corrective_commits_ratio',
            'commits_per_above11_users': 'cur_commits_per_above11_users'
        })

    prev_df = df.copy()
    prev_df = prev_df.rename(
        columns={
            'year': 'prev_year',
            'corrective_commits_ratio': 'prev_corrective_commits_ratio',
            'commits_per_above11_users': 'prev_commits_per_above11_users'
        })

    two_years = pd.merge(cur_df,
                         prev_df,
                         left_on=['repo_name', 'prev_year'],
                         right_on=['repo_name', 'prev_year'])
    # Lower CCP (corrective-commit ratio) means better quality;
    # more commits per user means more speed.
    two_years[
        'improved_ccp'] = two_years.cur_corrective_commits_ratio < two_years.prev_corrective_commits_ratio
    two_years[
        'hurt_ccp'] = two_years.cur_corrective_commits_ratio > two_years.prev_corrective_commits_ratio
    two_years[
        'improved_speed'] = two_years.cur_commits_per_above11_users > two_years.prev_commits_per_above11_users

    g = two_years.groupby(['improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)

    cm = ConfusionMatrix(g_df=g,
                         classifier='improved_ccp',
                         concept='improved_speed',
                         count='repo_name')

    print(cm.summarize())
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision())
    print("ccp improvement given speed improvement",
          cm.tp() / (cm.fn() + cm.tp()))

    # "Significant" improvement thresholds: CCP drop > 0.1, speed
    # gain > 10 commits per user.
    two_years[
        'sig_improved_ccp'] = two_years.cur_corrective_commits_ratio < two_years.prev_corrective_commits_ratio - 0.1
    two_years[
        'sig_improved_speed'] = two_years.cur_commits_per_above11_users > two_years.prev_commits_per_above11_users + 10

    g = two_years.groupby(['sig_improved_ccp', 'sig_improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)

    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_ccp',
                         concept='sig_improved_speed',
                         count='repo_name')
    print(cm.summarize())

    g = two_years.groupby(['sig_improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)

    # BUG FIX: the stats below previously reused the matrix built for
    # ('sig_improved_ccp', 'sig_improved_speed'); build the matrix for
    # the grouping actually printed just above.
    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_ccp',
                         concept='improved_speed',
                         count='repo_name')
    print(cm.summarize())
    print()
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision(), "lift",
          cm.precision_lift())
    print("ccp improvement given speed improvement", cm.recall(), "lift",
          cm.recall() / cm.hit_rate() - 1)
    print()

    g = two_years.groupby(['sig_improved_speed', 'hurt_ccp'],
                          as_index=False).agg({'repo_name': 'count'})
    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_speed',
                         concept='hurt_ccp',
                         count='repo_name')

    print(cm.summarize())
    print()
    print("ccp hurt given significant speed improvement", cm.precision(),
          "lift", cm.precision_lift())
    print()
Beispiel #3
0
def two_years_analysis(two_years_df, first_metric, second_metric, key):
    """Print how increments of two boolean metrics co-occur.

    Groups ``two_years_df`` by the two metric columns, builds a
    confusion matrix treating ``first_metric`` as the classifier and
    ``second_metric`` as the concept (counting ``key``), and prints the
    matrix summary plus accuracy, precision/lift and recall/lift.

    Returns None — output is printed.
    """
    print()
    print("Co-change", first_metric, second_metric)
    counts = two_years_df.groupby(
        [first_metric, second_metric], as_index=False).agg({key: 'count'})

    print(counts)

    matrix = ConfusionMatrix(g_df=counts,
                             classifier=first_metric,
                             concept=second_metric,
                             count=key)

    print(matrix.summarize())
    print()
    print("Samples", matrix.samples())
    print("Both metrics increment match", matrix.accuracy())
    print(second_metric, " improvement given ", first_metric,
          " improvement", matrix.precision(), "lift",
          matrix.precision_lift())
    # Recall lift guarded against nulls / division by zero via helpers.
    recall_lift = ifnull(
        safe_divide(ifnull(matrix.recall()), matrix.hit_rate())) - 1
    print(first_metric, " improvement given ", second_metric,
          "improvement", matrix.recall(), "lift", recall_lift)
    print()