Beispiel #1
0
def two_years_analysis(two_years_df, first_metric, second_metric, key):
    """Print co-change statistics for two metric columns of a frame.

    The rows of ``two_years_df`` are grouped by the pair
    ``(first_metric, second_metric)`` and the ``key`` column is counted per
    group. The counts are handed to ``ConfusionMatrix`` with ``first_metric``
    as the classifier and ``second_metric`` as the concept, and the matrix
    summary, sample count, accuracy, precision, recall and their lifts are
    written to stdout.

    Args:
        two_years_df: Frame offering ``groupby(...).agg(...)`` (presumably a
            pandas DataFrame -- confirm against the caller).
        first_metric: Name of the column used as the classifier.
        second_metric: Name of the column used as the concept.
        key: Name of the column that is counted within each group.

    Returns:
        None. All results are printed.
    """
    print()
    print("Co-change", first_metric, second_metric)

    metric_pair = [first_metric, second_metric]
    pair_counts = two_years_df.groupby(metric_pair, as_index=False).agg(
        {key: 'count'})

    print(pair_counts)

    matrix = ConfusionMatrix(g_df=pair_counts,
                             classifier=first_metric,
                             concept=second_metric,
                             count=key)

    print(matrix.summarize())
    print()
    print("Samples", matrix.samples())
    print("Both metrics increment match", matrix.accuracy())

    precision = matrix.precision()
    precision_lift = matrix.precision_lift()
    print(second_metric, " improvement given ", first_metric, " improvement",
          precision, "lift", precision_lift)

    recall = matrix.recall()
    # Recall lift is recall / hit_rate - 1, routed through the ifnull and
    # safe_divide helpers (presumably guarding None and zero -- their
    # definitions are outside this block).
    recall_lift = ifnull(
        safe_divide(ifnull(matrix.recall()), matrix.hit_rate())) - 1
    print(first_metric, " improvement given ", second_metric, "improvement",
          recall, "lift", recall_lift)
    print()
Beispiel #2
0
def _pair_confusion_matrix(two_years, classifier, concept, show_counts=True):
    """Build the ConfusionMatrix of two columns of ``two_years``.

    Rows are grouped by ``(classifier, concept)`` and ``repo_name`` is counted
    per group; when ``show_counts`` is true the count table is printed before
    the matrix is constructed.
    """
    counts = two_years.groupby([classifier, concept],
                               as_index=False).agg({'repo_name': 'count'})
    if show_counts:
        print(counts)

    return ConfusionMatrix(g_df=counts,
                           classifier=classifier,
                           concept=concept,
                           count='repo_name')


def quality_and_speed_over_years(commits_per_user_file):
    """Print how year-over-year CCP changes relate to speed changes.

    Reads ``commits_per_user_file`` as CSV; it must provide the columns
    ``repo_name``, ``year``, ``corrective_commits_ratio`` and
    ``commits_per_above11_users``. Rows with ``year > 2014`` that belong to
    the repositories returned by ``get_valid_repos()`` are kept, rows with
    missing values are dropped, and every (repo, year) row is paired with the
    same repo's row of the previous year. For each pair, boolean flags mark
    whether the corrective commit ratio dropped / rose and whether
    ``commits_per_above11_users`` rose, and confusion-matrix statistics over
    several flag combinations are printed.

    Args:
        commits_per_user_file: Path (or anything ``pandas.read_csv`` accepts)
            of the per-repository, per-year CSV.

    Returns:
        None. All results are printed.
    """
    print("over the years ccp and speed change")
    valid_repos = get_valid_repos()[['repo_name']]
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year > 2014]
    df = pd.merge(users_per_project, valid_repos, on='repo_name')

    df = df[[
        'repo_name', 'year', 'corrective_commits_ratio',
        'commits_per_above11_users'
    ]].dropna()

    # Current-year view: remember which year precedes it so the two views can
    # be joined on (repo_name, prev_year).
    cur_df = df.assign(prev_year=df.year - 1).rename(
        columns={
            'year': 'cur_year',
            'corrective_commits_ratio': 'cur_corrective_commits_ratio',
            'commits_per_above11_users': 'cur_commits_per_above11_users'
        })

    # Previous-year view: the same rows, with 'year' acting as 'prev_year'.
    prev_df = df.rename(
        columns={
            'year': 'prev_year',
            'corrective_commits_ratio': 'prev_corrective_commits_ratio',
            'commits_per_above11_users': 'prev_commits_per_above11_users'
        })

    two_years = pd.merge(cur_df, prev_df, on=['repo_name', 'prev_year'])

    cur_ccp = two_years['cur_corrective_commits_ratio']
    prev_ccp = two_years['prev_corrective_commits_ratio']
    cur_speed = two_years['cur_commits_per_above11_users']
    prev_speed = two_years['prev_commits_per_above11_users']

    # A lower corrective commit ratio counts as an improvement, a higher
    # commits_per_above11_users counts as a speed improvement.
    two_years['improved_ccp'] = cur_ccp < prev_ccp
    two_years['hurt_ccp'] = cur_ccp > prev_ccp
    two_years['improved_speed'] = cur_speed > prev_speed

    # --- any improvement in ccp vs. any improvement in speed ---------------
    cm = _pair_confusion_matrix(two_years, 'improved_ccp', 'improved_speed')

    print(cm.summarize())
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision())
    # NOTE(review): raises ZeroDivisionError when tp + fn == 0.
    print("ccp improvement given speed improvement",
          cm.tp() / (cm.fn() + cm.tp()))

    # "Significant" variants require the change to exceed a fixed margin.
    two_years['sig_improved_ccp'] = cur_ccp < prev_ccp - 0.1
    two_years['sig_improved_speed'] = cur_speed > prev_speed + 10

    # --- significant ccp improvement vs. significant speed improvement -----
    cm = _pair_confusion_matrix(two_years, 'sig_improved_ccp',
                                'sig_improved_speed')
    print(cm.summarize())

    # --- significant ccp improvement vs. any speed improvement -------------
    # The matrix is rebuilt here so that the summary and the statistics below
    # describe the table printed for this grouping; previously the matrix of
    # the preceding grouping was reused and its summary printed a second time.
    cm = _pair_confusion_matrix(two_years, 'sig_improved_ccp',
                                'improved_speed')

    print(cm.summarize())
    print()
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision(), "lift",
          cm.precision_lift())
    # NOTE(review): unguarded division -- fails if hit_rate() is 0 or if
    # recall() does not return a number; confirm against ConfusionMatrix.
    print("ccp improvement given speed improvement", cm.recall(), "lift",
          cm.recall() / cm.hit_rate() - 1)
    print()

    # --- significant speed improvement vs. ccp getting worse ---------------
    # The count table for this grouping is intentionally not printed.
    cm = _pair_confusion_matrix(two_years,
                                'sig_improved_speed',
                                'hurt_ccp',
                                show_counts=False)

    print(cm.summarize())
    print()
    print("ccp hurt given significant speed improvement", cm.precision(),
          "lift", cm.precision_lift())
    print()