Esempio n. 1
0
def analayze_smell_removal(stats_file):
    """Build a per-smell removal report and write it to ``stats_file``.

    For every smell in ``SINGLE_SMELL`` the previous-year column
    (``PREV_PREFIX + smell``) is compared with the current-year column
    (``CUR_PREFIX + smell``) using ``features_confusion_matrix_analysis``.
    ``removal_probability`` is then derived as ``1 - precision``.

    Args:
        stats_file: Destination path of the CSV report (written without
            the row index).

    Returns:
        The report data frame, ordered by descending removal probability
        and, for ties, by ascending feature name.
    """
    file_keys = ['repo_name', 'full_file_name']

    paired_df = build_two_years_df(get_per_year_dataset(binary=True),
                                   file_keys,
                                   metrics=SINGLE_SMELL,
                                   time_column='year')

    per_smell = {
        smell: features_confusion_matrix_analysis(
            paired_df,
            first_metric=PREV_PREFIX + smell,
            second_metric=CUR_PREFIX + smell,
            keys=file_keys)
        for smell in SINGLE_SMELL
    }

    # One row per smell; the smell name moves from the index to 'feature'.
    report = pd.DataFrame.from_dict(per_smell, orient='index')
    report = report.reset_index()
    report = report.rename(columns={'index': 'feature'})
    report['removal_probability'] = 1 - report.precision

    report_columns = [
        'feature',
        'removal_probability',
        'true_positives',
        'false_positives',
        'false_negatives',
    ]
    report = report[report_columns]
    report = report.sort_values(['removal_probability', 'feature'],
                                ascending=[False, True])
    report.to_csv(stats_file, index=False)

    return report
Esempio n. 2
0
def analayze_smells_cochange(stability_stats_file):
    """Run the co-change analysis of smells against ``file_ccp``.

    Every metric (``file_ccp`` and each smell in ``SINGLE_SMELL``) is
    registered with the same improvement predicate: the value counts as
    improved when it is strictly lower than in the previous period.

    Args:
        stability_stats_file: Destination path of the CSV report.

    Returns:
        The statistics data frame, ordered by descending ``precision_lift``
        and, for ties, by ascending feature name.
    """
    keys = ['repo_name', 'full_file_name']

    metric_per_year_df = get_per_year_dataset(binary=False)

    def the_lower_the_better(prev, cur):
        """Return True when the metric decreased between the two periods."""
        return prev > cur

    metrics_dict = {'file_ccp': the_lower_the_better}
    metrics_dict.update(
        {smell: the_lower_the_better for smell in SINGLE_SMELL})

    stats = cochange_analysis(metric_per_year_df,
                              metrics_dict,
                              keys,
                              control_variables=[])

    stats_df = pd.DataFrame.from_dict(stats, orient='index')
    stats_df = (stats_df.reset_index()).rename(columns={'index': 'feature'})
    stats_df = stats_df.sort_values(['precision_lift', 'feature'],
                                    ascending=[False, True])
    # NOTE(review): the positional index is written as a leading unnamed
    # column; kept as-is so existing readers of this file are unaffected.
    stats_df.to_csv(stability_stats_file)

    return stats_df
Esempio n. 3
0
def run_analyze_means():
    """Run the relative-mean analysis on the ``MAX_YEAR`` rows of the dataset.

    The analysis covers every concept in ``CONCEPTS_DICT`` and every smell in
    ``SINGLE_SMELL``; ``analyze_relative_mean`` is given
    ``DATA_PATH/RELATIVE_MEANS_FILE`` as its output file.

    Returns:
        Whatever ``analyze_relative_mean`` returns (previously this result
        was computed and then discarded).
    """
    df = get_per_year_dataset()
    df = df[df.year == MAX_YEAR]

    stats_df = analyze_relative_mean(df,
                                     concepts=list(CONCEPTS_DICT.keys()),
                                     metrics=SINGLE_SMELL,
                                     output_file=join(DATA_PATH,
                                                      RELATIVE_MEANS_FILE))
    return stats_df
Esempio n. 4
0
def evaluate_smell_monotonocity():
    """Evaluate monotonicity of the dataset columns against every concept.

    Only the ``MAX_YEAR`` rows are used, and the columns listed in
    ``NON_PREDICTIVE_FEATURES`` are excluded from the evaluation.

    Returns:
        The result of ``evaluate_sides_monotonocity_vs_concept``.
    """
    latest_df = get_per_year_dataset()
    latest_df = latest_df[latest_df.year == MAX_YEAR]

    # Set difference: NON_PREDICTIVE_FEATURES must itself be a set-like object.
    candidate_columns = set(latest_df.columns) - NON_PREDICTIVE_FEATURES
    concepts = list(CONCEPTS_DICT.keys())

    return evaluate_sides_monotonocity_vs_concept(
        df=latest_df,
        relevant_columns=candidate_columns,
        concepts_list=concepts,
        output_file_template=MONOTONE_PATH_TEMPLATE)
Esempio n. 5
0
def analayze_smells_stability(stability_stats_file):
    """Compute stability statistics for every smell and save them as CSV.

    Args:
        stability_stats_file: Destination path of the CSV. The keys of the
            mapping returned by ``analyze_stability`` become the row index.
    """
    per_year = get_per_year_dataset(binary=False)

    stability_args = dict(keys=['repo_name', 'full_file_name'],
                          metrics=SINGLE_SMELL,
                          time_column='year',
                          minimal_time=EARLIEST_ANALYZED_YEAR,
                          control_variables=[])
    stability = analyze_stability(per_year, **stability_args)

    report = pd.DataFrame.from_dict(stability, orient='index')
    report.to_csv(stability_stats_file)
Esempio n. 6
0
def file_by_author_twin_analysis():
    """Compare files written by the same single author within a repository.

    Rows of the ``MAX_YEAR`` dataset with exactly one author are paired with
    each other by ``compare_twin_behaviours`` on (repo_name, Author_email).
    The comparison frame is saved, and then for every concept in
    ``CONCEPTS_DICT`` two CSV reports are written under ``DATA_PATH``:

    * the correlation of every column with the concept comparison column
      (``AUTHOR_TWIN_PEARSON_TEMPLATE``), and
    * the confusion-matrix statistics of every smell comparison column
      against the concept comparison column (``AUTHOR_TWIN_CM_TEMPLATE``).
    """
    df = get_per_year_dataset()
    df = df[df.year == MAX_YEAR]
    single_author_files = df[df.authors == 1]

    keys = ['repo_name', 'Author_email']

    def filtering_function(x):
        """Return True for a pair whose two sides are the same file."""
        return x.full_file_name_x == x.full_file_name_y

    def comparision_function(first, second):
        """Return ``second > first`` for numeric pairs, otherwise None."""
        if isinstance(first, numbers.Number) \
                and isinstance(second, numbers.Number):
            return second > first
        return None

    comparision_columns = (SINGLE_SMELL + ['full_file_name']
                           + list(CONCEPTS_DICT.keys()))

    comp_df = compare_twin_behaviours(
        first_behaviour=single_author_files,
        second_behaviour=single_author_files,
        keys=keys,
        comparision_columns=comparision_columns,
        comparision_function=comparision_function,
        filtering_function=filtering_function)

    twins_file = 'file_by_author_twin_analysis_concepts.csv'
    comp_df.to_csv(os.path.join(DATA_PATH, twins_file))

    # The correlation matrix does not depend on the concept, so it is
    # computed once instead of once per loop iteration.
    # NOTE(review): assumes compute_confusion_matrics does not add columns
    # to comp_df between iterations - confirm.
    correlations = comp_df.corr()
    smell_comparison_columns = [i + COMPARISON_SUFFIX for i in SINGLE_SMELL]

    for concept in CONCEPTS_DICT.keys():
        concept_column = concept + COMPARISON_SUFFIX

        Pearson_df = pd.DataFrame(correlations[concept_column]).reset_index()
        Pearson_df.columns = ['feature', 'Pearson']
        Pearson_df = Pearson_df.sort_values('Pearson', ascending=False)
        Pearson_df.to_csv(
            os.path.join(DATA_PATH,
                         AUTHOR_TWIN_PEARSON_TEMPLATE.format(concept=concept)),
            index=False)

        stats = compute_confusion_matrics(df=comp_df,
                                          concept=concept_column,
                                          columns=smell_comparison_columns,
                                          keys=keys)
        stats_df = pd.DataFrame.from_dict(stats, orient='index')
        stats_df = (stats_df.reset_index()).rename(
            columns={'index': 'feature'})
        # NOTE(review): drops the last 4 characters of the feature name,
        # presumably to strip COMPARISON_SUFFIX - confirm the suffix is
        # exactly 4 characters long.
        stats_df['feature'] = stats_df['feature'].map(lambda x: x[:-4])
        stats_df = stats_df.sort_values(['precision_lift', 'feature'],
                                        ascending=[False, True])
        stats_df.to_csv(
            os.path.join(DATA_PATH,
                         AUTHOR_TWIN_CM_TEMPLATE.format(concept=concept)),
            index=False)
Esempio n. 7
0
def ccp_stability():
    """Print the stability analysis of the CCP-related metrics.

    Two columns are (re)computed as integers before the analysis:
    ``worse_10_hs`` is cast to ``int`` and ``reduced_risk`` is 1 when
    ``corrective_rate`` is at most 0.05, otherwise 0. The analysis is run
    with ``min_cnt_column='commits'`` and ``min_cnt_threshold=10``.
    """
    file_keys = ['repo_name', 'full_file_name']
    tracked_metrics = ['file_ccp', 'worse_10_hs', 'reduced_risk']

    per_year = get_per_year_dataset()
    per_year['worse_10_hs'] = per_year.worse_10_hs.map(int)
    per_year['reduced_risk'] = per_year.corrective_rate.map(
        lambda rate: int(rate <= 0.05))

    stability = analyze_stability(per_year,
                                  keys=file_keys,
                                  metrics=tracked_metrics,
                                  time_column='year',
                                  minimal_time=EARLIEST_ANALYZED_YEAR,
                                  control_variables=[],
                                  min_cnt_column='commits',
                                  min_cnt_threshold=10)
    print(stability)
Esempio n. 8
0
def build_dataset():
    """Aggregate smell counts per group and join them with the concept.

    The non-binary per-year dataset gets a ``SMELLS_SUM`` column (row-wise
    sum of all ``SINGLE_SMELL`` columns), is grouped by ``keys`` and merged
    with the ``keys`` and ``CONCEPT`` columns of
    ``repo_properties_per_year.csv``. The result is written to
    ``DATA_PATH/SMELLS_PER_REPO_FILE`` without the row index.

    NOTE(review): ``keys`` is not defined in this function; it is presumably
    a module-level name - confirm.

    Returns:
        The merged per-group data frame.
    """
    per_year = get_per_year_dataset(binary=False)
    per_year[SMELLS_SUM] = per_year[SINGLE_SMELL].sum(axis=1)

    # Total smells and number of files first, then the individually
    # tracked smells; insertion order matches the original mapping.
    aggregations = {SMELLS_SUM: 'sum', 'file': 'count'}
    for smell in ('MethodLength', 'NPathComplexity',
                  'AvoidInlineConditionals', 'NestedIfDepth',
                  'VisibilityModifier'):
        aggregations[smell] = 'sum'

    grouped = per_year.groupby(keys, as_index=False).agg(aggregations)

    properties = pd.read_csv(join(DATA_PATH, 'repo_properties_per_year.csv'))
    properties = properties[keys + [CONCEPT]]

    merged = pd.merge(grouped, properties, on=keys)
    merged.to_csv(join(DATA_PATH, SMELLS_PER_REPO_FILE), index=False)

    return merged
Esempio n. 9
0
def run_basic_models(concept):
    """Train a set of basic classifiers that predict a concept from smells.

    The target column ``CONCEPT`` is True when the value of ``concept`` is
    at or below its 25% quantile. The features are the smells returned by
    ``potential_smells`` for the concept's joint statistics file. Each model
    is built with ``build_basic_model`` and then exported as figures and SQL.

    Args:
        concept: Name of the dataset column used to derive the target.

    Returns:
        A tuple of the last model built (the random forest, given the
        order of the model mapping) and the data frame used for training.
    """
    load_started = time.time()
    df = get_per_year_dataset()
    # The quantile is taken over all years, before restricting to one year.
    q25 = df[concept].quantile(0.25)
    df = df[df.year == MAX_YEAR - 1]
    df = df.fillna(NUMERIC_NULL)

    df[CONCEPT] = df[concept].map(lambda value: value <= q25)
    stats_path = join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept))
    smells = potential_smells(pd.read_csv(stats_path), concept)

    df = df[smells + [CONCEPT]]

    print(risk_predictive_columns(df))
    print("Load data time", time.time() - load_started)

    class_weight = {1: 1, 0: 1}

    def weighted_tree(**params):
        """Create a decision tree that uses the shared class weights."""
        return DecisionTreeClassifier(class_weight=class_weight, **params)

    # NOTE: the 'ms50' part of the names does not match min_samples_leaf=200;
    # the names are left alone because they determine the output file names.
    classifiers = {
        'Tree_ms50_md3': weighted_tree(min_samples_leaf=200, max_depth=3),
        'Tree_default': weighted_tree(),
        'Tree_ms50': weighted_tree(min_samples_leaf=200),
        'Tree_md3': weighted_tree(max_depth=3),
        'RandomForest': RandomForestClassifier(n_estimators=10,
                                               min_samples_leaf=50),
    }

    for model_name, classifier in classifiers.items():
        print(model_name)
        fit_started = time.time()

        performance_file = os.path.join(PERFORMANCE_PATH,
                                        '{}.json'.format(model_name))
        regressor, _performance = build_basic_model(
            df,
            concept=CONCEPT,
            classifier=classifier,
            model_file_name='{}.pkl'.format(model_name),
            performance_file=performance_file)

        if 'Tree' in model_name:
            dot_path = os.path.join(FIGURES_PATH, '{}.dot'.format(model_name))
            png_path = os.path.join(FIGURES_PATH, '{}.png'.format(model_name))
            sql_path = os.path.join(MODELS_PATH, '{}.sql'.format(model_name))

            plot_tree(regressor,
                      dot_file_path=dot_path,
                      png_file_path=png_path,
                      feature_names=smells)
            tree_to_sql(tree=regressor,
                        feature_names=smells,
                        function_name="tree",
                        output_file=sql_path)
        else:
            forest_figures_prefix = os.path.join(FIGURES_PATH, 'rf1')
            plot_random_forest(regressor,
                               dot_files_prefix=forest_figures_prefix,
                               png_files_prefix=forest_figures_prefix,
                               feature_names=smells)
            random_forest_to_sql(regressor,
                                 feature_names=smells,
                                 function_name_prefix="rf",
                                 output_file_prefix=os.path.join(
                                     MODELS_PATH, 'rf'))

        print("Model running time", time.time() - fit_started)

    return regressor, df
Esempio n. 10
0
def run_feature_evaluation():
    """Evaluate features on the ``MAX_YEAR`` rows of the per-year dataset.

    Runs ``evaluate_features`` on the filtered frame and then
    ``evaluate_features_length_control`` for the ``SINGLE_SMELL`` features.
    """
    per_year = get_per_year_dataset()
    latest_year_df = per_year[per_year.year == MAX_YEAR]

    evaluate_features(latest_year_df)
    evaluate_features_length_control(df=latest_year_df,
                                     features=SINGLE_SMELL)
Esempio n. 11
0
def model_groups_influence():
    """Print, per length group, how smell-free files relate to quality.

    For each length group selection (short, medium, long, and all three
    together) and each concept in ``CONCEPTS_DICT`` the function computes:

    * the hit rate: share of files with none of the concept's potential
      smells,
    * the precision lift of "no smells" against high quality (concept value
      at or below its 25% quantile within the group),
    * the precision lift of "no smells" against low quality (concept value
      at or above its 75% quantile within the group).

    The resulting table is printed through ``df_to_latex_table``.
    """
    SMELLS_COUNT = 'smells_count'
    # Despite the column name, the value is True for files with zero smells.
    CLASSIFIER = 'has_smells'
    LOW_QUALITY = 'low_quality'
    HIGH_QUALITY = 'high_quality'

    # The dataset and the per-concept smell lists do not depend on the
    # length group, so they are loaded once instead of once per group.
    base_df = get_per_year_dataset()
    base_df = base_df[base_df.year == MAX_YEAR - 1]

    smells_by_concept = {}
    for concept in CONCEPTS_DICT.keys():
        stats = pd.read_csv(
            join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
        smells_by_concept[concept] = potential_smells(stats, concept)

    length_group_selections = [['short'], ['medium'], ['long'],
                               ['short', 'medium', 'long']]

    for length_groups in length_group_selections:
        print(length_groups)
        # Copy so that the helper columns are added to an independent frame
        # and not to a slice of the shared base frame.
        df = base_df[base_df['length_group'].isin(length_groups)].copy()

        rows = []
        for concept in CONCEPTS_DICT.keys():
            metric_name = CONCEPT_NAMES[concept]
            smells = smells_by_concept[concept]

            df[SMELLS_COUNT] = df[smells].sum(axis=1)
            df[CLASSIFIER] = df[SMELLS_COUNT].map(lambda x: x == 0)
            hit_rate = df[CLASSIFIER].mean()

            q25 = df[concept].quantile(0.25)
            df[HIGH_QUALITY] = df[concept].map(lambda x: x <= q25)
            high_cm = pair_analysis(df,
                                    first_metric=CLASSIFIER,
                                    second_metric=HIGH_QUALITY)

            q75 = df[concept].quantile(0.75)
            df[LOW_QUALITY] = df[concept].map(lambda x: x >= q75)
            low_cm = pair_analysis(df,
                                   first_metric=CLASSIFIER,
                                   second_metric=LOW_QUALITY)

            # The row is appended only once it is complete.
            rows.append([
                metric_name,
                hit_rate,
                high_cm['precision_lift'],
                low_cm['precision_lift'],
            ])

        features_df = pd.DataFrame(
            rows,
            columns=['Metric', 'Hit Rate', 'High Quality',
                     'Low Quality']).sort_values('Metric')

        print()
        # Raw string: same text as before, without the invalid escape.
        df_to_latex_table(
            features_df,
            r'\label{tab:group_smell_influence} Smells Groups Influence ')