Code example #1
def print_smells_properties_table():
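    # For every quality concept, count the potential, robust, and almost-robust
    # smells and the features passing each property check (predictive,
    # co-change, twins, monotonicity, length), then print the summary as a
    # LaTeX table.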

    records = []
    for concept in CONCEPTS_DICT.keys():
        file = join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept))
        df = pd.read_csv(file)

        record = []
        record.append(CONCEPT_NAMES[concept])
        record.append(len(potential_smells(df, concept)))
        record.append(len(robust_smells(df, concept)))
        record.append(len(almost_robust_smells(df, concept)))

        record.append(len(predictive_features(df, concept)))
        record.append(len(cochange_features(df)))

        record.append(len(twins_features(df)))
        record.append(len(monotonicity_features(df)))
        record.append(len(length_features(df)))

        records.append(record)

    table_df = pd.DataFrame(records,
                            columns=[
                                'Concept', 'Potential', 'Robust', 'Almost',
                                'Predictive', 'Cochange', 'Twins',
                                'Monotonicity', 'Length'
                            ])
    table_df = table_df.sort_values(['Concept'], ascending=[False])

    title = r'\label{tab:smells-properties} Smells Properties'
    print()
    df_to_latex_table(table_df, title, rounding_digits=0)
    print()
Code example #2
def run_print_influence_tables():
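    # For each quality concept, load its joint per-feature stats file and print
    # the influence and act-upon tables.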

    for concept in CONCEPTS_DICT.keys():
        stats = pd.read_csv(
            join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))

        print_influence_tables(stats, concept=concept)
        print_act_upon_tables(stats, concept=concept)
Code example #3
def missing_properties_distribution():
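    # For the file_ccp concept, flag which smell properties each feature
    # satisfies, print the distribution of the number of satisfied properties,
    # and, for features satisfying four of the five, report which property is
    # missing.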
    concept = 'file_ccp'
    df = pd.read_csv(
        join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))

    properties_existence = [
        'has_predictive', 'has_cochange', 'has_monotonicity', 'has_twins',
        'has_length'
    ]

    # A feature counts as predictive when its relative mean difference for the
    # concept is positive and it has more than 200 positive cases
    # (true positives plus false negatives).
    df['has_predictive'] = df.apply(
        lambda x: 1
        if x[RELATIVE_MEAN_DIFF_PREFIX + concept
             ] > 0 and x.true_positives + x.false_negatives > 200 else 0,
        axis=1)

    df['has_cochange'] = df.apply(lambda x: 1
                                  if x.cochange_precision_lift > 0.0 else 0,
                                  axis=1)

    df['has_monotonicity'] = df.apply(lambda x: 1 if x.monotonicity else 0,
                                      axis=1)

    df['has_twins'] = df.apply(lambda x: 1
                               if x.twins_precision_lift > 0.0 else 0,
                               axis=1)

    # Length robustness: the feature correlates only weakly with length
    # (Pearson below 0.5) and keeps a positive precision lift, or has no hits
    # at all, in each of the short/medium/long control groups.
    df['has_length'] = df.apply(lambda x: 1 if ((x.line_pearson < 0.5) and (
        (x.control_short_precision_lift > 0.0) or
        (x.control_short_hit_rate == 0.0)) and (
            (x.control_medium_precision_lift > 0.0) or
            (x.control_medium_hit_rate == 0.0)) and (
                (x.control_long_precision_lift > 0.0) or
                (x.control_long_hit_rate == 0.0))) else 0,
                                axis=1)

    df['properties_num'] = df[properties_existence].sum(axis=1)

    g = df.groupby(['properties_num'], as_index=False).agg({
        'feature': 'count'
    }).sort_values('properties_num')
    print(g)

    has_4 = df[df.properties_num == 4][['feature'] + properties_existence]

    for prop in properties_existence:
        print(prop, 1 - has_4[prop].mean())

    print(has_4.describe())
Code example #4
def multiple_smells():
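    # Collect the potential smells of every concept and print the features that
    # are flagged as smells for more than one concept.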

    rows = []
    for concept in CONCEPTS_DICT.keys():
        df = pd.read_csv(
            join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))

        smells = potential_smells(df, concept)
        for smell in smells:
            rows.append((concept, smell))

    features_df = pd.DataFrame(rows,
                               columns=['concept',
                                        'feature']).sort_values('feature')

    grouped = features_df.groupby(['feature'], as_index=False).agg(
        {'concept': 'count'})
    grouped = grouped[grouped.concept > 1]

    print(features_df[features_df.feature.isin(grouped.feature.tolist())])
Code example #5
def run_basic_models(concept):
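    # Fit several simple classifiers that predict whether a file falls in the
    # concept's lowest quartile, using the concept's potential smells as
    # features; each model is persisted together with its performance, plots,
    # and a SQL translation.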
    start = time.time()
    df = get_per_year_dataset()
    q25 = df[concept].quantile(0.25)
    df = df[df.year == MAX_YEAR - 1]
    df = df.fillna(NUMERIC_NULL)

    df[CONCEPT] = df[concept].map(lambda x: x <= q25)
    stats = pd.read_csv(
        join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
    smells = potential_smells(stats, concept)

    features = smells + [CONCEPT]
    #df = df[SINGLE_SMELL + [CONCEPT]]
    df = df[features]

    print(risk_predictive_columns(df))

    end = time.time()
    print("Load data time", end - start)

    # Equal class weights by default; the commented-out alternatives below bias
    # the trees toward one of the classes.
    class_weight = {1: 1, 0: 1}
    #class_weight = {1: 100 , 0: 1}
    #class_weight = {1: 1 , 0: 100}

    #class_weight =  {1: 1 , 0: 0.001}
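
    # Candidate models: decision trees under different depth and leaf-size
    # constraints, plus a small random forest.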
    classifiers = {
        'Tree_ms50_md3':
        DecisionTreeClassifier(min_samples_leaf=200,
                               max_depth=3,
                               class_weight=class_weight),
        'Tree_default':
        DecisionTreeClassifier(class_weight=class_weight),
        'Tree_ms50':
        DecisionTreeClassifier(min_samples_leaf=200,
                               class_weight=class_weight),
        'Tree_md3':
        DecisionTreeClassifier(max_depth=3, class_weight=class_weight),
        'RandomForest':
        RandomForestClassifier(n_estimators=10, min_samples_leaf=50)
    }

    for model_name in classifiers.keys():
        print(model_name)
        start = time.time()
        regressor = classifiers[model_name]
        regressor, performance = build_basic_model(
            df,
            concept=CONCEPT,
            classifier=regressor,
            model_file_name='{}.pkl'.format(model_name),
            performance_file=os.path.join(PERFORMANCE_PATH,
                                          '{}.json'.format(model_name)))
        if 'Tree' in model_name:
            plot_tree(regressor,
                      dot_file_path=os.path.join(FIGURES_PATH,
                                                 '{}.dot'.format(model_name)),
                      png_file_path=os.path.join(FIGURES_PATH,
                                                 '{}.png'.format(model_name)),
                      feature_names=smells)

            tree_to_sql(tree=regressor,
                        feature_names=smells,
                        function_name="tree",
                        output_file=os.path.join(MODELS_PATH,
                                                 '{}.sql'.format(model_name)))
        else:
            plot_random_forest(
                regressor,
                dot_files_prefix=os.path.join(FIGURES_PATH, 'rf1'),
                png_files_prefix=os.path.join(FIGURES_PATH, 'rf1'),
                feature_names=smells)

            random_forest_to_sql(regressor,
                                 feature_names=smells,
                                 function_name_prefix="rf",
                                 output_file_prefix=os.path.join(
                                     MODELS_PATH, 'rf'))

        end = time.time()
        print("Model running time", end - start)

    return regressor, df
Code example #6
def aggregate_stats(concept):
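    # Merge the per-feature statistics produced by the separate analyses
    # (length correlation, smell removal, monotonicity, co-change, predictive
    # power, twins, relative means, and length-controlled predictive stats)
    # into a single joint stats file for the given concept.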

    length_df = pd.read_csv(join(DATA_PATH, LENGTH_PEARSON_STATS))
    length_df = length_df[['feature', 'line_pearson']]

    removal_df = pd.read_csv(join(DATA_PATH, SMELL_REMOVAL_FILE))
    removal_df = rename_columns(removal_df,
                                prefix='removal_',
                                columns=set(removal_df.columns) -
                                set(['feature']))

    monotonicity_df = pd.read_csv(
        MONOTONE_PATH_TEMPLATE.format(monotone_column=concept))

    cochange_df = pd.read_csv(
        join(DATA_PATH, COHANGE_STATS_TEMPLATE.format(metric=concept)))
    cochange_df = rename_columns(cochange_df,
                                 prefix='cochange_',
                                 columns=set(cochange_df.columns) -
                                 set(['feature']))

    features_df = pd.read_csv(
        join(DATA_PATH, PREDICTIVE_STATS_TEMPLATE.format(concept=concept)))
    features_df = features_df.rename(columns={'feature_name': 'feature'
                                              })  # TODO - change in original

    twins_df = pd.read_csv(
        join(DATA_PATH, AUTHOR_TWIN_CM_TEMPLATE.format(concept=concept)))
    twins_df = rename_columns(twins_df,
                              prefix='twins_',
                              columns=set(twins_df.columns) - set(['feature']))

    relative_mean_df = pd.read_csv(join(DATA_PATH, RELATIVE_MEANS_FILE))
    relative_mean_df = relative_mean_df[[
        'feature', RELATIVE_MEAN_PREFIX + concept,
        RELATIVE_MEAN_DIFF_PREFIX + concept
    ]]

    joint_df = pd.merge(features_df, cochange_df, on='feature')
    joint_df = pd.merge(joint_df, relative_mean_df, on='feature')
    joint_df = pd.merge(joint_df, monotonicity_df, on='feature')
    joint_df = pd.merge(joint_df, length_df, on='feature')
    joint_df = pd.merge(joint_df, removal_df, on='feature')
    joint_df = pd.merge(joint_df, twins_df, on='feature')

    # Merge in the length-controlled predictive stats: one file per length
    # group present in the binary dataset.
    file_codesmell_df = pd.read_csv(join(DATA_PATH, BINARY_DATASET_FILE))
    control_variable = 'length_group'
    for i in file_codesmell_df[control_variable].unique():
        template = 'pred_stats_ctl_{control_variable}_{control_val}_' + concept + '.csv'
        file = template.format(control_variable=control_variable,
                               control_val=i)
        control_df = pd.read_csv(join(DATA_PATH, file))
        control_df = control_df.rename(columns={'feature_name': 'feature'
                                                })  # TODO - change in original
        control_df = rename_columns(control_df,
                                    prefix='control_{val}_'.format(val=i),
                                    columns=set(control_df.columns) -
                                    set(['feature']))
        joint_df = pd.merge(joint_df, control_df, on='feature')

    joint_df.to_csv(join(DATA_PATH,
                         JOINT_STATS_TEMPLATE.format(concept=concept)),
                    index=False)
    return joint_df
Code example #7
def model_groups_influence():
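    # For each length group and each concept, treat "no potential smells" as a
    # classifier (despite its name, the has_smells column flags files with zero
    # smells) and report its hit rate and its precision lift for the
    # high-quality (<= 25th percentile) and low-quality (>= 75th percentile)
    # files of that concept.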

    SMELLS_COUNT = 'smells_count'
    CLASSIFIER = 'has_smells'
    LOW_QUALITY = 'low_quality'
    HIGH_QUALITY = 'high_quality'

    for length_groups in [['short'], ['medium'], ['long'],
                          ['short', 'medium', 'long']]:
        print(length_groups)
        df = get_per_year_dataset()
        df = df[df.year == MAX_YEAR - 1]
        df = df[df['length_group'].isin(length_groups)]
        rows = []
        for concept in CONCEPTS_DICT.keys():
            row = [CONCEPT_NAMES[concept]]
            stats = pd.read_csv(
                join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
            smells = potential_smells(stats, concept)
            #smells = robust_smells(stats
            #                                               , concept)

            df[SMELLS_COUNT] = df[smells].sum(axis=1)
            df[CLASSIFIER] = df[SMELLS_COUNT].map(lambda x: x == 0)
            row.append(df[CLASSIFIER].mean())

            q25 = df[concept].quantile(0.25)
            df[HIGH_QUALITY] = df[concept].map(lambda x: x <= q25)
            cm = pair_analysis(df,
                               first_metric=CLASSIFIER,
                               second_metric=HIGH_QUALITY)
            row.append(cm['precision_lift'])

            q75 = df[concept].quantile(0.75)
            df[LOW_QUALITY] = df[concept].map(lambda x: x >= q75)

            cm = pair_analysis(df,
                               first_metric=CLASSIFIER,
                               second_metric=LOW_QUALITY)
            row.append(cm['precision_lift'])
            rows.append(row)

            #row.append(df[concept].mean())
            #row.append(df[df[CLASSIFIER]][concept].mean())

        features_df = pd.DataFrame(
            rows,
            columns=[
                'Metric',
                'Hit Rate',
                'High Quality',
                'Low Quality'  #, 'Mean', 'CMean'
            ]).sort_values('Metric')

        print()
        df_to_latex_table(
            features_df,
            r'\label{tab:group_smell_influence} Smells Groups Influence')
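
A minimal driver sketch for running these steps end to end, assuming the functions above and the module-level constants (CONCEPTS_DICT, DATA_PATH, the *_TEMPLATE names) live in the same module; run_all is a hypothetical helper, not part of the original code. aggregate_stats runs first because it writes the joint stats files that the other steps read.

def run_all():
    # Rebuild the joint per-feature stats for every concept, then print the
    # summary and influence tables that depend on those files.
    for concept in CONCEPTS_DICT.keys():
        aggregate_stats(concept)
    print_smells_properties_table()
    run_print_influence_tables()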