def analayze_smell_removal(stats_file):
    """Estimate, per smell, the probability that it is removed between
    consecutive years, and write the ordered statistics to *stats_file*.

    Returns the resulting DataFrame, sorted by removal probability.
    """
    keys = ['repo_name', 'full_file_name']
    yearly = get_per_year_dataset(binary=True)
    two_years_df = build_two_years_df(yearly,
                                      keys,
                                      metrics=SINGLE_SMELL,
                                      time_column='year')
    # One confusion-matrix analysis per smell: previous-year presence vs.
    # current-year presence.
    stats = {
        smell: features_confusion_matrix_analysis(
            two_years_df,
            first_metric=PREV_PREFIX + smell,
            second_metric=CUR_PREFIX + smell,
            keys=keys)
        for smell in SINGLE_SMELL
    }
    stats_df = pd.DataFrame.from_dict(stats, orient='index')
    stats_df = stats_df.reset_index().rename(columns={'index': 'feature'})
    # Precision here is P(still present | was present), so its complement
    # is the removal probability.
    stats_df['removal_probability'] = 1 - stats_df.precision
    ordered_columns = ['feature', 'removal_probability', 'true_positives',
                       'false_positives', 'false_negatives']
    stats_df = stats_df[ordered_columns]
    stats_df = stats_df.sort_values(['removal_probability', 'feature'],
                                    ascending=[False, True])
    stats_df.to_csv(stats_file, index=False)
    return stats_df
def analayze_smells_cochange(stability_stats_file):
    """Analyze the co-change of CCP and each smell across years.

    For CCP and for every smell, "improvement" is simply a decrease from
    the previous year to the current one.  Results are sorted by precision
    lift and written to *stability_stats_file*.

    Returns the resulting DataFrame.
    """
    keys = ['repo_name', 'full_file_name']
    metric_per_year_df = get_per_year_dataset(binary=False)
    # Fix: removed the unused `significant_ccp_improvment` lambda the
    # original defined but never referenced.
    the_lower_the_better = lambda prev, cur: prev > cur
    metrics_dict = {'file_ccp': the_lower_the_better}
    for smell in SINGLE_SMELL:
        metrics_dict[smell] = the_lower_the_better
    stats = cochange_analysis(metric_per_year_df,
                              metrics_dict,
                              keys,
                              control_variables=[])
    stats_df = pd.DataFrame.from_dict(stats, orient='index')
    stats_df = stats_df.reset_index().rename(columns={'index': 'feature'})
    stats_df = stats_df.sort_values(['precision_lift', 'feature'],
                                    ascending=[False, True])
    stats_df.to_csv(stability_stats_file)
    return stats_df
def run_analyze_means():
    """Compute relative means of each smell per concept, restricted to the
    last analyzed year, and write them to the relative-means file.

    Returns the statistics DataFrame.  Fix: the original computed
    ``stats_df`` and silently dropped it; it is now returned (callers that
    ignored the previous ``None`` return are unaffected).
    """
    df = get_per_year_dataset()
    df = df[df.year == MAX_YEAR]
    stats_df = analyze_relative_mean(df,
                                     concepts=list(CONCEPTS_DICT.keys()),
                                     metrics=SINGLE_SMELL,
                                     output_file=join(DATA_PATH,
                                                      RELATIVE_MEANS_FILE))
    return stats_df
def evaluate_smell_monotonocity():
    """Check, on the latest analyzed year, whether each predictive column
    behaves monotonically with respect to every quality concept.

    Returns the monotonicity DataFrame produced by the evaluator.
    """
    latest = get_per_year_dataset()
    latest = latest[latest.year == MAX_YEAR]
    # Everything except the known non-predictive columns is a candidate.
    candidate_columns = set(latest.columns) - NON_PREDICTIVE_FEATURES
    monotone_df = evaluate_sides_monotonocity_vs_concept(
        df=latest,
        relevant_columns=candidate_columns,
        concepts_list=list(CONCEPTS_DICT.keys()),
        output_file_template=MONOTONE_PATH_TEMPLATE)
    return monotone_df
def analayze_smells_stability(stability_stats_file):
    """Analyze year-over-year stability of each smell and write the
    statistics to *stability_stats_file*.

    Returns the statistics DataFrame — added for consistency with the
    sibling analyses (``analayze_smell_removal``,
    ``analayze_smells_cochange``), which return their results.
    """
    keys = ['repo_name', 'full_file_name']
    metric_per_year_df = get_per_year_dataset(binary=False)
    stats = analyze_stability(metric_per_year_df,
                              keys=keys,
                              metrics=SINGLE_SMELL,
                              time_column='year',
                              minimal_time=EARLIEST_ANALYZED_YEAR,
                              control_variables=[])
    stats_df = pd.DataFrame.from_dict(stats, orient='index')
    stats_df.to_csv(stability_stats_file)
    return stats_df
def file_by_author_twin_analysis():
    """Compare pairs of single-author files belonging to the same author
    ("twins") and relate per-smell differences to per-concept differences.

    For every concept this writes two CSVs into DATA_PATH: a Pearson
    correlation of each compared feature with the concept difference, and
    a confusion-matrix analysis of the smell differences.

    Fixes vs. the original:
      * ``comp_df.corr()`` (the full correlation matrix, loop-invariant) is
        computed once instead of once per concept;
      * the comparison suffix is stripped by its actual length instead of a
        hard-coded 4 characters.
    """
    df = get_per_year_dataset()
    df = df[df.year == MAX_YEAR]
    single_author_files = df[df.authors == 1]
    keys = ['repo_name', 'Author_email']
    # NOTE(review): passed to compare_twin_behaviours as-is; presumably it
    # selects/filters pairs by matching file names — confirm in the helper.
    filtering_function = lambda x: x.full_file_name_x == x.full_file_name_y
    # Numeric columns compare as "second larger than first"; anything else
    # yields None (no comparison).
    comparision_function = lambda first, second: (
        second > first
        if isinstance(first, numbers.Number)
        and isinstance(second, numbers.Number)
        else None)
    comparision_columns = (SINGLE_SMELL + ['full_file_name']
                           + list(CONCEPTS_DICT.keys()))
    comp_df = compare_twin_behaviours(first_behaviour=single_author_files,
                                      second_behaviour=single_author_files,
                                      keys=keys,
                                      comparision_columns=comparision_columns,
                                      comparision_function=comparision_function,
                                      filtering_function=filtering_function)
    twins_file = 'file_by_author_twin_analysis_concepts.csv'
    comp_df.to_csv(os.path.join(DATA_PATH, twins_file))
    # Hoisted: the correlation matrix does not depend on the concept.
    correlations = comp_df.corr()
    suffix_len = len(COMPARISON_SUFFIX)
    for concept in CONCEPTS_DICT.keys():
        pearson = correlations[concept + COMPARISON_SUFFIX]
        pearson_df = pd.DataFrame(pearson).reset_index()
        pearson_df.columns = ['feature', 'Pearson']
        pearson_df = pearson_df.sort_values('Pearson', ascending=False)
        pearson_df.to_csv(
            os.path.join(DATA_PATH,
                         AUTHOR_TWIN_PEARSON_TEMPLATE.format(concept=concept)),
            index=False)
        stats = compute_confusion_matrics(
            df=comp_df,
            concept=concept + COMPARISON_SUFFIX,
            columns=[i + COMPARISON_SUFFIX for i in SINGLE_SMELL],
            keys=keys)
        stats_df = pd.DataFrame.from_dict(stats, orient='index')
        stats_df = stats_df.reset_index().rename(columns={'index': 'feature'})
        # Strip the comparison suffix from feature names.
        stats_df['feature'] = stats_df['feature'].map(
            lambda x: x[:-suffix_len])
        stats_df = stats_df.sort_values(['precision_lift', 'feature'],
                                        ascending=[False, True])
        stats_df.to_csv(
            os.path.join(DATA_PATH,
                         AUTHOR_TWIN_CM_TEMPLATE.format(concept=concept)),
            index=False)
def ccp_stability():
    """Print year-over-year stability statistics for CCP-related metrics,
    restricted to files with at least 10 commits."""
    keys = ['repo_name', 'full_file_name']
    metrics = ['file_ccp', 'worse_10_hs', 'reduced_risk']
    per_year = get_per_year_dataset()
    # Binarize: worse_10_hs as 0/1, reduced_risk as "corrective rate at
    # most 5%".
    per_year['worse_10_hs'] = per_year.worse_10_hs.map(int)
    per_year['reduced_risk'] = per_year.corrective_rate.map(
        lambda rate: int(rate <= 0.05))
    stability = analyze_stability(per_year,
                                  keys=keys,
                                  metrics=metrics,
                                  time_column='year',
                                  minimal_time=EARLIEST_ANALYZED_YEAR,
                                  control_variables=[],
                                  min_cnt_column='commits',
                                  min_cnt_threshold=10)
    print(stability)
def build_dataset():
    """Aggregate per-file smell counts into per-repository totals, join
    them with repository properties, and persist the result.

    NOTE(review): relies on module-level ``keys`` and ``CONCEPT`` (not
    defined in this function) — confirm they exist at call time.

    Returns the per-repository DataFrame.
    """
    per_file = get_per_year_dataset(binary=False)
    per_file[SMELLS_SUM] = per_file[SINGLE_SMELL].sum(axis=1)
    aggregations = {
        SMELLS_SUM: 'sum',
        'file': 'count',
        'MethodLength': 'sum',
        'NPathComplexity': 'sum',
        'AvoidInlineConditionals': 'sum',
        'NestedIfDepth': 'sum',
        'VisibilityModifier': 'sum',
    }
    repos = per_file.groupby(keys, as_index=False).agg(aggregations)
    properties = pd.read_csv(join(DATA_PATH, 'repo_properties_per_year.csv'))
    properties = properties[keys + [CONCEPT]]
    repos = pd.merge(repos, properties, on=keys)
    repos.to_csv(join(DATA_PATH, SMELLS_PER_REPO_FILE), index=False)
    return repos
def run_basic_models(concept):
    """Train and export basic classifiers predicting low *concept* values.

    Labels a file as positive when its *concept* value is in the bottom
    quartile (computed over all years), restricts features to the
    potential smells for that concept, trains several tree/forest models,
    and exports each model as a plot and as SQL.

    NOTE(review): the quartile is computed before filtering to
    MAX_YEAR - 1, so the threshold reflects all years — confirm intended.
    """
    start = time.time()
    df = get_per_year_dataset()
    q25 = df[concept].quantile(0.25)
    df = df[df.year == MAX_YEAR - 1]
    df = df.fillna(NUMERIC_NULL)
    # Binary target: bottom-quartile concept value.
    df[CONCEPT] = df[concept].map(lambda x: x <= q25)
    stats = pd.read_csv(
        join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
    smells = potential_smells(stats, concept)
    features = smells + [CONCEPT]
    #df = df[SINGLE_SMELL + [CONCEPT]]
    df = df[features]
    print(risk_predictive_columns(df))
    end = time.time()
    print("Load data time", end - start)
    # Uniform class weights; alternative weightings kept for experiments.
    class_weight = {1: 1, 0: 1}
    #class_weight = {1: 100 , 0: 1}
    #class_weight = {1: 1 , 0: 100}
    #class_weight = {1: 1 , 0: 0.001}
    classifiers = {
        'Tree_ms50_md3': DecisionTreeClassifier(min_samples_leaf=200,
                                                max_depth=3,
                                                class_weight=class_weight),
        'Tree_default': DecisionTreeClassifier(class_weight=class_weight),
        'Tree_ms50': DecisionTreeClassifier(min_samples_leaf=200,
                                            class_weight=class_weight),
        'Tree_md3': DecisionTreeClassifier(max_depth=3,
                                           class_weight=class_weight),
        'RandomForest': RandomForestClassifier(n_estimators=10,
                                               min_samples_leaf=50)
    }
    for model_name in classifiers.keys():
        print(model_name)
        start = time.time()
        regressor = classifiers[model_name]
        regressor, performance = build_basic_model(
            df,
            concept=CONCEPT,
            classifier=regressor,
            model_file_name='{}.pkl'.format(model_name),
            performance_file=os.path.join(PERFORMANCE_PATH,
                                          '{}.json'.format(model_name)))
        # Trees get a single plot + SQL dump; forests get per-tree exports.
        if 'Tree' in model_name:
            plot_tree(regressor,
                      dot_file_path=os.path.join(FIGURES_PATH,
                                                 '{}.dot'.format(model_name)),
                      png_file_path=os.path.join(FIGURES_PATH,
                                                 '{}.png'.format(model_name)),
                      feature_names=smells)
            tree_to_sql(tree=regressor,
                        feature_names=smells,
                        function_name="tree",
                        output_file=os.path.join(MODELS_PATH,
                                                 '{}.sql'.format(model_name)))
        else:
            plot_random_forest(
                regressor,
                dot_files_prefix=os.path.join(FIGURES_PATH, 'rf1'),
                png_files_prefix=os.path.join(FIGURES_PATH, 'rf1'),
                feature_names=smells)
            random_forest_to_sql(regressor,
                                 feature_names=smells,
                                 function_name_prefix="rf",
                                 output_file_prefix=os.path.join(
                                     MODELS_PATH, 'rf'))
        end = time.time()
        print("Model running time", end - start)
    # Returns the last trained model and the feature frame.
    return regressor, df
def run_feature_evaluation():
    """Evaluate the smell features on the latest analyzed year, both
    directly and while controlling for file length."""
    latest = get_per_year_dataset()
    latest = latest[latest.year == MAX_YEAR]
    evaluate_features(latest)
    evaluate_features_length_control(df=latest, features=SINGLE_SMELL)
def model_groups_influence():
    """Measure how the absence of smells relates to file quality, per file
    length group and per quality concept, and print a LaTeX summary table.

    Fixes vs. the original:
      * removed the dead dataset load before the loop (the loop body
        immediately reloaded and overwrote it);
      * ``rows.append(row)`` now runs after the row is complete (the
        original appended early and only worked via list aliasing).
    """
    SMELLS_COUNT = 'smells_count'
    CLASSIFIER = 'has_smells'
    LOW_QUALITY = 'low_quality'
    HIGH_QUALITY = 'high_quality'
    for length_groups in [['short'], ['medium'], ['long'],
                          ['short', 'medium', 'long']]:
        print(length_groups)
        df = get_per_year_dataset()
        df = df[df.year == MAX_YEAR - 1]
        df = df[df['length_group'].isin(length_groups)]
        rows = []
        for concept in CONCEPTS_DICT.keys():
            row = [CONCEPT_NAMES[concept]]
            stats = pd.read_csv(
                join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
            smells = potential_smells(stats, concept)
            df[SMELLS_COUNT] = df[smells].sum(axis=1)
            # A file "passes" when it exhibits none of the smells.
            df[CLASSIFIER] = df[SMELLS_COUNT].map(lambda x: x == 0)
            row.append(df[CLASSIFIER].mean())  # hit rate
            # High quality: bottom quartile of the concept metric.
            q25 = df[concept].quantile(0.25)
            df[HIGH_QUALITY] = df[concept].map(lambda x: x <= q25)
            cm = pair_analysis(df,
                               first_metric=CLASSIFIER,
                               second_metric=HIGH_QUALITY)
            row.append(cm['precision_lift'])
            # Low quality: top quartile of the concept metric.
            q75 = df[concept].quantile(0.75)
            df[LOW_QUALITY] = df[concept].map(lambda x: x >= q75)
            cm = pair_analysis(df,
                               first_metric=CLASSIFIER,
                               second_metric=LOW_QUALITY)
            row.append(cm['precision_lift'])
            rows.append(row)
        features_df = pd.DataFrame(
            rows,
            columns=['Metric', 'Hit Rate', 'High Quality', 'Low Quality']
        ).sort_values('Metric')
        print()
        df_to_latex_table(
            features_df,
            '\label{tab:group_smell_influence} Smells Groups Influence ')