def run_wilcoxon_test(df, name, folder):
    """Run pairwise two-sided Mann-Whitney U tests over all column pairs of *df*.

    NOTE(review): despite the function name, this uses scipy's Mann-Whitney U
    test (independent samples), not the Wilcoxon signed-rank test — confirm
    the name is intentional before relying on it.

    The p-value for each pair (i, j) with i < j is stored in the lower
    triangle of a square DataFrame indexed/labelled by the column names,
    which is then written as a CSV named *name* inside *folder*.
    """
    cols = df.columns.values
    p_matrix = pd.DataFrame(columns=cols, index=cols)
    total = len(cols)
    for first in range(0, total - 1):
        for second in range(first + 1, total):
            # Only the p-value is kept; the test statistic is discarded.
            _, p_val = scipy.stats.mannwhitneyu(
                df[cols[first]].values,
                df[cols[second]].values,
                alternative='two-sided')
            p_matrix.iloc[second, first] = p_val
    write_df_to_csv(folder, p_matrix, name)
def do_ground_truth_all(files=None, kws=None):
    """Build and persist a ground-truth CSV for every feature-vector file.

    :param files: iterable of (path, name) pairs pointing at feature-vector
        files; required in practice — passing None raises TypeError, as before.
    :param kws: keyword collection forwarded to do_ground_truth().
    :return: GT_DIR, the folder the ground-truth CSVs were written to.
    """
    print('\n> Defining ground truth for feature vector/s:')
    # Plain loop instead of a list comprehension used only for its side effect.
    for file in files:
        print("\t- " + os.path.relpath(file[0]))
    for p, n in files:
        methods = get_fv_methods(p)
        ground_truth = do_ground_truth(methods, kws)
        write_df_to_csv(GT_DIR, gt_to_df(ground_truth, kws), n)
    print('> Ground truth/s has/ve been written to folder "%s"' % os.path.abspath(GT_DIR))
    return GT_DIR
def compute_stats(metrics, classifiers, folder):
    """Compute mean/median/std per classifier for every metric and save as CSV.

    :param metrics: dict mapping metric name -> DataFrame (one column per
        classifier, one row per run)
    :param classifiers: column labels for the resulting stats DataFrame
    :param folder: output folder for the 'stats' CSV
    """
    print(indent('\n- Computing metrics statistics ... '), end='')
    stats = pd.DataFrame(columns=classifiers)
    rows = []
    for key, val in metrics.items():
        name = str(capitalize(key))
        # One labelled row per aggregate, e.g. "Accuracy Mean".
        rows.append(val.mean(axis=0).rename(name + ' Mean'))
        rows.append(val.median(axis=0).rename(name + ' Median'))
        rows.append(val.std(axis=0).rename(name + ' Standard Deviation'))
    if rows:
        # DataFrame.append was removed in pandas 2.0 (deprecated in 1.4);
        # collect all rows first and concatenate once — also avoids the
        # O(n^2) cost of repeated row-wise appends.
        stats = pd.concat([stats, pd.DataFrame(rows)])
    print('result:')
    print(indent(stats.to_string(), spaces=10))
    out = write_df_to_csv(folder, stats, 'stats')
    print(indent('\n- Statistics written to file "%s"' % out))
def biased_clf_metrics_to_csv(labels, folder):
    """Score a trivially biased classifier that predicts every sample positive.

    Computes binary precision, recall and f-score of an all-ones prediction
    against *labels*, then writes them as a single-row CSV into *folder*.
    """
    all_positive = ones(len(labels))
    precision, recall, f_score, _ = precision_recall_fscore_support(
        labels, all_positive, average='binary')
    metrics_df = pd.DataFrame(
        {'precision': precision, 'recall': recall, 'fscore': f_score},
        index=[0])
    out = write_df_to_csv(folder, metrics_df, 'biased_metrics')
    print(indent('\n- Biased classifier metrics ("precision", "recall" and "fscore") written to file "%s"' % out))
def do_all_cluster_from_path(path=None, target=None, f=None, n=5):
    """Cluster God-class methods for every feature vector found under *path*.

    :param path: file or folder containing feature-vector CSVs
    :param target: folder the resulting cluster CSVs are written to
    :param f: clustering function (either k-means or hierarchical
        agglomerative) taking (feature_vector_path, n) and returning a DataFrame
    :param n: cluster count forwarded to *f* (default 5)
    :return: the absolute target folder
    """
    path = os.path.abspath(path)
    target = os.path.abspath(target)
    print('\n> Clustering God class methods in file/folder "%s"' % path)
    # Every feature vector under the given path gets clustered and saved.
    for fv_path, fv_name in get_paths_and_names(path):
        # applies the function "f" (either k-means or hierarchical agglomerative)
        clusters = f(fv_path, n)
        write_df_to_csv(target, clusters, fv_name)
    print('> Clusters have been written to folder "%s"' % target)
    return target
def extract_feature_vectors(god_classes):
    """Parse each God-class source file and write one feature-vector CSV per class.

    :param god_classes: DataFrame with 'class_name' and 'path_to_source' columns
    :return: absolute path of the folder (FV_DIR) the CSVs were written to
    """
    print('\n> Starting feature vector extraction...')
    # Set gives O(1) membership tests instead of scanning a list per class node.
    class_names = set(god_classes.class_name.tolist())
    all_feat_vectors = {}
    for src_path in god_classes.path_to_source.tolist():
        # open and parse the class source; the handle is released right after
        # parsing instead of being held through the whole tree walk
        with open(src_path, 'r') as jsc:
            tree = jl.parse.parse(jsc.read())
        # iterates through the file classes
        for path, node in tree.filter(jl.parser.tree.ClassDeclaration):
            if node.name in class_names:  # check whether the class is a god class
                # Generates the feature vector for each class
                all_feat_vectors[node.name] = generate_all(node)
                write_df_to_csv(FV_DIR, all_feat_vectors[node.name], node.name)
    fv_dir = os.path.abspath(FV_DIR)
    print('> Feature vector/s has/ve been written to folder "%s"' % fv_dir)
    return fv_dir
def label_feature_vectors(fv, fv_path, buggy_classes_dir):
    """Attach buggy/non-buggy labels to a feature vector and persist it.

    :param fv: the feature-vector DataFrame to label
    :param fv_path: path of the source feature-vector file (drives the
        time-suffixed output name)
    :param buggy_classes_dir: folder listing the known buggy classes
    :return: (labeled_feature_vector, path_of_written_csv)
    """
    print('\n> Creating labels for feature vector "%s"' % fv_path)
    buggy_classes = get_buggy_classes(buggy_classes_dir)
    label_feature_vector = get_label_feature_vector(fv, buggy_classes)
    # Derive the output name from the input file's timestamp suffix.
    suffix = get_dir_time_suffix(fv_path, 'feature_vector')
    out_name = gen_name_with_suffix('label_feature_vector', suffix)
    path = write_df_to_csv(DEF_LFV_DIR, label_feature_vector, out_name)
    print('> Labeled feature vector has been written to file "%s"' % os.path.abspath(path))
    return label_feature_vector, path
def run_training(classifier, classifier_name, fv_path, tt, r_num):
    """Train and evaluate *classifier* over every train/test split, save results.

    :param classifier: estimator forwarded to run_classifier()
    :param classifier_name: human-readable name used in output file and plot
    :param fv_path: labeled-feature-vector path, used to derive the out folder
    :param tt: dict of parallel lists 'x_trains'/'x_tests'/'y_trains'/'y_tests',
        one entry per split (assumed at least len(r_num) long — TODO confirm)
    :param r_num: sequence of run identifiers; one split is evaluated per entry
    """
    # Fixed the misspelled local 'l_accurancy' -> 'l_accuracy'
    # (the DataFrame column was already spelled "accuracy").
    l_precision, l_recall, l_fscore, l_accuracy = [], [], [], []
    print(indent('\n- Training classifier "%s"...' % classifier_name))
    for i in range(len(r_num)):
        pred, acc = run_classifier(
            classifier,
            tt['x_trains'][i], tt['x_tests'][i],
            tt['y_trains'][i], tt['y_tests'][i])
        prec, rec, f1 = get_prec_recall_fscore(tt['y_tests'][i], pred)
        l_accuracy.append(acc)
        l_precision.append(prec)
        l_recall.append(rec)
        l_fscore.append(f1)
    df = pd.DataFrame(
        {
            "r_num": r_num,
            "accuracy": l_accuracy,
            "precision": l_precision,
            "recall": l_recall,
            "fscore": l_fscore
        })
    tr_folder = DEF_TR_DIR + '/' + get_dir_time_suffix(fv_path, 'label_feature_vector-')
    path = write_df_to_csv(tr_folder, df, classifier_name.replace(' ', ''))
    print(indent('\nResults written to file "%s"' % path, spaces=10))
    print_averages(df)
    make_plot(r_num, df[['accuracy', 'precision', 'recall', 'fscore']],
              classifier_name, tr_folder)
def extract_feature_vectors(root):
    """Compute class/method/NLP metrics for the top classes of a project.

    :param root: project root folder to scan for top classes
    :return: (feature_vector_df, path_of_written_csv)
    """
    print('\n> Starting feature vector extraction for project "%s"' % root)
    df = pd.DataFrame(columns=FV_COLS)
    rows = []
    for t_class in get_top_classes(root):
        mth, fld, rfc, ints = get_class_metrics(t_class)
        sz, cpx, ex, ret = get_methods_metrics(t_class)
        bcm, nml, wrd, dcm = get_npl_metrics(t_class)
        rows.append(
            {
                'class': t_class.name,  # class name
                'MTH': mth, 'FLD': fld, 'RFC': rfc, 'INT': ints,  # CLASS METRICS
                'SZ': sz, 'CPX': cpx, 'EX': ex, 'RET': ret,  # METHOD METRICS
                'BCM': bcm, 'NML': nml, 'WRD': wrd, 'DCM': dcm  # NPL METRICS
            })
    if rows:
        # DataFrame.append was removed in pandas 2.0 (and its 'sort' argument
        # expected a bool, not -1); collect all rows and concatenate once.
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
    df = df_sort_cols(df, FV_COLS)
    path = write_df_to_csv(DEF_FV_DIR, df, gen_name_with_time('feature_vector'))
    print('> Feature vector/s has/ve been written to file "%s"' % os.path.abspath(path))
    return df, path