def run_self_CFS(project):
    """Train/test a RandomForest on a single project using CFS feature selection.

    Splits the project data 60/40, applies SMOTE and correlation-based
    feature selection (CFS) on the training portion only, then evaluates
    on the held-out test set with effort-aware metrics.

    Args:
        project: project identifier understood by ``load_data``.

    Returns:
        Tuple of scalars: (recall, precision, pf, f1, g_score, auc,
        pci_20, ifa).
    """
    X, y = load_data(project)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.40, random_state=18)
    loc = X_test.CountLineCode  # lines of code, used by effort-aware metrics
    # SMOTE and CFS are applied to the training data only, to avoid leakage.
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    df_smote, cols = apply_cfs(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    # cols ends with the 'Bugs' label column; drop it to get feature names.
    predicted = clf.predict(X_test[cols[:-1]])
    abcd = metrices.measures(y_test, predicted, loc)
    pf = abcd.get_pf()
    recall = abcd.calculate_recall()
    precision = abcd.calculate_precision()
    f1 = abcd.calculate_f1_score()
    g_score = abcd.get_g_score()
    pci_20 = abcd.get_pci_20()
    ifa = abcd.get_ifa()
    try:
        auc = roc_auc_score(y_test, predicted)
    except ValueError:
        # AUC is undefined when y_test contains a single class.
        auc = 0
    print(classification_report(y_test, predicted))
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa
def run_self_k(project, metric):
    """Evaluate an SVC within-project via 5x repeated 5-fold stratified CV.

    NOTE(review): a second ``run_self_k`` with a different signature is
    defined later in this file and shadows this one at import time —
    confirm which definition callers expect.

    Args:
        project: project identifier understood by ``load_both_data``.
        metric: 'process', 'product', or anything else (treated as process)
            — selects which columns provide the LOC measure for
            effort-aware metrics.

    Returns:
        Tuple of lists (one entry per fold) for recall, precision, pf, f1,
        g_score, auc, pci_20, ifa, plus ``importance`` which is always 0
        because SVC exposes no feature importances.
    """
    precision = []
    recall = []
    pf = []
    f1 = []
    g_score = []
    auc = []
    pci_20 = []
    ifa = []
    importance = 0  # SVC has no feature_importances_; kept for API symmetry
    X, y = load_both_data(project, metric)
    for _ in range(5):
        skf = StratifiedKFold(n_splits=5)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = y.loc[train_index], y.loc[test_index]
            if metric == 'process':
                loc = X_test['file_la'] + X_test['file_lt']
            elif metric == 'product':
                loc = X_test.CountLineCode
            else:
                loc = X_test['file_la'] + X_test['file_lt']
            # Oversample the training fold only, to avoid leakage.
            df_smote = pd.concat([X_train, y_train], axis=1)
            df_smote = apply_smote(df_smote)
            y_train = df_smote.Bugs
            X_train = df_smote.drop('Bugs', axis=1)
            clf = SVC()
            clf.fit(X_train, y_train)
            predicted = clf.predict(X_test)
            abcd = metrices.measures(y_test, predicted, loc)
            pf.append(abcd.get_pf())
            recall.append(abcd.calculate_recall())
            precision.append(abcd.calculate_precision())
            f1.append(abcd.calculate_f1_score())
            g_score.append(abcd.get_g_score())
            pci_20.append(abcd.get_pci_20())
            ifa.append(abcd.get_ifa())
            try:
                auc.append(roc_auc_score(y_test, predicted))
            except ValueError:
                # AUC is undefined when the test fold has a single class.
                auc.append(0)
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance
def run_self_release(project, metric):
    """Train a RandomForest at release level and evaluate over commit chunks.

    Trains once on SMOTE-balanced training data, then splits the test
    commits into 5 chunks and evaluates each chunk separately.

    Args:
        project: project identifier understood by ``load_data_commit_level``.
        metric: 'process', 'product', or anything else (treated as process)
            — selects which columns provide the LOC measure for
            effort-aware metrics.

    Returns:
        Tuple of lists (one entry per commit chunk) for recall, precision,
        pf, f1, g_score, auc, pci_20, ifa, plus the trained model's
        feature importances.
    """
    precision = []
    recall = []
    pf = []
    f1 = []
    g_score = []
    auc = []
    pci_20 = []
    ifa = []
    X_train, y_train, test_df = load_data_commit_level(project, metric)
    # Oversample the training data only, to avoid leakage.
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = clf.feature_importances_
    # Evaluate on 5 disjoint groups of commits.
    unique_commits_list = np.array_split(test_df.commit_hash.unique(), 5)
    for i in range(len(unique_commits_list)):
        test_df_subset = test_df[test_df.commit_hash.isin(unique_commits_list[i])]
        y_test = test_df_subset.Bugs
        X_test = test_df_subset.drop(['Bugs', 'commit_hash'], axis=1)
        if metric == 'process':
            loc = X_test['file_la'] + X_test['file_lt']
        elif metric == 'product':
            loc = X_test.CountLineCode
        else:
            loc = X_test['file_la'] + X_test['file_lt']
        predicted = clf.predict(X_test)
        abcd = metrices.measures(y_test, predicted, loc)
        pf.append(abcd.get_pf())
        recall.append(abcd.calculate_recall())
        precision.append(abcd.calculate_precision())
        f1.append(abcd.calculate_f1_score())
        g_score.append(abcd.get_g_score())
        pci_20.append(abcd.get_pci_20())
        ifa.append(abcd.get_ifa())
        try:
            auc.append(roc_auc_score(y_test, predicted))
        except ValueError:
            # AUC is undefined when the chunk contains a single class.
            auc.append(0)
        print(classification_report(y_test, predicted))
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance
def run_self_k(project, projects, metric):
    """Train a RandomForest on one project; evaluate on each other project.

    NOTE(review): this definition shadows the earlier ``run_self_k(project,
    metric)`` in the same module — confirm which one callers expect.

    Args:
        project: source project to train on (via ``load_both_data``).
        projects: iterable of target project identifiers to test on;
            projects that fail to load or predict are skipped with a
            printed warning.
        metric: 'process', 'product', or anything else (treated as process)
            — selects which columns provide the LOC measure for
            effort-aware metrics.

    Returns:
        Tuple of lists (one entry per successful target project) for
        recall, precision, pf, f1, g_score, auc, pci_20, ifa, plus the
        trained model's feature importances.
    """
    precision = []
    recall = []
    pf = []
    f1 = []
    g_score = []
    auc = []
    pci_20 = []
    ifa = []
    X_train, y_train = load_both_data(project, metric)
    # Oversample the training project only, to avoid leakage.
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = clf.feature_importances_
    for _project in projects:
        try:
            X_test, y_test = load_both_data(_project, metric)
            if metric == 'process':
                loc = X_test['file_la'] + X_test['file_lt']
            elif metric == 'product':
                loc = X_test.CountLineCode
            else:
                loc = X_test['file_la'] + X_test['file_lt']
            predicted = clf.predict(X_test)
            abcd = metrices.measures(y_test, predicted, loc)
            pf.append(abcd.get_pf())
            recall.append(abcd.calculate_recall())
            precision.append(abcd.calculate_precision())
            f1.append(abcd.calculate_f1_score())
            g_score.append(abcd.get_g_score())
            pci_20.append(abcd.get_pci_20())
            ifa.append(abcd.get_ifa())
            try:
                auc.append(roc_auc_score(y_test, predicted))
            except ValueError:
                # AUC is undefined when y_test has a single class.
                auc.append(0)
        except Exception as e:
            # Best-effort across target projects: report and move on.
            print('error in test', _project, e)
            continue
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance
def run_self(project, projects):
    """Train a RandomForest on one project; report median metrics across others.

    Args:
        project: source project to train on (via ``load_data``).
        projects: iterable of target project identifiers to evaluate on;
            targets that fail to load or predict are skipped silently.

    Returns:
        Tuple of nan-medians across target projects: (recall, precision,
        pf, f1, g_score, auc, pci_20, ifa), plus ``importance`` which is
        always 0 (feature importances are not extracted here).
    """
    precision_list = []
    recall_list = []
    pf_list = []
    f1_list = []
    g_list = []
    auc_list = []
    pci_20_list = []
    ifa_list = []
    X_train, y_train = load_data(project)
    # Oversample the training project only, to avoid leakage.
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = 0  # feature importances intentionally not extracted
    # Loop variable renamed from `project` so it no longer shadows the
    # training-project parameter.
    for _project in projects:
        try:
            X_test, y_test = load_data(_project)
            loc = X_test.CountLineCode  # LOC for effort-aware metrics
            predicted = clf.predict(X_test)
            abcd = metrices.measures(y_test, predicted, loc)
            pf_list.append(abcd.get_pf())
            recall_list.append(abcd.calculate_recall())
            precision_list.append(abcd.calculate_precision())
            f1_list.append(abcd.calculate_f1_score())
            g_list.append(abcd.get_g_score())
            pci_20_list.append(abcd.get_pci_20())
            ifa_list.append(abcd.get_ifa())
            try:
                auc_list.append(roc_auc_score(y_test, predicted))
            except ValueError:
                # BUG FIX: the fallback 0 previously went into
                # precision_list, corrupting the precision median and
                # leaving auc_list short. Append to auc_list instead.
                auc_list.append(0)
        except Exception:
            # Best-effort across target projects: skip failures.
            continue
    return np.nanmedian(recall_list), np.nanmedian(
        precision_list), np.nanmedian(pf_list), np.nanmedian(
        f1_list), np.nanmedian(g_list), np.nanmedian(
        auc_list), np.nanmedian(pci_20_list), np.nanmedian(
        ifa_list), importance