Esempio n. 1
0
 def bellwether(self):
     """Train on each selected project and score the models on the others.

     For every project in ``self.selected_projects`` that has at least 50
     rows, one logistic-regression model is fitted per training split of a
     stratified 5-fold partition (the partition is iterated 5 times). Every
     fitted model is then evaluated on each other selected project that has
     at least 50 rows.

     Returns:
         dict: ``{source_project: {destination_project: {metric: [values]}}}``
         with the metrics ``f1``, ``precision``, ``recall``, ``g-score``,
         ``d2h``, ``pci_20``, ``ifa``, ``pd`` and ``pf``; each list holds one
         value per fitted model. A project whose processing raises is
         reported with ``print`` and skipped.
     """
     final_score = {}
     label_map = {'buggy': True, 'clean': False}
     for s_project in self.selected_projects:
         try:
             s_path = self.data_source + '/' + s_project
             print(s_project)
             df = self.prepare_data(s_path)
             if df.shape[0] < 50:
                 continue
             df.reset_index(drop=True, inplace=True)
             df['Buggy'] = df['Buggy'].map(label_map)
             y = df.Buggy
             X = df.drop(labels=['Buggy'], axis=1)

             # Fit all source models first so that each destination data set
             # only has to be loaded and prepared once per source project.
             # NOTE(review): kf is not shuffled, so the 5 repetitions appear
             # to reuse identical folds - confirm whether shuffling was
             # intended.
             kf = StratifiedKFold(n_splits=5)
             models = []
             for _ in range(5):
                 for train_index, _tune_index in kf.split(X, y):
                     clf = LogisticRegression()
                     clf.fit(X.iloc[train_index], y.iloc[train_index])
                     models.append(clf)

             # Project names are plain values, a shallow copy is enough.
             destination_projects = list(self.selected_projects)
             destination_projects.remove(s_project)

             score = {}
             for d_project in destination_projects:
                 try:
                     d_path = self.data_source + '/' + d_project
                     dest_df = self.prepare_data(d_path)
                     if dest_df.shape[0] < 50:
                         continue
                     dest_df.reset_index(drop=True, inplace=True)
                     dest_df['Buggy'] = dest_df['Buggy'].map(label_map)
                     test_y = dest_df.Buggy
                     test_X = dest_df.drop(labels=['Buggy'], axis=1)
                     _df_test_loc = test_X.LOC
                     for clf in models:
                         predicted = clf.predict(test_X)
                         abcd = metrices.measures(
                             test_y, predicted, _df_test_loc)
                         # Compute the full row before storing anything so a
                         # failing metric cannot leave lists of unequal length.
                         row = {
                             'f1': abcd.calculate_f1_score(),
                             'precision': abcd.calculate_precision(),
                             'recall': abcd.calculate_recall(),
                             'g-score': abcd.get_g_score(),
                             'd2h': abcd.calculate_d2h(),
                             'pci_20': abcd.get_pci_20(),
                             'ifa': abcd.get_ifa(),
                             'pd': abcd.get_pd(),
                             'pf': abcd.get_pf(),
                         }
                         # ``score`` is keyed by destination project, so the
                         # accumulator must be looked up by project name to
                         # keep the result of every model.
                         bucket = score.setdefault(
                             d_project, {name: [] for name in row})
                         for name, value in row.items():
                             bucket[name].append(value)
                 except Exception as e:
                     print(e)
                     continue
             final_score[s_project] = score
         except Exception as e:
             print(e)
             continue
     return final_score
Esempio n. 2
0
 def bellwether(self, selected_projects, all_projects):
     """Score each selected project as a predictor for ``all_projects``.

     For every project in ``selected_projects`` with at least 50 rows the
     data is passed through ``self.apply_cfs`` and ``self.apply_smote`` and a
     logistic-regression model is fitted on the result. The model is then
     evaluated on every project in ``all_projects`` that has at least 50
     rows, restricted to the columns returned by ``self.apply_cfs``.

     Args:
         selected_projects: Source project names; must expose ``.shape``
             (it is read for the progress print).
         all_projects: Iterable of destination project names.

     Returns:
         dict: ``{source_project: {destination_project: {metric: [values]}}}``
         with the metrics ``f1``, ``precision``, ``recall``, ``g-score``,
         ``d2h``, ``pci_20``, ``ifa``, ``pd`` and ``pf``. A project whose
         processing raises is reported with ``print`` and skipped.
     """
     final_score = {}
     label_map = {'buggy': True, 'clean': False}
     for s_project in selected_projects:
         try:
             print(s_project, selected_projects.shape[0])
             s_path = self.data_path + s_project
             df = self.prepare_data(s_path)
             if df.shape[0] < 50:
                 continue
             df.reset_index(drop=True, inplace=True)
             df['Buggy'] = df['Buggy'].map(label_map)
             df, s_cols = self.apply_cfs(df)
             df = self.apply_smote(df)
             y = df.Buggy
             X = df.drop(labels=['Buggy'], axis=1)
             clf = LogisticRegression()
             clf.fit(X, y)

             score = {}
             for d_project in all_projects:
                 try:
                     d_path = self.data_path + d_project
                     _test_df = self.prepare_data(d_path)
                     if _test_df.shape[0] < 50:
                         continue
                     # Reset the index before taking LOC so that LOC, the
                     # features and the labels all share the same index.
                     _test_df.reset_index(drop=True, inplace=True)
                     _df_test_loc = _test_df.LOC
                     # Explicit copy: the label column is rewritten below and
                     # must not be assigned into a selection of _test_df.
                     test_df = _test_df[s_cols].copy()
                     test_df['Buggy'] = test_df['Buggy'].map(label_map)
                     test_y = test_df.Buggy
                     test_X = test_df.drop(labels=['Buggy'], axis=1)
                     predicted = clf.predict(test_X)
                     abcd = metrices.measures(test_y, predicted, _df_test_loc)
                     # Compute the full row before storing anything so a
                     # failing metric cannot leave lists of unequal length.
                     row = {
                         'f1': abcd.calculate_f1_score(),
                         'precision': abcd.calculate_precision(),
                         'recall': abcd.calculate_recall(),
                         'g-score': abcd.get_g_score(),
                         'd2h': abcd.calculate_d2h(),
                         'pci_20': abcd.get_pci_20(),
                         'ifa': abcd.get_ifa(),
                         'pd': abcd.get_pd(),
                         'pf': abcd.get_pf(),
                     }
                     # ``score`` is keyed by destination project, so look the
                     # accumulator up by project name (not by metric name).
                     bucket = score.setdefault(
                         d_project, {name: [] for name in row})
                     for name, value in row.items():
                         bucket[name].append(value)
                 except Exception as e:
                     print("dest", d_project, e)
                     continue
             final_score[s_project] = score
         except Exception as e:
             print("src", s_project, e)
             continue
     return final_score
Esempio n. 3
0
File: RQ4.py Progetto: ai-se/GENERAL
def _new_metric_store():
    """Return an empty ``{metric name: []}`` accumulator for one model."""
    return {
        'f1': [],
        'precision': [],
        'recall': [],
        'g-score': [],
        'd2h': [],
        'pci_20': [],
        'ifa': [],
        'pd': [],
        'pf': [],
    }


def _append_measures(store, abcd):
    """Append one fold's metrics, read from ``abcd``, to ``store``."""
    # Build the complete row first so a failing metric never leaves the
    # per-metric lists with different lengths.
    row = {
        'f1': abcd.calculate_f1_score(),
        'precision': abcd.calculate_precision(),
        'recall': abcd.calculate_recall(),
        'g-score': abcd.get_g_score(),
        'd2h': abcd.calculate_d2h(),
        'pci_20': abcd.get_pci_20(),
    }
    try:
        row['ifa'] = abcd.get_ifa_roc()
    except Exception:
        # IFA is best-effort: record 0 when it cannot be computed.
        row['ifa'] = 0
    row['pd'] = abcd.get_pd()
    row['pf'] = abcd.get_pf()
    for name, value in row.items():
        store[name].append(value)


def _fit_bellwether_model(project, data_root, cache):
    """Return ``(classifier, selected columns)`` for ``project``.

    The model is trained on first use and memoised in ``cache``.
    """
    if project not in cache:
        df = prepare_data(data_root + project)
        df.reset_index(drop=True, inplace=True)
        df['Buggy'] = df['Buggy'].map({'buggy': True, 'clean': False})
        df, s_cols = apply_cfs(df)
        df = apply_smote(df)
        y = df.Buggy
        X = df.drop(labels=['Buggy'], axis=1)
        clf = LogisticRegression()
        clf.fit(X, y)
        cache[project] = (clf, s_cols)
    return cache[project]


def get_predicted(cluster_data_loc, metrices_loc, fold, data_location,
                  default_bellwether_loc):
    """Compare five predictors on every test project at cluster levels 2, 1, 0.

    For each level and each test project, the project's data is split with a
    stratified 2-fold partition and each test fold is scored by:

    * the bellwether listed for the project's predicted cluster at that level,
    * the default bellwether (row with the most ``wins`` in
      ``cdom_bellwether.csv``),
    * a model trained on the project's own training fold,
    * the "global" model,
    * a constant prediction of the most common training label.

    The median over the folds is stored per goal metric.

    Args:
        cluster_data_loc: Directory holding ``train_data.pkl``,
            ``test_data.pkl`` and ``bellwether_cdom_<level>.csv``.
        metrices_loc: Unused by this function.
        fold: Unused by this function.
        data_location: Directory the ``bellwether_<metric>.csv`` files are
            written to; created if missing.
        default_bellwether_loc: Directory holding ``cdom_bellwether.csv``.

    Returns:
        dict: ``{metric: {project: [medians]}}`` where each successfully
        processed level appends five medians in the order level bellwether,
        default bellwether, self, global, majority baseline. Projects whose
        processing raises at a level are reported with ``print`` and skipped
        for that level.
    """
    # NOTE(review): machine-specific data root kept verbatim from the
    # original code - consider turning it into configuration.
    data_root = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/'
    label_map = {'buggy': True, 'clean': False}

    # NOTE(review): read_pickle can execute arbitrary code; the pickles must
    # come from a trusted source.
    train_data = pd.read_pickle(cluster_data_loc + '/train_data.pkl')
    cluster, _cluster_tree, _max_depth = cluster_driver(train_data)

    # NOTE(review): the original loop stopped after its first iteration, so
    # the "global" model is trained on the first training project only.
    # That is preserved here - confirm whether all projects were intended.
    t_df = pd.DataFrame()
    for project in train_data.index.values.tolist()[:1]:
        t_df = pd.concat([t_df, prepare_data(data_root + project)])
    t_df.reset_index(drop=True, inplace=True)
    t_df['Buggy'] = t_df['Buggy'].map(label_map)
    t_df, g_s_cols = apply_cfs(t_df)
    t_df = apply_smote(t_df)
    train_y = t_df.Buggy
    train_X = t_df.drop(labels=['Buggy'], axis=1)
    clf_global = LogisticRegression()
    clf_global.fit(train_X, train_y)

    test_data = pd.read_pickle(cluster_data_loc + '/test_data.pkl')
    test_projects = test_data.index.values.tolist()
    goals = ['recall', 'precision', 'pf', 'pci_20', 'ifa']
    levels = [2, 1, 0]
    results = {}
    level_models = {}      # level bellwether project -> (clf, columns)
    default_models = {}    # default bellwether project -> (clf, columns)
    default_project = None  # resolved lazily, inside the guarded section
    for level in levels:
        predicted_cluster = cluster.predict(test_data, level)
        level_df = None  # bellwether table of this level, loaded lazily
        for i in range(len(predicted_cluster)):
            try:
                c_id = predicted_cluster[i]
                if level_df is None:
                    level_df = pd.read_csv(cluster_data_loc +
                                           '/bellwether_cdom_' + str(level) +
                                           '.csv')
                    if level == 1:
                        level_df.rename(columns={'Unnamed: 0': 'id'},
                                        inplace=True)
                if level == 0:
                    s_project = level_df.bellwether.values[0]
                else:
                    s_project = level_df[
                        level_df['id'] == c_id].bellwether.values[0]
                clf_bellwether, s_cols = _fit_bellwether_model(
                    s_project, data_root, level_models)

                if default_project is None:
                    default_df = pd.read_csv(default_bellwether_loc +
                                             '/cdom_bellwether.csv')
                    default_df.columns = [
                        'bellwether', 'recall', 'precision', 'pf', 'wins'
                    ]
                    default_project = default_df.bellwether.values[
                        default_df.wins.idxmax()]
                b_clf_bellwether, b_s_cols = _fit_bellwether_model(
                    default_project, data_root, default_models)

                d_project = test_projects[i]
                test_df = prepare_data(data_root + d_project)
                test_df.reset_index(drop=True, inplace=True)
                test_df['Buggy'] = test_df['Buggy'].map(label_map)
                test_y = test_df.Buggy
                test_X = test_df.drop(labels=['Buggy'], axis=1)

                F = _new_metric_store()     # level bellwether
                _F = _new_metric_store()    # self model
                b_F = _new_metric_store()   # default bellwether
                g_F = _new_metric_store()   # global model
                r_F = _new_metric_store()   # majority-label baseline

                kf = StratifiedKFold(n_splits=2)
                for train_index, test_index in kf.split(test_X, test_y):
                    fold_train = pd.concat(
                        [test_X.iloc[train_index], test_y.iloc[train_index]],
                        axis=1)
                    fold_train = apply_smote(fold_train)
                    y_train = fold_train.Buggy
                    X_train = fold_train.drop(labels=['Buggy'], axis=1)
                    clf_self = LogisticRegression()
                    clf_self.fit(X_train, y_train)

                    fold_test = pd.concat(
                        [test_X.iloc[test_index], test_y.iloc[test_index]],
                        axis=1)
                    _df_test_loc = fold_test.LOC

                    # Level bellwether, on the columns it was trained with.
                    view = fold_test[s_cols].copy()
                    predicted_bellwether = clf_bellwether.predict(
                        view.drop(labels=['Buggy'], axis=1))
                    _append_measures(
                        F,
                        metrices.measures(view.Buggy, predicted_bellwether,
                                          _df_test_loc))

                    # Self model: a failure here is recorded as an all-zero
                    # fold instead of discarding the whole project.
                    try:
                        view = fold_test.copy()
                        predicted_self = clf_self.predict(
                            view.drop(labels=['Buggy'], axis=1))
                        _append_measures(
                            _F,
                            metrices.measures(view.Buggy, predicted_self,
                                              _df_test_loc))
                    except Exception:
                        for values in _F.values():
                            values.append(0)

                    # Default bellwether.
                    view = fold_test[b_s_cols].copy()
                    predicted_bell0 = b_clf_bellwether.predict(
                        view.drop(labels=['Buggy'], axis=1))
                    _append_measures(
                        b_F,
                        metrices.measures(view.Buggy, predicted_bell0,
                                          _df_test_loc))

                    # Global model.
                    view = fold_test[g_s_cols].copy()
                    predicted_global = clf_global.predict(
                        view.drop(labels=['Buggy'], axis=1))
                    _append_measures(
                        g_F,
                        metrices.measures(view.Buggy, predicted_global,
                                          _df_test_loc))

                    # Baseline: always predict the most common training label.
                    view = fold_test.copy()
                    _count_major = Counter(y_train).most_common(1)[0][0]
                    predicted_random = [_count_major] * view.shape[0]
                    _append_measures(
                        r_F,
                        metrices.measures(view.Buggy, predicted_random,
                                          _df_test_loc))

                for goal in goals:
                    _goal = 'g-score' if goal == 'g' else goal
                    per_project = results.setdefault(_goal, {})
                    medians = per_project.setdefault(d_project, [])
                    # Order must match the column layout used below.
                    for store in (F, b_F, _F, g_F, r_F):
                        medians.append(np.median(store[_goal]))
            except Exception as e:
                print(e)
                continue
    # NOTE(review): values are appended positionally per level, so a project
    # that failed at one level ends up with fewer than 15 values and its
    # remaining values shift into the wrong columns - verify downstream use.
    _cols = [
        'level2_bellwether', 'default_bellwether', 'self', 'global', 'random',
        'level1_bellwether', 'default_bellwether', 'self', 'global', 'random',
        'level0_bellwether', 'default_bellwether', 'self', 'global', 'random'
    ]
    print(results)
    for key in results:
        df = pd.DataFrame.from_dict(results[key],
                                    orient='index',
                                    columns=_cols)
        # Create the output directory on demand, then always write the file
        # (writing must not depend on whether the directory already existed).
        os.makedirs(data_location, exist_ok=True)
        df.to_csv(data_location + '/bellwether_' + key + '.csv')
    return results