def discoverMore(datasets, Runtimes, clf_index=2):
    """
    Identify Bellwether Datasets for more projects (pooled test set).

    Each dataset is used in turn as the training data (data1) while all the
    remaining datasets are concatenated into one test set (data2). The
    dataset whose ``Score`` value is the largest is returned as the
    bellwether.

    :param datasets: type: list, each element is the absolute path of a .csv
    :param Runtimes: passed through unchanged to ``Score``
    :param clf_index: passed through unchanged to ``Score``
    :return: tuple ``(bellwether, bellwether_path)``: the result of
        ``NumericStringLabel2BinaryLabel`` for the best-scoring path, and
        that path itself
    :raises ValueError: if ``datasets`` is empty

    For example, 3 projects
    1 as data1;2+3 as data2 --> 1v(2+3)--score1--->mean1 = (score1).mean ;
    2 as data1;1+3 as data2 --> 2v(1+3)--score2--->mean2 = (score2).mean;
    3 as data1;1+2 as data2 --> 3v(1+2)--score3 --->mean3 = (score3).mean ;
    max(mean1, mean2, mean3)=x -->x project of(1,2,3) projects is a bellwether dataset.
    """
    if not datasets:
        raise ValueError("datasets must contain at least one .csv path")

    score_values = []  # score obtained with each dataset used as data1
    for file1 in datasets:
        data1 = NumericStringLabel2BinaryLabel(
            file1)  # train data:Binary Label
        head = data1.columns

        # Collect every other dataset and concatenate once, instead of
        # repeatedly growing a DataFrame inside the loop.
        test_frames = []
        for file2 in datasets:
            if file1 != file2:
                data_Temp = NumericStringLabel2BinaryLabel(
                    file2)  # test data, type: DataFrame
                data_Temp.columns = head  # keep the columns name same
                test_frames.append(data_Temp)
        # With no other dataset, fall back to an empty frame as before.
        if test_frames:
            data2 = pd.concat(test_frames, sort=False)
        else:
            data2 = pd.DataFrame()

        data1_score_mean = Score(data1, data2, Runtimes, clf_index)
        # Record the score; without this the max() below has nothing to rank.
        score_values.append(data1_score_mean)
        # print("train:", file1, data1_score_mean)

    dataset_index = score_values.index(max(score_values))
    bellwether_path = datasets[dataset_index]
    bellwether = NumericStringLabel2BinaryLabel(
        bellwether_path)  # type, dataframe
    return bellwether, bellwether_path
def discover(datasets, Runtimes, clf_index=2):
    """
    Identify Bellwether Datasets (pairwise test sets).

    Each dataset is used in turn as the training data (data1) and scored
    against every other dataset individually (data2). The per-pair values
    returned by ``Score`` are averaged over ``len(datasets) - 1``, and the
    dataset with the largest average is returned as the bellwether.

    :param datasets: type: list, each element is the absolute path of a .csv
    :param Runtimes: passed through unchanged to ``Score``
    :param clf_index: passed through unchanged to ``Score``
    :return: tuple ``(bellwether, bellwether_path)``: the result of
        ``NumericStringLabel2BinaryLabel`` for the best-scoring path, and
        that path itself. With a single dataset there is nothing to compare
        against, so that dataset is returned with an average of 0.
    :raises ValueError: if ``datasets`` is empty

    For example, 3 projects
    1 as data1;2&3 as data2, respectively. --> 1v2-score12,1v3--score13 --->mean1 = (score12+score13).mean ;
    2 as data1;1&3 as data2, respectively. --> 2v1-score21,2v3--score23 --->mean2 = (score21+score23).mean;
    3 as data1;1&2 as data2, respectively. --> 3v1-score31,3v2--score32 --->mean3 = (score31+score32).mean ;
    max(mean1, mean2, mean3)=x -->x project of(1,2,3) projects is a bellwether dataset.
    """
    if not datasets:
        raise ValueError("datasets must contain at least one .csv path")

    other_count = len(datasets) - 1  # number of test sets per training set
    score_values = []  # mean score obtained with each dataset used as data1
    for file1 in datasets:
        data1 = NumericStringLabel2BinaryLabel(
            file1)  # train data:Binary Label
        data1_score_mean = 0  # running sum, turned into a mean below
        for file2 in datasets:
            if file1 != file2:
                data2 = NumericStringLabel2BinaryLabel(file2)  # test data
                data1_score_mean += Score(data1, data2, Runtimes, clf_index)
        # Guard the single-dataset case, where the divisor would be zero.
        if other_count > 0:
            data1_score_mean = data1_score_mean / other_count
        # Record the mean; without this the max() below has nothing to rank.
        score_values.append(data1_score_mean)
        # print("train:", file1, data1_score_mean)

    dataset_index = score_values.index(max(score_values))
    bellwether_path = datasets[dataset_index]
    bellwether = NumericStringLabel2BinaryLabel(bellwether_path)
    return bellwether, bellwether_path
# Run the 'M2O_CPDP' experiment with the weighted-iForest + label-information
# filter. For every test file in every dataset group, and for every run r, it
# pools the other files of the same group as training data, samples it, drops
# duplicates, z-scores the features, filters the training set with
# detect_anomalies_improvedplusLabel, builds the classifier chosen by
# clf_index, and collects the evaluation measures.
#
# mode is indexed as [iForest_parameters, train_mode, save_file_name];
# iForest_parameters is indexed as [itree_num, subsamples, hlim, mode, s0, alpha].
# Returns df_file_measures, the concatenated measures of all files and runs.
#
# NOTE(review): this definition looks truncated in several places (the
# signature is not closed, and some statements/brackets are missing). The name
# `runtimes` is used in the body but is not among the visible parameters --
# presumably it is the missing last parameter; confirm against the original.
def WeightedIsolationForestwithLabelinformationFilter(mode, clf_index,
    pwd = os.getcwd()
    father_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + ".")
    # print(father_path)
    datapath = father_path + '/dataset-inOne/'
    spath = father_path + '/results/'
    if not os.path.exists(spath):
    # NOTE(review): the body of this `if` is missing here -- presumably it
    # creates the results directory; confirm.

    # example datasets for test
    # datasets = [['ant-1.3.csv', 'arc-1.csv', 'camel-1.0.csv'], ['Apache.csv', 'Safe.csv', 'Zxing.csv']]
    # datasets = [['Apache.csv', 'Safe.csv', 'Zxing.csv']]

    # # datasets for RQ1
    datasets = [[
        'ant-1.7.csv', 'arc-1.csv', 'camel-1.6.csv', 'ivy-2.0.csv',
        'jedit-4.3.csv', 'log4j-1.1.csv', 'lucene-2.0.csv', 'poi-2.0.csv',
        'redaktor-1.csv', 'synapse-1.2.csv', 'tomcat-6.0.389418.csv',
        'velocity-1.6.csv', 'xalan-2.6.csv', 'xerces-1.3.csv'
    # NOTE(review): the list literal above is never closed in the visible
    # code; any further groups and the closing brackets are missing.

    # total number of files over all groups, used only for progress output
    datanum = 0
    for i in range(len(datasets)):
        datanum = datanum + len(datasets[i])
    # print(datanum)
    # mode = [preprocess_mode, train_mode, save_file_name]
    # preprocess_mode = mode[0]
    iForest_parameters = mode[0]
    train_mode = mode[1]
    save_file_name = mode[2]

    df_file_measures = pd.DataFrame(
    )  # the measures of all files in all runtimes
    classifiername = []
    # file_list = os.listdir(fpath)

    n = 0  # running count of processed test files
    for i in range(len(datasets)):
        for file_te in datasets[i]:
            n = n + 1
            print('----------%s:%d/%d------' % ('Dataset', n, datanum))
            print('testfile:', file_te)
            start_time = time.time()

            Address_te = datapath + file_te
            Samples_te = NumericStringLabel2BinaryLabel(
                Address_te)  # DataFrame
            data = Samples_te.values  # DataFrame2Array
            X = data[:, :-1]  # test features
            y = data[:, -1]  # test labels
            # the test file is loaded a second time only to read its column names
            Sample_tr0 = NumericStringLabel2BinaryLabel(Address_te)
            column_name = Sample_tr0.columns.values  # the column name of the data

            df_r_measures = pd.DataFrame(
            )  # the measures of each file in runtimes
            for r in range(runtimes):
                if train_mode == 'M2O_CPDP':  # train data contains more files different from the test data project
                    X_test = X
                    y_test = y
                    Samples_tr_all = pd.DataFrame(
                    )  # initialize the candidate training data of all cp data
                    for file_tr in datasets[i]:
                        if file_tr != file_te:
                            print('train_file:', file_tr)
                            Address_tr = datapath + file_tr
                            Samples_tr = NumericStringLabel2BinaryLabel(
                                Address_tr)  # original train data, DataFrame
                            Samples_tr.columns = column_name.tolist()  # rename all columns to match the test file
                            Samples_tr_all = pd.concat(
                                [Samples_tr_all, Samples_tr],
                    # NOTE(review): the pd.concat call above is not closed in
                    # the visible code; its remaining arguments are missing.
                    # Samples_tr_all.to_csv(f2, index=None, columns=None)  # save the data with binarized class labels, keeping column names, without adding a row index

                    # /* step1: data preprocessing */
                    Sample_tr_pos, Sample_tr_neg, Sample_pos_index, Sample_neg_index \
                        = Random_Stratified_Sample_fraction(Samples_tr_all, string='bug',
                                                            r=r)  # random sample 90% negative samples and 90% positive samples
                    Sample_tr = np.concatenate((Sample_tr_neg, Sample_tr_pos),
                                               axis=0)  # stack the arrays vertically
                    data_train = pd.DataFrame(Sample_tr)
                    data_train_unique = Drop_Duplicate_Samples(
                        data_train)  # drop duplicate samples
                    data_train_unique = data_train_unique.values
                    X_train = data_train_unique[:, :-1]
                    y_train = data_train_unique[:, -1]

                    # /* step2: Zscore_SMOTE_WiFLF_CPDP(smote+weighted iForest Filter+label information) */
                    # *******************Zscore_SMOTE_WiFLF_CPDP*********************************
                    method_name = mode[1] + '_' + mode[
                        2]  # scenario + filter method
                    print('----------%s:%d/%d------' %
                          (method_name, r + 1, runtimes))
                    X_train_zscore, X_test_zscore = Standard_Features(
                        X_train, 'zscore', X_test)  # data transformation
                    data_train = pd.DataFrame(np.c_[X_train_zscore, y_train])

                    # iForest_parameters = [itree_num, subsamples, hlim, mode, s0, alpha]
                    itree_num = iForest_parameters[0]
                    subsamples = iForest_parameters[1]
                    hlim = iForest_parameters[2]
                    mode_if = iForest_parameters[3]
                    s0 = iForest_parameters[4]
                    alpha = iForest_parameters[5]
                    X_train_new_idx, X_train_new, y_train_new = detect_anomalies_improvedplusLabel(
                        random_state=r + 1,
                    # NOTE(review): the call above is not closed and only
                    # `random_state` is visible; the other arguments (presumably
                    # data_train and the iForest parameters unpacked above) are
                    # missing -- confirm against the original.
                    # print("X_train_new:", X_train_new_idx, X_train_new, y_train_new)
                    # /* step3: Model building and evaluation */
                    # Train model: the selected classifier requires the labels to belong to {0, 1}.
                    modelname_wiffl, model_wiffl = Selection_Classifications(
                        clf_index, r)  # select classifier
                    # print("modelname_wiffl:", modelname_wiffl)
                    measures_wiffl = Build_Evaluation_Classification_Model(
                        model_wiffl, X_train_new, y_train_new, X_test_zscore,
                        y_test)  # build and evaluate models
                    end_time = time.time()
                    # elapsed time since this test file started (start_time is
                    # set once per test file, not once per run)
                    run_time = end_time - start_time
                    # NOTE(review): the key/value lines below have no opening
                    # statement or closing bracket in the visible code;
                    # presumably they extend measures_wiffl -- confirm.
                        'train_len_before': len(X_train),
                        'train_len_after': len(X_train_new),
                        'test_len': len(X_test),
                        'runtime': run_time,
                        'clfindex': clf_index,
                        'clfname': modelname_wiffl,
                        'testfile': file_te,
                        'trainfile': 'More1',
                        'runtimes': r + 1
                    df_m2ocp_measures = pd.DataFrame(measures_wiffl, index=[r])
                    # print('df_m2ocp_measures:\n', df_m2ocp_measures)
                    df_r_measures = pd.concat(
                        [df_r_measures, df_m2ocp_measures],
                    # NOTE(review): the pd.concat call above is not closed in
                    # the visible code.

            df_file_measures = pd.concat(
                [df_file_measures, df_r_measures],
                ignore_index=False)  # the measures of all files in runtimes

    # NOTE(review): nothing visible appends to `classifiername`, so
    # modelname[0] below would fail on an empty array; `pathname` is also not
    # used afterwards in the visible code -- presumably lines that record the
    # classifier name and save the results are missing.
    modelname = np.unique(classifiername)
    # pathname = spath + '\\' + (save_file_name + '_clf' + str(clf_index) + '.csv')
    pathname = spath + '\\' + (save_file_name + '_' + modelname[0] + '.csv')
    # print('df_file_measures:\n', df_file_measures)
    return df_file_measures
# Run the 'M2O_CPDP' experiment with the HISNN filter. For every test file in
# every dataset group, and for every run r in range(runtimes), it pools the
# other files of the same group as training data, samples it, drops
# duplicates, z-scores the features, calls HISNN_train on the source/target
# arrays, selects the classifier given by clf_index, evaluates with
# HISNN_test, and collects the measures.
#
# mode is indexed as [preprocess_mode, train_mode, save_file_name].
# Returns df_file_measures, the concatenated measures of all files and runs.
#
# NOTE(review): several statements/brackets appear to be missing inside this
# function (see the notes at each gap); confirm against the original file.
def HISNN_main(mode, clf_index, runtimes):

    pwd = os.getcwd()
    father_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + ".")
    # print(father_path)
    datapath = father_path + '/dataset-inOne/'
    spath = father_path + '/results/'
    if not os.path.exists(spath):
    # NOTE(review): the body of this `if` is missing here -- presumably it
    # creates the results directory; confirm.

    # datasets for RQ2
    datasets = [[
        'ant-1.3.csv', 'arc-1.csv', 'camel-1.0.csv', 'ivy-1.4.csv',
        'jedit-3.2.csv', 'log4j-1.0.csv', 'lucene-2.0.csv', 'poi-2.0.csv',
        'redaktor-1.csv', 'synapse-1.0.csv', 'tomcat-6.0.389418.csv',
        'velocity-1.6.csv', 'xalan-2.4.csv', 'xerces-init.csv'
        # NOTE(review): there is no comma or closing bracket between
        # 'xerces-init.csv' and 'ant-1.7.csv'; it looks like two separate
        # dataset definitions were merged and some lines were lost -- confirm.
                    'ant-1.7.csv', 'arc-1.csv', 'camel-1.6.csv', 'ivy-2.0.csv',
                    'jedit-4.3.csv', 'log4j-1.1.csv', 'lucene-2.0.csv',
                    'poi-2.0.csv', 'redaktor-1.csv', 'synapse-1.2.csv',
                    'tomcat-6.0.389418.csv', 'velocity-1.6.csv',
                    'xalan-2.6.csv', 'xerces-1.3.csv'
                ], ['EQ.csv', 'JDT.csv', 'LC.csv', 'ML.csv', 'PDE.csv'],
                ['Apache.csv', 'Safe.csv', 'Zxing.csv']]

    # datasets for example
    # datasets = [['ant-1.3.csv', 'arc-1.csv', 'camel-1.0.csv'], ['Apache.csv', 'Safe.csv', 'Zxing.csv']]

    # total number of files over all groups, used only for progress output
    datanum = 0
    for i in range(len(datasets)):
        datanum = datanum + len(datasets[i])
    # print(datanum)
    # mode = [preprocess_mode, train_mode, save_file_name]
    preprocess_mode = mode[0]  # not read again in the visible code
    train_mode = mode[1]
    save_file_name = mode[2]
    df_file_measures = pd.DataFrame(
    )  # the measures of all files in all runtimes
    classifiername = []
    # file_list = os.listdir(fpath)

    n = 0  # running count of processed test files
    for i in range(len(datasets)):
        for file_te in datasets[i]:
            n = n + 1
            print('----------%s:%d/%d------' % ('Dataset', n, datanum))
            # print('testfile', file_te)
            start_time = time.time()

            Address_te = datapath + file_te
            Samples_te = NumericStringLabel2BinaryLabel(
                Address_te)  # DataFrame
            data = Samples_te.values  # DataFrame2Array
            X = data[:, :-1]  # test features
            y = data[:, -1]  # test labels
            # the test file is loaded a second time only to read its column names
            Sample_tr0 = NumericStringLabel2BinaryLabel(Address_te)
            column_name = Sample_tr0.columns.values  # the column name of the data

            df_r_measures = pd.DataFrame(
            )  # the measures of each file in runtimes
            for r in range(runtimes):
                if train_mode == 'M2O_CPDP':  # train data contains more files different from the test data project
                    X_test = X
                    y_test = y
                    Samples_tr_all = pd.DataFrame(
                    )  # initialize the candidate training data of all cp data
                    for file_tr in datasets[i]:
                        if file_tr != file_te:
                            # print('train_file:', file_tr)
                            Address_tr = datapath + file_tr
                            Samples_tr = NumericStringLabel2BinaryLabel(
                                Address_tr)  # original train data, DataFrame
                            Samples_tr.columns = column_name.tolist()  # rename all columns to match the test file
                            Samples_tr_all = pd.concat(
                                [Samples_tr_all, Samples_tr],
                    # NOTE(review): the pd.concat call above is not closed in
                    # the visible code; its remaining arguments are missing.
                    # Samples_tr_all.to_csv(f2, index=None, columns=None)  # save the data with binarized class labels, keeping column names, without adding a row index

                    # /* step1: data preprocessing */
                    Sample_tr_pos, Sample_tr_neg, Sample_pos_index, Sample_neg_index \
                                = Random_Stratified_Sample_fraction(Samples_tr_all, string='bug', r=r)  # random sample 90% negative samples and 90% positive samples
                    Sample_tr = np.concatenate((Sample_tr_neg, Sample_tr_pos),
                                               axis=0)  # stack the arrays vertically
                    data_train = pd.DataFrame(Sample_tr)
                    data_train_unique = Drop_Duplicate_Samples(
                        data_train)  # drop duplicate samples
                    data_train_unique = data_train_unique.values
                    X_train = data_train_unique[:, :-1]
                    y_train = data_train_unique[:, -1]
                    X_train_zscore, X_test_zscore = Standard_Features(
                        X_train, 'zscore', X_test)  # data transformation
                    # source = np.concatenate((X_train_zscore, y_train), axis=0)  # stack the arrays vertically
                    # append the label column to the standardized features
                    source = np.c_[X_train_zscore, y_train]
                    target = np.c_[X_test_zscore, y_test]

                    # /* step2: data filtering */
                    # *******************HISNN*********************************
                    method_name = mode[1] + '_' + mode[
                        2]  # scenario + filter method
                    print('----------%s:%d/%d------' %
                          (method_name, r + 1, runtimes))
                    source_HISNN = HISNN_train(source, target, K=10)

                    # /* step3: Model building and evaluation */
                    # Train model: LR as classifier; the model requires the labels to belong to {0, 1}.
                    modelname_hisnn, model_hisnn = Selection_Classifications(
                        clf_index, r)  # select classifier
                    # print("modelname_hisnn:", modelname_hisnn)
                    # NOTE(review): the start of the next statement is missing;
                    # presumably it fits model_hisnn on the features and labels
                    # of source_HISNN -- confirm against the original.
          [:, :-1],
                                    source_HISNN[:, -1])  # build models
                    # NOTE(review): the unfiltered `source` (not source_HISNN)
                    # is passed here and reported as 'train_len_after' below;
                    # confirm this is intended.
                    measures_hisnn = HISNN_test(
                        model_hisnn, target, source
                    )  # measures_hisnn = HISNN_test(model, target_log, source_log)  # evaluate models
                    end_time = time.time()
                    # elapsed time since this test file started (start_time is
                    # set once per test file, not once per run)
                    run_time = end_time - start_time
                    # NOTE(review): the key/value lines below have no opening
                    # statement or closing bracket in the visible code;
                    # presumably they extend measures_hisnn -- confirm.
                        'train_len_before': len(X_train),
                        'train_len_after': len(source),
                        'test_len': len(target),
                        'runtime': run_time,
                        'clfindex': clf_index,
                        'clfname': modelname_hisnn,
                        'testfile': file_te,
                        'trainfile': 'More1',
                        'runtimes': r + 1
                    df_m2ocp_measures = pd.DataFrame(measures_hisnn, index=[r])
                    # print('df_m2ocp_measures:\n', df_m2ocp_measures)
                    df_r_measures = pd.concat(
                        [df_r_measures, df_m2ocp_measures],
                    # NOTE(review): the pd.concat call above is not closed in
                    # the visible code.

            # print('df_file_measures:\n', df_file_measures)
            # print('results of one run over all files:\n', df_file_measures)
            df_file_measures = pd.concat(
                [df_file_measures, df_r_measures],
                ignore_index=False)  # the measures of all files in runtimes
            # df_r_measures['testfile'] = file_list
            # print('df_r_measures1:\n', df_r_measures)

    # NOTE(review): nothing visible appends to `classifiername`, so
    # modelname[0] below would fail on an empty array; `pathname` is also not
    # used afterwards in the visible code -- presumably lines that record the
    # classifier name and save the results are missing.
    modelname = np.unique(classifiername)
    # pathname = spath + '\\' + (save_file_name + '_clf' + str(clf_index) + '.csv')
    pathname = spath + '\\' + (save_file_name + '_' + modelname[0] + '.csv')
    # print('df_file_measures:\n', df_file_measures)
    return df_file_measures
def _test_SMOTE_Data():
    """Manual smoke check for ``SMOTE_Data``.

    Loads 'Apache.csv' from a hard-coded Windows directory with
    ``NumericStringLabel2BinaryLabel``, calls ``SMOTE_Data`` with ``r=0`` and
    prints the two objects it returns.
    """
    data_dir = r'D:\PycharmProjects\cuican\data2020\Relink\data'
    csv_path = "\\".join((data_dir, 'Apache.csv'))
    frame = NumericStringLabel2BinaryLabel(csv_path)
    features, labels = SMOTE_Data(frame, r=0)
    print(features, labels)