Esempio n. 1
0
def _test(res=''):
    """Run the epsilon-guided random search on one UCI dataset.

    For every seed ``mn`` and every epsilon in ``e_value`` the search draws
    200 (scaler, learner) configurations, evaluates each one with
    ``run_model`` on a positional 80/20 split of the data and tracks the
    minimum ``d2h`` score seen so far.  The area under that best-so-far
    curve is stored per epsilon.

    The result dictionary maps each epsilon to its list of AUC values (one
    per seed) and additionally holds:

    * ``"temp"``: per epsilon, the best-so-far curve of the last seed,
    * ``"time"``: wall-clock duration in seconds,
    * ``"counter_full"``: per epsilon and step, the best-so-far value of
      every seed,
    * ``"settings"``: per step, the configuration key drawn under the first
      epsilon for every seed.

    Args:
        res: Dataset name.  Used as key into ``file_inc``, as the CSV stem
            under ``../../data/UCI/`` and in the name of the output pickle.

    Returns:
        None.  The result dictionary is printed and pickled to
        ``dump/d2h_<res>.pickle``.
    """
    raw_data, labels = readfile("../../data/UCI/" + res + ".csv")

    metric = "d2h"
    final = {}
    final_auc = {}
    e_value = [0.05, 0.1, 0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}

    for mn in range(500 + file_inc[res] * 10, 521 + file_inc[res] * 10):
        for e in e_value:
            # Seeding is repeated for every epsilon, so all epsilons of one
            # run start from the same seed value.
            np.random.seed(mn)
            seed(mn)
            preprocess = [
                standard_scaler, minmax_scaler,
                maxabs_scaler, [robust_scaler] * 20, kernel_centerer,
                [quantile_transform] * 200, normalizer, [binarize] * 100
            ]  # ,[polynomial]*5
            MLs = [NB, [KNN] * 20, [RF] * 50, [DT] * 30,
                   [LR] * 50]  # [SVM]*100,
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)

            if e not in final_auc:
                final_auc[e] = []
                dic[e] = {}

            # Configuration key -> [scaler, model], and key -> status where
            # 0 means "not evaluated yet".
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            for make_scaler, make_model in product(preprocess_list, MLs_list):
                scaler, tmp1 = make_scaler()
                model, tmp2 = make_model()
                string1 = tmp1 + "|" + tmp2
                func_str_dic[string1] = [scaler, model]
                func_str_counter_dic[string1] = 0

            for counter in range(200):
                if counter not in dic_func:
                    dic_func[counter] = []

                # Only configurations that were never evaluated are eligible.
                keys = [k for k, v in func_str_counter_dic.items() if v == 0]
                key = _randchoice(keys)

                cut_off = int(len(raw_data) * 0.8)
                scaler, model = func_str_dic[key]
                df1 = transform(raw_data, scaler)
                df1.loc[:, "bug"] = labels

                train_data = df1.iloc[:cut_off, :]
                test_data = df1.iloc[cut_off:, :]
                measurement = run_model(train_data,
                                        test_data,
                                        model,
                                        metric,
                                        training=-1)

                # A result counts as new only when it differs from every
                # previously accepted result by more than epsilon.
                if all(abs(t - measurement) > e for t in lis_value):
                    lis_value.append(measurement)
                    func_str_counter_dic[key] += 1
                else:
                    func_str_counter_dic[key] -= 1

                if counter not in dic[e]:
                    dic[e][counter] = []
                # The settings are logged for the first epsilon only.  The
                # list must not be reset here: doing so for later epsilons
                # discarded the entries logged during the first seed.
                if e == e_value[0]:
                    dic_func[counter].append(key)
                best_so_far = min(lis_value)
                dic[e][counter].append(best_so_far)
                dic_auc[counter] = best_so_far

            # auc() needs real sequences; a dict view is not array-like.
            curve = [dic_auc[step] for step in sorted(dic_auc)]
            area_under_curve = round(auc(list(range(len(curve))), curve), 3)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)
    with open('dump/d2h_' + res + '.pickle', 'wb') as handle:
        pickle.dump(final_auc, handle)
Esempio n. 2
0
def _test(res=''):
    """Run the epsilon-guided random search on one text-mining dataset.

    Eleven repetitions are performed for the single epsilon 0.2.  Each
    repetition draws 40 (vectorizer, scaler, learner) configurations,
    scores them with ``run_model`` using the ``Pf_Auc`` metric on a
    positional 80/20 split and records the minimum score seen so far.
    The area under that best-so-far curve is collected per epsilon.

    Args:
        res: Dataset name.  Used as key into ``file_inc`` and as the file
            stem under ``../../data/textmining/``.

    Returns:
        None.  The result dictionary is printed.
    """
    raw_data, labels = readfile("../../data/textmining/" + res + ".txt")

    metric = "Pf_Auc"
    final = {}
    final_auc = {}
    e_value = [0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}
    first_run = 500 + file_inc[res] * 10
    for mn in range(first_run, first_run + 11):
        for e in e_value:
            transformation = [[TF] * 30, [TFIDF] * 30, [HASHING] * 8,
                              [LDA_] * 50]
            preprocess = [no_transformation]
            MLs = [NB, [KNN] * 20, [RF] * 50, [DT] * 30, [LR] * 50]
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)
            trans_list = unpack(transformation)

            if e not in final_auc:
                final_auc[e] = []
                dic[e] = {}

            # Configuration key -> [vectorizer, scaler, model], and
            # key -> status where 0 means "not evaluated yet".
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            candidates = product(trans_list, preprocess_list, MLs_list)
            for make_trans, make_scaler, make_model in candidates:
                trans, tmp = make_trans()
                scaler, tmp1 = make_scaler()
                model, tmp2 = make_model()

                label = tmp + "|" + tmp1 + "|" + tmp2
                func_str_dic[label] = [trans, scaler, model]
                func_str_counter_dic[label] = 0

            for counter in range(40):
                print(counter)
                dic_func.setdefault(counter, [])

                # Draw one configuration that has not been evaluated yet.
                untried = [
                    name for name, status in func_str_counter_dic.items()
                    if status == 0
                ]
                key = _randchoice(untried)

                cut_off = int(len(raw_data) * 0.8)
                vector, scaler, model = func_str_dic[key]
                df = extraction(raw_data, vector)
                df1 = transform(df, scaler)
                df1.loc[:, "bug"] = labels

                train_data = df1.iloc[:cut_off, :]
                test_data = df1.iloc[cut_off:, :]
                measurement = run_model(train_data,
                                        test_data,
                                        model,
                                        metric,
                                        training=-1)

                # Accept the score only if it is more than epsilon away
                # from every score accepted before.
                if all(abs(t - measurement) > e for t in lis_value):
                    lis_value.append(measurement)
                    func_str_counter_dic[key] += 1
                else:
                    func_str_counter_dic[key] -= 1

                per_step = dic[e]
                if counter not in per_step:
                    per_step[counter] = []
                    dic_func[counter] = []
                dic_func[counter].append(key)
                best_so_far = min(lis_value)
                per_step[counter].append(best_so_far)
                dic_auc[counter] = best_so_far

            # Best-so-far values ordered by step index.
            curve = [
                value
                for _, value in sorted(dic_auc.items(), key=itemgetter(0))
            ]
            area_under_curve = round(auc(list(range(len(curve))), curve), 3)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)
Esempio n. 3
0
def _test(res=''):
    """Run the epsilon-guided random search on one defect dataset.

    All files listed in ``file_dic[res]`` except the last are concatenated
    into the training set; the last file is the test set.  The first three
    columns are dropped and ``bug`` is reduced to 0/1.  For every seed
    ``mn`` and every epsilon in ``e_value`` the search keeps drawing
    (scaler, learner) configurations until 1000 of them were evaluated
    successfully, tracking the minimum ``d2h`` score seen so far.  The area
    under that best-so-far curve is stored per epsilon.

    The result dictionary maps each epsilon to its list of AUC values (one
    per seed) and additionally holds ``"temp"`` (best-so-far curve of the
    last seed per epsilon), ``"time"`` (seconds), ``"counter_full"`` (per
    epsilon and step, the best-so-far value of every seed) and
    ``"settings"`` (per step, the configuration key drawn under the first
    epsilon for every seed).

    Args:
        res: Dataset name.  Used as key into ``file_dic`` and ``file_inc``
            and in the name of the output pickle.

    Returns:
        None.  The result dictionary is printed and pickled to
        ``../../dump/d2h_<res>.pickle``.
    """
    paths = [os.path.join(data_path, file_name) for file_name in file_dic[res]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]],
                         ignore_index=True)
    test_df = pd.read_csv(paths[-1])

    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    # count() is the number of non-missing "bug" values; it is used below
    # as the row boundary between training and test data.
    # NOTE(review): assumes "bug" has no missing values - confirm.
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)

    metric = "d2h"
    final = {}
    final_auc = {}
    e_value = [0.025, 0.05, 0.1, 0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}
    for mn in range(500 + file_inc[res] * 10, 521 + file_inc[res] * 10):
        for e in e_value:
            # Seeding is repeated for every epsilon, so all epsilons of one
            # run start from the same seed value.
            np.random.seed(mn)
            seed(mn)
            preprocess = [
                standard_scaler, minmax_scaler, maxabs_scaler,
                [robust_scaler] * 20, kernel_centerer,
                [quantile_transform] * 200, normalizer, [binarize] * 100
            ]  # ,[polynomial]*5
            MLs = [NB, [KNN] * 20, [RF] * 50, [DT] * 30,
                   [LR] * 50]  # [SVM]*100,
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)

            if e not in final_auc:
                final_auc[e] = []
                dic[e] = {}

            # Configuration key -> [scaler, model], and key -> status where
            # 0 means "not evaluated yet".
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            for make_scaler, make_model in product(preprocess_list, MLs_list):
                scaler, tmp1 = make_scaler()
                model, tmp2 = make_model()
                string1 = tmp1 + "|" + tmp2
                func_str_dic[string1] = [scaler, model]
                func_str_counter_dic[string1] = 0

            counter = 0
            while counter != 1000:
                if counter not in dic_func:
                    dic_func[counter] = []

                # Only configurations that were never evaluated are eligible.
                keys = [k for k, v in func_str_counter_dic.items() if v == 0]
                key = _randchoice(keys)
                scaler, model = func_str_dic[key]

                # Only the evaluation itself is best-effort: a failing
                # configuration is reported and another one is drawn without
                # advancing the step counter.  KeyboardInterrupt and
                # SystemExit are deliberately not caught.
                try:
                    df1 = transform(df, scaler)
                    train_data = df1.iloc[:train_size, :]
                    test_data = df1.iloc[train_size:, :]
                    measurement = run_model(train_data, test_data, model,
                                            metric, training=-2)
                except Exception as err:
                    print("skipping %s: %r" % (key, err))
                    continue

                # A result counts as new only when it differs from every
                # previously accepted result by more than epsilon.
                if all(abs(t - measurement) > e for t in lis_value):
                    lis_value.append(measurement)
                    func_str_counter_dic[key] += 1
                else:
                    func_str_counter_dic[key] -= 1

                if counter not in dic[e]:
                    dic[e][counter] = []
                # The settings are logged for the first epsilon only.  The
                # list must not be reset here: doing so for later epsilons
                # discarded the entries logged during the first seed.
                if e == e_value[0]:
                    dic_func[counter].append(key)
                best_so_far = min(lis_value)
                dic[e][counter].append(best_so_far)
                dic_auc[counter] = best_so_far

                counter += 1

            # auc() needs real sequences; a dict view is not array-like.
            curve = [dic_auc[step] for step in sorted(dic_auc)]
            area_under_curve = round(auc(list(range(len(curve))), curve), 3)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)
    with open('../../dump/d2h_' + res + '.pickle', 'wb') as handle:
        pickle.dump(final_auc, handle)
Esempio n. 4
0
def _test(res=''):
    """Run the epsilon-guided random search with DeepLearner on defect data.

    All files listed in ``file_dic[res]`` except the last are concatenated
    into the training set; the last file is the test set.  The first three
    columns are dropped and ``bug`` is reduced to 0/1.  Twenty repetitions
    are performed for the single epsilon 0.2 (``mn`` is not used inside the
    loop body).  Each repetition evaluates 30 (scaler, DeepLearner)
    configurations with the ``popt20`` metric and tracks the maximum score
    seen so far; the area under that best-so-far curve is collected.

    The result dictionary maps the epsilon to its list of AUC values and
    additionally holds ``"temp"`` (best-so-far curve of the last
    repetition), ``"time"`` (seconds), ``"counter_full"`` (per step, the
    best-so-far value of every repetition) and ``"settings"`` (per step,
    the configuration key drawn in every repetition).

    Args:
        res: Dataset name.  Used as key into ``file_dic`` and ``file_inc``.

    Returns:
        None.  Each AUC and the final result dictionary are printed.
    """
    paths = [os.path.join(data_path, file_name) for file_name in file_dic[res]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]],
                         ignore_index=True)
    test_df = pd.read_csv(paths[-1])

    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    # count() is the number of non-missing "bug" values; it is used below
    # as the row boundary between training and test data.
    # NOTE(review): assumes "bug" has no missing values - confirm.
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)

    metric = "popt20"

    final = {}
    final_auc = {}
    e_value = [0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}
    for mn in range(500 + file_inc[res] * 10, 520 + file_inc[res] * 10):
        for e in e_value:
            preprocess = [standard_scaler, minmax_scaler,
                          [normalizer] * 10]  # ,[polynomial]*5
            MLs = [[DeepLearner] * 20]  # [SVM]*100,
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)

            if e not in final_auc:
                final_auc[e] = []
                dic[e] = {}

            # Configuration key -> [scaler, model], and key -> status where
            # 0 means "not evaluated yet".
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            for make_scaler, make_model in product(preprocess_list, MLs_list):
                scaler, tmp1 = make_scaler()
                model, tmp2 = make_model()
                string1 = tmp1 + "|" + tmp2
                func_str_dic[string1] = [scaler, model]
                func_str_counter_dic[string1] = 0

            for counter in range(30):
                if counter not in dic_func:
                    dic_func[counter] = []

                # Only configurations that were never evaluated are eligible.
                keys = [k for k, v in func_str_counter_dic.items() if v == 0]
                key = _randchoice(keys)
                scaler, model = func_str_dic[key]
                df1 = transform(df, scaler)

                train_data = df1.iloc[:train_size, :]
                test_data = df1.iloc[train_size:, :]
                measurement = run_model(train_data,
                                        test_data,
                                        model,
                                        metric,
                                        training=-2)

                # A result counts as new only when it differs from every
                # previously accepted result by more than epsilon.
                if all(abs(t - measurement) > e for t in lis_value):
                    lis_value.append(measurement)
                    func_str_counter_dic[key] += 1
                else:
                    func_str_counter_dic[key] -= 1

                if counter not in dic[e]:
                    dic[e][counter] = []
                # Log the settings for the first epsilon.  The previous
                # hard-coded comparison with 0.05 never matched because
                # e_value only contains 0.2, leaving "settings" empty.
                if e == e_value[0]:
                    dic_func[counter].append(key)
                best_so_far = max(lis_value)
                dic[e][counter].append(best_so_far)
                dic_auc[counter] = best_so_far

            # Best-so-far values ordered by step index.
            curve = [dic_auc[step] for step in sorted(dic_auc)]
            area_under_curve = round(auc(list(range(len(curve))), curve), 3)
            print("AUC: ", area_under_curve)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)