# Shared stdlib / third-party imports. Project-local helpers (readfile,
# extraction, transform, run_model, unpack, _randchoice, the scaler and
# learner factories, plus file_dic, file_inc and data_path) are assumed to
# be importable from this repo's own modules.
import os
import time
import pickle
from collections import OrderedDict
from itertools import product
from operator import itemgetter
from random import seed

import numpy as np
import pandas as pd
from sklearn.metrics import auc


# Variant 1: UCI datasets, d2h (distance-to-heaven; smaller is better).
def _test(res=''):
    raw_data, labels = readfile("../../data/UCI/" + res + ".csv")
    metric = "d2h"
    final = {}
    final_auc = {}
    e_value = [0.05, 0.1, 0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}
    for mn in range(500 + file_inc[res] * 10, 521 + file_inc[res] * 10):
        for e in e_value:
            np.random.seed(mn)
            seed(mn)
            # Candidate pool: an entry's multiplicity sets how many variants
            # of that preprocessor (fresh random hyperparameters) are drawn.
            preprocess = [standard_scaler, minmax_scaler, maxabs_scaler,
                          [robust_scaler] * 20, kernel_centerer,
                          [quantile_transform] * 200, normalizer,
                          [binarize] * 100]  # ,[polynomial]*5
            MLs = [NB, [KNN] * 20, [RF] * 50, [DT] * 30, [LR] * 50]  # [SVM]*100,
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)
            combine = [[r[0], r[1]] for r in product(preprocess_list, MLs_list)]
            if e not in final_auc:
                final_auc[e] = []
            dic[e] = {}
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            for i in combine:
                scaler, tmp1 = i[0]()
                model, tmp2 = i[1]()
                string1 = tmp1 + "|" + tmp2
                func_str_dic[string1] = [scaler, model]
                func_str_counter_dic[string1] = 0  # weight: 0 = still "alive"
            counter = 0
            while counter != 200:
                if counter not in dic_func:
                    dic_func[counter] = []
                # Sample only among configs whose weight is still zero.
                keys = [k for k, v in func_str_counter_dic.items() if v == 0]
                key = _randchoice(keys)
                cut_off = int(len(raw_data) * 0.8)
                scaler, model = func_str_dic[key]
                df1 = transform(raw_data, scaler)
                df1.loc[:, "bug"] = labels
                train_data, test_data = df1.iloc[:cut_off, :], df1.iloc[cut_off:, :]
                measurement = run_model(train_data, test_data, model, metric,
                                        training=-1)
                # e-dominance: reward a result that differs from every
                # previous one by more than e; penalize redundant results.
                if all(abs(t - measurement) > e for t in lis_value):
                    lis_value.append(measurement)
                    func_str_counter_dic[key] += 1
                else:
                    func_str_counter_dic[key] -= 1
                if counter not in dic[e]:
                    dic[e][counter] = []
                    dic_func[counter] = []
                if e == 0.05:
                    dic_func[counter].append(key)
                dic[e][counter].append(min(lis_value))  # best (lowest) d2h so far
                dic_auc[counter] = min(lis_value)
                counter += 1
            dic1 = OrderedDict(sorted(dic_auc.items(), key=itemgetter(0))).values()
            # Convergence score: area under the best-so-far curve.
            area_under_curve = round(auc(list(range(len(dic1))), list(dic1)), 3)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)
    with open('dump/d2h_' + res + '.pickle', 'wb') as handle:
        pickle.dump(final_auc, handle)
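# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the e-dominance loop that all four
# _test() variants share, with a toy objective so it runs stand-alone.
# Everything here (dodge_sketch, the toy latent scores) is illustrative and
# not part of the repo code.
# ---------------------------------------------------------------------------
import random


def dodge_sketch(e=0.2, budget=30, n_configs=20, rng_seed=1):
    random.seed(rng_seed)
    # Hypothetical search space: each config has a latent score in [0, 1].
    latent = {"cfg%d" % i: random.random() for i in range(n_configs)}
    weights = {k: 0 for k in latent}
    seen, best_so_far = [], []
    for _ in range(budget):
        # As above, sampling is restricted to weight-zero configs; note that
        # both rewarded (+1) and penalized (-1) configs leave that pool.
        alive = [k for k, w in weights.items() if w == 0]
        key = random.choice(alive if alive else list(weights))  # guard: pool may empty
        score = latent[key] + random.gauss(0, 0.05)  # noisy toy evaluation
        if all(abs(prev - score) > e for prev in seen):
            seen.append(score)   # novel result: reward this config
            weights[key] += 1
        else:
            weights[key] -= 1    # redundant result: penalize it
        best_so_far.append(min(seen) if seen else score)
    return best_so_far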
# Variant 2: text-mining datasets, Pf_Auc; adds a feature-extraction stage
# (TF / TFIDF / HASHING / LDA_) ahead of scaling and learning.
def _test(res=''):
    raw_data, labels = readfile("../../data/textmining/" + res + ".txt")
    metric = "Pf_Auc"
    final = {}
    final_auc = {}
    e_value = [0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}
    for mn in range(500 + file_inc[res] * 10, 511 + file_inc[res] * 10):
        for e in e_value:
            transformation = [[TF] * 30, [TFIDF] * 30, [HASHING] * 8, [LDA_] * 50]
            preprocess = [no_transformation]
            MLs = [NB, [KNN] * 20, [RF] * 50, [DT] * 30, [LR] * 50]
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)
            trans_list = unpack(transformation)
            combine = [[r[0], r[1], r[2]]
                       for r in product(trans_list, preprocess_list, MLs_list)]
            if e not in final_auc:
                final_auc[e] = []
            dic[e] = {}
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            for i in combine:
                trans, tmp = i[0]()
                scaler, tmp1 = i[1]()
                model, tmp2 = i[2]()
                string1 = tmp + "|" + tmp1 + "|" + tmp2
                func_str_dic[string1] = [trans, scaler, model]
                func_str_counter_dic[string1] = 0
            counter = 0
            while counter != 40:
                print(counter)
                if counter not in dic_func:
                    dic_func[counter] = []
                keys = [k for k, v in func_str_counter_dic.items() if v == 0]
                key = _randchoice(keys)
                cut_off = int(len(raw_data) * 0.8)
                vector, scaler, model = func_str_dic[key]
                df = extraction(raw_data, vector)  # text -> feature matrix
                df1 = transform(df, scaler)
                df1.loc[:, "bug"] = labels
                train_data, test_data = df1.iloc[:cut_off, :], df1.iloc[cut_off:, :]
                measurement = run_model(train_data, test_data, model, metric,
                                        training=-1)
                if all(abs(t - measurement) > e for t in lis_value):
                    lis_value.append(measurement)
                    func_str_counter_dic[key] += 1
                else:
                    func_str_counter_dic[key] -= 1
                if counter not in dic[e]:
                    dic[e][counter] = []
                    dic_func[counter] = []
                dic_func[counter].append(key)
                dic[e][counter].append(min(lis_value))
                dic_auc[counter] = min(lis_value)
                counter += 1
            dic1 = OrderedDict(sorted(dic_auc.items(), key=itemgetter(0))).values()
            area_under_curve = round(auc(list(range(len(dic1))), list(dic1)), 3)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)
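# ---------------------------------------------------------------------------
# The convergence score used above, isolated: sklearn's trapezoid-rule auc()
# over the "best result so far vs. evaluation index" curve. For a minimized
# metric (d2h, Pf), a lower area means the search converged faster. The
# helper name is illustrative, not part of the repo.
# ---------------------------------------------------------------------------
from sklearn.metrics import auc as _trapz_auc


def best_so_far_auc(best_per_step):
    xs = list(range(len(best_per_step)))
    return round(_trapz_auc(xs, list(best_per_step)), 3)


# e.g. best_so_far_auc(dodge_sketch()) scores one run of the sketch above.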
# Variant 3: defect-prediction datasets with release-based splits (train on
# earlier releases, test on the latest), d2h.
def _test(res=''):
    # df = readfile("../data/defect/" + res + ".csv")
    paths = [os.path.join(data_path, file_name) for file_name in file_dic[res]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]],
                         ignore_index=True)
    test_df = pd.read_csv(paths[-1])
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)  # binarize bug counts
    metric = "d2h"
    final = {}
    final_auc = {}
    e_value = [0.025, 0.05, 0.1, 0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}
    for mn in range(500 + file_inc[res] * 10, 521 + file_inc[res] * 10):
        for e in e_value:
            np.random.seed(mn)
            seed(mn)
            preprocess = [standard_scaler, minmax_scaler, maxabs_scaler,
                          [robust_scaler] * 20, kernel_centerer,
                          [quantile_transform] * 200, normalizer,
                          [binarize] * 100]  # ,[polynomial]*5
            MLs = [NB, [KNN] * 20, [RF] * 50, [DT] * 30, [LR] * 50]  # [SVM]*100,
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)
            combine = [[r[0], r[1]] for r in product(preprocess_list, MLs_list)]
            if e not in final_auc:
                final_auc[e] = []
            dic[e] = {}
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            for i in combine:
                scaler, tmp1 = i[0]()
                model, tmp2 = i[1]()
                string1 = tmp1 + "|" + tmp2
                func_str_dic[string1] = [scaler, model]
                func_str_counter_dic[string1] = 0
            counter = 0
            while counter != 1000:
                if counter not in dic_func:
                    dic_func[counter] = []
                try:
                    keys = [k for k, v in func_str_counter_dic.items() if v == 0]
                    key = _randchoice(keys)
                    scaler, model = func_str_dic[key]
                    df1 = transform(df, scaler)
                    train_data, test_data = (df1.iloc[:train_size, :],
                                             df1.iloc[train_size:, :])
                    measurement = run_model(train_data, test_data, model, metric,
                                            training=-2)
                    if all(abs(t - measurement) > e for t in lis_value):
                        lis_value.append(measurement)
                        func_str_counter_dic[key] += 1
                    else:
                        func_str_counter_dic[key] -= 1
                    if counter not in dic[e]:
                        dic[e][counter] = []
                        dic_func[counter] = []
                    if e == 0.025:
                        dic_func[counter].append(key)
                    dic[e][counter].append(min(lis_value))
                    dic_auc[counter] = min(lis_value)
                    # Alternative: stratified shuffle split (disabled).
                    # sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
                    # data, labels = df1[df1.columns[:-2]], df1[df1.columns[-2]]
                    # for train_index, test_index in sss.split(data, labels):
                    #     train_data, test_data = df1.iloc[train_index], df1.iloc[test_index]
                    #     measurement = run_model(train_data, test_data, model, metric)
                    #     if all(abs(t - measurement) > e for t in lis_value):
                    #         lis_value.append(measurement)
                    #         func_str_counter_dic[key] += 1
                    #     else:
                    #         func_str_counter_dic[key] += -1
                    # dic[counter] = max(lis_value)
                    counter += 1
                except Exception:
                    # A sampled config may crash (e.g. a scaler rejecting the
                    # data); skip it and draw again without advancing counter.
                    pass
            dic1 = OrderedDict(sorted(dic_auc.items(), key=itemgetter(0))).values()
            area_under_curve = round(auc(list(range(len(dic1))), list(dic1)), 3)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)
    with open('../../dump/d2h_' + res + '.pickle', 'wb') as handle:
        pickle.dump(final_auc, handle)
# Variant 4: release-based defect data with a deep learner, popt20
# (larger is better, so the loop tracks max rather than min).
def _test(res=''):
    paths = [os.path.join(data_path, file_name) for file_name in file_dic[res]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]],
                         ignore_index=True)
    test_df = pd.read_csv(paths[-1])
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)
    metric = "popt20"
    final = {}
    final_auc = {}
    e_value = [0.2]
    start_time = time.time()
    dic = {}
    dic_func = {}
    for mn in range(500 + file_inc[res] * 10, 520 + file_inc[res] * 10):
        for e in e_value:
            preprocess = [standard_scaler, minmax_scaler, [normalizer] * 10]  # ,[polynomial]*5
            MLs = [[DeepLearner] * 20]  # [SVM]*100,
            preprocess_list = unpack(preprocess)
            MLs_list = unpack(MLs)
            combine = [[r[0], r[1]] for r in product(preprocess_list, MLs_list)]
            if e not in final_auc:
                final_auc[e] = []
            dic[e] = {}
            func_str_dic = {}
            func_str_counter_dic = {}
            lis_value = []
            dic_auc = {}
            for i in combine:
                scaler, tmp1 = i[0]()
                model, tmp2 = i[1]()
                string1 = tmp1 + "|" + tmp2
                func_str_dic[string1] = [scaler, model]
                func_str_counter_dic[string1] = 0
            counter = 0
            while counter != 30:
                if counter not in dic_func:
                    dic_func[counter] = []
                keys = [k for k, v in func_str_counter_dic.items() if v == 0]
                key = _randchoice(keys)
                scaler, model = func_str_dic[key]
                df1 = transform(df, scaler)
                train_data, test_data = (df1.iloc[:train_size, :],
                                         df1.iloc[train_size:, :])
                measurement = run_model(train_data, test_data, model, metric,
                                        training=-2)
                if all(abs(t - measurement) > e for t in lis_value):
                    lis_value.append(measurement)
                    func_str_counter_dic[key] += 1
                else:
                    func_str_counter_dic[key] -= 1
                if counter not in dic[e]:
                    dic[e][counter] = []
                    dic_func[counter] = []
                if e == 0.2:  # record the sampled setting for the single epsilon
                    dic_func[counter].append(key)
                dic[e][counter].append(max(lis_value))  # best (highest) popt20 so far
                dic_auc[counter] = max(lis_value)
                counter += 1
            dic1 = OrderedDict(sorted(dic_auc.items(), key=itemgetter(0))).values()
            area_under_curve = round(auc(list(range(len(dic1))), list(dic1)), 3)
            print("AUC: ", area_under_curve)
            final[e] = dic_auc
            final_auc[e].append(area_under_curve)
    total_run = time.time() - start_time
    final_auc["temp"] = final
    final_auc["time"] = total_run
    final_auc["counter_full"] = dic
    final_auc["settings"] = dic_func
    print(final_auc)
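# Hypothetical driver: in the repo each _test variant lives in its own script
# and is invoked with a dataset key present in its file_dic / file_inc tables;
# the command-line handling below is illustrative only.
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        _test(res=sys.argv[1])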