print("learning...") data_cond_pro = bayes_funcs.bayes_cal(class_data_table, clas_pro, bin_counts, all_borders) # data_cond_pro=bayes_cal(class_data_table,K_size) if print_cond: print("predicting...") pred_y = bayes_funcs.predict(data_cond_pro, test_x, all_borders) """ KxK economic gain matrix 0 1 0 1 -1 1 -2 3 """ print("results smoothing off, with test data") theaccuracy = bayes_funcs.accuracy(pred_y, test_y) best, confusion_matrix_no = bayes_funcs.true_expected_gain2( gain_matrix, test_y, test_y) gain, confusion_matrix = bayes_funcs.true_expected_gain2( gain_matrix, pred_y, test_y) if print_cond: print("\naccuracy : {:4.3f}".format(theaccuracy)) print("gain/best: {:4.3f} gain:{} best:{} \n".format( gain / best, gain, best)) print(confusion_matrix) print("results smoothing on, with test data") js = [1, 2, 3, 4] ks = [] for j in js:
def read_train_dev(clas_pro, gain_matrix, bin_counts=None, limit=-1,
                   sampling=0.1, M=10000, feature_count=None,
                   range_boundaries=None, print_cond=False):
    # freq = [data_x[m].value_counts() for m in range(5)]
    # In our data we have 5 features, each of them between 0 and 1.
    if print_cond:
        print("reading data...")
    alldata_x, alldata_y, K_size = dpf.read_data("pr_data.txt", limit, sampling)

    if range_boundaries is None:
        range_boundaries = alldata_x.apply(
            lambda x: pd.Series([x.min(), x.max()])).T.values.tolist()
    if feature_count is None:
        feature_count = alldata_x.shape[1]

    ## shuffling
    # data_x_y = alldata_x.copy()
    # data_x_y[5] = alldata_y
    # data_x_y = data_x_y.sample(frac=1).reset_index(drop=True)
    # alldata_x = data_x_y[[0, 1, 2, 3, 4]]
    # alldata_y = data_x_y[5].values.tolist()

    if print_cond:
        print("processing data...")
    # 30/30/40 train/dev/test split
    n = len(alldata_x)
    train_end = int((n * 3) / 10)
    dev_end = int((n * 6) / 10)
    train_x = alldata_x[:train_end]
    train_y = alldata_y[:train_end]
    dev_x = alldata_x[train_end:dev_end]
    dev_y = alldata_y[train_end:dev_end]
    test_x = alldata_x[dev_end:]
    test_y = alldata_y[dev_end:]
    # print("total size is {}".format(n))
    # print("size of train is {}".format(len(train_x)))
    # print("size of dev is {}".format(len(dev_x)))
    # print("size of test is {}".format(len(test_x)))

    # TODO make multi-class
    count = np.sum(dev_y)

    # we will calculate entropy with those bins
    # number_of_bins = n / 10
    number_of_bins = 10000
    K_size = {1: count, 0: len(dev_y) - count}
    M = number_of_bins

    if bin_counts is None:
        entropies = dpf.cal_entropies(range_boundaries, number_of_bins, train_x)
        print("entropies", entropies)
        bin_counts = dpf.cal_bin_count(entropies, M)
        print("bin_counts", bin_counts)

    # index_adress((0,2,1,3,4),distent)
    # if print_pro:
    #     print("v_orders calculation")
    all_borders = dpf.cal_vorders(bin_counts, train_x)

    # print("filling memory table")
    """
    class_data_table = {0: {(1, 6, 2, 4, 1): 6.0,
                            (0, 2, 0, 6, 6): 7.0,
                            (0, 5, 0, 3, 4): 4.0,
                            (3, 4, 6, 2, 2): 5.0, ...},
                        1: ...}
    """
    # deprecated, using the new one:
    # class_data_table = fill_memory_table(K_size, train_x, train_y, all_borders)
    if print_cond:
        print("quantising...")
    # class_data_table: quantisation and storing in a dict (v1.2)
    class_data_table = dpf.fill_memory_table(train_x, train_y, all_borders,
                                             K_size.keys())
    # class_data_table = smoot_Dimensional(class_data_table, bin_counts, j, expanding_limit)

    if print_cond:
        print("learning...")
    data_cond_pro = bayes_funcs.bayes_cal(class_data_table, clas_pro,
                                          bin_counts, all_borders)
    # data_cond_pro = bayes_cal(class_data_table, K_size)
    if print_cond:
        print("predicting...")
    pred_y = bayes_funcs.predict(data_cond_pro, dev_x, all_borders)

    """
    KxK economic gain matrix
          0   1
      0   1  -1
      1  -2   3
    """
    theaccuracy = bayes_funcs.accuracy(pred_y, dev_y)
    best = bayes_funcs.true_expected_gain(gain_matrix, dev_y, dev_y)
    gain = bayes_funcs.true_expected_gain(gain_matrix, pred_y, dev_y)
    if print_cond:
        print("\nwith random quantisation points:")
        print("\naccuracy : {:4.3f}".format(theaccuracy))
        print("gain/best: {:4.3f} gain:{} best:{} \n".format(
            gain / best, gain, best))

    return (theaccuracy, all_borders, train_x, train_y, dev_x, dev_y,
            test_x, test_y, best, bin_counts, K_size, class_data_table, gain)
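# Hedged usage sketch for read_train_dev. The prior and gain values below
# are illustrative placeholders (the gain matrix is copied from the KxK
# docstring above), not tuned parameters; the shape of clas_pro is an
# assumption about what bayes_funcs.bayes_cal expects. Requires
# pr_data.txt in the working directory.
def _example_read_train_dev():
    clas_pro = {0: 0.5, 1: 0.5}   # assumed per-class priors keyed by label
    gain_matrix = [[1, -1],
                   [-2, 3]]
    (theaccuracy, all_borders, train_x, train_y, dev_x, dev_y,
     test_x, test_y, best, bin_counts, K_size, class_data_table,
     gain) = read_train_dev(clas_pro, gain_matrix,
                            sampling=0.1, print_cond=True)
    print("dev accuracy: {:4.3f}, gain/best: {:4.3f}".format(
        theaccuracy, gain / best))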
def optimise_fixed_time(train_x, train_y, dev_x, dev_y, all_borders, gain,
                        K_size, clas_pro, bin_counts, gain_matrix, best,
                        alpha=0.01, M=5, limit_opti=-1):
    """
    Optimise quantisation borders with a fixed search window.

    Pick a feature and one of its quantisation borders at random, move the
    border left by alpha*(M+1), then sweep it rightwards in steps of alpha
    over 2*(M+1) positions. At each position recompute the decision rule on
    the train data and the expected gain on the dev data, and keep the
    position with the highest expected gain.
    """
    # if it does not improve after limit_opti tries, stop trying
    if limit_opti == -1:
        limit_opti = 2 * (M + 1)
    # alpha = 0.01
    # M = 5
    # limit_opti = 12

    # populate all boundaries first, to select from randomly until all are
    # done, e.g. [[1, 2, 3], [1, 2, 3, 4, 5], ...]
    boundaries = []
    for border in all_borders:
        boundaries.append([i + 1 for i in range(len(border) - 2)])
    boundary_indexes = list(range(len(all_borders)))

    # provided by the previous function call:
    # best = expected_gain(gain_matrix, dev_y, dev_y)
    while boundary_indexes:
        # print("left {} features to optimise".format(len(boundaries)))
        # choose a feature randomly
        feature_index = random.choice(boundary_indexes)
        # choose a boundary randomly
        bound_index = random.randint(0, len(boundaries[feature_index]) - 1)
        boundary = boundaries[feature_index].pop(bound_index)
        # if the feature has no boundary left, remove it
        if not boundaries[feature_index]:
            boundary_indexes.remove(feature_index)

        # current value of the border
        bkj = all_borders[feature_index][boundary]
        # search window: alpha*(M+1) to each side of the border
        bkj_start = bkj - (alpha * (M + 1))
        bkj_end = bkj + (alpha * (M + 1))
        # clamp the window so it stays between the neighbouring borders
        if (bkj_start < all_borders[feature_index][boundary - 1]
                or bkj_end >= all_borders[feature_index][boundary + 1]):
            if all_borders[feature_index][boundary - 1] > bkj_start:
                bkj_start = all_borders[feature_index][boundary - 1] + alpha
            if all_borders[feature_index][boundary + 1] <= bkj_end:
                bkj_end = all_borders[feature_index][boundary + 1] - alpha
        # print("feature:{}, boundary: {}".format(feature_index, boundary))
        # print("end:{} current:{} start:{}".format(bkj_end, bkj, bkj_start))

        count = 0
        # sweep the window from left to right in steps of alpha
        opti_index = 1
        new_bkj = bkj_start + (alpha * opti_index)
        while new_bkj <= bkj_end:
            # try the candidate border value
            all_borders[feature_index][boundary] = new_bkj
            class_data_table = dpf.fill_memory_table(train_x, train_y,
                                                     all_borders, K_size.keys())
            # print("learning")
            data_cond_pro = bayes_funcs.bayes_cal(class_data_table, clas_pro,
                                                  bin_counts, all_borders)
            # print("predicting")
            pred_y = bayes_funcs.predict(data_cond_pro, dev_x, all_borders)
            new_gain = bayes_funcs.true_expected_gain(gain_matrix, pred_y, dev_y)
            if new_gain <= gain:
                count += 1
                all_borders[feature_index][boundary] = bkj
                # print("lost:{}, bkj:{}".format(gain - new_gain, new_bkj))
                # print("{} did not update".format(count))
            else:
                count = 0
                # print("improved by {}".format(new_gain - gain))
                gain = new_gain
                bkj = new_bkj
            if count >= limit_opti:
                count = 0
                break
            opti_index += 1
            new_bkj = bkj_start + (alpha * opti_index)

    # print("gain/best: {:4.3f} gain:{} best:{} \n".format(gain / best, gain, best))
    theaccuracy = bayes_funcs.accuracy(pred_y, dev_y)
    # print("\naccuracy : {:4.3f}".format(theaccuracy))
    return all_borders, gain, theaccuracy
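# The sweep inside optimise_fixed_time is plain coordinate-wise local
# search. Below is a stripped-down sketch of that inner loop on a single
# border value, with the retrain/predict/score cycle abstracted into a
# hypothetical `evaluate` callback (neither this function nor `evaluate`
# exists in the module; both are illustrative only).
def _scan_border_sketch(bkj, evaluate, alpha=0.01, M=5, limit_opti=12):
    best_val, best_gain = bkj, evaluate(bkj)
    misses = 0
    # visit 2*(M+1) candidate points around bkj, spaced alpha apart
    for i in range(1, 2 * (M + 1) + 1):
        cand = bkj - alpha * (M + 1) + alpha * i
        cand_gain = evaluate(cand)
        if cand_gain > best_gain:
            best_val, best_gain, misses = cand, cand_gain, 0
        else:
            misses += 1
            if misses >= limit_opti:   # early stop, as in the original
                break
    return best_val, best_gain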
def optimise_hill_climb_single(train_x, train_y, dev_x, dev_y, all_borders,
                               gain, K_size, clas_pro, bin_counts, gain_matrix,
                               best, alpha=0.01, percent=0.5, limit_opti=-1,
                               qindex=0):
    """
    Optimise the quantisation borders of a single feature by hill climbing.

    For each border of feature qindex (in random order): pick a random point
    inside the window spanned by `percent` of the distance to the
    neighbouring borders, probe one step of alpha to its left and right,
    recompute the decision rule on the train data and the expected gain on
    the dev data, then keep climbing in the better direction until the gain
    stops improving or the window edge is reached.
    """
    if limit_opti == -1:
        limit_opti = inf

    # populate all boundaries first, to select from randomly until all are
    # done; skips the first and last border since they are infinite,
    # e.g. [[1, 2, 3], [1, 2, 3, 4, 5], ...]
    boundaries = []
    for border in all_borders:
        boundaries.append([i + 1 for i in range(len(border) - 2)])
    # boundary_indexes = list(range(len(all_borders)))
    # optimise only the one given feature
    feature_indexes = [qindex]

    # provided by the previous function call:
    # best = expected_gain(gain_matrix, dev_y, dev_y)
    while feature_indexes:
        # print("gain {}".format(gain))
        # left_count = sum([len(alen) for alen in boundaries])
        # if left_count % 5 == 0:
        #     print("left {} boundaries to optimise".format(left_count))
        # if left_count % 7 == 0:
        #     print(all_borders)
        # choose a feature randomly
        feature_index = random.choice(feature_indexes)
        # choose a boundary randomly
        boundary = random.choice(boundaries[feature_index])
        boundaries[feature_index].remove(boundary)
        # if the feature has no boundary left, remove it
        if not boundaries[feature_index]:
            feature_indexes.remove(feature_index)

        # current value of the border
        bkj = all_borders[feature_index][boundary]
        # upper and lower bounds; all_borders includes +-inf as well
        upper_bound = all_borders[feature_index][boundary + 1]
        lower_bound = all_borders[feature_index][boundary - 1]
        if (upper_bound != inf) and (lower_bound != -inf):
            upper_distance = (upper_bound - bkj) * percent
            lower_distance = (bkj - lower_bound) * percent
        elif (upper_bound == inf) and (lower_bound == -inf):
            upper_distance = (1 - bkj) * percent
            lower_distance = (bkj - 0) * percent
        elif upper_bound == inf:
            upper_distance = (1 - bkj) * percent
            lower_distance = (bkj - lower_bound) * percent
        elif lower_bound == -inf:
            upper_distance = (upper_bound - bkj) * percent
            lower_distance = (bkj - 0) * percent
        bkj_end = bkj + upper_distance
        bkj_start = bkj - lower_distance

        # probe one alpha-step to each side of a random point in the window
        random_point = random.uniform(bkj_start + alpha, bkj_end - alpha)
        left_side_point = random_point - alpha
        right_side_point = random_point + alpha

        all_borders[feature_index][boundary] = left_side_point
        left_side_point_gain, pred_y = train_test_gain_now(
            train_x, train_y, all_borders, K_size, clas_pro, bin_counts,
            dev_x, dev_y, gain_matrix)
        all_borders[feature_index][boundary] = right_side_point
        right_side_point_gain, pred_y = train_test_gain_now(
            train_x, train_y, all_borders, K_size, clas_pro, bin_counts,
            dev_x, dev_y, gain_matrix)

        if (left_side_point_gain > gain) or (right_side_point_gain > gain):
            if left_side_point_gain >= right_side_point_gain:
                if bkj_start < left_side_point:
                    all_borders[feature_index][boundary] = left_side_point
                    gain = left_side_point_gain
                    bkj = left_side_point
                    flag = True
                    while flag:
                        # climb leftwards one alpha-step at a time
                        left_side_point = bkj - alpha
                        all_borders[feature_index][boundary] = left_side_point
                        if bkj_start <= left_side_point:
                            left_side_point_gain, pred_y = train_test_gain_now(
                                train_x, train_y, all_borders, K_size,
                                clas_pro, bin_counts, dev_x, dev_y, gain_matrix)
                            # strict >: on equal gain we keep moving
                            if gain > left_side_point_gain:
                                flag = False
                                all_borders[feature_index][boundary] = bkj
                            else:
                                bkj = left_side_point
                                gain = left_side_point_gain
                        else:
                            flag = False
                            all_borders[feature_index][boundary] = bkj
                else:
                    all_borders[feature_index][boundary] = bkj
            else:
                if bkj_end > right_side_point:
                    all_borders[feature_index][boundary] = right_side_point
                    gain = right_side_point_gain
                    bkj = right_side_point
                    flag = True
                    while flag:
                        # climb rightwards one alpha-step at a time
                        right_side_point = bkj + alpha
                        all_borders[feature_index][boundary] = right_side_point
                        if right_side_point <= bkj_end:
                            right_side_point_gain, pred_y = train_test_gain_now(
                                train_x, train_y, all_borders, K_size,
                                clas_pro, bin_counts, dev_x, dev_y, gain_matrix)
                            # strict >: on equal gain we keep moving
                            if gain > right_side_point_gain:
                                flag = False
                                all_borders[feature_index][boundary] = bkj
                            else:
                                bkj = right_side_point
                                gain = right_side_point_gain
                        else:
                            flag = False
                            all_borders[feature_index][boundary] = bkj
                else:
                    all_borders[feature_index][boundary] = bkj
        else:
            # neither probe improved; restore the original border
            all_borders[feature_index][boundary] = bkj

    # print("gain/best: {:4.3f} gain:{} best:{} \n".format(gain / best, gain, best))
    theaccuracy = bayes_funcs.accuracy(pred_y, dev_y)
    # print("\naccuracy : {:4.3f}".format(theaccuracy))
    return all_borders, gain, theaccuracy
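# optimise_hill_climb_single works on one feature at a time (qindex), so a
# typical driver loops over every feature, threading the running gain
# through each call. This loop is a hedged sketch, not part of the original
# pipeline; it reuses the outputs of read_train_dev above.
def _example_hill_climb_all_features(train_x, train_y, dev_x, dev_y,
                                     all_borders, gain, K_size, clas_pro,
                                     bin_counts, gain_matrix, best):
    for qindex in range(len(all_borders)):
        all_borders, gain, theaccuracy = optimise_hill_climb_single(
            train_x, train_y, dev_x, dev_y, all_borders, gain, K_size,
            clas_pro, bin_counts, gain_matrix, best, qindex=qindex)
        print("feature {}: gain/best = {:4.3f}".format(qindex, gain / best))
    return all_borders, gain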
def optimise_by_alpha_single(train_x, train_y, dev_x, dev_y, all_borders, gain,
                             K_size, clas_pro, bin_counts, gain_matrix, best,
                             alpha=0.01, percent=0.5, limit_opti=-1, qindex=0):
    """
    Optimise the quantisation borders of a single feature by a full sweep.

    For each border of feature qindex (in random order): sweep the window
    spanned by `percent` of the distance to the neighbouring borders in
    steps of alpha, recompute the decision rule on the train data at each
    step, and keep the position with the highest expected gain on the
    dev data.
    """
    # if it does not improve after limit_opti tries, stop trying
    if limit_opti == -1:
        limit_opti = inf
    # alpha = 0.01
    # M = 5
    # limit_opti = 12

    # populate all boundaries first, to select from randomly until all are
    # done; skips the first and last border since they are infinite,
    # e.g. [[1, 2, 3], [1, 2, 3, 4, 5], ...]
    boundaries = []
    for border in all_borders:
        boundaries.append([i + 1 for i in range(len(border) - 2)])
    # CH: we do not need all of the features, only the given one
    # feature_indexes = list(range(len(all_borders)))
    feature_indexes = [qindex]

    # provided by the previous function call:
    # best = expected_gain(gain_matrix, dev_y, dev_y)
    while feature_indexes:
        feature_index = random.choice(feature_indexes)
        # choose a boundary randomly
        boundary = random.choice(boundaries[feature_index])
        boundaries[feature_index].remove(boundary)
        # if the feature has no boundary left, remove it
        if not boundaries[feature_index]:
            feature_indexes.remove(feature_index)

        # current value of the border
        bkj = all_borders[feature_index][boundary]
        # upper and lower bounds; all_borders includes +-inf as well
        upper_bound = all_borders[feature_index][boundary + 1]
        lower_bound = all_borders[feature_index][boundary - 1]
        if (upper_bound != inf) and (lower_bound != -inf):
            upper_distance = (upper_bound - bkj) * percent
            lower_distance = (bkj - lower_bound) * percent
        elif (upper_bound == inf) and (lower_bound == -inf):
            upper_distance = (1 - bkj) * percent
            lower_distance = (bkj - 0) * percent
        elif upper_bound == inf:
            upper_distance = (1 - bkj) * percent
            lower_distance = (bkj - lower_bound) * percent
        elif lower_bound == -inf:
            upper_distance = (upper_bound - bkj) * percent
            lower_distance = (bkj - 0) * percent
        bkj_end = bkj + upper_distance
        bkj_start = bkj - lower_distance

        # sweep the window from left to right in steps of alpha
        opti_index = 1
        new_bkj = bkj_start + (alpha * opti_index)
        while new_bkj < bkj_end:
            # try the candidate border value; train_test_gain_now refills
            # the memory table, so no need to do it by hand every time
            all_borders[feature_index][boundary] = new_bkj
            new_gain, pred_y = train_test_gain_now(
                train_x, train_y, all_borders, K_size, clas_pro, bin_counts,
                dev_x, dev_y, gain_matrix)
            # strict <: on equal gain the border keeps moving
            if new_gain < gain:
                all_borders[feature_index][boundary] = bkj
            else:
                gain = new_gain
                bkj = new_bkj
            # increment before recomputing, so each candidate is tried once
            opti_index += 1
            new_bkj = bkj_start + (alpha * opti_index)

    theaccuracy = bayes_funcs.accuracy(pred_y, dev_y)
    return all_borders, gain, theaccuracy
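# The optimisers above call a train_test_gain_now helper defined elsewhere
# in this module. Judging from the inline cycle in optimise_fixed_time, it
# presumably wraps the requantise / learn / predict / score steps roughly
# as below. This is a hedged sketch of the presumed body, renamed so it
# cannot shadow the real helper.
def _train_test_gain_now_sketch(train_x, train_y, all_borders, K_size,
                                clas_pro, bin_counts, dev_x, dev_y,
                                gain_matrix):
    # requantise the training data against the current borders
    class_data_table = dpf.fill_memory_table(train_x, train_y, all_borders,
                                             K_size.keys())
    # rebuild the decision rule and score it on the dev split
    data_cond_pro = bayes_funcs.bayes_cal(class_data_table, clas_pro,
                                          bin_counts, all_borders)
    pred_y = bayes_funcs.predict(data_cond_pro, dev_x, all_borders)
    new_gain = bayes_funcs.true_expected_gain(gain_matrix, pred_y, dev_y)
    return new_gain, pred_y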
def optimise_by_alpha(train_x, train_y, dev_x, dev_y, all_borders, gain,
                      K_size, clas_pro, bin_counts, gain_matrix, best,
                      alpha=0.01, percent=0.5, limit_opti=-1):
    """
    Optimise all quantisation borders by a full sweep.

    Pick a feature and one of its quantisation borders at random, sweep the
    window spanned by `percent` of the distance to the neighbouring borders
    in steps of alpha, recompute the decision rule on the train data at each
    step, and keep the position with the highest expected gain on the dev
    data. Repeat until every border of every feature has been visited once.
    """
    # if it does not improve after limit_opti tries, stop trying
    if limit_opti == -1:
        limit_opti = inf
    # limit_opti = 12

    # populate all boundaries first, to select from randomly until all are
    # done; skips the first and last border since they are infinite,
    # e.g. [[1, 2, 3], [1, 2, 3, 4, 5], ...]
    boundaries = []
    for border in all_borders:
        boundaries.append([i + 1 for i in range(len(border) - 2)])
    # all indexes of the features, e.g. [0, 1, 2, 3, 4]
    feature_indexes = list(range(len(all_borders)))

    # while there are features left to optimise
    while feature_indexes:
        # choose a feature randomly
        feature_index = random.choice(feature_indexes)
        # choose a boundary randomly
        boundary = random.choice(boundaries[feature_index])
        boundaries[feature_index].remove(boundary)
        # bound_index = random.randint(0, len(boundaries[feature_index]) - 1)
        # boundary = boundaries[feature_index].pop(bound_index)
        # if the feature has no boundary left, remove it
        if not boundaries[feature_index]:
            feature_indexes.remove(feature_index)

        # current value of the border
        bkj = all_borders[feature_index][boundary]
        # upper and lower bounds; all_borders includes +-inf as well
        upper_bound = all_borders[feature_index][boundary + 1]
        lower_bound = all_borders[feature_index][boundary - 1]
        if (upper_bound != inf) and (lower_bound != -inf):
            upper_distance = (upper_bound - bkj) * percent
            lower_distance = (bkj - lower_bound) * percent
        elif (upper_bound == inf) and (lower_bound == -inf):
            upper_distance = (1 - bkj) * percent
            lower_distance = (bkj - 0) * percent
        elif upper_bound == inf:
            upper_distance = (1 - bkj) * percent
            lower_distance = (bkj - lower_bound) * percent
        elif lower_bound == -inf:
            upper_distance = (upper_bound - bkj) * percent
            lower_distance = (bkj - 0) * percent
        bkj_end = bkj + upper_distance
        bkj_start = bkj - lower_distance
        # print("feature:{}, boundary: {}".format(feature_index, boundary))
        # print("end:{} current:{} start:{}".format(bkj_end, bkj, bkj_start))

        # sweep the window from left to right in steps of alpha
        opti_index = 1
        new_bkj = bkj_start + (alpha * opti_index)
        while new_bkj < bkj_end:
            # try the candidate border value
            all_borders[feature_index][boundary] = new_bkj
            new_gain, pred_y = train_test_gain_now(
                train_x, train_y, all_borders, K_size, clas_pro, bin_counts,
                dev_x, dev_y, gain_matrix)
            # strict <: on equal gain the border keeps moving
            if new_gain < gain:
                all_borders[feature_index][boundary] = bkj
            else:
                # print("improved by {}".format(new_gain - gain))
                gain = new_gain
                bkj = new_bkj
            opti_index += 1
            new_bkj = bkj_start + (alpha * opti_index)

    # print("gain/best: {:4.3f} gain:{} best:{} \n".format(gain / best, gain, best))
    theaccuracy = bayes_funcs.accuracy(pred_y, dev_y)
    # print("\naccuracy : {:4.3f}".format(theaccuracy))
    return all_borders, gain, theaccuracy
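# End-to-end hedged sketch: random quantisation via read_train_dev, then
# one optimise_by_alpha pass over all borders, then the dev-set score. The
# prior and gain values are the same illustrative placeholders used in
# _example_read_train_dev above.
def _example_optimise_pipeline():
    clas_pro = {0: 0.5, 1: 0.5}
    gain_matrix = [[1, -1],
                   [-2, 3]]
    (theaccuracy, all_borders, train_x, train_y, dev_x, dev_y,
     test_x, test_y, best, bin_counts, K_size, class_data_table,
     gain) = read_train_dev(clas_pro, gain_matrix)
    all_borders, gain, theaccuracy = optimise_by_alpha(
        train_x, train_y, dev_x, dev_y, all_borders, gain, K_size,
        clas_pro, bin_counts, gain_matrix, best, alpha=0.01, percent=0.5)
    print("optimised: accuracy {:4.3f}, gain/best {:4.3f}".format(
        theaccuracy, gain / best))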