def level_ranks(self, level, name):
    """Return the rank position of every sample under one classifier.

    Samples are sorted by predicted score in ascending order, so a
    smaller rank number means a smaller predicted score.

    Parameters
    ----------
    level : str
        Stacking level whose prediction file to read (e.g. 'level_one').
    name : str
        Classifier name identifying the prediction file.

    Returns
    -------
    dict
        Mapping ``uid -> [rank]``.  Each uid appears once per prediction
        file, so the list holds a single position; callers index ``[0]``.
    """
    # NOTE(review): the previous version also loaded the whole
    # local-verification data set (local_verify) here and converted its
    # uid arrays, but never used any of it; that dead I/O was removed.
    column_dict = self.load_clf_file(level, name)
    ranks = {}
    # Sort uids by score (smallest first); the enumeration index is the rank.
    for i, (uid, _score) in enumerate(sorted(column_dict.items(), key=lambda d: d[1])):
        ranks.setdefault(uid, []).append(i)
    return ranks
def level_data(self):
    """Assemble next-level features from the previous level's test outputs.

    Each classifier's saved prediction for every test uid becomes one
    feature column; rows follow the order of the loaded uid array.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Feature matrix of shape (n_uids, n_classifiers) and the uid array.
    """
    level = self.level
    clf_name = self.__clf_name
    # Empty ftype selects the log_move-transformed data set.
    config_instance = Config('')
    load_data_instance = load_origin_data.Load_origin_data(config_instance)
    X, uids = load_data_instance.load_final_test()
    print('X shape: ', X.shape, 'uids length', uids.shape)
    # uid -> list of per-classifier scores, appended in clf_name order.
    feature_rows = {}
    for name in clf_name:
        column_dict = self.load_clf_file(level, name)
        for uid in uids:
            feature_rows.setdefault(uid, []).append(column_dict[uid])
    # Re-materialise the matrix in the original uid order.
    X = [feature_rows[uid] for uid in uids]
    return np.array(X), np.array(uids)
def level_data(self):
    """Prepare the input for plain stacking from level-one outputs.

    Every observation is represented by the vector of its saved scores
    across all classifiers on the previous level, e.g.::

        uid   ranking_lr  ranking_xgb1  ...  ranking_rf2
        5662  6372        7352          ...  72
        5663  782         672           ...  673

    Returns
    -------
    tuple of np.ndarray
        ``(X_0, X_1, uid_0, uid_1)``: feature rows and uids split by
        label (negatives where ``y == 0``, positives where ``y == 1``).
    """
    level = self.level
    clf_name = self.__clf_name
    config_instance = Config('')  # choose log_move transferred data
    load_data_instance = load_origin_data.Load_origin_data(config_instance)
    # CONSISTENCY FIX: the last unpack target was previously the typo
    # 'uid_1q'; renamed to 'uid_11' to match level_data_part (unused here).
    X, y, uids, X_0, X_1, uid_00, uid_11 = load_data_instance.load_final()

    # Uncomment the three lines below when local verification is needed;
    # keep them commented for the final prediction/testing run.  They
    # load a local train/validation split (20% of training data held out).
    # X_00,test_X_00,X_11,test_X_11,uid_00,test_uid_00,uid_11,test_uid_11=load_data_instance.train_test_xy()
    # y=np.hstack((np.ones(len(X_00)),np.zeros(len(X_11))))
    # uids=np.hstack((uid_00,uid_11))

    column_important = []  # average AUC of each classifier (diagnostic only)
    d = {}  # uid -> list of per-classifier scores, in clf_name order
    for name in clf_name:
        # uid -> score of each observation under this classifier
        column_dict = self.load_clf_file(level, name)
        # average AUC score of this classifier on the last level
        column_score = self.load_clf_score(level, name)
        column_important.append(column_score)
        for uid in uids:
            d.setdefault(uid, []).append(column_dict[uid])

    # Split the assembled feature rows back into negative/positive classes.
    X_0, X_1, uid_0, uid_1 = [], [], [], []
    for i in range(len(y)):
        if y[i] == 1:
            X_1.append(d[uids[i]])
            uid_1.append(uids[i])
        else:
            X_0.append(d[uids[i]])
            uid_0.append(uids[i])
    print("shape of X_0 is ", (np.array(X_0).shape), "shape of X_1 is ", (np.array(X_1).shape),
          "shape of uid_0 is ", (np.array(uid_0)).shape, "shape of uid_1 is ", (np.array(uid_1)).shape)
    # BUGFIX: the value tuple previously sat on the line after a bare
    # 'return', so the function returned None; joined into one statement.
    return np.array(X_0), np.array(X_1), np.array(uid_0), np.array(uid_1)
def level_data(self):
    """Local-verification experiment: repair one classifier's scores by rank blending.

    For each classifier in ``self.__clf_name``, compares every sample's
    score rank under that classifier with its rank under the benchmark
    classifiers listed in ``clf``, then applies interval-based "fixing"
    strategies that zero out suspicious scores.  Prints per-strategy
    hit/miss counters and the resulting hold-out AUC.  Diagnostic only;
    returns None.
    """
    level = self.level
    clf_name = self.__clf_name
    # Load data and split into training and validation.
    config_instance = Config('')  # choose log_move transferred data
    load_data_instance = load_origin_data.Load_origin_data(config_instance)
    X_0, test_X_0, X_1, test_X_1, uid_0, test_uid_0, uid_1, test_uid_1 = load_data_instance.local_verify()
    # Convert the hold-out uid arrays to plain int lists for membership tests.
    test_uid_0 = test_uid_0.astype('int').tolist()
    test_uid_1 = test_uid_1.astype('int').tolist()
    # Loop through the classifiers under evaluation.
    for name in clf_name:
        prob = []
        real = []
        prob_1 = []
        prob_0 = []
        column_dict = self.load_clf_file(level, name)  # uid -> score for this classifier
        column_dict2 = sorted(column_dict.items(), key=lambda d: d[1])  # sort scores, small to large
        # Benchmark classifiers whose rankings are compared against; all
        # but one are currently disabled.
        clf = [
            '_lr_sag',
            #'_lr_newton',
            #'_lr_lbfgs',
            #'_lr_liblinear',
            #'log_move_rf100',
            # 'log_move_rf200',
            # 'log_move_rf500',
            # 'log_move_rf1000',
            # 'log_move_gbdt20',
            # 'log_move_gbdt50',
            #'log_move_gbdt100',
            # 'log_move_ada20',
            # 'log_move_ada50',
            #'log_move_ada100',
            #'_xgb2000',
            #'_xgb2500',
            #'_xgb2000_2',
            #'_xgb2500_2'
        ]
        # level_ranks returns the ranking of samples: the smaller the
        # score, the smaller the rank, since it sorts ascending.
        ranks = []  # rankings of the benchmark classifiers
        for f_name in clf:
            rank = self.level_ranks('level_one', f_name)  # level_two
            ranks.append(rank)
        column_ranks = self.level_ranks(level, name)  # ranking under this classifier
        i = 0
        aa = 0
        correct_count = 0
        strategy_2_region_1_correct_count = 0
        strategy_2_region_1_wrong_count = 0
        strategy_2_region_2_correct_count = 0
        strategy_2_region_3_correct_count = 0
        strategy_2_region_2_wrong_count = 0
        strategy_2_region_3_wrong_count = 0
        # Benchmark AUC values recorded from earlier runs.
        AUC_BBM_Level_One = 0.7941285732932
        AUC_BBM_Level_One_hold_out = 0.780055100126
        AUC_BBM_Level_two = 0.781605557816  # gbdt20
        wrong_count = 0
        r_lr = 0
        one_diff = []
        zero_diff = []
        one_index = []
        zero_index = []
        # Choose interval of samples to blend (disabled histogram scaffolding):
        # xgb_ranks_true=[]
        # xgb_ranks_false=[]
        # lr_ranks_true=[]
        # lr_ranks_false=[]
        # for k in range(21):
        #     xgb_ranks_true.append(0)
        #     xgb_ranks_false.append(0)
        #     lr_ranks_true.append(0)
        #     lr_ranks_false.append(0)
        # print(xgb_ranks_true)
        for uid, score in column_dict2:
            # if i<2000:
            #     i+=1
            #     continue
            # diff accumulates how much higher this sample ranks in this
            # classifier than under each benchmark classifier.
            diff = 0
            for rank in ranks:
                # column_ranks holds this classifier's rank; rank holds a benchmark's rank
                diff += column_ranks[uid][0] - rank[uid][0]
            ##########################
            #  strategy 2: interval  #
            ##########################
            # auc: 0.754049839922 > auc: 0.753618206126 the benchmark auc, correct 25 good users
            # The first interval.
            if i >= 9000/4 and i <= 14000/4:
                if diff > 9000/4:
                    #column_dict[uid]=0
                    r_lr += 1
                    if uid in test_uid_0:
                        strategy_2_region_1_correct_count += 1
                    if uid in test_uid_1:
                        strategy_2_region_1_wrong_count += 1
            # The second interval.
            if i >= 14000/4 and i <= 16000/4:  # 25000/4 (more radical) will trigger
                if diff > 12000/4:
                    #column_dict[uid]=0
                    r_lr += 1
                    if uid in test_uid_0:
                        strategy_2_region_2_correct_count += 1
                    if uid in test_uid_1:
                        strategy_2_region_2_wrong_count += 1
            # The third interval.
            if i >= 20000/4 and i <= 23000/4:  # 25000/4 (more radical) will trigger
                if diff > 15000/4:
                    #column_dict[uid]=0
                    r_lr += 1
                    if uid in test_uid_0:
                        strategy_2_region_3_correct_count += 1
                    if uid in test_uid_1:
                        strategy_2_region_3_wrong_count += 1
            #################################
            #  strategy 3: subselect train  #
            #################################
            # The first interval.
            if diff > 2000/4 + i*0.5:  # or
                # NOTE(review): 'rank' here is the loop variable left over
                # from the 'for rank in ranks' loop above, i.e. the LAST
                # benchmark classifier — confirm this is intended.
                if rank[uid][0] < 160:  # 50,100,150,200; 170 optimal
                    column_dict[uid] = 0
                    r_lr += 1
                    if uid in test_uid_0:
                        correct_count += 1
                    if uid in test_uid_1:
                        wrong_count += 1
            # Record the diff trace for plotting, split by true label.
            if uid in test_uid_0:
                zero_diff.append(diff)
                zero_index.append(i)
                aa += 1
                pass
            if uid in test_uid_1:
                one_diff.append(diff)
                one_index.append(i)
                pass
            i += 1
        print('hold-out', 500)
        print("test uid size: ", (len(test_uid_0) + len(test_uid_1)))
        print(aa)
        print("numbers of estimation fixed: ", r_lr)
        print("correct fixing: ", correct_count)
        print("wrong fixing: ", wrong_count)
        print("strategy_2_region_1_correct_count: ", strategy_2_region_1_correct_count)
        print("strategy_2_region_1_wrong_count: ", strategy_2_region_1_wrong_count)
        print("strategy_2_region_2_correct_count: ", strategy_2_region_2_correct_count)
        print("strategy_2_region_2_wrong_count: ", strategy_2_region_2_wrong_count)
        print("strategy_2_region_3_correct_count: ", strategy_2_region_3_correct_count)
        print("strategy_2_region_3_wrong_count: ", strategy_2_region_3_wrong_count)
        # Calculate AUC after blending.
        for uid, score in column_dict.items():
            prob.append(score)
            if uid in test_uid_0:
                real.append(0)
                prob_0.append(score)
            elif uid in test_uid_1:
                real.append(1)
                prob_1.append(score)
            else:
                print("error")
        auc_score = metrics.roc_auc_score(real, prob)  # benchmark auc: 0.753618206126, auc: 0.760720180713
        print("auc :", (auc_score))
        print("auc increase:", (auc_score - AUC_BBM_Level_One_hold_out))  # drops to auc: 0.753348228282 when one bad is estimated as good
        print('0:', max(prob_0), min(prob_0))
        print("1:", max(prob_1), min(prob_1))
        # Plot the ranking difference among classifiers (disabled).
        idex = 0
        #self.print_diff(zero_diff[idex:],zero_index[idex:],one_diff[idex:],one_index[idex:])
    return
def level_data_part(self):
    """Select high-volatility samples for the next-level LR training.

    Picks samples that are close to the default boundary and show a
    large ranking difference between the predicted scores of the BBM
    (usually XGB) and the DBM (usually a linear model), then returns
    only those samples, split by label, as the next level's training set.
    """
    level = self.level
    clf_name = self.__clf_name
    ###############################
    #      data preparation       #
    ###############################
    config_instance = Config('')
    load_data_instance = load_origin_data.Load_origin_data(config_instance)
    X, y, uids, X_0, X_1, uid_00, uid_11 = load_data_instance.load_final()
    # Uncomment the three lines below when local verification is needed;
    # keep them commented for the final prediction/testing run.  They
    # load a local train/validation split (20% of training data held out).
    # X_00,test_X_00,X_11,test_X_11,uid_00,test_uid_00,uid_11,test_uid_11=load_data_instance.train_test_xy(1)
    # y=np.hstack((np.zeros(len(X_00)),np.ones(len(X_11))))
    # uids=np.hstack((uid_00,uid_11))
    ###############################
    #      stacking ensemble      #
    ###############################
    # Pick up samples with high volatility in predicted-score ranking.
    column_important = []
    d = {}
    diff_uid = set([])  # uids of samples located above the selection line
    # Begin looping through the classifiers list.
    for name in clf_name:
        # Log-transformed prediction of each observation by this classifier.
        column_dict = self.load_clf_file(level, name)
        # Average AUC score of this classifier.
        column_score = self.load_clf_score(level, name)
        column_important.append(column_score)
        # NOTE(review): this level_ranks takes the score dict directly and
        # is indexed without [0] below — a different overload from the
        # (level, name) variant elsewhere in this file; confirm.
        column_rank = self.level_ranks(column_dict)
        #lr_dict2=self.load_clf_file('level_two','_lr_sag')#DBM_Sub, predictions of selected samples re-fitted by LR in Level_Two
        #lr_rank2=self.level_ranks(lr_dict2)
        _lr_liblinear_dict = self.load_clf_file(level, '_lr_sag')  # _xgb1000, lr_sag, _lr_sag_1500
        _lr_liblinear_rank = self.level_ranks(_lr_liblinear_dict)
        #print('lr_rank2',len(lr_rank2))
        print("classifier ", name, " in level_one model fitting achieved average AUC: ", column_score)
        column_dict2 = sorted(column_dict.items(), key=lambda d: d[1])  # observation ranking again
        max_column = max([v for k, v in column_dict.items()])  # highest score in this classifier's prediction
        min_column = min([v for k, v in column_dict.items()])  # smallest score in this classifier's prediction
        #max_lr=max([v for k,v in lr_dict2.items()])#highest score in DBM's prediction
        #min_lr=min([v for k,v in lr_dict2.items()])#smallest score in DBM's prediction
        print('highest score in BBM is: ' + str(max_column), ' ', 'lowest score in BBM: ' + str(min_column))
        #print( 'highest score in DBM_Sub : '+str(max_lr),' ','lowest score in DBM_Sub: '+str(min_lr))
        i = 0
        r_lr = 0
        correct_count = 0
        wrong_count = 0
        prob = []
        real = []
        prob_1 = []
        prob_0 = []
        one_diff = []
        zero_diff = []
        one_index = []
        zero_index = []
        yy = []
        scores = []
        # Start of the loop that generates diff_uid.
        for uid, score in column_dict2:
            #score=(score-min_column)/(max_column-min_column)#standardization; score is then the probability to default
            temp = d.get(uid, [])
            temp.append(column_dict[uid])
            d[uid] = temp
            ############################
            #  choose benchmarks here  #
            ############################
            # Ranking difference between this classifier (BBM) and the benchmark model.
            diff = column_rank[uid] - _lr_liblinear_rank[uid]
            # Record the diff trace split by true label (yy mirrors the label).
            if uid in uid_00:
                zero_diff.append(diff)
                zero_index.append(i)
                yy.append(0)
            else:
                one_diff.append(diff)
                one_index.append(i)
                yy.append(1)
            ################
            #  strategy 2  #
            ################
            # Choose samples above the line with a = 0.4 and b = 2000.
            #if diff>3000+i*0.42:#or diff>2500+i*0.2, choose a and b here
            #    diff_uid.add(uid)
            #    if __lr_liblinear_dict[uid]<500:#>200
            #        #score=-100#let the score be very small,
            #        score=0.7+0.3*((score-min_lr)/(max_lr-min_lr))#
            #        #score=-100
            ################
            #  strategy 3  #
            ################
            # Choose samples above the line with a = 0.5 and b = 2000.
            if diff > 2000 + i*0.5:
                diff_uid.add(uid)
                # if lr_rank2[uid][0]<200:
                #     column_dict[uid]=0 #score=10000
                #     r_lr+=1
                #     if uid in uid_00:
                #         correct_count+=1
                #     if uid in uid_11:
                #         wrong_count+=1
            scores.append(score)
            i += 1
        # End of the diff_uid loop.
        idex = 0
        # Calculate AUC after blending.
        for uid, score in column_dict.items():
            prob.append(score)
            if uid in uid_00:
                real.append(0)
                prob_0.append(score)
            elif uid in uid_11:
                real.append(1)
                prob_1.append(score)
            else:
                print("error")
        auc_score = metrics.roc_auc_score(real, prob)  # benchmark BBM in level_one 0.7505509665787999
        print("num of samples are above the line: ", len(diff_uid))
        print("numbers of estimation fixed: ", r_lr)
        print("correct fixing: ", correct_count)
        print("wrong fixing: ", wrong_count)
        print('auc increase:', auc_score - column_score)  # a positive value justifies this parameter value
        self.print_diff(zero_diff[idex:], zero_index[idex:], one_diff[idex:], one_index[idex:])
        break
    # End of looping through the classifiers list (only the first is used).
    X_0 = []
    X_1 = []
    uid_0 = []
    uid_1 = []
    for i in list(range(len(y))):
        if uids[i] in diff_uid:
            if y[i] == 0:
                #print i
                X_0.append(X[i])  # select samples above the line
                uid_0.append(uids[i])
            else:
                X_1.append(X[i])  # select samples above the line
                uid_1.append(uids[i])
    return np.array(X_0), np.array(X_1), np.array(uid_0), np.array(uid_1)
def level_one_wrapper():
    """Train the full battery of level-one classifiers.

    Appends a variety of individual classifiers into Level_train_thread
    to obtain multiple results based on different classifiers and
    parameters.  Classifiers include: linear classifiers, random forest,
    GBM, AdaBoost, bagging of classifiers, and XGB.
    """
    ftype = ''  # the chosen data preprocessing type
    level = 'level_one'  # the level of training
    config_instance = Config('')  # choose log_move transferred data
    load_data_instance = load_origin_data.Load_origin_data(config_instance)
    X, y, uid, X_0, X_1, uid_0, uid_1 = load_data_instance.load_final()
    # The list of training threads.
    threads = []
    # For all classifiers except xgb, use Level_train_thread (calls Mboost.level_train).
    threads.append(Level_train_thread(config_instance,LogisticRegression(solver='sag'),level,'_lr_sag',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,solver='sag'),level,'_lr_sag_1000',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,solver='sag'),level,'_lr_sag_1500',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,LogisticRegression(solver='newton-cg'),level,ftype+'_lr_newton',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,LogisticRegression(solver='lbfgs'),level,ftype+'_lr_lbfgs',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,LogisticRegression(solver='liblinear'),level,ftype+'_lr_liblinear',X_0,X_1,uid_0,uid_1))
    # Attempts to assign class weights to LogisticRegression — not working.
    #threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:4092,1:27908},solver='sag'),level,'weighted_lr_sag_1000',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,class_weight={0:4092,1:27908},solver='sag-cg'),level,'weighted_lr_sag_1500',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:27908,1:4092},solver='sag'),level,'weighted_lr_sag_1000_reverse',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,class_weight={0:27908,1:4092},solver='sag-cg'),level,'weighted_lr_sag_1500_reverse',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,class_weight={0:4092,1:27908},n_jobs=-1,solver='newton-cg',verbose=2),level,ftype+'weighted_lr_newton',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:4092,1:27908},n_jobs=-1,solver='lbfgs',verbose=2),level,ftype+'weighted_lr_lbfgs',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:4092,1:27908},n_jobs=-1,solver='liblinear',verbose=2),level,ftype+'weighted_lr_liblinear',X_0,X_1,uid_0,uid_1))
    # Random forests with increasing numbers of trees.
    threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_split=9),level,ftype+'_rf100',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=200,max_depth=8,min_samples_split=9),level,ftype+'_rf200',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=500,max_depth=8,min_samples_split=9),level,ftype+'_rf500',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=1000,max_depth=8,min_samples_split=9),level,ftype+'_rf1000',X_0,X_1,uid_0,uid_1))
    # Attempts to assign class weights to RandomForestClassifier — not working.
    #threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=100,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=3,min_samples_split=7),level,ftype+'shorter_weighted_rf100',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=200,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=5,min_samples_split=8),level,ftype+'shorter_weighted_rf200',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=500,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=6,min_samples_split=9),level,ftype+'shorter_weighted_rf500',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=1000,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=7,min_samples_split=9),level,ftype+'shorter_weighted_rf1000',X_0,X_1,uid_0,uid_1))
    # Gradient boosting with increasing numbers of estimators.
    threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=20,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt20',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=50,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt50',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=100,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt100',X_0,X_1,uid_0,uid_1))
    #threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=200,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'more_esti_gbdt200',X_0,X_1,uid_0,uid_1))
    # AdaBoost over random-forest base estimators.
    threads.append(Level_train_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20,learning_rate=0.02),level,ftype+'_ada20',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=50,learning_rate=0.02),level,ftype+'_ada50',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=2),n_estimators=100,learning_rate=0.02),level,ftype+'_ada100',X_0,X_1,uid_0,uid_1))
    # Bagging ensembles.
    threads.append(Level_train_thread(config_instance,BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20),level,ftype+'_bag20',X_0,X_1,uid_0,uid_1))
    threads.append(Level_train_thread(config_instance,BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=8,min_samples_split=4),n_estimators=50),level,ftype+'_bag50',X_0,X_1,uid_0,uid_1))
    # Base xgboost parameter set; variants below tweak individual entries.
    params = {
        'booster':'gbtree',
        'objective': 'binary:logistic',
        'scale_pos_weight':40920/3080.0,
        'eval_metric': 'auc',
        'gamma':0,
        'max_depth':8,
        'lambda':700,
        'subsample':0.7,
        'colsample_bytree':0.3,
        'min_child_weight':5,
        'eta': 0.02,
        'seed':7,
    }
    # For the xgb classifiers, use Xgb_level_train_thread (calls Mboost.xgb_level_train).
    # NOTE(review): the same params dict object is passed to every thread
    # and mutated between appends — whether each thread copies it before
    # training is not visible here; confirm in Xgb_level_train_thread.
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000',X_0,X_1,uid_0,uid_1,params,1000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000',X_0,X_1,uid_0,uid_1,params,2000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500',X_0,X_1,uid_0,uid_1,params,2500))
    params['scale_pos_weight'] = 40920/2000.0
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_2',X_0,X_1,uid_0,uid_1,params,500))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_2',X_0,X_1,uid_0,uid_1,params,1000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_2',X_0,X_1,uid_0,uid_1,params,2500))
    params['colsample_bytree'] = 0.6
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_3',X_0,X_1,uid_0,uid_1,params,1000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_3',X_0,X_1,uid_0,uid_1,params,2000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_3',X_0,X_1,uid_0,uid_1,params,2500))
    params['eta'] = 0.005
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_4',X_0,X_1,uid_0,uid_1,params,500))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_4',X_0,X_1,uid_0,uid_1,params,2000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_4',X_0,X_1,uid_0,uid_1,params,2500))
    params['eta'] = 0.01
    params['max_depth'] = 7
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_5',X_0,X_1,uid_0,uid_1,params,1000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_5',X_0,X_1,uid_0,uid_1,params,2000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_5',X_0,X_1,uid_0,uid_1,params,2500))
    params['max_depth'] = 9
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_6',X_0,X_1,uid_0,uid_1,params,1000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_6',X_0,X_1,uid_0,uid_1,params,2000))
    threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_6',X_0,X_1,uid_0,uid_1,params,2500))
    # NOTE(review): .run() executes each "thread" sequentially on the
    # calling thread (no .start()/.join() here).
    for thread in threads:
        thread.run()
def level_one_predict():
    """Run the trained level-one classifiers on the final test set.

    Mirrors level_one_wrapper: builds one predict thread per classifier
    (calling Mboost.level_predict / Mboost.output_level_predict) and
    runs them sequentially.  Many entries are commented out, leaving
    only the tree-ensemble models active.
    """
    ftype = ''
    config_instance = Config(ftype)
    level = 'level_one'
    # Overrides the line above; empty ftype selects log_move transferred data.
    config_instance = Config('')
    load_data_instance = load_origin_data.Load_origin_data(config_instance)
    # For local test.
    X, y, uid, X_0, X_1, uid_0, uid_1 = load_data_instance.load_final()
    predict_X, predict_uid = load_data_instance.load_final_test()  # this is the test set
    # Uncomment the three lines below when local verification is needed;
    # keep them commented for the final prediction/testing run.  They
    # load a local train/validation split (20% of training data held out).
    #X_0,test_X_0,X_1,test_X_1,uid_0,test_uid_0,uid_1,test_uid_1=load_data_instance.train_test_xy()
    #predict_X=np.vstack((test_X_0,test_X_1))
    #predict_uid=np.hstack((test_uid_0,test_uid_1))
    threads = []
    # For all classifiers except xgb, use Level_predict_thread (calls Mboost.level_predict).
    #threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='sag'),level,ftype+'_lr_sag',X_0,X_1,predict_X,predict_uid))
    #threads.append(Level_predict_thread(config_instance,LogisticRegression(max_iter=1000,solver='sag'),level,ftype+'_lr_sag_1000',X_0,X_1,predict_X,predict_uid))
    #threads.append(Level_predict_thread(config_instance,LogisticRegression(max_iter=1500,solver='sag'),level,ftype+'_lr_sag_1500',X_0,X_1,predict_X,predict_uid))
    #threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='newton-cg'),level,ftype+'_lr_newton',X_0,X_1,predict_X,predict_uid))
    #threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='lbfgs'),level,ftype+'_lr_lbfgs',X_0,X_1,predict_X,predict_uid))
    #threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='liblinear'),level,ftype+'_lr_liblinear',X_0,X_1,predict_X,predict_uid))
    # Random forests — hyperparameters match the training run in level_one_wrapper.
    threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_split=9),level,ftype+'_rf100',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=200,max_depth=8,min_samples_split=9),level,ftype+'_rf200',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=500,max_depth=8,min_samples_split=9),level,ftype+'_rf500',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=1000,max_depth=8,min_samples_split=9),level,ftype+'_rf1000',X_0,X_1,predict_X,predict_uid))
    # Gradient boosting.
    threads.append(Level_predict_thread(config_instance,GradientBoostingClassifier(n_estimators=20,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt20',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,GradientBoostingClassifier(n_estimators=50,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt50',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,GradientBoostingClassifier(n_estimators=100,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt100',X_0,X_1,predict_X,predict_uid))
    # AdaBoost over random-forest base estimators.
    threads.append(Level_predict_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20,learning_rate=0.02),level,ftype+'_ada20',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=50,learning_rate=0.02),level,ftype+'_ada50',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=100,learning_rate=0.02),level,ftype+'_ada100',X_0,X_1,predict_X,predict_uid))
    # Bagging ensembles.
    threads.append(Level_predict_thread(config_instance,BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20),level,ftype+'_bag20',X_0,X_1,predict_X,predict_uid))
    threads.append(Level_predict_thread(config_instance,BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=50),level,ftype+'_bag50',X_0,X_1,predict_X,predict_uid))
    # Base xgboost parameter set; variants below tweak individual entries.
    params = {
        'booster':'gbtree',
        'objective': 'binary:logistic',
        'scale_pos_weight':40920/3080.0,
        'eval_metric': 'auc',
        'gamma':0,
        'max_depth':8,
        'lambda':700,
        'subsample':0.7,
        'colsample_bytree':0.3,
        'min_child_weight':5,
        'eta': 0.02,
        'seed':7,
        'nthread':8
    }
    # For the xgb classifiers, use Xgb_level_predict_thread (calls Mboost.output_level_predict).
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000',X_0,X_1,predict_X,predict_uid,params,1000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000',X_0,X_1,predict_X,predict_uid,params,2000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500',X_0,X_1,predict_X,predict_uid,params,2500))
    #params['scale_pos_weight']=40920/2000.0
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_2',X_0,X_1,predict_X,predict_uid,params,1000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_2',X_0,X_1,predict_X,predict_uid,params,2000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_2',X_0,X_1,predict_X,predict_uid,params,2500))
    #params['colsample_bytree']=0.6
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_3',X_0,X_1,predict_X,predict_uid,params,1000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_3',X_0,X_1,predict_X,predict_uid,params,2000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_3',X_0,X_1,predict_X,predict_uid,params,2500))
    # NOTE(review): the two live params mutations below only matter if an
    # xgb predict thread is re-enabled after them; confirm before use.
    params['eta'] = 0.005
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_4',X_0,X_1,predict_X,predict_uid,params,1000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_4',X_0,X_1,predict_X,predict_uid,params,2000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_4',X_0,X_1,predict_X,predict_uid,params,2500))
    #params['eta']=0.01
    #params['max_depth']=7
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_5',X_0,X_1,predict_X,predict_uid,params,1000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_5',X_0,X_1,predict_X,predict_uid,params,2000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_5',X_0,X_1,predict_X,predict_uid,params,2500))
    params['max_depth'] = 9
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_6',X_0,X_1,predict_X,predict_uid,params,1000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_6',X_0,X_1,predict_X,predict_uid,params,2000))
    #threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_6',X_0,X_1,predict_X,predict_uid,params,2500))
    # NOTE(review): .run() executes each "thread" sequentially on the
    # calling thread (no .start()/.join() here).
    for thread in threads:
        thread.run()