def level_ranks(self, level, name):
        """
        Return the ranking of each sample under the given classifier.
        """
        config_instance = Config('')  # choose the log_move-transformed data
        load_data_instance = load_origin_data.Load_origin_data(config_instance)
        X_0, test_X_0, X_1, test_X_1, uid_0, test_uid_0, uid_1, test_uid_1 = \
            load_data_instance.local_verify()

        test_uid_0 = test_uid_0.astype('int').tolist()
        test_uid_1 = test_uid_1.astype('int').tolist()

        ranks = {}
        column_dict = self.load_clf_file(level, name)
        column_dict2 = sorted(
            column_dict.items(),
            key=lambda d: d[1])  # sort scores in ascending order

        for i, (uid, score) in enumerate(column_dict2):
            rank = ranks.get(uid, [])
            rank.append(i)
            ranks[uid] = rank

        return ranks
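
A minimal standalone sketch of the ranking step above, assuming load_clf_file returns a {uid: score} dict and each uid appears once (the sample scores are hypothetical):

# Hypothetical scores; sorting ascending assigns rank 0 to the lowest score.
column_dict = {101: 0.8, 102: 0.5, 103: 0.1}
ranks = {uid: [i] for i, (uid, _) in
         enumerate(sorted(column_dict.items(), key=lambda d: d[1]))}
print(ranks)  # {103: [0], 102: [1], 101: [2]}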
Example 2
    def level_data(self):
        """
         read the result from last level as the features for the next level
		"""
        level = self.level
        clf_name = self.__clf_name
        config_instance = Config('')  #choose log_move transfered data
        load_data_instance = load_origin_data.Load_origin_data(config_instance)

        X, uids = load_data_instance.load_final_test()
        print('X shape:', X.shape, 'uids shape:', uids.shape)

        d = {}
        for name in clf_name:
            column_dict = self.load_clf_file(level, name)
            for uid in uids:
                temp = d.get(uid, [])
                temp.append(column_dict[uid])
                d[uid] = temp

        X = [d[uid] for uid in uids]  # one row of classifier scores per uid

        return np.array(X), np.array(uids)
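
In effect, each row of the returned X holds one score per classifier for a given uid. A toy illustration under that assumption, with hypothetical data:

# Two classifiers' {uid: score} dicts become a 3x2 feature matrix.
import numpy as np
scores_by_clf = [{1: 0.2, 2: 0.9, 3: 0.4}, {1: 0.3, 2: 0.8, 3: 0.5}]
uids = [1, 2, 3]
X = np.array([[cd[u] for cd in scores_by_clf] for u in uids])
print(X.shape)  # (3, 2): one row per uid, one column per classifier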
Example 3
	def level_data(self):
		"""
		Prepare the input for plain stacking: the outputs of the classifiers
		in level_one. Returns the ranking of every observation across the
		predictions of all classifiers on the previous level, e.g.:

		    uid   ranking_lr  ranking_xgb1  ...  ranking_rf2
		    5662        6372          7352            72
		    5663         782           672           673
		    ...
		"""
		level=self.level
		clf_name=self.__clf_name

		config_instance=Config('')# choose the log_move-transformed data
		load_data_instance=load_origin_data.Load_origin_data(config_instance)

		X,y,uids,X_0,X_1,uid_00,uid_11=load_data_instance.load_final()

		# uncomment the three lines of code below when local verification is needed;
		# comment them out when doing the final prediction and testing.
		# The scripts below load data split locally into train and validation,
		# taking 20% of the training data as the validation set.
		#X_00,test_X_00,X_11,test_X_11,uid_00,test_uid_00,uid_11,test_uid_11=load_data_instance.train_test_xy()
		#y=np.hstack((np.ones(len(X_00)),np.zeros(len(X_11))))
		#uids=np.hstack((uid_00,uid_11))

		column_important=[]
		d={}
		for name in clf_name:

			column_dict=self.load_clf_file(level,name)# dict mapping each observation's uid to this classifier's score
			column_score=self.load_clf_score(level,name)# average AUC of this classifier on the previous level
			column_important.append(column_score)# column_important collects the average score of each classifier in clf_name

			for uid in uids:
				temp=d.get(uid,[])
				temp.append(column_dict[uid])# append this classifier's score for uid
				d[uid]=temp# d now maps each uid to its list of classifier scores

		X_0=[]
		X_1=[]
		uid_0=[]
		uid_1=[]

		# split d back into negatives and positives; return X_0, X_1, uid_0, uid_1
		for i in range(len(y)):
			if y[i]==1:
				X_1.append(d[uids[i]])
				uid_1.append(uids[i])
			else:
				X_0.append(d[uids[i]])
				uid_0.append(uids[i])

		print( "shape of X_0 is ",(np.array(X_0).shape),"shape of X_1 is ",(np.array(X_1).shape),"shape of uid_0 is ",(np.array(uid_0)).shape,"shape of uid_1 is ",(np.array(uid_1)).shape)
		return np.array(X_0),np.array(X_1),np.array(uid_0),np.array(uid_1)
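
A sketch of how the returned arrays might feed a level-two learner; the caller object and the choice of LogisticRegression are illustrative assumptions, not the repository's actual pipeline:

# Hypothetical usage: fit a level-two model on the stacked score features.
import numpy as np
from sklearn.linear_model import LogisticRegression

X_0, X_1, uid_0, uid_1 = level_two_instance.level_data()  # hypothetical caller
X = np.vstack((X_0, X_1))
y = np.hstack((np.zeros(len(X_0)), np.ones(len(X_1))))
clf = LogisticRegression(solver='sag').fit(X, y)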
Example 4
	def level_data(self):
		level=self.level
		clf_name=self.__clf_name
		
		# load data and split into training and validation sets
		config_instance=Config('')# choose the log_move-transformed data
		load_data_instance=load_origin_data.Load_origin_data(config_instance)
		X_0,test_X_0,X_1,test_X_1,uid_0,test_uid_0,uid_1,test_uid_1=load_data_instance.local_verify()

		# convert the uids to int lists
		test_uid_0=test_uid_0.astype('int').tolist()
		test_uid_1=test_uid_1.astype('int').tolist()

		# loop over the classifiers
		for name in clf_name:
			prob=[]
			real=[]
			prob_1=[]
			prob_0=[]

			column_dict=self.load_clf_file(level,name)# scores of this classifier

			column_dict2=sorted(column_dict.items(),key=lambda d:d[1])# sort scores in ascending order

			clf=[
				'_lr_sag',
				#'_lr_newton',
				#'_lr_lbfgs',
				#'_lr_liblinear',
				#'log_move_rf100',
				# 'log_move_rf200',
				# 'log_move_rf500',
				# 'log_move_rf1000',
				# 'log_move_gbdt20',
				# 'log_move_gbdt50',
				#'log_move_gbdt100',
				# 'log_move_ada20',
				# 'log_move_ada50',
				#'log_move_ada100',
				#'_xgb2000',
				#'_xgb2500',
				#'_xgb2000_2',
				#'_xgb2500_2'

			]
            
			# call level_ranks to get the ranking of samples; a smaller score
			# means a smaller rank, since scores are sorted in ascending order
			ranks=[]# rankings of the reference classifiers listed above
			for f_name in clf:
				rank=self.level_ranks('level_one',f_name)# or 'level_two'
				ranks.append(rank)

			column_ranks=self.level_ranks(level,name)# ranking under the current classifier


			i=0
			aa=0
			correct_count=0
			strategy_2_region_1_correct_count=0
			strategy_2_region_1_wrong_count=0

			strategy_2_region_2_correct_count=0
			strategy_2_region_3_correct_count=0
			strategy_2_region_2_wrong_count=0
			strategy_2_region_3_wrong_count=0

#benchmark
			AUC_BBM_Level_One=0.7941285732932
			AUC_BBM_Level_One_hold_out=0.780055100126

			AUC_BBM_Level_two=0.781605557816#gbdt20

			wrong_count=0
			r_lr=0
			one_diff=[]
			zero_diff=[]
			one_index=[]
			zero_index=[]
            
			#choose interval of samples to blend

			# xgb_ranks_true=[]
			# xgb_ranks_false=[]
			# lr_ranks_true=[]
			# lr_ranks_false=[]
			# for k in range(21):
			# 	xgb_ranks_true.append(0)
			# 	xgb_ranks_false.append(0)
			# 	lr_ranks_true.append(0)
			# 	lr_ranks_false.append(0)
			# print(xgb_ranks_true)

			for uid, score in column_dict2:
				# if i<2000:
				# 	i+=1
				# 	continue
				diff=0# diff: gap between this observation's rank under the current classifier and under the reference classifiers
				for rank in ranks:
					diff+=column_ranks[uid][0]-rank[uid][0]# column_ranks holds the current classifier's ranking; rank holds a reference ranking


##########################
#  strategy 2: interval  #
########################## auc 0.754049839922 beats the benchmark auc 0.753618206126; corrects 25 good users
				#the first interval
				if i>=9000/4 and i <=14000/4:
					if diff>9000/4:
						#column_dict[uid]=0				    
						r_lr+=1

						if uid in test_uid_0:					
						    strategy_2_region_1_correct_count+=1
						if uid in test_uid_1:					
						    strategy_2_region_1_wrong_count+=1
		
				#the second interval
				if i>=14000/4 and i<=16000/4:# a more radical bound of 25000/4 will trigger more fixes
					if diff>12000/4:
						#column_dict[uid]=0
						r_lr+=1

						if uid in test_uid_0:					
						    strategy_2_region_2_correct_count+=1
						if uid in test_uid_1:					
						    strategy_2_region_2_wrong_count+=1

				#the third interval
				if i>=20000/4 and i<=23000/4:# a more radical bound of 25000/4 will trigger more fixes
					if diff>15000/4:
						#column_dict[uid]=0
						r_lr+=1

						if uid in test_uid_0:					
						    strategy_2_region_3_correct_count+=1

						if uid in test_uid_1:					
						    strategy_2_region_3_wrong_count+=1

#################################
#  strategy 3: subselect train  #
#################################
				# select samples above the line diff > 2000/4 + 0.5*i
				if diff>2000/4+i*0.5:
					if rank[uid][0]<160:# tried 50, 100, 150, 200; 170 noted as optimal
						column_dict[uid]=0

						r_lr+=1
						if uid in test_uid_0:					
						    correct_count+=1
						if uid in test_uid_1:					
						    wrong_count+=1

				if uid in test_uid_0:
					zero_diff.append(diff)
					zero_index.append(i)
					aa+=1

				if uid in test_uid_1:
					one_diff.append(diff)
					one_index.append(i)

				i+=1

			print('hold-out size:',500)

			print("test uid size: ",(len(test_uid_0)+len(test_uid_1)))
			print(aa)
			print("numbers of estimation fixed: ",r_lr)
			print("correct fixing: ",correct_count)
			print("wrong fixing: ",wrong_count)		
			print("strategy_2_region_1_correct_count: ",strategy_2_region_1_correct_count)
			print("strategy_2_region_1_wrong_count: ",strategy_2_region_1_wrong_count)			
			print("strategy_2_region_2_correct_count: ",strategy_2_region_2_correct_count)
			print("strategy_2_region_2_wrong_count: ",strategy_2_region_2_wrong_count)				
			print("strategy_2_region_3_correct_count: ",strategy_2_region_3_correct_count)
			print("strategy_2_region_3_wrong_count: ",strategy_2_region_3_wrong_count)			

			#calculate AUC after blending
			for uid,score in column_dict.items():
				prob.append(score)
				if uid in test_uid_0:
					real.append(0)
					prob_0.append(score)
				elif uid in test_uid_1:
					real.append(1)
					prob_1.append(score)
				else:
					print("error")

			auc_score=metrics.roc_auc_score(real,prob)# benchmark auc 0.753618206126; blended auc 0.760720180713
			print( "auc :",(auc_score))

			print( "auc increase:",(auc_score-AUC_BBM_Level_One_hold_out))#drop to auc: 0.753348228282 when one bad is estimated as good 
			print( '0:',max(prob_0),min(prob_0))
			print( "1:",max(prob_1),min(prob_1))
 

			#plot the ranking difference among classifiers
			idex=0
			#self.print_diff(zero_diff[idex:],zero_index[idex:],one_diff[idex:],one_index[idex:])
			return
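
The "strategy 3" rule above keeps a sample when its rank gap grows faster than the line diff > b + a*i. A minimal sketch of that test in isolation, where a and b mirror the 0.5 and 2000/4 used above and the data is hypothetical:

# Hypothetical (position i, rank-gap diff) pairs tested against the line.
a, b = 0.5, 2000 / 4
samples = [(0, 600.0), (1, 480.0), (2, 510.5)]
selected = [i for i, diff in samples if diff > b + a * i]
print(selected)  # [0, 2]: positions whose gap exceeds b + a*i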
Example 5
	def level_data_part(self):
		"""
         select samples that are close to default points and have relative large ranking differences 
         between the predicted scores on BBM(Usually XGB) and DBM(linear model usually)
         use these samples to do next level LR training
		"""
		level=self.level
		clf_name=self.__clf_name
        
        ############################### 
        #      data prepariation      #
        ###############################
        

		config_instance=Config('')
		load_data_instance=load_origin_data.Load_origin_data(config_instance)

		X,y,uids,X_0,X_1,uid_00,uid_11=load_data_instance.load_final()


		# uncomment the three lines of code below when local verification is needed;
		# comment them out when doing the final prediction and testing.
		# The scripts below load data split locally into train and validation,
		# taking 20% of the training data as the validation set.
		#X_00,test_X_00,X_11,test_X_11,uid_00,test_uid_00,uid_11,test_uid_11=load_data_instance.train_test_xy(1)
		#y=np.hstack((np.zeros(len(X_00)),np.ones(len(X_11))))
		#uids=np.hstack((uid_00,uid_11))
	               
		###############################
		#      stacking ensemble      #
		###############################
		# pick out samples whose predicted-score ranking is highly volatile across all samples
		column_important=[]
		d={}
		diff_uid=set([])# stores samples located above the line

		# loop over the classifiers
		for name in clf_name:

			column_dict=self.load_clf_file(level,name)# load this classifier's prediction score for each observation

			column_score=self.load_clf_score(level,name)# load the average AUC score of this classifier

			column_important.append(column_score)# record the average performance of this classifier

			column_rank=self.level_ranks(column_dict)# ranking of observations; a smaller score means a higher rank
                              

			#lr_dict2=self.load_clf_file('level_two','_lr_sag')# DBM_Sub: predictions of the selected samples re-fitted by LR in Level_Two
			#lr_rank2=self.level_ranks(lr_dict2)

			_lr_liblinear_dict=self.load_clf_file(level,'_lr_sag')# alternatives: _xgb1000, _lr_sag_1500
			_lr_liblinear_rank=self.level_ranks(_lr_liblinear_dict)

			#print('lr_rank2',len(lr_rank2))

			print("classifier ",name, " in level_one model fitting achieved average AUC: ",column_score)
            
			column_dict2=sorted(column_dict.items(),key=lambda d:d[1])#observation ranking again
            
			max_column=max([v for k,v in column_dict.items()])#highest score in this classifier's prediction
			min_column=min([v for k,v in column_dict.items()])#smallest score in this classifier's prediction

			#max_lr=max([v for k,v in lr_dict2.items()])#highest score in DBM's prediction
			#min_lr=min([v for k,v in lr_dict2.items()])#smallest score in  DBM's prediction
            
			print( 'highest score in BBM is: '+str(max_column),' ','lowest score in BBM: '+str(min_column))
			#print( 'highest score in DBM_Sub : '+str(max_lr),' ','lowest score in DBM_Sub: '+str(min_lr))

			i=0
			r_lr=0
			correct_count=0
			wrong_count=0
			prob=[]
			real=[]
			prob_1=[]
			prob_0=[]
			one_diff=[]
			zero_diff=[]
			one_index=[]
			zero_index=[]
			yy=[]
			scores=[]
            
			# loop to generate diff_uid
			for uid,score in column_dict2:
				#score=(score-min_column)/(max_column-min_column)# min-max normalization: score becomes the probability of default
				temp=d.get(uid,[])
				temp.append(column_dict[uid])
				d[uid]=temp

############################
#  choose benchmarks here  #
############################

				# difference between this sample's ranking under the current (BBM) model and under the benchmark model
				diff=column_rank[uid]-_lr_liblinear_rank[uid]

				# negative samples: record the difference and append 0 to yy
				if uid in uid_00:
					zero_diff.append(diff)
					zero_index.append(i)
					yy.append(0)
				# positive samples: record the difference and append 1 to yy
				else:
					one_diff.append(diff)
					one_index.append(i)
					yy.append(1)
################ 
#  strategy 2  #
################
				# choose samples above the line with a = 0.42 and b = 3000
				#if diff>3000+i*0.42:# or diff>2500+i*0.2; choose a and b here
				#	diff_uid.add(uid)
				#	if _lr_liblinear_dict[uid]<500:# >200
				#		score=0.7+0.3*((score-min_lr)/(max_lr-min_lr))# rescale with the DBM score
				#		#score=-100# alternatively, force the score to be very small

################ 
#  strategy 3  #
################
				# choose samples above the line with a = 0.5 and b = 2000
				if diff>2000+i*0.5:
					diff_uid.add(uid)
			#		if lr_rank2[uid][0]<200:
			#			column_dict[uid]=0
						#score=10000

			#			r_lr+=1
			#			if uid in uid_00:					
			#			    correct_count+=1
			#			if uid in uid_11:					
			#			    wrong_count+=1
				scores.append(score)
				i+=1
			# end of loop

			idex=0

			#calculate AUC after blending
			for uid,score in column_dict.items():
				prob.append(score)
				if uid in uid_00:
					real.append(0)
					prob_0.append(score)
				elif uid in uid_11:
					real.append(1)
					prob_1.append(score)
				else:
					print("error")

			auc_score=metrics.roc_auc_score(real,prob)#benchmark BBM in level_one 0.7505509665787999

			print("num of samples are above the line: ",len(diff_uid))
			print("numbers of estimation fixed: ",r_lr)
			print("correct fixing: ",correct_count)
			print("wrong fixing: ",wrong_count)
			print('auc increase:',auc_score-column_score)# a positive value justifies this parameter choice
			self.print_diff(zero_diff[idex:],zero_index[idex:],one_diff[idex:],one_index[idex:])
			break
        
		# end of loop over the classifiers

		X_0=[]
		X_1=[]
		uid_0=[]
		uid_1=[]

		for i in range(len(y)):
			if uids[i] in diff_uid:
				if y[i]==0:
					#print i
					X_0.append(X[i])#select samples above the line
					uid_0.append(uids[i])
				else:
					X_1.append(X[i])#select samples above the line
					uid_1.append(uids[i])

		return np.array(X_0),np.array(X_1),np.array(uid_0),np.array(uid_1)
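
print_diff itself is not shown in this excerpt; a rough sketch of what such a plot might look like with matplotlib, everything below being an assumption about its shape:

# Hypothetical plot: rank differences vs. sorted position, with the
# selection line diff = 2000 + 0.5*i drawn on top.
import numpy as np
import matplotlib.pyplot as plt

def plot_diff(zero_diff, zero_index, one_diff, one_index, a=0.5, b=2000):
    plt.scatter(zero_index, zero_diff, s=2, label='good (y=0)')
    plt.scatter(one_index, one_diff, s=2, label='bad (y=1)')
    xs = np.arange(max(zero_index + one_index) + 1)
    plt.plot(xs, b + a * xs, 'k--', label='diff = b + a*i')
    plt.xlabel('sorted position i')
    plt.ylabel('rank difference')
    plt.legend()
    plt.show()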
Example 6
def level_one_wrapper():
	ftype=''# the chosen data preprocessing type
	level='level_one'# the level of training
	config_instance=Config('')# choose the log_move-transformed data
	load_data_instance=load_origin_data.Load_origin_data(config_instance)

	X,y,uid,X_0,X_1,uid_0,uid_1=load_data_instance.load_final()


	"""
	append a variety of individual classifiers into Level_train_thread
    obtain multiple results that are based on different classifiers and parameters 
    classifiers include: linear classifiers, random forest, gbm, ada boost, bagging of classifiers and xgb
	"""
     #the list of threads
	threads=[]
     #for all classifiers except for xgb, call Level_train_thread and call Mboost.level_train
	threads.append(Level_train_thread(config_instance,LogisticRegression(solver='sag'),level,'_lr_sag',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,solver='sag'),level,'_lr_sag_1000',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,solver='sag'),level,'_lr_sag_1500',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,LogisticRegression(solver='newton-cg'),level,ftype+'_lr_newton',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,LogisticRegression(solver='lbfgs'),level,ftype+'_lr_lbfgs',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,LogisticRegression(solver='liblinear'),level,ftype+'_lr_liblinear',X_0,X_1,uid_0,uid_1))
     
	# tried assigning class weights to LogisticRegression; it did not help
	#threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:4092,1:27908},solver='sag'),level,'weighted_lr_sag_1000',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,class_weight={0:4092,1:27908},solver='sag-cg'),level,'weighted_lr_sag_1500',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:27908,1:4092},solver='sag'),level,'weighted_lr_sag_1000_reverse',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,class_weight={0:27908,1:4092},solver='sag-cg'),level,'weighted_lr_sag_1500_reverse',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1500,class_weight={0:4092,1:27908},n_jobs=-1,solver='newton-cg',verbose=2),level,ftype+'weighted_lr_newton',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:4092,1:27908},n_jobs=-1,solver='lbfgs',verbose=2),level,ftype+'weighted_lr_lbfgs',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,LogisticRegression(max_iter=1000,class_weight={0:4092,1:27908},n_jobs=-1,solver='liblinear',verbose=2),level,ftype+'weighted_lr_liblinear',X_0,X_1,uid_0,uid_1))
	
	threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_split=9),level,ftype+'_rf100',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=200,max_depth=8,min_samples_split=9),level,ftype+'_rf200',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=500,max_depth=8,min_samples_split=9),level,ftype+'_rf500',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=1000,max_depth=8,min_samples_split=9),level,ftype+'_rf1000',X_0,X_1,uid_0,uid_1))

	# tried assigning class weights to RandomForestClassifier; it did not help
	#threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=100,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=3,min_samples_split=7),level,ftype+'shorter_weighted_rf100',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=200,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=5,min_samples_split=8),level,ftype+'shorter_weighted_rf200',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=500,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=6,min_samples_split=9),level,ftype+'shorter_weighted_rf500',X_0,X_1,uid_0,uid_1))
	#threads.append(Level_train_thread(config_instance,RandomForestClassifier(n_estimators=1000,class_weight={0:4092,1:27908},n_jobs=-1,max_depth=7,min_samples_split=9),level,ftype+'shorter_weighted_rf1000',X_0,X_1,uid_0,uid_1))

	threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=20,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt20',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=50,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt50',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=100,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt100',X_0,X_1,uid_0,uid_1))

	#threads.append(Level_train_thread(config_instance,GradientBoostingClassifier(n_estimators=200,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'more_esti_gbdt200',X_0,X_1,uid_0,uid_1))

	threads.append(Level_train_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20,learning_rate=0.02),level,ftype+'_ada20',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=50,learning_rate=0.02),level,ftype+'_ada50',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=2),n_estimators=100,learning_rate=0.02),level,ftype+'_ada100',X_0,X_1,uid_0,uid_1))

	threads.append(Level_train_thread(config_instance,BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20),level,ftype+'_bag20',X_0,X_1,uid_0,uid_1))
	threads.append(Level_train_thread(config_instance,BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=8,min_samples_split=4),n_estimators=50),level,ftype+'_bag50',X_0,X_1,uid_0,uid_1))

	params={
		'booster':'gbtree',
		'objective':'binary:logistic',
		'scale_pos_weight':40920/3080.0,# class-imbalance weight, typically n_negative / n_positive
		'eval_metric':'auc',
		'gamma':0,
		'max_depth':8,
		'lambda':700,
		'subsample':0.7,
		'colsample_bytree':0.3,
		'min_child_weight':5,
		'eta':0.02,
		'seed':7,
	}
	# for the xgb classifiers, create an Xgb_level_train_thread, which calls Mboost.xgb_level_train
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000',X_0,X_1,uid_0,uid_1,params,1000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000',X_0,X_1,uid_0,uid_1,params,2000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500',X_0,X_1,uid_0,uid_1,params,2500))

	params['scale_pos_weight']=40920/2000.0
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_2',X_0,X_1,uid_0,uid_1,params,500))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_2',X_0,X_1,uid_0,uid_1,params,1000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_2',X_0,X_1,uid_0,uid_1,params,2500))

	params['colsample_bytree']=0.6
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_3',X_0,X_1,uid_0,uid_1,params,1000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_3',X_0,X_1,uid_0,uid_1,params,2000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_3',X_0,X_1,uid_0,uid_1,params,2500))

	params['eta']=0.005
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_4',X_0,X_1,uid_0,uid_1,params,500))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_4',X_0,X_1,uid_0,uid_1,params,2000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_4',X_0,X_1,uid_0,uid_1,params,2500))

	params['eta']=0.01
	params['max_depth']=7
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_5',X_0,X_1,uid_0,uid_1,params,1000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_5',X_0,X_1,uid_0,uid_1,params,2000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_5',X_0,X_1,uid_0,uid_1,params,2500))

	params['max_depth']=9
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb1000_6',X_0,X_1,uid_0,uid_1,params,1000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2000_6',X_0,X_1,uid_0,uid_1,params,2000))
	threads.append(Xgb_level_train_thread(config_instance,level,ftype+'_xgb2500_6',X_0,X_1,uid_0,uid_1,params,2500))
	
	for thread in threads:
		thread.run()
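
Level_train_thread itself is not shown in this excerpt. As a rough sketch, such a wrapper might subclass threading.Thread and delegate to Mboost.level_train; the constructor and call signatures below are assumptions, not the actual class:

# Hypothetical sketch of a Level_train_thread-style wrapper.
import threading

class LevelTrainThreadSketch(threading.Thread):
    def __init__(self, config, clf, level, name, X_0, X_1, uid_0, uid_1):
        super().__init__()
        self.config, self.clf, self.level, self.name = config, clf, level, name
        self.X_0, self.X_1, self.uid_0, self.uid_1 = X_0, X_1, uid_0, uid_1

    def run(self):
        # Assumed delegation: cross-validated training via the Mboost helper.
        Mboost(self.config).level_train(self.clf, self.level, self.name,
                                        self.X_0, self.X_1,
                                        self.uid_0, self.uid_1)

Note that the loop above invokes thread.run() directly rather than thread.start(), so the jobs execute sequentially on the calling thread.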
Example 7
def level_one_predict():
	ftype=''
	level='level_one'
	config_instance=Config('')# choose the log_move-transformed data
	load_data_instance=load_origin_data.Load_origin_data(config_instance)

	# for local test
	X,y,uid,X_0,X_1,uid_0,uid_1=load_data_instance.load_final()
	predict_X,predict_uid=load_data_instance.load_final_test()# this is the test set

	# uncomment the three lines of code below when local verification is needed;
	# comment them out when doing the final prediction and testing.
	# The scripts below load data split locally into train and validation,
	# taking 20% of the training data as the validation set.
	#X_0,test_X_0,X_1,test_X_1,uid_0,test_uid_0,uid_1,test_uid_1=load_data_instance.train_test_xy()
	#predict_X=np.vstack((test_X_0,test_X_1))
	#predict_uid=np.hstack((test_uid_0,test_uid_1))

	threads=[]
	# for all classifiers except xgb, create a Level_predict_thread, which calls Mboost.level_predict
	#threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='sag'),level,ftype+'_lr_sag',X_0,X_1,predict_X,predict_uid))
	#threads.append(Level_predict_thread(config_instance,LogisticRegression(max_iter=1000,solver='sag'),level,ftype+'_lr_sag_1000',X_0,X_1,predict_X,predict_uid))
	#threads.append(Level_predict_thread(config_instance,LogisticRegression(max_iter=1500,solver='sag'),level,ftype+'_lr_sag_1500',X_0,X_1,predict_X,predict_uid))


	#threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='newton-cg'),level,ftype+'_lr_newton',X_0,X_1,predict_X,predict_uid))
	#threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='lbfgs'),level,ftype+'_lr_lbfgs',X_0,X_1,predict_X,predict_uid))
	#threads.append(Level_predict_thread(config_instance,LogisticRegression(solver='liblinear'),level,ftype+'_lr_liblinear',X_0,X_1,predict_X,predict_uid))

	threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_split=9),level,ftype+'_rf100',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=200,max_depth=8,min_samples_split=9),level,ftype+'_rf200',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=500,max_depth=8,min_samples_split=9),level,ftype+'_rf500',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,RandomForestClassifier(n_estimators=1000,max_depth=8,min_samples_split=9),level,ftype+'_rf1000',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,GradientBoostingClassifier(n_estimators=20,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt20',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,GradientBoostingClassifier(n_estimators=50,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt50',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,GradientBoostingClassifier(n_estimators=100,max_depth=8,min_samples_split=9,learning_rate=0.02,subsample=0.7),level,ftype+'_gbdt100',X_0,X_1,predict_X,predict_uid))

	threads.append(Level_predict_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20,learning_rate=0.02),level,ftype+'_ada20',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=50,learning_rate=0.02),level,ftype+'_ada50',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=100,learning_rate=0.02),level,ftype+'_ada100',X_0,X_1,predict_X,predict_uid))

	threads.append(Level_predict_thread(config_instance,BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=20),level,ftype+'_bag20',X_0,X_1,predict_X,predict_uid))
	threads.append(Level_predict_thread(config_instance,BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50,max_depth=8,min_samples_split=9),n_estimators=50),level,ftype+'_bag50',X_0,X_1,predict_X,predict_uid))

	params={
		'booster':'gbtree',
		'objective':'binary:logistic',
		'scale_pos_weight':40920/3080.0,
		'eval_metric':'auc',
		'gamma':0,
		'max_depth':8,
		'lambda':700,
		'subsample':0.7,
		'colsample_bytree':0.3,
		'min_child_weight':5,
		'eta':0.02,
		'seed':7,
		'nthread':8
	}
    
	# for the xgb classifiers, create an Xgb_level_predict_thread, which calls Mboost.output_level_predict
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000',X_0,X_1,predict_X,predict_uid,params,1000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000',X_0,X_1,predict_X,predict_uid,params,2000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500',X_0,X_1,predict_X,predict_uid,params,2500))

	#params['scale_pos_weight']=40920/2000.0
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_2',X_0,X_1,predict_X,predict_uid,params,1000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_2',X_0,X_1,predict_X,predict_uid,params,2000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_2',X_0,X_1,predict_X,predict_uid,params,2500))

	#params['colsample_bytree']=0.6
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_3',X_0,X_1,predict_X,predict_uid,params,1000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_3',X_0,X_1,predict_X,predict_uid,params,2000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_3',X_0,X_1,predict_X,predict_uid,params,2500))

	params['eta']=0.005
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_4',X_0,X_1,predict_X,predict_uid,params,1000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_4',X_0,X_1,predict_X,predict_uid,params,2000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_4',X_0,X_1,predict_X,predict_uid,params,2500))

	#params['eta']=0.01
	#params['max_depth']=7
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_5',X_0,X_1,predict_X,predict_uid,params,1000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_5',X_0,X_1,predict_X,predict_uid,params,2000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_5',X_0,X_1,predict_X,predict_uid,params,2500))

	params['max_depth']=9
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb1000_6',X_0,X_1,predict_X,predict_uid,params,1000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2000_6',X_0,X_1,predict_X,predict_uid,params,2000))
	#threads.append(Xgb_level_predict_thread(config_instance,level,ftype+'_xgb2500_6',X_0,X_1,predict_X,predict_uid,params,2500))
	for thread in threads:
		thread.run()