Example #1
0
	if model_stacking:
		print 'Performing model stacking....'

		### JUST CALL SKLEARN AND RANDOMLY SPLIT DATA ####
		from sklearn.cross_validation import train_test_split
		stack_train_x=dummies_train.values
		stack_train_y=train_data.values[:,0].astype(int)
		stack_test_x=dummies_test.values

		x_train_A, x_train_B, y_train_A, y_train_B = train_test_split(stack_train_x,stack_train_y,test_size=0.50, random_state=42)

		print 'Size of A and B:'
		print x_train_A.shape, x_train_B.shape

		print 'running part A of stacking'
		pred_train_A=udf.ridge_dummy_regression(x_train_B,y_train_B,x_train_A,1/5.6234132519)
		print 'running part B of stacking'
		pred_train_B=udf.ridge_dummy_regression(x_train_A,y_train_A,x_train_B,1/5.6234132519)
		print 'running test set of stacking'
		pred_test=udf.ridge_dummy_regression(stack_train_x,stack_train_y,stack_test_x,1.0)

		### Now drop the original categorical columns and append the new predictions from Ridge LR
		train_data=train_data.drop(cat_cols,axis=1)
		test_data=test_data.drop(cat_cols,axis=1)

		train_stacked_col=pd.concat([pd.DataFrame(pred_train_A),pd.DataFrame(pred_train_B)])
		train_stacked_col.reset_index(drop=True, inplace=True) #Very important!! concat joins based on indices
		train_stacked_col.columns=list(['stacked_LR'])

		pred_test_col=pd.DataFrame(pred_test)
		pred_test_col.columns=list(['stacked_LR'])
	testfile.close()
	print 'File written to disk...'

#########################################################################################################
### Split the categorical (dummy-variable) data into two halves, then perform model stacking on it  ###
### NOTE(review): the split below is positional (first half / second half), not shuffled --
### presumably the rows were already randomized upstream; confirm before relying on it.
data_size=len(x_train_dummy)
# Floor division keeps the midpoint an integer, so it stays a valid slice bound
# even when '/' means true division (Python 3, or `from __future__ import division`).
# Under Python 2 integer division this is the same value as data_size/2.
half_size=data_size//2
x_train_A=x_train_dummy[:half_size,:]
x_train_B=x_train_dummy[half_size:,:]
y_train_A=y_train[:half_size]
y_train_B=y_train[half_size:]

# Single-argument print(...) and %-formatting emit the same text under Python 2 and 3.
print('Size of A and B:')
print('%s %s' % (x_train_A.shape, x_train_B.shape))

# Each half gets its predictions from a call that is handed the OTHER half's x/y first.
# Presumably the helper's arguments are (fit_x, fit_y, predict_x, regularization) --
# TODO confirm against udf.ridge_dummy_regression.
print('running part A of stacking')
pred_train_A=udf.ridge_dummy_regression(x_train_B,y_train_B,x_train_A,1.0/5.62)
print('running part B of stacking')
# NOTE(review): this fold passes 1.0/31.62 while part A and the test-set call pass 1.0/5.62;
# left unchanged -- confirm whether the difference is intentional.
pred_train_B=udf.ridge_dummy_regression(x_train_A,y_train_A,x_train_B,1.0/31.62)
print('running test set of stacking')
# The test-set predictions come from a call that is given the full training data.
pred_test=udf.ridge_dummy_regression(x_train_dummy,y_train,x_test_dummy,1.0/5.62)

##########################################################################################################
### Now we have the dummy variables predicted, append it onto the train + test dataset ###
"""
3. Second, we will append onto the original data-set
4. Take out columns with near-zero Gini
"""
### 'Unnamed: 0' is a stray column that shows up after concatenation or when reading a csv file written from R ###
# Drop the dummy columns and the stray 'Unnamed: 0' column from both frames, in place;
# 'target' is additionally dropped from the training frame only.
for frame, extra_cols in ((train_data, ['target', 'Unnamed: 0']), (oot1_data, ['Unnamed: 0'])):
	frame.drop(dummy_cols + extra_cols, axis=1, inplace=True)