if model_stacking:
    # Two-fold model stacking on the dummy-encoded categorical features:
    # each half of the training rows gets a prediction from a model that was
    # given the *other* half, and the test rows get a prediction from a model
    # given all training rows.  The predictions become a 'stacked_LR' column.
    print('Performing model stacking....')

    ### JUST CALL SKLEARN AND RANDOMLY SPLIT DATA ####
    # train_test_split lives in sklearn.model_selection since 0.18; the old
    # sklearn.cross_validation module was removed in 0.20.  Fall back to the
    # old location so the script still runs on the sklearn it was written for.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    stack_train_x = dummies_train.values
    # Column 0 of train_data is used as the target here.
    stack_train_y = train_data.values[:, 0].astype(int)
    stack_test_x = dummies_test.values

    # Split the row positions together with the data.  The split is shuffled,
    # so without the positions there is no way to put the out-of-fold
    # predictions back in the original row order afterwards.
    row_ids = list(range(len(stack_train_x)))
    (x_train_A, x_train_B,
     y_train_A, y_train_B,
     rows_A, rows_B) = train_test_split(
        stack_train_x, stack_train_y, row_ids,
        test_size=0.50, random_state=42)

    print('Size of A and B:')
    print('%s %s' % (x_train_A.shape, x_train_B.shape))

    # NOTE(review): the two fold models pass 1/5.6234132519 as the last
    # argument while the full-data model below passes 1.0 -- presumably a
    # regularisation setting; confirm the difference is intentional.
    print('running part A of stacking')
    pred_train_A = udf.ridge_dummy_regression(
        x_train_B, y_train_B, x_train_A, 1/5.6234132519)
    print('running part B of stacking')
    pred_train_B = udf.ridge_dummy_regression(
        x_train_A, y_train_A, x_train_B, 1/5.6234132519)
    print('running test set of stacking')
    pred_test = udf.ridge_dummy_regression(
        stack_train_x, stack_train_y, stack_test_x, 1.0)

    ### Now drop the original categorical columns and append the new predictions from Ridge LR
    train_data = train_data.drop(cat_cols, axis=1)
    test_data = test_data.drop(cat_cols, axis=1)

    # Label every prediction with the training-row position it belongs to,
    # then sort so that row i of the stacked column corresponds to row i of
    # dummies_train (and therefore of train_data, which supplied the targets
    # in the same order).  Simply stacking A on top of B would leave the
    # values in shuffled order and misalign them with train_data.
    fold_A = pd.DataFrame(pred_train_A)
    fold_A.index = rows_A
    fold_B = pd.DataFrame(pred_train_B)
    fold_B.index = rows_B
    train_stacked_col = pd.concat([fold_A, fold_B]).sort_index()
    # Very important!! concat joins based on indices, so hand back a clean
    # 0..n-1 positional index.
    train_stacked_col.reset_index(drop=True, inplace=True)
    train_stacked_col.columns = ['stacked_LR']

    pred_test_col = pd.DataFrame(pred_test)
    pred_test_col.columns = ['stacked_LR']
testfile.close()
print('File written to disk...')

#########################################################################################################
### Split the categorical data into halves, then perform model stacking on dummy var                  ###
# NOTE: the split is positional (first half / second half of the rows), not
# shuffled; any randomness has to come from the existing row order of
# x_train_dummy.  Because A precedes B, predictions for A followed by
# predictions for B are already in the original row order.
data_size = len(x_train_dummy)
# Floor division keeps the slice bound an integer even under true division
# (Python 3 or `from __future__ import division`); identical result on Python 2.
half_size = data_size // 2
x_train_A = x_train_dummy[:half_size, :]
x_train_B = x_train_dummy[half_size:, :]
y_train_A = y_train[:half_size]
y_train_B = y_train[half_size:]

print('Size of A and B:')
print('%s %s' % (x_train_A.shape, x_train_B.shape))

# Each half is predicted by a call that receives the other half's features
# and targets; the test set is predicted from all training rows.
# NOTE(review): fold A passes 1.0/5.62 but fold B passes 1.0/31.62 as the
# last argument (the test-set call uses 1.0/5.62 again) -- confirm that the
# differing value for fold B is intentional.
print('running part A of stacking')
pred_train_A = udf.ridge_dummy_regression(x_train_B, y_train_B, x_train_A, 1.0/5.62)
print('running part B of stacking')
pred_train_B = udf.ridge_dummy_regression(x_train_A, y_train_A, x_train_B, 1.0/31.62)
print('running test set of stacking')
pred_test = udf.ridge_dummy_regression(x_train_dummy, y_train, x_test_dummy, 1.0/5.62)

##########################################################################################################
### Now we have the dummy variables predicted, append it onto the train + test dataset                 ###
# 3. Second, we will append onto the original data-set
# 4. Take out columns with near-zero Gini

### Unnamed:0 is a column that just appears after concatenation or reading from R csv file ###
# Both drops are in place; the training frame additionally loses 'target'.
train_data.drop(dummy_cols + ['target', 'Unnamed: 0'], axis=1, inplace=True)
oot1_data.drop(dummy_cols + ['Unnamed: 0'], axis=1, inplace=True)