def model_train_validation(ins_file, oos_file, classifier, var_list_filename,
                           result_dir, output_suffix):
    """Fit ``classifier`` on the training file and evaluate train/validation.

    Steps performed, in order:

    1. Load training data from ``ins_file`` and validation data from
       ``oos_file`` through ``load_data`` (target column name ``'target'``).
    2. Fit ``classifier`` on the training data and pickle the fitted model
       to ``result_dir + "model.p"``.
    3. Score both data sets with ``predict_proba`` and hand column 1 of the
       result to ``performance_eval_train_validation``.
    4. If the fitted model exposes ``feature_importances_``, write them to
       ``result_dir + 'feature_import_<output_suffix>.csv'``.

    NOTE(review): another ``model_train_validation`` is defined later in
    this file; if both live in the same module the later one replaces this
    definition at import time -- confirm which one callers expect.

    Args:
        ins_file: Training data source, passed unchanged to ``load_data``.
        oos_file: Validation data source, passed unchanged to ``load_data``.
        classifier: Estimator object providing ``fit(X, y)`` and
            ``predict_proba(X)``; ``feature_importances_`` is optional.
        var_list_filename: CSV file whose first column lists the variable
            names; also passed to ``load_data``.
        result_dir: String prefix for every output path. It is concatenated
            directly with the file names, so it presumably has to end with
            a path separator -- verify against callers.
        output_suffix: Value embedded (via ``str``) in the feature
            importance file name and forwarded to the evaluation helper.

    Returns:
        None.
    """
    # ---------------- Load train and validation data ----------------
    # Single-argument print() calls behave identically on Python 2 and 3.
    print('Loading data for modeling starts ...')
    t0 = time.time()
    target_name = 'target'
    X, y = load_data(ins_file, var_list_filename, target_name)
    Xv, yv = load_data(oos_file, var_list_filename, target_name)
    # The double space reproduces the output of the Python 2 print statement.
    print("Loading data done, taking  %s secs" % (time.time() - t0,))

    # ---------------- Train model ----------------
    print('\nModel training starts...')
    t0 = time.time()
    model = classifier
    model.fit(X, y)
    print("Model training done, taking  %s secs" % (time.time() - t0,))

    # Save the fitted model; the context manager guarantees the file is
    # flushed and closed even if pickling fails.
    with open(result_dir + "model.p", 'wb') as model_file:
        pickle.dump(model, model_file)

    # ---------------- Score train and validation ----------------
    # Only column 1 of predict_proba is used downstream (presumably the
    # positive-class probability -- TODO confirm class ordering).
    p_pred = model.predict_proba(X)[:, 1]
    pv_pred = model.predict_proba(Xv)[:, 1]

    # ---------------- Performance evaluation ----------------
    performance_eval_train_validation(y, p_pred, yv, pv_pred,
                                      result_dir, output_suffix)

    # ---------------- Feature importance (optional) ----------------
    # Models without feature_importances_ simply skip this step.
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        print("Not RandomForest classifier, var importance not created")
        return

    feat_import_path = (result_dir + 'feature_import_'
                        + str(output_suffix) + '.csv')
    try:
        # 'rU' / 'wb' are the Python 2 csv conventions used by this file.
        with open(var_list_filename, 'rU') as varlist_file:
            var_list = [row[0] for row in csv.reader(varlist_file)]
        with open(feat_import_path, 'wb') as out_feat_import:
            feat_import_csv = csv.writer(out_feat_import)
            feat_import_csv.writerow(['var seq num', 'var name', 'importance'])
            for seq_num, (var_name, importance) in enumerate(
                    zip(var_list, importances)):
                feat_import_csv.writerow((seq_num, var_name, importance))
    except (IOError, OSError, IndexError, csv.Error) as err:
        # Best effort: a failed export must not abort the run, but the real
        # cause is reported instead of blaming the classifier type.
        print("var importance not created: %s" % (err,))
    else:
        print("RandomForest classifier, var importance was output")
def model_train_validation(ins_file, oos_file, classifier, var_list_filename,
                           output_dir, outpu):
    """Fit ``classifier``, evaluate train/validation and return the metrics.

    Steps performed, in order:

    1. Load training data from ``ins_file`` and validation data from
       ``oos_file`` through ``load_data_fast`` (target column ``'target'``).
    2. Pickle the per-column median of the training inputs to
       ``output_dir + 'trivial_input_values.p'`` (used for generating
       reason codes in production).
    3. Fit ``classifier`` and pickle the fitted model to
       ``output_dir + "model.p"``.
    4. Score both data sets with ``predict_proba`` and pass column 1 of the
       result to ``performance_eval_train_validation``.
    5. If the fitted model exposes ``feature_importances_``, write them to
       ``output_dir + 'feature_import_<suffix>.csv'``.

    NOTE(review): this definition shares its name with an earlier
    ``model_train_validation`` in this file and replaces it if both live in
    the same module -- confirm that is intended.

    Args:
        ins_file: Training data source, passed to ``load_data_fast``.
        oos_file: Validation data source, passed to ``load_data_fast``.
        classifier: Estimator object providing ``fit(X, y)`` and
            ``predict_proba(X)``; ``feature_importances_`` is optional.
        var_list_filename: CSV file whose first column lists the variable
            names; also passed to ``load_data_fast``.
        output_dir: String prefix for every output path. It is concatenated
            directly with the file names, so it presumably has to end with
            a path separator -- verify against callers.
        outpu: Output suffix embedded (via ``str``) in the feature
            importance file name and forwarded to the evaluation helper.
            The parameter name is kept as-is for caller compatibility.

    Returns:
        The tuple ``(ks, auc, lorenz_curve_capt_rate)`` exactly as returned
        by ``performance_eval_train_validation``.
    """
    # The body uses the name `output_suffix`; bind it from the parameter so
    # the lookup no longer depends on a global of that name existing.
    output_suffix = outpu

    # ---------------- Load train and validation data ----------------
    # Single-argument print() calls behave identically on Python 2 and 3.
    print('Loading data for modeling starts ...')
    t0 = time.time()
    target_name = 'target'
    X, y = load_data_fast(ins_file, var_list_filename, target_name)
    Xv, yv = load_data_fast(oos_file, var_list_filename, target_name)
    # The double space reproduces the output of the Python 2 print statement.
    print("Loading data done, taking  %s secs" % (time.time() - t0,))

    # Prepare trivial input values for generating reason codes in production.
    trivial_input_values_file = output_dir + 'trivial_input_values.p'
    trivial_input_values = median(X, axis=0)
    with open(trivial_input_values_file, 'wb') as trivial_file:
        pickle.dump(trivial_input_values, trivial_file)

    # ---------------- Train model ----------------
    print('\nModel training starts...')
    t0 = time.time()
    model = classifier
    model.fit(X, y)
    print("Model training done, taking  %s secs" % (time.time() - t0,))

    # Save the fitted model; the context manager guarantees the file is
    # flushed and closed even if pickling fails.
    with open(output_dir + "model.p", 'wb') as model_file:
        pickle.dump(model, model_file)

    # Disabled: export to tree graph in DOT format (single trees only).
    #   tree.export_graphviz(model, out_file=output_dir + 'tree.dot')
    #   os.system("dot -Tpng " + output_dir + "tree.dot -o "
    #             + output_dir + "tree.png")

    # ---------------- Score train and validation ----------------
    # Only column 1 of predict_proba is used downstream (presumably the
    # positive-class probability -- TODO confirm class ordering).
    p_pred = model.predict_proba(X)[:, 1]
    pv_pred = model.predict_proba(Xv)[:, 1]

    # ---------------- Performance evaluation ----------------
    ks, auc, lorenz_curve_capt_rate = performance_eval_train_validation(
        y, p_pred, yv, pv_pred, output_dir, output_suffix)

    # ---------------- Feature importance (optional) ----------------
    # Models without feature_importances_ simply skip this step.
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        print("Not RandomForest classifier, var importance not created")
    else:
        feat_import_path = (output_dir + 'feature_import_'
                            + str(output_suffix) + '.csv')
        try:
            # 'rU' / 'wb' are the Python 2 csv conventions used by this file.
            with open(var_list_filename, 'rU') as varlist_file:
                var_list = [row[0] for row in csv.reader(varlist_file)]
            with open(feat_import_path, 'wb') as out_feat_import:
                feat_import_csv = csv.writer(out_feat_import)
                feat_import_csv.writerow(
                    ['var seq num', 'var name', 'importance'])
                for seq_num, (var_name, importance) in enumerate(
                        zip(var_list, importances)):
                    feat_import_csv.writerow((seq_num, var_name, importance))
        except (IOError, OSError, IndexError, csv.Error) as err:
            # Best effort: a failed export must not discard the metrics
            # computed above, but the real cause is reported.
            print("var importance not created: %s" % (err,))
        else:
            print("RandomForest classifier, var importance was output")

    return ks, auc, lorenz_curve_capt_rate