def find_best_feature(feature_name, cv_fold, train_data, train_label):
    """Score a feature subset by the mean cross-validated R^2 of three regressors.

    To compare candidate feature combinations, the cross-validation scores
    of an LGBMRegressor, an XGBRegressor and a GBR model are averaged; that
    average stands for the quality of the feature subset.

    Args:
        feature_name: column label(s) used to index ``train_data``.
        cv_fold: forwarded unchanged as the ``cv=`` argument of ``cv``.
        train_data: table indexed as ``train_data[feature_name]``; the result
            must expose ``.values`` (presumably a pandas DataFrame).
        train_label: regression target passed to ``cv``.

    Returns:
        The arithmetic mean of the three models' mean ``'r2'`` scores.
    """
    features = train_data[feature_name]

    # LightGBM is given the indexed table itself; the other two models are
    # given the underlying ``.values`` array, exactly as before.
    candidates = (
        (lgb.LGBMRegressor(n_estimators=300, random_state=1), features),
        (xgb.XGBRegressor(n_estimators=300, random_state=1), features.values),
        (GBR(n_estimators=310), features.values),
    )

    # Only the cross-validation scores are needed.  The former full-data
    # ``fit`` calls trained models that were thrown away on return, so they
    # have been dropped; the returned value is unaffected.
    scores = [
        cv(model, data, train_label, cv=cv_fold, scoring='r2').mean()
        for model, data in candidates
    ]
    return sum(scores) / len(scores)
def get_model(nums, cv_fold):
    """Fit a three-model ensemble on shared top features and predict the test set.

    Reads the module-level ``train_data``, ``train_label``, ``test_data``,
    ``feature_name``, ``get_pic`` and the pre-existing ``gbc_model``,
    ``xgb_model`` and ``lgb_model``.

    Args:
        nums: how many leading rows of each ``get_pic`` result to keep
            (presumably the top-``nums`` features by importance -- confirm
            against ``get_pic``).
        cv_fold: forwarded unchanged as the ``cv=`` argument of ``cv``.

    Returns:
        The element-wise mean of the three models' ``predict_proba`` outputs
        on ``test_data``.
    """
    fill_value = 7  # value substituted for missing entries in the GBC inputs

    feature_name1 = train_data[feature_name].columns

    # Keep only the names that appear in the first `nums` rows for all three
    # existing models.
    selected = list(
        set(get_pic(gbc_model, feature_name1).head(nums)['name'])
        & set(get_pic(xgb_model, feature_name1).head(nums)['name'])
        & set(get_pic(lgb_model, feature_name1).head(nums)['name'])
    )
    print('New Feature: ', len(selected))
    # This interaction feature is always included, whatever the intersection.
    if 'SNP32*SNP34' not in selected:
        selected.append('SNP32*SNP34')
    print('New Feature: ', len(selected))

    train_x = train_data[selected]

    new_lgb_model = lgb.LGBMClassifier(
        objective='binary', n_estimators=300, max_depth=3,
        min_child_samples=6, learning_rate=0.102, random_state=1)
    m1 = cv(new_lgb_model, train_x, train_label,
            cv=cv_fold, scoring='f1').mean()
    new_lgb_model.fit(train_x, train_label)

    new_xgb_model1 = xgb.XGBClassifier(
        objective='binary:logistic', n_estimators=300, max_depth=4,
        learning_rate=0.101, random_state=1)
    m2 = cv(new_xgb_model1, train_x.values, train_label,
            cv=cv_fold, scoring='f1').mean()
    new_xgb_model1.fit(train_x.values, train_label)

    new_gbc_model = GBC(
        n_estimators=310, subsample=1, min_samples_split=2, max_depth=3,
        learning_rate=0.1900, min_weight_fraction_leaf=0.1)
    # Only the GBC inputs are filled.  Filling once is enough: the earlier
    # second ``fillna`` and column re-selection were no-ops.
    gbc_train_x = train_x.fillna(fill_value)
    m3 = cv(new_gbc_model, gbc_train_x, train_label,
            cv=cv_fold, scoring='f1').mean()
    new_gbc_model.fit(gbc_train_x, train_label)

    print((m1 + m2 + m3) / 3)

    test_x = test_data[selected]
    pro1 = new_lgb_model.predict_proba(test_x)
    pro2 = new_xgb_model1.predict_proba(test_x.values)
    # Predict with the same container type the GBC was fitted on.  Passing a
    # bare array to a model fitted on a named table makes current
    # scikit-learn warn about missing feature names.
    pro3 = new_gbc_model.predict_proba(test_x.fillna(fill_value))
    return (pro1 + pro2 + pro3) / 3
def find_best_feature(feature_name, cv_fold):
    """Score a feature subset by the mean cross-validated F1 of three classifiers.

    Reads the module-level ``train_data`` and ``train_label``.

    Args:
        feature_name: column label(s) used to index ``train_data``.
        cv_fold: forwarded unchanged as the ``cv=`` argument of ``cv``.

    Returns:
        The arithmetic mean of the three models' mean ``'f1'`` scores.
    """
    features = train_data[feature_name]

    lgb_candidate = lgb.LGBMClassifier(
        objective='binary', n_estimators=300, max_depth=3,
        min_child_samples=6, learning_rate=0.102, random_state=1)
    xgb_candidate = xgb.XGBClassifier(
        objective='binary:logistic', n_estimators=300, max_depth=4,
        learning_rate=0.101, random_state=1)
    gbc_candidate = GBC(
        n_estimators=310, subsample=1, min_samples_split=2, max_depth=3,
        learning_rate=0.1900, min_weight_fraction_leaf=0.1)

    # LightGBM gets the table, XGBoost the raw array, and the GBC the table
    # with missing entries replaced by 7.  One ``fillna`` suffices: the
    # earlier second ``fillna`` and column re-selection were no-ops.
    candidates = (
        (lgb_candidate, features),
        (xgb_candidate, features.values),
        (gbc_candidate, features.fillna(7)),
    )

    # Only the cross-validation scores are returned.  The former full-data
    # ``fit`` calls trained models that were discarded, so they are gone.
    scores = [
        cv(model, data, train_label, cv=cv_fold, scoring='f1').mean()
        for model, data in candidates
    ]
    return sum(scores) / len(scores)
# Standardise the 2000+ feature frame.
# NOTE(review): the scaler is fitted on df2000_slim itself here, so this
# data is scaled with its own statistics -- confirm that is intended rather
# than reusing a scaler fitted on the training data.
scaler.fit(df2000_slim)
df2000_slim = pandas.DataFrame(scaler.transform(df2000_slim),
                               columns=df2000_slim.columns)
df2000_slim.describe()  # bare expression: only displays in an interactive session

# Predict values for the output
predicted_values = final_estimator_used.predict(df2000_slim)

# Create a confusion matrix to examine the results
cm = pandas.crosstab(
    response_series2000,
    predicted_values,
    rownames=["True Label"],
    colnames=["Predicted Label"],
    margins=True,
)
print(cm)
# Output recorded by the original author:
# Predicted Label    0    1  All
# True Label
# 0                131   75  206
# 1                 22   35   57
# All              153  110  263

# Calculate cross val accuracy scores.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# `model_selection` home and fall back for old installations.
try:
    from sklearn.model_selection import cross_val_score as cv
except ImportError:
    from sklearn.cross_validation import cross_val_score as cv

# NOTE(review): cross_val_score clones and re-fits the estimator inside each
# fold, so the "OOS" figure measures a model re-trained on the 2000+ data,
# not the predictions tabulated in `cm` above.
accuracy_scores_best_OOS = cv(final_estimator_used, df2000_slim,
                              response_series2000, cv=10, scoring="accuracy")
accuracy_scores_best_oldData = cv(final_estimator_used, explanatory_df,
                                  response_series, cv=10, scoring="accuracy")
print(accuracy_scores_best_OOS.mean())  # Accuracy of 81%
print(accuracy_scores_best_oldData.mean())  # Accuracy of 80%
colnames=['Predicted'], margins=True) print nb_crosstab #BUILD RANDOM FOREST MODEL (SCALED DATA)------------------------------------------------ #instantiate Random Forest model rf = ens.RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, random_state=None, verbose=0, min_density=None, compute_importances=None) #fit model and check scores (mean = .9215, max = .9597) rf_cv = cv(rf, pre2000_exp_scaled, pre2000_res, cv=10, scoring='roc_auc') print rf_cv .mean() print rf_cv.max() #perform grid search to find the optimal number of trees rftree_range = range(10, 550, 10) param_grid = dict(n_estimators = rftree_range) rf_grid = gscv(rf, param_grid, cv=10, scoring='roc_auc') rf_grid.fit(pre2000_exp_scaled, pre2000_res) #check results from grid search rf_grid_mean_scores = [result[1] for result in rf_grid.grid_scores_] plt.figure() plt.plot(rftree_range, rf_grid_mean_scores) #identify best estimator
max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None, compute_importances=None) # I'm going to change this a bit. Instantiates the object rfhw = ensemble.RandomForestClassifier(n_estimators=500) roc_scores_rfhw = cv(rfhw, explanatory_dffore, response_seriesfore, cv=10, scoring='roc_auc') # let's do the same for the decision tree roc_score_treehw = cv(tree.DecisionTreeClassifier(), explanatory_dffore, response_seriesfore, cv=10, scoring='roc_auc') ## let's compare the mean ROC AUC print roc_scores_rfhw.mean() print roc_score_treehw.mean() #The random forest indeed is much better in accuracy here than the regular decision tree.
#IDENTIFY POTENTIAL FEATURES WITH RECURSIVE FEATURE SEARCH AND 10-FOLD CV------
#run recursive feature search with 10-fold cv to identify potential features
lr = lm.LogisticRegression()
lr_rfe_cv = rfe(estimator=lr, step=1, cv=10, scoring='roc_auc', verbose=1)
lr_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)

#identify features: keep the columns flagged by the selector's support mask
features = pre2000_exp_scaled.columns[lr_rfe_cv.get_support()]
print(features)

#run 10-fold CV to get scores with selected features
#(author's recorded result: ROC_AUC = 0.9451)
lr_cv = cv(lr, pre2000_exp_scaled[features], pre2000_res,
           cv=10, scoring='roc_auc')
lr_cv.mean()  # bare expression: only displays in an interactive session

#create dataset with response and selected features
lrset = pd.concat([pre2000_exp_scaled[features], pre2000_res], axis=1)

#BUILD FULL LOGISTIC REGRESSION MODEL------------------------------------------
#get model summary with ALL variables (except teamID_CAL because it leads to singular matrix)
model_all = logit(
    'inducted ~ b_atbat + b_runs + b_hits + b_hruns + b_strik + p_wins + p_loss + p_shout + p_saves + p_eruns + p_stout + f_puts + f_dplay + POS_C + POS_P + teamID_NYN + teamID_Other',
    data=lrset).fit(maxiter=5000)
print(model_all.summary())
criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features="auto", max_leaf_nodes=None, bootstrap=True, oob_score=False, random_state=None, verbose=0, min_density=None, compute_importances=None, ) # fit model and check scores (mean = .9215, max = .9597) rf_cv = cv(rf, pre2000_exp_scaled, pre2000_res, cv=10, scoring="roc_auc") print rf_cv.mean() print rf_cv.max() # perform grid search to find the optimal number of trees rftree_range = range(10, 550, 10) param_grid = dict(n_estimators=rftree_range) rf_grid = gscv(rf, param_grid, cv=10, scoring="roc_auc") rf_grid.fit(pre2000_exp_scaled, pre2000_res) # check results from grid search rf_grid_mean_scores = [result[1] for result in rf_grid.grid_scores_] plt.figure() plt.plot(rftree_range, rf_grid_mean_scores) # identify best estimator
# Apply the already-fitted scaler and restore the column labels that
# `transform` drops.
explanatory_dffore = pandas.DataFrame(
    scalerfore.transform(explanatory_dffore),
    columns=explanatory_dffore.columns)

#################
## RANDOM FORESTS
#################
# creating a random forest object.
## these are the default values of the classifier
# NOTE(review): this instance is overwritten two statements below and never
# used; it only illustrates the constructor arguments.
rfhw = ensemble.RandomForestClassifier(n_estimators=10,
                                       criterion='gini',
                                       max_depth=None,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       max_features='auto',
                                       max_leaf_nodes=None,
                                       bootstrap=True,
                                       oob_score=False,
                                       n_jobs=1,
                                       random_state=None,
                                       verbose=0,
                                       min_density=None,
                                       compute_importances=None)

# I'm going to change this a bit. Instantiates the object
rfhw = ensemble.RandomForestClassifier(n_estimators=500)

# 10-fold cross-validated ROC AUC for the random forest
roc_scores_rfhw = cv(rfhw, explanatory_dffore, response_seriesfore,
                     cv=10, scoring='roc_auc')

# let's do the same for the decision tree
roc_score_treehw = cv(tree.DecisionTreeClassifier(), explanatory_dffore,
                      response_seriesfore, cv=10, scoring='roc_auc')

## let's compare the mean ROC AUC
print(roc_scores_rfhw.mean())
print(roc_score_treehw.mean())
# Author's observation: the random forest scores much better here than the
# regular decision tree (the metric compared is ROC AUC, not accuracy).

## perform grid search to find the optimal number of trees (tuning some parameters)
trees_rangehw = range(10, 550, 10)  # candidate tree counts 10, 20, ..., 540
param_gridhw = dict(n_estimators=trees_rangehw)  # the tuned parameter is the number of estimators
gridhw = GridSearchCV(rfhw, param_gridhw, cv=10, scoring='roc_auc')
# fitting the object on our data -- we do this so that we can save the
# fit for our new data.
imputer_object.fit(explanatory_df)
explanatory_df = imputer_object.transform(explanatory_df)

##########################
### Naive Bayes Model ###
##########################

### creating naive bayes classifier ###
naive_bayes_classifier = nb()

# 10-fold cross-validated accuracy
accuracy_scores = cv(naive_bayes_classifier, explanatory_df, response_series,
                     cv=10, scoring='accuracy')
print(accuracy_scores.mean())
#looks like on average the model is 60% accurate, not very high

### calculating accuracy metrics for comparison ###
## ACCURACY METRIC 1: Cohen's Kappa ##
mean_accuracy_score = accuracy_scores.mean()
# Share of the most frequent class.  `.max()` is used instead of `[0]`:
# `[0]` looks up the class *labelled* 0, which is the largest class only
# when that label happens to be the majority.
largest_class_percent_of_total = response_series.value_counts(
    normalize=True).max()
largest_class_percent_of_total  # bare expression: only displays interactively
#the largest class percent total is 90%, thus the model will correctly
#predict 90% of the time that someone WILL NOT be in the hall of fame

# Improvement of the model's accuracy over the majority-class share,
# relative to the maximum possible improvement.
kappa = (mean_accuracy_score - largest_class_percent_of_total) / (1 - largest_class_percent_of_total)
# Predict values for the output
predicted_values = final_estimator_used.predict(df2000_slim)

# Create a confusion matrix to examine the results
cm = pd.crosstab(response_series2000, predicted_values,
                 rownames=['True Label'],
                 colnames=['Predicted Label'],
                 margins=True)
print(cm)

# Calculate cross val accuracy scores
# NOTE(review): assuming `cv` is scikit-learn's cross_val_score, it clones
# and re-fits the estimator inside each fold, so the "OOS" figure measures a
# model re-trained on the 2000+ data, not the predictions tabulated in `cm`.
accuracy_scores_best_OOS = cv(final_estimator_used, df2000_slim,
                              response_series2000, cv=10, scoring='accuracy')
accuracy_scores_best_oldData = cv(final_estimator_used, explanatory_dfhw,
                                  response_serieshw, cv=10, scoring='accuracy')
print(accuracy_scores_best_OOS.mean())  #Accuracy of 87% here
print(accuracy_scores_best_oldData.mean())  #Accuracy of 92% here
# fitting the object on our data -- we do this so that we can save the
# fit for our new data.
imputer_object.fit(explanatory_df)
explanatory_df = imputer_object.transform(explanatory_df)

##########################
### Naive Bayes Model ###
##########################

### creating naive bayes classifier ###
naive_bayes_classifier = nb()

# 10-fold cross-validated accuracy
accuracy_scores = cv(naive_bayes_classifier, explanatory_df, response_series,
                     cv=10, scoring='accuracy')
print(accuracy_scores.mean())
#looks like on average the model is 60% accurate, not very high

### calculating accuracy metrics for comparison ###
## ACCURACY METRIC 1: Cohen's Kappa ##
mean_accuracy_score = accuracy_scores.mean()
# Share of the most frequent class.  `.max()` is used instead of `[0]`:
# `[0]` looks up the class *labelled* 0, which is the largest class only
# when that label happens to be the majority.
largest_class_percent_of_total = response_series.value_counts(
    normalize=True).max()
largest_class_percent_of_total  # bare expression: only displays interactively
#the largest class percent total is 90%, thus the model will correctly
#IDENTIFY POTENTIAL FEATURES WITH RECURSIVE FEATURE SEARCH AND 10-FOLD CV------
#run recursive feature search with 10-fold cv to identify potential features
lr = lm.LogisticRegression()
lr_rfe_cv = rfe(estimator=lr, step=1, cv=10, scoring='roc_auc', verbose=1)
lr_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)

#identify features: keep the columns flagged by the selector's support mask
features = pre2000_exp_scaled.columns[lr_rfe_cv.get_support()]
print(features)

#run 10-fold CV to get scores with selected features
#(author's recorded result: ROC_AUC = 0.9451)
lr_cv = cv(lr, pre2000_exp_scaled[features], pre2000_res,
           cv=10, scoring='roc_auc')
lr_cv.mean()  # bare expression: only displays in an interactive session

#create dataset with response and selected features
lrset = pd.concat([pre2000_exp_scaled[features], pre2000_res], axis=1)

#BUILD FULL LOGISTIC REGRESSION MODEL------------------------------------------
#get model summary with ALL variables (except teamID_CAL because it leads to singular matrix)
model_all = logit(
    'inducted ~ b_atbat + b_runs + b_hits + b_hruns + b_strik + p_wins + p_loss + p_shout + p_saves + p_eruns + p_stout + f_puts + f_dplay + POS_C + POS_P + teamID_NYN + teamID_Other',
    data=lrset).fit(maxiter=5000)
print(model_all.summary())
#get predicted probabilities for future cases >= 2000
f = open('regression.pkl','rb') # open the file in read binary mode # load the data in the .pkl file into a new variable spmat regr = cPickle.load(f) f.close() print('Coefficients: \n', regr.coef_) ''' print 'Computing cross validation' clf = Ridge(alpha=1) clf.fit(X, Y.todense()) print clf.predict(X) print np.mean(cv(clf, X, Y.todense(),scoring='mean_squared_error')) #PLOT PREDICTED VS "TRUE" predict_table = clf.predict(X) n_groups = predict_table.shape[0] predicted_values = [] true_values = [] x = [] for i in range(0,n_groups): predicted_values.append(round(predict_table[i][0])) true_values.append(round(Y[i,0])) x.append(i) """