Example #1
# imports inferred from how this snippet uses cv / GBR / GBC
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_val_score as cv
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import GradientBoostingClassifier as GBC


def find_best_feature(feature_name, cv_fold, train_data, train_label):
    # To pick the best feature combination, average the CV scores of three
    # models (LGBMRegressor, XGBRegressor and GBR); the mean score stands in
    # for the quality of this feature set.
    get_ans_face = feature_name
    new_lgb_model = lgb.LGBMRegressor(n_estimators=300, random_state=1)
    cv_model = cv(new_lgb_model,
                  train_data[get_ans_face],
                  train_label,
                  cv=cv_fold,
                  scoring='r2')
    new_lgb_model.fit(train_data[get_ans_face], train_label)
    m1 = cv_model.mean()

    new_xgb_model1 = xgb.XGBRegressor(n_estimators=300, random_state=1)
    cv_model = cv(new_xgb_model1,
                  train_data[get_ans_face].values,
                  train_label,
                  cv=cv_fold,
                  scoring='r2')
    new_xgb_model1.fit(train_data[get_ans_face].values, train_label)
    m2 = cv_model.mean()

    new_gbc_model = GBR(n_estimators=310)
    cv_model = cv(new_gbc_model,
                  train_data[get_ans_face].values,
                  train_label,
                  cv=cv_fold,
                  scoring='r2')
    new_gbc_model.fit(train_data[get_ans_face].values, train_label)
    m3 = cv_model.mean()
    return (m1 + m2 + m3) / 3
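
# Usage sketch (an assumption, not part of the original): greedily grow the
# feature set, keeping a candidate column only when it improves the averaged
# CV score; 'base_cols' and 'candidate_cols' are hypothetical name lists.
def greedy_select(base_cols, candidate_cols, cv_fold, train_data, train_label):
    best_cols = list(base_cols)
    best_score = find_best_feature(best_cols, cv_fold, train_data, train_label)
    for col in candidate_cols:
        score = find_best_feature(best_cols + [col], cv_fold,
                                  train_data, train_label)
        if score > best_score:
            best_cols.append(col)
            best_score = score
    return best_cols, best_score
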
def get_model(nums, cv_fold):
    feature_name1 = train_data[feature_name].columns
    get_ans_face = list(
        set(get_pic(gbc_model, feature_name1).head(nums)['name'])
        & set(get_pic(xgb_model, feature_name1).head(nums)['name'])
        & set(get_pic(lgb_model, feature_name1).head(nums)['name']))
    print('New Feature: ', len(get_ans_face))
    if 'SNP32*SNP34' not in get_ans_face:
        get_ans_face.append('SNP32*SNP34')
    print('New Feature: ', len(get_ans_face))
    new_lgb_model = lgb.LGBMClassifier(objective='binary',
                                       n_estimators=300,
                                       max_depth=3,
                                       min_child_samples=6,
                                       learning_rate=0.102,
                                       random_state=1)
    cv_model = cv(new_lgb_model,
                  train_data[get_ans_face],
                  train_label,
                  cv=cv_fold,
                  scoring='f1')
    new_lgb_model.fit(train_data[get_ans_face], train_label)
    m1 = cv_model.mean()

    new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',
                                       n_estimators=300,
                                       max_depth=4,
                                       learning_rate=0.101,
                                       random_state=1)
    cv_model = cv(new_xgb_model1,
                  train_data[get_ans_face].values,
                  train_label,
                  cv=cv_fold,
                  scoring='f1')
    new_xgb_model1.fit(train_data[get_ans_face].values, train_label)
    m2 = cv_model.mean()

    new_gbc_model = GBC(n_estimators=310,
                        subsample=1,
                        min_samples_split=2,
                        max_depth=3,
                        learning_rate=0.1900,
                        min_weight_fraction_leaf=0.1)
    kkk = train_data[get_ans_face].fillna(7)
    cv_model = cv(new_gbc_model,
                  kkk,
                  train_label,
                  cv=cv_fold,
                  scoring='f1')
    new_gbc_model.fit(kkk, train_label)

    m3 = cv_model.mean()
    print((m1 + m2 + m3) / 3)
    pro1 = new_lgb_model.predict_proba(test_data[get_ans_face])
    pro2 = new_xgb_model1.predict_proba(test_data[get_ans_face].values)
    pro3 = new_gbc_model.predict_proba(
        test_data[get_ans_face].fillna(7).values)
    ans = (pro1 + pro2 + pro3) / 3
    return ans
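
# Usage sketch (assumed): turn the averaged class probabilities returned by
# get_model into hard 0/1 labels; nums=60 and cv_fold=5 are arbitrary values.
probs = get_model(60, 5)
pred_labels = probs.argmax(axis=1)
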
def find_best_feature(feature_name, cv_fold):
    get_ans_face = feature_name
    new_lgb_model = lgb.LGBMClassifier(objective='binary',
                                       n_estimators=300,
                                       max_depth=3,
                                       min_child_samples=6,
                                       learning_rate=0.102,
                                       random_state=1)
    cv_model = cv(new_lgb_model,
                  train_data[get_ans_face],
                  train_label,
                  cv=cv_fold,
                  scoring='f1')
    new_lgb_model.fit(train_data[get_ans_face], train_label)
    m1 = cv_model.mean()

    new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',
                                       n_estimators=300,
                                       max_depth=4,
                                       learning_rate=0.101,
                                       random_state=1)
    cv_model = cv(new_xgb_model1,
                  train_data[get_ans_face].values,
                  train_label,
                  cv=cv_fold,
                  scoring='f1')
    new_xgb_model1.fit(train_data[get_ans_face].values, train_label)
    m2 = cv_model.mean()

    new_gbc_model = GBC(n_estimators=310,
                        subsample=1,
                        min_samples_split=2,
                        max_depth=3,
                        learning_rate=0.1900,
                        min_weight_fraction_leaf=0.1)
    kkk = train_data[get_ans_face].fillna(7)
    cv_model = cv(new_gbc_model,
                  kkk,
                  train_label,
                  cv=cv_fold,
                  scoring='f1')
    new_gbc_model.fit(kkk, train_label)
    m3 = cv_model.mean()
    return (m1 + m2 + m3) / 3

# NOTE: ideally reuse the scaler already fit on the training data; refitting
# here rescales the out-of-sample 2000s data independently.
scaler.fit(df2000_slim)
df2000_slim = pandas.DataFrame(scaler.transform(df2000_slim), columns=df2000_slim.columns)
df2000_slim.describe()

# Predict values for the output
predicted_values = final_estimator_used.predict(df2000_slim)

# Create a confusion matrix to examine the results
cm = pandas.crosstab(
    response_series2000, predicted_values, rownames=["True Label"], colnames=["Predicted Label"], margins=True
)

print cm
# Predicted Label    0    1  All
# True Label
# 0                131   75  206
# 1                 22   35   57
# All              153  110  263
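# Reading the matrix: overall accuracy is (131 + 35) / 263 ~= 0.63, and recall
# on the positive class is only 35 / 57 ~= 0.61, so the counts -- not just the
# headline accuracy -- show where the model struggles.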

# Calculate cross val accuracy scores
from sklearn.cross_validation import cross_val_score as cv

accuracy_scores_best_OOS = cv(final_estimator_used, df2000_slim, response_series2000, cv=10, scoring="accuracy")

accuracy_scores_best_oldData = cv(final_estimator_used, explanatory_df, response_series, cv=10, scoring="accuracy")

print accuracy_scores_best_OOS.mean()
# Accuracy of 81%
print accuracy_scores_best_oldData.mean()
# Accuracy of 80%
                          colnames=['Predicted'], margins=True)
print nb_crosstab



#BUILD RANDOM FOREST MODEL (SCALED DATA)------------------------------------------------

#instantiate Random Forest model
#(min_density and compute_importances exist only in old scikit-learn releases;
# drop both arguments on modern versions)
rf = ens.RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1, max_features='auto',
                                max_leaf_nodes=None, bootstrap=True, oob_score=False,
                                random_state=None, verbose=0, min_density=None,
                                compute_importances=None)

#fit model and check scores (mean = .9215, max = .9597)
rf_cv = cv(rf, pre2000_exp_scaled, pre2000_res, cv=10, scoring='roc_auc')
print rf_cv.mean()
print rf_cv.max()

#perform grid search to find the optimal number of trees
rftree_range = range(10, 550, 10)
param_grid = dict(n_estimators = rftree_range)
rf_grid = gscv(rf, param_grid, cv=10, scoring='roc_auc')
rf_grid.fit(pre2000_exp_scaled, pre2000_res)

#check results from grid search
rf_grid_mean_scores = [result[1] for result in rf_grid.grid_scores_]
plt.figure()
plt.plot(rftree_range, rf_grid_mean_scores)

#identify best estimator
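# A sketch (assumed continuation) of the announced step, using the
# era-appropriate GridSearchCV attributes:
print rf_grid.best_score_
print rf_grid.best_params_
best_rf = rf_grid.best_estimator_
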
Example #6
# creating a random forest object.
## these are the default values of the classifier
rfhw = ensemble.RandomForestClassifier(n_estimators=10,
                                       criterion='gini',
                                       max_depth=None,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       max_features='auto',
                                       max_leaf_nodes=None,
                                       bootstrap=True,
                                       oob_score=False,
                                       n_jobs=1,
                                       random_state=None,
                                       verbose=0,
                                       min_density=None,
                                       compute_importances=None)

# I'm going to change this a bit. Instantiates the object
rfhw = ensemble.RandomForestClassifier(n_estimators=500)

roc_scores_rfhw = cv(rfhw,
                     explanatory_dffore,
                     response_seriesfore,
                     cv=10,
                     scoring='roc_auc')

# let's do the same for the decision tree
roc_score_treehw = cv(tree.DecisionTreeClassifier(),
                      explanatory_dffore,
                      response_seriesfore,
                      cv=10,
                      scoring='roc_auc')

## let's compare the mean ROC AUC
print roc_scores_rfhw.mean()
print roc_score_treehw.mean()
#The random forest indeed is much better in accuracy here than the regular decision tree.
#IDENTIFY POTENTIAL FEATURES WITH RECURSIVE FEATURE SEARCH AND 10-FOLD CV------

#run recursive feature search with 10-fold cv to identify potential features
lr = lm.LogisticRegression()
lr_rfe_cv = rfe(estimator=lr, step=1, cv=10, scoring='roc_auc', verbose=1)
lr_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)

#identify features
features = pre2000_exp_scaled.columns[lr_rfe_cv.get_support()]
print features

#run 10-fold CV to get scores with selected features (ROC_AUC = 0.9451)
lr_cv = cv(lr,
           pre2000_exp_scaled[features],
           pre2000_res,
           cv=10,
           scoring='roc_auc')
lr_cv.mean()

#create dataset with response and selected features
lrset = pd.concat([pre2000_exp_scaled[features], pre2000_res], axis=1)

#BUILD FULL LOGISTIC REGRESSION MODEL------------------------------------------

#get model summary with ALL variables (except teamID_CAL because it leads to singular matrix)
model_all = logit(
    'inducted ~ b_atbat + b_runs + b_hits + b_hruns + b_strik + p_wins + p_loss + p_shout + p_saves + p_eruns + p_stout + f_puts + f_dplay + POS_C + POS_P  + teamID_NYN  + teamID_Other',
    data=lrset).fit(maxiter=5000)
print model_all.summary()
# instantiate Random Forest model
rf = ens.RandomForestClassifier(
    n_estimators=500,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="auto",
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=False,
    random_state=None,
    verbose=0,
    min_density=None,
    compute_importances=None,
)

# fit model and check scores (mean = .9215, max = .9597)
rf_cv = cv(rf, pre2000_exp_scaled, pre2000_res, cv=10, scoring="roc_auc")
print rf_cv.mean()
print rf_cv.max()

# perform grid search to find the optimal number of trees
rftree_range = range(10, 550, 10)
param_grid = dict(n_estimators=rftree_range)
rf_grid = gscv(rf, param_grid, cv=10, scoring="roc_auc")
rf_grid.fit(pre2000_exp_scaled, pre2000_res)

# check results from grid search
rf_grid_mean_scores = [result[1] for result in rf_grid.grid_scores_]
plt.figure()
plt.plot(rftree_range, rf_grid_mean_scores)

# identify best estimator
explanatory_dffore = pandas.DataFrame(scalerfore.transform(explanatory_dffore), columns = explanatory_dffore.columns)


#################
## RANDOM FORESTS
#################


# creating a random forest object.
## these are the default values of the classifier
rfhw = ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1, max_features='auto',
                                       max_leaf_nodes=None, bootstrap=True, oob_score=False,
                                       n_jobs=1, random_state=None, verbose=0,
                                       min_density=None, compute_importances=None)

# I'm going to change this a bit. Instantiates the object
rfhw = ensemble.RandomForestClassifier(n_estimators=500)

roc_scores_rfhw = cv(rfhw, explanatory_dffore, response_seriesfore, cv=10, scoring='roc_auc')

# let's do the same for the decision tree
roc_score_treehw = cv(tree.DecisionTreeClassifier(), explanatory_dffore, response_seriesfore, cv=10, scoring='roc_auc')

## let's compare the mean ROC AUC
print roc_scores_rfhw.mean()
print roc_score_treehw.mean()
#The random forest indeed is much better in accuracy here than the regular decision tree.

## perform grid search to find the optimal number of trees (tuning some parameters)

trees_rangehw = range(10, 550, 10)  # candidate numbers of trees to try
param_gridhw = dict(n_estimators=trees_rangehw)  # the parameter being tuned is n_estimators

gridhw = GridSearchCV(rfhw, param_gridhw, cv=10, scoring='roc_auc') 
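
# A sketch (assumed, not in the original snippet) of the fit step, mirroring
# the rf_grid example above:
gridhw.fit(explanatory_dffore, response_seriesfore)
print gridhw.best_params_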
# fitting the object on our data -- we do this so that we can save the 
# fit for our new data.
imputer_object.fit(explanatory_df)
explanatory_df = imputer_object.transform(explanatory_df)


##########################
### Naive Bayes Model  ###
##########################


### creating naive bayes classifier ###

naive_bayes_classifier = nb()

accuracy_scores = cv(naive_bayes_classifier, explanatory_df, response_series, cv=10, scoring='accuracy')
print accuracy_scores.mean()
#looks like on average the model is 60% accurate, not very high

### calculating accuracy metrics for comparison ###

## ACCURACY METRIC 1: Cohen's Kappa ##

mean_accuracy_score = accuracy_scores.mean()
largest_class_percent_of_total = response_series.value_counts(normalize = True)[0]

largest_class_percent_of_total
#the largest class percent total is 90%, thus the model will correctly
#predict 90% of the time that someone WILL NOT be in the hall of fame

kappa = (mean_accuracy_score - largest_class_percent_of_total) / (1-largest_class_percent_of_total)
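
# Worked through with the numbers above:
# kappa = (0.60 - 0.90) / (1 - 0.90) = -3.0
# A negative kappa means the model does worse than always predicting the
# majority class, despite the superficially OK 60% accuracy.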
Example #11
# Predict values for the output
predicted_values = final_estimator_used.predict(df2000_slim)

# Create a confusion matrix to examine the results
cm = pd.crosstab(response_series2000,
                 predicted_values,
                 rownames=['True Label'],
                 colnames=['Predicted Label'],
                 margins=True)

print cm

# Calculate cross val accuracy scores
accuracy_scores_best_OOS = cv(final_estimator_used,
                              df2000_slim,
                              response_series2000,
                              cv=10,
                              scoring='accuracy')

accuracy_scores_best_oldData = cv(final_estimator_used,
                                  explanatory_dfhw,
                                  response_serieshw,
                                  cv=10,
                                  scoring='accuracy')

print accuracy_scores_best_OOS.mean()
#Accuracy of 87% here
print accuracy_scores_best_oldData.mean()
#Accuracy of 92% here
Example #12
# fitting the object on our data -- we do this so that we can save the
# fit for our new data.
imputer_object.fit(explanatory_df)
explanatory_df = imputer_object.transform(explanatory_df)

##########################
### Naive Bayes Model  ###
##########################

### creating naive bayes classifier ###

naive_bayes_classifier = nb()

accuracy_scores = cv(naive_bayes_classifier,
                     explanatory_df,
                     response_series,
                     cv=10,
                     scoring='accuracy')
print accuracy_scores.mean()
#looks like on average the model is 60% accurate, not very high

### calculating accuracy metrics for comparison ###

## ACCURACY METRIC 1: Cohen's Kappa ##

mean_accuracy_score = accuracy_scores.mean()
largest_class_percent_of_total = response_series.value_counts(
    normalize=True)[0]

largest_class_percent_of_total
#the largest class percent total is 90%, thus the model will correctly
#predict 90% of the time that someone WILL NOT be in the hall of fame

kappa = (mean_accuracy_score - largest_class_percent_of_total) / (1-largest_class_percent_of_total)


#IDENTIFY POTENTIAL FEATURES WITH RECURSIVE FEATURE SEARCH AND 10-FOLD CV------

#run recursive feature search with 10-fold cv to identify potential features
lr = lm.LogisticRegression()
lr_rfe_cv = rfe(estimator=lr, step=1, cv=10, scoring='roc_auc', verbose = 1)
lr_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)

#identify features
features = pre2000_exp_scaled.columns[lr_rfe_cv.get_support()]
print features

#run 10-fold CV to get scores with selected features (ROC_AUC = 0.9451)
lr_cv = cv(lr, pre2000_exp_scaled[features], pre2000_res, cv=10, scoring='roc_auc')
lr_cv.mean()

#create dataset with response and selected features
lrset = pd.concat([pre2000_exp_scaled[features], pre2000_res], axis=1)



#BUILD FULL LOGISTIC REGRESSION MODEL------------------------------------------

#get model summary with ALL variables (except teamID_CAL because it leads to singular matrix)
model_all = logit('inducted ~ b_atbat + b_runs + b_hits + b_hruns + b_strik + p_wins + p_loss + p_shout + p_saves + p_eruns + p_stout + f_puts + f_dplay + POS_C + POS_P  + teamID_NYN  + teamID_Other', 
               data = lrset).fit(maxiter=5000)
print model_all.summary()

#get predicted probabilities for future cases >= 2000
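# A sketch of the announced step (assumed, not in the original); 'lrset2000'
# is a hypothetical frame of post-2000 cases built the same way as lrset.
# The fitted statsmodels result exposes .predict() for probabilities:
pred_probs_2000 = model_all.predict(lrset2000)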
Example #14
# imports inferred from usage below
import cPickle  # Python 2; use pickle on Python 3
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.cross_validation import cross_val_score as cv

f = open('regression.pkl','rb') # open the file in read binary mode
# load the pickled regression model into a new variable regr
regr = cPickle.load(f)
f.close()


print('Coefficients: \n', regr.coef_)

print 'Computing cross validation'

clf = Ridge(alpha=1)
clf.fit(X, Y.todense())
print clf.predict(X)

print np.mean(cv(clf, X, Y.todense(), scoring='mean_squared_error'))


#PLOT PREDICTED VS "TRUE"
predict_table = clf.predict(X)
n_groups = predict_table.shape[0]

predicted_values = []
true_values = []
x = []

for i in range(0,n_groups):
    predicted_values.append(round(predict_table[i][0]))
    true_values.append(round(Y[i,0]))
    x.append(i)
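
# A minimal sketch of the plot the section comment announces, assuming
# matplotlib is available; styling and labels are guesses.
import matplotlib.pyplot as plt
plt.figure()
plt.plot(x, true_values, label='true')
plt.plot(x, predicted_values, label='predicted')
plt.xlabel('sample')
plt.legend()
plt.show()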
"""