Beispiel #1
0
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

# Hard class predictions from the fitted model, cast to plain integers.
# NOTE(review): `model` and `test` are defined outside this view.
output = model.predict(test).astype(int)

# Re-read the raw test file only to recover the PassengerId column, then
# pair each id with its prediction and write the two-column submission file.
aux = pd.read_csv('test.csv')
df_output = pd.DataFrame({
    'PassengerId': aux['PassengerId'],
    'Survived': output,
})
df_output.to_csv('gridsearch_rf.csv',
                 columns=['PassengerId', 'Survived'],
                 index=False)

# Fit every candidate model on the same training data and keep the fitted
# estimators in their original order.
trained_models = []
for model in models:
    model.fit(train, targets)
    trained_models.append(model)

# For each fitted model, keep column 1 of predict_proba on the test set.
predictions = [fitted.predict_proba(test)[:, 1] for fitted in trained_models]

# After the transpose: one column per model, one row per test sample.
predictions_df = pd.DataFrame(predictions).T
# Blend by averaging the per-model probabilities row-wise, then label a row 1
# when the mean reaches 0.5 and 0 otherwise.
predictions_df['out'] = predictions_df.mean(axis=1).ge(0.5).astype('int64')
predictions_df['PassengerId'] = aux['PassengerId']

# Keep only the id and the blended label, named the way the submission expects.
predictions_df = predictions_df[['PassengerId', 'out']].rename(
    columns={'out': 'Survived'})

predictions_df.to_csv('blending_base_models.csv', index=False)
Beispiel #2
0
# First-level estimators handed to stacking(), all with default settings.
modeler = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

# Keyword options forwarded unchanged to stacking().
# NOTE(review): the signature looks like vecstack's `stacking` — confirm
# against the file's imports.
stacking_options = dict(
    regression=False,
    metric=metrics.log_loss,
    needs_proba=True,
    stratified=True,
    shuffle=True,
    random_state=42,
    verbose=2,
)

# stacking() returns a pair, bound here as the stacked train/test features.
S_X_train, S_X_test = stacking(modeler, X_train, y_train, X_test,
                               **stacking_options)

# %%
# Second-level (meta) model trained on the stacked features.
# The solver is named explicitly because an L1 penalty is not supported by
# scikit-learn's default 'lbfgs' solver (default since 0.22); 'liblinear'
# supports L1 and was the default in older releases, so the fitted model is
# unchanged where the original code used to run.
model = LogisticRegression(penalty='l1', C=1, solver='liblinear',
                           random_state=42)

model = model.fit(S_X_train, y_train)

# Hard labels and positive-class (column 1) probabilities for the test set.
y_pred = pd.Series(model.predict(S_X_test))
y_pred_proba = model.predict_proba(S_X_test)[:, 1]

# Reuse the predictions computed above instead of calling predict() again.
# The first figure is classification accuracy, so it is labelled as such
# (it was previously printed as "R Square").
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("kappa:", metrics.cohen_kappa_score(y_test, y_pred))

# %%
# In[ ]:

# Trial 4: ensemble every previously configured model to predict Survival.
models = [
    logreg_model,
    logreg_cv_model,
    rf_model,
    gboost_model,
    dt_model,
    ab_model,
]

# (Re)fit each model on the reduced training features.
trained_models = []
for model in models:
    model.fit(train_reduced, final_train_set_y)
    trained_models.append(model)

# Column 1 of predict_proba from every fitted model on the reduced test set.
predictions = [
    fitted.predict_proba(test_reduced)[:, 1] for fitted in trained_models
]

# One column per model after the transpose; average them row-wise and turn
# the mean probability into a 0/1 label with a 0.5 cut-off.
kaggle_df = pd.DataFrame(predictions).T
kaggle_df['out'] = kaggle_df.mean(axis=1).ge(0.5).astype('int64')
kaggle_df['PassengerId'] = titanic_test_org['PassengerId']

# Submission layout: passenger id plus the predicted label.
kaggle_df = kaggle_df[['PassengerId', 'out']].rename(
    columns={'out': 'Survived'})

# Write the submission file.
kaggle_df.to_csv('RFTunedsubmission.csv', index=False)

# In[ ]:
Beispiel #4
0
# Accuracy as a percentage with two decimals, followed by the per-class report.
print(f"Accuracy: {accuracy * 100.0:.2f}%")
print(classification_report(y_test, predictions))
# NOTE(review): time.clock() was removed in Python 3.8. `start` is taken
# outside this view, so both readings should move to time.perf_counter()
# together rather than changing only this one.
end = time.clock()
# The message reads "predicting 163 results, time required: <elapsed>".
print(f'预测163个结果,所需时间为{end - start}')

# In[ ]:

# Print the model's repr.
# NOTE(review): `model` is defined outside this view.
print(model)

# In[ ]:

# Build a graphviz rendering of one tree of the model. The result is not
# assigned, so it is only visible when this is the last expression of a
# notebook cell.
# NOTE(review): per the xgboost docs `num_trees` selects which tree to draw
# (here index 10); newer releases rename it — confirm against the installed
# version.
xgb.to_graphviz(model, num_trees=10)

# In[ ]:

# Score the test set and keep column 1 of predict_proba for every row.
# Column slicing replaces the per-row list comprehension, so y_score is now
# an array rather than a list.
# NOTE(review): assumes predict_proba returns a 2-D array — confirm.
y_score = model.predict_proba(DataFrame(X_test, dtype='float'))[:, 1]
# False-positive and true-positive rates at each decision threshold.
fpr, tpr, threshold = roc_curve(y_test, y_score)
# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

lw = 2
# Create the figure once, at its final size. The original also called
# plt.figure() with no arguments first, which left an extra empty figure.
plt.figure(figsize=(10, 10))
# ROC curve: false-positive rate on the x axis, true-positive rate on the y axis.
plt.plot(fpr,
         tpr,
         color='darkorange',
         lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
# Upper y limit of 1.05 leaves a small margin above 1.0.
plt.ylim([0.0, 1.05])
Beispiel #5
0
# Restrict the test frame to the columns it shares with rf_selected_features,
# then classify those rows with the random forest.
# NOTE(review): `random_forest` and `rf_selected_features` come from outside
# this view.
X_test_rf_selected = X_test.loc[
    :, X_test.columns.intersection(rf_selected_features)]
y_rf_predictions = random_forest.predict(X_test_rf_selected)

# Confusion matrix drawn as an annotated heatmap (shown on screen, not saved).
conf_matrix = metrics.confusion_matrix(y_test, y_rf_predictions)
sns.heatmap(pd.DataFrame(conf_matrix),
            cmap='coolwarm_r',
            fmt='g',
            annot=True)
plt.title('Random Forests')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.show()

# Overall accuracy followed by the per-class report.
print("Accuracy:", metrics.accuracy_score(y_test, y_rf_predictions))
print(classification_report(y_test, y_rf_predictions))

# Column 1 of predict_proba for every test row.
y_pred_prob_rf = random_forest.predict_proba(X_test_rf_selected)[:, 1]
# ROC curve points; the thresholds returned as the third value are not needed.
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob_rf)
# NOTE(review): binding the name `auc` at module level would shadow
# sklearn.metrics.auc if the file imports that function by name — confirm.
auc = metrics.roc_auc_score(y_test, y_pred_prob_rf)
plt.plot(fpr, tpr)
# Format the score with two decimals. Slicing str(auc) truncated instead of
# rounding and would cut through scientific notation for very small values.
plt.title(f'Random Forests - Area Under Curve : {auc:.2f}')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Full-precision value for the console.
print(f'Area Under Curve : {auc}')

# ---- Gradient boosted trees ----
# Console banner marking the start of this section.
print('\n Gradient Boosted Trees model')
# Wrap the training features and labels in an xgboost DMatrix.
# NOTE(review): how data_dmatrix is used lies beyond this view.
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)