Ejemplo n.º 1
0
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Smoke-test EasyEnsembleClassifier on an imbalanced version of iris.

    The ensemble is fitted with the given ``n_estimators`` and
    ``base_estimator``; the test then checks the number of fitted
    sub-estimators, the size of each inner ``"classifier"`` step, and that
    every prediction method can be called on the held-out split.
    """
    # Down-sample iris to the requested per-class counts.
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    fitted = ensemble.fit(X_train, y_train)
    fitted.score(X_test, y_test)

    assert len(ensemble.estimators_) == n_estimators
    for pipeline in ensemble.estimators_:
        inner_classifier = pipeline.named_steps["classifier"]
        assert len(inner_classifier) == base_estimator.n_estimators

    # Exercise each prediction entry point; only "does not raise" is checked.
    for method_name in (
        "predict",
        "predict_proba",
        "predict_log_proba",
        "decision_function",
    ):
        getattr(ensemble, method_name)(X_test)
Ejemplo n.º 2
0
def model():
    """Train and evaluate one EasyEnsemble/XGBoost classifier per class name.

    For every entry of the module-level ``class_names`` the function:

    1. cross-validates an ``EasyEnsembleClassifier`` (3 folds, ROC AUC) on
       ``train_features`` against ``train[class_name]``;
    2. refits it on the full training data with per-sample weights;
    3. stores the positive-class probabilities for ``test_features`` in a
       submission frame and scores them against ``test[class_name]``;
    4. appends the fitted classifier to
       ``Accident_Severity_Prediction_Model_Pkl.pkl``.

    Returns
    -------
    tuple
        ``(scores, acc_score)`` where ``scores`` holds the mean
        cross-validated ROC AUC per class and ``acc_score`` the ROC AUC on
        the test data per class, both in ``class_names`` order.
    """
    scores = []
    acc_score = []

    # One constant weight per training row, keyed by class name.
    # NOTE(review): within a single fit every sample gets the same weight,
    # so this may not re-balance anything — confirm the intent.
    class_weights = {
        "Fatal": [0.3] * train["Fatal"].shape[0],
        "Severe": [0.5] * train["Severe"].shape[0],
        "Slight": [1] * train["Slight"].shape[0],
    }
    submission = pd.DataFrame.from_dict(
        {'Accident_Index': test['Accident_Index']})

    for class_name in class_names:
        train_target = train[class_name]
        # NOTE(review): XGBoost spells the thread option `nthread`; the
        # `nthreads` keyword is kept as in the original — confirm.
        classifier = EasyEnsembleClassifier(n_estimators=12,
                                            base_estimator=XGBClassifier(
                                                max_depth=4,
                                                learning_rate=0.2,
                                                n_estimators=600,
                                                silent=True,
                                                subsample=0.8,
                                                gamma=0.5,
                                                min_child_weight=10,
                                                objective='binary:logistic',
                                                colsample_bytree=0.6,
                                                max_delta_step=1,
                                                nthreads=1,
                                                n_jobs=1))

        cv_score = np.mean(
            cross_val_score(classifier,
                            train_features,
                            train_target,
                            cv=3,
                            scoring='roc_auc'))
        scores.append(cv_score)
        #         print('CV score for class {} is {}'.format(class_name, cv_score))

        classifier.fit(train_features,
                       train_target,
                       sample_weight=class_weights[class_name])
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]
        acc = roc_auc_score(test[class_name], submission[class_name])
        acc_score.append(acc)
        #         print('Mean accuracy for class {} is {}'.format(class_name,acc))

        # Append this class's model to the shared pickle file. The context
        # manager closes the handle even if pickling raises.
        with open('Accident_Severity_Prediction_Model_Pkl.pkl',
                  'ab') as model_pkl:
            pickle.dump(classifier, model_pkl)

    return (scores, acc_score)
Ejemplo n.º 3
0
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Check EasyEnsembleClassifier fitting and prediction on imbalanced iris.

    Verifies that the fitted ensemble holds ``n_estimators`` members, that
    the ``'classifier'`` step of each member has
    ``base_estimator.n_estimators`` elements, and that all prediction
    methods run on the test split.
    """
    imbalanced_X, imbalanced_y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        imbalanced_X, imbalanced_y, random_state=0
    )

    clf = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    clf.fit(X_train, y_train).score(X_test, y_test)

    assert len(clf.estimators_) == n_estimators
    for member in clf.estimators_:
        boosted = member.named_steps['classifier']
        assert len(boosted) == base_estimator.n_estimators

    # Call every prediction function once; the outputs are not inspected.
    clf.predict(X_test)
    clf.predict_proba(X_test)
    clf.predict_log_proba(X_test)
    clf.decision_function(X_test)
Ejemplo n.º 4
0
def run(X_train, X_test, y_train, y_test):
    """Fit an EasyEnsembleClassifier and predict on the test split.

    Prints a banner and the class distribution of ``y_train``, fits the
    ensemble on the training data, and returns ``(y_test, y_pred, y_proba)``
    where ``y_test`` is passed through unchanged and the other two are the
    predicted labels and predicted probabilities for ``X_test``.
    """
    banner = "######################"
    for line in (banner, "Easy Ensemble", banner, "\n"):
        print(line)

    train_distribution = Counter(y_train)
    print('Original dataset shape %s' % train_distribution)

    # resample all classes but the majority class
    ensemble = EasyEnsembleClassifier(
        sampling_strategy='not majority',
        replacement=True,
        n_jobs=-1,
        random_state=42,
    )
    ensemble.fit(X_train, y_train)

    predicted_labels = ensemble.predict(X_test)
    predicted_probabilities = ensemble.predict_proba(X_test)
    return y_test, predicted_labels, predicted_probabilities
Ejemplo n.º 5
0
# Script: fit an EasyEnsembleClassifier on (X, y) and write the predicted
# probabilities for the test data to result.csv.

# Placeholder value; it is overwritten with a hard-coded value below.
max_n_estimator = 0

# Disabled search over AdaBoost n_estimators (kept for reference).
#for n_estimator in range(30, 100, 10):
#    print(n_estimator)
#    abc = AdaBoostClassifier(n_estimators=n_estimator, random_state = 0)
#    scores = []
#    for train_index, test_index in cv.split(X):
#        X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y.loc[train_index], y.loc[test_index]
#        abc.fit(X_train, y_train)
#        scores.append(abc.score(X_test, y_test))
#    average_score = np.mean(scores)
#    if average_score > max_score:
#        max_score, max_n_estimator = average_score, n_estimator
#    print(n_estimator, average_score)

# Hard-coded ensemble size used in place of the disabled search above.
max_n_estimator = 15
print(max_n_estimator)

# Fit on the full (X, y) with a fixed seed.
model = EasyEnsembleClassifier(n_estimators=max_n_estimator, random_state=0)
model.fit(X, y)

print("Finished training!")

#X_test = pd.get_dummies(test_data, columns = features[1:])
# Select the test columns named by every entry of `features` except the first.
X_test = test_data[features[1:]]

predictions = model.predict_proba(X_test)

# Only column 0 of the probability matrix is written out.
# NOTE(review): presumably column 0 matches model.classes_[0] — confirm that
# this is the class the consumer of result.csv expects.
result = pd.DataFrame({'value': predictions[:, 0]})
result.to_csv("result.csv", index=False)
Ejemplo n.º 6
0
# model.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10)

# %% [code]
# clf.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10,early_stopping_rounds=10)

# %% [markdown]
# ### Training the model

# %% [code]
# Fit the classifier on the training split.
clf.fit(X_train,Y_train)

# %% [markdown]
# ### Making Predictions

# %% [code]
# Keep column 1 of predict_proba as the score for each row of X_test.
output=clf.predict_proba(X_test)[:,1]

# %% [markdown]
# ### Final training roc_auc score

# %% [code]
from sklearn import metrics
# Build the ROC curve with label 1 as the positive class, then integrate it
# to obtain the AUC.
# NOTE(review): the heading above says "training", but the score is computed
# from Y_test and the X_test predictions — confirm which split is meant.
fpr, tpr, thresholds = metrics.roc_curve(Y_test, output,pos_label=1)
auc_score=metrics.auc(fpr, tpr)
print (auc_score)

# %% [markdown]
# ### This gave us a final score of 0.71820 on the private leaderboard

# %% [markdown]
# ### Scores we got during training
Ejemplo n.º 7
0
    # Print best parameters
    bestEasy_params = space_eval(spaceEasy, bestEasy)

# Bare expression: shows the selected parameters when run as a notebook cell.
bestEasy_params

# Build the ensemble from the selected parameters plus fixed settings.
clf = EasyEnsembleClassifier(**bestEasy_params,
                            random_state=0,
                            n_estimators=300,
                            n_jobs=-1,
                            verbose=1)

clf.fit(X_train, y_train)

# training roc
easy_y_train_pred = clf.predict_proba(X_train)[:,1]
plotROC(y_train, easy_y_train_pred, 'EasyEnsamble-Train')
# test roc
easy_y_test_pred = clf.predict_proba(X_test)[:,1]
plotROC(y_test, easy_y_test_pred, 'EasyEnsamble-Test')

# fit all data
# From here on clf is the model refitted on the full (X, y), not the
# train-split model evaluated above.
with Timer('EasyEnsamble, Train') as t:
    clf.fit(X, y.values.ravel())

# Scores below are computed on the same data the model was just fitted on.
easy_y_all_pred = clf.predict_proba(X)[:, 1]
plotROC(y, easy_y_all_pred, 'EasyEnsamble-Train-AllData')
# Bare expression: the AUC value is only displayed when run as a notebook cell.
roc_auc_score(y, easy_y_all_pred)

# predict
# Empty frame; how it is filled is not visible in this snippet.
result = pd.DataFrame()
# Validation metrics for the predictions in y_pred against the labels Y_val.
# scikit-learn metric functions take (y_true, y_pred) in that order; passing
# them swapped changes balanced accuracy and exchanges precision with recall
# in the classification report, so Y_val goes first everywhere here.
print("Balanced Testing Accuracy : {:.2%}".format(
    balanced_accuracy_score(Y_val, y_pred)))
print("Confusion Matrix:")
print(confusion_matrix(Y_val, y_pred))
print("Classification Report:")
print(classification_report(Y_val, y_pred))

# ROC AUC from column 1 of the predicted probabilities.
y_pred_probs = clf.predict_proba(X_val_std)
fpr, tpr, thresholds = roc_curve(Y_val, y_pred_probs[:, 1])
print(auc(fpr, tpr))

# ## Ensemble the models by weighted averaging of their predicted probabilities (blending)

# In[ ]:

# Predicted class probabilities of the three fitted models on the
# validation features.
eec_probs = eec.predict_proba(X_val_std)
gbdt_probs = gbdt.predict_proba(X_val_std)
xgb_probs = xgb_model.predict_proba(X_val_std)

# Grid-search blend weights (a, b, c) in steps of 0.1 with a + b + c == 1 and
# every weight at least 0.1, keeping the combination with the highest ROC AUC.
# The grid is enumerated with integer tenths: a float-step np.arange can pick
# up an extra element through rounding drift, which would let c collapse to
# roughly zero.
best_auc = 0
best = None  # stays None if no combination beats the initial best_auc
for i in range(1, 10):
    for j in range(1, 10 - i):
        a = i / 10
        b = j / 10
        c = (10 - i - j) / 10
        stacked_probs = a * eec_probs + b * gbdt_probs + c * xgb_probs
        fpr, tpr, thresholds = roc_curve(Y_val, stacked_probs[:, 1])
        new_auc = auc(fpr, tpr)
        if new_auc > best_auc:
            best_auc = new_auc
            best = (a, b, c)
print(best, best_auc)