def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Check EasyEnsembleClassifier for various parameter settings.

    An imbalanced three-class subset of the iris data is split into train
    and test parts, an ensemble is fitted on the training part, and the
    fitted object is checked for the expected number of members and member
    sizes. Every prediction method is then invoked once on the test part.

    Parameters
    ----------
    n_estimators : int
        Number of ensemble members requested from EasyEnsembleClassifier.
    base_estimator : estimator
        Estimator handed to the ensemble; it must expose ``n_estimators``.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy=class_counts,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    fitted = eec.fit(X_train, y_train)
    fitted.score(X_test, y_test)

    assert len(eec.estimators_) == n_estimators
    for member in eec.estimators_:
        inner_classifier = member.named_steps["classifier"]
        assert len(inner_classifier) == base_estimator.n_estimators

    # test the different prediction function
    for method_name in (
        "predict",
        "predict_proba",
        "predict_log_proba",
        "decision_function",
    ):
        getattr(eec, method_name)(X_test)
def model():
    """Train, evaluate and pickle one EasyEnsemble/XGBoost model per class.

    For every name in the module-level ``class_names`` the function

    1. cross-validates an ``EasyEnsembleClassifier`` wrapping an
       ``XGBClassifier`` on ``train_features`` (3 folds, ROC AUC),
    2. refits it on the full training data with constant sample weights,
    3. stores the positive-class probabilities for ``test_features`` in a
       submission frame and scores them against ``test[class_name]``,
    4. appends the fitted classifier to
       ``Accident_Severity_Prediction_Model_Pkl.pkl``.

    The function reads the module-level names ``train``, ``test``,
    ``train_features``, ``test_features`` and ``class_names``.

    Returns
    -------
    tuple of (list, list)
        ``(scores, acc_score)``: the mean cross-validated ROC AUC and the
        ROC AUC on ``test`` for each class, in ``class_names`` order.
    """
    scores = []
    acc_score = []

    # One constant weight per training row for each target column.
    class_weights = {
        "Fatal": [0.3] * train["Fatal"].shape[0],
        "Severe": [0.5] * train["Severe"].shape[0],
        "Slight": [1] * train["Slight"].shape[0],
    }

    submission = pd.DataFrame.from_dict(
        {'Accident_Index': test['Accident_Index']})

    for class_name in class_names:
        train_target = train[class_name]

        # NOTE(review): `silent` and `nthreads` are passed through unchanged;
        # confirm the installed xgboost version accepts these keyword names.
        classifier = EasyEnsembleClassifier(
            n_estimators=12,
            base_estimator=XGBClassifier(
                max_depth=4,
                learning_rate=0.2,
                n_estimators=600,
                silent=True,
                subsample=0.8,
                gamma=0.5,
                min_child_weight=10,
                objective='binary:logistic',
                colsample_bytree=0.6,
                max_delta_step=1,
                nthreads=1,
                n_jobs=1))

        cv_score = np.mean(
            cross_val_score(classifier, train_features, train_target,
                            cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        # print('CV score for class {} is {}'.format(class_name, cv_score))

        # NOTE(review): confirm that EasyEnsembleClassifier.fit in the
        # installed imbalanced-learn version accepts `sample_weight`.
        classifier.fit(train_features, train_target,
                       sample_weight=class_weights[class_name])
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]

        acc = roc_auc_score(test[class_name], submission[class_name])
        acc_score.append(acc)
        # print('Mean accuracy for class {} is {}'.format(class_name,acc))

        # Pickling the model. The context manager closes the file even when
        # pickle.dump raises.
        # NOTE(review): the source indentation was lost; append mode ('ab')
        # suggests one pickle per class, so the dump is kept inside the
        # loop. Confirm against the intended behaviour.
        with open('Accident_Severity_Prediction_Model_Pkl.pkl', 'ab') as model_pkl:
            pickle.dump(classifier, model_pkl)

    return (scores, acc_score)
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Exercise EasyEnsembleClassifier on an imbalanced iris sample.

    The ensemble is fitted on a train split, scored on the test split, and
    checked for the requested number of members. The ``'classifier'`` step
    of every member must have length ``base_estimator.n_estimators``.
    Finally each prediction function is called once on the test split.

    Parameters
    ----------
    n_estimators : int
        Requested number of ensemble members.
    base_estimator : estimator
        Estimator wrapped by each member; must expose ``n_estimators``.
    """
    # Check classification for various parameter settings.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0
    )

    ensemble_kwargs = dict(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    eec = EasyEnsembleClassifier(**ensemble_kwargs)
    eec.fit(X_train, y_train).score(X_test, y_test)

    members = eec.estimators_
    assert len(members) == n_estimators
    for pipeline in members:
        classifier_step = pipeline.named_steps['classifier']
        assert (len(classifier_step) == base_estimator.n_estimators)

    # test the different prediction function
    eec.predict(X_test)
    eec.predict_proba(X_test)
    eec.predict_log_proba(X_test)
    eec.decision_function(X_test)
def run(X_train, X_test, y_train, y_test):
    """Fit an EasyEnsembleClassifier and predict on the test split.

    Prints a banner and the class distribution of ``y_train``, fits the
    ensemble on the training data, and returns the predictions for
    ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the ensemble.
    X_test
        Features to predict on.
    y_test
        Test labels; returned unchanged for the caller's convenience.

    Returns
    -------
    tuple
        ``(y_test, y_pred, y_proba)`` where ``y_pred`` comes from
        ``predict`` and ``y_proba`` from ``predict_proba``.
    """
    rule = "######################"
    for banner_line in (rule, "Easy Ensemble", rule, "\n"):
        print(banner_line)

    print('Original dataset shape %s' % Counter(y_train))

    # resample all classes but the majority class
    eec = EasyEnsembleClassifier(
        sampling_strategy='not majority',
        replacement=True,
        random_state=42,
        n_jobs=-1,
    )
    eec.fit(X_train, y_train)

    return y_test, eec.predict(X_test), eec.predict_proba(X_test)
max_n_estimator = 0

# Disabled search over the number of estimators, kept for reference:
#for n_estimator in range(30, 100, 10):
#    print(n_estimator)
#    abc = AdaBoostClassifier(n_estimators=n_estimator, random_state = 0)
#    scores = []
#    for train_index, test_index in cv.split(X):
#        X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y.loc[train_index], y.loc[test_index]
#        abc.fit(X_train, y_train)
#        scores.append(abc.score(X_test, y_test))
#    average_score = np.mean(scores)
#    if average_score > max_score:
#        max_score, max_n_estimator = average_score, n_estimator
#    print(n_estimator, average_score)

# The search above is switched off, so the ensemble size is fixed by hand.
max_n_estimator = 15
print(max_n_estimator)

# Train the final model on the full data set.
model = EasyEnsembleClassifier(
    n_estimators=max_n_estimator,
    random_state=0,
)
model.fit(X, y)
print("Finished training!")

# Predict on every feature column except the first entry of `features`.
#X_test = pd.get_dummies(test_data, columns = features[1:])
X_test = test_data[features[1:]]
predictions = model.predict_proba(X_test)

# Only the first probability column is exported.
result = pd.DataFrame({'value': predictions[:, 0]})
result.to_csv("result.csv", index=False)
# model.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10)

# %% [code]
# clf.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10,early_stopping_rounds=10)

# %% [markdown]
# ### Training the model

# %% [code]
clf.fit(X_train, Y_train)

# %% [markdown]
# ### Making Predictions

# %% [code]
# Keep only the second probability column returned by predict_proba.
output = clf.predict_proba(X_test)[:, 1]

# %% [markdown]
# ### Final training roc_auc score

# %% [code]
from sklearn import metrics

# The ROC curve is computed from Y_test and the predictions for X_test,
# treating label 1 as the positive class; the AUC is the area under it.
fpr, tpr, thresholds = metrics.roc_curve(Y_test, output, pos_label=1)
auc_score = metrics.auc(fpr, tpr)
print(auc_score)

# %% [markdown]
# ### This gave us a final score of 0.71820 on the private leaderboard

# %% [markdown]
# ### Scores we got during training
# Print best parameters
bestEasy_params = space_eval(spaceEasy, bestEasy)
bestEasy_params

# Build the classifier from the selected parameters plus fixed settings.
clf = EasyEnsembleClassifier(
    **bestEasy_params,
    random_state=0,
    n_estimators=300,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y_train)

# training roc
easy_y_train_pred = clf.predict_proba(X_train)[:, 1]
plotROC(y_train, easy_y_train_pred, 'EasyEnsamble-Train')

# test roc
easy_y_test_pred = clf.predict_proba(X_test)[:, 1]
plotROC(y_test, easy_y_test_pred, 'EasyEnsamble-Test')

# fit all data
# NOTE(review): the source indentation was lost; only the refit is assumed
# to belong inside the Timer block.
with Timer('EasyEnsamble, Train') as t:
    clf.fit(X, y.values.ravel())

easy_y_all_pred = clf.predict_proba(X)[:, 1]
plotROC(y, easy_y_all_pred, 'EasyEnsamble-Train-AllData')
roc_auc_score(y, easy_y_all_pred)

# predict
result = pd.DataFrame()
# Validation metrics for `clf`. scikit-learn metric functions take
# (y_true, y_pred) in that order; swapping them changes balanced accuracy
# and exchanges precision with recall in the classification report.
print("Balanced Testing Accuracy : {:.2%}".format(
    balanced_accuracy_score(Y_val, y_pred)))
print("Confusion Matrix:")
print(confusion_matrix(Y_val, y_pred))
print("Classification Report:")
print(classification_report(Y_val, y_pred))

# ROC AUC from the second probability column.
y_pred_probs = clf.predict_proba(X_val_std)
fpr, tpr, thresholds = roc_curve(Y_val, y_pred_probs[:, 1])
print(auc(fpr, tpr))

# ## Ensemble models utilizing Stacking method

# In[ ]:

# The ensemble below is a weighted average of the three models' predicted
# probabilities. Weights (a, b, c) sum to 1 and each is at least 0.1.
# NOTE(review): the weights are chosen and scored on the same validation
# set, so `best_auc` is an optimistic estimate.
eec_probs = eec.predict_proba(X_val_std)
gbdt_probs = gbdt.predict_proba(X_val_std)
xgb_probs = xgb_model.predict_proba(X_val_std)

best_auc = 0
best = None  # stays None if no candidate scores above 0

# Iterate over integer tenths rather than np.arange with a float step:
# rounding made the float grid inconsistent (for a=0.6 it admitted b~0.4,
# leaving c~0, while other rows stopped at c=0.1).
for a_tenths in range(1, 10):
    for b_tenths in range(1, 10 - a_tenths):
        a = a_tenths / 10
        b = b_tenths / 10
        c = (10 - a_tenths - b_tenths) / 10
        stacked_probs = a * eec_probs + b * gbdt_probs + c * xgb_probs
        fpr, tpr, thresholds = roc_curve(Y_val, stacked_probs[:, 1])
        new_auc = auc(fpr, tpr)
        if new_auc > best_auc:
            best_auc = new_auc
            best = (a, b, c)

print(best, best_auc)