def test_base_estimator():
    """Check the resolved ``base_estimator_`` for a default and an explicit input."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Passing ``None`` and passing an AdaBoost instance must both end up with
    # an AdaBoost classifier as the last pipeline step.
    for candidate in (None, AdaBoostClassifier()):
        ensemble = EasyEnsembleClassifier(
            2, candidate, n_jobs=-1, random_state=0
        ).fit(X_train, y_train)
        last_step = ensemble.base_estimator_.steps[-1][1]
        assert isinstance(last_step, AdaBoostClassifier)
def objectiveEasy(params):
    """Search objective: negated mean cross-validated ROC AUC.

    Only ``params['sampling_strategy']`` is forwarded to the classifier.
    Training data is read from the module-level ``X_train`` / ``y_train``.

    Returns
    -------
    float
        ``-(mean ROC AUC over the folds)``, so a minimiser maximises ROC AUC.
    """
    started_at = time.time()
    params = {
        'sampling_strategy': params['sampling_strategy'],
    }
    print("\n############## New Run ################")
    print(f"params = {params}")

    FOLDS = 5
    splitter = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    score_mean = 0

    fold_indices = splitter.split(X_train, y_train.values.ravel())
    for count, (tr_idx, val_idx) in enumerate(fold_indices, start=1):
        clf = EasyEnsembleClassifier(
            **params, random_state=0, n_estimators=300, n_jobs=-1, verbose=0
        )
        X_tr = X_train.iloc[tr_idx, :]
        X_vl = X_train.iloc[val_idx, :]
        y_tr = y_train.iloc[tr_idx]
        y_vl = y_train.iloc[val_idx]

        clf.fit(X_tr, y_tr.values.ravel())
        scorer = make_scorer(roc_auc_score, needs_proba=True)
        score = scorer(clf, X_vl, y_vl)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')

    elapsed = time.time() - started_at
    print(f"Total Time Run: {round(elapsed / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    # Drop references to the last fold's data before returning.
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)
def fit(self, X, Y, sample_weight=None):
    """Fit an ``EasyEnsembleClassifier`` on ``X``/``Y`` and keep it on ``self``.

    When ``self.estimator`` is ``None`` an AdaBoost classifier over
    depth-limited decision trees is built from the ``ab_*`` hyper-parameters
    and used as the base estimator; otherwise the current ``self.estimator``
    is passed through as the base estimator.

    Parameters
    ----------
    X, Y
        Training features and targets, forwarded to ``estimator.fit``.
    sample_weight
        Accepted for interface compatibility; not used by this method.

    Returns
    -------
    self
    """
    # ``sklearn.ensemble`` must be imported explicitly: importing
    # ``sklearn.tree`` alone does not guarantee the attribute exists.
    import sklearn.ensemble
    import sklearn.tree
    from imblearn.ensemble import EasyEnsembleClassifier

    if self.estimator is None:
        self.ab_max_depth = int(self.ab_max_depth)
        base_estimator = sklearn.tree.DecisionTreeClassifier(
            max_depth=self.ab_max_depth)
        self.estimator = sklearn.ensemble.AdaBoostClassifier(
            base_estimator=base_estimator,
            n_estimators=self.ab_n_estimators,
            learning_rate=self.ab_learning_rate,
            algorithm=self.ab_algorithm,
            random_state=self.random_state)

    estimator = EasyEnsembleClassifier(
        base_estimator=self.estimator,
        n_estimators=self.n_estimators,
        sampling_strategy=self.sampling_strategy,
        replacement=self.replacement,
        n_jobs=self.n_jobs,
        random_state=self.random_state)
    estimator.fit(X, Y)

    # NOTE(review): the fitted ensemble replaces ``self.estimator``, so a
    # second call to ``fit`` would use that ensemble as the base estimator.
    # Confirm this is intended.
    self.estimator = estimator
    return self
def test_warm_start(random_state=42):
    """Warm-start growth must reach the requested size and match a cold fit.

    The comparison is made on the set of ``random_state`` values carried by
    the final step of every fitted pipeline.
    """
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    warm = EasyEnsembleClassifier(
        n_estimators=5, random_state=random_state, warm_start=True
    )
    for target_size in (5, 10):
        warm.set_params(n_estimators=target_size)
        warm.fit(X, y)
        assert len(warm) == target_size

    cold = EasyEnsembleClassifier(
        n_estimators=10, random_state=random_state, warm_start=False
    )
    cold.fit(X, y)

    warm_seeds = {pipe.steps[-1][1].random_state for pipe in warm}
    cold_seeds = {pipe.steps[-1][1].random_state for pipe in cold}
    assert warm_seeds == cold_seeds
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    """Fitting with the given ``n_estimators`` must raise ``ValueError``.

    The error message has to match ``msg_error``.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    with pytest.raises(ValueError, match=msg_error):
        faulty = EasyEnsembleClassifier(n_estimators=n_estimators)
        faulty.fit(X, y)
def balancedClassifier(df):
    """Cross-validate an EasyEnsemble-of-AdaBoost model on ``df``.

    Predictors are the columns at positions 1, 5, 6, 9, 10, 12, 18 and 21,
    converted to numeric; the target is the ``'Class'`` column. The first row
    of both is discarded. A classification report and the balanced accuracy
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing a ``'Class'`` column.

    Returns
    -------
    tuple
        ``(predictions, Y)``: out-of-fold predictions and the target series.
    """
    seed = 7
    num_trees = 30
    # Unshuffled K-fold is already deterministic. Passing ``random_state``
    # together with the default ``shuffle=False`` is rejected by current
    # scikit-learn releases, so it is omitted.
    kfold = model_selection.KFold(n_splits=10)
    base_estimator = AdaBoostClassifier(n_estimators=num_trees,
                                        random_state=seed)
    ee_classifier = EasyEnsembleClassifier(n_estimators=10,
                                           base_estimator=base_estimator)

    X = df.take([1, 5, 6, 9, 10, 12, 18, 21], axis=1)  # predictors
    X = X.apply(pd.to_numeric)
    X = X.iloc[1:]
    Y = df['Class']  # predicted class
    Y = Y.iloc[1:]

    classes = np.unique(df['Class'].values)
    print("We have {} unique classes: {}".format(len(classes), classes))

    # Train the classifier.
    ee_classifier.fit(X, Y)
    y_true = Y.values.ravel()
    predictions = model_selection.cross_val_predict(ee_classifier, X, y_true,
                                                    cv=kfold)

    report = metrics.classification_report(y_true, predictions,
                                           target_names=classes)
    print("classification_report ", report)

    balanced_accuracy = metrics.balanced_accuracy_score(y_true, predictions)
    print(" Balanced accuracy = ", balanced_accuracy)
    return predictions, Y
def test_bagging_with_pipeline():
    """Smoke test: fit and predict with a pipeline as ``base_estimator``."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    inner_pipeline = make_pipeline(SelectKBest(k=1), AdaBoostClassifier())
    estimator = EasyEnsembleClassifier(
        n_estimators=2, base_estimator=inner_pipeline
    )
    estimator.fit(X, y).predict(X)
def ensemble_model_initialise(base_estimator=None, param=None):
    """Build an ``EasyEnsembleClassifier`` from project defaults plus overrides.

    Parameters
    ----------
    base_estimator : estimator, optional
        Base learner of the ensemble. When omitted (``None``) a fresh
        ``AdaBoostClassifier()`` is created for each call, rather than one
        instance being shared through the function's default argument.
    param : dict, optional
        Extra constructor parameters; these override the defaults below.

    Returns
    -------
    EasyEnsembleClassifier
        An unfitted classifier.
    """
    if base_estimator is None:
        base_estimator = AdaBoostClassifier()

    config = EasyEnsembleClassifier().get_params()
    config['base_estimator'] = base_estimator
    config['n_estimators'] = 50
    config['n_jobs'] = -1
    config['random_state'] = 42
    config['verbose'] = 0
    if param:
        config.update(param)
    return EasyEnsembleClassifier(**config)
def model():
    """Train one EasyEnsemble-of-XGBoost classifier per class and score it.

    Uses the module-level ``train``, ``test``, ``train_features``,
    ``test_features`` and ``class_names``. For every class the function
    cross-validates (3 folds, ROC AUC), fits on the full training set, scores
    the predictions on ``test`` and appends the fitted classifier to a pickle
    file.

    Returns
    -------
    tuple
        ``(scores, acc_score)``: per-class mean CV ROC AUC and per-class
        ROC AUC on the test set.
    """
    scores = []
    acc_score = []

    # Constant per-sample weights, one list per class column.
    class_weights = {
        "Fatal": [0.3] * train["Fatal"].shape[0],
        "Severe": [0.5] * train["Severe"].shape[0],
        "Slight": [1] * train["Slight"].shape[0],
    }

    submission = pd.DataFrame.from_dict(
        {'Accident_Index': test['Accident_Index']})

    for class_name in class_names:
        train_target = train[class_name]
        classifier = EasyEnsembleClassifier(
            n_estimators=12,
            base_estimator=XGBClassifier(max_depth=4,
                                         learning_rate=0.2,
                                         n_estimators=600,
                                         silent=True,
                                         subsample=0.8,
                                         gamma=0.5,
                                         min_child_weight=10,
                                         objective='binary:logistic',
                                         colsample_bytree=0.6,
                                         max_delta_step=1,
                                         nthreads=1,
                                         n_jobs=1))

        cv_score = np.mean(
            cross_val_score(classifier,
                            train_features,
                            train_target,
                            cv=3,
                            scoring='roc_auc'))
        scores.append(cv_score)

        classifier.fit(train_features,
                       train_target,
                       sample_weight=class_weights[class_name])
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]

        acc = roc_auc_score(test[class_name], submission[class_name])
        acc_score.append(acc)

        # Pickling the model. Append mode keeps the classifiers of earlier
        # classes; the context manager closes the file even if dump fails.
        # NOTE(review): assumed to run once per class - confirm against the
        # original indentation.
        with open('Accident_Severity_Prediction_Model_Pkl.pkl',
                  'ab') as model_pkl:
            pickle.dump(classifier, model_pkl)

    return (scores, acc_score)
def test_easy_ensemble_classifier_single_estimator():
    """A one-member ensemble must predict like under-sampling + AdaBoost."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = EasyEnsembleClassifier(n_estimators=1, random_state=0)
    fitted_ensemble = ensemble.fit(X_train, y_train)

    reference = make_pipeline(
        RandomUnderSampler(random_state=0),
        AdaBoostClassifier(random_state=0),
    )
    fitted_reference = reference.fit(X_train, y_train)

    assert_array_equal(
        fitted_ensemble.predict(X_test), fitted_reference.predict(X_test)
    )
def test_warm_start_smaller_n_estimators():
    """Refitting a warm-started model with fewer estimators must raise."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    shrinking = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    shrinking.fit(X, y)

    # Going from 5 to 4 estimators is expected to raise ``ValueError``.
    shrinking.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        shrinking.fit(X, y)
def easy_ensemble_classifier(self, model):
    """Return an unfitted 45-member EasyEnsembleClassifier around ``model``."""
    return EasyEnsembleClassifier(
        base_estimator=model,
        n_estimators=45,
        sampling_strategy='majority',
        n_jobs=-1,
        random_state=42,
    )
def ada_boost(x_train, y_train, cv, n_estimators=100,
              sampling_strategy="not_majority", name="AdaBoost",
              only_model=False, **kwargs):
    """Bags AdaBoost learners which are trained on balanced bootstrap samples.

    Parameters:
        x_train: Input data for training
        y_train: Target data for training
        cv (list of tuples): cross validation indices
        n_estimators (int): number of AdaBoost learners in the ensemble
        sampling_strategy (str): e.g. "auto", "majority", "not minority",
            "not majority" or "all". Underscore spellings such as
            "not_majority" are accepted and translated to the space-separated
            form imbalanced-learn expects. See the classifier's documentation
            for details.
        name (str): Name/Description for the model
        only_model (bool): if True returns only the model
        **kwargs: accepted for call compatibility; not used.

    Returns:
        dict: contains results of models (the unfitted model itself when
        ``only_model`` is True)
    """
    if isinstance(sampling_strategy, str):
        # imbalanced-learn only recognises "not majority" / "not minority"
        # written with a space, so the underscore form would be rejected.
        sampling_strategy = sampling_strategy.replace("_", " ")

    eec = EasyEnsembleClassifier(n_estimators=n_estimators,
                                 sampling_strategy=sampling_strategy,
                                 random_state=42)
    if only_model:
        return eec
    return calculate_metrics_cv(model=eec, X=x_train, y_true=y_train, cv=cv,
                                name=name)
def easy_ensemble_classifier(df, drop, target):
    """Fit an EasyEnsembleClassifier and return balanced accuracy in percent.

    Features are all columns of ``df`` not listed in ``drop``; the outcome is
    ``df[target]``. The data is split with ``train_test_split``
    (``random_state=1``) and the model is scored on the held-out part.
    """
    feature_names = [name for name in df.columns if name not in drop]
    features = df[feature_names]
    outcome = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        features, outcome, random_state=1
    )

    ensemble = EasyEnsembleClassifier(n_estimators=100, random_state=0)
    ensemble.fit(X_train, y_train)
    predicted = ensemble.predict(X_test)

    # Balanced accuracy, expressed as a percentage.
    return balanced_accuracy_score(y_test, predicted) * 100
def EasyEnsembleClassfier(data, test_data):
    """Train a soft-voting ensemble per class and write a submission CSV.

    Text comes from the ``'comment_text'`` column of both frames and is turned
    into features by ``get_features``. For every name in the module-level
    ``class_names`` the features are scaled by a log-ratio computed with
    ``pr``, a four-member ``VotingClassifier`` is fitted, and the
    positive-class probability for the test rows is stored in the submission.
    """
    train_text = data['comment_text'].values.astype(str)
    test_text = test_data['comment_text'].values.astype(str)
    all_text = np.concatenate([train_text, test_text])
    train_features, test_features = get_features(train_text, test_text,
                                                 all_text)

    submission = pd.DataFrame.from_dict({'Id': test_data['id']})

    # Earlier experiment, kept for reference:
    # classifier1 = LogisticRegression(solver='sag', max_iter=180)
    # classifier2 = SGDClassifier(alpha=.00027, max_iter=180, penalty="l2", loss='modified_huber')
    # classifier4 = ComplementNB(alpha=0.00027, class_prior=None, fit_prior=False)
    # eclassifier = VotingClassifier(estimators=[ ('lr', classifier1), ('sgd', classifier2), ('ComplementNB', classifier4)], voting='soft', weights=[1,0.8,0.6])
    # For using a stacking classifier, do refer to mlextend.ensemble's
    # StackingClassifier.

    for class_name in class_names:
        y = data[class_name].values
        log_ratio = np.log(pr(1, y, train_features) / pr(0, y, train_features))
        x_nb = train_features.multiply(log_ratio)

        # The numbered prints are progress markers between construction steps.
        print(1)
        balanced_lr = EasyEnsembleClassifier(
            base_estimator=LogisticRegression(C=2, solver='sag', max_iter=500))
        print(2)
        balanced_sgd = EasyEnsembleClassifier(
            base_estimator=SGDClassifier(alpha=.0002, max_iter=180,
                                         penalty="l2", loss='modified_huber'))
        print(3)
        dual_lr = LogisticRegression(C=2, dual=True, max_iter=500)
        print(4)
        forest = RandomForestClassifier(criterion='gini', max_depth=100,
                                        max_features=1000,
                                        max_leaf_nodes=None,
                                        min_samples_split=10,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=80)
        print(5)
        voter = VotingClassifier(
            estimators=[('lr', balanced_lr), ('sgd', balanced_sgd),
                        ('lr1', dual_lr), ('rdf', forest)],
            voting='soft',
            weights=[0.9, 1.35, 0.65, 0.8])
        print(6)
        voter.fit(x_nb, y)

        # For cross validation scores, uncomment the following lines:
        # cv_score = np.mean(cross_val_score(
        #     voter, x_nb, data[class_name], cv=5, scoring='roc_auc'))
        # scores.append(cv_score)
        # print('CV score for class {} is {}'.format(class_name, cv_score))
        # print('Total CV score is {}'.format(np.mean(scores)))

        scaled_test = test_features.multiply(log_ratio)
        submission[class_name] = voter.predict_proba(scaled_test)[:, 1]

    submission.to_csv('EnsembleClassfierSubmission_2.csv', index=False)
def test_easy_ensemble_classifier_grid_search():
    """Smoke test: grid search over ensemble and nested base parameters."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )

    # One ensemble-level parameter and one addressed through the
    # ``base_estimator__`` prefix.
    param_grid = {
        'n_estimators': [1, 2],
        'base_estimator__n_estimators': [3, 4],
    }
    searched = EasyEnsembleClassifier(base_estimator=AdaBoostClassifier())
    grid_search = GridSearchCV(searched, param_grid)
    grid_search.fit(X, y)
def adaboost(X_train, y_train, X_test, y_test):
    """Fit an EasyEnsemble of AdaBoost learners and plot its class-1 hit rate.

    The returned value is ``100 * cm[1, 1] / (cm[1, 0] + cm[1, 1])`` computed
    from the confusion matrix of the test predictions. It is also printed and
    drawn as a bar chart.
    """
    booster = AdaBoostClassifier(n_estimators=10)
    eec = EasyEnsembleClassifier(n_estimators=10,
                                 base_estimator=booster,
                                 n_jobs=-1)
    eec.fit(X_train, y_train.values.ravel())

    test_predictions = eec.predict(X_test)
    cm = confusion_matrix(y_test, test_predictions)

    without = 100 * cm[1, 1] / (cm[1, 0] + cm[1, 1])
    print("Adaboost (boosting): {}%".format(without))
    print(cm[0, 0], cm[1, 1])

    # Two bars: the measured rate and an empty placeholder.
    labels = ('Boosting', '-')
    positions = np.arange(len(labels))
    heights = [without, 0]
    plt.bar(positions, heights, align='center', alpha=0.5)
    plt.xticks(positions, labels)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Adaboost z losowym undersamplingiem')
    plt.show()
    return without
def find_clf_parameters(self, train_x, train_y, clf_type):
    """Grid-search two balanced ensemble classifiers and return the top entry.

    The search is run through ``EstimatorSelectionHelper`` with 5-fold CV and
    ``balanced_accuracy`` scoring when ``clf_type`` is ``'binary'`` or
    ``'multi-class'``. The first row of the helper's score summary supplies
    the returned estimator and parameters.

    Returns
    -------
    tuple
        ``(best_estimator, best_estimator_params)``.
    """
    n_estimators = [100, 300]
    sampling_strategy = ['auto', 'majority', 'not majority']

    candidates = {
        'BalancedRandomForestClassifier':
            BalancedRandomForestClassifier(random_state=42),
        'EasyEnsembleClassifier':
            EasyEnsembleClassifier(random_state=42),
    }
    grids = {
        'BalancedRandomForestClassifier': [{
            'criterion': ['gini', 'entropy'],
            'n_estimators': n_estimators,
            'max_depth': [2, 4, 6],
            'min_samples_leaf': np.arange(1, 4),
            'min_samples_split': np.arange(2, 5),
            'sampling_strategy': sampling_strategy,
        }],
        'EasyEnsembleClassifier': [{
            'n_estimators': n_estimators,
            'sampling_strategy': sampling_strategy,
        }],
    }

    selector = EstimatorSelectionHelper(candidates, grids)
    # Both supported problem types use the same search settings.
    if clf_type in ('binary', 'multi-class'):
        selector.fit(train_x, train_y, cv=5, scoring='balanced_accuracy',
                     n_jobs=-1)

    summary = selector.score_summary()
    return summary['estimator'].iloc[0], summary['params'].iloc[0]
def get_classifiers():
    """Return a list of default-configured classifier instances to compare."""
    return [
        DummyClassifier(),
        LogisticRegression(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        SGDClassifier(),
        KNeighborsClassifier(),
        MLPClassifier(),
        LinearSVC(),
        NuSVC(),
        SVC(),
        DecisionTreeClassifier(),
        ExtraTreeClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
        GaussianProcessClassifier(),
        EasyEnsembleClassifier(),
        BalancedBaggingClassifier(),
        BalancedRandomForestClassifier(),
        XGBClassifier(),
    ]
def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                       params, clf_type, question):
    """Fit the named balanced ensemble and collect its evaluation metrics.

    Parameters
    ----------
    train_x, train_y
        Training data, used for fitting and for ``calc_cross_val_scores``.
    test_x, test_y
        Held-out data the reported metrics are computed on.
    estimator : str
        One of ``'BalancedRandomForestClassifier'``,
        ``'BalancedBaggingClassifier'`` or ``'EasyEnsembleClassifier'``.
    params : dict
        Hyper-parameters read for the chosen estimator (``n_estimators``,
        ``sampling_strategy`` and, for bagging, ``bootstrap`` and
        ``max_samples``).
    clf_type, question
        Forwarded to ``calc_cross_val_scores``; ``question`` is also stored
        in the result.

    Returns
    -------
    tuple
        ``(cross_val_scores, estimator_scores)`` where ``estimator_scores``
        is a dict of metrics.

    Raises
    ------
    ValueError
        If ``estimator`` is not one of the supported names. Previously this
        surfaced as an ``UnboundLocalError`` further down.
    """
    if estimator == 'BalancedRandomForestClassifier':
        clf = BalancedRandomForestClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'BalancedBaggingClassifier':
        clf = BalancedBaggingClassifier(
            n_estimators=params['n_estimators'],
            bootstrap=params['bootstrap'],
            max_samples=params['max_samples'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'EasyEnsembleClassifier':
        clf = EasyEnsembleClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    else:
        raise ValueError(f"Unsupported estimator: {estimator!r}")

    clf.fit(train_x, train_y)
    cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                  clf_type, question)

    predicted_labels = clf.predict(test_x)
    # Unpacking four values requires a 2x2 (two-class) confusion matrix.
    tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
    specificity = round((tn / (tn + fp)) * 100, 2)

    # Second probability column of every prediction row.
    predicted_prob = clf.predict_proba(test_x)
    predicted_prob_true = [p[1] for p in predicted_prob]

    estimator_scores = {}
    estimator_scores['Question'] = question
    estimator_scores['Accuracy'] = round(
        accuracy_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Balanced Accuracy'] = round(
        balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Precision'] = round(
        precision_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Recall'] = round(
        recall_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Specificity'] = specificity
    estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
    estimator_scores['ROC AUC'] = round(
        roc_auc_score(test_y, predicted_prob_true), 2)
    return cross_val_scores, estimator_scores
def get_naive_binary_estimators(self):
    """Return three unfitted balanced ensembles with shared settings.

    Order: balanced random forest, balanced bagging, easy ensemble. All use
    ``sampling_strategy='not majority'`` and ``random_state=42``.
    """
    shared = {'sampling_strategy': 'not majority', 'random_state': 42}
    return [
        BalancedRandomForestClassifier(**shared),
        BalancedBaggingClassifier(**shared),
        EasyEnsembleClassifier(**shared),
    ]
def test_warm_start_equal_n_estimators():
    """A warm-start refit with unchanged ``n_estimators`` must warn and keep
    predictions unchanged."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = EasyEnsembleClassifier(n_estimators=5,
                                 warm_start=True,
                                 random_state=83)
    clf.fit(X_train, y_train)
    baseline = clf.predict(X_test)

    # Shift the training data: a refit that adds no estimators must not
    # pick this change up.
    X_train += 1.0

    expected_warning = "Warm-start fitting without increasing n_estimators"
    with pytest.warns(UserWarning, match=expected_warning):
        clf.fit(X_train, y_train)
    assert_array_equal(baseline, clf.predict(X_test))
def sixth_test(X_train, y_train, X_test, y_test):
    """Randomly drop 66% of the class-0 training rows, then cross-validate an
    EasyEnsembleClassifier.

    Mean ROC AUC and average precision over 10 folds are printed. The fold
    estimator with the best ROC AUC is then evaluated on ``X_test`` and its
    confusion matrix is printed.
    """
    print("Diminuer le nombre de feature numéroté inactive, ensuite SMOTE/ADASYN etc")

    frame = pd.DataFrame(X_train)
    frame['Target'] = y_train

    # Rows labelled 0 are thinned out; the sample is not seeded.
    inactive_index = frame[frame['Target'] == 0].index
    n_inactive = len(inactive_index)
    drop_indices = np.random.choice(inactive_index,
                                    round(0.66*n_inactive),
                                    replace=False)
    frame = frame.drop(drop_indices)

    y_train = frame.Target.values
    X_train = frame.drop("Target", axis=1).values

    # Disabled experiments, kept for reference:
    #
    # print("BalancedRandomForestClassifier")
    # scores = cross_validate(BalancedRandomForestClassifier(max_depth=None, n_estimators=300, random_state=0, n_jobs=2, max_features='log2'),
    #                         X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision'), return_estimator=True)
    # print(scores['test_roc_auc'].mean(), scores['test_average_precision'].mean())
    # log_model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    # y_log_pred = log_model.predict(X_test)
    # conf_mat = confusion_matrix(y_true=y_test, y_pred=y_log_pred)
    # print("confusion_matrix:\n", conf_mat)
    # print()
    #
    # print("BalancedBaggingClassifier")
    # tree = DecisionTreeClassifier(max_features='auto')
    # resample_bagging = BalancedBaggingClassifier(
    #     base_estimator=tree, n_estimators=100, random_state=0, n_jobs=2)
    # scores = cross_validate(resample_bagging, X_train, y_train, cv=10,
    #                         scoring=('roc_auc', 'average_precision'), return_estimator=True)
    # print(scores['test_roc_auc'].mean(), scores['test_average_precision'].mean())
    # rf_model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    # y_rf_pred = rf_model.predict(X_test)
    # conf_mat = confusion_matrix(y_true=y_test, y_pred=y_rf_pred)
    # print("confusion_matrix:\n", conf_mat)

    print("EasyEnsembleClassifier")
    tree = DecisionTreeClassifier(max_features='auto')  # not used below
    boosted_lr = AdaBoostClassifier(base_estimator=LogisticRegression())
    resample_easy = EasyEnsembleClassifier(base_estimator=boosted_lr,
                                           n_estimators=10,
                                           random_state=0,
                                           n_jobs=2,
                                           sampling_strategy='auto')
    cv_results = cross_validate(resample_easy,
                                X_train,
                                y_train,
                                cv=10,
                                scoring=('roc_auc', 'average_precision'),
                                return_estimator=True)
    print(cv_results['test_roc_auc'].mean(),
          cv_results['test_average_precision'].mean())

    # Evaluate the fold estimator with the highest ROC AUC on the test set.
    best_fold = np.argmax(cv_results['test_roc_auc'])
    best_model = cv_results['estimator'][best_fold]
    test_predictions = best_model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=test_predictions)
    print("confusion_matrix:\n", conf_mat)
def __init__(self):
    """Open the tuning log, load the training data and create the models.

    The last column of the frame returned by ``dataTransform().trainingData()``
    is the label and all other columns are features. An 80/20 stratified
    split is stored alongside two unfitted balanced classifiers.
    """
    self.file_object = open("../logs/modeltune/log.txt", 'a+')
    self.saved_best_model_path = '../saved_model/best_model.sav'
    self.logger = App_Logger()

    self.transformed_data = dataTransform()
    self.df = self.transformed_data.trainingData()
    self.data = self.df.iloc[:, :-1]
    self.label = self.df.iloc[:, -1]

    split = train_test_split(self.data,
                             self.label,
                             test_size=0.2,
                             random_state=0,
                             stratify=self.label)
    self.X_train, self.X_test, self.y_train, self.y_test = split

    self.BRF = BalancedRandomForestClassifier(n_jobs=-1)
    self.EEC = EasyEnsembleClassifier(n_jobs=-1)
def stacking(self, X_train, y_train):
    """Return an unfitted StackingClassifier with an XGBoost final estimator.

    ``X_train`` is not used. ``y_train`` only supplies the number of distinct
    classes for the XGBoost models.
    """
    print("STACKING")
    n_classes = len(y_train.unique())

    def new_xgb():
        # Same configuration for the base-level and the final XGBoost model.
        return XGBClassifier(eta=0.1,
                             objective='multi:softmax',
                             num_class=n_classes)

    base_learners = [
        ('rf', RandomForestClassifier(max_depth=2,
                                      random_state=42,
                                      class_weight='balanced_subsample')),
        ('bag', BalancedBaggingClassifier(random_state=42)),
        ('balanced_rf', BalancedRandomForestClassifier()),
        ('easy', EasyEnsembleClassifier()),
        ('xgb', new_xgb()),
    ]
    return StackingClassifier(estimators=base_learners,
                              final_estimator=new_xgb())
def test_warm_start_equal_n_estimators():
    """Refitting a warm-started model without more estimators is a no-op.

    The second fit must emit a ``UserWarning`` and leave predictions as they
    were, even though the training data was altered in between.
    """
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    settings = dict(n_estimators=5, warm_start=True, random_state=83)
    model = EasyEnsembleClassifier(**settings)
    model.fit(X_train, y_train)
    before = model.predict(X_test)

    # Perturb X in place; this must not influence the fitted ensemble.
    X_train += 1.
    with pytest.warns(
        UserWarning,
        match="Warm-start fitting without increasing n_estimators",
    ):
        model.fit(X_train, y_train)

    after = model.predict(X_test)
    assert_array_equal(before, after)
def get_models(self):
    """Return ``(name, estimator)`` pairs, each wrapped one-vs-rest.

    Order: logistic regression, easy ensemble, RUSBoost, balanced bagging,
    balanced random forest.
    """
    base_models = [
        ('lr', LogisticRegression(class_weight='balanced')),
        ('eec', EasyEnsembleClassifier(n_estimators=10)),
        ('rus', RUSBoostClassifier(n_estimators=50)),
        ('bbc', BalancedBaggingClassifier(n_estimators=10)),
        ('brf', BalancedRandomForestClassifier(n_estimators=100)),
    ]
    return [(label, OneVsRestClassifier(base)) for label, base in base_models]
def test_warm_start(random_state=42):
    """Incremental warm-start fitting must reach the requested size and end
    up with the same per-estimator seeds as a single normal fit."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    def final_step_seeds(ensemble):
        # ``random_state`` values of the last step of every fitted pipeline.
        return {pipe.steps[-1][1].random_state for pipe in ensemble}

    incremental = None
    for size in [5, 10]:
        if incremental is not None:
            incremental.set_params(n_estimators=size)
        else:
            incremental = EasyEnsembleClassifier(n_estimators=size,
                                                 random_state=random_state,
                                                 warm_start=True)
        incremental.fit(X, y)
        assert len(incremental) == size

    one_shot = EasyEnsembleClassifier(n_estimators=10,
                                      random_state=random_state,
                                      warm_start=False)
    one_shot.fit(X, y)

    assert final_step_seeds(incremental) == final_step_seeds(one_shot)
def run(X_train, X_test, y_train, y_test):
    """Fit an EasyEnsembleClassifier and predict labels and probabilities.

    Returns
    -------
    tuple
        ``(y_test, y_pred, y_proba)``; ``y_test`` is passed through unchanged.
    """
    banner = "######################"
    print(banner)
    print("Easy Ensemble")
    print(banner)
    print("\n")
    print('Original dataset shape %s' % Counter(y_train))

    # Resample all classes but the majority class.
    ensemble = EasyEnsembleClassifier(sampling_strategy='not majority',
                                      replacement=True,
                                      random_state=42,
                                      n_jobs=-1)
    ensemble.fit(X_train, y_train)

    y_pred = ensemble.predict(X_test)
    y_proba = ensemble.predict_proba(X_test)
    return y_test, y_pred, y_proba
def __init__(self):
    """Build name -> instance registries of imbalanced-learn samplers.

    Three dictionaries are stored on ``self``: ``oversamplers``,
    ``undersamplers`` and ``ensemblesamplers``. Every value is a
    default-constructed instance.
    """
    # NOTE(review): ``EasyEnsemble`` and ``BalanceCascade`` appear to have
    # been removed from recent imbalanced-learn releases - confirm the pinned
    # version still provides them.
    from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
    from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold, NearMiss, \
        TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection, \
        CondensedNearestNeighbour, NeighbourhoodCleaningRule
    from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier, \
        BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier

    self.oversamplers = {
        'ADASYN': ADASYN(),
        'RandomOverSampler': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'BorderlineSMOTE': BorderlineSMOTE(),
        'SVMSMOTE': SVMSMOTE()
    }
    self.undersamplers = {
        'ClusterCentroids': ClusterCentroids(),
        'RandomUnderSampler': RandomUnderSampler(),
        'InstanceHardnessThreshold': InstanceHardnessThreshold(),
        'NearMiss': NearMiss(),
        'TomekLinks': TomekLinks(),
        'EditedNearestNeighbours': EditedNearestNeighbours(),
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
        'AllKNN': AllKNN(),
        'OneSidedSelection': OneSidedSelection(),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(),
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
    }
    self.ensemblesamplers = {
        'EasyEnsemble': EasyEnsemble(),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedBaggingClassifier': BalancedBaggingClassifier(),
        'BalanceCascade': BalanceCascade(),
        # Instantiated like every other entry; the class object itself was
        # stored here before.
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier()
    }
def get_models():
    """Return ``(models, names)``: parallel lists of estimators and labels."""
    catalogue = [
        # Logistic regression with balanced class weights
        ('Logistic Regression',
         LogisticRegression(solver='liblinear',
                            class_weight='balanced',
                            penalty='l2')),
        # AdaBoost
        ('Ada Boost', AdaBoostClassifier()),
        # Gradient boosting
        ('Gradient Boosting', GradientBoostingClassifier()),
        # RUSBoost
        ('RUSBoost Classifier', RUSBoostClassifier()),
        # Random forest with balanced class weights
        ('RandomForestClassifier',
         RandomForestClassifier(class_weight='balanced')),
        # Easy ensemble
        ('EasyEnsembleClassifier', EasyEnsembleClassifier()),
    ]
    names = [label for label, _ in catalogue]
    models = [estimator for _, estimator in catalogue]
    return models, names
def __init__(self, trainFile, testFile):
    """Store the data file paths and build the soft-voting ensemble.

    The ensemble combines an easy-ensemble logistic regression, an SGD
    classifier, a plain logistic regression and a random forest. All data
    and feature attributes start as ``None``.
    """
    self.trainFile = trainFile
    self.testFile = testFile

    # Member models of the voting ensemble.
    self.__een = EasyEnsembleClassifier(
        base_estimator=LogisticRegression(C=6, solver='sag', max_iter=500))
    self.__sgd = SGDClassifier(alpha=.0002,
                               max_iter=180,
                               penalty="l2",
                               loss='modified_huber')
    self.__rforest = RandomForestClassifier(criterion='gini',
                                            max_depth=100,
                                            max_features=900,
                                            n_estimators=20,
                                            n_jobs=-1,
                                            min_samples_leaf=3,
                                            min_samples_split=10)
    self.__lr = LogisticRegression(C=6, solver='sag', max_iter=500)

    members = [
        ('een', self.__een),
        ('sgd', self.__sgd),
        ('lr', self.__lr),
        ('rf', self.__rforest),
    ]
    self.__vot = VotingClassifier(estimators=members,
                                  voting='soft',
                                  weights=[0.9, 1.3, 0.55, 0.65])

    # Placeholders for data and features.
    self.train_data = None
    self.test_data = None
    self.all_data = None
    self.train_features = None
    self.test_features = None
    self.test_labels = None
    self.train_labels = None

    self.class_names = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]
    self.submissionFile = None
    self.score = {}
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Fit with the given settings and check the ensemble structure.

    The number of fitted pipelines must equal ``n_estimators`` and every
    pipeline's ``'classifier'`` step must contain
    ``base_estimator.n_estimators`` members. Each prediction method is then
    called once as a smoke test.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    eec.fit(X_train, y_train).score(X_test, y_test)

    assert len(eec.estimators_) == n_estimators
    for fitted_pipeline in eec.estimators_:
        inner = fitted_pipeline.named_steps['classifier']
        assert len(inner) == base_estimator.n_estimators

    # Exercise the different prediction functions.
    for method_name in ("predict", "predict_proba", "predict_log_proba",
                        "decision_function"):
        getattr(eec, method_name)(X_test)
.format(balanced_accuracy_score(y_test, y_pred_brf), geometric_mean_score(y_test, y_pred_brf))) cm_brf = confusion_matrix(y_test, y_pred_brf) plot_confusion_matrix(cm_brf, classes=np.unique(satimage.target), ax=ax[1], title='Balanced random forest') ############################################################################### # Boosting classifier ############################################################################### # In the same manner, easy ensemble classifier is a bag of balanced AdaBoost # classifier. However, it will be slower to train than random forest and will # achieve worse performance. base_estimator = AdaBoostClassifier(n_estimators=10) eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator, n_jobs=-1) eec.fit(X_train, y_train) y_pred_eec = eec.predict(X_test) print('Easy ensemble classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_eec), geometric_mean_score(y_test, y_pred_eec))) cm_eec = confusion_matrix(y_test, y_pred_eec) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0], title='Easy ensemble classifier') rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator) rusboost.fit(X_train, y_train)
def test_warm_start_equivalence():
    """A 5+5 warm-started ensemble must predict like a 10-estimator one."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    # Grow a warm-started model in two steps: 5, then 10 estimators.
    staged = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=3141
    )
    staged.fit(X_train, y_train)
    staged.set_params(n_estimators=10)
    staged.fit(X_train, y_train)
    staged_predictions = staged.predict(X_test)

    # Reference: all 10 estimators fitted in one go with the same seed.
    direct = EasyEnsembleClassifier(
        n_estimators=10, warm_start=False, random_state=3141
    )
    direct.fit(X_train, y_train)
    direct_predictions = direct.predict(X_test)

    assert_allclose(staged_predictions, direct_predictions)