def test_oob_score_classification():
    """The OOB estimate should be close to the held-out generalization score."""
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for base_estimator in [DecisionTreeClassifier(), SVC(gamma='scale')]:
        clf = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0,
        )
        clf.fit(X_train, y_train)
        held_out = clf.score(X_test, y_test)
        assert abs(held_out - clf.oob_score_) < 0.1
        # With a single estimator the OOB estimate is unreliable -> warning.
        tiny = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=1,
            bootstrap=True,
            oob_score=True,
            random_state=0,
        )
        assert_warns(UserWarning, tiny.fit, X_train, y_train)
def objectiveBalance(params):
    """Hyperopt-style objective: negated mean CV ROC-AUC for a sampler setting.

    Reads the module-level ``X_train``/``y_train`` frames, runs stratified
    5-fold CV of a 300-estimator BalancedBaggingClassifier and returns the
    negative mean AUC (lower is better for the optimizer).
    """
    started_at = time.time()
    params = {
        'sampling_strategy': params['sampling_strategy'],
    }
    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 5
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
    score_mean = 0
    for count, (tr_idx, val_idx) in enumerate(
            skf.split(X_train, y_train.values.ravel()), start=1):
        clf = BalancedBaggingClassifier(**params, random_state=0,
                                        n_estimators=300, n_jobs=-1, verbose=0)
        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        clf.fit(X_tr, y_tr.values.ravel())
        score = auc_scorer(clf, X_vl, y_vl)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
    time2 = time.time() - started_at
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    # Drop the big per-fold objects before returning to the optimizer loop.
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)
def test_warm_start(random_state=42):
    """Incremental warm-start fits must match one full fit in size and seeds."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators
    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)
    # The final estimators must carry the same set of per-member seeds.
    seeds_ws = {pipe.steps[-1][1].random_state for pipe in clf_ws}
    seeds_no_ws = {pipe.steps[-1][1].random_state for pipe in clf_no_ws}
    assert seeds_ws == seeds_no_ws
def test_oob_score_classification():
    """OOB estimate tracks held-out accuracy (legacy ``ratio=`` API variant)."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        bagger = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0,
        )
        bagger.fit(X_train, y_train)
        test_score = bagger.score(X_test, y_test)
        assert abs(test_score - bagger.oob_score_) < 0.1
        # One estimator cannot cover all samples out-of-bag -> warning.
        single = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=1,
            bootstrap=True,
            oob_score=True,
            random_state=0,
        )
        assert_warns(UserWarning, single.fit, X_train, y_train)
def test_single_estimator():
    """A one-estimator ensemble must equal a manual undersample+KNN pipeline."""
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    ensemble = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    )
    ensemble.fit(X_train, y_train)
    # Rebuild the lone member by hand, reusing its sampler seed.
    sampler_seed = ensemble.estimators_[0].steps[0][1].random_state
    reference = make_pipeline(
        RandomUnderSampler(random_state=sampler_seed),
        KNeighborsClassifier(),
    )
    reference.fit(X_train, y_train)
    assert_array_equal(ensemble.predict(X_test), reference.predict(X_test))
def test_warm_start_smaller_n_estimators():
    """Refitting a warm-started ensemble with fewer estimators must fail."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(warm_start=True, n_estimators=5)
    clf.fit(X, y)
    # Shrinking the ensemble is not supported under warm_start.
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_bootstrap_features():
    """Feature bootstrap should duplicate features; without it all are unique."""
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    no_boot = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=False,
        random_state=0,
    ).fit(X_train, y_train)
    # Without feature bootstrap each estimator sees every distinct feature.
    for features in no_boot.estimators_features_:
        assert np.unique(features).shape[0] == X.shape[1]
    with_boot = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=True,
        random_state=0,
    ).fit(X_train, y_train)
    # Sampling with replacement typically drops some features per estimator.
    distinct_counts = [np.unique(features).shape[0]
                       for features in with_boot.estimators_features_]
    assert np.median(distinct_counts) < X.shape[1]
def cross_validation(name):
    """Run 5-fold CV of a balanced bagging model on the pickled dataset.

    Loads ``train_data_ad_ignore_<name>.pickle``, vectorizes the dict
    features, prints per-fold feature-importance rankings and scores, then
    the fold-averaged score and true-score.
    """
    path = '../data/conv_pred/train_data_ad_ignore_' + name + '.pickle'
    with open(path, 'rb') as f:
        data = pickle.load(f)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])
    n_splits = 5
    kf = KFold(n_splits=n_splits)
    fscore = 0
    ftscore = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        # NOTE: `eval` is a project-local scoring helper shadowing the
        # builtin; it returns (score, true_score) for the fold.
        score, t_score = eval(y_test, predict)
        # Average the tree feature importances over the ensemble (step 1 of
        # each member pipeline is the tree) and print the ranked features.
        pprint(
            sorted(zip(
                np.mean([
                    est.steps[1][1].feature_importances_
                    for est in model.estimators_
                ], axis=0), v.feature_names_),
                key=lambda pair: pair[0],
                reverse=True))
        print('score : ', str(score))
        print('true_score : ', str(t_score))
        fscore += score
        ftscore += t_score
    print('\n')
    # BUG FIX: the totals were divided by a hard-coded 10 even though only
    # n_splits (5) folds contribute; average over the real fold count.
    print('final score : ', str(fscore / n_splits))
    print('final true_score : ', str(ftscore / n_splits))
def test_bootstrap_samples():
    """Bootstrapped samples must make base trees imperfect on training data."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    base_estimator = DecisionTreeClassifier().fit(X_train, y_train)
    # No bootstrap and resampling disabled (empty ratio dict): every tree
    # sees the full training set, so the ensemble matches a single tree.
    exact = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        ratio={},
        random_state=0,
    ).fit(X_train, y_train)
    assert (exact.score(X_train, y_train) ==
            base_estimator.score(X_train, y_train))
    # Bootstrapping withholds some samples, so training accuracy drops.
    noisy = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0,
    ).fit(X_train, y_train)
    assert (noisy.score(X_train, y_train) <
            base_estimator.score(X_train, y_train))
def balanced_bragging(X_train, y_train, X_test, y_test, X_train_res,
                      y_train_res):
    """Compare plain bagging, bagging on oversampled data and balanced bagging.

    Each variant prints its minority-class recall (percent) and the two
    confusion-matrix diagonal entries, then a bar chart of all three.
    Returns the recalls without balancing and with balanced bagging.
    """
    def _minority_recall(model, X_fit, y_fit):
        # Fit, predict on the shared test split, return (cm, recall%).
        model.fit(X_fit, y_fit)
        cm = confusion_matrix(y_test, model.predict(X_test))
        return cm, 100 * cm[1, 1] / (cm[1, 0] + cm[1, 1])

    bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    cm, without = _minority_recall(bagging, X_train, y_train.values.ravel())
    print("Niezbalansowane (bragging): {}%".format(without))
    print(cm[0, 0], cm[1, 1])

    bagging_oversampling = BaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)
    cm, with_oversampling = _minority_recall(bagging_oversampling,
                                             X_train_res, y_train_res.ravel())
    print("z oversamplingiem (bragging): {}%".format(with_oversampling))
    print(cm[0, 0], cm[1, 1])

    balanced_bagging = BalancedBaggingClassifier(n_estimators=50,
                                                 random_state=0, n_jobs=-1)
    cm, within = _minority_recall(balanced_bagging, X_train,
                                  y_train.values.ravel())
    print("Zbalansowane (bragging): {}%".format(within))
    print(cm[0, 0], cm[1, 1])

    objects = ('Bragging', 'Bragging z oversamplingiem SMOTE',
               'Bragging z losowym undersamplingiem')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność braggingu')
    plt.show()
    return without, within
def test_base_estimator():
    """None, trees and perceptrons must all end up as the pipeline tail."""
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # Default base estimator is a decision tree.
    default = BalancedBaggingClassifier(None, n_jobs=3, random_state=0)
    default.fit(X_train, y_train)
    assert isinstance(default.base_estimator_.steps[-1][1],
                      DecisionTreeClassifier)
    explicit = BalancedBaggingClassifier(DecisionTreeClassifier(), n_jobs=3,
                                         random_state=0)
    explicit.fit(X_train, y_train)
    assert isinstance(explicit.base_estimator_.steps[-1][1],
                      DecisionTreeClassifier)
    linear = BalancedBaggingClassifier(Perceptron(max_iter=1000, tol=1e-3),
                                       n_jobs=3, random_state=0)
    linear.fit(X_train, y_train)
    assert isinstance(linear.base_estimator_.steps[-1][1], Perceptron)
def test_probability():
    """predict_proba rows sum to one and agree with predict_log_proba."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case: every class present in each bootstrap.
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            random_state=0).fit(X_train, y_train)
        proba = ensemble.predict_proba(X_test)
        assert_array_almost_equal(proba.sum(axis=1), np.ones(len(X_test)))
        assert_array_almost_equal(
            proba, np.exp(ensemble.predict_log_proba(X_test)))
        # Degenerate case: tiny max_samples can miss whole classes.
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(),
            random_state=0,
            max_samples=5).fit(X_train, y_train)
        proba = ensemble.predict_proba(X_test)
        assert_array_almost_equal(proba.sum(axis=1), np.ones(len(X_test)))
        assert_array_almost_equal(
            proba, np.exp(ensemble.predict_log_proba(X_test)))
def test_warm_start_with_oob_score_fails():
    """warm_start combined with oob_score is rejected at fit time."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5, oob_score=True,
                                    warm_start=True)
    with pytest.raises(ValueError):
        clf.fit(X, y)
def train_knn_model(df_formatted, true_labels, iteration=0):
    """Fit a balanced bagging ensemble of 5-NN classifiers and persist it."""
    classifier = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(n_neighbors=5),
        n_estimators=5,
        random_state=0,
        n_jobs=-1)
    classifier.fit(df_formatted, true_labels)
    save_model(classifier, iteration)
def imblearn_(classifier, X_train, y_train, X_test, y_test):
    """Wrap ``classifier`` in balanced bagging, fit, predict and print stats."""
    bagger = BalancedBaggingClassifier(base_estimator=classifier,
                                       ratio='auto',
                                       random_state=0)
    bagger.fit(X_train, y_train)
    predictions = bagger.predict(X_test)
    printStats(y_test, predictions)
    return bagger, predictions
def train_tree_model(X_train, y_train):
    """Fit a small balanced bagging ensemble of decision trees and save it."""
    classifier = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=5,
        random_state=0,
        n_jobs=-1)
    classifier.fit(X_train, y_train)
    save_model(classifier)
def test_bagging_with_pipeline():
    """A pipeline base estimator (feature selection + tree) must fit/predict."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    inner = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BalancedBaggingClassifier(inner, max_features=2)
    estimator.fit(X, y).predict(X)
def train_nb_model(X_train, y_train, vectorize=False, iteration=0):
    """Train a balanced-bagging LinearSVC on TF-IDF features and persist it.

    When ``vectorize`` is true, additionally save a single TF-IDF + LinearSVC
    pipeline fitted on the raw text.
    """
    print("train_nb_model", iteration)
    start_time = time.time()
    tfidf = TfidfVectorizer(
        sublinear_tf=True,
        norm='l2',
        encoding='latin-1',
        ngram_range=(1, 2),
        stop_words='english')
    features = tfidf.fit_transform(X_train).toarray()
    labels = y_train
    print(features.shape)
    # (A chi2-based exploration of the most correlated n-grams used to live
    # here as commented-out code.)
    tfidf_train = tfidf.fit(X_train)
    save_model_2(tfidf_train, iteration)
    bow_features = tfidf_train.transform(X_train)
    text_clf = BalancedBaggingClassifier(base_estimator=LinearSVC(),
                                         n_estimators=5,
                                         random_state=0)
    text_clf = text_clf.fit(bow_features, y_train)
    save_model(text_clf, iteration)
    print("modle saved")
    if vectorize:
        text_clf = Pipeline([('vect', TfidfVectorizer(sublinear_tf=True,
                                                      norm='l2',
                                                      ngram_range=(1, 2),
                                                      stop_words='english')),
                             ('clf', LinearSVC())])
        text_clf = text_clf.fit(X_train, y_train)
        save_model(text_clf)
def test_oob_score_consistency():
    """Two fits with identical seed and data must give identical OOB scores."""
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BalancedBaggingClassifier(KNeighborsClassifier(),
                                        max_samples=0.5,
                                        max_features=0.5,
                                        oob_score=True,
                                        random_state=1)
    first = bagging.fit(X, y).oob_score_
    second = bagging.fit(X, y).oob_score_
    assert first == second
class Classifier(BaseEstimator):
    """Thin sklearn-compatible wrapper around a BalancedBaggingClassifier."""

    def __init__(self):
        self.reg = BalancedBaggingClassifier(n_estimators=50, random_state=42)

    def fit(self, X, y):
        """Fit the wrapped ensemble.

        Returns self, per the scikit-learn estimator contract (the original
        returned None, which breaks chaining and some meta-estimators).
        """
        self.reg.fit(X, y)
        return self

    def predict(self, X):
        """Predict class labels with the wrapped ensemble."""
        return self.reg.predict(X)
def __init__(self):
    # mimicking balanced random forest with the BalancedBaggingClassifier
    # and DecisionTreeClassifier combination
    # NOTE(review): `determine_ratio` and the legacy `ratio=` keyword come
    # from outside this view -- presumably a callable computing the
    # resampling ratio; confirm against the imbalanced-learn version in use.
    self.bbc = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(max_features='auto'),
        ratio=determine_ratio,
        random_state=0,
        n_estimators=50,
        n_jobs=1)
def test_max_samples_consistency():
    """An integer max_samples must be stored unchanged after validation."""
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BalancedBaggingClassifier(KNeighborsClassifier(),
                                        max_samples=max_samples,
                                        max_features=0.5,
                                        random_state=1)
    bagging.fit(X, y)
    assert bagging._max_samples == max_samples
def buildModel(X, y):
    """Scale the data, fit five classifiers and print comparative metrics.

    Trains balanced bagging, an SVC, an MLP, AdaBoost and logistic
    regression on a 70/30 split, then prints the Matthews correlation
    coefficient, confusion matrix and imbalanced classification report for
    each model, in the same order.
    """
    # BUG FIX: the original mixed Python 2 `print x` statements with
    # Python 3 print() calls and was a SyntaxError on Python 3; all output
    # now uses print() with identical formatting.
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_train_x, y, random_state=19, test_size=0.3)
    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced', random_state=19,
              decision_function_shape='ovo')
    neural = MLPClassifier(max_iter=500, random_state=19, solver='lbfgs',
                           alpha=1e-5, hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)
    models = [bag, svm, neural, ada, logistic]
    for model in models:
        model.fit(X_train, y_train)
    predictions = [model.predict(X_test) for model in models]
    # Metric blocks are grouped by metric, matching the original output order.
    for y_pred in predictions:
        print(matthews_corrcoef(y_test, y_pred))
    for y_pred in predictions:
        print(confusion_matrix(y_test, y_pred))
    for y_pred in predictions:
        print(classification_report_imbalanced(y_test, y_pred))
def RFC():
    """Train a balanced bagging ensemble of trees and dump it to rfc.m."""
    train_data, train_target = get_data('data.txt', 'target.txt')
    model_rfc = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=100,
        sampling_strategy='auto',
        replacement=False,
        random_state=0)
    model_rfc.fit(train_data, train_target)
    save_path_name = '../../model/' + 'rfc.m'
    joblib.dump(model_rfc, save_path_name)
def cross_validation(x):
    """5-fold CV of balanced bagging on the pickled dataset named ``x``.

    Prints class balance, per-fold feature-importance rankings, then the
    fold-averaged precision, recall, f-value and overall precision.
    """
    with open('../data/conv_pred/train_data_' + x + '.pickle', 'rb') as f:
        data = pickle.load(f)
    print(data)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])
    # Show the class imbalance.
    zero = 0
    one = 0
    for label in y:
        if label == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)
    cv = 5
    kf = KFold(n_splits=cv)
    fscore = 0
    ftscore = 0
    all_f_value = 0
    all_prec = 0
    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        # `eval` is a project-local scoring helper shadowing the builtin.
        precision, recall, f_value, all_pre = eval(y_test, predict)
        all_prec += all_pre
        fscore += precision
        ftscore += recall
        all_f_value += f_value
        pprint(
            sorted(zip(
                np.mean([
                    est.steps[1][1].feature_importances_
                    for est in model.estimators_
                ], axis=0), v.feature_names_),
                key=lambda pair: pair[0],
                reverse=True))
    print('\n')
    print('final precision : ', str(fscore / cv))
    print('final recall : ', str(ftscore / cv))
    print('final f-value : ', str(all_f_value / cv))
    print('final all_precision : ', str(all_prec / cv))
class Classifier(BaseEstimator):
    """Balanced-random-forest-like classifier built from balanced bagging."""

    def __init__(self):
        # mimicking balanced random forest with the BalancedBaggingClassifier
        # and DecisionTreeClassifier combination
        self.bbc = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(max_features='auto'),
            ratio=determine_ratio,
            random_state=0,
            n_estimators=50,
            n_jobs=1)

    def fit(self, X, y):
        """Fit the underlying ensemble.

        Returns self, per the scikit-learn estimator contract (the original
        returned None, which breaks chaining and some meta-estimators).
        """
        self.bbc.fit(X, y)
        return self

    def predict_proba(self, X):
        """Class-membership probabilities from the underlying ensemble."""
        return self.bbc.predict_proba(X)
def switch_algorithm(algr):
    """Map an integer code to a ``(short_name, unfitted_estimator)`` pair.

    Unknown codes return the string 'Invalid algorithm'.
    """
    registry = {
        1: ('knn', KNeighborsClassifier()),
        2: ('lr', LogisticRegression(solver='liblinear')),
        3: ('dt', DecisionTreeClassifier()),
        4: ('xtr', ExtraTreesClassifier()),
        5: ('rf', RandomForestClassifier()),
        6: ('gbt', GradientBoostingClassifier()),
        7: ('mlp', MLPClassifier()),
        8: ('bnb', BernoulliNB()),
        9: ('gnb', GaussianNB()),
        10: ('polysvc', SVC()),
        11: ('sigmsvc', SVC()),
        12: ('rbfsvc', SVC()),
        13: ('lsvc', SVC()),
        14: ('lbsvc', LinearSVC()),
        15: ('bsvc', BalancedBaggingClassifier(
            SVC(kernel='linear', probability=True),
            sampling_strategy='not majority')),
        16: ('absvc', BalancedBaggingClassifier(
            SVC(kernel='linear', probability=True),
            sampling_strategy='all')),
        17: ('ccsvc', CalibratedClassifierCV()),
        18: ('bbnb', BalancedBaggingClassifier()),
        19: ('blsvc', BalancedBaggingClassifier()),
        20: ('bsvc', BalancedBaggingClassifier()),
        21: ('bsvcsig', BalancedBaggingClassifier()),
        22: ('xgbt', XGBClassifier(n_thread=-1)),
        23: ('bxgbt', BalancedBaggingClassifier(XGBClassifier(n_thread=-1))),
        24: ('bgbt', BalancedBaggingClassifier(GradientBoostingClassifier())),
        25: ('adb', AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1, class_weight='balanced'))),
        26: ('lgbm', lgb.LGBMClassifier(silent=True, class_weight='balanced')),
        27: ('catb', cb.CatBoostClassifier(silent=True)),
    }
    return registry.get(algr, "Invalid algorithm")
def test_balanced_bagging_classifier_with_function_sampler(replace):
    """A FunctionSampler (Roughly Balanced Bagging) can drive the resampling."""
    X, y = make_classification(
        n_samples=1_000,
        n_features=10,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )

    def roughly_balanced_bagging(X, y, replace=False):
        """Implementation of Roughly Balanced Bagging for binary problem."""
        # Identify minority/majority classes from the label counts.
        class_counts = Counter(y)
        majority_class = max(class_counts, key=class_counts.get)
        minority_class = min(class_counts, key=class_counts.get)
        # Majority draw size follows a negative binomial distribution.
        n_minority_class = class_counts[minority_class]
        n_majority_resampled = np.random.negative_binomial(
            n=n_minority_class, p=0.5)
        # Sample each class, with or without replacement.
        majority_indices = np.random.choice(
            np.flatnonzero(y == majority_class),
            size=n_majority_resampled,
            replace=replace,
        )
        minority_indices = np.random.choice(
            np.flatnonzero(y == minority_class),
            size=n_minority_class,
            replace=replace,
        )
        indices = np.hstack([majority_indices, minority_indices])
        return X[indices], y[indices]

    rbb = BalancedBaggingClassifier(
        base_estimator=CountDecisionTreeClassifier(),
        n_estimators=2,
        sampler=FunctionSampler(func=roughly_balanced_bagging,
                                kw_args={"replace": replace}),
    )
    rbb.fit(X, y)
    # Every fitted member should have seen roughly balanced class counts.
    for estimator in rbb.estimators_:
        class_counts = estimator[-1].class_counts_
        assert (class_counts[0] / class_counts[1]) > 0.8
def clf_wrapper(classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` inside balanced bagging and score it.

    Returns the mean of positive predictive value, negative predictive
    value and accuracy computed from the test-set confusion matrix.
    """
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    replacement=False,
                                    random_state=0)
    clf.fit(X_train, y_train)
    cfm = confusion_matrix(y_test, clf.predict(X_test))
    PPV = cfm[0, 0] / (cfm[0, 0] + cfm[0, 1])
    NPV = cfm[1, 1] / (cfm[1, 0] + cfm[1, 1])
    total = cfm[0, 0] + cfm[1, 1] + cfm[1, 0] + cfm[0, 1]
    ACR = (cfm[0, 0] + cfm[1, 1]) / total
    return (PPV + NPV + ACR) / 3
def classifier_imblearn_SVM_training(_X, _Y, _weight):
    """Train an RBF-SVM balanced bagging ensemble and print its confusion."""
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        _X, _Y, _weight, test_size=0.2, random_state=0xdeadbeef)
    bbc = BalancedBaggingClassifier(
        base_estimator=SVC(kernel="rbf", gamma="auto"),
        n_estimators=10,
        sampling_strategy="auto",
        max_samples=80,
        replacement=False,
        random_state=0xdeadbeef)
    bbc.fit(X_train, Y_train)
    predictions = bbc.predict(X_test)
    print("Result from bagging labeled SVM:")
    print("tn, fp, fn, tp =", confusion_matrix(Y_test, predictions).ravel())
def test_warm_start_smaller_n_estimators():
    """Shrinking n_estimators after a warm-started fit raises ValueError."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(warm_start=True, n_estimators=5)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)
def test_balanced_bagging_classifier():
    """Smoke-test fit/predict across the sampling/feature parameter grid."""
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
    })
    candidates = [
        None,
        DummyClassifier(),
        Perceptron(max_iter=1000, tol=1e-3),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(gamma='scale'),
    ]
    for base_estimator in candidates:
        for params in grid:
            model = BalancedBaggingClassifier(base_estimator=base_estimator,
                                              random_state=0, **params)
            model.fit(X_train, y_train).predict(X_test)
def create_for_training(
        classifier_clsname,
        feature_extractors: typing.Sequence[FeatureExtractorMixin],
        classifier_params=None):
    """Build a SKLearnClassifierBasedTypeFilter ready for training.

    ``classifier_clsname`` may be the '__svm__' / '__bagging_svm__'
    shortcut, a dotted 'module.Class' path, or a bare name resolved from
    this module's globals. ``classifier_params`` are applied via
    ``set_params`` when given.
    """
    if classifier_clsname == '__svm__':
        classifier = SVC(gamma=0.1, C=2)
    elif classifier_clsname == '__bagging_svm__':
        classifier = BalancedBaggingClassifier(
            base_estimator=SVC(gamma=0.1, C=2),
            n_estimators=10,
            bootstrap=False,
            sampling_strategy='majority')
    elif '.' in classifier_clsname:
        # Dotted path: import the module and instantiate the class.
        module_name, cls_name = classifier_clsname.rsplit('.', 1)
        module = importlib.import_module(module_name)
        classifier = getattr(module, cls_name)()
    else:
        classifier = globals()[classifier_clsname]()
    if classifier_params is not None:
        classifier.set_params(**classifier_params)
    extractors = [(str(idx), extractor)
                  for idx, extractor in enumerate(feature_extractors)]
    return SKLearnClassifierBasedTypeFilter(classifier, extractors)
def test_warm_start_equal_n_estimators():
    """Refitting with unchanged n_estimators warns and keeps predictions."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                    random_state=83)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Corrupt the training data: since no new estimators are added, the fit
    # must be a no-op apart from the warning.
    X_train += 1.
    assert_warns_message(
        UserWarning,
        "Warm-start fitting without increasing n_estimators does not",
        clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
def test_single_estimator():
    """Singleton ensemble equals a manual pipeline (legacy ``ratio`` API)."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    ensemble = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)
    # Reproduce the lone member with the same under-sampler seed.
    seed = ensemble.estimators_[0].steps[0][1].random_state
    reference = make_pipeline(RandomUnderSampler(random_state=seed),
                              KNeighborsClassifier()).fit(X_train, y_train)
    assert_array_equal(ensemble.predict(X_test), reference.predict(X_test))
def test_oob_score_removed_on_warm_start():
    """Turning oob_score off for a warm-started refit drops the attribute."""
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)
    clf = BalancedBaggingClassifier(oob_score=True, n_estimators=50)
    clf.fit(X, y)
    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)
    assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_estimators_samples():
    """estimators_samples_ must allow exact reproduction of a member's fit."""
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BalancedBaggingClassifier(
        LogisticRegression(solver='lbfgs', multi_class='auto'),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False)
    bagging.fit(X, y)
    samples = bagging.estimators_samples_
    features = bagging.estimators_features_
    members = bagging.estimators_
    # Formatting: one integer index array of half the data per estimator.
    assert len(samples) == len(members)
    assert len(samples[0]) == len(X) // 2
    assert samples[0].dtype.kind == 'i'
    # Refitting the first member on its recorded rows/columns must
    # reproduce the very same coefficients.
    first_samples = samples[0]
    first_features = features[0]
    member = members[0]
    X_sub = (X[first_samples])[:, first_features]
    y_sub = y[first_samples]
    orig_coefs = member.steps[-1][1].coef_
    member.fit(X_sub, y_sub)
    assert_allclose(orig_coefs, member.steps[-1][1].coef_)
def test_warm_start(random_state=42):
    """Warm-start growth matches a single fit in ensemble size and seeds."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf_ws = None
    for target_size in [5, 10]:
        if clf_ws is None:
            clf_ws = BalancedBaggingClassifier(n_estimators=target_size,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            clf_ws.set_params(n_estimators=target_size)
        clf_ws.fit(X, y)
        assert len(clf_ws) == target_size
    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)
    assert ({pipe.steps[-1][1].random_state for pipe in clf_ws} ==
            {pipe.steps[-1][1].random_state for pipe in clf_no_ws})
# NOTE(review): this span begins mid-way through a confusion-matrix plotting
# helper -- `cm`, `fmt` and `thresh` are defined before the visible chunk;
# presumably this loop annotates each matrix cell. Confirm against the
# full script.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Script body: compare plain vs balanced bagging on the ozone_level dataset.
ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)
print('Class distribution of the training set: {}'.format(Counter(y_train)))
bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)
print('Class distribution of the test set: {}'.format(Counter(y_test)))
print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging,
                      classes=np.unique(ozone.target),
                      title='Confusion matrix using BaggingClassifier')
geometric_mean_score(y_test, y_pred_tree))) cm_tree = confusion_matrix(y_test, y_pred_tree) fig, ax = plt.subplots() plot_confusion_matrix(cm_tree, classes=np.unique(satimage.target), ax=ax, title='Decision tree') ############################################################################### # Classification using bagging classifier with and without sampling ############################################################################### # Instead of using a single tree, we will check if an ensemble of decsion tree # can actually alleviate the issue induced by the class imbalancing. First, we # will use a bagging classifier and its counter part which internally uses a # random under-sampling to balanced each boostrap sample. bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1) balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1) bagging.fit(X_train, y_train) balanced_bagging.fit(X_train, y_train) y_pred_bc = bagging.predict(X_test) y_pred_bbc = balanced_bagging.predict(X_test) ############################################################################### # Balancing each bootstrap sample allows to increase significantly the balanced # accuracy and the geometric mean. print('Bagging classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_bc), geometric_mean_score(y_test, y_pred_bc)))
def test_warm_start_equivalence():
    """5+5 warm-started estimators predict the same as 10 fitted at once."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
    incremental = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                            random_state=3141)
    incremental.fit(X_train, y_train)
    incremental.set_params(n_estimators=10)
    incremental.fit(X_train, y_train)
    y_incremental = incremental.predict(X_test)
    one_shot = BalancedBaggingClassifier(n_estimators=10, warm_start=False,
                                         random_state=3141)
    one_shot.fit(X_train, y_train)
    assert_array_almost_equal(y_incremental, one_shot.predict(X_test))