def test_make_imbalance_dict():
    sampling_strategy = {0: 10, 1: 20, 2: 30}
    X_, y_ = make_imbalance(X, Y, sampling_strategy=sampling_strategy)
    assert Counter(y_) == sampling_strategy

    sampling_strategy = {0: 10, 1: 20}
    X_, y_ = make_imbalance(X, Y, sampling_strategy=sampling_strategy)
    assert Counter(y_) == {0: 10, 1: 20, 2: 50}
def test_make_imbalance_dict():
    ratio = {0: 10, 1: 20, 2: 30}
    X_, y_ = make_imbalance(X, Y, ratio=ratio)
    assert Counter(y_) == ratio

    ratio = {0: 10, 1: 20}
    X_, y_ = make_imbalance(X, Y, ratio=ratio)
    assert Counter(y_) == {0: 10, 1: 20, 2: 50}
def test_make_imbalance_ratio():
    # check that using 'ratio' is working
    sampling_strategy = {0: 10, 1: 20, 2: 30}
    X_, y_ = make_imbalance(X, Y, ratio=sampling_strategy)
    assert Counter(y_) == sampling_strategy

    sampling_strategy = {0: 10, 1: 20}
    X_, y_ = make_imbalance(X, Y, ratio=sampling_strategy)
    assert Counter(y_) == {0: 10, 1: 20, 2: 50}
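# Note on the three test variants above: ``ratio`` is the older name of the
# keyword that became ``sampling_strategy`` around imbalanced-learn 0.4; both
# enforce the same contract, namely that classes listed in the dict are
# downsampled to the requested count while classes omitted from the dict keep
# their original size. A minimal, self-contained sketch of that behaviour
# against a recent imbalanced-learn (keyword assumed to be
# ``sampling_strategy``):
from collections import Counter

from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

X_iris, y_iris = load_iris(return_X_y=True)  # 50 samples per class
X_res, y_res = make_imbalance(X_iris, y_iris,
                              sampling_strategy={0: 10, 1: 20})
# Class 2 is not listed in the dict, so it keeps its 50 samples.
print(Counter(y_res))  # expected: Counter({2: 50, 1: 20, 0: 10})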
def test_balanced_bagging_classifier():
    # Check classification for various parameter settings.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False]
    })
    for base_estimator in [
            None,
            DummyClassifier(),
            Perceptron(max_iter=1000, tol=1e-3),
            DecisionTreeClassifier(),
            KNeighborsClassifier(),
            SVC(gamma='scale')
    ]:
        for params in grid:
            BalancedBaggingClassifier(
                base_estimator=base_estimator, random_state=0, **params).fit(
                    X_train, y_train).predict(X_test)
def test_make_imbalance_5():
    """Test make_imbalance"""
    X_, y_ = make_imbalance(X, Y, ratio=0.01, min_c_=0)
    counter = Counter(y_)
    assert_equal(counter[1], 500)
    assert_equal(counter[0], 5)
    assert np.all([X_i in X for X_i in X_])
def test_bootstrap_features():
    # Test that bootstrapping features may generate duplicate features.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)
    for features in ensemble.estimators_features_:
        assert np.unique(features).shape[0] == X.shape[1]

    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=True,
        random_state=0).fit(X_train, y_train)
    unique_features = [np.unique(features).shape[0]
                       for features in ensemble.estimators_features_]
    assert np.median(unique_features) < X.shape[1]
def test_make_imbalance_2():
    """Test make_imbalance"""
    X_, y_ = make_imbalance(X, Y, ratio=0.25, min_c_=1)
    counter = Counter(y_)
    assert_equal(counter[0], 500)
    assert_equal(counter[1], 125)
    assert_true(np.all([X_i in X for X_i in X_]))
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    with pytest.raises(ValueError, match=msg_error):
        eec = EasyEnsembleClassifier(n_estimators=n_estimators)
        eec.fit(X, y)
def test_bootstrap_samples():
    # Test that bootstrapping samples generates non-perfect base estimators.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    base_estimator = DecisionTreeClassifier().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    # disable the resampling by passing an empty dictionary.
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        ratio={},
        random_state=0).fit(X_train, y_train)
    assert (ensemble.score(X_train, y_train) ==
            base_estimator.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0).fit(X_train, y_train)
    assert (ensemble.score(X_train, y_train) <
            base_estimator.score(X_train, y_train))
def test_probability():
    # Predict probabilities.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            random_state=0).fit(X_train, y_train)
        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))
        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(),
            random_state=0,
            max_samples=5).fit(X_train, y_train)
        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))
        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0).fit(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        assert_warns(UserWarning,
                     BalancedBaggingClassifier(
                         base_estimator=base_estimator,
                         n_estimators=1,
                         bootstrap=True,
                         oob_score=True,
                         random_state=0).fit,
                     X_train, y_train)
def test_base_estimator():
    # Check base_estimator and its default values.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = BalancedBaggingClassifier(
        None, n_jobs=3, random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.base_estimator_.steps[-1][1],
                      DecisionTreeClassifier)

    ensemble = BalancedBaggingClassifier(
        DecisionTreeClassifier(), n_jobs=3,
        random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.base_estimator_.steps[-1][1],
                      DecisionTreeClassifier)

    ensemble = BalancedBaggingClassifier(
        Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = EasyEnsembleClassifier(
        n_estimators=2,
        base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()))
    estimator.fit(X, y).predict(X)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = BalancedBaggingClassifier(
        make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()),
        max_features=2)
    estimator.fit(X, y).predict(X)
def test_balanced_batch_generator(sampler):
    X, y = load_iris(return_X_y=True)
    X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40})
    X = X.astype(np.float32)

    batch_size = 10
    training_generator, steps_per_epoch = balanced_batch_generator(
        X, y, sample_weight=None, sampler=sampler,
        batch_size=batch_size, random_state=42)

    learning_rate = 0.01
    epochs = 10
    input_size = X.shape[1]
    output_size = 3

    # helper functions
    def init_weights(shape):
        return tf.Variable(tf.random_normal(shape, stddev=0.01))

    def accuracy(y_true, y_pred):
        return np.mean(np.argmax(y_pred, axis=1) == y_true)

    # input and output
    data = tf.placeholder("float32", shape=[None, input_size])
    targets = tf.placeholder("int32", shape=[None])

    # build the model and weights
    W = init_weights([input_size, output_size])
    b = init_weights([output_size])
    out_act = tf.nn.sigmoid(tf.matmul(data, W) + b)

    # build the loss, predict, and train operator
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=out_act, labels=targets)
    loss = tf.reduce_sum(cross_entropy)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss)
    predict = tf.nn.softmax(out_act)

    # Initialization of all variables in the graph
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        for e in range(epochs):
            for i in range(steps_per_epoch):
                X_batch, y_batch = next(training_generator)
                sess.run([train_op, loss],
                         feed_dict={data: X_batch, targets: y_batch})

            # For each epoch, run accuracy on train and test
            predicts_train = sess.run(predict, feed_dict={data: X})
            print("epoch: {} train accuracy: {:.3f}"
                  .format(e, accuracy(y, predicts_train)))
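# Note: the test above targets the TensorFlow 1.x graph API (tf.placeholder,
# tf.Session, tf.train.GradientDescentOptimizer). Under TensorFlow 2.x those
# symbols are gone from the top-level namespace; one way to keep such a
# snippet running (a sketch, not how imbalanced-learn itself gates its tests)
# is the v1 compatibility shim:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()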
def test_easy_ensemble_classifier_grid_search():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)

    parameters = {'n_estimators': [1, 2],
                  'base_estimator__n_estimators': [3, 4]}
    grid_search = GridSearchCV(
        EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()),
        parameters, cv=5, iid=False)
    grid_search.fit(X, y)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          },
                          random_state=0)
    estimator = BalancedBaggingClassifier(make_pipeline(
        SelectKBest(k=1), DecisionTreeClassifier()),
                                          max_features=2)
    estimator.fit(X, y).predict(X)
def test_easy_ensemble_classifier_single_estimator():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = EasyEnsembleClassifier(n_estimators=1,
                                  random_state=0).fit(X_train, y_train)
    clf2 = make_pipeline(
        RandomUnderSampler(random_state=0),
        AdaBoostClassifier(random_state=0)).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_make_imbalance_multiclass():
    """Test make_imbalance with multiclass data"""
    # Make y multiclass
    y_ = np.zeros(1000)
    y_[100:500] = 1
    y_[500:] = 2

    # Resample the data
    X_, y_ = make_imbalance(X, y_, ratio=0.1, min_c_=0)
    counter = Counter(y_)
    assert_equal(counter[0], 90)
    assert_equal(counter[1], 400)
    assert_equal(counter[2], 500)
    assert_true(np.all([X_i in X for X_i in X_]))
def test_balanced_batch_generator_function_sparse(keep_sparse):
    X, y = load_iris(return_X_y=True)
    X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40})
    X = X.astype(np.float32)

    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X), y, keep_sparse=keep_sparse,
        batch_size=10, random_state=42)
    for idx in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          },
                          random_state=0)
    estimator = EasyEnsembleClassifier(n_estimators=2,
                                       base_estimator=make_pipeline(
                                           SelectKBest(k=1),
                                           AdaBoostClassifier()))
    estimator.fit(X, y).predict(X)
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    with pytest.raises(ValueError, match=msg_error):
        eec = EasyEnsembleClassifier(n_estimators=n_estimators)
        eec.fit(X, y)
def main(algoritmo, df, modelo, balance):
    # modelo='temas'
    # balance=1
    # algoritmo='NB'

    # Define the columns of interest
    col = ['tweet', modelo]
    df = df[col]
    df = df[df[modelo] != '']
    df.columns = ['tweet', modelo]
    df = df[pd.notnull(df[modelo])]
    df['categoria'] = df[modelo].astype('category')
    df[modelo] = df['categoria'].cat.codes
    df[modelo] = df[modelo].astype('int')
    dftemas = df[['categoria', modelo]]
    temas = dftemas.set_index(modelo).to_dict()

    # Balance the response variable: undersample every class down to the
    # size of the smallest one.
    muestra = df[modelo].value_counts().min()
    X, y = make_imbalance(
        df, df[modelo],
        sampling_strategy={i: muestra
                           for i in df[modelo].value_counts().index},
        random_state=0)

    # Train/test split for the model
    if balance == 1:
        X_train, X_test, y_train, y_test = train_test_split(
            X['tweet'], X[modelo], random_state=0)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            df['tweet'], df[modelo], random_state=0)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clf = correr_modelo(algoritmo, X_train_tfidf, y_train)

    cwd = os.getcwd()
    dump(clf, cwd + '/assets/pys/modelo_temas.joblib')
    pickle.dump(count_vect.vocabulary_,
                open(cwd + "/assets/pys/vocabulario_temas.pkl", "wb"))
    with open(cwd + '/assets/pys/temas.json', 'w') as fp:
        json.dump(temas, fp)
def test_make_imbalance_error():
    # we are reusing part of utils.check_ratio, however this is not covered
    # in the common tests so we will repeat it here
    ratio = {0: -100, 1: 50, 2: 50}
    with raises(ValueError, match="in a class cannot be negative"):
        make_imbalance(X, Y, ratio)

    ratio = {0: 10, 1: 70}
    with raises(ValueError, match="should be less or equal to the original"):
        make_imbalance(X, Y, ratio)

    y_ = np.zeros((X.shape[0], ))
    ratio = {0: 10}
    with raises(ValueError, match="needs to have more than 1 class."):
        make_imbalance(X, y_, ratio)

    ratio = 'random-string'
    with raises(ValueError, match="has to be a dictionary or a function"):
        make_imbalance(X, Y, ratio)
def test_make_imbalance_multiclass():
    """Test make_imbalance with multiclass data"""
    # Make y multiclass
    y_ = np.zeros(1000)
    y_[100:500] = 1
    y_[500:] = 2

    # Resample the data
    X_, y_ = make_imbalance(X, y_, ratio=0.1, min_c_=0)
    counter = Counter(y_)
    assert_equal(counter[0], 90)
    assert_equal(counter[1], 400)
    assert_equal(counter[2], 500)
    assert np.all([X_i in X for X_i in X_])
def test_probability():
    # Predict probabilities.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            random_state=0).fit(X_train, y_train)
        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)),
        )
        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)),
        )

        # Degenerate case, where some classes are missing
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(solver="lbfgs",
                                              multi_class="auto"),
            random_state=0,
            max_samples=5,
        )
        ensemble.fit(X_train, y_train)
        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)),
        )
        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)),
        )
def test_balanced_bagging_classifier(base_estimator, params):
    # Check classification for various parameter settings.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    BalancedBaggingClassifier(
        base_estimator=base_estimator, random_state=0,
        **params).fit(X_train, y_train).predict(X_test)
def test_balanced_bagging_classifier_error(params):
    # Test that it gives proper exception on deficient input.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50})
    base = DecisionTreeClassifier()
    clf = BalancedBaggingClassifier(base_estimator=base, **params)
    with pytest.raises(ValueError):
        clf.fit(X, y)

    # Test support of decision_function
    assert not hasattr(
        BalancedBaggingClassifier(base).fit(X, y), "decision_function")
def test_error():
    # Test that it gives proper exception on deficient input.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50})
    base = DecisionTreeClassifier()

    # Test n_estimators
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, n_estimators=1.5).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, n_estimators=-1).fit, X, y)

    # Test max_samples
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=-1).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=0.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=2.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=1000).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples="foobar").fit,
                  X, y)

    # Test max_features
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=-1).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=0.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=2.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=5).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features="foobar").fit,
                  X, y)

    # Test support of decision_function
    assert not hasattr(
        BalancedBaggingClassifier(base).fit(X, y), 'decision_function')
def TrirandomUnderSampling(self, X, y):
    """Random undersampling: shrink every class present in y down to the
    size of the smallest class.

    y : numpy array
    """
    unique_elements, counts_elements = np.unique(y, return_counts=True)
    minVal = counts_elements.min()
    # Build the strategy from the labels actually present in y, so it also
    # works when the labels are not the contiguous range 0..n_classes-1.
    sample_strategy = {label: minVal for label in unique_elements}
    Xres, yres = make_imbalance(X, y, sample_strategy)
    return Xres, yres
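# A usage sketch for the helper above (synthetic data; since it is written as
# a method, calling it standalone needs an explicit placeholder for ``self``).
# The labels {1, 3, 7} illustrate why the strategy dict must be built from
# ``unique_elements`` rather than ``range(n_classes)``:
import numpy as np
from collections import Counter

rng = np.random.RandomState(0)
X_demo = rng.randn(90, 4)
y_demo = np.repeat([1, 3, 7], [50, 30, 10])

X_bal, y_bal = TrirandomUnderSampling(None, X_demo, y_demo)
print(Counter(y_bal))  # every class shrunk to 10 samples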
def test_pipeline_score_samples_pca_lof():
    X, y = load_iris(return_X_y=True)
    sampling_strategy = {0: 50, 1: 30, 2: 20}
    X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy)
    # Test that the score_samples method is implemented on a pipeline.
    # Test that the score_samples method on pipeline yields same results as
    # applying transform and score_samples steps separately.
    rus = RandomUnderSampler()
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([('rus', rus), ('pca', pca), ('lof', lof)])
    pipe.fit(X, y)
    # Check the shapes
    assert pipe.score_samples(X).shape == (X.shape[0], )
    # Check the values
    lof.fit(pca.fit_transform(X))
    assert_allclose(pipe.score_samples(X),
                    lof.score_samples(pca.transform(X)))
def random_forest(df_normalized_w_target):
    X = df_normalized_w_target[list(df_normalized_w_target.columns)[7:-1]]
    print("X Shape", X.shape)
    Y = df_normalized_w_target[list(df_normalized_w_target.columns)[-1]]
    print("Y Shape", Y.shape)

    perm_feat_imp = X.iloc[:, [0, 5, 9, 3, 12, 13, 4, 23, 7, 10, 16, 6, 52]]
    print("Perm Feat Impt Shape", perm_feat_imp.shape)

    X, y = make_imbalance(
        perm_feat_imp, Y,
        sampling_strategy={1: 2700, 2: 2700, 3: 2700, 4: 2700,
                           5: 2700, 6: 2700, 7: 2700},
        random_state=42)
    X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
        X, y, random_state=42)
    print('Training target statistics: {}'.format(Counter(y_train_rf)))
    print('Testing target statistics: {}'.format(Counter(y_test_rf)))

    rfc = RandomForestClassifier(n_estimators=100)
    rfc = rfc.fit(X_train_rf, y_train_rf)
    rfc_pred = rfc.predict(X_test_rf)
    print("rfc pred shape", rfc_pred.shape)
    print("y train rf shape", y_train_rf.shape)
    print("X train rf shape", X_train_rf.shape)

    rf_train_acc = metrics.accuracy_score(y_train_rf,
                                          rfc.predict(X_train_rf))
    rf_test_acc = metrics.accuracy_score(y_test_rf, rfc_pred)
    print("Random Forest Train Accuracy:", rf_train_acc)
    print("Random Forest Test Accuracy:", rf_test_acc)
    print(confusion_matrix(y_test_rf, rfc_pred))
    print(classification_report(y_test_rf, rfc_pred))
    return (rf_train_acc, rf_test_acc)
def training(train_dataset):
    logging.debug("func called training")
    XX = train_dataset.drop([
        "grade", "evaluat_desc", 'game_id', "Description", "shortDesc",
        "UpdateDescription", "subject", "game_tags", "Type", "game_feature",
        "game_key", "main_play", "play_key", "game_play_way", "playway",
        "gamefeature", "grade_", "gametype"
    ], axis=1)
    feature = XX.columns
    X_train, y_train = train_dataset[feature], train_dataset["grade_"]

    logging.debug('balancing dataset')

    def ratio_data(grade, n):
        return int(round(len(y_train[y_train == grade]) * n))

    ratio = {}
    # e.g. ratio = {3: ratio_data(3, 1), 4: ratio_data(4, 1),
    #               1: ratio_data(1, 1), 2: ratio_data(2, 0.9)}
    for i in range(1, 5):
        if i in list(set(y_train)) and i != 2:
            ratio[i] = ratio_data(i, 1)
        if i in list(set(y_train)) and i == 2:
            ratio[i] = ratio_data(i, 0.9)
    X_train, y_train = make_imbalance(XX, train_dataset.grade_, ratio=ratio)

    logging.debug('feature importance & feature selection')
    clf = RandomForestClassifier(criterion='entropy', n_estimators=100,
                                 random_state=1, n_jobs=2)
    clf.fit(X_train, y_train)
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    # print the features sorted by decreasing importance
    for f in range(X_train.shape[1]):
        print(XX.columns[indices[f]], importances[indices[f]])

    logging.debug('save params to pkl file')
    with open(os.path.join(path_config.MODEL_DIR, 'clf.pkl'), "wb") as f:
        cPickle.dump(clf, f)
    return clf
def test_easy_ensemble_classifier_grid_search():
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    parameters = {
        "n_estimators": [1, 2],
        "base_estimator__n_estimators": [3, 4],
    }
    grid_search = GridSearchCV(
        EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()),
        parameters,
        cv=5,
    )
    grid_search.fit(X, y)
def test_error():
    # Test that it gives proper exception on deficient input.
    X, y = make_imbalance(
        iris.data, iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50})
    base = DecisionTreeClassifier()

    # Test n_estimators
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, n_estimators=1.5).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, n_estimators=-1).fit, X, y)

    # Test max_samples
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=-1).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=0.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=2.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=1000).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples="foobar").fit,
                  X, y)

    # Test max_features
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=-1).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=0.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=2.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=5).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features="foobar").fit,
                  X, y)

    # Test support of decision_function
    assert not hasattr(
        BalancedBaggingClassifier(base).fit(X, y), 'decision_function')
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)
    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier()).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_base_estimator():
    # Check base_estimator and its default values.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = EasyEnsembleClassifier(
        2, None, n_jobs=-1, random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.base_estimator_.steps[-1][1],
                      AdaBoostClassifier)

    ensemble = EasyEnsembleClassifier(
        2, AdaBoostClassifier(), n_jobs=-1,
        random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.base_estimator_.steps[-1][1],
                      AdaBoostClassifier)
def sample_data(df_normalized_w_target):
    X = df_normalized_w_target[list(df_normalized_w_target.columns)[7:-1]]
    Y = df_normalized_w_target[list(df_normalized_w_target.columns)[-1]]
    X, y = make_imbalance(
        X, Y,
        sampling_strategy={1: 2700, 2: 2700, 3: 2700, 4: 2700,
                           5: 2700, 6: 2700, 7: 2700},
        random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # print(X_train, X_test, y_train, y_test)
    print('* Data Sampled')
    return (X_train, X_test, y_train, y_test)
def test_make_imbalance_float():
    X_, y_ = assert_warns_message(DeprecationWarning,
                                  "'min_c_' is deprecated in 0.2",
                                  make_imbalance,
                                  X, Y, ratio=0.5, min_c_=1)
    X_, y_ = assert_warns_message(DeprecationWarning,
                                  "'ratio' being a float is deprecated",
                                  make_imbalance,
                                  X, Y, ratio=0.5, min_c_=1)
    assert_equal(Counter(y_), {0: 50, 1: 25, 2: 50})
    # resample without using min_c_
    X_, y_ = make_imbalance(X_, y_, ratio=0.25, min_c_=None)
    assert_equal(Counter(y_), {0: 50, 1: 12, 2: 50})
def check_classifiers_with_encoded_labels(name, classifier):
    # Non-regression test for #709
    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709
    pytest.importorskip("pandas")
    df, y = fetch_openml("iris", version=1, as_frame=True, return_X_y=True)
    df, y = make_imbalance(df, y,
                           sampling_strategy={
                               "Iris-setosa": 30,
                               "Iris-versicolor": 20,
                               "Iris-virginica": 50,
                           })
    classifier.set_params(sampling_strategy={
        "Iris-setosa": 20,
        "Iris-virginica": 20,
    })
    classifier.fit(df, y)
    assert set(classifier.classes_) == set(y.cat.categories.tolist())
    y_pred = classifier.predict(df)
    assert set(y_pred) == set(y.cat.categories.tolist())
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    # Check classification for various parameter settings.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(n_estimators=n_estimators,
                                 base_estimator=base_estimator,
                                 n_jobs=-1,
                                 random_state=RND_SEED)
    eec.fit(X_train, y_train).score(X_test, y_test)
    assert len(eec.estimators_) == n_estimators
    for est in eec.estimators_:
        assert (len(est.named_steps['classifier']) ==
                base_estimator.n_estimators)

    # test the different prediction functions
    eec.predict(X_test)
    eec.predict_proba(X_test)
    eec.predict_log_proba(X_test)
    eec.decision_function(X_test)
def test_bootstrap_samples():
    # Test that bootstrapping samples generates non-perfect base estimators.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    base_estimator = DecisionTreeClassifier().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    # disable the resampling by passing an empty dictionary.
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        sampling_strategy={},
        random_state=0,
    ).fit(X_train, y_train)
    assert ensemble.score(X_train, y_train) == base_estimator.score(
        X_train, y_train)

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0,
    ).fit(X_train, y_train)
    assert ensemble.score(X_train, y_train) < base_estimator.score(
        X_train, y_train)
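# The test above leans on a corner of the API worth calling out: an empty
# dict as ``sampling_strategy`` targets no class at all, so the internal
# random undersampling step becomes a no-op and BalancedBaggingClassifier
# degrades to plain bagging. A sketch (assuming X_train/y_train as above):
plain_bagging = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    sampling_strategy={},  # no class is resampled
    n_estimators=5,
    random_state=0,
).fit(X_train, y_train)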
def test_balanced_bagging_classifier():
    # Check classification for various parameter settings.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [1, 2, 4],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})
    for base_estimator in [None,
                           DummyClassifier(),
                           Perceptron(),
                           DecisionTreeClassifier(),
                           KNeighborsClassifier(),
                           SVC()]:
        for params in grid:
            BalancedBaggingClassifier(
                base_estimator=base_estimator,
                random_state=0,
                **params).fit(X_train, y_train).predict(X_test)
def check_classifiers_with_encoded_labels(name, classifier_orig):
    # Non-regression test for #709
    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709
    pd = pytest.importorskip("pandas")
    classifier = clone(classifier_orig)
    iris = load_iris(as_frame=True)
    df, y = iris.data, iris.target
    y = pd.Series(iris.target_names[iris.target], dtype="category")
    df, y = make_imbalance(
        df,
        y,
        sampling_strategy={
            "setosa": 30,
            "versicolor": 20,
            "virginica": 50,
        },
    )
    classifier.set_params(sampling_strategy={"setosa": 20, "virginica": 20})
    classifier.fit(df, y)
    assert set(classifier.classes_) == set(y.cat.categories.tolist())
    y_pred = classifier.predict(df)
    assert set(y_pred) == set(y.cat.categories.tolist())
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for base_estimator in [DecisionTreeClassifier(), SVC(gamma="scale")]:
        clf = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0,
        ).fit(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        with pytest.warns(UserWarning):
            BalancedBaggingClassifier(
                base_estimator=base_estimator,
                n_estimators=1,
                bootstrap=True,
                oob_score=True,
                random_state=0,
            ).fit(X_train, y_train)
def create_imbalance(X, y, min_class, maj_class, imb_ratio, verbose=True):
    """Artificially create an imbalance in (balanced) data."""
    # Gather samples of each class when the original total number of samples
    # is unknown (e.g. 12500 for IMDB).
    X_min, X_maj = [], []
    for i, value in enumerate(y):
        if value in min_class:
            X_min.append(X[i])
        if value in maj_class:
            X_maj.append(X[i])
    maj_cardinality = len(X_maj)  # samples of the majority class
    # desired number of minority samples for the ratio imb_ratio
    min_count = int(maj_cardinality * imb_ratio)

    # Flatten images to 2D since 'make_imbalance' expects X to be a 2d-array.
    X_orig = X
    if len(X.shape) > 2:
        X = X.reshape(X.shape[0], -1)
    X_res, y_res = make_imbalance(
        X, y,
        sampling_strategy={min_class[0]: min_count,
                           maj_class[0]: maj_cardinality},
        random_state=42, verbose=True)
    # Reshape back to the original shape; check X_orig here, since X itself
    # was already flattened to 2D above.
    if len(X_orig.shape) > 2:
        X_res = X_res.reshape(X_res.shape[0], X_orig.shape[1],
                              X_orig.shape[2], X_orig.shape[3])

    if verbose:
        print("min_class is: ", min_class)
        print("maj_class is: ", maj_class)
        print('Distribution before imbalancing: {}'.format(Counter(y)))
        print('Distribution after imbalancing: {}'.format(Counter(y_res)))
    return X_res, y_res
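# A usage sketch for create_imbalance (hypothetical 4-D image data;
# min_class/maj_class are passed as single-element lists because the helper
# indexes them with [0]):
import numpy as np

rng = np.random.RandomState(42)
X_img = rng.rand(200, 28, 28, 1)  # 200 fake 28x28 grayscale images
y_img = np.repeat([0, 1], 100)    # perfectly balanced labels

# Keep all of class 1 and retain only 10% as many samples of class 0.
X_res, y_res = create_imbalance(X_img, y_img,
                                min_class=[0], maj_class=[1],
                                imb_ratio=0.1)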
def test_bootstrap_features():
    # Test that bootstrapping features may generate duplicate features.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=False,
        random_state=0,
    ).fit(X_train, y_train)
    for features in ensemble.estimators_features_:
        assert np.unique(features).shape[0] == X.shape[1]

    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=True,
        random_state=0,
    ).fit(X_train, y_train)
    unique_features = [
        np.unique(features).shape[0]
        for features in ensemble.estimators_features_
    ]
    assert np.median(unique_features) < X.shape[1]
def begin():
    # Import Dataset
    dataset = pd.read_csv('CSV/CTG.csv')

    # Pre-processing data
    dataset = pp.clean_nan(dataset)
    print(dataset.shape)
    X, y = pp.split_iv_dv(dataset=dataset, exclude=(0, 1, 2, 39))
    print(pp.get_balance(y))

    # Making dataset imbalanced
    from imblearn.datasets import make_imbalance
    X_resampled, y_resampled = make_imbalance(X, y, ratio=0.05, min_c_=3,
                                              random_state=0)
    print('Synthetic generation:\n', pp.get_balance(y_resampled))

    X_csv = pd.DataFrame(X_resampled)
    y_csv = pd.DataFrame(y_resampled)
    dataframe = pd.concat((X_csv, y_csv), axis=1)
    dataframe.columns = ['b', 'e', 'LBE', 'LB', 'AC', 'FM', 'UC', 'ASTV',
                         'MSTV', 'ALTV', 'MLTV', 'DL', 'DS', 'DP', 'DR',
                         'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode',
                         'Mean', 'Median', 'Variance', 'Tendency', 'A', 'B',
                         'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP',
                         'CLASS', 'NSP']
    dataframe.to_csv('CTG_imb.csv', index=False)
    return 0
def data():
    iris = load_iris()
    X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
    y = to_categorical(y, 3)
    return X, y
###############################################################################
# We will show how to use the parameter ``ratio`` with the ``make_imbalance``
# function. This parameter accepts both a dictionary and a callable. When
# using a dictionary, each key corresponds to a class of interest and the
# associated value is the number of samples desired in that class.

iris = load_iris()

print('Information of the original iris data set: \n {}'.format(
    Counter(iris.target)))
plot_pie(iris.target)

ratio = {0: 10, 1: 20, 2: 30}
X, y = make_imbalance(iris.data, iris.target, ratio=ratio)

print('Information of the iris data set after making it'
      ' imbalanced using a dict: \n ratio={} \n y: {}'.format(ratio,
                                                              Counter(y)))
plot_pie(y)

###############################################################################
# You might need more flexibility and want your own heuristic to determine
# the number of samples per class, in which case you can define your own
# callable as follows. Here we define a function which uses a float
# multiplier per class to compute the number of samples to keep.


def ratio_multiplier(y):
    multiplier = {0: 0.5, 1: 0.7, 2: 0.95}
    # NOTE: the original snippet is truncated at this point; the body below
    # is a plausible completion that scales each class count by its
    # multiplier, matching the description in the comment above.
    target_stats = Counter(y)
    for key, value in target_stats.items():
        target_stats[key] = int(value * multiplier[key])
    return target_stats
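# With the callable defined above, the gallery example presumably continues
# by passing it directly as ``ratio``; a sketch of that call:
X, y = make_imbalance(iris.data, iris.target, ratio=ratio_multiplier)
print('Information of the iris data set after making it'
      ' imbalanced using a callable: \n y: {}'.format(Counter(y)))
plot_pie(y)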
import pytest

from sklearn.datasets import load_iris

keras = pytest.importorskip('keras')
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import NearMiss
from imblearn.keras import BalancedBatchGenerator
from imblearn.keras import balanced_batch_generator

iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
y = to_categorical(y, 3)


def _build_keras_model(n_classes, n_features):
    model = Sequential()
    model.add(Dense(n_classes, input_dim=n_features, activation='softmax'))
    model.compile(optimizer='sgd', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def test_balanced_batch_generator_class_no_return_indices():
    with pytest.raises(ValueError, match='needs to return the indices'):
        BalancedBatchGenerator(X, y, sampler=ClusterCentroids(),
                               batch_size=10)
from collections import Counter

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset and make it imbalanced
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target,
                      ratio={0: 25, 1: 50, 2: 50},
                      random_state=0)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)
print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10)

# A 2x3 grid of subplots, flattened into a list for easy indexing
f, axs = plt.subplots(2, 3)
axs = [a for ax in axs for a in ax]

axs[0].scatter(X[y == 0, 0], X[y == 0, 1], label="Class #0",
               alpha=0.5, edgecolor=almost_black, facecolor=palette[0],
               linewidth=0.15)
axs[0].scatter(X[y == 1, 0], X[y == 1, 1], label="Class #1",
               alpha=0.5, edgecolor=almost_black, facecolor=palette[2],
               linewidth=0.15)
axs[0].set_title('Original set')

ratios = [0.9, 0.75, 0.5, 0.25, 0.1]
for i, ratio in enumerate(ratios, start=1):
    ax = axs[i]
    X_, y_ = make_imbalance(X, y, ratio=ratio, min_c_=1)
    ax.scatter(X_[y_ == 0, 0], X_[y_ == 0, 1], label="Class #0",
               alpha=0.5, edgecolor=almost_black, facecolor=palette[0],
               linewidth=0.15)
    ax.scatter(X_[y_ == 1, 0], X_[y_ == 1, 1], label="Class #1",
               alpha=0.5, edgecolor=almost_black, facecolor=palette[2],
               linewidth=0.15)
    ax.set_title('make_imbalance ratio ({})'.format(ratio))

plt.show()
sns.set()

# Define some colors for the plotting
almost_black = '#262626'
palette = sns.color_palette()

# Generate the dataset
X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10)

f, axs = plt.subplots(1, 2)

# Original
axs[0].scatter(X[y == 0, 0], X[y == 0, 1], label="Class #0",
               alpha=0.5, facecolor=palette[0], linewidth=0.15)
axs[0].scatter(X[y == 1, 0], X[y == 1, 1], label="Class #1",
               alpha=0.5, facecolor=palette[2], linewidth=0.15)

# Make imbalance
X_, y_ = make_imbalance(X, y, ratio=0.5, min_c_=1)
X_0, y_0 = make_imbalance(X, y, ratio=0.5, min_c_=0)

# After making imbalance
axs[1].scatter(X_[y_ == 0, 0], X_[y_ == 0, 1], label="Class #0",
               alpha=0.5, facecolor=palette[0], linewidth=0.15)
axs[1].scatter(X_[y_ == 1, 0], X_[y_ == 1, 1], label="Class #1",
               alpha=0.5, facecolor=palette[2], linewidth=0.15)

plt.show()