Ejemplo n.º 1
0
def test_make_imbalance_dict():
    """Check ``make_imbalance`` with a dict-valued ``sampling_strategy``."""
    # Every class is listed: the resampled counts must match the request.
    requested = {0: 10, 1: 20, 2: 30}
    _, y_res = make_imbalance(X, Y, sampling_strategy=requested)
    assert Counter(y_res) == requested

    # Class 2 is not listed: the test expects it to come back with 50 samples.
    requested = {0: 10, 1: 20}
    _, y_res = make_imbalance(X, Y, sampling_strategy=requested)
    assert Counter(y_res) == {0: 10, 1: 20, 2: 50}
Ejemplo n.º 2
0
def test_make_imbalance_dict():
    """Check ``make_imbalance`` with a dict passed through ``ratio``."""
    # Every class is listed: the resampled counts must match the request.
    target_counts = {0: 10, 1: 20, 2: 30}
    _, y_res = make_imbalance(X, Y, ratio=target_counts)
    assert Counter(y_res) == target_counts

    # Class 2 is not listed: the test expects it to come back with 50 samples.
    target_counts = {0: 10, 1: 20}
    _, y_res = make_imbalance(X, Y, ratio=target_counts)
    assert Counter(y_res) == {0: 10, 1: 20, 2: 50}
Ejemplo n.º 3
0
def test_make_imbalance_ratio():
    """Check that the ``ratio`` keyword is still accepted."""
    # Full specification: counts must equal the requested dictionary.
    wanted = {0: 10, 1: 20, 2: 30}
    _, y_res = make_imbalance(X, Y, ratio=wanted)
    assert Counter(y_res) == wanted

    # Partial specification: class 2 is expected to keep 50 samples.
    wanted = {0: 10, 1: 20}
    _, y_res = make_imbalance(X, Y, ratio=wanted)
    assert Counter(y_res) == {0: 10, 1: 20, 2: 50}
Ejemplo n.º 4
0
def test_balanced_bagging_classifier():
    """Smoke-test fit/predict for every base estimator and parameter combo."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    param_grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False]
    })
    candidates = [
        None,
        DummyClassifier(),
        Perceptron(max_iter=1000, tol=1e-3),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(gamma='scale'),
    ]

    # No assertion on the predictions: the test only requires that each
    # combination fits and predicts without raising.
    for candidate in candidates:
        for settings in param_grid:
            model = BalancedBaggingClassifier(
                base_estimator=candidate, random_state=0, **settings)
            fitted = model.fit(X_train, y_train)
            fitted.predict(X_test)
def test_make_imbalance_5():
    """Test make_imbalance with ``ratio=0.01`` and ``min_c_=0``."""
    X_res, y_res = make_imbalance(X, Y, ratio=0.01, min_c_=0)
    class_counts = Counter(y_res)
    assert_equal(class_counts[1], 500)
    assert_equal(class_counts[0], 5)
    # Each resampled row is checked for membership in the original X.
    rows_found = [row in X for row in X_res]
    assert np.all(rows_found)
Ejemplo n.º 6
0
def test_bootstrap_features():
    """Check that bootstrapping features may generate duplicate features."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    n_features = X.shape[1]

    # Without feature bootstrapping every estimator sees each feature once.
    plain = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=False,
        random_state=0)
    plain = plain.fit(X_train, y_train)
    for picked in plain.estimators_features_:
        assert np.unique(picked).shape[0] == n_features

    # With feature bootstrapping the median number of distinct features per
    # estimator must drop below the total.
    bootstrapped = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=True,
        random_state=0)
    bootstrapped = bootstrapped.fit(X_train, y_train)
    distinct_counts = []
    for picked in bootstrapped.estimators_features_:
        distinct_counts.append(np.unique(picked).shape[0])
    assert np.median(distinct_counts) < n_features
def test_make_imbalance_2():
    """Test make_imbalance with ``ratio=0.25`` and ``min_c_=1``."""
    X_res, y_res = make_imbalance(X, Y, ratio=0.25, min_c_=1)
    class_counts = Counter(y_res)
    assert_equal(class_counts[0], 500)
    assert_equal(class_counts[1], 125)
    # Each resampled row is checked for membership in the original X.
    rows_found = [row in X for row in X_res]
    assert_true(np.all(rows_found))
Ejemplo n.º 8
0
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    """Check that an invalid ``n_estimators`` raises the expected ValueError."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    # Construction stays inside the context manager, as in the original, so
    # an error raised at either step is matched against ``msg_error``.
    with pytest.raises(ValueError, match=msg_error):
        EasyEnsembleClassifier(n_estimators=n_estimators).fit(X, y)
Ejemplo n.º 9
0
def test_bootstrap_samples():
    """Check that bootstrapping samples generates non-perfect estimators."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Reference: a single tree fitted on the full training set.
    single_tree = DecisionTreeClassifier().fit(X_train, y_train)

    # Without bootstrap the ensemble must score like the single tree on the
    # training set. The empty ``ratio`` dict disables the resampling.
    no_bootstrap = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        ratio={},
        random_state=0)
    no_bootstrap = no_bootstrap.fit(X_train, y_train)
    ensemble_score = no_bootstrap.score(X_train, y_train)
    assert ensemble_score == single_tree.score(X_train, y_train)

    # With bootstrap the ensemble must score strictly worse than the single
    # tree on the training set.
    with_bootstrap = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0)
    with_bootstrap = with_bootstrap.fit(X_train, y_train)
    ensemble_score = with_bootstrap.score(X_train, y_train)
    assert ensemble_score < single_tree.score(X_train, y_train)
Ejemplo n.º 10
0
def test_probability():
    """Check that predicted probabilities are normalised and consistent."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    def check_probabilities(model):
        # Rows of predict_proba must sum to one ...
        assert_array_almost_equal(
            np.sum(model.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))
        # ... and agree with exp(predict_log_proba).
        assert_array_almost_equal(
            model.predict_proba(X_test),
            np.exp(model.predict_log_proba(X_test)))

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        tree_ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(), random_state=0)
        check_probabilities(tree_ensemble.fit(X_train, y_train))

        # Degenerate case, where some classes are missing
        tiny_ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(),
            random_state=0,
            max_samples=5)
        check_probabilities(tiny_ensemble.fit(X_train, y_train))
Ejemplo n.º 11
0
def test_oob_score_classification():
    """Check that the OOB score is close to the held-out test score."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for candidate in [DecisionTreeClassifier(), SVC()]:
        large = BalancedBaggingClassifier(
            base_estimator=candidate,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0)
        clf = large.fit(X_train, y_train)

        held_out_score = clf.score(X_test, y_test)
        gap = abs(held_out_score - clf.oob_score_)
        assert gap < 0.1

        # With a single estimator, fitting is expected to emit a UserWarning.
        small = BalancedBaggingClassifier(
            base_estimator=candidate,
            n_estimators=1,
            bootstrap=True,
            oob_score=True,
            random_state=0)
        assert_warns(UserWarning, small.fit, X_train, y_train)
Ejemplo n.º 12
0
def test_base_estimator():
    """Check ``base_estimator`` and the default used when it is ``None``."""
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50}, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # (estimator handed to the ensemble, type expected as the last step of
    # the fitted ``base_estimator_`` pipeline)
    cases = [
        (None, DecisionTreeClassifier),
        (DecisionTreeClassifier(), DecisionTreeClassifier),
        (Perceptron(), Perceptron),
    ]
    for given, expected_type in cases:
        ensemble = BalancedBaggingClassifier(given,
                                             n_jobs=3,
                                             random_state=0)
        ensemble = ensemble.fit(X_train, y_train)
        final_step = ensemble.base_estimator_.steps[-1][1]
        assert isinstance(final_step, expected_type)
Ejemplo n.º 13
0
def test_bagging_with_pipeline():
    """Smoke-test EasyEnsembleClassifier with a pipeline base estimator."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    inner = make_pipeline(SelectKBest(k=1), AdaBoostClassifier())
    estimator = EasyEnsembleClassifier(n_estimators=2, base_estimator=inner)
    # Only requires that fit and predict run without raising.
    fitted = estimator.fit(X, y)
    fitted.predict(X)
Ejemplo n.º 14
0
def test_bagging_with_pipeline():
    """Smoke-test BalancedBaggingClassifier with a pipeline base estimator."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target, ratio=class_counts,
                          random_state=0)
    inner = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BalancedBaggingClassifier(inner, max_features=2)
    # Only requires that fit and predict run without raising.
    fitted = estimator.fit(X, y)
    fitted.predict(X)
Ejemplo n.º 15
0
def test_balanced_batch_generator(sampler):
    """Smoke-test ``balanced_batch_generator`` by training a small TF model.

    Batches drawn from the generator are fed to a single-layer TensorFlow
    graph (built with ``tf.placeholder`` / ``tf.Session``) for a few epochs.
    The training accuracy is printed after each epoch; nothing is asserted
    about its value, so the test only fails if something raises.

    ``sampler`` is forwarded unchanged to ``balanced_batch_generator``
    (presumably supplied by a pytest parametrization outside this view).
    """
    X, y = load_iris(return_X_y=True)
    X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40})
    # Cast to float32 to match the dtype of the ``data`` placeholder below.
    X = X.astype(np.float32)

    batch_size = 10
    training_generator, steps_per_epoch = balanced_batch_generator(
        X, y, sample_weight=None, sampler=sampler,
        batch_size=batch_size, random_state=42)

    learning_rate = 0.01
    epochs = 10
    input_size = X.shape[1]
    output_size = 3

    # helper functions
    def init_weights(shape):
        # Small random-normal initial values wrapped in a trainable variable.
        return tf.Variable(tf.random_normal(shape, stddev=0.01))

    def accuracy(y_true, y_pred):
        # Fraction of rows whose arg-max prediction equals the true label.
        return np.mean(np.argmax(y_pred, axis=1) == y_true)

    # input and output
    data = tf.placeholder("float32", shape=[None, input_size])
    targets = tf.placeholder("int32", shape=[None])

    # build the model and weights
    W = init_weights([input_size, output_size])
    b = init_weights([output_size])
    # NOTE(review): the sigmoid output is later passed as ``logits`` and
    # soft-maxed again for prediction -- confirm this is intended.
    out_act = tf.nn.sigmoid(tf.matmul(data, W) + b)

    # build the loss, predict, and train operator
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=out_act, labels=targets)
    loss = tf.reduce_sum(cross_entropy)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss)
    predict = tf.nn.softmax(out_act)

    # Initialization of all variables in the graph
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for e in range(epochs):
            # One epoch = ``steps_per_epoch`` batches pulled from the generator.
            for i in range(steps_per_epoch):
                X_batch, y_batch = next(training_generator)
                sess.run([train_op, loss],
                         feed_dict={data: X_batch, targets: y_batch})

            # After each epoch, report accuracy on the full (training) data.
            predicts_train = sess.run(predict, feed_dict={data: X})
            print("epoch: {} train accuracy: {:.3f}"
                  .format(e, accuracy(y, predicts_train)))
Ejemplo n.º 16
0
def test_easy_ensemble_classifier_grid_search():
    """Smoke-test a grid search over outer and nested ``n_estimators``."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)

    # ``base_estimator__n_estimators`` targets the wrapped AdaBoostClassifier.
    search_space = {'n_estimators': [1, 2],
                    'base_estimator__n_estimators': [3, 4]}
    ensemble = EasyEnsembleClassifier(base_estimator=AdaBoostClassifier())
    grid_search = GridSearchCV(ensemble, search_space, cv=5, iid=False)
    grid_search.fit(X, y)
Ejemplo n.º 17
0
def test_bagging_with_pipeline():
    """Smoke-test BalancedBaggingClassifier with a pipeline base estimator."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    inner = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BalancedBaggingClassifier(inner, max_features=2)
    # Only requires that fit and predict run without raising.
    fitted = estimator.fit(X, y)
    fitted.predict(X)
Ejemplo n.º 18
0
def test_easy_ensemble_classifier_single_estimator():
    """Compare a one-member ensemble against an equivalent explicit pipeline."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = EasyEnsembleClassifier(n_estimators=1, random_state=0)
    clf1 = ensemble.fit(X_train, y_train)

    # Reference: random under-sampling followed by AdaBoost, same seeds.
    reference = make_pipeline(RandomUnderSampler(random_state=0),
                              AdaBoostClassifier(random_state=0))
    clf2 = reference.fit(X_train, y_train)

    # Both models must predict identically on the test split.
    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
Ejemplo n.º 19
0
def test_easy_ensemble_classifier_single_estimator():
    """Compare a one-member ensemble against an equivalent explicit pipeline."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = EasyEnsembleClassifier(n_estimators=1, random_state=0)
    clf1 = ensemble.fit(X_train, y_train)

    # Reference: random under-sampling followed by AdaBoost, same seeds.
    reference = make_pipeline(RandomUnderSampler(random_state=0),
                              AdaBoostClassifier(random_state=0))
    clf2 = reference.fit(X_train, y_train)

    # Both models must predict identically on the test split.
    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
Ejemplo n.º 20
0
def test_make_imbalance_multiclass():
    """Test make_imbalance with multiclass data"""
    # Three-class float target: 100 zeros, 400 ones, 500 twos.
    multiclass_y = np.repeat([0., 1., 2.], [100, 400, 500])

    # Resample the data
    X_res, y_res = make_imbalance(X, multiclass_y, ratio=0.1, min_c_=0)
    class_counts = Counter(y_res)
    assert_equal(class_counts[0], 90)
    assert_equal(class_counts[1], 400)
    assert_equal(class_counts[2], 500)
    # Each resampled row is checked for membership in the original X.
    rows_found = [row in X for row in X_res]
    assert_true(np.all(rows_found))
Ejemplo n.º 21
0
def test_balanced_batch_generator_function_sparse(keep_sparse):
    """Check that ``keep_sparse`` controls whether batches stay sparse."""
    X, y = load_iris(return_X_y=True)
    X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40})
    X = X.astype(np.float32)

    X_sparse = sparse.csr_matrix(X)
    training_generator, steps_per_epoch = balanced_batch_generator(
        X_sparse, y, keep_sparse=keep_sparse, batch_size=10,
        random_state=42)

    expect_sparse = bool(keep_sparse)
    for _ in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        # The batch must be sparse exactly when ``keep_sparse`` is truthy.
        assert sparse.issparse(X_batch) == expect_sparse
def test_bagging_with_pipeline():
    """Smoke-test EasyEnsembleClassifier with a pipeline base estimator."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    inner = make_pipeline(SelectKBest(k=1), AdaBoostClassifier())
    estimator = EasyEnsembleClassifier(n_estimators=2, base_estimator=inner)
    # Only requires that fit and predict run without raising.
    fitted = estimator.fit(X, y)
    fitted.predict(X)
Ejemplo n.º 23
0
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    """Check that an invalid ``n_estimators`` raises the expected ValueError."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    # Construction stays inside the context manager, as in the original, so
    # an error raised at either step is matched against ``msg_error``.
    with pytest.raises(ValueError, match=msg_error):
        EasyEnsembleClassifier(n_estimators=n_estimators).fit(X, y)
Ejemplo n.º 24
0
def main(algoritmo, df, modelo, balance):
    """Train a tweet classifier for the ``modelo`` label and persist it.

    Parameters
    ----------
    algoritmo
        Forwarded to ``correr_modelo`` to select the model to train.
    df
        DataFrame containing a ``'tweet'`` column and the ``modelo`` column.
    modelo : str
        Name of the label column to predict.
    balance
        When equal to 1, train on data in which every class has been
        undersampled to the size of the smallest class; otherwise train on
        the data as is.

    Side effects
    ------------
    Writes ``modelo_temas.joblib``, ``vocabulario_temas.pkl`` and
    ``temas.json`` under ``<cwd>/assets/pys``.
    """
    # Keep only the tweet text and the label, and drop rows whose label is
    # empty or missing. The copy gives an independent frame for the column
    # assignments below (avoids pandas' SettingWithCopyWarning).
    df = df[['tweet', modelo]]
    df = df[df[modelo] != '']
    df = df[pd.notnull(df[modelo])].copy()

    # Encode the labels as integer category codes and remember the
    # code -> label-name mapping so it can be stored next to the model.
    df['categoria'] = df[modelo].astype('category')
    df[modelo] = df['categoria'].cat.codes.astype('int')
    temas = df[['categoria', modelo]].set_index(modelo).to_dict()

    # Train/test split, optionally on class-balanced data. The resampling is
    # only performed when it is actually requested.
    if balance == 1:
        conteos = df[modelo].value_counts()
        muestra = conteos.min()
        X, _ = make_imbalance(
            df,
            df[modelo],
            sampling_strategy={clase: muestra for clase in conteos.index},
            random_state=0)
        textos, etiquetas = X['tweet'], X[modelo]
    else:
        textos, etiquetas = df['tweet'], df[modelo]
    X_train, X_test, y_train, y_test = train_test_split(textos,
                                                        etiquetas,
                                                        random_state=0)

    # Bag-of-words counts followed by TF-IDF weighting.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = correr_modelo(algoritmo, X_train_tfidf, y_train)
    cwd = os.getcwd()

    # Persist the model, the fitted vocabulary and the label mapping.
    dump(clf, cwd + '/assets/pys/modelo_temas.joblib')
    with open(cwd + "/assets/pys/vocabulario_temas.pkl", "wb") as fp:
        pickle.dump(count_vect.vocabulary_, fp)
    with open(cwd + '/assets/pys/temas.json', 'w') as fp:
        json.dump(temas, fp)
def test_make_imbalance_error():
    """Check the ValueError messages raised for invalid ``ratio`` values."""
    # we are reusing part of utils.check_ratio, however this is not cover in
    # the common tests so we will repeat it here
    single_class_y = np.zeros((X.shape[0], ))

    # (target passed to make_imbalance, ratio, expected message fragment)
    cases = [
        (Y, {0: -100, 1: 50, 2: 50}, "in a class cannot be negative"),
        (Y, {0: 10, 1: 70}, "should be less or equal to the original"),
        (single_class_y, {0: 10}, "needs to have more than 1 class."),
        (Y, 'random-string', "has to be a dictionary or a function"),
    ]
    for target, bad_ratio, message in cases:
        with raises(ValueError, match=message):
            make_imbalance(X, target, bad_ratio)
def test_make_imbalance_multiclass():
    """Test make_imbalance with multiclass data"""
    # Three-class float target: 100 zeros, 400 ones, 500 twos.
    multiclass_y = np.repeat([0., 1., 2.], [100, 400, 500])

    # Resample the data
    X_res, y_res = make_imbalance(X, multiclass_y, ratio=0.1, min_c_=0)
    class_counts = Counter(y_res)
    assert_equal(class_counts[0], 90)
    assert_equal(class_counts[1], 400)
    assert_equal(class_counts[2], 500)
    # Each resampled row is checked for membership in the original X.
    rows_found = [row in X for row in X_res]
    assert np.all(rows_found)
Ejemplo n.º 27
0
def test_probability():
    """Check that predicted probabilities are normalised and consistent."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    def check_probabilities(model):
        # Rows of predict_proba must sum to one ...
        assert_array_almost_equal(
            np.sum(model.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))
        # ... and agree with exp(predict_log_proba).
        assert_array_almost_equal(
            model.predict_proba(X_test),
            np.exp(model.predict_log_proba(X_test)))

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        tree_ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(), random_state=0)
        check_probabilities(tree_ensemble.fit(X_train, y_train))

        # Degenerate case, where some classes are missing
        logistic = LogisticRegression(solver="lbfgs", multi_class="auto")
        tiny_ensemble = BalancedBaggingClassifier(
            base_estimator=logistic, random_state=0, max_samples=5)
        tiny_ensemble.fit(X_train, y_train)
        check_probabilities(tiny_ensemble)
Ejemplo n.º 28
0
def test_balanced_bagging_classifier(base_estimator, params):
    """Smoke-test fit/predict for one base estimator and parameter set."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    model = BalancedBaggingClassifier(
        base_estimator=base_estimator, random_state=0, **params)
    # Only requires that fit and predict run without raising.
    fitted = model.fit(X_train, y_train)
    fitted.predict(X_test)
Ejemplo n.º 29
0
def test_balanced_bagging_classifier_error(params):
    """Check that deficient parameters make ``fit`` raise ValueError."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts)
    base = DecisionTreeClassifier()

    invalid_clf = BalancedBaggingClassifier(base_estimator=base, **params)
    with pytest.raises(ValueError):
        invalid_clf.fit(X, y)

    # A tree-based ensemble must not expose ``decision_function``.
    fitted_default = BalancedBaggingClassifier(base).fit(X, y)
    assert not hasattr(fitted_default, "decision_function")
Ejemplo n.º 30
0
def test_error():
    """Check that deficient parameters make ``fit`` raise ValueError."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts)
    base = DecisionTreeClassifier()

    # Each entry is one invalid keyword setting, in the original order.
    invalid_settings = [
        # n_estimators
        {"n_estimators": 1.5},
        {"n_estimators": -1},
        # max_samples
        {"max_samples": -1},
        {"max_samples": 0.0},
        {"max_samples": 2.0},
        {"max_samples": 1000},
        {"max_samples": "foobar"},
        # max_features
        {"max_features": -1},
        {"max_features": 0.0},
        {"max_features": 2.0},
        {"max_features": 5},
        {"max_features": "foobar"},
    ]
    for kwargs in invalid_settings:
        clf = BalancedBaggingClassifier(base, **kwargs)
        assert_raises(ValueError, clf.fit, X, y)

    # A tree-based ensemble must not expose ``decision_function``.
    fitted_default = BalancedBaggingClassifier(base).fit(X, y)
    assert not hasattr(fitted_default, 'decision_function')
Ejemplo n.º 31
0
 def TrirandomUnderSampling(self, X, y):
     """Undersample every class in ``y`` to the size of the rarest class.

     Parameters
     ----------
     X : samples, forwarded unchanged to ``make_imbalance``.
     y : numpy array of class labels.

     Returns
     -------
     (Xres, yres) : the resampled samples and labels returned by
     ``make_imbalance``.
     """
     unique_elements, counts_elements = np.unique(y, return_counts=True)
     minVal = counts_elements.min()
     # Key the strategy by the labels actually present in ``y``. Iterating
     # over ``range(number_of_classes)`` skipped any label outside
     # 0..k-1 (e.g. labels {1, 2, 3}), so that class was left out of the
     # strategy and never reduced.
     sample_strategy = {
         label: minVal
         for label in unique_elements.tolist()
     }
     Xres, yres = make_imbalance(X, y, sample_strategy)
     return Xres, yres
Ejemplo n.º 32
0
def test_pipeline_score_samples_pca_lof():
    """Check ``score_samples`` on a sampler -> PCA -> LOF pipeline.

    The pipeline's ``score_samples`` output must have one value per sample
    and must match applying the PCA transform and the LOF ``score_samples``
    step by step.
    """
    X, y = load_iris(return_X_y=True)
    sampling_strategy = {0: 50, 1: 30, 2: 20}
    X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy)
    # Test that the score_samples method is implemented on a pipeline.
    # Test that the score_samples method on pipeline yields same results as
    # applying transform and score_samples steps separately.
    rus = RandomUnderSampler()
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([('rus', rus), ('pca', pca), ('lof', lof)])
    pipe.fit(X, y)
    # Check the shapes
    assert pipe.score_samples(X).shape == (X.shape[0], )
    # Check the values
    # NOTE(review): ``pca`` and ``lof`` are refitted here on the full X,
    # without the under-sampling step; the comparison below presumably relies
    # on the pipeline holding these same instances -- confirm before
    # reordering any of these statements.
    lof.fit(pca.fit_transform(X))
    assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
Ejemplo n.º 33
0
def test_balanced_batch_generator_function_sparse(keep_sparse):
    """Check that ``keep_sparse`` controls whether batches stay sparse."""
    X, y = load_iris(return_X_y=True)
    X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40})
    X = X.astype(np.float32)

    X_sparse = sparse.csr_matrix(X)
    training_generator, steps_per_epoch = balanced_batch_generator(
        X_sparse, y, keep_sparse=keep_sparse, batch_size=10,
        random_state=42)

    expect_sparse = bool(keep_sparse)
    for _ in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        # The batch must be sparse exactly when ``keep_sparse`` is truthy.
        assert sparse.issparse(X_batch) == expect_sparse
Ejemplo n.º 34
0
def random_forest(df_normalized_w_target):
    """Fit a random forest on class-balanced data and report its accuracy.

    Parameters
    ----------
    df_normalized_w_target
        DataFrame whose last column is the target; columns 7 up to (but not
        including) the last are used as candidate features.

    Returns
    -------
    (rf_train_acc, rf_test_acc) : accuracy on the training and test splits.

    Shapes, class statistics, the confusion matrix and the classification
    report are printed along the way.
    """
    columns = list(df_normalized_w_target.columns)
    X = df_normalized_w_target[columns[7:-1]]
    print("X Shape", X.shape)
    Y = df_normalized_w_target[columns[-1]]
    print("Y Shape", Y.shape)

    # Hard-coded positional subset of the candidate features
    # (presumably chosen by permutation feature importance -- see the name).
    perm_feat_imp = X.iloc[:, [0, 5, 9, 3, 12, 13, 4, 23, 7, 10, 16, 6, 52]]
    print("Perm Feat Impt Shape", perm_feat_imp.shape)

    # Request 2700 samples for each of the classes 1..7.
    X, y = make_imbalance(perm_feat_imp,
                          Y,
                          sampling_strategy={
                              label: 2700
                              for label in range(1, 8)
                          },
                          random_state=42)
    X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
        X, y, random_state=42)
    print('Training target statistics: {}'.format(Counter(y_train_rf)))
    print('Testing target statistics: {}'.format(Counter(y_test_rf)))

    rfc = RandomForestClassifier(n_estimators=100)
    rfc = rfc.fit(X_train_rf, y_train_rf)

    # Predict each split once and reuse the result everywhere below.
    rfc_pred = rfc.predict(X_test_rf)
    print("rfc pred shape", rfc_pred.shape)
    print("y pred rf shape", rfc_pred.shape)
    print("y train rf shape", y_train_rf.shape)
    # This line used to be labelled "y train rf shape" although it prints
    # the shape of X_train_rf.
    print("X train rf shape", X_train_rf.shape)

    rf_train_acc = metrics.accuracy_score(y_train_rf, rfc.predict(X_train_rf))
    rf_test_acc = metrics.accuracy_score(y_test_rf, rfc_pred)
    print("Random Forest Train Accuracy:", rf_train_acc)
    print("Random Forest Test Accuracy:", rf_test_acc)
    print(confusion_matrix(y_test_rf, rfc_pred))
    print(classification_report(y_test_rf, rfc_pred))
    return (rf_train_acc, rf_test_acc)
Ejemplo n.º 35
0
def training(train_dataset):
    """Train a random forest on ``train_dataset`` and pickle it.

    Parameters
    ----------
    train_dataset
        DataFrame holding the feature columns, the ``grade_`` target and the
        descriptive columns that are dropped before training.

    Returns
    -------
    The fitted ``RandomForestClassifier``.

    Side effects: prints every feature with its importance (most important
    first) and writes the model to ``clf.pkl`` in ``path_config.MODEL_DIR``.
    """
    logging.debug("func called training")
    XX = train_dataset.drop([
        "grade", "evaluat_desc", 'game_id', "Description", "shortDesc",
        "UpdateDescription", "subject", "game_tags", "Type", "game_feature",
        "game_key", "main_play", "play_key", "game_play_way", "playway",
        "gamefeature", "grade_", "gametype"
    ],
                            axis=1)

    feature = XX.columns

    X_train, y_train = train_dataset[feature], train_dataset["grade_"]

    logging.debug('balancing dataset')

    # Number of samples to keep per grade: grade 2 keeps 90% of its samples,
    # every other grade present in the data keeps all of them.
    keep_fraction = {2: 0.9}
    present_grades = set(y_train)
    ratio = {}
    for grade in range(1, 5):
        if grade in present_grades:
            n_grade = len(y_train[y_train == grade])
            ratio[grade] = int(round(n_grade * keep_fraction.get(grade, 1)))
    X_train, y_train = make_imbalance(XX, train_dataset.grade_, ratio=ratio)

    logging.debug('feature importance & feature selection')
    clf = RandomForestClassifier(criterion='entropy',
                                 n_estimators=100,
                                 random_state=1,
                                 n_jobs=2)
    clf.fit(X_train, y_train)
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    # Index the column names with the same sorted index as the importances;
    # previously names were taken in original column order while the values
    # were sorted, so each name was printed next to another feature's score.
    for idx in indices:
        print(XX.columns[idx], importances[idx])

    logging.debug('save params to pkl file')
    with open(os.path.join(path_config.MODEL_DIR, 'clf.pkl'), "wb") as f:
        cPickle.dump(clf, f)

    return clf
Ejemplo n.º 36
0
def test_easy_ensemble_classifier_grid_search():
    """Smoke-test a grid search over outer and nested ``n_estimators``."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)

    # ``base_estimator__n_estimators`` targets the wrapped AdaBoostClassifier.
    search_space = {"n_estimators": [1, 2],
                    "base_estimator__n_estimators": [3, 4]}
    ensemble = EasyEnsembleClassifier(base_estimator=AdaBoostClassifier())
    grid_search = GridSearchCV(ensemble, search_space, cv=5)
    grid_search.fit(X, y)
Ejemplo n.º 37
0
def test_error():
    """Check that deficient parameters make ``fit`` raise ValueError."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts)
    base = DecisionTreeClassifier()

    # Invalid values per parameter, checked in the original order.
    invalid_values = [
        ("n_estimators", [1.5, -1]),
        ("max_samples", [-1, 0.0, 2.0, 1000, "foobar"]),
        ("max_features", [-1, 0.0, 2.0, 5, "foobar"]),
    ]
    for param_name, bad_values in invalid_values:
        for bad_value in bad_values:
            clf = BalancedBaggingClassifier(base, **{param_name: bad_value})
            assert_raises(ValueError, clf.fit, X, y)

    # A tree-based ensemble must not expose ``decision_function``.
    fitted_default = BalancedBaggingClassifier(base).fit(X, y)
    assert not hasattr(fitted_default, 'decision_function')
Ejemplo n.º 38
0
def test_single_estimator():
    """Check singleton ensembles.

    A one-estimator ``BalancedBaggingClassifier`` (no bootstrap of samples or
    features) must predict the same labels as a pipeline made of a
    ``RandomUnderSampler`` seeded like the ensemble's inner sampler followed
    by a ``KNeighborsClassifier``.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        ratio={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    bagging = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    )
    bagging.fit(X_train, y_train)

    # Reuse the random state of the sampler (first pipeline step) of the
    # single fitted estimator.
    sampler_seed = bagging.estimators_[0].steps[0][1].random_state
    reference = make_pipeline(
        RandomUnderSampler(random_state=sampler_seed),
        KNeighborsClassifier(),
    )
    reference.fit(X_train, y_train)

    assert_array_equal(bagging.predict(X_test), reference.predict(X_test))
Ejemplo n.º 39
0
def test_base_estimator():
    """Check ``base_estimator`` and its default value.

    Whether ``None`` or an explicit ``AdaBoostClassifier`` is given, the last
    step of the fitted ``base_estimator_`` pipeline must be an
    ``AdaBoostClassifier``.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for candidate in (None, AdaBoostClassifier()):
        ensemble = EasyEnsembleClassifier(
            2, candidate, n_jobs=-1, random_state=0)
        ensemble.fit(X_train, y_train)

        final_step = ensemble.base_estimator_.steps[-1][1]
        assert isinstance(final_step, AdaBoostClassifier)
def test_single_estimator():
    """Check singleton ensembles.

    The predictions of a ``BalancedBaggingClassifier`` holding exactly one
    k-nearest-neighbours estimator are compared with those of an explicit
    ``RandomUnderSampler`` + ``KNeighborsClassifier`` pipeline that uses the
    same sampler random state.
    """
    resampling = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target, ratio=resampling,
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    )
    clf1.fit(X_train, y_train)

    # The first step of the only fitted estimator is the under-sampler.
    inner_sampler = clf1.estimators_[0].steps[0][1]
    clf2 = make_pipeline(
        RandomUnderSampler(random_state=inner_sampler.random_state),
        KNeighborsClassifier(),
    )
    clf2.fit(X_train, y_train)

    expected = clf2.predict(X_test)
    assert_array_equal(clf1.predict(X_test), expected)
Ejemplo n.º 41
0
def test_base_estimator():
    """Check ``base_estimator`` and its default value.

    Fits an ``EasyEnsembleClassifier`` once with ``None`` and once with an
    ``AdaBoostClassifier`` instance, and asserts each time that the final
    step of ``base_estimator_`` is an ``AdaBoostClassifier``.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    def last_step_of(fitted):
        # ``base_estimator_`` is a pipeline; return its final estimator.
        return fitted.base_estimator_.steps[-1][1]

    # Default base estimator
    default_ensemble = EasyEnsembleClassifier(
        2, None, n_jobs=-1, random_state=0)
    default_ensemble.fit(X_train, y_train)
    assert isinstance(last_step_of(default_ensemble), AdaBoostClassifier)

    # Explicit base estimator
    explicit_ensemble = EasyEnsembleClassifier(
        2, AdaBoostClassifier(), n_jobs=-1, random_state=0)
    explicit_ensemble.fit(X_train, y_train)
    assert isinstance(last_step_of(explicit_ensemble), AdaBoostClassifier)
Ejemplo n.º 42
0
def sample_data(df_normalized_w_target):
    """Resample the frame to 2700 rows per class and split it.

    The features are the columns from position 7 up to (but excluding) the
    last one; the target is the last column. Classes 1 to 7 are each sampled
    to 2700 rows with ``make_imbalance`` before a train/test split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    column_names = list(df_normalized_w_target.columns)
    features = df_normalized_w_target[column_names[7:-1]]
    target = df_normalized_w_target[column_names[-1]]

    # Same number of samples requested for every class label 1..7.
    per_class = {label: 2700 for label in range(1, 8)}
    X, y = make_imbalance(features,
                          target,
                          sampling_strategy=per_class,
                          random_state=42)

    splits = train_test_split(X, y, random_state=42)
    X_train, X_test, y_train, y_test = splits
    print('* Data Sampled')
    return (X_train, X_test, y_train, y_test)
Ejemplo n.º 43
0
def test_make_imbalance_float():
    """Check ``make_imbalance`` with a float ``ratio`` and ``min_c_``.

    The same call must emit both deprecation warnings (one about ``min_c_``,
    one about a float ``ratio``). The class counts are then verified, and a
    second resampling without ``min_c_`` is checked as well.
    """
    expected_messages = (
        "'min_c_' is deprecated in 0.2",
        "'ratio' being a float is deprecated",
    )
    for message in expected_messages:
        X_, y_ = assert_warns_message(
            DeprecationWarning, message,
            make_imbalance, X, Y, ratio=0.5, min_c_=1)
    assert_equal(Counter(y_), {0: 50, 1: 25, 2: 50})

    # resample without using min_c_
    X_, y_ = make_imbalance(X_, y_, ratio=0.25, min_c_=None)
    assert_equal(Counter(y_), {0: 50, 1: 12, 2: 50})
Ejemplo n.º 44
0
def check_classifiers_with_encoded_labels(name, classifier):
    """Non-regression check for string-labelled targets (issue #709).

    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709

    The iris frame fetched from OpenML is resampled, ``classifier`` is
    configured with a string-keyed ``sampling_strategy`` and fitted, and both
    ``classes_`` and the predictions must cover exactly the categories of the
    target.

    Note that ``classifier`` itself is reconfigured through ``set_params``
    and fitted in place.
    """
    pytest.importorskip("pandas")
    df, y = fetch_openml("iris", version=1, as_frame=True, return_X_y=True)

    dataset_counts = {
        "Iris-setosa": 30,
        "Iris-versicolor": 20,
        "Iris-virginica": 50,
    }
    df, y = make_imbalance(df, y, sampling_strategy=dataset_counts)

    classifier.set_params(
        sampling_strategy={"Iris-setosa": 20, "Iris-virginica": 20})
    classifier.fit(df, y)

    expected_labels = set(y.cat.categories.tolist())
    assert set(classifier.classes_) == expected_labels
    y_pred = classifier.predict(df)
    assert set(y_pred) == expected_labels
Ejemplo n.º 45
0
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Check classification for various parameter settings.

    Fits an ``EasyEnsembleClassifier`` with the given ``n_estimators`` and
    ``base_estimator``, verifies the number of fitted estimators and the
    length of each inner ``'classifier'`` step, then calls every prediction
    method once.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    eec.fit(X_train, y_train)
    eec.score(X_test, y_test)

    assert len(eec.estimators_) == n_estimators
    for fitted_pipeline in eec.estimators_:
        inner_classifier = fitted_pipeline.named_steps['classifier']
        assert len(inner_classifier) == base_estimator.n_estimators

    # test the different prediction function
    eec.predict(X_test)
    eec.predict_proba(X_test)
    eec.predict_log_proba(X_test)
    eec.decision_function(X_test)
Ejemplo n.º 46
0
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Check classification for various parameter settings.

    After fitting, the ensemble must hold ``n_estimators`` pipelines whose
    ``'classifier'`` step has as many members as
    ``base_estimator.n_estimators``; all prediction methods are then
    exercised on the test split.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    settings = dict(n_estimators=n_estimators,
                    base_estimator=base_estimator,
                    n_jobs=-1,
                    random_state=RND_SEED)
    eec = EasyEnsembleClassifier(**settings)
    eec.fit(X_train, y_train).score(X_test, y_test)

    assert len(eec.estimators_) == n_estimators
    for est in eec.estimators_:
        n_inner = len(est.named_steps['classifier'])
        assert n_inner == base_estimator.n_estimators

    # test the different prediction function
    eec.predict(X_test)
    eec.predict_proba(X_test)
    eec.predict_log_proba(X_test)
    eec.decision_function(X_test)
Ejemplo n.º 47
0
def test_bootstrap_samples():
    """Test that bootstrapping samples generate non-perfect base estimators.

    The training score of a single decision tree is the reference. Without
    bootstrap (and with resampling disabled) the ensemble must match it;
    with bootstrap the ensemble must score strictly lower.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    base_estimator = DecisionTreeClassifier().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    # disable the resampling by passing an empty dictionary.
    no_bootstrap = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        sampling_strategy={},
        random_state=0,
    )
    no_bootstrap.fit(X_train, y_train)

    ensemble_score = no_bootstrap.score(X_train, y_train)
    assert ensemble_score == base_estimator.score(X_train, y_train)

    # with bootstrap, trees are no longer perfect on the training set
    with_bootstrap = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0,
    )
    with_bootstrap.fit(X_train, y_train)

    ensemble_score = with_bootstrap.score(X_train, y_train)
    assert ensemble_score < base_estimator.score(X_train, y_train)
def test_balanced_bagging_classifier():
    """Check classification for various parameter settings.

    Every base estimator in the list is combined with every point of the
    parameter grid; each resulting ``BalancedBaggingClassifier`` is fitted
    on the training split and asked to predict the test split.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        ratio={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
    })
    base_estimators = [
        None,
        DummyClassifier(),
        Perceptron(),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(),
    ]

    for base_estimator in base_estimators:
        for params in grid:
            clf = BalancedBaggingClassifier(
                base_estimator=base_estimator, random_state=0, **params)
            clf.fit(X_train, y_train).predict(X_test)
def check_classifiers_with_encoded_labels(name, classifier_orig):
    """Non-regression check for string-labelled targets (issue #709).

    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709

    A clone of ``classifier_orig`` is fitted on the iris frame whose target
    is a categorical series of class names; ``classes_`` and the predictions
    must cover exactly the categories of that series.
    """
    pd = pytest.importorskip("pandas")
    classifier = clone(classifier_orig)

    iris = load_iris(as_frame=True)
    df = iris.data
    # Replace the integer codes by the class names, stored as a categorical.
    y = pd.Series(iris.target_names[iris.target], dtype="category")

    dataset_counts = {"setosa": 30, "versicolor": 20, "virginica": 50}
    df, y = make_imbalance(df, y, sampling_strategy=dataset_counts)

    classifier.set_params(sampling_strategy={"setosa": 20, "virginica": 20})
    classifier.fit(df, y)

    expected_labels = set(y.cat.categories.tolist())
    assert set(classifier.classes_) == expected_labels
    y_pred = classifier.predict(df)
    assert set(y_pred) == expected_labels
Ejemplo n.º 50
0
def test_oob_score_classification():
    """Check that the oob score is a good estimate of the test score.

    For each base estimator, a 100-estimator bootstrap ensemble must have an
    ``oob_score_`` within 0.1 of its test score, and fitting a single
    estimator with ``oob_score=True`` must emit a ``UserWarning``.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Settings shared by both ensembles built in the loop.
    shared = dict(bootstrap=True, oob_score=True, random_state=0)

    for base_estimator in (DecisionTreeClassifier(), SVC(gamma="scale")):
        clf = BalancedBaggingClassifier(
            base_estimator=base_estimator, n_estimators=100, **shared)
        clf.fit(X_train, y_train)

        score_gap = abs(clf.score(X_test, y_test) - clf.oob_score_)
        assert score_gap < 0.1

        # Test with few estimators
        with pytest.warns(UserWarning):
            BalancedBaggingClassifier(
                base_estimator=base_estimator, n_estimators=1, **shared
            ).fit(X_train, y_train)
Ejemplo n.º 51
0
def create_imbalance(X, y, min_class, maj_class, imb_ratio, verbose=True):
    """
    Create artificially an imbalance of (balanced) data.

    Parameters
    ----------
    X : array-like with ``shape`` and ``reshape``
        Samples. Inputs with more than two dimensions (e.g. images) are
        flattened to 2-D for ``make_imbalance`` and restored afterwards.
    y : iterable
        One label per sample.
    min_class, maj_class : container
        Labels of the minority / majority class. Membership is tested with
        ``in``; only the first element of each is passed to
        ``make_imbalance``.
    imb_ratio : float
        Desired minority count expressed as a fraction of the number of
        majority samples.
    verbose : bool, default=True
        Print the class distributions and forward the flag to
        ``make_imbalance``.

    Returns
    -------
    X_res, y_res
        The resampled samples (with the trailing dimensions of ``X``) and
        their labels.
    """
    # Count the majority samples; the original total is not assumed known
    # (eg. 12500 for IMDB).
    maj_cardinality = sum(1 for value in y if value in maj_class)
    # desired number of samples of minority class with ratio imb_ratio
    min_count = int(maj_cardinality * imb_ratio)

    # need to reshape for images as 'make_imbalance' expects X to be a
    # 2d-array.
    original_shape = tuple(X.shape)
    was_flattened = len(original_shape) > 2
    if was_flattened:
        X = X.reshape(original_shape[0], -1)

    X_res, y_res = make_imbalance(
        X, y,
        sampling_strategy={min_class[0]: min_count,
                           maj_class[0]: maj_cardinality},
        random_state=42, verbose=verbose)

    # reshape backwards to original shape; the decision must be based on the
    # shape recorded before flattening, since X itself is 2-D by now.
    if was_flattened:
        X_res = X_res.reshape((X_res.shape[0],) + original_shape[1:])

    if verbose:
        print("min_class is: ", min_class)
        print("maj_class is: ", maj_class)
        print('Distribution before imbalancing: {}'.format(Counter(y)))
        print('Distribution after imbalancing: {}'.format(Counter(y_res)))

    return X_res, y_res
Ejemplo n.º 52
0
def test_bootstrap_features():
    """Test that bootstrapping features may generate duplicate features.

    Without feature bootstrap every estimator must see all distinct
    features; with it, the median number of distinct features per estimator
    must be lower than the total number of features.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    n_features = X.shape[1]

    def fit_ensemble(bootstrap_features):
        # Only the feature-bootstrap flag differs between the two ensembles.
        return BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            max_features=1.0,
            bootstrap_features=bootstrap_features,
            random_state=0,
        ).fit(X_train, y_train)

    ensemble = fit_ensemble(False)
    for features in ensemble.estimators_features_:
        assert np.unique(features).shape[0] == n_features

    ensemble = fit_ensemble(True)
    unique_features = []
    for features in ensemble.estimators_features_:
        unique_features.append(np.unique(features).shape[0])
    assert np.median(unique_features) < n_features
Ejemplo n.º 53
0
def begin():
    """Build an imbalanced copy of the CTG dataset and save it as CSV.

    Reads ``CSV/CTG.csv``, cleans it and splits it with the ``pp`` helpers,
    resamples it with ``make_imbalance`` and writes the result to
    ``CTG_imb.csv``. The class balance is printed before and after the
    resampling.

    Returns
    -------
    int
        Always ``0``.
    """
    # Import Dataset
    raw = pd.read_csv('CSV/CTG.csv')

    # Pre-processing data
    dataset = pp.clean_nan(raw)
    print(dataset.shape)
    X, y = pp.split_iv_dv(dataset=dataset, exclude=(0, 1, 2, 39))

    print(pp.get_balance(y))

    # Making dataset imbalanced
    from imblearn.datasets import make_imbalance
    X_resampled, y_resampled = make_imbalance(
        X, y, ratio=0.05, min_c_=3, random_state=0)

    print('Synthetic generation:\n', pp.get_balance(y_resampled))

    # Column names assigned to the concatenated features + target frame.
    column_names = [
        'b', 'e', 'LBE', 'LB', 'AC', 'FM', 'UC', 'ASTV', 'MSTV', 'ALTV',
        'MLTV', 'DL', 'DS', 'DP', 'DR', 'Width', 'Min', 'Max', 'Nmax',
        'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'A',
        'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP', 'CLASS', 'NSP',
    ]
    X_csv = pd.DataFrame(X_resampled)
    y_csv = pd.DataFrame(y_resampled)
    dataframe = pd.concat([X_csv, y_csv], axis=1)
    dataframe.columns = column_names
    dataframe.to_csv('CTG_imb.csv', index=False)
    return 0
Ejemplo n.º 54
0
def data():
    """Return the resampled iris features and their categorical targets.

    Iris is resampled with ``make_imbalance`` using the counts
    ``{0: 30, 1: 50, 2: 40}``; the labels are then converted with
    ``to_categorical`` for 3 classes.
    """
    dataset = load_iris()
    features, labels = make_imbalance(
        dataset.data, dataset.target, {0: 30, 1: 50, 2: 40})
    return features, to_categorical(labels, 3)
Ejemplo n.º 55
0
###############################################################################
# We will show how to use the parameter ``ratio`` when dealing with the
# ``make_imbalance`` function. For this function, this parameter accepts both
# dictionary and callable. When using a dictionary, each key will correspond to
# the class of interest and the corresponding value will be the number of
# samples desired in this class.

iris = load_iris()

# Class distribution before any resampling.
original_counts = Counter(iris.target)
print('Information of the original iris data set: \n {}'.format(
    original_counts))
plot_pie(iris.target)

# Dictionary form: class label -> number of samples requested.
ratio = {0: 10, 1: 20, 2: 30}
X, y = make_imbalance(iris.data, iris.target, ratio=ratio)

# Class distribution after resampling with the dictionary above.
resampled_counts = Counter(y)
print('Information of the iris data set after making it'
      ' imbalanced using a dict: \n ratio={} \n y: {}'.format(
          ratio, resampled_counts))
plot_pie(y)

###############################################################################
# You might require more flexibility and want your own heuristic to
# determine the number of samples per class; in that case you can define
# your own callable as follows. Here we will define a function which will
# use a float multiplier to define the number of samples per class.


def ratio_multiplier(y):
    multiplier = {0: 0.5, 1: 0.7, 2: 0.95}
Ejemplo n.º 56
0
from sklearn.datasets import load_iris

keras = pytest.importorskip('keras')
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import NearMiss

from imblearn.keras import BalancedBatchGenerator
from imblearn.keras import balanced_batch_generator

# Module-level data read by the tests below: iris resampled with
# ``make_imbalance`` to the per-class counts {0: 30, 1: 50, 2: 40}.
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
# Convert the labels with keras' ``to_categorical`` using 3 classes.
y = to_categorical(y, 3)


def _build_keras_model(n_classes, n_features):
    """Return a compiled single-layer softmax classifier.

    The model is one ``Dense`` layer mapping ``n_features`` inputs to
    ``n_classes`` softmax outputs, compiled with SGD, categorical
    cross-entropy and the accuracy metric.
    """
    output_layer = Dense(
        n_classes, input_dim=n_features, activation='softmax')

    network = Sequential()
    network.add(output_layer)
    network.compile(
        optimizer='sgd',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network


def test_balanced_batch_generator_class_no_return_indices():
    """Check the error raised when ``ClusterCentroids`` is the sampler.

    Building a ``BalancedBatchGenerator`` with that sampler must raise a
    ``ValueError`` whose message mentions that indices need to be returned.
    """
    expected_message = 'needs to return the indices'
    with pytest.raises(ValueError, match=expected_message):
        BalancedBatchGenerator(
            X, y, sampler=ClusterCentroids(), batch_size=10)
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load iris and resample it to the per-class counts given in ``ratio``
iris = load_iris()
X, y = make_imbalance(
    iris.data,
    iris.target,
    ratio={0: 25, 1: 50, 2: 50},
    random_state=0,
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline: NearMiss sampling followed by a linear SVM
sampler = NearMiss(version=2, random_state=RANDOM_STATE)
classifier = LinearSVC(random_state=RANDOM_STATE)
pipeline = make_pipeline(sampler, classifier)
pipeline.fit(X_train, y_train)

# Classify and report the results
y_pred = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))
X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10)

# Two rows of three subplots, flattened row by row into a plain list
f, axs = plt.subplots(2, 3)
axs = list(axs.flat)

# (class value, legend label, face colour) for the two classes
class_styles = (
    (0, "Class #0", palette[0]),
    (1, "Class #1", palette[2]),
)

# First panel: the original, untouched data set
for class_value, class_label, face_color in class_styles:
    mask = y == class_value
    axs[0].scatter(X[mask, 0], X[mask, 1], label=class_label,
                   alpha=0.5, edgecolor=almost_black, facecolor=face_color,
                   linewidth=0.15)
axs[0].set_title('Original set')

# Remaining panels: one per imbalance ratio
ratios = [0.9, 0.75, 0.5, 0.25, 0.1]
for i, ratio in enumerate(ratios, start=1):
    ax = axs[i]

    X_, y_ = make_imbalance(X, y, ratio=ratio, min_c_=1)

    for class_value, class_label, face_color in class_styles:
        mask = y_ == class_value
        ax.scatter(X_[mask, 0], X_[mask, 1], label=class_label,
                   alpha=0.5, edgecolor=almost_black, facecolor=face_color,
                   linewidth=0.15)
    ax.set_title('make_imbalance ratio ({})'.format(ratio))

plt.show()
Ejemplo n.º 59
0
sns.set()

# Define some color for the plotting
almost_black = '#262626'
palette = sns.color_palette()

# Generate the dataset
X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10)

# Left panel: original data; right panel: data after make_imbalance
f, axs = plt.subplots(1, 2)

# Original
axs[0].scatter(X[y == 0, 0], X[y == 0, 1], label="Class #0",
               alpha=0.5, facecolor=palette[0],
               linewidth=0.15)
# The class-1 points were previously labelled "Class #0" as well.
axs[0].scatter(X[y == 1, 0], X[y == 1, 1], label="Class #1",
               alpha=0.5, facecolor=palette[2],
               linewidth=0.15)
# Make imbalance
X_, y_ = make_imbalance(X, y, ratio=0.5, min_c_=1)
# Same ratio applied with min_c_=0; kept as module-level names although
# nothing in this snippet plots them.
X_0, y_0 = make_imbalance(X, y, ratio=0.5, min_c_=0)
# After making imbalance
axs[1].scatter(X_[y_ == 0, 0], X_[y_ == 0, 1], label="Class #0",
               alpha=0.5, facecolor=palette[0],
               linewidth=0.15)
axs[1].scatter(X_[y_ == 1, 0], X_[y_ == 1, 1], label="Class #1",
               alpha=0.5, facecolor=palette[2],
               linewidth=0.15)
plt.show()