Code example #1
def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts):
    # Check that make_imbalance resamples the OpenML iris data to the requested
    # class counts and preserves the DataFrame container when as_frame=True.
    pytest.importorskip("pandas")
    X, y = fetch_openml("iris", version=1, return_X_y=True, as_frame=as_frame)
    X_res, y_res = make_imbalance(X, y, sampling_strategy=sampling_strategy)
    if as_frame:
        assert hasattr(X_res, "loc")
    assert Counter(y_res) == expected_counts
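The `as_frame`, `sampling_strategy`, and `expected_counts` arguments are supplied by pytest parametrization that is not part of the snippet. For orientation, a self-contained call of the kind this test exercises (the requested counts below are illustrative, not taken from the test suite):

from collections import Counter

from sklearn.datasets import fetch_openml
from imblearn.datasets import make_imbalance

X, y = fetch_openml("iris", version=1, return_X_y=True, as_frame=True)
X_res, y_res = make_imbalance(
    X,
    y,
    sampling_strategy={"Iris-setosa": 10, "Iris-versicolor": 20, "Iris-virginica": 30},
)
# X_res is still a DataFrame and the class counts match the requested dict.
print(Counter(y_res))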
Code example #2
import pytest

from sklearn.base import clone
from sklearn.datasets import fetch_openml
from imblearn.datasets import make_imbalance


def check_classifiers_with_encoded_labels(name, classifier_orig):
    # Non-regression test for #709
    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709
    pytest.importorskip("pandas")
    classifier = clone(classifier_orig)
    df, y = fetch_openml("iris", version=1, as_frame=True, return_X_y=True)
    df, y = make_imbalance(
        df,
        y,
        sampling_strategy={
            "Iris-setosa": 30,
            "Iris-versicolor": 20,
            "Iris-virginica": 50,
        },
    )
    classifier.set_params(sampling_strategy={
        "Iris-setosa": 20,
        "Iris-virginica": 20
    })
    classifier.fit(df, y)
    assert set(classifier.classes_) == set(y.cat.categories.tolist())
    y_pred = classifier.predict(df)
    assert set(y_pred) == set(y.cat.categories.tolist())
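A minimal way to exercise this check on a concrete estimator is sketched below. `BalancedRandomForestClassifier` is only one example of an imbalanced-learn classifier that exposes a `sampling_strategy` parameter; the call is illustrative and not part of the original test suite.

from imblearn.ensemble import BalancedRandomForestClassifier

check_classifiers_with_encoded_labels(
    "BalancedRandomForestClassifier",
    BalancedRandomForestClassifier(random_state=0),
)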
Code example #3
def test_make_imbalance_dict(iris, sampling_strategy, expected_counts):
    # `iris` is a fixture; `sampling_strategy` and `expected_counts` come from
    # pytest parametrization.
    X, y = iris
    _, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy)
    assert Counter(y_) == expected_counts
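For context, a self-contained call of the kind this test parametrizes, using the bundled scikit-learn iris data; the requested counts are illustrative:

from collections import Counter

from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

X, y = load_iris(return_X_y=True)
_, y_res = make_imbalance(X, y, sampling_strategy={0: 10, 1: 20, 2: 30})
print(Counter(y_res))  # the three classes now hold 10, 20 and 30 samples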
Code example #4
def test_make_imbalance_error_single_class(iris):
    X, y = iris
    y = np.zeros_like(y)
    with pytest.raises(ValueError, match="needs to have more than 1 class."):
        make_imbalance(X, y, sampling_strategy={0: 10})
Code example #5
def test_make_imbalance_error(iris, sampling_strategy, err_msg):
    # We are reusing part of utils.check_sampling_strategy; however, this is
    # not covered by the common tests, so we repeat it here.
    X, y = iris
    with pytest.raises(ValueError, match=err_msg):
        make_imbalance(X, y, sampling_strategy=sampling_strategy)
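The parametrized `sampling_strategy`/`err_msg` pairs are not shown in the snippet. One illustrative case that triggers such a ValueError: make_imbalance only under-samples, so requesting more samples of a class than the dataset contains is rejected.

from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

X, y = load_iris(return_X_y=True)  # 50 samples per class
try:
    make_imbalance(X, y, sampling_strategy={0: 500, 1: 500, 2: 500})
except ValueError as exc:
    print(exc)  # complains that more samples are requested than are available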
Code example #6
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_classification, make_circles, make_moons

RANDOM_STATE = 42

# %% [markdown]
# Preparation
# -----------
# **Make 3 imbalanced toy classification tasks.**

distribution = {0: 100, 1: 50}

# dataset 1
X, y = make_moons(200, noise=0.2, random_state=RANDOM_STATE)
dataset1 = make_imbalance(X,
                          y,
                          sampling_strategy=distribution,
                          random_state=RANDOM_STATE)
# dataset 2
X, y = make_circles(200, noise=0.2, factor=0.5, random_state=RANDOM_STATE)
dataset2 = make_imbalance(X,
                          y,
                          sampling_strategy=distribution,
                          random_state=RANDOM_STATE)
# dataset 3
X, y = make_classification(200,
                           n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
X += 2 * np.random.RandomState(RANDOM_STATE).uniform(size=X.shape)
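The snippet stops right after perturbing the third dataset's features; presumably it goes on to resample it with the same class distribution as the first two. A sketch of that continuation:

dataset3 = make_imbalance(X,
                          y,
                          sampling_strategy=distribution,
                          random_state=RANDOM_STATE)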
Code example #7
imbalance_distr = {
    0: 178,
    1: 120,
    2: 80,
    3: 60,
    4: 50,
    5: 44,
    6: 40,
    7: 40,
    8: 40,
    9: 40
}

X, y = make_imbalance(X,
                      y,
                      sampling_strategy=imbalance_distr,
                      random_state=RANDOM_STATE)

fig = plot_2Dprojection_and_cardinality(X, y, figsize=(8, 4))

# %% [markdown]
# Classification
# --------------
# We split the data into train and test subsets and fit a ``SelfPacedEnsembleClassifier`` (with a support vector machine as the base classifier) on the training samples.
# The fitted classifier can subsequently be used to predict the value of the digit for the samples in the test subset.

from sklearn.model_selection import train_test_split

# Split data into 50% train and 50% test subsets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    shuffle=True)
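A minimal sketch of the fitting step described in the markdown cell above, reusing the split and ``RANDOM_STATE`` from the snippet. It assumes the imbalanced-ensemble package (imported as ``imbens``; older releases ship the same class under the ``imbalanced_ensemble`` name), and the keyword that passes the base classifier is version-dependent (``estimator`` in recent releases, ``base_estimator`` in older ones):

from sklearn.svm import SVC
from imbens.ensemble import SelfPacedEnsembleClassifier

clf = SelfPacedEnsembleClassifier(
    estimator=SVC(probability=True),  # SVC needs probability=True to expose predict_proba
    n_estimators=10,
    random_state=RANDOM_STATE,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)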
Code example #8
from collections import Counter

import sklearn.datasets

RANDOM_STATE = 42

# %% [markdown]
# Preparation
# -----------
# **Make 3 imbalanced iris classification tasks.**

iris = sklearn.datasets.load_iris()
X = iris.data[:, 0:2]  # we only take the first two features for visualization
y = iris.target

X, y = make_imbalance(X,
                      y,
                      sampling_strategy={
                          0: 50,
                          1: 30,
                          2: 10
                      },
                      random_state=RANDOM_STATE)
print('Class distribution of imbalanced iris dataset: \n%s' %
      sort_dict_by_key(Counter(y)))

# %% [markdown]
# **Create SPE (ensemble size = 5) with different base classifiers.**

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

classifiers = {
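The snippet breaks off at the opening of the dictionary. Below is a sketch of how such a mapping of SPE ensembles (ensemble size 5) over the imported base classifiers might look; the entries are illustrative, the import path assumes the ``imbens`` package, and the keyword that passes the base classifier is version-dependent (``estimator`` vs ``base_estimator``):

from imbens.ensemble import SelfPacedEnsembleClassifier

classifiers = {
    "SPE + DecisionTree": SelfPacedEnsembleClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=5,
        random_state=RANDOM_STATE,
    ),
    "SPE + SVC": SelfPacedEnsembleClassifier(
        estimator=SVC(probability=True),
        n_estimators=5,
        random_state=RANDOM_STATE,
    ),
    "SPE + GaussianProcess": SelfPacedEnsembleClassifier(
        estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),
        n_estimators=5,
        random_state=RANDOM_STATE,
    ),
}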
Code example #9
import seaborn as sns

sns.scatterplot(
    data=X,
    x="feature 1",
    y="feature 2",
    hue=y,
    style=y,
    ax=axs[0, 0],
)
axs[0, 0].set_title("Original set")

multipliers = [0.9, 0.75, 0.5, 0.25, 0.1]
for ax, multiplier in zip(axs.ravel()[1:], multipliers):
    X_resampled, y_resampled = make_imbalance(
        X,
        y,
        sampling_strategy=ratio_func,
        **{
            "multiplier": multiplier,
            "minority_class": 1
        },
    )

    sns.scatterplot(
        data=X_resampled,
        x="feature 1",
        y="feature 2",
        hue=y_resampled,
        style=y_resampled,
        ax=ax,
    )
    ax.set_title(f"Sampling ratio = {multiplier}")