def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts):
    """Check the class counts produced by ``make_imbalance`` on OpenML iris.

    The ``as_frame`` flag is forwarded to ``fetch_openml``; when it is truthy
    the resampled ``X`` is additionally required to expose ``.loc``.
    """
    pytest.importorskip("pandas")
    data, target = fetch_openml(
        "iris", version=1, return_X_y=True, as_frame=as_frame
    )
    data_res, target_res = make_imbalance(
        data, target, sampling_strategy=sampling_strategy
    )
    if as_frame:
        # The container type of X should survive the resampling.
        assert hasattr(data_res, "loc")
    observed_counts = Counter(target_res)
    assert observed_counts == expected_counts
def check_classifiers_with_encoded_labels(name, classifier_orig):
    """Check that a classifier can be fitted on string-labelled iris data.

    Non-regression test for #709
    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709

    The fitted ``classes_`` and the set of predicted labels must both equal
    the categories of the target.
    """
    pytest.importorskip("pandas")

    # Class counts used to build the imbalanced data set.
    imbalance_counts = {
        "Iris-setosa": 30,
        "Iris-versicolor": 20,
        "Iris-virginica": 50,
    }
    # Sampling strategy handed to the classifier under test.
    classifier_counts = {"Iris-setosa": 20, "Iris-virginica": 20}

    estimator = clone(classifier_orig)
    frame, target = fetch_openml(
        "iris", version=1, as_frame=True, return_X_y=True
    )
    frame, target = make_imbalance(
        frame, target, sampling_strategy=imbalance_counts
    )
    estimator.set_params(sampling_strategy=classifier_counts)
    estimator.fit(frame, target)

    expected_labels = set(target.cat.categories.tolist())
    assert set(estimator.classes_) == expected_labels
    predictions = estimator.predict(frame)
    assert set(predictions) == expected_labels
def test_make_imbalance_dict(iris, sampling_strategy, expected_counts):
    """Check the per-class counts of the target returned by ``make_imbalance``."""
    data, target = iris
    _, target_res = make_imbalance(
        data, target, sampling_strategy=sampling_strategy
    )
    observed_counts = Counter(target_res)
    assert observed_counts == expected_counts
def test_make_imbalance_error_single_class(iris):
    """Check that ``make_imbalance`` rejects a target holding a single class."""
    data, target = iris
    # Collapse every label to 0 so that only one class remains.
    single_class_target = np.zeros_like(target)
    expected_msg = "needs to have more than 1 class."
    with pytest.raises(ValueError, match=expected_msg):
        make_imbalance(data, single_class_target, sampling_strategy={0: 10})
def test_make_imbalance_error(iris, sampling_strategy, err_msg):
    """Check that an invalid ``sampling_strategy`` raises ``ValueError``.

    The raised message must match ``err_msg``.
    """
    # Part of utils.check_sampling_strategy is reused here; since the common
    # tests do not cover it, the checks are repeated in this test.
    data, target = iris
    expect_failure = pytest.raises(ValueError, match=err_msg)
    with expect_failure:
        make_imbalance(data, target, sampling_strategy=sampling_strategy)
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

RANDOM_STATE = 42

# %% [markdown]
# Preparation
# -----------
# **Make 3 imbalanced toy classification tasks.**

# Target number of samples per class, shared by dataset 1 and dataset 2.
distribution = {0: 100, 1: 50}

# dataset 1: generated with make_moons, then made imbalanced
X, y = make_moons(n_samples=200, noise=0.2, random_state=RANDOM_STATE)
dataset1 = make_imbalance(
    X,
    y,
    sampling_strategy=distribution,
    random_state=RANDOM_STATE,
)

# dataset 2: generated with make_circles, then made imbalanced
X, y = make_circles(
    n_samples=200,
    noise=0.2,
    factor=0.5,
    random_state=RANDOM_STATE,
)
dataset2 = make_imbalance(
    X,
    y,
    sampling_strategy=distribution,
    random_state=RANDOM_STATE,
)

# dataset 3: generated with make_classification, then perturbed in place
# with uniform noise drawn from a generator seeded with RANDOM_STATE
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=1,
)
X += 2 * np.random.RandomState(RANDOM_STATE).uniform(
    size=X.shape
)
imbalance_distr = { 0: 178, 1: 120, 2: 80, 3: 60, 4: 50, 5: 44, 6: 40, 7: 40, 8: 40, 9: 40 } X, y = make_imbalance(X, y, sampling_strategy=imbalance_distr, random_state=RANDOM_STATE) fig = plot_2Dprojection_and_cardinality(X, y, figsize=(8, 4)) # %% [markdown] # Classification # -------------- # We split the data into train and test subsets and fit a ``SelfPacedEnsembleClassifier`` (with support vector machine as base classifier) on the train samples. # The fitted classifier can subsequently be used to predict the value of the digit for the samples in the test subset. # Split data into 50% train and 50% test subsets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True,
RANDOM_STATE = 42 # %% [markdown] # Preparation # ----------- # **Make 3 imbalanced iris classification tasks.** iris = sklearn.datasets.load_iris() X = iris.data[:, 0:2] # we only take the first two features for visualization y = iris.target X, y = make_imbalance(X, y, sampling_strategy={ 0: 50, 1: 30, 2: 10 }, random_state=RANDOM_STATE) print('Class distribution of imbalanced iris dataset: \n%s' % sort_dict_by_key(Counter(y))) # %% [markdown] # **Create SPE (ensemble size = 5) with different base classifiers.** from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF classifiers = {
data=X, x="feature 1", y="feature 2", hue=y, style=y, ax=axs[0, 0], ) axs[0, 0].set_title("Original set") multipliers = [0.9, 0.75, 0.5, 0.25, 0.1] for ax, multiplier in zip(axs.ravel()[1:], multipliers): X_resampled, y_resampled = make_imbalance( X, y, sampling_strategy=ratio_func, **{ "multiplier": multiplier, "minority_class": 1 }, ) sns.scatterplot( data=X_resampled, x="feature 1", y="feature 2", hue=y_resampled, style=y_resampled, ax=ax, ) ax.set_title(f"Sampling ratio = {multiplier}")