def generate_oversamplers(oversamplers_names):
    """Generate oversamplers."""
    oversamplers = [
        ('NO OVERSAMPLING', None, {}),
        ('RANDOM OVERSAMPLING', RandomOverSampler(), {}),
        ('SMOTE', SMOTE(), {
            'k_neighbors': [3, 5]
        }),
        ('BORDERLINE SMOTE', BorderlineSMOTE(), {
            'k_neighbors': [3, 5]
        }),
        ('ADASYN', ADASYN(), {
            'n_neighbors': [2, 3]
        }),
        ('G-SMOTE', GeometricSMOTE(), {
            'k_neighbors': [3, 5],
            'selection_strategy': ['combined', 'minority', 'majority'],
            'truncation_factor': [-1.0, -0.5, 0.0, 0.25, 0.5, 0.75, 1.0],
            'deformation_factor': [0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0]
        }),
        ('K-MEANS RANDOM OVERSAMPLING',
         RandomOverSampler(clusterer=KMeans(), distributor=DensityDistributor()), {
            'clusterer__n_clusters': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
        }),
        ('K-MEANS SMOTE',
         SMOTE(clusterer=KMeans(), distributor=DensityDistributor()), {
            'k_neighbors': [3, 5],
            'clusterer__n_clusters': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
        }),
        ('K-MEANS BORDERLINE SMOTE',
         BorderlineSMOTE(clusterer=KMeans(), distributor=DensityDistributor()), {
            'k_neighbors': [3, 5],
            'clusterer__n_clusters': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
        }),
        ('K-MEANS G-SMOTE',
         GeometricSMOTE(clusterer=KMeans(), distributor=DensityDistributor()), {
            'k_neighbors': [3, 5],
            'selection_strategy': ['combined', 'minority', 'majority'],
            'truncation_factor': [-1.0, -0.5, 0.0, 0.25, 0.5, 0.75, 1.0],
            'deformation_factor': [0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
            'clusterer__n_clusters': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
        }),
        ('SOMO',
         SMOTE(clusterer=SOM(), distributor=DensityDistributor()), {
            'k_neighbors': [3, 5],
            'clusterer__n_clusters': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0],
            'distributor__distribution_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]
        }),
        ('G-SOMO',
         GeometricSMOTE(clusterer=SOM(), distributor=DensityDistributor()), {
            'k_neighbors': [3, 5],
            'selection_strategy': ['combined', 'minority', 'majority'],
            'truncation_factor': [-1.0, -0.5, 0.0, 0.25, 0.5, 0.75, 1.0],
            'deformation_factor': [0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
            'clusterer__n_clusters': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0],
            'distributor__distribution_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]
        })
    ]
    if oversamplers_names in ('basic', 'scaled', 'undersampled'):
        oversamplers = select_pipelines(
            oversamplers,
            ('NO OVERSAMPLING', 'RANDOM OVERSAMPLING', 'SMOTE',
             'BORDERLINE SMOTE', 'ADASYN', 'G-SMOTE'))
    if oversamplers_names == 'scaled':
        oversamplers = append_transformer(MinMaxScaler(), oversamplers)
    elif oversamplers_names == 'undersampled':
        oversamplers = set_sampling_strategy(
            lambda y: generate_sampling_strategy(y, 1 / 3), oversamplers)
        oversamplers = append_transformer(
            RandomUnderSampler(
                sampling_strategy=lambda y: generate_sampling_strategy(y, 3)),
            oversamplers)
    return oversamplers
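# Usage sketch (not part of the original module): how the (name, oversampler,
# param_grid) triples returned by generate_oversamplers might be paired with a
# classifier for a grid search. The pipeline layout, the 'ovs'/'clf' step names
# and the scoring/cv settings are illustrative assumptions, not the project's
# own evaluation code.
from imblearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


def build_searches(oversamplers_names='basic'):
    searches = []
    for name, oversampler, param_grid in generate_oversamplers(oversamplers_names):
        if oversampler is None:
            continue  # 'NO OVERSAMPLING' has no sampler to tune
        pipeline = Pipeline([('ovs', oversampler), ('clf', GradientBoostingClassifier())])
        grid = {f'ovs__{param}': values for param, values in param_grid.items()}
        searches.append((name, GridSearchCV(pipeline, grid, scoring='f1', cv=5)))
    return searches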
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
# Assumption: Pipeline comes from imblearn; the original example may import it from elsewhere.
from imblearn.pipeline import Pipeline
from sklearnext.cluster import KMeans
from sklearnext.over_sampling import SMOTE
from sklearnext.over_sampling.base import DensityDistributor
from sklearnext.tools import report_model_search_results

# Load data
X, y = make_classification(n_informative=15, n_clusters_per_class=3, weights=[0.9, 0.1])

# Define estimators
estimators = [
    ('GBC', GradientBoostingClassifier()),
    ('SMOTE+GBC', Pipeline([
        ('smote', SMOTE()),
        ('gbc', GradientBoostingClassifier())
    ])),
    ('KMeansSMOTE+GBC', Pipeline([
        ('smote', SMOTE(clusterer=KMeans(n_init=1), distributor=DensityDistributor())),
        ('gbc', GradientBoostingClassifier())
    ]))
]

# Define parameter grids
param_grids = [
    {
        'SMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5],
        'SMOTE+GBC__gbc__max_depth': [2, 4]
    },
    {
        'KMeansSMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5],
        'KMeansSMOTE+GBC__smote__clusterer__n_clusters': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
        'KMeansSMOTE+GBC__smote__distributor__filtering_threshold': [0.8, 1.0, 1.2, 1.5],
        'KMeansSMOTE+GBC__gbc__max_depth': [2, 4]
    }
]
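# Hedged approximation (not from the original example): the prefixed grids
# above are presumably consumed together with `estimators` by the sklearnext
# model-search tools. With plain scikit-learn, a single pipeline can be tuned
# by dropping the estimator-name prefix; the scoring and cv values below are
# illustrative choices.
from sklearn.model_selection import GridSearchCV

single_search = GridSearchCV(
    estimator=Pipeline([('smote', SMOTE()), ('gbc', GradientBoostingClassifier())]),
    param_grid={'smote__k_neighbors': [2, 3, 4, 5], 'gbc__max_depth': [2, 4]},
    scoring='f1',
    cv=5)
single_search.fit(X, y)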
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import RandomOverSampler, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'random-oversampler')

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('RANDOM OVERSAMPLING', RandomOverSampler(random_state=0)),
    ('K-MEANS RANDOM OVERSAMPLING',
     RandomOverSampler(clusterer=KMeans(random_state=1, n_init=1),
                       distributor=DensityDistributor(),
                       random_state=0),
     {
         'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
         'distributor__distances_exponent': [0, 1, 2],
         'distributor__filtering_threshold': [0.5, 1.0]
     })
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]})
]

# Load datasets
imbalanced_datasets = read_csv_dir(datasets_path)
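# Hedged note (illustrative helper, not from the project): entries in the
# oversamplers and classifiers lists above carry an optional third element,
# a hyperparameter grid. A small helper like this makes that convention
# explicit when iterating over such lists.
def split_entry(entry):
    name, estimator = entry[0], entry[1]
    param_grid = entry[2] if len(entry) > 2 else {}
    return name, estimator, param_grid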
from collections import OrderedDict
from itertools import product

import pytest
import numpy as np

from sklearnext.cluster import KMeans
from ...over_sampling import RandomOverSampler, SMOTE, GeometricSMOTE
from ...cluster import SOM
from ...utils.validation import _TrivialOversampler
from ..distribution import DensityDistributor

X = np.array(list(product(range(5), range(4))))
y = np.array([0] * 10 + [1] * 6 + [2] * 4)
LABELS = np.array([0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 3, 3, 3, 0, 3, 3, 3])
NEIGHBORS = [(0, 1), (0, 2), (0, 3), (1, 2), (2, 3)]


@pytest.mark.parametrize('clusterer', [None, KMeans(), SOM()])
def test_fit(clusterer):
    """Test the fit method of the extended base oversampler."""
    oversampler = _TrivialOversampler(clusterer=clusterer).fit(X, y)
    assert oversampler.sampling_strategy_ == OrderedDict({1: 4, 2: 6})


@pytest.mark.parametrize('clusterer', [None, KMeans(), SOM()])
def test_fit_resample(clusterer):
    """Test the fit and resample method of the extended base oversampler."""
    oversampler = _TrivialOversampler(clusterer=clusterer)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    assert hasattr(oversampler, 'distributor_')
    assert hasattr(oversampler.distributor_, 'intra_distribution_')
    assert hasattr(oversampler.distributor_, 'inter_distribution_')
    if isinstance(clusterer, SOM):
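# Explanatory sketch (not part of the original test module): why test_fit
# expects sampling_strategy_ == {1: 4, 2: 6}. The fixture holds 10, 6 and 4
# samples for classes 0, 1 and 2, so classes 1 and 2 need 4 and 6 synthetic
# samples respectively to match the majority count.
from collections import Counter

counts = Counter(y)                    # {0: 10, 1: 6, 2: 4}
majority_count = max(counts.values())  # 10
expected = {label: majority_count - count
            for label, count in counts.items()
            if count < majority_count}
assert expected == {1: 4, 2: 6}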
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'smote')

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('SMOTE', SMOTE(random_state=0), {'k_neighbors': [3, 5]}),
    ('K-MEANS SMOTE',
     SMOTE(clusterer=KMeans(random_state=1, n_init=1),
           distributor=DensityDistributor(),
           random_state=0),
     {
         'k_neighbors': [3, 5],
         'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
         'distributor__distances_exponent': [0, 1, 2],
         'distributor__filtering_threshold': [0.5, 1.0]
     })
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]})
]
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'borderline-smote')

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('BORDERLINE-SMOTE', SMOTE(random_state=0, kind='borderline1'), {'k_neighbors': [3, 5]}),
    ('K-MEANS BORDERLINE-SMOTE',
     SMOTE(clusterer=KMeans(random_state=1, n_init=1),
           distributor=DensityDistributor(),
           random_state=0,
           kind='borderline1'),
     {
         'k_neighbors': [3, 5],
         'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
         'distributor__distances_exponent': [0, 1, 2],
         'distributor__filtering_threshold': [0.5, 1.0]
     })
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]})
]

# Load datasets
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.over_sampling import GeometricSMOTE, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'gsmote')

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('G-SMOTE', GeometricSMOTE(random_state=0), {
        'k_neighbors': [3, 5],
        'truncation_factor': [-1.0, 0.0, 1.0],
        'deformation_factor': [0.0, 0.5, 1.0]
    }),
    ('K-MEANS G-SMOTE',
     GeometricSMOTE(clusterer=KMeans(random_state=1, n_init=1),
                    distributor=DensityDistributor(),
                    random_state=0),
     {
         'k_neighbors': [3, 5],
         'truncation_factor': [-1.0, 0.0, 1.0],
         'deformation_factor': [0.0, 0.5, 1.0],
         'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
         'distributor__distances_exponent': [0, 1, 2],
         'distributor__filtering_threshold': [0.5, 1.0]
     })
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]})
]