from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearnext.tools import BinaryExperiment
from sklearnext.over_sampling import SMOTE, GeometricSMOTE

# Generate datasets: three synthetic problems created with the same seed and
# feature count; only the class weights handed to make_classification differ.
datasets = [
    (
        'A',
        make_classification(
            random_state=1,
            weights=[0.80, 0.20],
            n_features=10,
        ),
    ),
    (
        'B',
        make_classification(
            random_state=1,
            weights=[0.85, 0.15],
            n_features=10,
        ),
    ),
    (
        'C',
        make_classification(
            random_state=1,
            weights=[0.90, 0.10],
            n_features=10,
        ),
    ),
]

# Oversamplers and classifiers. Entries are (name, estimator) or
# (name, estimator, grid) tuples; each grid maps a parameter name to a list
# of candidate values (presumably searched by the experiment -- confirm
# against BinaryExperiment).
oversamplers = [
    ('NO OVERSAMPLING', None),
    (
        'SMOTE',
        SMOTE(random_state=0),
        {'k_neighbors': [3, 4]},
    ),
    (
        'G-SMOTE',
        GeometricSMOTE(random_state=0),
        {
            'k_neighbors': [3, 4],
            'deformation_factor': [0.25, 0.50, 0.75],
            'truncation_factor': [-0.5, 0.0, 0.5],
        },
    ),
]
classifiers = [
    (
        'DT',
        DecisionTreeClassifier(),
        {'max_depth': [3, 4, 5]},
    ),
    (
        'KNN',
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5]},
    ),
]

# Define experiment
experiment = BinaryExperiment(
    name='example', 
Beispiel #2
0
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.over_sampling import RandomOverSampler, SMOTE, GeometricSMOTE, DensityDistributor
from sklearnext.cluster import SOM
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets

# Paths: both are assembled from the directory of this file
# (dirname(__file__)) going two levels up into the 'data' folder.
datasets_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'binary-numerical-imbalanced',
)
results_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'results',
    'gsomo',
)

# Oversamplers and classifiers. Entries are (name, oversampler) or
# (name, oversampler, grid) tuples; grid keys containing '__' address
# parameters of the nested clusterer / distributor objects.
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('RANDOM OVERSAMPLING', RandomOverSampler(random_state=0)),
    (
        'SMOTE',
        SMOTE(random_state=1),
        {'k_neighbors': [3, 5]},
    ),
    (
        'G-SOMO',
        GeometricSMOTE(
            clusterer=SOM(),
            distributor=DensityDistributor(
                distances_exponent=2,
                filtering_threshold=1.0,
            ),
            random_state=3,
        ),
        {
            'k_neighbors': [3, 5],
            'truncation_factor': [-1.0, 0.0, 0.25, 1.0],
            'deformation_factor': [0.0, 0.5, 1.0],
            'clusterer__n_clusters': [0.2, 0.5],
            'distributor__distribution_ratio': [0.75, 1.0],
        },
    ),
]
classifiers = [('LR', LogisticRegression()),
Beispiel #3
0
from imblearn.pipeline import Pipeline
from sklearnext.cluster import KMeans
from sklearnext.model_selection import ModelSearchCV
from sklearnext.over_sampling import SMOTE
from sklearnext.over_sampling.base import DensityDistributor
from sklearnext.tools import report_model_search_results

# Generate a synthetic dataset with class weights 0.9 / 0.1.
# Note that no random_state is passed here.
X, y = make_classification(
    n_informative=15,
    n_clusters_per_class=3,
    weights=[0.9, 0.1],
)

# Define estimators: a plain gradient boosting classifier, and two pipelines
# that put a SMOTE step (without / with a KMeans clusterer and a
# DensityDistributor) in front of the same classifier type.
estimators = [
    ('GBC', GradientBoostingClassifier()),
    (
        'SMOTE+GBC',
        Pipeline([
            ('smote', SMOTE()),
            ('gbc', GradientBoostingClassifier()),
        ]),
    ),
    (
        'KMeanSMOTE+GBC',
        Pipeline([
            (
                'smote',
                SMOTE(
                    clusterer=KMeans(n_init=1),
                    distributor=DensityDistributor(),
                ),
            ),
            ('gbc', GradientBoostingClassifier()),
        ]),
    ),
]

# Define parameters grid
param_grids = [{
    'SMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5],
    'SMOTE+GBC__gbc__max_depth': [2, 4]
}, {
    'KMeanSMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5],
    'KMeanSMOTE+GBC__smote__clusterer__n_clusters':
    [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
Beispiel #4
0
def generate_oversamplers(oversamplers_names):
    """Build the list of ``(name, oversampler, param_grid)`` entries.

    The full list contains the plain oversamplers plus their K-Means and SOM
    clustering variants. ``oversamplers_names`` controls post-processing:

    * ``'basic'``: the list is passed through ``select_pipelines`` together
      with the names of the six non-clustering oversamplers.
    * ``'scaled'``: as ``'basic'``, then ``append_transformer`` is called
      with a ``MinMaxScaler()``.
    * ``'undersampled'``: as ``'basic'``, then ``set_sampling_strategy`` is
      called with a callable built on ``generate_sampling_strategy(y, 1 / 3)``
      and ``append_transformer`` is called with a ``RandomUnderSampler``
      whose strategy is ``generate_sampling_strategy(y, 3)``.
    * any other value: the full list is returned untouched.

    Returns whatever the last helper applied returns (the plain list when no
    helper is applied).
    """

    # Each builder returns a brand-new dict with brand-new lists, so no grid
    # object is shared between two entries.
    def neighbors_grid():
        return {'k_neighbors': [3, 5]}

    def geometric_grid():
        return {
            'selection_strategy': ['combined', 'minority', 'majority'],
            'truncation_factor': [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
            'deformation_factor': [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
        }

    def clustering_grid():
        return {
            'clusterer__n_clusters': [
                0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0
            ],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0],
        }

    def distribution_ratio_grid():
        return {
            'distributor__distribution_ratio': [0.0, 0.25, 0.5, 0.75, 1.0],
        }

    def combine(*parts):
        # Merge the partial grids left to right, keeping key insertion order.
        grid = {}
        for part in parts:
            grid.update(part)
        return grid

    oversamplers = [
        ('NO OVERSAMPLING', None, {}),
        ('RANDOM OVERSAMPLING', RandomOverSampler(), {}),
        ('SMOTE', SMOTE(), neighbors_grid()),
        ('BORDERLINE SMOTE', BorderlineSMOTE(), neighbors_grid()),
        ('ADASYN', ADASYN(), {'n_neighbors': [2, 3]}),
        (
            'G-SMOTE',
            GeometricSMOTE(),
            combine(neighbors_grid(), geometric_grid()),
        ),
        (
            'K-MEANS RANDOM OVERSAMPLING',
            RandomOverSampler(
                clusterer=KMeans(),
                distributor=DensityDistributor(),
            ),
            # NOTE(review): 'k_neighbors' is included in this grid for
            # RandomOverSampler -- confirm the estimator accepts it.
            combine(neighbors_grid(), clustering_grid()),
        ),
        (
            'K-MEANS SMOTE',
            SMOTE(clusterer=KMeans(), distributor=DensityDistributor()),
            combine(neighbors_grid(), clustering_grid()),
        ),
        (
            'K-MEANS BORDERLINE SMOTE',
            BorderlineSMOTE(
                clusterer=KMeans(),
                distributor=DensityDistributor(),
            ),
            combine(neighbors_grid(), clustering_grid()),
        ),
        (
            'K-MEANS G-SMOTE',
            GeometricSMOTE(
                clusterer=KMeans(),
                distributor=DensityDistributor(),
            ),
            combine(neighbors_grid(), geometric_grid(), clustering_grid()),
        ),
        (
            'SOMO',
            SMOTE(clusterer=SOM(), distributor=DensityDistributor()),
            combine(
                neighbors_grid(),
                clustering_grid(),
                distribution_ratio_grid(),
            ),
        ),
        (
            'G-SOMO',
            GeometricSMOTE(
                clusterer=SOM(),
                distributor=DensityDistributor(),
            ),
            combine(
                neighbors_grid(),
                geometric_grid(),
                clustering_grid(),
                distribution_ratio_grid(),
            ),
        ),
    ]

    # Anything but the three recognised modes gets the complete list.
    if oversamplers_names not in ('basic', 'scaled', 'undersampled'):
        return oversamplers

    basic_names = ('NO OVERSAMPLING', 'RANDOM OVERSAMPLING', 'SMOTE',
                   'BORDERLINE SMOTE', 'ADASYN', 'G-SMOTE')
    selected = select_pipelines(oversamplers, basic_names)

    if oversamplers_names == 'scaled':
        return append_transformer(MinMaxScaler(), selected)

    if oversamplers_names == 'undersampled':
        rebalanced = set_sampling_strategy(
            lambda y: generate_sampling_strategy(y, 1 / 3), selected)
        undersampler = RandomUnderSampler(
            sampling_strategy=lambda y: generate_sampling_strategy(y, 3))
        return append_transformer(undersampler, rebalanced)

    # 'basic': selection only.
    return selected
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths: both are assembled from the directory of this file
# (dirname(__file__)) going two levels up into the 'data' folder.
datasets_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'binary-numerical-imbalanced',
)
results_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'results',
    'kmeans-oversampling',
    'smote',
)

# Oversamplers and classifiers. Entries are (name, oversampler) or
# (name, oversampler, grid) tuples; grid keys containing '__' address
# parameters of the nested clusterer / distributor objects.
oversamplers = [
    ('NO OVERSAMPLING', None),
    (
        'SMOTE',
        SMOTE(random_state=0),
        {'k_neighbors': [3, 5]},
    ),
    (
        'K-MEANS SMOTE',
        SMOTE(
            clusterer=KMeans(random_state=1, n_init=1),
            distributor=DensityDistributor(),
            random_state=0,
        ),
        {
            'k_neighbors': [3, 5],
            'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
            'distributor__distances_exponent': [0, 1, 2],
            'distributor__filtering_threshold': [0.5, 1.0],
        },
    ),
]
classifiers = [('LR', LogisticRegression()),
               ('KNN', KNeighborsClassifier(), {
                   'n_neighbors': [3, 5]
Beispiel #6
0
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans, SOM, AgglomerativeClustering, Birch, SpectralClustering
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths: both are assembled from the directory of this file
# (dirname(__file__)) going two levels up into the 'data' folder.
datasets_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'binary-numerical-imbalanced',
)
results_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'results',
    'clustering-smote',
)

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('SMOTE', SMOTE(random_state=0), {
        'k_neighbors': [3, 4, 5]
    }),
    ('K-MEANS SMOTE',
     SMOTE(clusterer=KMeans(random_state=1),
           distributor=DensityDistributor(),
           random_state=0), {
               'k_neighbors': [3, 4, 5],
               'clusterer__n_clusters':
               [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
               'distributor__distances_exponent': [0, 1, 2, 5],
               'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
           }),
    ('SOMO',
     SMOTE(clusterer=SOM(), distributor=DensityDistributor(),
           random_state=0), {
Beispiel #7
0
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths: both are assembled from the directory of this file
# (dirname(__file__)) going two levels up into the 'data' folder.
datasets_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'binary-numerical-imbalanced',
)
results_path = join(
    dirname(__file__),
    '..',
    '..',
    'data',
    'results',
    'kmeans-oversampling',
    'borderline-smote',
)

# Oversamplers and classifiers. Entries are (name, estimator) or
# (name, estimator, grid) tuples; grid keys containing '__' address
# parameters of the nested clusterer / distributor objects. Both SMOTE
# instances are created with kind='borderline1'.
oversamplers = [
    ('NO OVERSAMPLING', None),
    (
        'BORDERLINE-SMOTE',
        SMOTE(random_state=0, kind='borderline1'),
        {'k_neighbors': [3, 5]},
    ),
    (
        'K-MEANS BORDERLINE-SMOTE',
        SMOTE(
            clusterer=KMeans(random_state=1, n_init=1),
            distributor=DensityDistributor(),
            random_state=0,
            kind='borderline1',
        ),
        {
            'k_neighbors': [3, 5],
            'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
            'distributor__distances_exponent': [0, 1, 2],
            'distributor__filtering_threshold': [0.5, 1.0],
        },
    ),
]
classifiers = [
    ('LR', LogisticRegression()),
    (
        'KNN',
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5]},
    ),
    (
        'DT',
        DecisionTreeClassifier(random_state=2),
        {'max_depth': [3, 6]},
    ),
    (
        'GBC',
        GradientBoostingClassifier(random_state=3),
        {
            'max_depth': [3, 6],
            'n_estimators': [50, 100],
        },
    ),
]