Esempio n. 1
0
def test_function_sampler_reject_sparse():
    """Check that ``fit`` raises ``TypeError`` for sparse input when
    the sampler is created with ``accept_sparse=False``."""
    expected_message = "A sparse matrix was passed, but dense data is required"
    dense_only_sampler = FunctionSampler(accept_sparse=False)
    csr_input = sparse.csr_matrix(X)
    with pytest.raises(TypeError, match=expected_message):
        dense_only_sampler.fit(csr_input, y)
Esempio n. 2
0
def test_function_sampler_func(X, y):
    """Check that ``fit_resample`` returns the output of the wrapped function."""
    head = slice(None, 10)

    def keep_head(X, y):
        # Keep only the first ten samples of both arrays.
        return X[head], y[head]

    resampled_X, resampled_y = FunctionSampler(func=keep_head).fit_resample(X, y)
    assert_allclose_dense_sparse(resampled_X, X[head])
    assert_array_equal(resampled_y, y[head])
Esempio n. 3
0
def test_function_sampler_reject_sparse():
    """Check that ``fit_resample`` raises ``TypeError`` for sparse input when
    the sampler is created with ``accept_sparse=False``."""
    sparse_X = sparse.csr_matrix(X)
    strict_sampler = FunctionSampler(accept_sparse=False)
    expected = "A sparse matrix was passed, but dense data is required"
    with pytest.raises(TypeError, match=expected):
        strict_sampler.fit_resample(sparse_X, y)
def test_function_sampler_func(X, y):
    """Check that the resampled output equals the first ten samples kept by
    the function handed to ``FunctionSampler``."""

    def first_ten(X, y):
        return X[:10], y[:10]

    sampler = FunctionSampler(func=first_ten)
    resampled = sampler.fit_resample(X, y)
    X_out, y_out = resampled
    assert_allclose_dense_sparse(X_out, X[:10])
    assert_array_equal(y_out, y[:10])
Esempio n. 5
0
def test_function_sampler_func_kwargs(X, y):
    """Check that ``kw_args`` are forwarded to the wrapped function.

    The wrapped function delegates to ``RandomUnderSampler``; its output must
    match a direct ``RandomUnderSampler(random_state=0)`` call.
    """

    def func(X, y, sampling_strategy, random_state):
        # ``sampling_strategy`` / ``fit_resample`` replace the removed
        # ``ratio`` / ``fit_sample`` spellings of the imbalanced-learn API.
        rus = RandomUnderSampler(sampling_strategy=sampling_strategy,
                                 random_state=random_state)
        return rus.fit_resample(X, y)

    sampler = FunctionSampler(func=func,
                              kw_args={'sampling_strategy': 'auto',
                                       'random_state': 0})
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y)
    assert_allclose_dense_sparse(X_res, X_res_2)
    assert_array_equal(y_res, y_res_2)
def test_function_sampler_func_kwargs(X, y):
    """Check that ``kw_args`` reach the wrapped function as keyword arguments."""
    forwarded = {"sampling_strategy": "auto", "random_state": 0}

    def func(X, y, sampling_strategy, random_state):
        # Delegate to a real under-sampler configured from the forwarded kwargs.
        return RandomUnderSampler(
            sampling_strategy=sampling_strategy, random_state=random_state
        ).fit_resample(X, y)

    X_res, y_res = FunctionSampler(func=func, kw_args=forwarded).fit_resample(X, y)
    X_expected, y_expected = RandomUnderSampler(random_state=0).fit_resample(X, y)
    assert_allclose_dense_sparse(X_res, X_expected)
    assert_array_equal(y_res, y_expected)
Esempio n. 7
0
def train(lang='pt'):
    """Train the model for ``lang`` using a resampling data generator.

    Loads the tokenized data and the embedding matrix, resumes from the model
    returned by ``load_lastest`` when there is one (otherwise builds a new
    model with ``generate_model``), and fits it with checkpoint,
    early-stopping, TensorBoard, CSV-log and learning-rate-reduction
    callbacks.

    :param lang: language code forwarded to the data/model loaders and
        embedded in the checkpoint file names.
    """
    params = PARAMS.copy()
    initial_epoch = 0
    X, Y = util.get_X_Y(data_type='keras_tokenized_tri', lang=lang, file_type="dump")
    X = np.asarray(X)
    params['embedding_matrix'] = load_embedding_matrix(name="fasttext_sg_tri_8", tokenizer='keras_tokenized_tri', lang=lang, model_type="dump")
    params["vocab_size"] = params['embedding_matrix'].shape[0]
    params["embedding_dim"] = params['embedding_matrix'].shape[1]

    # exist_ok=True replaces the check-then-create pattern, which can race
    # with another process creating the same directory.
    os.makedirs(PATH, exist_ok=True)
    os.makedirs(PATH + 'log_dir', exist_ok=True)

    #params["loss"] = util.focal_loss(gamma=5.,alpha=1588)
    lastest_model = load_lastest(lang=lang)
    if lastest_model is None:
        model, params = generate_model(params)
    else:
        # load_lastest returned (model, epoch to resume from).
        model = lastest_model[0]
        initial_epoch = lastest_model[1]

    print(model.metrics_names)

    params['sampler'] = FunctionSampler(func=balance_dataset,
                                        kw_args={'cut_off': 0.5,
                                                 'random_state': 42})

    data_generator = DataGenerator(X, Y, lang=lang, process_x=process_x, process_y=process_y, batch_size=PARAMS['batch_size'], sampler=params['sampler'])
    #data_generator.remove_reliable_0(pct=1.0)
    validation_data = data_generator.get_validation_data()
    # Show the first five inputs/targets of the first batch. The batch is
    # fetched twice, exactly as before, in case fetching has side effects.
    print('data_generator.x: ', data_generator[0][0][0:5])
    print('data_generator.y: ', data_generator[0][1][0:5])

    #params["class_weights"]= data_generator.get_classes_weights()

    reduce_lr = ReduceLROnPlateau(monitor='val_categorical_accuracy', factor=0.2, patience=3, verbose=1)
    early_stopping = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0.02, patience=10, verbose=1)
    csv_logger = CSVLogger(PATH+'traning.log', append=True)
    tensorboard_callback = TensorBoard(log_dir=PATH+'log_dir', batch_size=params["batch_size"])
    model_checkpoint = ModelCheckpoint(filepath=PATH+'weights-{epoch:03d}-{val_categorical_accuracy:.4f}-'+lang+'.hdf5',
                                       monitor='val_categorical_accuracy',
                                       verbose=1,
                                       mode='max')
    params["callbacks"] = [model_checkpoint, early_stopping, tensorboard_callback, csv_logger, reduce_lr]

    # NOTE(review): "class_weights" is never set in this function (the
    # assignment above is commented out), so it must come from PARAMS or
    # generate_model -- confirm.
    model.fit_generator(data_generator,
                        epochs=params["epochs"],
                        verbose=1,
                        callbacks=params["callbacks"],
                        validation_data=validation_data,
                        #workers=7,
                        #use_multiprocessing=True,
                        class_weight=params["class_weights"],
                        initial_epoch=initial_epoch)
Esempio n. 8
0
    def fit(self,
            x_train: pd.DataFrame,
            meta_train: pd.DataFrame,
            validation_group=None):
        """
        Trains the model by cross validation using the parameters of the object.
        For this method, the cv generator is taken to be the validation_method. train_test_method which was used
        in the run() method to split the set into train and test is not used in this case.

        :param x_train: samples with features
        :param meta_train: meta data with target and validation columns
        :param validation_group: an array with the validation group values.
            if none is given then the meta_train is indexed to get the target and validation columns. If an array is given
            then the meta_train is taken to be just the target variable that will be used to fit the model
        :return: best set of hyperparameters from crossvalidation and the feature coefficients
        """
        # Seeds numpy's global random state so repeated calls are reproducible.
        np.random.seed(11235)

        if validation_group is None:
            target = meta_train.loc[:, self.target_column]
            set_for_kfold = meta_train.loc[:, self.validation_group]
        else:
            target = meta_train
            set_for_kfold = validation_group
        xtrain, ytrain = self.resampler.fit_resample(X=x_train, y=target)
        # isinstance avoids building a throwaway FunctionSampler on every call
        # just to compare types.
        # NOTE(review): a FunctionSampler constructed with a custom ``func``
        # also takes the "no resampling" path here -- confirm that is intended.
        if not isinstance(self.resampler, FunctionSampler):
            # we change set_for_kfold to ytrain if we resampled ytrain since it no longer has the same shape as
            # the validation_group variable. This means that resampling can't work when the foldgenerator used to split data
            # into validation folds is set to GroupKFold and the validation_group is not equal to the targets. This is
            # because we don't know how to resample another variable other than the target. eg if target is water colour
            # and validation_group is the area number, we wouldn't be able to resample the water colour AND find what
            # area number the new samples would take.
            set_for_kfold = ytrain
        # Scaling the train set
        xtrain = self.css.fit_transform(xtrain)
        xtrain = self.scaler.fit_transform(xtrain)

        # Splitting the train set into validation folds that will be used in training
        validation_sets = self.validation_method.split(xtrain,
                                                       y=set_for_kfold,
                                                       groups=set_for_kfold)

        # Perform grid CV using Kfolds as folds.
        self.model = mth.CV_models(grid=self.grid,
                                   estimator=self.estimator,
                                   parameter_search=self.cv,
                                   **self.kwargs)
        # The estimator object is an sklearn classifier
        set_parameters, estimator, set_coef = self.model.fit(
            features=xtrain, target=ytrain, ksets=validation_sets)

        return set_parameters, set_coef
def test_function_sampler_validate():
    """Check that a regression target can go through the sampler when
    validation is switched off."""
    X, y = make_regression()

    def dummy_sampler(X, y):
        # Draw 100 row positions (np.random.choice samples with replacement
        # by default) and index both arrays with them.
        row_ids = np.arange(X.shape[0])
        picked = np.random.choice(row_ids, size=100)
        return _safe_indexing(X, picked), _safe_indexing(y, picked)

    pipeline = make_pipeline(
        FunctionSampler(func=dummy_sampler, validate=False),
        LinearRegression(),
    )
    pipeline.fit(X, y)
    y_pred = pipeline.predict(X)

    assert type_of_target(y_pred) == "continuous"
Esempio n. 10
0
def test_balanced_bagging_classifier_with_function_sampler(replace):
    """Check that ``BalancedBaggingClassifier`` accepts a ``FunctionSampler``."""
    X, y = make_classification(
        n_samples=1_000,
        n_features=10,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )

    def roughly_balanced_bagging(X, y, replace=False):
        """Implementation of Roughly Balanced Bagging for binary problem."""
        # Identify the least and most frequent labels.
        counts = Counter(y)
        minority_label = min(counts, key=counts.get)
        majority_label = max(counts, key=counts.get)

        # The majority sample size is drawn from a negative binomial
        # distribution parameterised by the minority class size.
        n_minority = counts[minority_label]
        n_majority_drawn = np.random.negative_binomial(n=n_minority, p=0.5)

        def draw(label, how_many):
            # Randomly pick positions of ``label``, with or without replacement.
            return np.random.choice(
                np.flatnonzero(y == label), size=how_many, replace=replace
            )

        majority_rows = draw(majority_label, n_majority_drawn)
        minority_rows = draw(minority_label, n_minority)
        rows = np.hstack([majority_rows, minority_rows])

        return X[rows], y[rows]

    # Roughly Balanced Bagging
    rbb_sampler = FunctionSampler(
        func=roughly_balanced_bagging, kw_args={"replace": replace}
    )
    rbb = BalancedBaggingClassifier(
        base_estimator=CountDecisionTreeClassifier(),
        n_estimators=2,
        sampler=rbb_sampler,
    )
    rbb.fit(X, y)

    for fitted in rbb.estimators_:
        per_class = fitted[-1].class_counts_
        assert (per_class[0] / per_class[1]) > 0.8
Esempio n. 11
0
def test_function_resampler_fit():
    """Check that ``fit`` and ``fit_resample`` run on data containing NaN and
    inf when ``validate=False``.

    Non-regression test for:
    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/782
    """
    non_finite_X = np.array([[1, np.nan], [2, 3], [np.inf, 4]])
    labels = np.array([0, 1, 1])

    def func(X, y):
        # Keep only the first sample.
        return X[:1], y[:1]

    unvalidated = FunctionSampler(func=func, validate=False)
    unvalidated.fit(non_finite_X, labels)
    unvalidated.fit_resample(non_finite_X, labels)
Esempio n. 12
0
def test_function_sampler_identity(X, y):
    """Check that a ``FunctionSampler`` built without ``func`` returns its
    input unchanged."""
    resampled_X, resampled_y = FunctionSampler().fit_resample(X, y)
    assert_allclose_dense_sparse(resampled_X, X)
    assert_array_equal(resampled_y, y)
Esempio n. 13
0
        size=n_majority_resampled,
        replace=replace,
    )
    minority_indices = np.random.choice(
        np.flatnonzero(y == minority_class),
        size=n_minority_class,
        replace=replace,
    )
    indices = np.hstack([majority_indices, minority_indices])

    return X[indices], y[indices]


# Roughly Balanced Bagging: plug the custom resampling function into the
# bagging classifier and score it by cross-validation.
rbb = BalancedBaggingClassifier(
    sampler=FunctionSampler(
        func=roughly_balanced_bagging,
        kw_args={"replace": True},
    ),
)
cv_results = cross_validate(rbb, X, y, scoring="balanced_accuracy")

# Report mean and standard deviation of the test scores.
print(
    "{:.3f} +/- {:.3f}".format(
        cv_results["test_score"].mean(), cv_results["test_score"].std()
    )
)


# %% [markdown]
# .. topic:: References:
#
#    .. [1] R. Maclin, and D. Opitz. "An empirical evaluation of bagging and
#           boosting." AAAI/IAAI 1997 (1997): 546-551.
#
#    .. [2] S. Wang, and X. Yao. "Diversity analysis on imbalanced data sets by
#           using ensemble models." 2009 IEEE symposium on computational
#           intelligence and data mining. IEEE, 2009.
Esempio n. 14
0
#
# Instead of repeating the same samples when over-sampling or perturbing the
# generated bootstrap samples, one can use some specific heuristic instead.
# :class:`~imblearn.over_sampling.ADASYN` and
# :class:`~imblearn.over_sampling.SMOTE` can be used in this case.

# %%
from imblearn import FunctionSampler  # to use a idendity sampler
from imblearn.over_sampling import SMOTE, ADASYN

X, y = create_dataset(n_samples=150, weights=(0.1, 0.2, 0.7))

# One subplot per sampler, laid out on a 2x2 grid.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))

# FunctionSampler() without a function leaves the data untouched, so the
# first panel shows the original dataset.
samplers = [
    FunctionSampler(),
    RandomOverSampler(random_state=0),
    SMOTE(random_state=0),
    ADASYN(random_state=0),
]

for ax, sampler in zip(axs.ravel(), samplers):
    if isinstance(sampler, FunctionSampler):
        title = "Original dataset"
    else:
        title = None
    plot_resampling(X, y, sampler, ax, title=title)
fig.tight_layout()

# %% [markdown]
# The following plot illustrates the difference between
# :class:`~imblearn.over_sampling.ADASYN` and
# :class:`~imblearn.over_sampling.SMOTE`.
Esempio n. 15
0
    print('Debug mode on')
# TODO: remove highly correlated features
# TODO: feature selection

# Create pipeline. The 'passthrough' stages are placeholders that the search
# spaces below may replace.
pipe = Pipeline([
    ('outlier', 'passthrough'),
    ('selection', 'passthrough'),
    ('scale', RobustScaler()),
    ('model', 'passthrough'),
])

# Specify parameters to be searched over for the SVC model: the outlier
# stage is chosen between the two FunctionSampler candidates.
svc_search = {
    'outlier': Categorical([
        FunctionSampler(func=isof),
        FunctionSampler(func=lof),
    ]),
    'model': [SVC(class_weight='balanced', decision_function_shape='ovo')],
    'model__kernel': Categorical(['rbf']),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
}

xgb_search = {
    'model': [XGBClassifier(feature_names=X_train.columns)],
    'scale': ['passthrough'],
    'model__max_delta_step': Integer(0, 20),
    'model__colsample_bylevel': (0.01, 1.0, 'uniform'),
    'model__learning_rate': (0.01, 1.0, 'log-uniform'),
    'model__n_estimators': Integer(60, 400),
    'model__max_depth': Integer(3, 12),
    # 'model__scale_pos_weight': Real(1, 1000, 'log-uniform'), only binary
def train(lang='pt'):
    """Train and evaluate one freshly built model per stratified fold.

    For every fold a new resampler and model are created, the model is fitted
    on the training split through a ``DataGenerator``, predictions on the
    held-out split are scored with ``util.evaluate``, the model is saved, and
    the Keras session is cleared before the next fold.

    :param lang: language code forwarded to the data/model loaders and
        embedded in the saved model file names.
    """
    params = PARAMS.copy()
    initial_epoch = 0
    X, Y = util.get_X_Y(data_type='keras_tokenized_tri', lang=lang, file_type="dump")
    X = np.asarray(X)
    params['embedding_matrix'] = load_embedding_matrix(name="fasttext_sg_tri_8", tokenizer='keras_tokenized_tri', lang=lang, model_type="dump")
    params["vocab_size"] = params['embedding_matrix'].shape[0]
    params["embedding_dim"] = params['embedding_matrix'].shape[1]

    # exist_ok=True replaces the check-then-create pattern, which can race
    # with another process creating the same directory.
    os.makedirs(PATH, exist_ok=True)
    os.makedirs(PATH + 'log_dir', exist_ok=True)

    skf = StratifiedKFold(n_splits=params['k-folds'], shuffle=True)
    for kfold_count, (train_index, test_index) in enumerate(skf.split(X, Y), start=1):
        print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]

        # A fresh sampler with a new random seed for each fold.
        params['sampler'] = FunctionSampler(func=balance_dataset,
                                            kw_args={'cut_off': 0.5,
                                                     'random_state': np.random.randint(0, 100000)})

        model, params = generate_model(params)

        print(model.metrics_names)

        # Only the training generator resamples; the validation generator
        # is built without a sampler.
        data_g_train = DataGenerator(X_train, Y_train, lang=lang, process_x=process_x, process_y=process_y, sampler=params['sampler'], batch_size=PARAMS['batch_size'], separate_val=False)
        data_g_val = DataGenerator(X_test, Y_test, lang=lang, process_x=process_x, process_y=process_y, batch_size=PARAMS['batch_size'], separate_val=False)
        # The batch is fetched twice, exactly as before, in case fetching
        # has side effects.
        print('data_generator.x: ', data_g_train[0][0][0:5])
        print('data_generator.y: ', data_g_train[0][1][0:5])

        #params["class_weights"]= data_generator.get_classes_weights()

        # The ReduceLROnPlateau, EarlyStopping and ModelCheckpoint callbacks
        # that used to be built here were never passed to fit_generator, so
        # they have been removed.
        csv_logger = CSVLogger(PATH+'traning.log', append=True)
        tensorboard_callback = TensorBoard(log_dir=PATH+'log_dir', batch_size=params["batch_size"])
        clr = CyclicLR(base_lr=1e-3, max_lr=2e-3,
                       step_size=300., mode='exp_range',
                       gamma=0.99994)

        params["callbacks"] = [tensorboard_callback, csv_logger, clr]

        model.fit_generator(data_g_train,
                            epochs=params["epochs"],
                            verbose=1,
                            callbacks=params["callbacks"],
                            #workers=7,
                            #use_multiprocessing=True,
                            class_weight=params["class_weights"],
                            initial_epoch=initial_epoch)

        # Predict the held-out split in batches of ``batch_val`` and collect
        # the arg-max class of predictions and targets.
        batch_val = 1000
        data_g_val.set_batch_size(batch_val)
        y_pred = np.zeros(Y_test.shape)
        y_val = np.zeros(Y_test.shape)
        for i, (x, y) in enumerate(data_g_val):
            y_pred[i*batch_val:(i+1)*batch_val] = np.argmax(model.predict(x), axis=1)
            y_val[i*batch_val:(i+1)*batch_val] = np.argmax(y, axis=1)
        result = util.evaluate(y_val, y_pred)
        print('Model '+NAME+' val score on '+lang+', k-fold-'+str(kfold_count)+': ', result)
        model.save(PATH+'weights-{epoch:03d}-kfold{fold}-{result:.4f}-{lang}.hdf5'.format(epoch=params["epochs"], result=result, lang=lang, fold=kfold_count))
        # Release per-fold objects and reset the backend before the next fold.
        del data_g_train, data_g_val
        del model
        del X_train, Y_train, X_test, Y_test
        K.clear_session()
        gc.collect()
Esempio n. 17
0
    def __init__(self,
                 meta_data: pd.DataFrame,
                 estimator,
                 grid: dict = None,
                 estimator_name=None,
                 train_test_column="group",
                 target_column="target",
                 names=(None, None),
                 train_test_split_method=StratifiedKFold(n_splits=7),
                 css_normalisation=CSSNormaliser(identity=True),
                 validation_method_group: tuple = (None, None),
                 scaler=FunctionTransformer(validate=False),
                 resampler=FunctionSampler(),
                 cv_suffix: str = "_cv",
                 features=None,
                 **kwargs):
        """
        Each instance of this class is a classification experiment which stores all the configurations and the results.
        This object is used to run the experiment procedure, using all the attributes passed or implied by its creation.
        The only parameters necessary for the creation of an experiment are the meta-data Dataframe (properly formatted)
        and an estimator (from the default ones).

        .. note::

            A properly formatted meta-data dataframe has the values of the response (target) variable under the column
            `target`, and the values of the variable used to split the dataset into train and test sets (and the
            train into validation sets) under the column `group`. For the purposes of the paper, the meta-data dataframe
            is the wwfdf csv found in `data/processed`. The target values can be either under the column `Water` or
            `River_loc`.

        A minimal example initialising an Experiment object is given below

        example::

            from sklearn.ensemble import RandomForestClassifier
            import pandas as pd
            # project_dir is the path to the project directory
            wwfdf = pd.read_csv(project_dir+"data/processed/wwfdf",index_col = 0)
            # creating the object
            experiment_object = Experiment(meta_data = wwfdf, estimator = RandomForestClassifier,
                                           target_column = "Water",train_test_column = "Area_group")

        :param meta_data: The meta-data dataframe that contains the target, train_test_group and validation_group columns.
            The default names for the columns are "target" for the target variable, and "group" for the train_test_group
            and validation_group
        :param target_column: The column in the meta_data dataframe where the values of the target variable are found.
            default is `target`
        :param train_test_column: The column in the meta_data dataframe where the values of the variable used to split
            the samples into train and test sets are found. default is `group`
        :param train_test_split_method: StratifiedKFold or GroupKFold
        :param estimator: estimator object,
        :param grid: grid of hyperparameters. ``None`` or the string "default" (case-insensitive) selects the
            default grid via ``default_grid()``
        :param estimator_name: name of estimator. If not given, the type name of ``estimator`` is used
        :param css_normalisation: CSSNormalisation object
        :param validation_method_group: A tuple which specifies the method used to split the train set into folds,
                                        and which column of the meta_data is going to be used as the group variable
                                        (StratifiedKFold or GroupKFold,column_name_for_grouping)
        :param scaler: sklearn.preprocessing method
        :param resampler: imblearn.over_sampler method
        :param cv_suffix: str
            Choose from {"_cv","_bcv","_rcv"}
        :param names: (str,str)
            Names of feature and meta data set
        :param features: pandas DataFrame
            The features to be used for running the experiment or for fitting. If not given then when calling the run
            method of the object, they have to be passed together with meta_data
        :param kwargs: extra keyword arguments, stored unchanged on the instance
        """
        # NOTE(review): the default split method, normaliser, scaler and
        # resampler are created once at definition time, so every instance
        # relying on the defaults shares the same objects -- confirm this is
        # acceptable for objects that get fitted.
        self.names = names
        self.target_column = mth.checking_columns(dataframe=meta_data,
                                                  column=target_column,
                                                  x=target_column)
        self.y_true = meta_data.loc[:, self.target_column]

        # Get group column if it is present in the meta_data frame. If not, the target column is used
        self.train_test_split_column = mth.checking_columns(
            meta_data,
            train_test_column,
            x=train_test_column,
            handle=lambda x: self.target_column)

        self.css = css_normalisation
        self.train_test_split_method = train_test_split_method
        # If no validation method is given then the validation method is the same as the train-test splitting method
        if not validation_method_group[0]:
            self.validation_method = train_test_split_method
        # Otherwise the validation method is the first element of the tuple validation_method_group
        else:
            self.validation_method = validation_method_group[0]

        # The validation group column is set to the train-test split column if not specified in the validation_method_group
        self.validation_group = mth.checking_columns(
            dataframe=meta_data,
            column=validation_method_group[1],
            x=validation_method_group[1],
            handle=lambda x: self.train_test_split_column)

        self.scaler = scaler
        self.resampler = resampler
        self.cv = cv_suffix
        self.grid = grid
        self.estimator = estimator
        # In the case the user interacts directly with this class but they don't want to choose a grid themselves
        # or want the default
        wants_default_grid = self.grid is None or (
            isinstance(self.grid, str) and self.grid.lower() == "default")
        if wants_default_grid:
            self.default_grid()
        if estimator_name is None:
            self.estimator_name = type(self.estimator).__name__
        else:
            self.estimator_name = estimator_name
        self.meta_data = meta_data
        self.features = features
        self.kwargs = kwargs
# :class:`imblearn.FunctionSampler` will be called when using the method
# ``fit_resample``.


def outlier_rejection(X, y):
    """This will be our function used to resample our dataset.

    Fits an ``IsolationForest`` on ``X`` and keeps only the rows of ``X`` and
    ``y`` whose prediction equals 1.
    """
    # The ``behaviour`` argument was removed from IsolationForest and raises
    # a TypeError on current scikit-learn releases, so it is no longer passed.
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]


# Wrap the rejection function in a sampler and apply it to the training set.
reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before the training, the classifier will be less
# affected during the prediction.

pipe = make_pipeline(
    FunctionSampler(func=outlier_rejection),
    LogisticRegression(
        solver='lbfgs',
        multi_class='auto',
        random_state=rng,
    ),
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
Esempio n. 19
0
    def addPreProcess(self):
        """Wrap ``self.model`` in a ``Pipeline`` preceded by the requested
        pre-processing steps.

        The names in ``self.preProcessingList`` are handled in order; names
        that are not recognised are ignored. The current ``self.model`` becomes
        the last pipeline step, named 'regression', and ``self.model`` is then
        replaced by the pipeline.
        """
        pipeline_steps = []

        for method in self.preProcessingList:

            if method == 'mrmr':
                # MRMR receives copies of the precomputed feature/feature and
                # feature/target correlation matrices. They cannot be grid
                # searched, so they are set once here.
                # Tunable args: n_components (default 20) and thresh, the
                # correlation above which one feature is removed (default 1.0).
                step = ('mrmr',
                        MRMR(self.feature_correlations.copy(),
                             self.target_correlations.copy(),
                             self.keys))

            elif method == 'select_corr':
                step = ('select_corr',
                        Selector(score_func=np.corrcoef,
                                 labels=self.keys,
                                 info_vector=self.target_correlations,
                                 random_state=self.seed))

            elif method == 'select_mut':
                # Mutual-information selection; n_components is the number of
                # components with the highest mutual info to keep (default 20).
                step = ('select_mut',
                        Selector(score_func=mutual_info_regression,
                                 labels=self.keys,
                                 info_vector=self.target_muInf,
                                 random_state=self.seed))

            elif method == 'whitening':
                # Whitening = PCA on all variables plus scaling to unit
                # variance; n_components defaults to all components.
                step = ('whitening',
                        PCA(whiten=True, random_state=self.seed))

            elif method == 'standardization':
                step = ('standardization', StandardScaler())

            elif method == 'PCA':
                # n_components defaults to all components.
                step = ('PCA', PCA(random_state=self.seed))

            elif method == 'outliers':
                step = ('outliers', FunctionSampler(func=remove_outliers))

            elif method == 'upsample':
                step = ('upsampling', UpSampling(random_state=self.seed))

            elif method == 'downsample':
                step = ('downsampling', DownSampling(random_state=self.seed))

            else:
                # other pre-processing steps ?
                continue

            pipeline_steps.append(step)

        pipeline_steps.append(('regression', self.model))
        self.model = Pipeline(steps=pipeline_steps)
Esempio n. 20
0
File: tree.py Progetto: daniekie/aml
# Create pipeline. The 'outlier' stage is a placeholder that the parameter
# grid below fills in.
pipe = Pipeline([
    ('remove const cloumns', VarianceThreshold(threshold=0)),
    ('outlier', 'passthrough'),
    # ('sample', NearMiss()),
    ('scale', StandardScaler()),
    ('selection', GenericUnivariateSelect()),
    ('estimation', XGBClassifier()),
])

# Specify parameters to be searched over: the outlier-removal function and
# the univariate feature-selection settings.
param_grid = [
    {
        'outlier': [
            FunctionSampler(func=isof),
            FunctionSampler(func=lof),
        ],
        'selection__mode': ['fpr'],
        'selection__param': [0.00005, 0.0001, 0.001, 1],
        'selection__score_func': [f_classif],
    },
]

# Gridsearch
search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    n_jobs=-1,
    scoring='balanced_accuracy',
    cv=2,
)
print("Performing grid search...")
print("pipeline:", [step[0] for step in pipe.steps])
    return X[y_pred == 1], y[y_pred == 1]


def plot_scatter(X, y, title):
    """Draw the first two columns of ``X`` as a scatter plot, one colour per
    class value (1 and 0) found in ``y``, on a new 16x16 figure."""
    plt.figure(figsize=(16, 16))
    # (class value, colour, legend label), drawn in this order.
    class_styles = (
        (1, 'r', 'Class #1 - Fraud'),
        (0, 'b', 'Class #0 - Non-Fraud'),
    )
    for class_value, colour, legend_label in class_styles:
        members = y == class_value
        plt.scatter(X[members, 0], X[members, 1], c=colour, label=legend_label)
    plt.legend()
    plt.title(title)


# Apply the outlier-rejection sampler to the raw arrays behind X and y.
reject_sampler = FunctionSampler(func=outlier_rejection)
X_vals, y_vals = X.values, y.values
X_inliers, y_inliers = reject_sampler.fit_resample(X_vals, y_vals)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

# Report how many rows were dropped and the resulting sizes.
print("Total outliers removed: {:}".format(len(X_vals) - len(X_inliers)))
print(
    "New lenght of X: {} ; new length of y {}".format(
        len(X_inliers), len(y_inliers)
    )
)


def nosampling_pipeline(data=[], verbose=False, clean=False, plot=False):

    results_table = []
    results = []
    rand_state = 42
# our dataset during training. The function passed to the
# :class:`imblearn.FunctionSampler` will be called when using the method
# ``fit_resample``.


def outlier_rejection(X, y):
    """Resampling function: keep only the rows an isolation forest accepts.

    An ``IsolationForest`` is fitted on ``X``; the rows it predicts as ``1``
    are returned together with the matching entries of ``y``.
    """
    forest = IsolationForest(
        max_samples=100,
        contamination=0.4,
        random_state=rng,
    )
    # ``fit`` returns the estimator itself, so the calls can be chained.
    inlier_mask = forest.fit(X).predict(X) == 1
    return X[inlier_mask], y[inlier_mask]


# Apply the rejection function to the training set and plot what it keeps.
reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before training, the classifier will be less
# affected by them at prediction time.

# The sampler is the first pipeline step, followed by the classifier.
pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))
Esempio n. 23
0
    del a, b
    print('Debug mode on')

# Inspect data: percentage of null entries per column of X_test, collected in
# a frame sorted by that percentage.
percent_missing = X_test.isnull().sum() * 100 / len(X_test)
missing_value_df = pd.DataFrame({'column_name': X_test.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
# X_train is missing 3%-10% of the values
# NOTE(review): the figure above is quoted for X_train, but the code inspects
# X_test -- confirm which set it describes.

# Create pipeline
pipe = Pipeline([
    # the 'scale' stage below is a 'passthrough' placeholder; it is populated
    # by the param_grid
    ('impute', SimpleImputer()),
    ('remove const cloumns', VarianceThreshold(threshold=0)),
    # two FunctionSampler steps applied one after the other
    ('outlier1', FunctionSampler(func=isof)),
    ('outlier2', FunctionSampler(func=lof)),
    ('scale', 'passthrough'),
    ('selection', GenericUnivariateSelect()),  # Known bug: https://github.com/scikit-learn/scikit-learn/issues/15672
    ('estimation', SVR())
])


# Specify parameters to be searched over
param_grid = [
    # Feature selection: SelectKBest
    {
        'scale': [RobustScaler(), StandardScaler()],
        'impute__strategy': ['median'],
        'selection__mode': ['k_best'],
        'selection__param': [85, 90, 95, 100],
Esempio n. 24
0
def predefined_ops():
    '''Return a dict of user defined, non-default operator instances.

    The operators are grouped by role -- cleaning, encoding, resampling,
    scaling, feature construction, model-based and univariate feature
    selection, and importance estimators -- and then merged into one flat
    mapping.

    return
    ----
    dict
        maps a short string key (e.g. 'woe8', 'stdscale') to a newly
        constructed operator instance; every call builds new instances
    '''
    clean = {
        'clean':
        Cleaner(dtype_filter='not_datetime',
                na1='null',
                na2='mean',
                drop_uid=True),
        'cleanNA':
        Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Cleaner(dtype_filter='not_datetime', na1='most_frequent', na2='mean'),
        'cleanMn':
        Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    #
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),

        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }

    # FunctionSampler arguments are passed by keyword: recent
    # imbalanced-learn releases no longer accept ``func`` positionally.
    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest':
        FunctionSampler(func=_outlier_rejection,
                        kw_args={
                            'method': 'IsolationForest',
                            'contamination': 0.1
                        }),
        'inlierLocal':
        FunctionSampler(func=_outlier_rejection,
                        kw_args={
                            'method': 'LocalOutlierFactor',
                            'contamination': 0.1
                        }),
        'inlierEllip':
        FunctionSampler(func=_outlier_rejection,
                        kw_args={
                            'method': 'EllipticEnvelope',
                            'contamination': 0.1
                        }),
        'inlierOsvm':
        FunctionSampler(func=_outlier_rejection,
                        kw_args={
                            'method': 'OneClassSVM',
                            'contamination': 0.1
                        }),
    }

    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga',
                                           C=1e-2)),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb':
        SelectFromModel(
            XGBClassifier(n_jobs=-1,
                          booster='gbtree',
                          max_depth=2,
                          n_estimators=50), ),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),

        # fixed number of features
        'fxgb20':
        SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'),
                        max_features=20),
        'frf20':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=20),
        'frf10':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=10),
        'fRFElog':
        RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2), step=0.1),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }
    # Univariate feature selection; ``mode`` and ``param`` are passed by
    # keyword because recent scikit-learn releases make them keyword-only.
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(score_func=chi2, mode='percentile', param=25),
        'fMutualclf':
        GenericUnivariateSelect(score_func=mutual_info_classif,
                                mode='percentile',
                                param=25),
        'fFclf':
        GenericUnivariateSelect(score_func=f_classif,
                                mode='percentile',
                                param=25),
    }

    imp = {
        "impXGB":
        XGBClassifier(n_jobs=-1,
                      booster='gbtree',
                      max_depth=2,
                      n_estimators=50),
        "impRF":
        ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }

    # Flatten all groups into one mapping; keys are unique across the groups.
    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
Esempio n. 25
0
def pipe_main(pipe=None):
    '''Build a pipeline of sklearn-style estimators from a string spec.

    The final step currently supports only classifiers.

    .. note::
        data is intended to flow through a pipeline consisting of steps as
        below:
            raw data --> clean --> encoding --> scaling --> feature construction
            --> feature selection --> resampling --> final estimator
            see scikit-learn preprocess & estimators
        The steps are assembled in the order they appear in ``pipe``.

    parameter
    ----
    pipe - str
        - step keys joined by '_' (format 'xx_xx', each 'xx' being one step
          of the pipeline), default None

    return
    ----
        1) pipeline instance of chosen steps
        2) if pipe is None, a dict indicating possible choice of 'steps'

    raise
    ----
    KeyError
        if a key in ``pipe`` does not match any known step
    ValueError
        if ``pipe`` is neither None nor a str
    '''
    clean = {
        'clean':
        Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA':
        Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Split_cls(dtype_filter='not_datetime', na1='most_frequent',
                  na2='mean'),
    }
    #
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }

    # FunctionSampler arguments are passed by keyword: recent
    # imbalanced-learn releases no longer accept ``func`` positionally.
    resample = {

        # over_sampling
        'rover':
        RandomOverSampler(),
        'smote':
        SMOTE(),
        'bsmote':
        BorderlineSMOTE(),
        'adasyn':
        ADASYN(),

        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),

        # under sampling cleaning methods
        'tlinks':
        TomekLinks(n_jobs=-1),
        'oside':
        OneSidedSelection(n_jobs=-1),
        'cleanNN':
        NeighbourhoodCleaningRule(n_jobs=-1),
        'enn':
        EditedNearestNeighbours(n_jobs=-1),
        'ann':
        AllKNN(n_jobs=-1),
        'cnn':
        CondensedNearestNeighbour(n_jobs=-1),

        # clean outliers
        'inlierForest':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'IsolationForest'}),
        'inlierLocal':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'OneClassSVM'}),
        # combine
        'smoteenn':
        SMOTEENN(),
        'smotelink':
        SMOTETomek(),
    }

    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'qauntile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        # NOTE(review): ``normalize_components`` appears to have been
        # deprecated in later scikit-learn releases -- confirm against the
        # version this project pins.
        'spca': SparsePCA(normalize_components=True, n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(
            LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc')),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm':
        SelectFromModel(LinearSVC(penalty='l1', dual=False, C=1e-2)),
        'fxgb':
        SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5)),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1), step=0.1, n_features_to_select=20),
        'fRFErf':
        RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
            step=0.3,
            n_features_to_select=20),
        'fRFElog':
        RFE(LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc'),
            step=0.3,
            n_features_to_select=20)
    }
    # Univariate feature selection; ``mode`` and ``param`` are passed by
    # keyword because recent scikit-learn releases make them keyword-only.
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(score_func=chi2, mode='percentile', param=25),
        'fMutualclf':
        GenericUnivariateSelect(score_func=mutual_info_classif,
                                mode='percentile',
                                param=25),
        'fFclf':
        GenericUnivariateSelect(score_func=f_classif,
                                mode='percentile',
                                param=25),
    }
    # sklearn estimator: instantiate every classifier that can be built with
    # no arguments; the ones that cannot are deliberately left out.
    estimator = {}
    for est_name, est_class in all_estimators(type_filter=['classifier']):
        try:
            estimator[est_name] = est_class()
        except Exception:
            continue

    # Add extra classifiers and override selected defaults with tuned ones.
    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }
    elif isinstance(pipe, str):
        all_keys_dict = {}
        all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                             **feature_m, **feature_u, **estimator, **resample)
        steps = []
        # Each '_'-separated token names one step; the step name in the
        # resulting Pipeline is the token itself.
        for key in pipe.split('_'):
            step = all_keys_dict.get(key)
            if step is None:
                raise KeyError(
                    "'{}' invalid key for sklearn estimators".format(key))
            steps.append((key, step))
        return Pipeline(steps)

    else:
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")
#print(train_reviews.shape,train_sentiments.shape)
#print(test_reviews.shape,test_sentiments.shape)
#print(blind_reviews.shape,blind_sentiments.shape)

#OVERSAMPLING of Data
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn import FunctionSampler

# Hold out 30% of the reviews for testing (fixed seed for reproducibility).
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(reviews_data['reviews_text'],
                                  reviews_data['sentiment'], test_size=0.3, random_state=0)

def resample(X, y):
    """Return ``X`` and ``y`` resampled by a default ``RandomOverSampler``."""
    return RandomOverSampler().fit_resample(X, y)

# validate=False: the sampler is built with input validation switched off.
sampler = FunctionSampler(func=resample, validate=False)
# reshape and ravel convert the pandas objects to numpy arrays (a single-column
# 2-D array for the reviews, a flat array for the labels), since
# RandomOverSampler only accepts numpy arrays
train_reviews, train_sentiments = sampler.fit_resample(train_reviews.values.reshape(-1,1), train_sentiments.ravel())

# Flatten the single-column result back to one dimension.
train_reviews = train_reviews.reshape(train_reviews.size,)

# Wrap the resampled arrays in Series again and show the class counts.
train_reviews = pd.Series(train_reviews)
train_sentiments = pd.Series(train_sentiments)
train_sentiments.value_counts()

# CountVectorizer implements both tokenization and occurrence counting in a single class. Read more here https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# You can also reuse the from scratch code we learnt in previous class
# TfidfVectorizer Convert a collection of raw documents to a matrix of TF-IDF features. Equivalent to CountVectorizer followed by TfidfTransformer.
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#Count vectorizer with 
# We will use different over-sampling approaches and use a kNN classifier
# to check if we can recognize the 2 presidents. The evaluation will be
# performed through cross-validation and we will plot the mean ROC curve.
#
# We will create different pipelines and evaluate them.

# %%
from imblearn import FunctionSampler
from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

# The same 3-nearest-neighbours classifier object is passed to every pipeline.
classifier = KNeighborsClassifier(n_neighbors=3)

# One pipeline per resampling strategy. ``FunctionSampler()`` without a
# function leaves the data unchanged, so the first entry has no over-sampling.
pipeline = [
    make_pipeline(FunctionSampler(), classifier),
    make_pipeline(RandomOverSampler(random_state=42), classifier),
    make_pipeline(ADASYN(random_state=42), classifier),
    make_pipeline(SMOTE(random_state=42), classifier),
]

# %%
from sklearn.model_selection import StratifiedKFold

# 3-fold stratified splitter
cv = StratifiedKFold(n_splits=3)

# %% [markdown]
# We will compute the mean ROC curve for each pipeline using the different
# splits provided by the :class:`~sklearn.model_selection.StratifiedKFold`
# cross-validation.
# Stack the two point sets into one test matrix; every ``moons`` row is
# labelled 1 and every ``blobs`` row is labelled 0.
X_test = np.vstack([moons, blobs])
y_test = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
                    np.zeros(blobs.shape[0], dtype=np.int8)])

plot_scatter(X_test, y_test, 'Testing dataset')


def outlier_rejection(X, y):
    """Return the rows of ``X`` and ``y`` that an isolation forest keeps.

    The forest is fitted on ``X`` itself; rows predicted as ``1`` are kept.
    """
    detector = IsolationForest(
        max_samples=100,
        contamination=0.4,
        random_state=rng,
    )
    detector.fit(X)
    keep = detector.predict(X) == 1
    return X[keep], y[keep]


# Remove the outliers from the training set and plot what is left.
# ``fit_resample`` is used instead of the deprecated ``fit_sample`` alias.
reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

# Classifier trained behind the outlier-rejection step.
pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

# Same classifier trained on the full training set, for comparison.
clf = LogisticRegression(random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

plt.show()
Esempio n. 29
0
def test_function_sampler_identity(X, y):
    """A ``FunctionSampler`` built without ``func`` returns its input as is."""
    sampler = FunctionSampler()
    # ``fit_resample`` replaces the deprecated ``fit_sample`` alias.
    X_res, y_res = sampler.fit_resample(X, y)
    assert_allclose_dense_sparse(X_res, X)
    assert_array_equal(y_res, y)
Esempio n. 30
0
File: main.py Progetto: daniekie/aml
# Specify parameters to be searched over
<<<<<<< HEAD
param_grid = [
    # {
    #     'outlier': [FunctionSampler(func=isof)],  #, FunctionSampler(func=lof)
    #     'sample': [RandomUnderSampler()],  #RandomOverSampler(), , SMOTEENN(), SMOTETomek()
    #     'selection__mode': ['fwe'], #'fpr', 'fdr',
    #     'selection__param': [0.1],
    #     'estimation': [SVC(class_weight='balanced', decision_function_shape='ovo')],
    #     'estimation__kernel': ['rbf'],
    #     'estimation__C': np.logspace(0, 2, num=4),
    #     'estimation__gamma':  ['auto']
    # } #,
    {
        'outlier': [FunctionSampler(func=isof)],  # , FunctionSampler(func=lof)
        'sample': [None, RandomUnderSampler(), RandomOverSampler(), SMOTEENN(), SMOTETomek(), SMOTE(), ADASYN()],  #
        'selection': [None],
        'estimation': [SVC(class_weight='balanced', decision_function_shape='ovo')],
        'estimation__kernel': ['rbf'],
        'estimation__C': np.linspace(1.4, 1.8, num=5),
        'estimation__gamma': ['auto']
    },{
        'outlier': [FunctionSampler(func=isof)],  # , FunctionSampler(func=lof)
        'sample': [None, RandomUnderSampler(), RandomOverSampler(), SMOTEENN(), SMOTETomek(), SMOTE(), ADASYN()],  #
        'selection__mode': ['fwe'],
        'selection__param': [0.1],
        'estimation': [SVC(class_weight='balanced', decision_function_shape='ovo')],
        'estimation__kernel': ['rbf'],
        'estimation__C': np.linspace(1.4, 1.8, num=5),
        'estimation__gamma': ['auto']
Esempio n. 31
0
# Prototype generation: under-sampling by generating new samples
# --------------------------------------------------------------
#
# :class:`~imblearn.under_sampling.ClusterCentroids` under-samples by replacing
# the original samples by the centroids of the cluster found.

# %%
import matplotlib.pyplot as plt
from imblearn import FunctionSampler
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import ClusterCentroids

X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8)

# Kept in a list rather than a set: a set of estimator objects has no defined
# iteration order, so the rows of the figure could swap between runs.
samplers = [
    FunctionSampler(),  # identity resampler
    ClusterCentroids(random_state=0),
]

# One row of axes per sampler: the decision function goes in the first column
# and the resampled data in the second.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])

fig.tight_layout()

# %% [markdown]
# Prototype selection: under-sampling by selecting existing samples