Example #1
def test_enn_not_good_object():
    nn = "rnd"
    enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
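    # the misspelling "interger" below matches the library's actual error text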
    err_msg = (
        "n_neighbors must be an interger or an object compatible with the "
        "KNeighborsMixin API of scikit-learn"
    )
    with pytest.raises(ValueError, match=err_msg):
        enn.fit_resample(X, Y)
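For contrast, a minimal sketch of the two accepted forms of `n_neighbors` (an integer, or an estimator implementing scikit-learn's KNeighborsMixin API); the toy data here is an assumption, not the module-level X and Y used by the test:

from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from imblearn.under_sampling import EditedNearestNeighbours

X_toy, y_toy = make_classification(n_samples=200, weights=[0.7, 0.3], random_state=0)
# an integer is wrapped into a NearestNeighbors object internally
enn_int = EditedNearestNeighbours(n_neighbors=3)
# a KNeighborsMixin estimator is also accepted (see Example #5)
enn_obj = EditedNearestNeighbours(n_neighbors=NearestNeighbors(n_neighbors=4))
X_res, y_res = enn_obj.fit_resample(X_toy, y_toy)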
Example #2
def edited_KNN(X, Y):
    import numpy as np
    from imblearn.under_sampling import EditedNearestNeighbours

    enn = EditedNearestNeighbours()
    enn.fit_resample(X, Y)
    indexes = enn.sample_indices_
    # 0/1 mask over the original samples: 1 where ENN kept the sample
    mask = np.zeros(len(Y), dtype=int)
    mask[indexes] = 1
    return True, mask
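A hypothetical call of the helper above, assuming a small synthetic dataset; the mask marks which original rows survived the cleaning:

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=100, weights=[0.8, 0.2], random_state=1)
ok, mask = edited_KNN(X_demo, y_demo)
print(int(mask.sum()), "of", len(y_demo), "samples kept by ENN")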
Example #3
def edited_KNN(X, Y):
    import numpy as np
    from imblearn.under_sampling import EditedNearestNeighbours

    enn = EditedNearestNeighbours()
    enn.fit_resample(X, Y)
    kept = set(enn.sample_indices_)
    # 1 if the i-th original sample survived ENN cleaning, else 0
    mask = [1 if i in kept else 0 for i in range(len(X))]
    return True, np.asarray(mask)
Example #4
    def fit_resample(self, X, y):
        n_features = X.shape[1]
        continuous_features = np.setdiff1d(np.arange(n_features),
                                           self.categorical_features)
        X_continuous = X[:, continuous_features]
        X_continuous = check_array(X_continuous, accept_sparse=['csr', 'csc'])
        X_categorical = X[:, self.categorical_features]

        if X_continuous.dtype.name != 'object':
            dtype_ohe = X_continuous.dtype
        else:
            dtype_ohe = np.float64

        ohe = OneHotEncoder(sparse=True,
                            handle_unknown='ignore',
                            dtype=dtype_ohe)
        X_ohe = ohe.fit_transform(
            X_categorical.toarray() if sparse.issparse(X_categorical)
            else X_categorical)
        X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr')

        enn_balancer = EditedNearestNeighbours(sampling_strategy='all')
        X_resampled, y_resampled = enn_balancer.fit_resample(X_encoded, y)
        selected_indices = enn_balancer.sample_indices_
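        # return the kept rows in the original (unencoded) feature space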
        X_resampled = X[selected_indices, :]

        return X_resampled, y_resampled
Example #5
def test_enn_fit_resample_with_nn_object():
    nn = NearestNeighbors(n_neighbors=4)
    enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
    X_resampled, y_resampled = enn.fit_resample(X, Y)

    X_gt = np.array(
        [
            [-0.10903849, -0.12085181],
            [0.01936241, 0.17799828],
            [2.59928271, 0.93323465],
            [1.42772181, 0.526027],
            [1.92365863, 0.82718767],
            [0.25738379, 0.95564169],
            [-0.284881, -0.62730973],
            [0.57062627, 1.19528323],
            [0.78318102, 2.59153329],
            [0.35831463, 1.33483198],
            [-0.14313184, -1.0412815],
            [-0.09816301, -0.74662486],
            [0.52726792, -0.38735648],
            [0.2821046, -0.07862747],
        ]
    )
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #6
def get_resample(X, Y):
    print("Starting to rebalance the imbalanced data.....")
    # sme = SMOTEENN()
    enn = EditedNearestNeighbours()
    X_res, Y_res = enn.fit_resample(X, Y)
    print("Rebalancing finished")
    return (X_res, Y_res)
Example #7
def test_enn_check_kind_selection():
    """Check that `kind_sel="all"` is more conservative (removes more
    samples) than `kind_sel="mode"`."""

    X, y = make_classification(
        n_samples=1000,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )

    enn_all = EditedNearestNeighbours(kind_sel="all")
    enn_mode = EditedNearestNeighbours(kind_sel="mode")

    enn_all.fit_resample(X, y)
    enn_mode.fit_resample(X, y)

    assert enn_all.sample_indices_.size < enn_mode.sample_indices_.size
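A quick way to see the same effect interactively (a sketch; the exact counts depend on the data):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import EditedNearestNeighbours

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.3, 0.7], random_state=0)
for kind in ("all", "mode"):
    _, y_res = EditedNearestNeighbours(kind_sel=kind).fit_resample(X_demo, y_demo)
    print(kind, Counter(y_res))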
Example #8
def test_enn_fit_resample():
    enn = EditedNearestNeighbours()
    X_resampled, y_resampled = enn.fit_resample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [2.59928271, 0.93323465], [1.92365863, 0.82718767],
                     [0.25738379, 0.95564169], [0.78318102, 2.59153329],
                     [0.52726792, -0.38735648]])
    y_gt = np.array([0, 0, 1, 1, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #9
def load_from_csv(input_dir: str,
                  counts_file: str = "normalized_counts.csv.gz",
                  n_jobs=1,
                  low_expression=0.1) -> (AnnData, AnnData, AnnData):
    u"""
    Load data from csv files.
    :param input_dir: directory containing the csv files
    :param counts_file: file name of the normalized counts matrix
    :param n_jobs: number of parallel jobs for ENN/RENN
    :param low_expression: genes whose mean expression falls below this threshold are dropped
    :return: (raw data, ENN-cleaned data, RENN-cleaned data)
    """
    logger.info("Reading {0}".format(input_dir))

    input_file = os.path.join(input_dir, counts_file)

    # if not os.path.exists(input_file):
    #     input_file += ".gz"

    mtx = pd.read_csv(input_file, index_col=0)
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"), index_col=0)
    meta = meta.loc[meta.index, :]

    logger.info(mtx.shape)
    # filter low expressed genes
    genes_sum = [x / mtx.shape[1] > low_expression for x in mtx.sum(axis=1)]

    mtx = mtx.loc[genes_sum, :]

    logger.info(mtx.shape)
    mtx = mtx.transpose()

    data = AnnData(mtx, obs=meta)
    data.obs = meta

    logger.info("Perform ENN")
    enn = EditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)

    mtx_enn, group_enn, idx_enn = enn.fit_resample(mtx, meta["Stage"])

    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])

    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Perform RENN")
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)

    mtx_renn, group_renn, idx_renn = renn.fit_resample(mtx, meta["Stage"])

    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])

    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
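Note: `return_indices=True` (used above) was deprecated in imbalanced-learn 0.4 and removed in 0.6, where `fit_resample` returns only two values; on newer releases, read the kept indices from `enn.sample_indices_` instead, as in Examples #2 and #3.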
Example #10
def undersample_ENN(df, debug=True):
    X = df.values[:, :-1]
    y = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(y))
    enn = EditedNearestNeighbours(sampling_strategy="auto")
    X_res, y_res = enn.fit_resample(X, y)
    df_resampled = pd.DataFrame(X_res, columns=df.columns[:-1])
    df_resampled.insert(len(df_resampled.columns), df.columns[-1], y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
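A hypothetical usage of undersample_ENN, assuming the label sits in the last DataFrame column as the function expects:

import pandas as pd
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
df_demo = pd.DataFrame(X_demo)
df_demo["label"] = y_demo
df_balanced = undersample_ENN(df_demo)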
Example #11
def load_data(filename_classes,
              filename_features,
              with_enn=False,
              denoise=False):
    """
    loads data from data files
    :param filename_classes:
    :param filename_features:
    :param with_enn:
    :return: data rows, targets for training and test
    """
    print("loading classes")
    class_data = numpy.loadtxt(open(filename_classes, "rb"),
                               delimiter=";",
                               dtype=numpy.dtype('U20'),
                               skiprows=1,
                               usecols=1)

    print("loading features")
    # remove first (index) column when loading the csv
    feature_data = numpy.loadtxt(open(filename_features, "rb"),
                                 delimiter=";",
                                 dtype=int,
                                 skiprows=1,
                                 usecols=range(1, 81))

    # create set of class labels
    class_labels = list(set(class_data.tolist()))
    class_dict = dict()
    for enumerated_class_label in enumerate(class_labels):
        index, name = enumerated_class_label
        class_dict[name] = index

    # convert list of text labels to list of indices
    # this gives us the class assignments as list of indices
    class_assignments = list()
    for entry in class_data:
        class_assignments.append(class_dict[entry])

    if with_enn:
        # clean up the data to limit chances of noisy training samples
        print("ENN: cleaning up data for instance selection")
        enn = EditedNearestNeighbours(return_indices=True)
        data_resampled, target_resampled, sample_indices = enn.fit_resample(
            feature_data, class_assignments)
        if denoise:
            # do not return any data points considered noise by enn
            return data_resampled, target_resampled, class_labels, sample_indices
        else:
            return feature_data, class_assignments, class_labels, sample_indices
    else:
        return feature_data, class_assignments, class_labels, list()
Example #12
def edited_nearest_neighbour(X,
                             y,
                             visualize=False,
                             pca2d=True,
                             pca3d=True,
                             tsne=True,
                             pie_evr=True):
    enn = EditedNearestNeighbours()
    X_res, y_res = enn.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Example #13
def readFile(path,
             y_label,
             method,
             encode_features=[],
             skew_exempted=[],
             training_ratio=0.7,
             shuffle=True,
             needSkew=False,
             fea_eng=True):
    raw = pd.read_csv(path)
    n, d = raw.shape

    if (shuffle):
        raw = raw.sample(frac=1).reset_index(drop=True)  # shuffle

    if (needSkew):
        skewed = raw[raw.dtypes[raw.dtypes != "object"].index.drop(
            skew_exempted)].apply(lambda x: skew(x.dropna()))
        skewed = skewed[skewed > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])  # reduce skewness

    raw = pd.get_dummies(
        raw, columns=encode_features)  # encode categorical features
    raw = raw.fillna(raw.mean())
    # if(method=='OverSample'):
    #     ind_more=np.argmax(np.bincount(raw[y_label]))
    #     more=raw[ind]
    #     less=raw[-ind]
    #     x = [randint(0, len(less)) for a in range(0, len(more)-len(less))]
    #     raw.
    X = raw.drop(y_label, axis=1)
    y = raw[y_label]
    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    if (method == 'OverSample'):
        ada = ADASYN(random_state=42)
        X_res, y_res = ada.fit_resample(X_train, y_train)
        X_train = X_res
        y_train = y_res
    if (method == 'UnderSample'):
        # for i in []
        #model = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP
        model = EditedNearestNeighbours(random_state=42)
        X_res, y_res = model.fit_resample(X_train, y_train)
        X_train = X_res
        y_train = y_res
    # if(method=='Weights'):
    # if(fea_eng==True):
    #     # X,y=feature_eng(X,y)

    return X_train, X_test, y_train, y_test
Example #14
def under_sample_data(matrix, y_train):
    add_to_log('Under Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))
    # clean up borderline samples using TomekLinks
    tl = TomekLinks(random_state=11, sampling_strategy='majority', n_jobs=-1)
    X_res, y_res = tl.fit_resample(matrix, y_train)
    add_to_log('TomekLinks distribution %s' % Counter(y_res))

    enn = EditedNearestNeighbours(random_state=7,
                                  sampling_strategy='majority',
                                  n_jobs=-1)
    X_res, y_res = enn.fit_resample(X_res, y_res)

    add_to_log('EditedNearestNeighbours distribution %s' % Counter(y_res))
    return X_res, y_res
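The same two-step cleaning can also be written as an imblearn Pipeline (a sketch; newer imbalanced-learn releases no longer accept random_state for these cleaning samplers, so it is omitted here):

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours

cleaner = Pipeline([
    ("tomek", TomekLinks(sampling_strategy='majority', n_jobs=-1)),
    ("enn", EditedNearestNeighbours(sampling_strategy='majority', n_jobs=-1)),
])
X_res, y_res = cleaner.fit_resample(matrix, y_train)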
Example #15
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    train_data_file = os.path.join(DATA_DIR, 'train_data.{}.npy'.format(strategy))
    train_labels_file = os.path.join(DATA_DIR, 'train_labels.{}.npy'.format(strategy))
    val_data_file = os.path.join(DATA_DIR, 'val_data.{}.npy'.format(strategy))
    val_labels_file = os.path.join(DATA_DIR, 'val_labels.{}.npy'.format(strategy))

    training_files_exist = os.path.exists(train_data_file) and os.path.exists(train_labels_file)
    val_files_exist = os.path.exists(val_data_file) and os.path.exists(val_labels_file)

    if not force_reload and training_files_exist and val_files_exist:
        X_train = np.load(train_data_file)
        y_train = np.load(train_labels_file)

        X_val = np.load(val_data_file)
        y_val = np.load(val_labels_file)
    else:
        train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
        X, y = to_data_format(train_df)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)

        print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

        if strategy == 'oversampling':
            X_train, y_train = SMOTE(n_jobs=n_jobs).fit_resample(X_train, y_train)
        elif strategy == 'combine':
            smote = SMOTE(n_jobs=n_jobs)
            enn = EditedNearestNeighbours(n_jobs=n_jobs)
            X_train, y_train = SMOTEENN(smote=smote, enn=enn).fit_resample(X_train, y_train)
        elif strategy == 'undersampling':
            enn = EditedNearestNeighbours(n_jobs=n_jobs)
            X_train, y_train = enn.fit_resample(X_train, y_train)
        elif strategy == 'condensed-undersampling':
            cnn = CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3)
            X_train, y_train = cnn.fit_resample(X_train, y_train)

        print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

        np.save(train_data_file, X_train)
        np.save(train_labels_file, y_train)
        np.save(val_data_file, X_val)
        np.save(val_labels_file, y_val)

    return X_train, X_val, y_train, y_val
Example #16
def ENN_us(X_train,
           Y_train,
           seed,
           sampling_strategy,
           n_neighbors=3,
           kind_sel='all'):
    enn = EditedNearestNeighbours(random_state=seed,
                                  n_jobs=-1,
                                  n_neighbors=n_neighbors,
                                  kind_sel=kind_sel,
                                  sampling_strategy=sampling_strategy)
    print('Before ENN undersampling : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = enn.fit_resample(X_train, Y_train)
    print('After ENN undersampling : ',
          sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)

    return X_train_resampled, Y_train_resampled
Example #17
def load_from_csv(input_dir: str) -> (AnnData, AnnData, AnnData):
    u"""
    Load data from csv files.
    :param input_dir: directory containing the csv files
    :return: (raw data, ENN-cleaned data, RENN-cleaned data)
    """
    logger.info("read")
    mtx = pd.read_csv(os.path.join(input_dir, "normalized_counts.csv.gz"),
                      index_col=0,
                      engine="c")
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"),
                       index_col=0,
                       engine="c")
    meta = meta.loc[meta.index, :]

    mtx = mtx.transpose()

    data = AnnData(mtx, obs=meta)
    data.obs = meta

    logger.info("enn")
    enn = EditedNearestNeighbours(n_jobs=10, return_indices=True)

    mtx_enn, group_enn, idx_enn = enn.fit_resample(mtx, meta["Stage"])

    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])

    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Repeated enn")
    renn = RepeatedEditedNearestNeighbours(n_jobs=10, return_indices=True)

    mtx_renn, group_renn, idx_renn = renn.fit_resample(mtx, meta["Stage"])

    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])

    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
Example #18
def sampling(X_train, y_train):
    ran_over = RandomOverSampler(random_state=42)
    X_train_oversample,y_train_oversample = ran_over.fit_resample(X_train,y_train)
    ran_under = RandomUnderSampler(random_state=42)
    X_train_undersample, y_train_undersample = ran_under.fit_resample(X_train,y_train)
    tl = TomekLinks(n_jobs=6)
    X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)
    sm = SMOTE(random_state=42, n_jobs=5)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
    enn = EditedNearestNeighbours()
    X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

    print(np.unique(y_train, return_counts=True))
    print("after sampling")
    print("random over sampling")
    print(np.unique(y_train_oversample, return_counts=True))
    print("SMOTE sampling")
    print(np.unique(y_train_sm, return_counts=True))
    print("random under sampling")
    print(np.unique(y_train_undersample, return_counts=True))
    print("TomekLinks under sampling")
    print(np.unique(y_train_tl, return_counts=True))
    print("ENN under sampling")
    print(np.unique(y_train_enn, return_counts=True))
    return (X_train_oversample, y_train_oversample, X_train_undersample, y_train_undersample,
     X_train_tl, y_train_tl, X_train_sm, y_train_sm, X_train_enn, y_train_enn)
Example #19
TestSet_dev['QuantileAmt']=pd.qcut(x=TestSet_dev['TransactionAmt'], q=BinNum, labels=['Q'+str(X) for X in range(1,(BinNum+1))])


############################################################
############################################################
############################################################
############################################################
ColumnSelect=np.asarray(["C"+str(X) for X in range(1,15)])
TempTrain=TrainTransaction[ColumnSelect]
TempTrain=TempTrain.join([pd.get_dummies(data=TrainTransaction["ProductCD"]), pd.get_dummies(data=TrainTransaction["P_emaildomain"]), pd.get_dummies(data=TrainTransaction["QuantileAmt"])])
pd.get_dummies(data=TrainTransaction["P_emaildomain"]).shape
pd.get_dummies(data=TrainTransaction["ProductCD"]).shape
TempTrain.shape
#Undersample
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(TempTrain.iloc[1:1000], TrainTransaction['isFraud'].iloc[1:1000])
X_resampled
#Train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)

#Set up LightGBM model
LGBMModel=LGBMClassifier()
LGBMModel.fit(X_train, y_train)

#Predict on the held-out test set (predicting on TempTrain would not align with y_test)
Predictions=LGBMModel.predict(X_test)

#Metrics
print(confusion_matrix(y_test, Predictions))
print(classification_report(y_test, Predictions))
Example #20
def test_enn_not_good_object():
    nn = 'rnd'
    enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode')
    with raises(ValueError, match="has to be one of"):
        enn.fit_resample(X, Y)
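Note: this is an older variant of Example #1's test; it matches the wording of the validation error raised by earlier imbalanced-learn releases, while Example #1 matches the message used by more recent ones.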
Example #21
            X_best_train = f_classif_select.fit_transform(X_train, y_train)
            # transform (not fit_transform) the test fold to avoid leaking test labels
            X_best_test = f_classif_select.transform(X_test)

            knn.fit(X_best_train, y_train)
            y_pred = knn.predict(X_best_test)
            scores[i].append(metric(y_test, y_pred))

    for dataset_score in scores:
        print(np.mean(dataset_score))

if not SKIP_ENN:
    scores = [[] for _ in range(len(datasets))]

    for i, dataset in enumerate(datasets):
        X, y = dataset
        X, y = enn.fit_resample(X, y)
        for train_index, test_index in rskf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            f_classif_select = SelectKBest(k=K_BEST)
            X_best_train = f_classif_select.fit_transform(X_train, y_train)
            # transform (not fit_transform) the test fold to avoid leaking test labels
            X_best_test = f_classif_select.transform(X_test)

            knn.fit(X_best_train, y_train)
            y_pred = knn.predict(X_best_test)
            scores[i].append(metric(y_test, y_pred))

    for dataset_score in scores:
        print(np.mean(dataset_score))
Example #22
def test_deprecation_random_state():
    enn = EditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        enn.fit_resample(X, Y)
Example #23
def validateFitModel(X_train,
                     y_train,
                     X_test=None,
                     y_test=None,
                     cv=False,
                     target=None):
    rs = RobustScaler(quantile_range=(0.1, 0.90))
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(rs.fit_transform(X_train))
    ncr = EditedNearestNeighbours(n_neighbors=1,
                                  sampling_strategy=[7, 10],
                                  random_state=42,
                                  return_indices=True)
    _, _, indexes = ncr.fit_resample(X_train_mms, y_train)
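    # sampling len(indexes) items without replacement simply shuffles the kept indices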
    resampling_index = random.sample(range(len(indexes)), len(indexes))
    sampled_indexes = indexes[resampling_index]
    with open(os.path.join(MODELS_PATH, 'sampled_dfs_%s.bin' % target),
              'wb') as f:
        pickle.dump(sampled_indexes, f)

    model = XGBClassifier(verbosity=2,
                          n_estimators=100,
                          objective='multi:softprob',
                          learning_rate=0.125,
                          min_child_weight=1,
                          max_depth=13,
                          gamma=0.6,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=0.9,
                          reg_lambda=2,
                          scale_pos_weight=0.05)
    if cv:
        param_grid = {
            'n_estimators': [10],
            'objective': ['multi:softprob'],
            'learning_rate': [0.125],
            'min_child_weigth': [1],
            'max_depth': [13],
            'gamma': [0.6],
            'max_delta_step': [0],
            'subsample': [1],
            'colsample_bytree': [0.9],
            'reg_lambda': [2],
            'scale_pos_weight': [0.05]
        }
        validate(X_train[sampled_indexes],
                 y_train[sampled_indexes],
                 X_test,
                 y_test,
                 target=target,
                 model=model,
                 parameters=param_grid,
                 model_name='XGB')
    else:
        model.fit(X_train[sampled_indexes], y_train[sampled_indexes])
        with open(
                os.path.join(MODELS_PATH, '%s_fitted_classifier.bin' % target),
                'wb') as f:
            pickle.dump(model, f)

    return
Example #24
# ---------- ABALONE -----
# dataset = pd.read_csv('data/abalone.txt')
# X_data = dataset.iloc[:, 0:].values
# y_data = dataset.iloc[:, 8].values

print(X_data.shape)
print('-------')

# ------- CNN --------
cnn = CondensedNearestNeighbour()
X_cnn, y_cnn = cnn.fit_resample(X_data, y_data)
print(X_cnn.shape)

# ------- ENN --------
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_data, y_data)
print(X_enn.shape)

# ------- RENN --------
renn = RepeatedEditedNearestNeighbours()
X_renn, y_renn = renn.fit_resample(X_data, y_data)
print(X_renn.shape)

# ------- Tomek --------
tl = TomekLinks()
X_t, y_t = tl.fit_resample(X_data, y_data)
print(X_t.shape)

# ------- RUS --------
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_data, y_data)
Example #25
X_test = np.load('FUMSECK_L3/X_test610.npy')
y_test = np.load('FUMSECK_L3/y_test610.npy')

#========================================
# (Optional) ENN : delete dirty examples
#========================================

X_integrated = trapz(X_train, axis=1)
X_integrated = pd.DataFrame(
    X_integrated, columns=['SWS', 'FWS', 'FL Orange', 'FL Red', 'Curvature'])
y = y_train.argmax(1)

# ENN for cleaning data
enn = EditedNearestNeighbours()
X_rs, y_rs = enn.fit_resample(X_integrated, y)

X_train = X_train.take(enn.sample_indices_, axis=0)
y_train = y_train.take(enn.sample_indices_, axis=0)

#plot_2Dcyto(X_rs, y_rs, tn, 'FWS', 'FL Red')
#plot_2Dcyto(X_integrated, y, tn, 'FWS', 'FL Red')

#========================================================
# RUS: Delete random observations from majority classes
#========================================================

balancing_dict = Counter(np.argmax(y_train, axis=1))
for class_, obs_nb in balancing_dict.items():
    if obs_nb > 3000:
        balancing_dict[class_] = 3000

# cap majority classes at 3000 observations each
rus = RandomUnderSampler(sampling_strategy=balancing_dict,
                         random_state=10)

Example #26
# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Three subplots, unpack the axes array immediately
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

c0, c1 = plot_resampling(ax1, X_vis, y, 'Original set')

# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours(return_indices=True)
X_resampled, y_resampled, idx_resampled = enn.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
c3 = ax2.scatter(X_vis[idx_samples_removed, 0],
                 X_vis[idx_samples_removed, 1],
                 alpha=.2,
                 label='Removed samples',
                 c='g')
plot_resampling(ax2, X_res_vis, y_resampled, 'ENN - ' + reduction_str)

# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours(return_indices=True)
Example #27
def splitrepeat_mcn(X, y, model_list, splits, repeats, num_classes, feature_list=None, mc_strategy='ovr', test_ratio=0.25, class_labels=None,
                    stacked_model=None, imbalanced=None, categorical_features=None, over_strategy='auto', under_strategy='auto', avg_strategy='macro',
                    initial_split_seed=None, initial_split_ratio=0.5, verbose=0):
    '''
    Runs a j-split k-repeat random sub-sampling cross-validation for classification tasks

    model must be a sklearn model

    Parameters
    ----------
    X : pandas DataFrame
        Independent data
    y : pandas Series or numpy array
        Dependent data or labels
    model_list : list of scikit-learn classifiers, one per class
        Currently tested with RandomForestClassifier, GradientBoostingClassifier
    splits :  array
        Specify a list of seed values to be used. 
        TO DO: insert int to randomly assign seeds
    repeats :  array
        Specify a list of seed values to be used. 
        TO DO: insert int to randomly assign seeds
    num_classes : int
        Number of classes. If classes are not arranged in numerical format (ex: 0,1,2) then specify class_labels
    class_labels : list of strings or ints
        Set labels of classes if not numerical from 0
    test_ratio : float
        Used in sklearn.metrics.train_test_split to calculate the proportion of validation and test sets vs training data. 
        Test set is calculated first, followed by validation set, so if the same number is used for both the test set will be larger than the validation set.
    imbalanced : default=None
        'over' : utilize imblearn's SMOTE (or SMOTENC if categorical_features are defined) to oversample the train set
        'under' : utilize imblearn's EditedNearestNeighbours to undersample the train and test sets
    categorical_features : list of categorical features in data, used in SMOTENC
    avg_strategy : see 'average' in sklearn's roc_auc_score (default = 'macro')
    verbose : 0, 1, or 2
        0 : disables all output
        1 : shows split/repeat number
        2 : adds confusion_matrix
    initial_split_seed : int
        If this value is specified, data will be initially split once. Use this to match previously used train/test splits (sklearn implementation) and to ensure that training data remains
        in the training set. Data on the testing side of the split may be shuffled into the training set, but never the reverse.
        If this value is not specified, all data will be shuffled. This is useful if a holdout test set will be used for final testing (note: do not test holdout sets using splitrepeat_cv)
    initial_split_ratio : float
        If initial_split_seed is specified, this ratio will be used to split initial train/test ratios. Small train splits are preferred to enable more data to be shuffled and to reduce overfitting.
        This value replaces "train_size" in sklearn's train_test_split.
        Note that the train data from this initial split will be added to all training sets generated 
    over_strategy : see "search_strategy" from imblearn.oversampling.SMOTE
    under_strategy : see "search_strategy" from imblearn.undersampling.EditedNearestNeighbours

    Returns
    -------
    Dataframe with sensitivity, specificity, PPV, NPV, accuracy, and F1 values for each class
    '''
    df = pd.DataFrame()

    if class_labels is None:
        class_labels = list(range(num_classes)) # For multiclass ROC curve calculations (requires numerical input)

    if initial_split_seed is not None:
        _X_train, X, _y_train, y = train_test_split(X, y.values.ravel(), train_size=initial_split_ratio, random_state=initial_split_seed, stratify=y)
        y = pd.Series(y)
    
    # Begin j-split k-repeat loop
    for j in splits:
        X_, X_test, y_, y_test = train_test_split(X, y.values.ravel(), test_size=test_ratio, random_state=j, stratify=y)
        if initial_split_seed is not None:
            X_ = pd.concat([X_, _X_train])  # DataFrame.append was removed in pandas 2.0
            y_ = np.append(y_,_y_train)

        if imbalanced == 'under':
            enn = EditedNearestNeighbours(sampling_strategy=under_strategy, random_state=j)
            X_,y_ = enn.fit_resample(X_,y_)
            X_test,y_test = enn.fit_resample(X_test,y_test) # Add option to call test resampling
        if imbalanced == 'over':
            if categorical_features is None:
                sm = SMOTE(random_state=j, sampling_strategy=over_strategy)
            else:
                categorical_features = np.in1d(X_.columns.values, categorical_features)
                sm = SMOTENC(categorical_features = categorical_features, sampling_strategy=over_strategy, random_state=j)
            X_, y_ = sm.fit_resample(X_,y_)

        # Run models
        for k in repeats:
            np.random.seed(k)
            y_output = pd.DataFrame()
            y_ = pd.DataFrame(y_)
            for i in class_labels:
                model = model_list[i]
                model.set_params(random_state=k)
                X_i = X_[feature_list[i]] # Select feature list
                replace_y = {class_labels[i]:1}
                replace_y.update(zip([x for x in class_labels if x!=i],[0 for x in [x for x in class_labels if x!=i]]))
                y_i = y_.replace(replace_y) # Set selected class=1, others=0 (OneVsRest)
            
                model.fit(X_i,y_i.values.ravel())
                y_pred = model.predict_proba(X_test[feature_list[i]])
                y_output['Clf'+str(i)] = y_pred[:,1]

            # Use each classifier's target class (OneVsRest) output as selected output probability, then divide by total 
            # so that probability outputs sum to 1
            for i in range(len(y_output)):                         
                y_output.iloc[i,:] = y_output.iloc[i,:].divide(y_output.sum(axis=1)[i]).to_numpy() 
            
            if stacked_model is not None:
                y_output = stacked_model.predict_proba(y_output)

            cmat = multilabel_confusion_matrix(y_test, np.argmax(y_output.to_numpy(), axis=1))
            if verbose >= 1:
                print('Split: ',j,', Repeat: ',k)
            if verbose >= 2:
                print(cmat)
            report = pd.DataFrame({'j':j,'k':k}, index=[0]) 

            for c in range(num_classes):                
                TP = cmat[c][1][1]
                FP = cmat[c][0][1]
                TN = cmat[c][0][0]
                FN = cmat[c][1][0]
                                
                report['Sensitivity'+str(c)] = (TP/(TP+FN))
                report['Specificity'+str(c)] = (TN/(FP+TN))
                report['PPV'+str(c)] = (TP/(TP+FP))
                report['NPV'+str(c)] = (TN/(TN+FN))
                report['Accuracy'+str(c)] = (TP+TN)/len(y_test)

            y_test1 = y_test.copy()
            y_test1[y_test1 > 1] = 1 #Set class 2 predictions to class 1, to enable comparison with two-class 
            y_pred = np.argmax(y_output.to_numpy(), axis=1)
            y_pred[y_pred > 1] = 1

            report['Sensitivity'] = recall_score(y_test1,y_pred, average=avg_strategy, labels=[1])
            report['Specificity'] = recall_score(y_test1,y_pred, average=avg_strategy, labels=[0])
            report['PPV'] = precision_score(y_test1,y_pred, average=avg_strategy, labels=[1])
            report['NPV'] = precision_score(y_test1,y_pred, average=avg_strategy, labels=[0])
            report['F1_Score'] = f1_score(y_test1,y_pred, average=avg_strategy)
            report['Accuracy'] = accuracy_score(y_test1, y_pred)

            report.set_index(['j','k'], inplace=True)

            df = pd.concat([df, report])  # DataFrame.append was removed in pandas 2.0

    return df
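A hypothetical invocation of splitrepeat_mcn (all names here are illustrative; feature_list must provide one feature subset per one-vs-rest classifier):

from sklearn.ensemble import RandomForestClassifier

models = [RandomForestClassifier(n_estimators=100) for _ in range(3)]
features = [X.columns.tolist()] * 3
results = splitrepeat_mcn(X, y, models, splits=[0, 1, 2], repeats=[0, 1],
                          num_classes=3, feature_list=features,
                          imbalanced='under', verbose=1)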
Example #28
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AA_ada, AI_ada, AW_ada, CC_ada, QA_ada = ADASYN(), ADASYN(), ADASYN(
        ), ADASYN(), ADASYN()
        AA_X_res, AA_y_res = AA_ada.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ada.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ada.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ada.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ada.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ALLKNN":
        AA_allknn, AI_allknn, AW_allknn, CC_allknn, QA_allknn = AllKNN(
        ), AllKNN(), AllKNN(), AllKNN(), AllKNN()
        AA_X_res, AA_y_res = AA_allknn.fit_resample(AA_ova_X_train,
                                                    AA_ova_y_train)
        AI_X_res, AI_y_res = AI_allknn.fit_resample(AI_ova_X_train,
                                                    AI_ova_y_train)
        AW_X_res, AW_y_res = AW_allknn.fit_resample(AW_ova_X_train,
                                                    AW_ova_y_train)
        CC_X_res, CC_y_res = CC_allknn.fit_resample(CC_ova_X_train,
                                                    CC_ova_y_train)
        QA_X_res, QA_y_res = QA_allknn.fit_resample(QA_ova_X_train,
                                                    QA_ova_y_train)
    elif imb_technique == "CNN":
        AA_cnn, AI_cnn, AW_cnn, CC_cnn, QA_cnn = CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour()
        AA_X_res, AA_y_res = AA_cnn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_cnn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_cnn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_cnn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_cnn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ENN":
        AA_enn, AI_enn, AW_enn, CC_enn, QA_enn = EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours()
        AA_X_res, AA_y_res = AA_enn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_enn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_enn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_enn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_enn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "IHT":
        AA_iht, AI_iht, AW_iht, CC_iht, QA_iht = InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold()
        AA_X_res, AA_y_res = AA_iht.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_iht.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_iht.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_iht.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_iht.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NCR":
        AA_ncr, AI_ncr, AW_ncr, CC_ncr, QA_ncr = NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule()
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AA_X_res, AA_y_res = AA_ncr.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AI_X_res, AI_y_res = AI_ncr.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        AW_X_res, AW_y_res = AW_ncr.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        CC_X_res, CC_y_res = CC_ncr.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1
            for i in QA_ova_y_train
        ]
        QA_X_res, QA_y_res = QA_ncr.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NM":
        AA_nm, AI_nm, AW_nm, CC_nm, QA_nm = NearMiss(), NearMiss(), NearMiss(
        ), NearMiss(), NearMiss()
        AA_X_res, AA_y_res = AA_nm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_nm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_nm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_nm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_nm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "OSS":
        AA_oss, AI_oss, AW_oss, CC_oss, QA_oss = OneSidedSelection(
        ), OneSidedSelection(), OneSidedSelection(), OneSidedSelection(
        ), OneSidedSelection()
        AA_X_res, AA_y_res = AA_oss.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_oss.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_oss.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_oss.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_oss.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RENN":
        AA_renn, AI_renn, AW_renn, CC_renn, QA_renn = RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        )
        AA_X_res, AA_y_res = AA_renn.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_renn.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_renn.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_renn.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_renn.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "SMOTE":
        AA_sm, AI_sm, AW_sm, CC_sm, QA_sm = SMOTE(), SMOTE(), SMOTE(), SMOTE(
        ), SMOTE()
        AA_X_res, AA_y_res = AA_sm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_sm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_sm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_sm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_sm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "BSMOTE":
        AA_bsm, AI_bsm, AW_bsm, CC_bsm, QA_bsm = BorderlineSMOTE(
        ), BorderlineSMOTE(), BorderlineSMOTE(), BorderlineSMOTE(
        ), BorderlineSMOTE()
        AA_X_res, AA_y_res = AA_bsm.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_bsm.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_bsm.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_bsm.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_bsm.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AA_smenn, AI_smenn, AW_smenn, CC_smenn, QA_smenn = SMOTEENN(
        ), SMOTEENN(), SMOTEENN(), SMOTEENN(), SMOTEENN()
        AA_X_res, AA_y_res = AA_smenn.fit_resample(AA_ova_X_train,
                                                   AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smenn.fit_resample(AI_ova_X_train,
                                                   AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smenn.fit_resample(AW_ova_X_train,
                                                   AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smenn.fit_resample(CC_ova_X_train,
                                                   CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smenn.fit_resample(QA_ova_X_train,
                                                   QA_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AA_smtm, AI_smtm, AW_smtm, CC_smtm, QA_smtm = SMOTETomek(), SMOTETomek(
        ), SMOTETomek(), SMOTETomek(), SMOTETomek()
        AA_X_res, AA_y_res = AA_smtm.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smtm.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smtm.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smtm.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smtm.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "TOMEK":
        AA_tm, AI_tm, AW_tm, CC_tm, QA_tm = TomekLinks(), TomekLinks(
        ), TomekLinks(), TomekLinks(), TomekLinks()
        AA_X_res, AA_y_res = AA_tm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_tm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_tm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_tm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_tm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "ROS":
        AA_ros, AI_ros, AW_ros, CC_ros, QA_ros = RandomOverSampler(
        ), RandomOverSampler(), RandomOverSampler(), RandomOverSampler(
        ), RandomOverSampler()
        AA_X_res, AA_y_res = AA_ros.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ros.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ros.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ros.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ros.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RUS":
        AA_rus, AI_rus, AW_rus, CC_rus, QA_rus = RandomUnderSampler(
        ), RandomUnderSampler(), RandomUnderSampler(), RandomUnderSampler(
        ), RandomUnderSampler()
        AA_X_res, AA_y_res = AA_rus.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_rus.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_rus.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_rus.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_rus.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
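The long elif chain above repeats the same fit_resample pattern for every technique and every one-vs-all split; a dictionary of sampler factories expresses the same dispatch more compactly (a sketch covering a subset of the techniques, and not the NCR branch, which also re-encodes the labels):

from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import (AllKNN, EditedNearestNeighbours,
                                     TomekLinks, RandomUnderSampler)

SAMPLER_FACTORIES = {
    "ADASYN": ADASYN,
    "ALLKNN": AllKNN,
    "ENN": EditedNearestNeighbours,
    "SMOTE": SMOTE,
    "TOMEK": TomekLinks,
    "ROS": RandomOverSampler,
    "RUS": RandomUnderSampler,
}

def resample_pairs(imb_technique, *xy_pairs):
    # one fresh sampler per (X, y) pair, mirroring the chain above
    factory = SAMPLER_FACTORIES[imb_technique]
    resampled = []
    for X_train, y_train in xy_pairs:
        resampled.extend(factory().fit_resample(X_train, y_train))
    return resampled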
Example #29
    folds_origin.append([X_train, X_test, y_train, y_test])

    #resampling 1: under + SMOTE, final ratio 0.5:
    under = RandomUnderSampler(sampling_strategy=0.0026)
    X_train_re, y_train_re = under.fit_resample(X_train, y_train)
    start1 = time.time()
    over = SMOTE(sampling_strategy=0.5)
    X_train_re1, y_train_re1 = over.fit_resample(X_train_re, y_train_re)
    end1 = time.time()
    exe_time1 = end1 - start1
    folds_re1.append([X_train_re1, X_test, y_train_re1, y_test])

    #resampling 2: under + SMOTE + ENN, final ratio 0.5:
    start2 = time.time()
    enn = EditedNearestNeighbours(sampling_strategy='all')
    X_train_re2, y_train_re2 = enn.fit_resample(X_train_re1, y_train_re1)
    end2 = time.time()
    exe_time2 = end2 - start2 + exe_time1
    folds_re2.append([X_train_re2, X_test, y_train_re2, y_test])

    #resampling 3: under + SMOTE + ENN, final ratio 1:
    start3 = time.time()
    over = SMOTEENN()
    X_train_re3, y_train_re3 = over.fit_resample(X_train_re, y_train_re)
    end3 = time.time()
    exe_time3 = end3 - start3
    folds_re3.append([X_train_re3, X_test, y_train_re3, y_test])

    times.append([exe_time1, exe_time2, exe_time3])

# store the folds for original and different re-samplings
Example #30
def imbalance_handler(
        mode: str,
        parent_class_label: str):
    """
    Purpose
    -------
    The purpose of this function is to provide the user a tool that
    allows them to easily manipulate their training and/or test dataset
    so that it is significantly more balanced between its classes. One
    would want to do this in order to improve the reliability of the
    classifier that will get trained on this dataset (see 1. in the
    References section for more information about this).

    **Note, however, that if a class has only 5 or fewer article
    instances that belong to it, it will be dropped completely due to
    the fact that the SMOTE and ENN algorithms used in this function
    rely on at least 6 nearest-neighbors of a class to exist. If this
    class label is particularly important and you would like to keep it
    around, then obtain more data for it.**

    Parameters
    ----------
    mode : str
        This string allows the user to specify how they would like the
        imbalance of the dataset to be handled. The available options
        include:
            1. "smote" - In this mode, the only algorithm that will be
                         implemented to make the dataset more balanced
                         is the over-sampling algorithm SMOTE. See 1., 3.,
                         4., and 5. in the References section for more
                         information about this algorithm.
            2. "enn" - In this mode, the only algorithm that will be
                       implemented to make the dataset more balanced is
                       the under-sampling algorithm Edited-Nearest Neighbors
                       (ENN). See 1. and 6. for more information about
                       this algorithm.
            3. "smote-enn" - In this mode, this function will implement
                             both the SMOTE and ENN algorithms; SMOTE
                             will oversample to make the classes balanced
                             and ENN will under-sample to remove any newly
                             generated samples in the minority class(es)
                             that are not helpful. See 1. and 7. for more
                             information about the benefits of using this
                             2-step process and for how it is implemented
                             in the imbalanced-learn module.
    parent_class_label : str
        This string represents the class label that is the Parent class
        of all of the sub-classes that will be distinguished and predicted
        by a classifier that you wish to build. I.e., if you want to build
        a classifier for the children of the "Auto Type" class (which
        includes "Budget Cars", "Concept Cars", and "Luxury Cars" to name
        a few), then you simply have to pass in the "Auto Type" string to
        this parameter.

    Returns
    -------
    to_return : (Sparse Numpy Array, Numpy Array)
        The former element represents the new feature matrix (where some
        rows correspond to the article instances that were synthetically
        generated if the user specifed for over-sampling to occur) and the
        latter element represents the new class labels. Note that the
        number of rows in both these array objects are the same since each
        row of the two correspond to the same (real or synthetic) article
        instance.

    References
    ----------
    1. https://towardsdatascience.com/guide-to-classification-on-imbalanced-datasets-d6653aa5fa23
    2. https://imbalanced-learn.readthedocs.io/en/stable/index.html
    3. https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
    4. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    5. https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
    6. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html#imblearn.under_sampling.EditedNearestNeighbours
    7. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html
    """
    # First, get values for the parameters that we will need to use for
    # the rest of the function.
    normalized_mode = "".join(mode.lower().split("-"))
    child_tier_lvl, raw_articles_df = class_data_retrival(
        parent_class_label, give_child_tier_lvl=True)

    # Before performing any transformations on our data, we need to
    # double check that it is suitable for the BOWs and balance
    # correcting model. If it is not, then perform any corrections
    # neccessary.
    child_tier_label = "Tier{}".format(child_tier_lvl)
    counts_of_classes = raw_articles_df[child_tier_label].value_counts()
    counts_checker = counts_of_classes.values < 6  # i.e. 5 or fewer articles
    num_with_less = counts_checker.sum()
    if num_with_less > 0:
        # If any of the classes that we are working with have 5 or fewer
        # articles in them. If this is the case, then we cannot use any
        # of the over/under-sampling algorithms that investigate the
        # characteristics of its 6 nearest-neighbors. Our current
        # solution is to simply drop this class from consideration.
        indices_with_less = np.argwhere(counts_checker).flatten()
        labels_with_less = counts_of_classes.index.values[indices_with_less]
        if num_with_less == 1:
            # If there is exactly 1 class labels that we are going to
            # have to remove from the DataFrame of articles.
            assert labels_with_less.size == 1
            label_to_remove = labels_with_less[0]
            articles_df = raw_articles_df[raw_articles_df[child_tier_label]
                                          != label_to_remove]
        elif num_with_less == 2:
            # If there are exactly 2 class labels that we are going to
            # have to remove from the DataFrame of articles.
            assert labels_with_less.size == 2
            conditions_to_remove = np.logical_and(
                raw_articles_df[child_tier_label] != labels_with_less[0],
                raw_articles_df[child_tier_label] != labels_with_less[1])
            articles_df = raw_articles_df[conditions_to_remove]
        else:
            # If there are 3 or more class labels that we are going to
            # have to remove from the DataFrame of articles.
            assert labels_with_less.size >= 3
            for i in range(len(labels_with_less)):
                #
                if i == 0:
                    # If we are on our first iteration. In this case, we
                    # need to instantiate the `conditions_to_remove`
                    # object with the first two labels that we want to
                    # remove.
                    conditions_to_remove = np.logical_and(
                        raw_articles_df[child_tier_label] != labels_with_less[i],
                        raw_articles_df[child_tier_label] != labels_with_less[i + 1])
                elif i > 1:
                    # If we are on either our third or further down
                    # iteration. If this is the case, then we know that
                    # the `conditions_to_remove` object has been
                    # instantiated. We just need to add on to it with
                    # the remaining labels that we would like to remove.
                    conditions_to_remove = np.logical_and(
                        conditions_to_remove, raw_articles_df[child_tier_label]
                        != labels_with_less[i])
            articles_df = raw_articles_df[conditions_to_remove]

    else:
        # All the article counts for each class pass the test :).
        articles_df = raw_articles_df
    # Next, obtain your X (features) matrix and your y (labels) vector.
    _, feature_matrix = bag_of_words_converter(mode="tfidf",
                                              parent_class_label=None,
                                              articles_df=articles_df,
                                              upper_n_gram=2,
                                              upper_features=300,
                                              apply_PCA=True)
    labels_arr = np.array(
        articles_df[child_tier_label].tolist())

    # Next, implement the algorithm the user has specified.
    if normalized_mode == "smote":
        # If the user would first like to oversample with the SMOTE
        # algorithm.
        sm_model = SMOTE(random_state=169,
                         n_jobs=3)
        final_feature_matrix, final_labels_arr = sm_model.fit_resample(
            feature_matrix, labels_arr)
    elif normalized_mode == "enn":
        # If the user would like to undersample with the Tomek links
        # algorithm
        enn_model = EditedNearestNeighbours(sampling_strategy="auto",
                                            n_jobs=3)
        final_feature_matrix, final_labels_arr = enn_model.fit_resample(
            feature_matrix, labels_arr)
    elif normalized_mode == "smoteenn":
        # If the user would first like to oversample with SMOTE and then
        # improve on that new set of samples by undersampling with the
        # ENN algorithm

        # Instantiate the smoteenn object from imblearn that first
        # performs SMOTE and then ENN.
        sm_enn_model = SMOTEENN(random_state=169,
                                n_jobs=3)

        # Fit and resample with this pipeline object.
        final_feature_matrix, final_labels_arr = sm_enn_model.fit_resample(
            feature_matrix, labels_arr)

    to_return = (final_feature_matrix, final_labels_arr)

    return to_return
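A hypothetical call, reusing the "Auto Type" parent class mentioned in the docstring:

new_features, new_labels = imbalance_handler(mode="smote-enn",
                                             parent_class_label="Auto Type")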