def test_categorical_smote_k_custom_nn(categorical_data):
    X, y = categorical_data
    smote = SMOTEN(k_neighbors=_CustomNearestNeighbors(n_neighbors=5))
    X_res, y_res = smote.fit_resample(X, y)

    assert X_res.shape == (80, 3)
    assert Counter(y_res) == {"apple": 40, "not apple": 40}
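
For context, the `_CustomNearestNeighbors` object above stands in for any estimator exposing `fit` and `kneighbors`, which is the interface the `k_neighbors` parameter accepts in place of an integer. A minimal sketch of such an estimator, assuming it simply delegates to scikit-learn (the real helper is defined in imbalanced-learn's test suite and may differ):

from sklearn.base import BaseEstimator
from sklearn.neighbors import NearestNeighbors

class _CustomNearestNeighbors(BaseEstimator):
    """Hypothetical stand-in that wraps scikit-learn's NearestNeighbors."""

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y=None):
        self.nn_ = NearestNeighbors(n_neighbors=self.n_neighbors).fit(X)
        return self

    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        # The SMOTE family only needs this query method.
        return self.nn_.kneighbors(X, n_neighbors=n_neighbors,
                                   return_distance=return_distance)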
Example #2
def test_smoten(data):
    # overall check for SMOTEN
    X, y = data
    sampler = SMOTEN(random_state=0)
    X_res, y_res = sampler.fit_resample(X, y)

    assert X_res.shape == (80, 3)
    assert y_res.shape == (80, )
Example #3
def test_smoten_resampling():
    # check that SMOTEN resamples the data as expected
    # we generate data such that "not apple" will be the minority class and
    # samples from this class will be generated. We force the "blue"
    # category to be associated with this class, so the newly generated
    # samples should also be from the "blue" category.
    X = np.array(["green"] * 5 + ["red"] * 10 + ["blue"] * 7,
                 dtype=object).reshape(-1, 1)
    y = np.array(
        ["apple"] * 5 + ["not apple"] * 3 + ["apple"] * 7 + ["not apple"] * 5 +
        ["apple"] * 2,
        dtype=object,
    )
    sampler = SMOTEN(random_state=0)
    X_res, y_res = sampler.fit_resample(X, y)

    X_generated, y_generated = X_res[X.shape[0]:], y_res[X.shape[0]:]
    np.testing.assert_array_equal(X_generated, "blue")
    np.testing.assert_array_equal(y_generated, "not apple")
Example #4
def custom_smote(self, X_train, y_train, algo, sampling_strategy, *cat):
    # Map the given categorical column names to their positional indices,
    # as required by SMOTENC's `categorical_features` parameter.
    col_to_idx = dict(zip(list(X_train.columns), np.arange(X_train.shape[1])))
    cat = list(pd.Series(cat).map(col_to_idx))
    if algo == "smotenc":
        over_sample = SMOTENC(categorical_features=cat,
                              sampling_strategy=sampling_strategy)
    elif algo == "smote":
        over_sample = SMOTE(sampling_strategy=sampling_strategy)
    elif algo == "smoten":
        over_sample = SMOTEN(sampling_strategy=sampling_strategy)
    else:
        raise ValueError("algo must be one of 'smotenc', 'smote' or 'smoten'")

    X_train.index = np.arange(X_train.shape[0])
    X_smote, y_smote = over_sample.fit_resample(X_train, y_train)

    return X_smote, y_smote
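
A hypothetical call of this helper, assuming it lives on some preprocessing class and that `X_train` contains categorical columns named 'color' and 'size' (both names are illustrative, not from the source):

# 'color' and 'size' are placeholder column names passed through *cat;
# they are mapped to positional indices for SMOTENC inside the method.
X_bal, y_bal = preprocessor.custom_smote(
    X_train, y_train, "smotenc", "auto", "color", "size")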
Example #5
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
print("Dataset after resampling:")
print(sorted(Counter(y_resampled).items()))
print()
print("SMOTE-NC will generate categories for the categorical features:")
print(X_resampled[-5:])
print()

# %% [markdown]
# However, if the dataset is composed only of categorical features, then one
# should use :class:`~imblearn.over_sampling.SMOTEN`.

# %%
from imblearn.over_sampling import SMOTEN

# Generate only categorical data
X = np.array(["A"] * 10 + ["B"] * 20 + ["C"] * 30, dtype=object).reshape(-1, 1)
y = np.array([0] * 20 + [1] * 40, dtype=np.int32)

print(f"Original class counts: {Counter(y)}")
print()
print(X[:5])
print()

sampler = SMOTEN(random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
print(f"Class counts after resampling {Counter(y_res)}")
print()
print(X_res[-5:])
print()
Example #6
def rebalance_training_data(X: pd.DataFrame, y: pd.Series,
                            target: str) -> Tuple[pd.DataFrame, pd.Series]:
    # Uses the median class count as the target number of training rows per class
    from collections import Counter
    prev_nrows = len(X)
    prev_stdv = compute_class_nrow_stdv(y, is_discrete=True)
    hist = dict(Counter(y).items())  # type: ignore
    median = int(np.median([count for key, count in hist.items()]))

    def _split_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
        X = df[df.columns[df.columns != target]]  # type: ignore
        y = df[target]
        return X, y

    # Set aside rows containing NaN values; over-sampling runs only on the clean rows
    X[target] = y
    X_notna, y_notna = _split_data(X.dropna())
    X_na, y_na = _split_data(X[X.isnull().any(axis=1)])

    # Over-sample classes whose row count is smaller than the median
    hist_na = dict(Counter(y_na).items())  # type: ignore
    smote_targets = []
    kn = 5  # `k_neighbors` default value in `SMOTEN`
    for key, count in hist.items():
        if count < median:
            nna = hist_na[key] if key in hist_na else 0
            if count - nna > kn:
                smote_targets.append((key, median - nna))
            else:
                _logger.warning(
                    f"Over-sampling of '{key}' in y='{target}' failed because the number of "
                    f"clean rows is too small: {count - nna}")

    if len(smote_targets) > 0:
        from imblearn.over_sampling import SMOTEN
        sampler = SMOTEN(random_state=42,
                         sampling_strategy=dict(smote_targets),
                         k_neighbors=kn)
        X_notna, y_notna = sampler.fit_resample(X_notna, y_notna)

    X = pd.concat([X_notna, X_na])
    y = pd.concat([y_notna, y_na])

    # Under-sample classes whose row count is greater than the median
    rus_targets = list(
        map(lambda x: (x[0], median),
            filter(lambda x: x[1] > median, hist.items())))
    if len(rus_targets) > 0:
        # NOTE: Smarter implementations could skew the sample if many rows contain
        # NaN values, so we just use `RandomUnderSampler` here.
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=42,
                                     sampling_strategy=dict(rus_targets))
        X, y = sampler.fit_resample(X, y)

    _logger.info(
        "Rebalanced training data (y={}, median={}): #rows={}(stdv={}) -> #rows={}(stdv={})"
        .format(target, median, prev_nrows, prev_stdv, len(X),
                compute_class_nrow_stdv(y, is_discrete=True)))
    _logger.debug("class hist: {} => {}".format(hist.items(),
                                                Counter(y).items()))
    return X, y
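
A sketch of calling this rebalancer, assuming categorical features and a placeholder target column named 'label' (the data is illustrative; `_logger` and `compute_class_nrow_stdv` are expected to come from the surrounding module):

import numpy as np
import pandas as pd

# Hypothetical data; 'label' is a placeholder target-column name.
rng = np.random.RandomState(0)
df = pd.DataFrame({
    "color": rng.choice(["red", "green", "blue"], size=100),
    "size": rng.choice(["S", "M", "L"], size=100),
    "label": ["a"] * 70 + ["b"] * 20 + ["c"] * 10,
})
X, y = df.drop(columns=["label"]), df["label"]
# Class 'c' (10 rows) is over-sampled up to the median (20) and
# class 'a' (70 rows) is under-sampled down to it.
X_bal, y_bal = rebalance_training_data(X, y, target="label")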
Example #7
plt.figure(figsize=(20, 5))
plt.title('Imbalanced classes of the Churn variable', size=15)
sns.countplot(x='Churn', data=churn2)
plt.xlabel('Classes', size=15)
plt.ylabel('');

"""Rebalancing the classes with several data resampling algorithms. Here I will apply Oversampling models, Undersampling models, and both techniques combined."""

# Oversampling algorithms
X1, y1 = SMOTE().fit_resample(X, y)
X2, y2 = ADASYN().fit_resample(X, y)
X3, y3 = BorderlineSMOTE().fit_resample(X, y)
X4, y4 = SVMSMOTE().fit_resample(X, y)
X5, y5 = KMeansSMOTE().fit_resample(X, y)
X6, y6 = SMOTEN().fit_resample(X, y)
# X7, y7 = SMOTENC().fit_resample(X, y)  # SMOTENC requires `categorical_features`
X8, y8 = RandomOverSampler().fit_resample(X, y)

# Undersampling algorithms
X9, y9 = RandomUnderSampler().fit_resample(X, y)
X10, y10 = NearMiss().fit_resample(X, y)
X11, y11 = EditedNearestNeighbours().fit_resample(X, y)
X12, y12 = RepeatedEditedNearestNeighbours().fit_resample(X, y)
X13, y13 = AllKNN().fit_resample(X, y)
# X14, y14 = CondensedNearestNeighbour().fit_resample(X, y)
X15, y15 = OneSidedSelection().fit_resample(X, y)
X16, y16 = NeighbourhoodCleaningRule().fit_resample(X, y)
X17, y17 = InstanceHardnessThreshold().fit_resample(X, y)

#Técnicas combinadas
Example #8
@pytest.mark.parametrize(
    "smote",
    [BorderlineSMOTE(random_state=0),
     SVMSMOTE(random_state=0)],
    ids=["borderline", "svm"],
)
def test_numerical_smote_extra_custom_nn(numerical_data, smote):
    X, y = numerical_data
    smote.set_params(m_neighbors=_CustomNearestNeighbors(n_neighbors=5))
    X_res, y_res = smote.fit_resample(X, y)

    assert X_res.shape == (120, 2)
    assert Counter(y_res) == {0: 60, 1: 60}


# FIXME: to be removed in 0.12
@pytest.mark.parametrize(
    "sampler",
    [
        ADASYN(random_state=0),
        BorderlineSMOTE(random_state=0),
        SMOTE(random_state=0),
        SMOTEN(random_state=0),
        SMOTENC([0], random_state=0),
        SVMSMOTE(random_state=0),
    ],
)
def test_n_jobs_deprecation_warning(numerical_data, sampler):
    X, y = numerical_data
    sampler.set_params(n_jobs=2)
    warning_msg = "The parameter `n_jobs` has been deprecated"
    with pytest.warns(FutureWarning, match=warning_msg):
        sampler.fit_resample(X, y)
Example #9
# sns.heatmap(cm, annot=True)
# plt.title('Confusion matrix of the Isolation Forest classifier')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.savefig('./output/Random_Forest.png')
# plt.show()

# Oversampling with SMOTEN (a variant of SMOTE for categorical features, based on the Value Difference Metric)
print("Oversampling...")
counter = Counter(y_train_full)
print("Before oversampling, the class distribution is:")
print(counter)
class_dist = y_train_full.value_counts()
# Bring the minority class (1) up to one fifth of the majority class (0)
desired_ratio = {0: class_dist[0], 1: class_dist[0] // 5}
oversample_smoten = SMOTEN(sampling_strategy=desired_ratio,
                           random_state=seed,
                           n_jobs=-1)
X_train_full_fs, y_train_full = oversample_smoten.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After oversampling, the class distribution is:")
print(counter)
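
For reference, the Value Difference Metric mentioned in the comment above measures the distance between two values of a categorical feature through their per-class conditional probabilities. A minimal sketch of the idea (imbalanced-learn's actual implementation is `imblearn.metrics.pairwise.ValueDifferenceMetric`; this toy version is only for intuition):

from collections import Counter

def vdm_delta(values, labels, v1, v2, k=1):
    """delta(v1, v2) = sum over classes c of |P(c|v1) - P(c|v2)| ** k."""
    classes = set(labels)
    n1 = Counter(c for v, c in zip(values, labels) if v == v1)
    n2 = Counter(c for v, c in zip(values, labels) if v == v2)
    t1, t2 = sum(n1.values()), sum(n2.values())
    return sum(abs(n1[c] / t1 - n2[c] / t2) ** k for c in classes)

# Toy check: "green" co-occurs almost only with class 0, "blue" mostly
# with class 1, so the two category values are far apart under VDM.
values = ["green"] * 5 + ["blue"] * 5
labels = [0] * 5 + [1] * 4 + [0]
print(vdm_delta(values, labels, "green", "blue"))  # 1.6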

# Undersample with One-Sided Selection (Tomek Links + Condensed Nearest Neighbor)
print("Undersampling...")
# n_seeds_S is the number of majority-class samples seeded into set C, which then
# serves as the reference for a kNN over the remaining majority samples not in C
undersample_oss = OneSidedSelection(n_neighbors=1,
                                    n_seeds_S=counter[1],
                                    n_jobs=-1,
                                    random_state=seed)
X_train_full_fs, y_train_full = undersample_oss.fit_resample(
    X_train_full_fs, y_train_full)