Example #1
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.5."""

    # Create the object
    ratio = 0.8
    smote = SMOTETomek(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.20622591, 0.0582794],
                     [0.68481731, 0.51935141],
                     [1.34192108, -0.13367336],
                     [0.62366841, -0.21312976],
                     [1.61091956, -0.40283504],
                     [-0.37162401, -2.19400981],
                     [0.74680821, 1.63827342],
                     [0.61472253, -0.82309052],
                     [0.19893132, -0.47761769],
                     [0.97407872, 0.44454207],
                     [1.40301027, -0.83648734],
                     [-1.20515198, -1.02689695],
                     [-0.23374509, 0.18370049],
                     [-0.32635887, -0.29299653],
                     [-0.00288378, 0.84259929],
                     [1.79580611, -0.02219234],
                     [0.45784496, -0.1053161]])
    y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
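The tests in these examples rely on module-level fixtures (X, Y, RND_SEED, R_TOL and the numpy.testing assert helpers) that are not shown here. A minimal sketch of such a fixture block, assuming a small imbalanced toy set built with scikit-learn; the actual test modules hard-code their own 20-sample arrays:

import numpy as np
from sklearn.datasets import make_classification
from numpy.testing import (assert_allclose, assert_array_almost_equal,
                           assert_array_equal, assert_equal, assert_raises,
                           assert_warns)

RND_SEED = 0
R_TOL = 1e-4
# Small imbalanced toy problem standing in for the hard-coded arrays of the real tests.
X, Y = make_classification(n_samples=20, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1,
                           weights=[0.4, 0.6], random_state=RND_SEED)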
Example #2
def test_validate_estimator_deprecation():
    """Test right processing while passing old parameters"""

    X_gt = np.array([[0.20622591, 0.0582794],
                     [0.68481731, 0.51935141],
                     [1.34192108, -0.13367336],
                     [0.62366841, -0.21312976],
                     [1.61091956, -0.40283504],
                     [-0.37162401, -2.19400981],
                     [0.74680821, 1.63827342],
                     [0.61472253, -0.82309052],
                     [0.19893132, -0.47761769],
                     [0.97407872, 0.44454207],
                     [1.40301027, -0.83648734],
                     [-1.20515198, -1.02689695],
                     [-0.23374509, 0.18370049],
                     [-0.32635887, -0.29299653],
                     [-0.00288378, 0.84259929],
                     [1.79580611, -0.02219234],
                     [0.38307743, -0.05670439],
                     [0.93976473, -0.06570176],
                     [0.70319159, -0.02571668],
                     [0.75052536, -0.19246517]])
    y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
                     0])

    smt = SMOTETomek(random_state=RND_SEED, n_jobs=-1)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

    smt = SMOTETomek(random_state=RND_SEED, k=5)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def outer_cv_loop(Xdata,Ydata,clf,parameters=[],
                    n_splits=10,test_size=0.25):

    pred=numpy.zeros(len(Ydata))
    importances=[]
    kf=StratifiedShuffleSplit(n_splits=n_splits,test_size=test_size)
    rocscores=[]
    for train,test in kf.split(Xdata,Ydata):
        if numpy.var(Ydata[test]) == 0:
            print('zero variance in the test fold')
            rocscores.append(numpy.nan)
            continue
        Ytrain=Ydata[train]
        Xtrain=fancyimpute.SoftImpute(verbose=False).complete(Xdata[train,:])
        Xtest=fancyimpute.SoftImpute(verbose=False).complete(Xdata[test,:])
        if numpy.abs(numpy.mean(Ytrain)-0.5)>0.2:
           smt = SMOTETomek()
           Xtrain,Ytrain=smt.fit_sample(Xtrain.copy(),Ydata[train])
        # filter out bad folds
        clf.fit(Xtrain,Ytrain)
        pred=clf.predict(Xtest)
        if numpy.var(pred)>0:
           rocscores.append(roc_auc_score(Ydata[test],pred))
        else:
           rocscores.append(numpy.nan)
        importances.append(clf.feature_importances_)
    return rocscores,importances
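A usage sketch for outer_cv_loop above, with purely illustrative data. It assumes the snippet's own module-level imports (fancyimpute, StratifiedShuffleSplit, SMOTETomek, roc_auc_score) and library versions that still expose fit_sample and SoftImpute.complete, plus a tree-based classifier because the loop reads clf.feature_importances_:

import numpy
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Illustrative inputs; a few NaNs are injected because SoftImpute fills them per fold.
Xdata, Ydata = make_classification(n_samples=200, n_features=10,
                                   weights=[0.8, 0.2], random_state=0)
rng = numpy.random.RandomState(0)
Xdata[rng.rand(*Xdata.shape) < 0.05] = numpy.nan

clf = RandomForestClassifier(n_estimators=100)  # exposes feature_importances_
rocscores, importances = outer_cv_loop(Xdata, Ydata, clf, n_splits=5, test_size=0.25)
print(numpy.nanmean(rocscores))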
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    sm = SMOTETomek(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50))
def test_smote_fit():
    """Test the fitting method"""

    # Create the object
    smote = SMOTETomek(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    # Check that the data information has been computed
    assert_equal(smote.min_c_, 0)
    assert_equal(smote.maj_c_, 1)
    assert_equal(smote.stats_c_[0], 8)
    assert_equal(smote.stats_c_[1], 12)
def test_multiclass_error():
    """ Test either if an error is raised when the target are not binary
    type. """

    # continuous case
    y = np.linspace(0, 1, 20)
    sm = SMOTETomek(random_state=RND_SEED)
    assert_warns(UserWarning, sm.fit, X, y)

    # multiclass case
    y = np.array([0] * 3 + [1] * 2 + [2] * 15)
    sm = SMOTETomek(random_state=RND_SEED)
    assert_warns(UserWarning, sm.fit, X, y)
    def prep_data(self, test_ratio, smoteenn, smotomek):
        # split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_ratio, random_state=4)
        # if smoteenn is true, use smoteenn sampling
        if smoteenn:
            sme = SMOTEENN(random_state=1)
            X_train, y_train = sme.fit_resample(X_train, y_train)
        # if smotomek is true, use smotomek sampling
        if smotomek:
            smt = SMOTETomek(random_state=1)
            X_train, y_train = smt.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test
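prep_data above is a method of a class (not shown) that keeps the full feature matrix in self.X and the labels in self.y. A standalone sketch of the same split-then-resample flow, with illustrative data:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)
# Resample only the training split so the test set keeps its original distribution.
X_train, y_train = SMOTETomek(random_state=1).fit_resample(X_train, y_train)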
Example #9
def smote_tomek(X,
                y,
                visualize=False,
                pca2d=True,
                pca3d=True,
                tsne=True,
                pie_evr=True):
    smt = SMOTETomek(random_state=42)
    X_res, y_res = smt.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def load_kinematics_and_labels(data_dir, trial_name):
    """ Load kinematics data and labels.

    Args:
        data_dir: A string.
        trial_name: A string.

    Returns:
        A 2-D NumPy array with time on the first axis. Labels are appended
        as a new column to the raw kinematics data (and are therefore
        represented as floats).
    """
    '''
    labels_dir = os.path.join(data_dir, 'transcriptions')
    labels_path = os.path.join(labels_dir, trial_name + '.txt')

    
    raw_labels_data = np.genfromtxt(labels_path, dtype=np.int,
                                    converters=LABELS_CONVERTERS,
                                    usecols=LABELS_USECOLS)
    frames = np.arange(1, kinematics_data.shape[0]+1, dtype=np.int)
    labels = np.zeros(frames.shape, dtype=np.int)
    for start, end, label in raw_labels_data:
        mask = (frames >= start) & (frames <= end)
        labels[mask] = label
    labels_data = labels.reshape(-1, 1)
    '''
    print('TRIAL NAME:', trial_name)
    kinematics_data = load_kinematics(data_dir, trial_name)
    trial_name = trial_name.replace('_capture1', '')
    trial_name = trial_name.replace('_capture2', '')
    val = df_labels.loc[df_labels['filename'].str.match(trial_name), ['label']]
    labels_data = np.array(val)

    if 'Suturing_G001' in trial_name:
        kinematics_data = downsample(kinematics_data, factor=8)
        labels_data = downsample(labels_data, factor=8)
    else:
        kinematics_data = downsample(kinematics_data)
        labels_data = downsample(labels_data)

    print(kinematics_data.shape, labels_data.shape)
    smt = SMOTETomek(sampling_strategy='auto', ratio=sample(labels_data))
    X_smt, y_smt = smt.fit_sample(kinematics_data, labels_data)
    y_smt = np.expand_dims(y_smt, axis=1)
    print('X_smt.shape:', X_smt.shape, y_smt.shape)
    data = np.concatenate([X_smt, y_smt], axis=1)
    # labeled_data_only_mask = labels_data.flatten() != 0

    return data  # [labeled_data_only_mask, :]
Example #11
def sampling(train):
  clustering = AgglomerativeClustering(n_clusters=10).fit(train.drop(columns='failure'))
  train['clusters'] = clustering.labels_
  smt = SMOTETomek(ratio='auto')

  frames = []
  for i in range(10):
    hold = train.loc[train['clusters'] == i]
    # only resample clusters holding at least 10% of all failures
    if hold['failure'].sum() >= train['failure'].sum() * .1:
      X_res, y_res = smt.fit_sample(hold.drop(columns='failure'), hold['failure'])
      resampled = pd.DataFrame(X_res, columns=hold.drop(columns='failure').columns)
      resampled['failure'] = y_res
      frames.append(resampled)

  train = pd.concat(frames)

  return train
Example #12
def test_sample_regular_half():
    ratio = 0.8
    smote = SMOTETomek(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.68481731, 0.51935141], [0.62366841, -0.21312976],
                     [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
                     [0.74680821, 1.63827342], [0.61472253, -0.82309052],
                     [0.19893132, -0.47761769], [1.40301027, -0.83648734],
                     [-1.20515198, -1.02689695], [-0.23374509, 0.18370049],
                     [-0.00288378, 0.84259929], [1.79580611, -0.02219234],
                     [0.45784496, -0.1053161]])
    y_gt = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def rm_main(data):
    column_vals = data.columns
    X = np.array(data.loc[:, data.columns != CONST_CLASS_LABEL]
                 )  # All of the features into an np array
    y = np.array(data.loc[:, data.columns ==
                          CONST_CLASS_LABEL])  # The class into an np array
    smt = SMOTETomek(random_state=2)
    # performs SMOTE and TOMEK
    # For SMOTE only.. use SMOTE(random_state=2)
    feature_train_res, class_train_res = smt.fit_sample(
        X, y.ravel())  # performs SMOTE
    full_training_set = np.column_stack((feature_train_res, class_train_res))
    df = pd.DataFrame(data=full_training_set, columns=column_vals)
    return df
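rm_main above reads the target column named by CONST_CLASS_LABEL, which is defined elsewhere in that project. A hedged calling sketch with an illustrative constant and DataFrame; it assumes an imbalanced-learn version that still provides fit_sample:

import numpy as np
import pandas as pd

CONST_CLASS_LABEL = 'Class'  # illustrative stand-in for the project constant

rng = np.random.RandomState(2)
data = pd.DataFrame(rng.normal(size=(200, 3)), columns=['a', 'b', CONST_CLASS_LABEL])
data[CONST_CLASS_LABEL] = (rng.rand(200) < 0.15).astype(int)  # imbalanced binary target
balanced = rm_main(data)
print(balanced[CONST_CLASS_LABEL].value_counts())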
def smote_tomek(X, y):
    """Balancing data using SMOTETomek

    Args:
        X: Training set without Class Target
        y: Training set Class Target

    Returns:
        balanced X, y
    """
    sample = SMOTETomek(random_state=42, sampling_strategy='all')
    X, y = sample.fit_resample(X, y)
    print('after balancing:', X.shape)
    return X, y
    def Smote_Tomek(self):
        '''
        First oversamples the minority classes using SMOTE based on the number
        of instances selected and then cleans all the data using Tomek Links.

        Returns
        -------
        None.
        '''
        X_train = self.X_train.copy()
        y_train = self.y_train.copy()
        smt = SMOTETomek(random_state=2020)
        (self.X_train_balanced,
         self.y_train_balanced) = smt.fit_resample(X_train, y_train)
def test_validate_estimator_default():
    smt = SMOTETomek(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336],
                     [0.62366841, -0.21312976], [1.61091956, -0.40283504],
                     [-0.37162401, -2.19400981], [0.74680821, 1.63827342],
                     [0.61472253, -0.82309052], [0.19893132, -0.47761769],
                     [1.40301027, -0.83648734], [-1.20515198, -1.02689695],
                     [-0.23374509, 0.18370049], [-0.00288378, 0.84259929],
                     [1.79580611, -0.02219234], [0.38307743, -0.05670439],
                     [0.70319159, -0.02571667], [0.75052536, -0.19246518]])
    y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #17
def resample():
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    resample_train = SMOTETomek(sampling_strategy='all',
                                smote=SMOTE(n_jobs=4),
                                tomek=TomekLinks(n_jobs=4))
    resample_test = SMOTETomek(sampling_strategy='all',
                               smote=SMOTE(n_jobs=4),
                               tomek=TomekLinks(n_jobs=4))

    print('Beginning train resample...')
    X = np.concatenate((train_switch, train_non_switch))
    y = np.concatenate(
        (np.zeros(train_switch.shape[0]), np.ones(train_non_switch.shape[0])))
    X_res, y_res = resample_train.fit_resample(X, y)

    train_switch = []
    train_non_switch = []
    for i in range(X_res.shape[0]):
        if y_res[i] == 0:
            train_switch.append(X_res[i])
        else:
            train_non_switch.append(X_res[i])

    np.save('data/train_switch_w_64_f_20_samp.npy', np.array(train_switch))
    np.save('data/train_non_switch_w_64_f_20_samp.npy',
            np.array(train_non_switch))

    print('Beginning test resample...')
    X = np.concatenate((test_switch, test_non_switch))
    y = np.concatenate(
        (np.zeros(test_switch.shape[0]), np.ones(test_non_switch.shape[0])))
    X_res, y_res = resample_test.fit_resample(X, y)

    test_switch = []
    test_non_switch = []
    for i in range(X_res.shape[0]):
        if y_res[i] == 0:
            test_switch.append(X_res[i])
        else:
            test_non_switch.append(X_res[i])

    np.save('data/test_switch_w_64_f_20_samp.npy', np.array(test_switch))
    np.save('data/test_non_switch_w_64_f_20_samp.npy',
            np.array(test_non_switch))
    return
def main_cv_loop(Xdata,
                 Ydata,
                 clf,
                 parameters,
                 n_folds=4,
                 oversample_thresh=0.1,
                 verbose=False):

    # use stratified K-fold CV to get roughly equal folds
    #kf=StratifiedKFold(n_splits=nfolds)
    kf = StratifiedShuffleSplit(n_splits=4, test_size=0.2)
    # use oversampling if the difference in prevalence is greater than 20%
    if numpy.abs(numpy.mean(Ydata) - 0.5) > oversample_thresh:
        oversample = 'smote'
    else:
        oversample = 'none'

    # variables to store outputs
    pred = numpy.zeros(len(Ydata))  # predicted values
    kernel = []
    C = []
    fa_ctr = 0

    for train, test in kf.split(Xdata, Ydata):
        Xtrain = Xdata[train, :]
        Xtest = Xdata[test, :]
        Ytrain = Ydata[train]
        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            sm = SMOTETomek()
            Xtrain, Ytrain = sm.fit_sample(Xtrain, Ytrain)

        best_estimator_, bestroc, fa = inner_cv_loop(Xtrain,
                                                     Ytrain,
                                                     clf,
                                                     parameters,
                                                     verbose=True)
        if fa is not None:
            if verbose:
                print('transforming using fa')
                print(fa)
            tmp = fa.transform(Xtest)
            Xtest = tmp
            fa_ctr += 1
        pred.flat[test] = best_estimator_.predict_proba(Xtest)[:, 1]  # positive-class probability
        kernel.append(best_estimator_.kernel)
        C.append(best_estimator_.C)
    return roc_auc_score(Ydata, pred, average='weighted'), Ydata, pred
def test_sample_regular():
    """Test sample function with regular SMOTE."""

    # Create the object
    smote = SMOTETomek(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_tomek_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_tomek_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #21
def my_BalancedSample(df, target, choice=1):

    from imblearn.combine import SMOTETomek
    from imblearn.combine import SMOTEENN

    columns = df.columns.difference([target])
    print('\nthe data originally has a shape--------->\n',
          df[target].value_counts())
    model = SMOTETomek() if choice == 1 else SMOTEENN()
    X_smt, y_smt = model.fit_sample(df[columns], df[target])
    X_smt = pd.DataFrame(X_smt, columns=columns)
    X_smt[target] = y_smt
    print('\nthe data now has a shape------->\n', X_smt[target].value_counts())

    return (X_smt)
Example #22
def balance_samples(features, labels):
    prev_count = len(features)
    analize_fold_balance(labels)

    print('Performing resample with SMOTETomek...')
    print('Original train hfo count : {0}'.format(prev_count))

    # smt = RepeatedEditedNearestNeighbours( n_jobs=-1)
    # smt = NeighbourhoodCleaningRule(sampling_strategy='majority', n_neighbors=3, n_jobs=-1)
    smt = SMOTETomek(sampling_strategy=1, random_state=42, n_jobs=4)
    features, labels = smt.fit_resample(features, labels)
    post_count = len(features)

    print('{0} instances after SMOTE...'.format(post_count))
    return features, labels
Example #23
def learning_curve(X_train,
                   X_test,
                   y_train,
                   y_test,
                   model=sklearn.svm.SVC(),
                   observations=[50, 75, 100, 125, 150]):
    recalls = []
    f1s = []
    precs = []
    accs = []

    for n in observations:
        smt = SMOTETomek(ratio='auto')
        # resample only the first n training observations for this point on the curve
        X_resampled, y_resampled = smt.fit_sample(X_train[:n], y_train[:n])
        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test)

        f1 = f1_score(y_pred=y_pred, y_true=y_test, average='macro')
        acc = accuracy_score(y_pred=y_pred, y_true=y_test)
        prec = precision_score(y_pred=y_pred,
                               y_true=y_test,
                               average='weighted')
        recall = recall_score(y_pred=y_pred, y_true=y_test)

        f1s.append(f1)
        accs.append(acc)
        precs.append(prec)
        recalls.append(recall)
    plt.plot(observations, f1s, linewidth=4, color='blue', label='f1')
    plt.plot(observations, accs, linewidth=4, color='red', label='accuracy')
    plt.plot(observations,
             precs,
             linewidth=4,
             color='green',
             label='precision')
    plt.plot(observations,
             recalls,
             linewidth=4,
             color='orange',
             label='recalls')
    plt.legend()

    plt.title("RandomUnderSampler Learning Curve", fontsize=16)
    plt.gca().set_xlabel("# of Points per Class", fontsize=14)
    plt.gca().set_ylabel("Training Accuracy", fontsize=14)
    sns.despine()
    return f1s, accs, precs, recalls, smt, model
Example #24
def getUnderAndOverSamplers():
    samplers = {
        'SMOTEENN': SMOTEENN(sampling_strategy=0.5, n_jobs=-1),
        # 'SMOTEENN': SMOTEENN(sampling_strategy=0.5, n_jobs=-1)
        'SMOTETomek': SMOTETomek(sampling_strategy=0.5, n_jobs=-1)
    }
    return samplers
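A short sketch of consuming the sampler dictionary above, with illustrative data; both combined samplers expose fit_resample:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)
for name, sampler in getUnderAndOverSamplers().items():
    X_res, y_res = sampler.fit_resample(X, y)
    print(name, X.shape, '->', X_res.shape)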
Example #25
def Balance_classes(X_train, y_train, Sampling_Function):
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
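A calling sketch for Balance_classes; the ratio=, k= and size_ngh= arguments above imply an early imbalanced-learn release, so this assumes that same old API. The data is illustrative:

from sklearn.datasets import make_classification

X_train, y_train = make_classification(n_samples=300, weights=[0.85, 0.15],
                                        random_state=1)
X_res, y_res = Balance_classes(X_train, y_train, 'SMOTETomek')
print(len(y_train), '->', len(y_res))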
Example #26
    def __init__(self,  window_size=6, training_ratio=.7, seq="sequence", pos="label"):
        self.training_ratio = training_ratio  # Float value representing % of data used for training
        self.features = []
        self.labels = []
        self.words = []
        self.window_size = window_size
        self.supervised_classifiers = {"forest": RandomForestClassifier(n_jobs=4),
                                       "mlp_adam": MLPClassifier(),
                                       "svc": svm.SVC(verbose=1),
                                       "xgb": XGBClassifier(max_delta_step=5),
                                       "bagging": BaggingClassifier(), "one_class_svm": OneClassSVM(kernel="rbf")
                                       }

        self.imbalance_functions = {"easy_ensemble": EasyEnsemble(), "SMOTEENN": SMOTEENN(),
                                    "SMOTETomek": SMOTETomek(), "ADASYN": ADASYN(),
                                    "random_under_sample": RandomUnderSampler(), "ncl": NeighbourhoodCleaningRule(),
                                    "near_miss": NearMiss(), "pass": -1}
        self.seq = seq
        self.pos = pos
        self.random_data = 0
        self.test_results = 0
        self.vecs = {"sequence": sequence_vector, "chemical": chemical_vector, "binary": binary_vector, "w2v": "w2v"}
        self.vector = 0
        self.features_labels = {}
        self.test_cv = 0
        self.benchmark_mcc = 0
        self.mcc_scorer = make_scorer(matthews_corrcoef)
    def balanceData(self, method: str = "mixsampling") -> None:

        """
        Function -> balanceData
        Balance data classes with the selected method

        Parameters
        ---------------------------------------------------------------------------
            method => mixsampling, undersampling or oversampling

        Return
        ---------------------------------------------------------------------------
            None => Modify self.balanceObj
        """

        if method == "mixsampling":
            from imblearn.combine import SMOTETomek
            self.balanceObj = SMOTETomek(sampling_strategy='auto')

        elif method == "undersampling":
            from imblearn.under_sampling import NearMiss
            self.balanceObj = NearMiss(sampling_strategy= "auto", n_neighbors=3, version=2)

        elif method == "oversampling":
            from imblearn.over_sampling import RandomOverSampler
            self.balanceObj = RandomOverSampler(sampling_strategy = "auto")

        else:
            raise NameError(f"{method} method not defined")
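balanceData only stores the chosen sampler in self.balanceObj; the resampling itself happens wherever the class later calls fit_resample on it. A standalone sketch of that two-step flow with illustrative data:

from sklearn.datasets import make_classification
from imblearn.combine import SMOTETomek

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
balanceObj = SMOTETomek(sampling_strategy='auto')  # what method="mixsampling" selects
X_res, y_res = balanceObj.fit_resample(X, y)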
def smote_classify(X, y):
    # X_class, y_class = make_classification(
    #                                n_samples=10000,
    #                                random_state=10,
    #                                n_classes=2,
    #                                n_informative = 4
    #                                      )
    # X_class, y_class = make_classification(
    #     random_state=10,
    #     n_classes=2,
    # )
    print('Original dataset shape %s' % Counter(y))
    smt = SMOTETomek(random_state=42)
    X_res, y_res = smt.fit_resample(X, y)
    print('Resampled dataset shape %s' % Counter(y_res))
    return X_res, y_res
def test_validate_estimator_default():
    smt = SMOTETomek(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336],
                     [0.62366841, -0.21312976], [1.61091956, -0.40283504],
                     [-0.37162401, -2.19400981], [0.74680821, 1.63827342],
                     [0.61472253, -0.82309052], [0.19893132, -0.47761769],
                     [1.40301027, -0.83648734], [-1.20515198, -1.02689695],
                     [-0.23374509, 0.18370049], [-0.00288378, 0.84259929],
                     [1.79580611, -0.02219234], [0.38307743, -0.05670439],
                     [0.70319159, -0.02571667], [0.75052536, -0.19246518]])
    y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #30
def sampling_factory(X,Y,ratio,cat):
    if cat == 'ros':
        sampling = 'random_over_sampling'
        data = RandomOverSampler(ratio=ratio)
    elif cat == 'rus':
        sampling = 'random_under_sampling'
        data = RandomUnderSampler(ratio=ratio)
    elif cat == 'smo':
        sampling = 'SMOTE_over_sampling'
        data = SMOTE(ratio=ratio)
    elif cat == 'smob1':
        sampling = 'borderline_SMOTE1_over_sampling'
        data = SMOTE(ratio=ratio,kind='borderline1')
    elif cat == 'smob2':
        sampling = 'borderline_SMOTE2_over_sampling'
        data = SMOTE(ratio=ratio,kind='borderline2')
    elif cat == 'sme':
        sampling = 'SMOTEENN_combine_sampling'
        data = SMOTEENN(ratio=ratio,random_state=42)
    else :
        sampling = 'SMOTETomek_combine_sampling'
        data = SMOTETomek(random_state=42)
    X_resampled,y_resampled = data.fit_sample(X,Y)
    X2 = pd.DataFrame(X_resampled)
    # columns rename
    X2.columns = X.columns.values
    return X2
class ResamplingAlgorithms(Enum):
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        filtered_algos = filter(lambda ra: ra.value[0] == name,
                                ResamplingAlgorithms)
        return next(filtered_algos, ResamplingAlgorithms.RO)
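Each member of ResamplingAlgorithms pairs a display name with a ready-made sampler, so a lookup plus fit_resample covers the whole workflow. A usage sketch with illustrative data:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=1)
name, sampler = ResamplingAlgorithms.get_algorithm_by_name('SMOTE+TL').value
X_res, y_res = sampler.fit_resample(X, y)
print(name, X.shape, '->', X_res.shape)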
def test_smote_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Create the object
    smote = SMOTETomek(random_state=RND_SEED)
    assert_raises(RuntimeError, smote.sample, X, Y)
def test_sample_regular_half():
    sampling_strategy = {0: 9, 1: 12}
    smote = SMOTETomek(
        sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, Y)
    X_gt = np.array([[0.68481731, 0.51935141], [0.62366841, -0.21312976],
                     [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
                     [0.74680821, 1.63827342], [0.61472253, -0.82309052],
                     [0.19893132, -0.47761769], [1.40301027, -0.83648734],
                     [-1.20515198, -1.02689695], [-0.23374509, 0.18370049],
                     [-0.00288378, 0.84259929], [1.79580611, -0.02219234],
                     [0.45784496, -0.1053161]])
    y_gt = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.5."""

    # Create the object
    ratio = 0.5
    smote = SMOTETomek(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, "data", "smote_tomek_reg_x_05.npy"))
    y_gt = np.load(os.path.join(currdir, "data", "smote_tomek_reg_y_05.npy"))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #35
def SMOTE_methods(df_train, target, method):
    '''The output data has been normalized by MinMaxScaler'''
    scaler = MinMaxScaler()
    X = df_train.drop([target], axis=1)
    y = df_train[target]
    X_normalized = scaler.fit_transform(X)
    if method == 'regular':
        X_res, y_res = SMOTE(kind='regular').fit_sample(X_normalized, y)
    elif method == 'borderline1':
        X_res, y_res = SMOTE(kind='borderline1').fit_sample(X_normalized, y)
    elif method == 'borderline2':
        X_res, y_res = SMOTE(kind='borderline2').fit_sample(X_normalized, y)
    elif method == 'svm':
        X_res, y_res = SMOTE(kind='svm').fit_sample(X_normalized, y)
    elif method == 'Tomek':
        sm = SMOTETomek()
        X_res, y_res = sm.fit_sample(X_normalized, y)
    elif method == 'ENN':
        sm = SMOTEENN()
        X_res, y_res = sm.fit_sample(X_normalized, y)
    else:
        raise ValueError('Invalid sampling method')
    df_final = pd.DataFrame(X_res, columns=X.columns)
    df_final['target'] = y_res
    return df_final
def test_error_wrong_object():
    smote = 'rnd'
    tomek = 'rnd'
    smt = SMOTETomek(smote=smote, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        smt.fit_sample(X, Y)
    smt = SMOTETomek(tomek=tomek, random_state=RND_SEED)
    with raises(ValueError, match="tomek needs to be a TomekLinks"):
        smt.fit_sample(X, Y)
Example #37
class Resampling:
    def __init__(self, name):
        self.strategie = None
        self.name = name

        if name == "enn":
            self.strategie = EditedNearestNeighbours(sampling_strategy='auto',
                                                     n_neighbors=3,
                                                     kind_sel='all',
                                                     n_jobs=-1)
        elif name == "allknn":
            self.strategie = AllKNN(sampling_strategy='auto',
                                    n_neighbors=3,
                                    kind_sel='all',
                                    allow_minority=False,
                                    n_jobs=-1)
        elif name == "renn":
            self.strategie = RepeatedEditedNearestNeighbours(
                sampling_strategy='auto',
                n_neighbors=3,
                max_iter=100,
                kind_sel='all',
                n_jobs=-1)

        elif name == "tomek":
            self.strategie = TomekLinks(sampling_strategy='auto', n_jobs=-1)

        elif name == "smote":
            self.strategie = SMOTE(sampling_strategy='auto',
                                   k_neighbors=5,
                                   n_jobs=-1,
                                   random_state=42)

        elif name == "bdsmote":
            self.strategie = BorderlineSMOTE(random_state=42, n_jobs=-1)

        elif name == "adasyn":
            self.strategie = ADASYN(sampling_strategy='auto',
                                    n_neighbors=5,
                                    n_jobs=-1,
                                    random_state=42)

        elif name == "smoteenn":
            self.strategie = SMOTEENN(sampling_strategy='auto',
                                      smote=None,
                                      enn=None,
                                      n_jobs=-1,
                                      random_state=42)

        elif name == "smotetomek":
            self.strategie = SMOTETomek(sampling_strategy='auto',
                                        smote=None,
                                        tomek=None,
                                        n_jobs=-1,
                                        random_state=42)

    def fit_resample(self, x, y):
        x_res, y_res = self.strategie.fit_resample(x, y)
        return x_res, y_res
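A usage sketch for the Resampling wrapper above; the constructor name strings map to the branches in __init__, and it assumes an imbalanced-learn version that accepts the n_jobs and random_state arguments used there. Data is illustrative:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=42)
resampler = Resampling("smotetomek")
X_res, y_res = resampler.fit_resample(X, y)
print(X.shape, '->', X_res.shape)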
Example #38
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    '''
    Function for the classification task - Trains and tests the classifier clf using 10-fold cross-validation
    If the normalize flag is True, the data are normalised
    The sampling parameter sets the type of sampling to be used
    '''
    print('----------{} with {}----------'.format(clf_name, sampling))
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    plot_ind = randint(0, 9)
    j = 0
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(usx, usy):
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]

        if sampling == 'SMOTE':
            x_train, y_train = SMOTE(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'ADASYN':
            x_train, y_train = ADASYN(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'ENN':
            x_train, y_train = EditedNearestNeighbours().fit_resample(x_train, y_train)
        elif sampling == 'Tomek':
            x_train, y_train = TomekLinks().fit_resample(x_train, y_train)
        elif sampling == 'SMOTETomek':
            x_train, y_train = SMOTETomek(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'SMOTEENN':
            x_train, y_train = SMOTEENN(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'NCR':
            x_train, y_train = NeighbourhoodCleaningRule().fit_resample(x_train, y_train)
        elif sampling == 'OSS':
            x_train, y_train = OneSidedSelection().fit_resample(x_train, y_train)

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

        clf.fit(x_train, y_train)

        # if plot_ind == j and clf_name == 'DecisionTreeClassifier':
        #     plot_decision_tree(clf)

        y_predict = clf.predict(x_test)

        for i in range(len(y_predict)):
            if y_test[i] and y_predict[i]:
                totalTP += 1
            if not y_test[i] and y_predict[i]:
                totalFP += 1
            if y_test[i] and not y_predict[i]:
                totalFN += 1
            if not y_test[i] and not y_predict[i]:
                totalTN += 1
        j += 1

    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
def SMOTE_Tomek(X_train,
                Y_train,
                seed,
                sampling_strategy,
                k_neighbors_smote=5):
    tl = TomekLinks(random_state=seed, n_jobs=-1)
    smote = SMOTE(random_state=seed, n_jobs=-1, k_neighbors=k_neighbors_smote,
                  sampling_strategy=sampling_strategy)
    smote_tomek = SMOTETomek(random_state=seed, smote=smote, tomek=tl)
    print('Before SMOTE + Tomek : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = smote_tomek.fit_resample(
        X_train, Y_train)
    print('After SMOTE + Tomek : ', sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)

    return X_train_resampled, Y_train_resampled
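A calling sketch for SMOTE_Tomek above. shuffle_dataset is an external helper not shown here, so a stand-in built on sklearn.utils.shuffle is defined first; passing random_state to TomekLinks also implies an older imbalanced-learn release. Data is illustrative:

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.utils import shuffle

def shuffle_dataset(X, y, seed):  # stand-in for the helper used by SMOTE_Tomek
    return shuffle(X, y, random_state=seed)

X_train, Y_train = make_classification(n_samples=400, weights=[0.85, 0.15],
                                       random_state=7)
X_res, Y_res = SMOTE_Tomek(X_train, Y_train, seed=7, sampling_strategy='auto')
print(sorted(Counter(Y_res).items()))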
def main_cv_loop(Xdata,Ydata,clf,parameters,
                n_folds=4,oversample_thresh=0.1,verbose=False):

    # use stratified K-fold CV to get roughly equal folds
    #kf=StratifiedKFold(n_splits=nfolds)
    kf=StratifiedShuffleSplit(n_splits=4,test_size=0.2)
    # use oversampling if the difference in prevalence is greater than 20%
    if numpy.abs(numpy.mean(Ydata)-0.5)>oversample_thresh:
        oversample='smote'
    else:
        oversample='none'

    # variables to store outputs
    pred=numpy.zeros(len(Ydata))  # predicted values
    pred_proba=numpy.zeros(len(Ydata))  # predicted values
    kernel=[]
    C=[]
    fa_ctr=0

    for train,test in kf.split(Xdata,Ydata):
        Xtrain=Xdata[train,:]
        Xtest=Xdata[test,:]
        Ytrain=Ydata[train]
        if numpy.abs(numpy.mean(Ytrain)-0.5)>0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            sm = SMOTETomek()
            Xtrain, Ytrain = sm.fit_sample(Xtrain, Ytrain)

        best_estimator_,bestroc,fa=inner_cv_loop(Xtrain,Ytrain,clf,
                    parameters,verbose=True)
        if fa is not None:
            if verbose:
                print('transforming using fa')
                print(fa)
            tmp=fa.transform(Xtest)
            Xtest=tmp
            fa_ctr+=1
        pred_proba.flat[test]=best_estimator_.predict_proba(Xtest)[:,1]  # positive-class probability
        pred.flat[test]=best_estimator_.predict(Xtest)
        kernel.append(best_estimator_.kernel)
        C.append(best_estimator_.C)
    return roc_auc_score(Ydata,pred,average='weighted'),Ydata,pred,pred_proba
def test_error_wrong_object():
    smote = 'rnd'
    tomek = 'rnd'
    smt = SMOTETomek(smote=smote, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        smt.fit_resample(X, Y)
    smt = SMOTETomek(tomek=tomek, random_state=RND_SEED)
    with raises(ValueError, match="tomek needs to be a TomekLinks"):
        smt.fit_resample(X, Y)
def test_parallelisation():
    # Check if default job count is 1
    smt = SMOTETomek(random_state=RND_SEED)
    smt._validate_estimator()
    assert smt.n_jobs == 1
    assert smt.smote_.n_jobs == 1
    assert smt.tomek_.n_jobs == 1

    # Check if job count is set
    smt = SMOTETomek(random_state=RND_SEED, n_jobs=8)
    smt._validate_estimator()
    assert smt.n_jobs == 8
    assert smt.smote_.n_jobs == 8
    assert smt.tomek_.n_jobs == 8
def test_error_wrong_object(smote_params, err_msg):
    smt = SMOTETomek(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        smt.fit_resample(X, Y)
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=100, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE + Tomek links
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],