Example #1
0
def test_nm_wrong_nn_obj():
    """Check the error raised for invalid nearest-neighbours arguments.

    A plain string is passed first as ``n_neighbors`` and then as
    ``n_neighbors_ver3`` (with ``version=3``). In both cases the test expects
    ``fit_sample`` to raise a ``ValueError`` matching "has to be one of".
    """
    expected_message = "has to be one of"
    ratio = 'auto'

    # Case 1: ``n_neighbors`` is a string.
    invalid_nn = 'rnd'
    sampler = NearMiss(
        ratio=ratio,
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=invalid_nn,
    )
    with raises(ValueError, match=expected_message):
        sampler.fit_sample(X, Y)

    # Case 2: ``n_neighbors`` is an estimator but ``n_neighbors_ver3`` is a
    # string.
    invalid_nn3 = 'rnd'
    valid_nn = NearestNeighbors(n_neighbors=3)
    sampler_v3 = NearMiss(
        ratio=ratio,
        random_state=RND_SEED,
        version=3,
        return_indices=True,
        n_neighbors=valid_nn,
        n_neighbors_ver3=invalid_nn3,
    )
    with raises(ValueError, match=expected_message):
        sampler_v3.fit_sample(X, Y)
Example #2
0
def test_nm3_fit_sample_nn_obj():
    """Resample with NearMiss when both neighbour arguments are estimators.

    ``n_neighbors`` and ``n_neighbors_ver3`` are given as
    ``NearestNeighbors`` instances. The resampled data, the labels and the
    indices of the selected samples are compared with hard-coded values.
    """
    # Under-sampling configuration.
    knn = NearestNeighbors(n_neighbors=3)
    knn_ver3 = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=knn,
        n_neighbors_ver3=knn_ver3,
    )

    # Resample; ``return_indices=True`` adds the kept indices to the result.
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Expected results.
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [1.17737838, -0.2002118],
        [-0.60413357, 0.24628718],
        [0.03142011, 0.12323596],
        [1.15157493, -1.2981518],
        [-0.54619583, 1.73009918],
        [0.99272351, -0.11631728],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_idx = np.array([3, 10, 11, 0, 2, 3, 5, 1, 4])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Example #3
0
def test_nm3_fit_sample_half():
    """Test fit and sample routines with a float ratio (0.7)."""
    # Build the sampler with a fractional under-sampling ratio.
    sampler = NearMiss(
        ratio=.7,
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
    )

    # Fit and resample.
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected results.
    expected_X = np.array([
        [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
        [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
        [-0.05903827, 0.10947647], [1.15157493, -1.2981518],
        [-0.54619583, 1.73009918], [0.99272351, -0.11631728],
        [0.45713638, 1.31069295],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Example #4
0
def under_sample_near_miss(X, y, k_neighbors: int, under_sampling_mode: int, out_file_name: str):
    """Under-sample a 4-D dataset with NearMiss and log statistics to a file.

    ``X`` is flattened to two dimensions for the sampler and reshaped back to
    its original trailing dimensions afterwards. Class counts and array
    shapes are written to ``out_file_name`` before and after resampling; the
    post-resampling figures are also printed.

    Parameters
    ----------
    X : ndarray
        Feature array with exactly four dimensions
        ``(n_samples, n_x, n_y, n_z)``.
    y : ndarray
        Label array; it is flattened with ``ravel`` before resampling.
    k_neighbors : int
        Forwarded to ``NearMiss`` as ``n_neighbors_ver3``.
    under_sampling_mode : int
        Forwarded to ``NearMiss`` as ``version``.
    out_file_name : str
        Path of the log file; it is overwritten.

    Returns
    -------
    tuple
        ``(under_X, under_y)`` where ``under_X`` has shape
        ``(n_kept, n_x, n_y, n_z)`` and ``under_y`` has shape ``(n_kept, 1)``.

    Raises
    ------
    ValueError
        If ``X.shape`` does not have exactly four entries.
    """
    print('Undersampling using "Near Miss"-Method.')
    # The unpacking doubles as a check that X is 4-dimensional.
    _, n_x, n_y, n_z = X.shape

    # The context manager closes the log file even if resampling fails.
    with open(out_file_name, 'w') as f:
        f.write('Read y==0 count: ' + str(np.count_nonzero(y == 0)) + '\n')
        f.write('Read y==1 count: ' + str(np.count_nonzero(y == 1)) + '\n')
        f.write('Read X Shape: ' + str(X.shape) + '\n')
        f.write('Read y Shape: ' + str(y.shape) + '\n')
        f.write('Undersampling version: ' + str(under_sampling_mode) + '\n')
        f.write('Undersampling k_neighbors: ' + str(k_neighbors) + '\n')

        under_sampler = NearMiss(version=under_sampling_mode, n_neighbors_ver3=k_neighbors)
        f.write('Undersampler: ' + str(under_sampler) + '\n')

        # The sampler is fed 2-D features and 1-D labels.
        under_X = X.reshape(X.shape[0], -1)
        under_y = y.ravel()
        # Drop the local references to the inputs before resampling.
        del X
        del y

        # Use the flattened labels: ``y`` was deleted above, so passing it
        # here (as the previous code did) raised UnboundLocalError.
        under_X, under_y = under_sampler.fit_sample(under_X, under_y)
        under_X = under_X.reshape(under_X.shape[0], n_x, n_y, n_z)
        under_y = under_y[:, np.newaxis]

        summary_lines = (
            'Undersampled y==0 count: ' + str(np.count_nonzero(under_y == 0)),
            'Undersampled y==1 count: ' + str(np.count_nonzero(under_y == 1)),
            'Undersampled X Shape: ' + str(under_X.shape),
            'Undersampled y Shape: ' + str(under_y.shape),
        )
        for line in summary_lines:
            print(line)
        for line in summary_lines:
            f.write(line + '\n')

    return under_X, under_y
def test_nm3_fit_sample_nn_obj():
    """Test fit-sample when the neighbour arguments are estimator objects."""
    ratio = 'auto'

    # Both neighbour arguments are NearestNeighbors instances.
    first_stage_nn = NearestNeighbors(n_neighbors=3)
    second_stage_nn = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(ratio=ratio, random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True,
                       n_neighbors=first_stage_nn,
                       n_neighbors_ver3=second_stage_nn)

    # Fit and sample, also collecting the indices of the kept samples.
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Compare each output with its hard-coded expected value.
    assert_array_equal(
        X_resampled,
        np.array([
            [0.91464286, 1.61369212],
            [-0.80809175, -1.09917302],
            [-0.20497017, -0.26630228],
            [1.17737838, -0.2002118],
            [-0.60413357, 0.24628718],
            [0.03142011, 0.12323596],
            [1.15157493, -1.2981518],
            [-0.54619583, 1.73009918],
            [0.99272351, -0.11631728],
        ]))
    assert_array_equal(y_resampled, np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]))
    assert_array_equal(idx_under, np.array([3, 10, 11, 0, 2, 3, 5, 1, 4]))
def test_nm2_fit_sample_nn_obj():
    """Test fit-sample when ``n_neighbors`` is a NearestNeighbors object."""
    # Sampler configuration.
    knn = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=knn,
    )

    # Fit and sample, also collecting the indices of the kept samples.
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Expected results.
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [-0.05903827, 0.10947647],
        [0.03142011, 0.12323596],
        [-0.60413357, 0.24628718],
        [0.50701028, -0.17636928],
        [0.4960075, 0.86130762],
        [0.45713638, 1.31069295],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_idx = np.array([3, 10, 11, 2, 8, 5, 9, 1, 6])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Example #7
0
def test_nm_fit_sample_auto():
    """Resample with every NearMiss version using the 'auto' strategy.

    For each entry of ``VERSION_NEARMISS`` the resampled data and labels are
    compared with the expected arrays stored at the same position.
    """
    # Expected features, one array per entry of VERSION_NEARMISS.
    expected_X_all = [
        np.array([
            [0.91464286, 1.61369212],
            [-0.80809175, -1.09917302],
            [-0.20497017, -0.26630228],
            [-0.05903827, 0.10947647],
            [0.03142011, 0.12323596],
            [-0.60413357, 0.24628718],
            [0.50701028, -0.17636928],
            [0.4960075, 0.86130762],
            [0.45713638, 1.31069295],
        ]),
        np.array([
            [0.91464286, 1.61369212],
            [-0.80809175, -1.09917302],
            [-0.20497017, -0.26630228],
            [-0.05903827, 0.10947647],
            [0.03142011, 0.12323596],
            [-0.60413357, 0.24628718],
            [0.50701028, -0.17636928],
            [0.4960075, 0.86130762],
            [0.45713638, 1.31069295],
        ]),
        np.array([
            [0.91464286, 1.61369212],
            [-0.80809175, -1.09917302],
            [-0.20497017, -0.26630228],
            [1.17737838, -0.2002118],
            [-0.60413357, 0.24628718],
            [0.03142011, 0.12323596],
            [1.15157493, -1.2981518],
            [-0.54619583, 1.73009918],
            [0.99272351, -0.11631728],
        ]),
    ]
    # Expected labels, one array per entry of VERSION_NEARMISS.
    expected_y_all = [
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
    ]

    for position, version in enumerate(VERSION_NEARMISS):
        sampler = NearMiss(sampling_strategy='auto', version=version)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, expected_X_all[position])
        assert_array_equal(y_resampled, expected_y_all[position])
Example #8
0
def test_nm1_fit_sample_nn_obj():
    """Test fit-sample when ``n_neighbors`` is a NearestNeighbors object."""
    # Sampler configuration.
    knn = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=knn,
    )

    # Fit and sample, also collecting the indices of the kept samples.
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Expected results.
    expected_X = np.array([
        [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
        [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
        [0.50701028, -0.17636928], [0.4960075, 0.86130762],
        [0.45713638, 1.31069295],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_idx = np.array([3, 10, 11, 2, 8, 5, 9, 1, 6])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Example #9
0
def test_nm_fit_sample_auto_indices():
    """Resample with every NearMiss version and check the returned indices.

    For each entry of ``VERSION_NEARMISS`` the resampled data, the labels and
    the indices of the kept samples are compared with the expected arrays
    stored at the same position.
    """
    # Expected features, one array per entry of VERSION_NEARMISS.
    expected_X_all = [
        np.array([
            [0.91464286, 1.61369212],
            [-0.80809175, -1.09917302],
            [-0.20497017, -0.26630228],
            [-0.05903827, 0.10947647],
            [0.03142011, 0.12323596],
            [-0.60413357, 0.24628718],
            [0.50701028, -0.17636928],
            [0.4960075, 0.86130762],
            [0.45713638, 1.31069295],
        ]),
        np.array([
            [0.91464286, 1.61369212],
            [-0.80809175, -1.09917302],
            [-0.20497017, -0.26630228],
            [-0.05903827, 0.10947647],
            [0.03142011, 0.12323596],
            [-0.60413357, 0.24628718],
            [0.50701028, -0.17636928],
            [0.4960075, 0.86130762],
            [0.45713638, 1.31069295],
        ]),
        np.array([
            [0.91464286, 1.61369212],
            [-0.80809175, -1.09917302],
            [-0.20497017, -0.26630228],
            [1.17737838, -0.2002118],
            [-0.60413357, 0.24628718],
            [0.03142011, 0.12323596],
            [1.15157493, -1.2981518],
            [-0.54619583, 1.73009918],
            [0.99272351, -0.11631728],
        ]),
    ]
    # Expected labels.
    expected_y_all = [
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
    ]
    # Expected indices of the kept samples.
    expected_idx_all = [
        np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]),
        np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]),
        np.array([3, 10, 11, 0, 5, 8, 14, 4, 12]),
    ]

    for position, version in enumerate(VERSION_NEARMISS):
        sampler = NearMiss(ratio='auto', version=version, return_indices=True)
        X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, expected_X_all[position])
        assert_array_equal(y_resampled, expected_y_all[position])
        assert_array_equal(idx_under, expected_idx_all[position])
Example #10
0
def main():
    """Search for the lower threshold that gives the lowest score.

    The training CSV is loaded, the ``id`` column is discarded and the
    ``血糖`` (blood glucose) column is used as the target. For every candidate
    lower bound ``left`` in ``np.arange(3.5, 5.0, 0.02)`` the targets are
    bucketed into three classes:

    * 0 -- ``left <= y <= 6.1``
    * 1 -- ``y < left``
    * 2 -- everything else

    Class 0 is under-sampled to 3000 rows with NearMiss (version 2) while the
    other classes keep their size. The selected rows are passed to ``score``
    and the threshold with the lowest score is tracked and printed.
    """
    train = pd.read_csv("../data/processed/train.csv")
    train.pop("id")
    target = train.pop("血糖")

    # ``DataFrame.as_matrix``/``Series.as_matrix`` were removed in pandas 1.0;
    # ``.values`` returns the same NumPy array on old and new versions.
    train_x = train.values
    train_y = target.values

    best_score = 10
    best_left = 0
    for left in np.arange(3.5, 5.0, 0.02):
        # Vectorised bucketing; float labels match the original np.zeros array.
        in_range = (train_y >= left) & (train_y <= 6.1)
        below = train_y < left
        label_Y = np.select([in_range, below], [0.0, 1.0], default=2.0)

        # Only class 0 is reduced; classes 1 and 2 keep their current size.
        nm = NearMiss(
            ratio={
                0: 3000,
                1: len(np.where(label_Y == 1)[0]),
                2: len(np.where(label_Y == 2)[0]),
            },
            random_state=42,
            return_indices=True,
            version=2,
            n_neighbors=10,
        )
        X_res, y_res, index = nm.fit_sample(train_x, label_Y)

        # Score on the original targets of the selected rows.
        new_x = train_x[index]
        new_y = train_y[index]
        s = score(new_x, new_y)
        if s < best_score:
            best_score = s
            best_left = left
            print("greater")
            print(best_score, best_left)
        print(best_score, best_left)
    print(best_score, best_left)
Example #11
0
def test_nm1_fit_sample_half():
    """Test fit and sample routines with a float ratio (0.7)."""
    # Build the sampler with a fractional under-sampling ratio.
    sampler = NearMiss(
        ratio=.7,
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
    )

    # Fit and resample.
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected results.
    expected_X = np.array([
        [0.91464286, 1.61369212], [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
        [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
        [1.17737838, -0.2002118], [0.50701028, -0.17636928],
        [0.4960075, 0.86130762], [0.45713638, 1.31069295],
        [0.99272351, -0.11631728],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    def under_sampling(self, x_train, y_train):
        """Balance the training data with NearMiss (version 1) under-sampling.

        The number of samples with labels 0, 1 and 2 is logged to
        ``self.file_object`` before and after resampling, and entry/success
        messages are appended to ``Training_Logs/General_Log.txt``.

        Parameters
        ----------
        x_train : array-like
            Training features; stored on ``self.x_train``.
        y_train : array-like
            Training labels; stored on ``self.y_train``.

        Returns
        -------
        tuple
            ``(x_train, y_train)`` after under-sampling; the same objects are
            available as ``self.x_train`` and ``self.y_train``.

        Raises
        ------
        Exception
            If logging or resampling fails inside the guarded section. The
            original error message is written to ``self.file_object``.
        """
        general_log_path = 'Training_Logs/General_Log.txt'

        # ``with`` closes the general log even if the logger itself fails.
        with open(general_log_path, 'a+') as general_log:
            self.logger_object.log(
                general_log,
                'Entered under_sampling() method of PreProcessor class of data_preprocessing package'
            )

        self.x_train = x_train
        self.y_train = y_train
        try:
            for label in (0, 1, 2):
                self.logger_object.log(
                    self.file_object,
                    'x_train label %d shape before under-sampling :: %s' %
                    (label, str(sum(self.y_train == label))))

            nm = NearMiss(version=1)
            self.x_train, self.y_train = nm.fit_sample(self.x_train,
                                                       self.y_train)

            for label in (0, 1, 2):
                self.logger_object.log(
                    self.file_object,
                    'x_train label %d shape after under-sampling :: %s' %
                    (label, str(sum(self.y_train == label))))

            with open(general_log_path, 'a+') as general_log:
                self.logger_object.log(
                    general_log,
                    'Successfully Executed under_sampling() method of PreProcessor class of data_preprocessing package'
                )

            # Return the resampled data; the previous code returned the
            # untouched input arguments, discarding the under-sampling.
            return self.x_train, self.y_train
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in under_sampling() method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Under Sampling Unsuccessful. Exited the under_sampling() method of the Preprocessor class'
            )
            raise Exception()
def sampling(dataset):
    """Under-sample ``dataset`` with NearMiss.

    The dataset is split into features and labels with ``split_dataset`` and
    resampled with ``NearMiss(random_state=42)``.

    Returns
    -------
    tuple
        ``(x_res, y_res)`` -- the resampled features and labels.
    """
    features, labels = split_dataset(dataset)
    print("Under Sampling")
    # An over-sampling alternative (SMOTE with random_state=12, ratio=1.0)
    # was previously tried here in place of NearMiss.
    under_sampler = NearMiss(random_state=42)
    x_res, y_res = under_sampler.fit_sample(features, labels)
    return x_res, y_res
def apply_near_miss(df):
    """Under-sample ``df`` with a default NearMiss sampler.

    The column named ``'y'`` is used as the target and every other column as
    a feature.

    Returns
    -------
    tuple
        ``(x, y)`` -- the resampled features and labels.
    """
    is_target = df.columns == 'y'
    is_feature = df.columns != 'y'
    features = df.iloc[:, is_feature]
    target = df.iloc[:, is_target]

    sampler = NearMiss()
    # The target frame is flattened to a 1-D array for the sampler.
    resampled_x, resampled_y = sampler.fit_sample(features, target.values.ravel())
    return resampled_x, resampled_y
Example #15
0
def model_subSampling(X_train, y_train, X_test, y_test):
    """Under-sample the training set with NearMiss-1 and fit the model.

    The class distribution is printed before and after resampling, then the
    resampled training data and the untouched test data are handed to
    ``model_reg_log``. Nothing is returned.
    """
    sampler = NearMiss(version=1)
    distribution_before = Counter(y_train)
    print('NearMiss - clase minoritaria = 1 (antes) {}'.format(
        distribution_before))

    X_train_res, y_train_res = sampler.fit_sample(X_train, y_train)
    distribution_after = Counter(y_train_res)
    print('NearMiss - clase minoritaria = 1 {}'.format(distribution_after))

    model_reg_log(X_train_res, y_train_res, X_test, y_test)
Example #16
0
def modeloNaiveBayesSS():
    """Train and evaluate Gaussian Naive Bayes on NearMiss-resampled data.

    Steps performed:

    1. load ``dataset2.csv`` and select 50 features with ``SelectKBest``
       (target column: ``'Plag'``);
    2. split the whole dataset 70/30 into train and test;
    3. under-sample the training split with NearMiss version 1;
    4. fit ``GaussianNB`` on the selected features and print the train/test
       scores, the confusion matrix and the class distributions.
    """
    # Load the dataset stored as CSV.
    dataset = pd.read_csv('dataset2.csv')

    # Dimensionality reduction through feature selection with sklearn's
    # SelectKBest.
    X = dataset.drop(['Plag'], axis=1)
    y = dataset['Plag']

    selector = SelectKBest(k=50)
    selector.fit_transform(X, y)
    selected_positions = selector.get_support(indices=True)
    used_features = X.columns[selected_positions]

    # Split the dataset into training and test sets.
    X_train, X_test = train_test_split(dataset, test_size=0.3, random_state=6)
    y_train = X_train["Plag"]
    y_test = X_test["Plag"]

    # Under-sample the training set; the original author's note says this
    # reduces the majority class to the size of the minority class.
    under_sampler = NearMiss(
        sampling_strategy='auto',
        version=1,
        n_neighbors=3,
        n_neighbors_ver3=3,
        n_jobs=1,
    )
    X_train_res, y_train_res = under_sampler.fit_sample(X_train, y_train)

    # Gaussian classifier, trained on the selected features only.
    classifier = GaussianNB()
    classifier.fit(X_train_res[used_features].values, y_train_res)
    y_pred = classifier.predict(X_test[used_features])

    # Report the scores on both splits.
    print('Precisión en el set de Entrenamiento: {:.2f}'.format(
        classifier.score(X_train_res[used_features], y_train_res)))
    print('Precisión en el set de Test: {:.2f}'.format(
        classifier.score(X_test[used_features], y_test)))

    # Confusion matrix on the test split.
    print(confusion_matrix(y_test, y_pred))

    # Class distributions; the test split is never resampled, so its
    # "initial" and "final" distributions are the same.
    print("Distribución inicial de entrenamiento{}".format(Counter(y_train)))
    print("Distribución finalde entrenamiento: {}".format(
        Counter(y_train_res)))

    print("Distribución inicial de test {}".format(Counter(y_test)))
    print("Distribución final de test: {}".format(Counter(y_test)))
Example #17
0
def under_sampling(input_data_path, out_sample_path):
    """Perform NearMiss under-sampling to balance the dataset.

    Data is read with ``load_data(input_data_path)``, resampled with a
    default ``NearMiss`` sampler, and saved to ``out_sample_path`` with
    ``np.save`` as one array whose last column holds the labels. The class
    counts before and after resampling are printed.
    """
    X, Y = load_data(input_data_path)
    sampler = NearMiss()
    X_res, y_res = sampler.fit_sample(X, Y)

    counts_before = Counter(Y)
    print('Original data shape {}'.format(counts_before))
    counts_after = Counter(y_res)
    print('Resampled data shape {}'.format(counts_after))

    # Append the labels as the last column before saving.
    np.save(out_sample_path, np.c_[X_res, y_res])
 def resample(self, sampling_type):
     """Resample ``self.xtrain`` / ``self.ytrain`` in place.

     ``sampling_type`` selects the sampler: ``'over'`` uses
     ``RandomOverSampler``, ``'under'`` uses ``NearMiss`` and any other
     value uses ``SMOTETomek``.
     """
     over_sampler = RandomOverSampler(random_state=10)
     under_sampler = NearMiss()
     combined_sampler = SMOTETomek(random_state=10)

     if sampling_type == 'over':
         chosen = over_sampler
     elif sampling_type == 'under':
         chosen = under_sampler
     else:
         chosen = combined_sampler

     self.xtrain, self.ytrain = chosen.fit_sample(self.xtrain, self.ytrain)
Example #19
0
def treinaModelo(dfPopulacao, model):
    """Fit ``model`` on a NearMiss-balanced version of ``dfPopulacao``.

    The ``portifolio`` column is the target; ``portifolio`` and ``id`` are
    removed from the features. After under-sampling, a stratified split
    keeps 99% of the rows for training and ``model`` is fitted on them.

    Returns
    -------
    The fitted ``model``.
    """
    sampler = NearMiss()
    features = dfPopulacao.drop(['portifolio', 'id'], axis=1)
    labels = dfPopulacao.portifolio
    X, y = sampler.fit_sample(features, labels)

    # Only the training part of the split is used.
    split = train_test_split(X, y, random_state=1, test_size=0.01, stratify=y)
    X_train, _, y_train, _ = split

    model.fit(X_train, y_train)
    return model
Example #20
0
def modelEvaluation(train, test, sampling='normal', n_jobs=None):
    """Grid-search a random forest and record its test accuracy and F1.

    Categorical columns of ``train`` and ``test`` are label-encoded in
    place, the training data is optionally resampled, a
    ``RandomForestClassifier`` is tuned with a 3-fold ``GridSearchCV``
    (scoring ``'f1'``), and the results are appended to
    ``ValidationResults.txt``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Data frames containing a ``'churned'`` target column. Both are
        modified in place by the label encoding.
    sampling : str
        ``'smote'`` applies SMOTE and ``'nearmiss'`` applies NearMiss to the
        training data; any other value leaves it untouched.
    n_jobs : int or None
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_params, accuracy, f1_score)``.
    """
    for column in np.asarray(train.select_dtypes(include='category').columns):
        encoder = LabelEncoder()
        train[column] = encoder.fit_transform(train[column])
        # NOTE(review): the encoder is re-fitted on the test split, so the
        # integer codes may differ from the training ones when the two splits
        # do not contain the same categories -- confirm this is intended.
        test[column] = encoder.fit_transform(test[column])

    # ``axis`` is passed by keyword: pandas >= 2.0 no longer accepts it
    # positionally in ``DataFrame.drop``.
    x_train = np.asarray(train.drop('churned', axis=1))
    y_train = np.asarray(train['churned'])
    x_test = np.asarray(test.drop('churned', axis=1))
    y_test = np.asarray(test['churned'])

    grid_values = {
        'n_estimators': [900, 1200, 1500],
        'criterion': ['gini', 'entropy'],
        'max_depth': [7, 8, 9],
        'max_features': [5, 7, 9]
    }

    model = RandomForestClassifier()

    # Optional resampling of the training data only.
    if sampling == 'smote':
        smote = SMOTE()
        x_train, y_train = smote.fit_sample(x_train, y_train)
    if sampling == 'nearmiss':
        nm = NearMiss()
        x_train, y_train = nm.fit_sample(x_train, y_train)

    grid = GridSearchCV(model,
                        param_grid=grid_values,
                        scoring='f1',
                        cv=3,
                        n_jobs=n_jobs)
    grid.fit(x_train, y_train)
    best_params = grid.best_params_
    predictions = grid.predict(x_test)
    rf_accuraccy = accuracy_score(predictions, y_test)
    rf_f1_score = f1_score(predictions, y_test)

    # The ``with`` block closes the file; no explicit close() is needed.
    with open('ValidationResults.txt', 'a+') as f:
        f.write('{}\n'.format(datetime.now()))
        # NOTE(review): ``name`` is not defined in this function; presumably
        # a module-level variable -- verify.
        f.write(name.upper() + ':\n')
        f.write(sampling.upper() + ':\n')
        f.write(
            'Parameters found: {}\nAccuracy: {}\nF1_Score :{}\n\n\n'.format(
                best_params, rf_accuraccy, rf_f1_score))

    return best_params, rf_accuraccy, rf_f1_score
def load_dataset(data_file):
    """Load a pickled dataset, balance it with NearMiss-1 and split it.

    The pickle is expected to hold ``(X, Y, embed_mat)``. ``X`` is padded or
    truncated to ``MAX_SENTENCE_LEN``, the data is under-sampled and then
    split 80/20 into train and dev sets. ``sum(Y)`` over the original labels
    is printed as "# neg:".

    Returns
    -------
    tuple
        ``(x_train, x_dev, y_train, y_dev, embed_mat)``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open(data_file, "rb") as handle:
        X, Y, embed_mat = pickle.load(handle)

    X = pad_sequences(X, maxlen=MAX_SENTENCE_LEN, truncating="post")
    Y = np.array(Y)

    sampler = NearMiss(random_state=0, version=1)
    x_resampled, y_resampled = sampler.fit_sample(X, Y)

    split = train_test_split(x_resampled, y_resampled,
                             test_size=0.2, random_state=0)
    x_train, x_dev, y_train, y_dev = split

    # Counted on the labels before resampling.
    neg_sent_count = sum(Y)
    print("# neg:", neg_sent_count)
    return x_train, x_dev, y_train, y_dev, embed_mat
Example #22
0
    def _fix_imbalance(self):
        ''' Fix imbalance of size between classes

        When the largest class is more than 1.5 times the smallest one, the
        data is first under-sampled with NearMiss (ratio 0.75). In every case
        SMOTETomek is then applied. Results are stored in ``self.fX`` and
        ``self.fy`` as data frames indexed ``smp0``, ``smp1``, ...
        '''

        # FIXME find best ratio
        class_sizes = self.y.iloc[:, 0].value_counts()
        imbalance = class_sizes.max() / class_sizes.min()

        if imbalance > 1.5:
            under_sampler = NearMiss(ratio=0.75, random_state=42)
            fX, fy = under_sampler.fit_sample(self.X.values,
                                              self.y.values.ravel())
            sample_names = [f'smp{i}' for i in range(fX.shape[0])]
            self.fX = pd.DataFrame(fX, index=sample_names,
                                   columns=self.X.columns)
            self.fy = pd.DataFrame(fy, index=sample_names,
                                   columns=self.y.columns)
        else:
            # Balanced enough: skip the under-sampling step.
            self.fX = self.X
            self.fy = self.y

        combined_sampler = SMOTETomek(random_state=42)
        fX, fy = combined_sampler.fit_sample(self.fX.values,
                                             self.fy.values.ravel())
        sample_names = [f'smp{i}' for i in range(fX.shape[0])]
        self.fX = pd.DataFrame(fX, index=sample_names, columns=self.X.columns)
        self.fy = pd.DataFrame(fy, index=sample_names, columns=self.y.columns)

        log.info(f'Keeping {fX.shape[0]} samples, {fX.shape[1]} features')
def test_multiclass_fit_sample():
    """Test the fit-sample method with a multiclass target."""
    # Turn the target into a three-class problem.
    multiclass_y = Y.copy()
    multiclass_y[:1000] = 2

    # Resample the data.
    sampler = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    # Every class is expected to end up with 400 samples.
    class_counts = Counter(y_resampled)
    for label in (0, 1, 2):
        assert_equal(class_counts[label], 400)
def test_multiclass_fit_sample():
    """Test the fit-sample method with a multiclass target."""
    # Turn the target into a three-class problem.
    multiclass_y = Y.copy()
    multiclass_y[:1000] = 2

    # Resample the data.
    sampler = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    # Check the number of samples kept for each class.
    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 400), (1, 166), (2, 144)):
        assert_equal(class_counts[label], expected_count)
def test_nm2_fit_sample_half():
    """Test fit and sample routines with a 0.5 ratio.

    Expected outputs are loaded from ``.npy`` files in the ``data`` directory
    next to this test module.
    """
    # Build the sampler.
    sampler = NearMiss(ratio=.5, random_state=RND_SEED,
                       version=VERSION_NEARMISS)

    # Fit and resample.
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Load the stored expected results.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'nm2_x_05.npy'))
    expected_y = np.load(os.path.join(data_dir, 'nm2_y_05.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Example #26
0
def test_nm2_fit_sample_half():
    """Test fit and sample routines with a 0.5 ratio.

    The resampled arrays are compared with reference arrays stored as
    ``.npy`` files under ``data`` beside this module.
    """
    ratio = .5
    sampler = NearMiss(
        ratio=ratio,
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
    )
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Resolve the directory holding the reference arrays.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(module_dir, 'data')

    assert_array_equal(
        X_resampled, np.load(os.path.join(reference_dir, 'nm2_x_05.npy')))
    assert_array_equal(
        y_resampled, np.load(os.path.join(reference_dir, 'nm2_y_05.npy')))
def test_nm2_fit_sample_auto_indices():
    """Test fit and sample routines with auto ratio and indices support.

    Expected outputs are loaded from ``.npy`` files in the ``data`` directory
    next to this test module.
    """
    # Build the sampler; ``return_indices=True`` adds a third output.
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True)

    # Fit and resample.
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Load the stored expected results.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'nm2_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'nm2_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'nm2_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Example #28
0
def test_nm2_fit_sample_auto_indices():
    """Test fit and sample routines with auto ratio and indices support.

    The three outputs are compared with reference arrays stored as ``.npy``
    files under ``data`` beside this module.
    """
    ratio = 'auto'
    sampler = NearMiss(
        ratio=ratio,
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
    )
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Resolve the directory holding the reference arrays.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(module_dir, 'data')

    assert_array_equal(
        X_resampled, np.load(os.path.join(reference_dir, 'nm2_x.npy')))
    assert_array_equal(
        y_resampled, np.load(os.path.join(reference_dir, 'nm2_y.npy')))
    assert_array_equal(
        idx_under, np.load(os.path.join(reference_dir, 'nm2_idx.npy')))
def test_nm3_fit_sample_auto():
    """Test fit and sample routines with the 'auto' ratio."""
    # Build the sampler.
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
    )

    # Fit and resample.
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected results.
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [1.17737838, -0.2002118],
        [-0.60413357, 0.24628718],
        [0.03142011, 0.12323596],
        [1.15157493, -1.2981518],
        [-0.54619583, 1.73009918],
        [0.99272351, -0.11631728],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Example #30
0
def test_nm1_fit_sample_auto():
    """Test fit and sample routines with the 'auto' ratio."""
    # Build the sampler.
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
    )

    # Fit and resample.
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected results.
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [-0.05903827, 0.10947647],
        [0.03142011, 0.12323596],
        [-0.60413357, 0.24628718],
        [0.50701028, -0.17636928],
        [0.4960075, 0.86130762],
        [0.45713638, 1.31069295],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
        actual_predictions_labels,\
        loss, accuracy, f1_score, precision, recall, roc_auc=\
    test_run(allfeatures_smote_X,
             np.array(oh_smote_y),
             test_allfeatures,
             np.array(y_test_df['oh_label'].tolist()))
# Record the outcome of the SMOTE run.
test_results.append((
    'SMOTE',
    (clfmodel, history, cm,
     test_predictions_labels,
     actual_predictions_labels,
     loss, accuracy, f1_score, precision, recall, roc_auc),
))
print('SMOTE:', precision, roc_auc)

# NearMiss test: under-sample the non-SMOTE features and labels.
nr = NearMiss()
allfeatures_nearmiss_X, nearmiss_y = nr.fit_sample(allfeatures_nosmote_X,
                                                   tosmote_y)

# One-hot encode the resampled labels; the vector length is taken from the
# first entry of ``nosmote_y``.
oh_nearmiss_y = []
for slabel in nearmiss_y:
    ohl = np.zeros(len(nosmote_y[0]))
    ohl[slabel] = 1
    oh_nearmiss_y.append(ohl)

# Train and evaluate on the NearMiss-resampled data.
(clfmodel, history, cm,
 test_predictions_labels,
 actual_predictions_labels,
 loss, accuracy, f1_score, precision, recall, roc_auc) = test_run(
    allfeatures_nearmiss_X,
    np.array(oh_nearmiss_y),
    test_allfeatures,
    np.array(y_test_df['oh_label'].tolist()))
Example #32
0
# Columns used by the model; 'Delayed' is the target variable.
model_vars = ['Customer Address Country', 'Carrier', 'PPU day', 'FHS day', 'FDA day', 'Delivery day', 'PPU-FHS', 'FHS-FDA', 'FDA-Delivery', 'Delayed']

rel_data = df[model_vars]
# Convert categorical variables into numerical ones (author's note: this
# yields 56 columns).
rel_data_encoded = pd.get_dummies(rel_data)

# Separate predictors from the target variable.
x = rel_data_encoded.drop(['Delayed'], axis=1)      # predictor variables
y = rel_data_encoded['Delayed']                     # target variable

# Create training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)

# Under-sampling with NearMiss (training split only).
print('Under-sampling over-represented data...')
nm = NearMiss('not minority', random_state=42)
nm_x_train, nm_y_train = nm.fit_sample(x_train, y_train)

# The data was under-sampled, so the messages below say so; the trailing
# numbers are the values reported by the original author's run.
df_y_train = pd.DataFrame(data=nm_y_train, columns=['Delayed'])
print('Length of under-sampled data is', len(nm_x_train))   # 4668
print('Number of delayed shipments in under-sampled data is', len(nm_y_train[df_y_train['Delayed']==True]))   # 2334
print('Number of on time shipments is', len(nm_y_train[df_y_train['Delayed']==False]))  # 2334

# Scale data to feed the prediction models; the scaler is fitted on the
# resampled training data and reused for the test data.
print('Scaling data...')
scaler = StandardScaler().fit(nm_x_train)

nm_x_train = scaler.transform(nm_x_train)
x_test = scaler.transform(x_test)

"""
Building prediction models
# Target: the 'Class' column, kept as a one-column frame.
y = data.loc[:, data.columns == 'Class']
print("data size", collections.Counter(y['Class']))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    train_size=0.7,
                                                    random_state=0)
print("test data size", collections.Counter(y_test['Class']))
print("original training data size", collections.Counter(y_train['Class']))
y_train_arr = np.array(y_train['Class'])
X_train_arr = np.array(X_train)

# Candidate values for NearMiss-3's ``n_neighbors_ver3``.
A = [3, 5, 7]
for i in A:
    # Use the candidate value; it was previously hard-coded to 7, which made
    # every iteration of this loop identical.
    nm = NearMiss(version=3, random_state=5, n_neighbors_ver3=i)
    X_train_sampled, y_train_sampled = nm.fit_sample(X_train_arr, y_train_arr)
    print("sampled training data size", collections.Counter(y_train_sampled))

    # Random forest trained on the resampled data.
    clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
    clf.fit(X_train_sampled, y_train_sampled)
    y_pred = clf.predict(X_test)
    # Predictions are passed first, so rows are predicted labels and columns
    # are true labels; the indexing below relies on that orientation.
    cn_mat = confusion_matrix(y_pred, y_test['Class'])
    print(cn_mat)
    TP = cn_mat[1][1]
    TN = cn_mat[0][0]
    FN = cn_mat[0][1]
    FP = cn_mat[1][0]
    pre1 = TP / (TP + FP)       # precision
    recall1 = TP / (TP + FN)    # recall
    NPV1 = TN / (TN + FN)       # negative predictive value
Example #34
0
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
print(np.vstack({tuple(row) for row in X_resampled}).shape)
#(181, 2) 重复抽样

# NearMiss adds heuristic rules for choosing which samples to keep; the
# `version` parameter selects one of three heuristics.
# Below, "positive" samples are the ones to be under-sampled (majority class)
# and "negative" samples belong to the minority class.
#   NearMiss-1: keep the positive samples whose average distance to their
#               N nearest negative samples is smallest.
#   NearMiss-2: keep the positive samples whose average distance to their
#               N farthest negative samples is smallest.
#   NearMiss-3: two-stage algorithm. First, for every negative sample, its
#               M nearest neighbours are retained; then the positive samples
#               whose average distance to their N nearest neighbours is
#               largest are selected.
from imblearn.under_sampling import NearMiss

nm1 = NearMiss(random_state=0, version=1)
X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)

# Class counts after under-sampling, ordered by class label.
print(sorted(Counter(y_resampled).items()))

# Cleaning under-sampling techniques
# Tomek's links
# Two samples x and y from different classes form a Tomek link when there is
# no other sample z such that d(x, z) < d(x, y) or d(y, z) < d(x, y), where
# d(.) is the distance between two samples. In other words, x and y are each
# other's nearest neighbours.
# In that situation x or y is likely to be noise, or both samples lie close
# to the class boundary.
# The `ratio` parameter of TomekLinks controls which samples of a link are
# removed: the default ratio='auto' removes the majority-class sample, while
# ratio='all' removes both samples.
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(random_state=0, ratio='all')
X_resampled, y_resampled = tl.fit_sample(X, y)

# Class counts after cleaning, ordered by class label.
print(sorted(Counter(y_resampled).items()))
Example #35
0
# Standardise the feature columns and wrap the result back into a DataFrame
# that keeps the original column names.
# NOTE(review): df_f is read here but is only assigned further down in the
# visible code; presumably it is defined earlier in the original script, so
# confirm before running this top to bottom.
# NOTE(review): df_f_scaled is not used by the visible code that follows
# (the sampler is fed df_f), so confirm whether the scaled frame was intended.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_f.values)
df_f_scaled = pd.DataFrame(scaled_features, columns=df_f.columns)

# df_d: df without the ten listed V-columns.
df_d = df.drop(
    ['V3', 'V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18'],
    axis=1)
# df_f: df minus df_d. Presumably pandas iterates df_d as its column labels,
# which would leave only the ten V-columns listed above as features (TODO confirm).
df_f = df.drop(df_d, axis=1)
# Target column.
df_t = df['Class']

# Handle the class imbalance by under-sampling with NearMiss. The dict gives
# the requested number of samples per class label after resampling.
nm = NearMiss(sampling_strategy={0: 100000, 1: 492})
X_res, y_res = nm.fit_sample(df_f, df_t)

# Stratified 70/30 split of the resampled data so both parts keep the same
# class proportions; the seed makes the split reproducible.
df3_train, df3_test, df_t_train, df_t_test = train_test_split(
    X_res, y_res,
    test_size=.3,
    stratify=y_res,
    random_state=42,
)

# Train the gradient-boosted classifier on the training part.
clf1 = XGBClassifier(random_state=42)
clf1.fit(df3_train, df_t_train)

# Saving model to disk. The context manager guarantees the file is flushed
# and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf1, model_file)

# Loading model to compare the results.
# NOTE: pickle.load executes arbitrary code from the file; only load
# 'model.pkl' files that this script produced itself.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
print(model.predict(df3_test))
Example #36
0
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SVMSMOTE
#from imblearn.over_sampling import KMeansSMOTE


# Load the training table and drop its first column.
data_ = pd.read_csv(r'train.csv')
data1 = np.array(data_)
data = data1[:, 1:]
m1, n1 = data.shape

# Class labels are built from hard-coded group sizes: the first 4242 rows are
# labelled 1 and the following 71809 rows are labelled 0. Adjust the sizes to
# the data set at hand (e.g. int(m1 / 2)).
label1 = np.ones((4242, 1))
label2 = np.zeros((71809, 1))
label = np.concatenate((label1.ravel(), label2.ravel()))

# Standardise the features before under-sampling.
shu = scale(data)
X = shu
y = label  # .astype('int64')

# Under-sample with NearMiss-2; other variants are selected via `version`.
nm1 = NearMiss(version=2)
X_resampled, y_resampled = nm1.fit_sample(X, y)

# Re-standardise the retained samples.
shu = X_resampled
X1 = scale(shu)
y1 = y_resampled

# Write the resampled features and labels to separate CSV files.
data_csv = pd.DataFrame(data=X1)
data_csv.to_csv('NearMiss_train.csv')
data_csv = pd.DataFrame(data=y1)
data_csv.to_csv('label_NearMiss_train.csv')
Example #37
0

# Build a synthetic two-class problem with a 10% / 90% class split.
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.1, 0.9],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=5000,
    random_state=10,
)

# Instantiate a PCA object so the 20-feature data can be drawn in 2D, and
# project the original samples with it.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Under-sample with NearMiss-2, then project the retained samples with the
# PCA already fitted on the full data so both plots share the same axes.
nm2 = NearMiss(version=2)
X_resampled, y_resampled = nm2.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# One row, two panels: original data on the left, resampled data on the right.
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: one scatter layer per class of the original set.
for class_id, face in ((0, palette[0]), (1, palette[2])):
    in_class = y == class_id
    ax1.scatter(X_vis[in_class, 0], X_vis[in_class, 1],
                label="Class #{}".format(class_id), alpha=0.5,
                edgecolor=almost_black, facecolor=face, linewidth=0.15)
ax1.set_title('Original set')

# Right panel: class 0 of the resampled set.
kept_class0 = y_resampled == 0
ax2.scatter(X_res_vis[kept_class0, 0], X_res_vis[kept_class0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
Example #38
0
# Create an index column that will later be used to merge the 'All_peptides'
# dataset with the balanced dataset built below.
copy['ID_Sequence'] = range(len(copy))

# A holds the class label, B holds only the row identifier.
A = copy.loc[:, ['Label']]
B = copy.loc[:, ['ID_Sequence']]

# Encode the categorical variable 'Label' as a numeric one.
# Label: Toxic = 1
#        NonToxic = 0
A = pd.get_dummies(A, columns=["Label"], drop_first=True)

# Perform the balancing.
# NOTE(review): the only feature handed to NearMiss is the ID_Sequence column,
# so sample selection is driven by distances between row ids — confirm this
# is intended.
from imblearn.under_sampling import NearMiss
nr = NearMiss()
B_res, A_res = nr.fit_sample(B, A)

# Pair the kept ids with their encoded labels by position.
DBSCAN_subsampling = pd.merge(B_res, A_res, right_index=True, left_index=True)

# Bring back the remaining columns of the original rows via the id column.
DBSCAN_subsampling = pd.merge(DBSCAN_subsampling, copy, on='ID_Sequence')

# The balancing worked as expected: there are 703 peptides of each class.
# (The result of this expression is not stored or printed here.)
DBSCAN_subsampling.groupby('Label').size()

# Export the file so that it can be used by the other algorithms.
DBSCAN_subsampling.to_csv("BBDD/DBSCAN_subsampling.txt", index=None)

# SVM MODEL of the DBSCAN model.

# Dataset preparation for building the predictive models.