def Dados_Balanceados_SMOTE_NearMiss_Sem_Municipio_Orgao():
    # sampling_strategy = 0.1 ==> 10:1
    # sampling_strategy = 0.2 ==> 5:1
    # sampling_strategy = 1   ==> 1:1
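    # Note: with a float sampling_strategy, imbalanced-learn targets the
    # post-resampling ratio N_minority / N_majority, which is why 0.1 ==> 10:1.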
    feature_names = Load_Obj('feature_names_onehot_sem_municipio_orgao')
    X_data, y_data = load_svmlight_file(
        'treino_desbalanceado_onehot_sem_municipio_orgao.svm',
        n_features=len(feature_names))  # pylint: disable=unbalanced-tuple-unpacking

    sm = SMOTE(random_state=6439, sampling_strategy=0.1)
    X_res, y_res = sm.fit_resample(X_data, y_data)
    nm = NearMiss(sampling_strategy=0.1)
    X_res_new, y_res_new = nm.fit_resample(X_res, y_res)
    dump_svmlight_file(X_res_new, y_res_new,
                       'smote_nearmiss_10_1_onehot_sem_municipio_orgao.svm')

    sm = SMOTE(random_state=6439, sampling_strategy=0.175)
    X_res, y_res = sm.fit_resample(X_data, y_data)
    nm = NearMiss(sampling_strategy=0.175)

    X_res_new, y_res_new = nm.fit_resample(X_res, y_res)
    dump_svmlight_file(X_res_new, y_res_new,
                       'smote_nearmiss_5_1_onehot_sem_municipio_orgao.svm')

    sm = SMOTE(random_state=6439, sampling_strategy=0.5)
    X_res, y_res = sm.fit_resample(X_data, y_data)
    nm = NearMiss(sampling_strategy=0.5)
    X_res_new, y_res_new = nm.fit_resample(X_res, y_res)
    dump_svmlight_file(X_res_new, y_res_new,
                       'smote_nearmiss_1_1_onehot_sem_municipio_orgao.svm')
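A quick sanity check of one of the files written above (a minimal sketch; assumes scikit-learn and the .svm file produced by the function):

# Hypothetical check of the 10:1 output produced above.
from collections import Counter
from sklearn.datasets import load_svmlight_file

X_chk, y_chk = load_svmlight_file(
    'smote_nearmiss_10_1_onehot_sem_municipio_orgao.svm')
print(sorted(Counter(y_chk).items()))  # expect roughly a 10:1 class ratio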
def resampling(train_data, train_labels, resampling_type, resampling_strategy):
    train_data_new = np.reshape(train_data,
                                (train_data.shape[0], train_data.shape[1] *
                                 train_data.shape[2] * train_data.shape[3]))
    if resampling_type == 'SMOTE':
        train_data_resampled, train_labels_resampled = SMOTE(
            random_state=42).fit_resample(train_data_new, train_labels.values)

    elif resampling_type == 'over_sampling':
        over_sampler = RandomOverSampler(sampling_strategy=resampling_strategy)
        train_data_resampled, train_labels_resampled = over_sampler.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'under_sampling':
        under_sampler = RandomUnderSampler(
            sampling_strategy=resampling_strategy)
        train_data_resampled, train_labels_resampled = under_sampler.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'tomek_links':
        t1 = TomekLinks(sampling_strategy=resampling_strategy)
        train_data_resampled, train_labels_resampled = t1.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'near_miss_neighbors':
        undersample = NearMiss(version=1, n_neighbors=3)
        train_data_resampled, train_labels_resampled = undersample.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'one_sided_selection':
        undersample = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
        train_data_resampled, train_labels_resampled = undersample.fit_resample(
            train_data_new, train_labels.values)

    else:
        raise ValueError(f'Unknown resampling_type: {resampling_type}')

    return train_data_resampled, train_labels_resampled
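A minimal usage sketch (hypothetical shapes and labels; the reshape and .values calls above imply 4-D image-like arrays and a pandas Series):

# Hypothetical call: 4-D image-like features, imbalanced binary labels.
import numpy as np
import pandas as pd

train_data = np.random.rand(100, 8, 8, 1)      # (samples, h, w, channels)
train_labels = pd.Series([0] * 90 + [1] * 10)  # 90:10 imbalance
X_res, y_res = resampling(train_data, train_labels, 'SMOTE', None)
print(X_res.shape, np.bincount(y_res))         # SMOTE balances to 90:90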
Example no. 3
def near_miss2(X, Y):
    from imblearn.under_sampling import NearMiss
    nm1 = NearMiss(version=2)
    nm1.fit_resample(X, Y)
    indexes = nm1.sample_indices_  # indices of the samples NearMiss-2 kept
    nobj = len(Y)
    mask = np.zeros(nobj, dtype=int)
    mask[indexes] = 1  # 1 = kept, 0 = removed
    return True, mask
Example no. 4
    def Balancing(self, method=None):
        self.method = method
        if self.method is None or self.method == "nearmiss":
            balance = NearMiss()
            self.Xtrainb, self.Ytrainb = balance.fit_resample(
                self.Xtrain, self.Y_train)
        elif self.method == "smote":
            balance = SMOTE()
            self.Xtrainb, self.Ytrainb = balance.fit_resample(
                self.Xtrain, self.Y_train)
        return self.Xtrainb, self.Ytrainb
Example no. 5
def near_miss2(X, Y):
    from imblearn.under_sampling import NearMiss
    nm1 = NearMiss(version=2)
    nm1.fit_resample(X, Y)
    indexes = nm1.sample_indices_
    mask = []
    for i in range(len(X)):
        if i in indexes:
            mask.append(1)
        else:
            mask.append(0)
    return True, np.asarray(mask)
def base_model():
    mongo_connect = MongoHandler()
    like_tweets = mongo_connect.retrieve_from_collection("twitter_new")
    df = pd.DataFrame(list(like_tweets))

    # text = df['text']
    # df = df.drop(['user_name','user_location','hashtags','mentions','created_at'],axis=1)

    # Column / Feature selection
    base = df[[
        'user_followers', 'user_friends', 'user_favourites', 'user_months',
        'user_statuses', 'user_verified', 'retweets'
    ]]
    per_month = round((base['user_statuses'] + 1) / (base['user_months'] + 1),
                      2)
    per_month = pd.DataFrame(per_month)
    per_month.columns = ['tweet_per_month']
    base = pd.concat([base, per_month], axis=1)
    target = df['favorites']
    # base = base[['user_followers', 'retweets', 'user_favourites', 'user_statuses']]
    columns = base.columns.values.tolist()

    # Transform the regression problem into a multi-class classification. Classes: zero, low, medium, high
    for i in range(len(target)):
        if 0 < target[i] < 6:
            target[i] = 1
        elif 5 < target[i] < 11:
            target[i] = 2
        elif target[i] >= 11:
            target[i] = 3
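    # Equivalent vectorized binning (favorites == 0 stays class 0):
    #   target = pd.cut(df['favorites'], bins=[-1, 0, 5, 10, float('inf')],
    #                   labels=[0, 1, 2, 3]).astype(int)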

    # target.hist()
    # plt.show()
    nm1 = NearMiss(version=1)  # Under-sampling technique
    base, target = nm1.fit_resample(base, target)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        base, target, random_state=1, test_size=0.3)

    # testing different ML algorithms
    # model = tree.DecisionTreeClassifier(criterion="entropy", random_state=5)  # class_weight="balanced")
    # model = linear_model.LogisticRegression(solver="lbfgs", random_state=5)
    # model = naive_bayes.MultinomialNB()
    # model = tree.ExtraTreeClassifier(random_state=5)
    # model = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=10), n_estimators=1000, random_state=5)
    model = RandomForestClassifier(n_estimators=500, random_state=5)

    k_fold_cv(model, x_train, y_train, False)

    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    evaluation(y_test, y_predict)

    # feature importance / interpretability
    new_y_train = model.predict(x_train)
    tree_model = tree.DecisionTreeClassifier(
        criterion="entropy", random_state=5)  # class_weight="balanced")
    tree_model.fit(x_train, new_y_train)
    tree_feature_importance(tree_model, columns,
                            x_train)  # calls function for interpretable ML
Example no. 7
def limpieza(df):
    # required libraries
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from imblearn.under_sampling import NearMiss
    from imblearn.over_sampling import SMOTE
    # Drop the Id column
    df = df.drop("Unnamed: 0", axis=1)
    # Define the X features and the y target
    classification_X = df.drop(["misstate"], axis=1)
    classification_y = df["misstate"]
    # SMOTE over-sampling
    sm = SMOTE()
    X_sm, y_sm = sm.fit_resample(classification_X, classification_y)
    # NearMiss under-sampling
    nm = NearMiss(version=1)
    X_nm, y_nm = nm.fit_resample(classification_X, classification_y)
    # Train/test split for the SMOTE data
    X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(
        X_sm, y_sm, test_size=0.30, random_state=0)
    # Train/test split for the NearMiss data
    X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(
        X_nm, y_nm, test_size=0.30, random_state=0)

    return (X_train_nm, X_test_nm, y_train_nm, y_test_nm, X_train_sm,
            X_test_sm, y_train_sm, y_test_sm)
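Note that limpieza resamples the full dataset before splitting, so resampled points leak into the test sets. A leakage-free variant (a sketch reusing the same column names) resamples only the training fold:

# Sketch: split first, then resample the training fold only.
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(
    classification_X, classification_y, test_size=0.30, random_state=0)
X_train_sm, y_train_sm = SMOTE().fit_resample(X_train, y_train)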
Example no. 8
def test_nm_fit_resample_auto():
    sampling_strategy = 'auto'
    X_gt = [
        np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                  [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                  [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                  [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                  [0.45713638, 1.31069295]]),
        np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                  [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                  [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                  [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                  [0.45713638, 1.31069295]]),
        np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                  [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
                  [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
                  [1.15157493, -1.2981518], [-0.54619583, 1.73009918],
                  [0.99272351, -0.11631728]])
    ]
    y_gt = [
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    ]
    for version_idx, version in enumerate(VERSION_NEARMISS):
        nm = NearMiss(sampling_strategy=sampling_strategy, version=version)
        X_resampled, y_resampled = nm.fit_resample(X, Y)
        assert_array_equal(X_resampled, X_gt[version_idx])
        assert_array_equal(y_resampled, y_gt[version_idx])
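The X, Y fixtures and VERSION_NEARMISS (presumably the tuple (1, 2, 3) of supported versions) are module-level definitions from imblearn's NearMiss test file, not shown here.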
Example no. 9
def nearmiss(data, k):

    X = np.array(data['X'].tolist())
    y = np.array(data['y'].tolist())

    max_X = np.max(X[:, 0])
    min_X = np.min(X[:, 0])

    max_y = np.max(X[:, 1])
    min_y = np.min(X[:, 1])

    counter = Counter(y)

    print("Before undersampling:", counter)
    plot_scatter(X, y, counter, max_X, min_X, max_y, min_y)

    # define the undersampling method: NearMiss version 3, where k tunes n_neighbors_ver3
    undersample = NearMiss(version=3, n_neighbors_ver3=k)

    # perform NearMiss sampling
    X_sampled, y_sampled = undersample.fit_resample(X, y)
    X_list = X_sampled.tolist()
    sample_data = pd.DataFrame({'y': y_sampled, 'X': X_list})
    counter = Counter(y_sampled)
    print("After undersampling:", counter)

    plot_scatter(X_sampled, y_sampled, counter, max_X, min_X, max_y, min_y)

    return sample_data
def NearMiss_us(X_train,
                Y_train,
                seed,
                sampling_strategy,
                n_neighbors=3,
                n_neighbors_ver3=3,
                version=1):
    if not isinstance(sampling_strategy, str):
        sampling_strategy = compute_sampling_strategy(sampling_strategy,
                                                      Y_train, 'undersampling')
    # random_state and ratio dropped: NearMiss is deterministic, and 'ratio'
    # was replaced by sampling_strategy in imbalanced-learn
    nm = NearMiss(version=version,
                  n_neighbors=n_neighbors,
                  n_neighbors_ver3=n_neighbors_ver3,
                  n_jobs=-1,
                  sampling_strategy=sampling_strategy)
    print('Before NearMiss version ' + str(version) + ' undersampling : ',
          sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = nm.fit_resample(X_train, Y_train)
    print('After NearMiss version ' + str(version) + ' undersampling : ',
          sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)

    return X_train_resampled, Y_train_resampled
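Note: compute_sampling_strategy and shuffle_dataset are project-local helpers here, not imbalanced-learn functions.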
Example no. 12
def nearmiss1(X, y):
    undersample = NearMiss(version=1, n_neighbors=3)
    X, y = undersample.fit_resample(X, y)
    counter = Counter(y)
    print("NearMiss-1", counter)

    plot_dataset(X, y, counter)
Example no. 13
def under_sampling(X, Y, ss='not minority'):
    Y = Y.ravel()
    if isinstance(ss, dict):
        ss = ss.copy()
        # cap each requested count at the class's actual count
        for y, c in zip(*np.unique(Y, return_counts=True)):
            ss[y] = min(ss[y], c)
    sampler = NearMiss(sampling_strategy=ss)
    data, labels = sampler.fit_resample(X, Y)
    return data, labels
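A usage sketch for the dict form (hypothetical counts; assumes numpy and NearMiss are imported as in the function's source module):

# Hypothetical call: request 50 per class; class 1 only has 30, so the
# min() guard caps it at 30.
import numpy as np

X_demo = np.random.rand(130, 4)
Y_demo = np.array([0] * 100 + [1] * 30)
data, labels = under_sampling(X_demo, Y_demo, ss={0: 50, 1: 50})
print(np.bincount(labels))  # -> [50 30]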
    def classify(self, X, type: str, classifier: str, test_prop: float,
                 res: bool = False, res_method: str = None):

        if type == 'binary':
            y = self.df['class'].replace(0,1)
        elif type == 'multi':
            y = self.df['class']
        else:
            raise TypeError("Choose a proper type of classification")

        X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_prop, stratify=y)

        if res:
            if res_method == 'down':
                nm = NearMiss()
                X_res, Y_res = nm.fit_resample(X_train, Y_train)
            elif res_method == 'up':
                sm = ADASYN()
                X_res, Y_res = sm.fit_resample(X_train, Y_train)
            else:
                raise TypeError("Resampling method not provided. Please use 'up' for oversampling or 'down' for undersampling.")

        if classifier == 'lr':
            model = LogisticRegression(solver='liblinear', class_weight='balanced', C=0.04, penalty='l2')
        elif classifier == 'svc':
            model = LinearSVC(C=0.004, penalty='l2')
        elif classifier == 'rf':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = RandomForestClassifier(n_estimators=n_est, bootstrap=True, max_depth=5)
        elif classifier == 'xgb':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = XGBClassifier(n_estimators=n_est, max_depth=5, reg_lambda=0.4)
        elif classifier == 'ada':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = AdaBoostClassifier(n_estimators=n_est, learning_rate=0.005)
        else:
            raise TypeError("Choose a proper classifier. Possible inputs: 'lr', 'svc', 'rf', 'xgb', 'ada' .")

        if res:
            model.fit(X_res, Y_res)
        else:
            model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        # Accuracy Percentage
        print(f"Accuracy is {round(accuracy_score(Y_test, Y_pred), 2)*100}%")

        # Classification Report
        print(classification_report(Y_pred, Y_test))

        # Matthew's Correlation Coefficient
        print(f"Matthew's Correlation Coefficient is {matthews_corrcoef(Y_test, Y_pred)}")

        # Plots of Confusion Matrix and ROC Curve
        plot_confusion_matrix(Y_test, Y_pred, figsize=(10,10)) 

        return model
Example no. 15
def test_nm_wrong_nn_obj():
    sampling_strategy = 'auto'
    nn = 'rnd'
    nm = NearMiss(sampling_strategy=sampling_strategy,
                  version=VERSION_NEARMISS,
                  return_indices=True,
                  n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        nm.fit_resample(X, Y)
    nn3 = 'rnd'
    nn = NearestNeighbors(n_neighbors=3)
    nm3 = NearMiss(sampling_strategy=sampling_strategy,
                   version=3,
                   return_indices=True,
                   n_neighbors=nn,
                   n_neighbors_ver3=nn3)
    with raises(ValueError, match="has to be one of"):
        nm3.fit_resample(X, Y)
Example no. 17
def undersample_NearMiss(df, variant=2, debug=True):
    X = df.values[:, :-1]
    y = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(y))
    nm = NearMiss(version=variant)  # random_state dropped: NearMiss is deterministic
    X_res, y_res = nm.fit_resample(X, y)
    df_resampled = pd.DataFrame(X_res, columns=df.columns[:-1])
    df_resampled.insert(len(df_resampled.columns), df.columns[-1], y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
def under_sampling_shift(x, y, delta=0.5):
    labels = np.unique(y)
    y_counts = Counter(np.squeeze(y))
    sampling_strategy = dict()
    for label in labels:
        sampling_strategy[label] = int(delta * y_counts[label])

    # NearMiss version 3 better respects the initial data structure, but it
    # gives less control over the final number of samples (because of its
    # initial resampling phase), so version 2 is used as the default here
    nm1 = NearMiss(version=2, sampling_strategy=sampling_strategy)
    x_resampled, y_resampled = nm1.fit_resample(x, y)
    return x_resampled, y_resampled
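A quick check of the delta semantics (hypothetical counts; every class is cut to int(delta * count); assumes numpy, Counter and NearMiss are imported as in the function's source module):

# Hypothetical call: delta=0.5 halves every class.
import numpy as np

X_demo = np.random.rand(150, 2)
y_demo = np.array([0] * 100 + [1] * 50)
x_r, y_r = under_sampling_shift(X_demo, y_demo, delta=0.5)
print(np.bincount(y_r))  # -> [50 25]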
Example no. 19
def near_miss(X,
              y,
              visualize=False,
              pca2d=True,
              pca3d=True,
              tsne=True,
              pie_evr=True):
    nm = NearMiss()
    X_res, y_res = nm.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def Dados_Balanceados_NearMiss_Sem_Municipio_Orgao():
    # sampling_strategy = 0.1 ==> 10:1
    # sampling_strategy = 0.2 ==> 5:1
    # sampling_strategy = 1   ==> 1:1
    feature_names = Load_Obj('feature_names_onehot_sem_municipio_orgao')
    X_data, y_data = load_svmlight_file(
        'treino_desbalanceado_onehot_sem_municipio_orgao.svm',
        n_features=len(feature_names))  # pylint: disable=unbalanced-tuple-unpacking

    nm = NearMiss(sampling_strategy=0.1)
    X_res, y_res = nm.fit_resample(X_data, y_data)
    dump_svmlight_file(X_res, y_res,
                       'nearmiss_10_1_onehot_sem_municipio_orgao.svm')

    nm = NearMiss(sampling_strategy=0.2)
    X_res, y_res = nm.fit_resample(X_data, y_data)
    dump_svmlight_file(X_res, y_res,
                       'nearmiss_5_1_onehot_sem_municipio_orgao.svm')

    nm = NearMiss(sampling_strategy=1)
    X_res, y_res = nm.fit_resample(X_data, y_data)
    dump_svmlight_file(X_res, y_res,
                       'nearmiss_1_1_onehot_sem_municipio_orgao.svm')
Example no. 21
def undersampling(X, y):
    """Balancing data using NearMiss

    Args:
        X: Training set without Class Target
        y:Training set Class Target

    Returns:
        balanced train_x, test_x
    """
    sample = NearMiss(version=1)
    X, y = sample.fit_resample(X, y)
    print('after balancing:', X.shape)
    return X, y
Example no. 22
    def __resample_data_NearMiss(self):
        '''
        Resampling imbalanced data with near miss algorithm. (Undersampling)

        :param: None
        :return: None
        '''
        name_train = self.__attributes_train.columns

        print("resampling data...")
        nm = NearMiss(random_state=6)
        X_train_res, y_train_res = nm.fit_resample(self.__attributes_train,
                                                   self.__labels_train)
        self.__attributes_train, self.__labels_train = pd.DataFrame(
            X_train_res, columns=name_train), pd.DataFrame(y_train_res)
        print("[respamling finished]")
Example no. 23
def clf_corregido(x_orig, y_orig):
    '''Corrects the class imbalance in the data'''

    nearmiss = NearMiss(sampling_strategy=0.20, n_neighbors=3, version=2)
    x_us, y_us = nearmiss.fit_resample(x_orig, y_orig)
    clf_us = LogisticRegression(random_state=0,
                                class_weight='balanced').fit(x_us, y_us)
    y_clf_us = clf_us.predict(x_orig)

    print('''
--------------------------------------------------------------------------------

''')
    print(classification_report(y_orig, y_clf_us))

    _, recall_corregido, _, _ = score(y_orig, y_clf_us)

    recall_corregido_prom = (recall_corregido[0] + recall_corregido[1]) / 2

    DICC_['clf_corregido'] = recall_corregido_prom

    return clf_us
Example no. 24
def apply_nearMiss(X, y, near_miss_type=3):

	'''
	Using the NearMiss algorithm, undersamples the majority class to match the
	sample size of the minority class

	Args:
	X (np.array): features associated with examples of majority and minority classes;
	              shape = number of examples x number of features
	y (np.array): labels of each example;
	              shape = number of examples x 1
	near_miss_type (int): version of the NearMiss algorithm (1, 2 or 3)

	Returns:
	X_samp (np.array): features associated with undersampled majority examples and
	                   all minority class examples;
	                   shape = 2*number of minority examples x number of features
	y_samp (np.array): labels of each example;
	                   shape = 2*number of minority examples x 1

	'''
	undersample = NearMiss(version=near_miss_type, n_neighbors=3)
	X_samp, y_samp = undersample.fit_resample(X, y)
	return X_samp, y_samp
Example no. 25

model = run_model_balanced(X_train, X_test, y_train, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Penalizacion')

# Strategy: undersample the majority class
# We use an algorithm similar to k-nearest neighbors to choose which
# majority samples to remove. Note the drastic reduction: the 199,020
# class-zero samples (the majority) become 688, and the model is trained
# on those.

us = NearMiss(sampling_strategy=0.5, n_neighbors=3, version=2)

X_train_res, y_train_res = us.fit_resample(X_train, y_train)
print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')

model = run_model(X_train_res, X_test, y_train_res, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Undersampling')

# Strategy: oversample the minority class
# Here we create new "synthetic" samples of the minority class using
# RandomOverSampler, going from 344 fraud samples to 99,510.
ros = RandomOverSampler(sampling_strategy=0.5)  # `ros`, not `os`, to avoid shadowing the stdlib module
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')
Example no. 26
def down_sampling(x_train, y_train):
    print("Down Sampling My friend.....")
    from imblearn.under_sampling import NearMiss
    nm1 = NearMiss(version=1)
    x_train, y_train = nm1.fit_resample(x_train, y_train)
    return x_train, y_train
Example no. 27
def underSampling(X, Y):
    nm1 = NearMiss(version=1)
    X_resampled, y_resampled = nm1.fit_resample(X, Y)
    return X_resampled, y_resampled
Example no. 28
def test_deprecation_random_state():
    nm = NearMiss(random_state=0)
    with warns(
            DeprecationWarning, match="'random_state' is deprecated from 0.4"):
        nm.fit_resample(X, Y)
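Background: NearMiss selection is deterministic, so random_state never had an effect; imbalanced-learn deprecated the parameter in 0.4 (the warning this test asserts) and removed it in later releases.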
Example no. 29
# Undersample imbalanced dataset with NearMiss-2
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NearMiss
from matplotlib import pyplot
from numpy import where
# define dataset
X, y = make_classification(n_samples=10000,
                           n_features=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           weights=[0.99],
                           flip_y=0,
                           random_state=1)
# summarize class distribution
counter = Counter(y)
print(counter)
# define the undersampling method
undersample = NearMiss(version=2, n_neighbors=3)
# transform the dataset
X, y = undersample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)
# scatter plot of examples by class label
for label, _ in counter.items():
    row_ix = where(y == label)[0]
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()
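For reference, the three variants score majority samples differently: NearMiss-1 keeps those with the smallest average distance to their closest minority samples, NearMiss-2 keeps those with the smallest average distance to the farthest minority samples, and NearMiss-3 first retains only the nearest majority neighbors of each minority sample.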
Example no. 30
def test_nearmiss_wrong_version():
    version = 1000
    nm = NearMiss(version=version)
    with raises(ValueError, match="must be 1, 2 or 3"):
        nm.fit_resample(X, Y)
Example no. 31
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

from imblearn.over_sampling import ADASYN

ada=ADASYN(random_state=0, n_neighbors=5)

X_resampled, y_resampled = ada.fit_resample(X,y) 

np.bincount(y_resampled)

plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

from imblearn.under_sampling import NearMiss
nm = NearMiss(version=1)
X_resampled, y_resampled = nm.fit_resample(X,y)

np.bincount(y_resampled)

plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

# indices of the samples NearMiss kept, available after fit_resample
ind = nm.sample_indices_
deleted_ind = np.setdiff1d(np.arange(len(X)), ind)

# removed samples as faint crosses, kept samples on top
plt.scatter(X[deleted_ind,0], X[deleted_ind,1], c=y[deleted_ind], marker='x', alpha=0.2)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

from imblearn.under_sampling import OneSidedSelection

oss=OneSidedSelection(random_state=0, n_neighbors=1, n_seeds_S=1)

X_resampled, y_resampled = oss.fit_resample(X,y)
Example no. 32
    'gender', 'label_emotion', 'polarity', 'label_polarity', 'path', 'source'
],
                        axis=1).to_numpy().squeeze()

print('y_full after drop: ', y_full[:5])
print('y_full shape after drop: ', y_full.shape)

# reshape to undersample
X_full_reshape = X_full.reshape((X_full.shape[0], -1))
print('X_full_reshape shape: ', X_full_reshape.shape)

# create undersampler
undersample = NearMiss(sampling_strategy="not minority")

print('Starting undersampling...')
X_under, y_under = undersample.fit_resample(X_full_reshape, y_full)

y_counter = Counter(y_under)
print('Information after undersampling')
print('y_under count: ', y_counter)

print('Under set shapes:')
print('X_under shape: ', X_under.shape)
print('y_under shape: ', y_under.shape)

X_train_reshape, X_test_reshape, y_train, y_test = train_test_split(
    X_under, y_under, test_size=0.1, shuffle=True, random_state=42)

print('Shapes after train_test_splits')
print('X_train_reshape: ', X_train_reshape.shape)
print('y_train: ', y_train.shape)