コード例 #1
0
def test_borderline_smote(kind, data):
    """BorderlineSMOTE must resample identically whether the neighbour
    searches are specified as defaults or as explicit NearestNeighbors
    estimators with the equivalent neighbour counts."""
    sampler_default = BorderlineSMOTE(kind=kind, random_state=42)
    sampler_explicit = BorderlineSMOTE(
        kind=kind,
        random_state=42,
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
    )

    X_default, y_default = sampler_default.fit_resample(*data)
    X_explicit, y_explicit = sampler_explicit.fit_resample(*data)

    assert_allclose(X_default, X_explicit)
    assert_array_equal(y_default, y_explicit)
コード例 #2
0
ファイル: test_smote.py プロジェクト: yueyuep/TCNN
def test_borderline_smote(kind):
    """Integer neighbour counts and equivalent NearestNeighbors objects
    must yield the same Borderline-SMOTE output on the module fixtures."""
    sampler_default = BorderlineSMOTE(kind=kind, random_state=42)
    sampler_explicit = BorderlineSMOTE(
        kind=kind,
        random_state=42,
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
    )

    X_default, y_default = sampler_default.fit_resample(X, Y)
    X_explicit, y_explicit = sampler_explicit.fit_resample(X, Y)

    assert_allclose(X_default, X_explicit)
    assert_array_equal(y_default, y_explicit)
コード例 #3
0
 def train(self, gridsearch=False):
     """Fit the model on a Borderline-SMOTE resampled training set.

     The feature pipeline is fitted and applied first, then the minority
     class is oversampled.  With ``gridsearch=True`` hyper-parameters are
     tuned via RandomizedSearchCV and the best estimator is kept as
     ``self.model``; otherwise the default estimator is fitted directly.
     Training time (seconds) is logged to MLflow in both paths.
     """
     start = time.time()
     self.set_pipeline()
     features = self.pipeline_feature.fit_transform(self.X_train)
     # Oversample only the minority class before fitting.
     sampler = BorderlineSMOTE(random_state=2,
                               sampling_strategy='minority',
                               k_neighbors=1,
                               m_neighbors=20)
     self.X_train_smote, self.y_train_smote = sampler.fit_resample(
         features, self.y_train)
     if not gridsearch:
         self.model = self.get_estimator()
         self.model.fit(self.X_train_smote, self.y_train_smote)
         self.mlflow_log_metric("train_time", int(time.time() - start))
         return
     search = RandomizedSearchCV(
         estimator=self.get_estimator(),
         param_distributions=self.model_params,
         n_iter=10,
         cv=2,
         verbose=5,
         random_state=42,
         n_jobs=None,
     )
     search.fit(self.X_train_smote, self.y_train_smote)
     self.mlflow_log_metric("train_time", int(time.time() - start))
     print(colored(f'best score: {search.best_score_}', "blue"))
     print(colored(f'best params: {search.best_params_}', "blue"))
     self.model = search.best_estimator_
コード例 #4
0
def create_metric(soft, metric, release, fold=3, boderlinesmote=False):
    """Build per-release training CSVs for ordinal-regression experiments.

    For each release, AutoSpearman-selected metric data is loaded and, for
    each of the *fold* folds, a shuffled copy is written to disk.  When
    *boderlinesmote* is true the data is balanced with Borderline-SMOTE
    before writing.

    :param soft: project/software name used in the file paths
    :param metric: metric-family subdirectory name
    :param release: number of releases to process
    :param fold: number of fold files written per release
    :param boderlinesmote: apply Borderline-SMOTE before each write
    :return: list of AutoSpearman metric selections, one per release
    """
    # Renamed from `all`, which shadowed the builtin of the same name.
    selected_metrics = []
    for i in range(release):
        path = (f'F:\\orca-master\\exampledata\\mData\\ordinalRegressionData'
                f'\\Three severity\\{metric}\\{soft}\\'
                f'{i + 1}_code&network_metrics&bugs.csv')
        auto_spearman_metric, auto_spearman_metric_data = getAutoSpearmanMetric(
            path)
        selected_metrics.append(auto_spearman_metric)
        for k in range(fold):
            if boderlinesmote:
                # Balance the classes with Borderline-SMOTE.
                # NOTE(review): the frame is re-resampled on every fold
                # iteration, so fold k > 0 resamples already-resampled
                # data -- confirm this is intended.
                auto_spearman_metric_data = auto_spearman_metric_data.dropna(
                    axis=1)
                x = auto_spearman_metric_data.iloc[:, 0:-1]
                y = auto_spearman_metric_data.iloc[:, -1:]

                bord_smote = BorderlineSMOTE(random_state=16,
                                             kind="borderline-1")
                x_res, y_res = bord_smote.fit_resample(x, y)
                auto_spearman_metric_data = pd.merge(x_res,
                                                     y_res,
                                                     how='left',
                                                     left_index=True,
                                                     right_index=True)
            save_path = (f'F:\\orca-master\\exampledata\\{metric}\\{soft}\\'
                         f'{fold}-fold\\{soft}{i + 1}\\matlab\\'
                         f'train_{soft}{i + 1}.{k}')
            # Shuffle rows so each fold file has a different ordering.
            tmp = shuffle(auto_spearman_metric_data)
            tmp.to_csv(save_path, header=None, index=False, sep=" ")
    return selected_metrics
コード例 #5
0
def over_under_sampling(x, y):
    """Balance the dataset by oversampling with Borderline-SMOTE.

    *y* arrives one-hot encoded; it is collapsed to class labels for
    resampling and re-encoded with get_dummies before returning.
    """
    print('Generating synthetic samples...')
    sampler = BorderlineSMOTE()
    x, labels = sampler.fit_resample(x, y.idxmax(axis=1))
    return x, pd.get_dummies(labels)
コード例 #6
0
def bordersmote(x, y):
    """Resample (x, y) to a 1:1 class ratio with Borderline-SMOTE.

    Both neighbour counts are set to 1% of the positive-label count,
    rounded up, so they scale with the minority class size.
    """
    neighbor_count = math.ceil(sum(y) * 0.01)

    sampler = BorderlineSMOTE(sampling_strategy=1,
                              k_neighbors=neighbor_count,
                              m_neighbors=neighbor_count)

    return sampler.fit_resample(x, y)
コード例 #7
0
def up_sampling(X_train, y_train, ratio=2):
    """Oversample the positive class (label 1) with Borderline-SMOTE.

    The positive class is grown to ``int(pos_num * ratio)`` samples.
    The inputs are returned unchanged when there are no positives.

    :param X_train: feature DataFrame (NaNs are filled with 0 before SMOTE)
    :param y_train: binary labels (1 = positive)
    :param ratio: multiplier applied to the positive-class count
    :return: (X_resampled, y_resampled)
    """
    pos_num = (y_train == 1).sum()
    if pos_num == 0:
        # Nothing to oversample.
        return X_train, y_train
    pos_sap_num = int(pos_num * ratio)
    # Fill NaNs on a copy: the original used `inplace=True`, which
    # silently mutated the caller's DataFrame.
    X_train = X_train.fillna(0)
    smo = BorderlineSMOTE(sampling_strategy={1: pos_sap_num},
                          random_state=2019,
                          n_jobs=8)
    X_train, y_train = smo.fit_resample(X_train, y_train)

    return X_train, y_train
コード例 #8
0
def borderline_smote(X,
                     y,
                     visualize=False,
                     pca2d=True,
                     pca3d=True,
                     tsne=True,
                     pie_evr=True):
    """Oversample (X, y) with Borderline-SMOTE and optionally visualize.

    :param X: features
    :param y: labels
    :param visualize: when truthy, plot a class histogram and PCA views
    :param pca2d: forwarded to pca_general as ``d2``
    :param pca3d: forwarded to pca_general as ``d3``
    :param tsne: accepted for signature compatibility; not used here
    :param pie_evr: forwarded to pca_general
    :return: resampled (X_res, y_res)
    """
    sm = BorderlineSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    if visualize:  # idiomatic truthiness test instead of `== True`
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
コード例 #9
0
def smote_tomek(x_train, y_train):
    """Oversample to a 0.5 minority ratio with borderline-1 SMOTE, then
    remove majority-class Tomek links from the resampled data."""
    over = BorderlineSMOTE(sampling_strategy=0.5,
                           random_state=0,
                           k_neighbors=5,
                           m_neighbors=10,
                           n_jobs=-1,
                           kind='borderline-1')
    x_over, y_over = over.fit_resample(x_train, y_train)

    cleaner = TomekLinks(sampling_strategy='majority', n_jobs=-1)
    return cleaner.fit_resample(x_over, y_over)
コード例 #10
0
ファイル: ml-helpper.py プロジェクト: HaritzSaiz/ml-helpper
def oversample_borderline_SMOTE(df, variant=1, debug=True):
    """Oversample a DataFrame whose last column is the class label.

    :param df: DataFrame; final column is cast to int and used as label
    :param variant: 1 selects "borderline-1", anything else "borderline-2"
    :param debug: print class counts before and after resampling
    :return: resampled DataFrame with the same column layout as *df*
    """
    features = df.values[:, :-1]
    labels = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(labels))
    kind = "borderline-1" if variant == 1 else "borderline-2"
    sampler = BorderlineSMOTE(random_state=0, kind=kind)

    X_res, y_res = sampler.fit_resample(features, labels)
    resampled = pd.DataFrame(X_res, columns=df.columns[:-1])
    resampled.insert(len(resampled.columns), df.columns[-1], y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return resampled
コード例 #11
0
def Borderline_DBSCAN(train_data, label, eps=20.1, min_samples=5):
    """Generate borderline synthetic minority samples, then filter them
    through a customised DBSCAN neighbourhood analysis.

    NOTE(review): this relies on non-standard APIs -- `BorderlineSMOTE.sample`
    and `DBSCAN.danger_fit` do not exist in stock imbalanced-learn /
    scikit-learn, so locally patched versions are assumed.  Confirm before
    reuse.

    :param train_data: DataFrame whose last column ('label') is the class
    :param label: minority-class label value ('c' or 'b')
    :param eps: DBSCAN neighbourhood radius
    :param min_samples: DBSCAN core-point threshold
    :return: (min_sample, mergeSample, new_sample_index, num_sample)
    """
    # Map the label value to the index used to look up generated samples.
    label_index = 0
    if label == 'c':
        label_index = 1
    if label == 'b':
        label_index = 0

    print(train_data['label'].value_counts())
    boSMOTE = BorderlineSMOTE(kind='borderline-1')
    # NOTE(review): x and y are never used below; fit_resample is called
    # only for its side effect of populating `boSMOTE.sample`.
    x, y = boSMOTE.fit_resample(train_data.iloc[:, :-1], train_data.iloc[:, -1])

    # print(boSMOTE.sample)
    # `sample[label_index][1]` is assumed to hold the synthetic points
    # generated for the requested class -- TODO confirm.
    BMG_sample = boSMOTE.sample[label_index][1]
    BMG_sample = pd.DataFrame(BMG_sample, columns=train_data.columns.values.tolist()[:-1])
    BMG_sample['label'] = label


    # Split the original rows into minority (== label) and majority sets.
    max_sample = []
    min_sample = []
    # print(train_data.shape[0])
    for temp in range(train_data.shape[0]):
        if train_data.iloc[temp, -1] == label:
            min_sample.append(train_data.iloc[temp, :].values)
        else:
            max_sample.append(train_data.iloc[temp, :].values)

    max_sample = pd.DataFrame(max_sample, columns=train_data.columns.values.tolist())
    min_sample = pd.DataFrame(min_sample, columns=train_data.columns.values.tolist())
    # Majority rows plus the synthetic minority rows feed the DBSCAN pass.
    mergeSample = pd.concat([max_sample, BMG_sample], ignore_index=False)
    # print(min_sample.shape[0])
    # print(max_sample.shape[0])
    # print("**9**")
    # print(mergeSample.shape[0])
    # Custom fit that records neighbourhoods of the synthetic ("danger") rows.
    dbsc = DBSCAN(eps=eps, min_samples=min_samples).danger_fit(X=mergeSample, danger_sample=BMG_sample)
    array_neighborhoods = dbsc.neighborhoods
    neighborhoods_index = []
    array_n_neighbors = dbsc.n_neighbors
    # Collect the members of every neighbourhood with at least 5 neighbours,
    # then deduplicate to get the indices of samples to keep.
    for temp in range(len(array_n_neighbors)):
        if array_n_neighbors[temp] >= 5:
            for i in range(array_n_neighbors[temp]):
                neighborhoods_index.append(array_neighborhoods[temp][i])
    new_sample_index = list(set(neighborhoods_index))
    num_sample = BMG_sample.shape[0]
    # print(array_neighborhoods)
    # print(len(new_sample_index))
    # print(train_data.shape[0])
    return min_sample, mergeSample, new_sample_index, num_sample
コード例 #12
0
def imbalanced_sampler(input_data, input_labels, method='SMOTE'):
    """Resample an imbalanced dataset and dump per-class samples to pickles.

    :param input_data: feature matrix
    :param input_labels: integer class labels (class 0, if present, is
        deliberately not dumped -- only classes 1..max are written)
    :param method: 'SMOTE' (Borderline-SMOTE) or 'Near Miss'
    :return: (x_sampled, y_sampled)
    """
    if method == 'SMOTE':
        sampler = BorderlineSMOTE(n_jobs=4, random_state=RANDOM_STATE)
    elif method == 'Near Miss':
        # NOTE(review): recent imbalanced-learn versions removed
        # `random_state` from NearMiss -- confirm the pinned version.
        sampler = NearMiss(n_jobs=4, random_state=RANDOM_STATE)
    else:
        print('Invalid sampler type. Only `SMOTE` (Borderline) and `Near Miss` are supported...')
        sys.exit(0)
    # TODO save samples by class to reduce file size
    max_class_num = np.max(input_labels)
    # Include the largest label: the original arange(1, max_class_num)
    # silently skipped the last class.
    class_range = np.arange(1, max_class_num + 1)
    x_sampled, y_sampled = sampler.fit_resample(input_data, input_labels)
    for i in class_range:
        idx = np.argwhere(y_sampled == i)
        # `with` blocks close the files; the original leaked the handles
        # returned by open() inside pickle.dump().
        with open(method + '_Class_' + str(i) + '_data_samples.pkl', 'wb') as f:
            pickle.dump(x_sampled[idx][:], f)
        with open(method + '_Class_' + str(i) + '_label_samples.pkl', 'wb') as f:
            pickle.dump(y_sampled[idx], f)
    return x_sampled, y_sampled
def Borderline_SMOTE_os(X_train,
                        Y_train,
                        seed,
                        sampling_strategy,
                        k_neighbors=5):
    """Oversample the training set with Borderline-SMOTE, then shuffle.

    Non-string strategies are first converted through
    compute_sampling_strategy.  Class distributions are printed before and
    after resampling.

    :return: shuffled (X_resampled, Y_resampled)
    """
    if not isinstance(sampling_strategy, str):
        sampling_strategy = compute_sampling_strategy(sampling_strategy,
                                                      Y_train, 'oversampling')
    sampler = BorderlineSMOTE(random_state=seed,
                              n_jobs=-1,
                              k_neighbors=k_neighbors,
                              sampling_strategy=sampling_strategy)
    print('Before Borderline SMOTE oversampling : ',
          sorted(Counter(Y_train).items()))
    X_resampled, Y_resampled = sampler.fit_resample(X_train, Y_train)
    print('After Borderline SMOTE oversampling : ',
          sorted(Counter(Y_resampled).items()))

    return shuffle_dataset(X_resampled, Y_resampled, seed)
コード例 #14
0
    def k_folds():
        """Yield (fold_train, fold_val, pos_weight) for each CV split.

        Each fold is a TensorDataset of (data, exist-mask, one-hot label);
        pos_weight is (#samples / #positives) - 1 for a weighted loss.
        After oversampling, synthetic rows get an all-ones exist mask.
        """
        for train_ind, val_ind in kf_gen:
            fold_train = (train_data[train_ind], train_exist[train_ind],
                          train_label[train_ind])
            fold_val = (train_data[val_ind], train_exist[val_ind],
                        train_label[val_ind])
            fold_val = TensorDataset(*from_numpy(*fold_val))

            if oversampling in ['borderline_smote', 'svm_smote', 'smotenc']:
                from imblearn.over_sampling import SMOTENC, BorderlineSMOTE, SVMSMOTE
                if oversampling == 'borderline_smote':
                    smote = BorderlineSMOTE(random_state=random_state)
                elif oversampling == 'svm_smote':
                    smote = SVMSMOTE(random_state=random_state)
                elif oversampling == 'smotenc':
                    categorical_features = [
                        2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                        18, 19, 20, 24, 25, 26, 31, 47, 49, 50, 51, 52, 53, 54,
                        56, 57, 58
                    ] + [1, 48]
                    smote = SMOTENC(
                        categorical_features=categorical_features,
                        random_state=random_state,
                        sampling_strategy='auto',
                        # Fixed: the SMOTENC parameter is `k_neighbors`;
                        # `neighbors=5` raised a TypeError.
                        k_neighbors=5,
                    )
                # Resample on class indices (one-hot label -> argmax).
                X, y = smote.fit_resample(fold_train[0],
                                          fold_train[2].argmax(axis=1))
                # Back to one-hot; synthetic rows get a full exist mask.
                y = np.array([[1, 0], [0, 1]])[y]
                exist = np.ones_like(X)

                fold_train = (X, exist, y)

            pos_weight = fold_train[2].shape[0] / np.sum(fold_train[2][:,
                                                                       1]) - 1

            fold_train = TensorDataset(*from_numpy(*fold_train))

            yield fold_train, fold_val, pos_weight
コード例 #15
0
def resample_to_csv(X, y, random_state, path, method):
    """Re-samples dataset using desired method of oversampling and writes output to CSV.

    :param X: Original Features
    :param y: Original Labels
    :param random_state: Random intialization
    :param path: Path to output location and name of CSV
    :param method: One of 'SMOTE-NN', 'borderline' (BorderlineSMOTE),
    'adasyn' or 'tomek'.  Any other value is a silent no-op.
    See imbalanced-learn documentation for more information.
    https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.BorderlineSMOTE.html

    :return: none
    """

    # Dispatch table replaces the original if/elif chain; unknown methods
    # fall through and write nothing, as before.
    sampler_classes = {
        'SMOTE-NN': SMOTEENN,
        'borderline': BorderlineSMOTE,
        'adasyn': ADASYN,
        'tomek': SMOTETomek,
    }
    sampler_cls = sampler_classes.get(method)
    if sampler_cls is None:
        return
    sampler = sampler_cls(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    X_resampled['BK'] = y_resampled
    X_resampled.to_csv(path)
コード例 #16
0
    print('valor c ideal SVM SMOTE', best_params_smote['C'], 'valor gamma ideal SVM SMOTE', best_params_smote['gamma'])

    border_sm = BorderlineSMOTE(k_neighbors=27, random_state=91, sampling_strategy=1)

    sm = SVMSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1, svm_estimator=SVM_smote)

    ada = ADASYN(random_state=91, n_neighbors=27, sampling_strategy=1, n_jobs=6)

    Kmeans = KMeansSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1, n_jobs=6,
                         kmeans_estimator=MiniBatchKMeans(n_clusters=20))

    '''Muestreo Sintetico'''

    #Xtrain, ytrain = SMOTE().fit_resample(Xtrain, ytrain)
    Xtrain, ytrain = border_sm.fit_resample(Xtrain, ytrain)

    '''Selección de caracteristicas'''

    # rel_MI = SelectKBest(score_func=score_func, k=num_features)
    # Xtrain = rel_MI.fit_transform(Xtrain, ytrain)
    # Xtest = rel_MI.transform(Xtest)
    # rel_MI_support = rel_MI.get_support()
    # rel_MI_feature = X_frame.loc[:, rel_MI_support].columns.tolist()
    # rel_MI_scores = rel_MI.scores_[rel_MI_support].tolist()
    # feature_selection_df = pd.DataFrame({'Feature': rel_MI_feature, 'Score':rel_MI_scores})

    Xtrain = Xtrain[:, [71, 83, 88, 70, 89, 56, 86, 53, 58, 59, 29, 28, 69, 41, 74, 23, 87]]
    Xtest = Xtest[:, [71, 83, 88, 70, 89, 56, 86, 53, 58, 59, 29, 28, 69, 41, 74, 23, 87]]

    '''  
コード例 #17
0
    def do_oversampling_and_plot(self):
        """Compare three oversamplers visually.

        Resamples (self.X, self.y) with SMOTE, ADASYN and Borderline-SMOTE
        and draws a 2x2 grid of 3D scatter plots: original data plus one
        panel per oversampler.

        NOTE(review): assumes exactly 7 feature columns, binary labels in
        {0, 1}, and that columns 1, 2 and 4 hold age, elapsed time and
        wart type (per the axis labels) -- confirm against the loader.
        """

        X = self.X
        y = self.y

        # Same seed for all three samplers so the panels are comparable.
        sm = SMOTE(random_state=111)
        ad = ADASYN(random_state=111)
        bs = BorderlineSMOTE(random_state=111)
        X_new_sm, y_new_sm = sm.fit_resample(X, y)
        X_new_ad, y_new_ad = ad.fit_resample(X, y)
        X_new_bs, y_new_bs = bs.fit_resample(X, y)

        # before oversampling: append the label as column 7
        data_old = np.concatenate((X, y.reshape(-1, 1)), axis=1)

        data_1_old = data_old[data_old[:, 7] == 1]  #data with class '1'
        data_0_old = data_old[data_old[:, 7] == 0]  #data with class '0'

        #after oversampling (SMOTE)
        a = X_new_sm[:, 0:7]
        b = y_new_sm.reshape(-1, 1)  #class/target column
        data = np.concatenate((a, b), axis=1)

        data_1 = data[data[:, 7] == 1]  #data with class '1'
        data_0 = data[data[:, 7] == 0]  #data with class '0'

        #after oversampling (ADASYN)
        a_ad = X_new_ad[:, 0:7]
        b_ad = y_new_ad.reshape(-1, 1)  #class/target column
        data_ad = np.concatenate((a_ad, b_ad), axis=1)

        data_1_ad = data_ad[data_ad[:, 7] == 1]  #data with class '1'
        data_0_ad = data_ad[data_ad[:, 7] == 0]  #data with class '0'

        #after oversampling (Borderline_SMOTE)
        a_bs = X_new_bs[:, 0:7]
        b_bs = y_new_bs.reshape(-1, 1)  #class/target column
        data_bs = np.concatenate((a_bs, b_bs), axis=1)

        data_1_bs = data_bs[data_bs[:, 7] == 1]  #data with class '1'
        data_0_bs = data_bs[data_bs[:, 7] == 0]  #data with class '0'

        ### create 3D plot: ax1 top-left, ax2 top-right,
        ### ax3 bottom-right, ax4 bottom-left
        fig = plt.figure(constrained_layout=True, figsize=(12, 7.5))

        gs = GridSpec(2, 2, figure=fig)
        ax1 = fig.add_subplot(gs[0, 0], projection='3d')
        ax2 = fig.add_subplot(gs[0, 1], projection='3d')
        ax3 = fig.add_subplot(gs[1, 1], projection='3d')
        ax4 = fig.add_subplot(gs[1, 0], projection='3d')

        size = 10.5  #label font size

        ## scatter plot before oversampling
        scatter1 = ax1.scatter(data_1_old[:, [1]],
                               data_1_old[:, [2]],
                               data_1_old[:, [4]],
                               c='yellow',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)
        scatter2 = ax1.scatter(data_0_old[:, [1]],
                               data_0_old[:, [2]],
                               data_0_old[:, [4]],
                               c='r',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)

        ax1.set_xlabel('Age (years)', fontsize=size)
        ax1.set_ylabel('Elapsed time (months)', fontsize=size)
        ax1.set_zlabel('Wart type', fontsize=size)
        ax1.set_zticks(range(1, 4, 1))
        ax1.set_title('(a) Before oversampling\n',
                      fontsize=14,
                      fontweight='bold')
        # set legend
        legend = ax1.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                            numpoints=1,
                            loc='best',
                            fontsize=size)
        legend.get_frame().set_edgecolor('k')

        ## scatter plot after oversampling (SMOTE)
        scatter1 = ax2.scatter(data_1[:, [1]],
                               data_1[:, [2]],
                               data_1[:, [4]],
                               c='yellow',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)
        scatter2 = ax2.scatter(data_0[:, [1]],
                               data_0[:, [2]],
                               data_0[:, [4]],
                               c='r',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)

        ax2.set_xlabel('Age (years)', fontsize=size)
        ax2.set_ylabel('Elapsed time (months)', fontsize=size)
        ax2.set_zlabel('Wart type', fontsize=size)
        ax2.set_zticks(range(1, 4, 1))
        ax2.set_title('(b) After oversampling (SMOTE)\n',
                      fontsize=14,
                      fontweight='bold')
        # set legend
        legend = ax2.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                            numpoints=1,
                            loc='best',
                            fontsize=size)
        legend.get_frame().set_edgecolor('k')

        ## scatter plot after oversampling (ADASYN)
        scatter1 = ax3.scatter(data_1_ad[:, [1]],
                               data_1_ad[:, [2]],
                               data_1_ad[:, [4]],
                               c='yellow',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)
        scatter2 = ax3.scatter(data_0_ad[:, [1]],
                               data_0_ad[:, [2]],
                               data_0_ad[:, [4]],
                               c='r',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)

        ax3.set_xlabel('Age (years)', fontsize=size)
        ax3.set_ylabel('Elapsed time (months)', fontsize=size)
        ax3.set_zlabel('Wart type', fontsize=size)
        ax3.set_zticks(range(1, 4, 1))
        ax3.set_title('\n(d) After oversampling (ADASYN)\n',
                      fontsize=14,
                      fontweight='bold')
        # set legend
        legend = ax3.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                            numpoints=1,
                            loc='best',
                            fontsize=size)
        legend.get_frame().set_edgecolor('k')

        ## scatter plot after oversampling (Borderline-SMOTE)
        ## (original comment said ADASYN; this panel plots the *_bs data)
        scatter1 = ax4.scatter(data_1_bs[:, [1]],
                               data_1_bs[:, [2]],
                               data_1_bs[:, [4]],
                               c='yellow',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)
        scatter2 = ax4.scatter(data_0_bs[:, [1]],
                               data_0_bs[:, [2]],
                               data_0_bs[:, [4]],
                               c='r',
                               marker='o',
                               s=40,
                               edgecolors='k',
                               depthshade=0)

        ax4.set_xlabel('Age (years)', fontsize=size)
        ax4.set_ylabel('Elapsed time (months)', fontsize=size)
        ax4.set_zlabel('Wart type', fontsize=size)
        ax4.set_zticks(range(1, 4, 1))
        ax4.set_title('\n(c) After oversampling (Borderline-SMOTE)\n',
                      fontsize=14,
                      fontweight='bold')
        # set legend
        legend = ax4.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                            numpoints=1,
                            loc='best',
                            fontsize=size)
        legend.get_frame().set_edgecolor('k')

        #        #save figure
        #        fig.savefig("Oversampling plot_3D.png", dpi=300, bbox_inches='tight')
        plt.show()
コード例 #18
0
    def fit(self, X, y):
        """Fit a bagging ensemble of ``ensemble_size`` cloned base estimators.

        For each member, minority (y == 1) and majority (y == 0) classes are
        bootstrapped separately with replacement, the bag is optionally
        re-balanced by the configured ``oversampler`` strategy, and the
        estimator is trained on the result.

        :param X: feature matrix
        :param y: binary labels (1 = minority, 0 = majority)
        :return: self
        """
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y

        minority_X = X[y == 1]
        minority_y = y[y == 1]
        majority_X = X[y == 0]
        majority_y = y[y == 0]

        for i in range(self.ensemble_size):
            self.estimators_.append(base.clone(self.base_estimator))

        for n, estimator in enumerate(self.estimators_):
            # Distinct, reproducible seed per ensemble member.
            np.random.seed(self.random_state + (n * 2))
            bagXminority = minority_X[np.random.choice(
                minority_X.shape[0], len(minority_y), replace=True), :]
            bagXmajority = majority_X[np.random.choice(
                majority_X.shape[0], len(majority_y), replace=True), :]

            bagyminority = np.ones(len(minority_y)).astype('int')
            bagymajority = np.zeros(len(majority_y)).astype('int')

            train_X = np.concatenate((bagXmajority, bagXminority))
            train_y = np.concatenate((bagymajority, bagyminority))

            # Resampling is best-effort: when it fails (e.g. the bag is too
            # small for the required neighbours) the raw bag is used as-is.
            # `except Exception` replaces the original bare `except:` so
            # KeyboardInterrupt / SystemExit propagate instead of being
            # swallowed.
            if self.oversampler == "ROS":
                ros = RandomOverSampler(random_state=self.random_state +
                                        (n * 2))
                try:
                    train_X, train_y = ros.fit_resample(train_X, train_y)
                except Exception:
                    pass
            elif self.oversampler == "B2":
                b2 = BorderlineSMOTE(random_state=self.random_state + (n * 2),
                                     kind='borderline-2')
                try:
                    train_X, train_y = b2.fit_resample(train_X, train_y)
                except Exception:
                    pass
            elif self.oversampler == "RUS":
                rus = RandomUnderSampler(random_state=self.random_state +
                                         (n * 2))
                try:
                    train_X, train_y = rus.fit_resample(train_X, train_y)
                except Exception:
                    pass
            elif self.oversampler == "CNN":
                cnn = CondensedNearestNeighbour(
                    random_state=self.random_state + (n * 2))
                try:
                    train_X, train_y = cnn.fit_resample(train_X, train_y)
                except Exception:
                    pass
            estimator.fit(train_X, train_y)

        # Return the classifier
        return self
コード例 #19
0
ファイル: test_smote.py プロジェクト: yueyuep/TCNN
def test_borderline_smote_wrong_kind():
    """An unknown `kind` must raise a ValueError naming the option."""
    sampler = BorderlineSMOTE(kind='rand')
    with pytest.raises(ValueError, match='The possible "kind" of algorithm'):
        sampler.fit_resample(X, Y)
コード例 #20
0
ファイル: OverSample.py プロジェクト: 392369223/MyRepository
 def over_sample(self, x_train, y_train):
     """Return the training set oversampled with borderline-1 SMOTE
     (fixed seed for reproducibility)."""
     sampler = BorderlineSMOTE(random_state=151, kind='borderline-1')
     resampled_x, resampled_y = sampler.fit_resample(x_train, y_train)
     return resampled_x, resampled_y
コード例 #21
0
# Split the ki67 array into features and labels.
# NOTE(review): assumes column 0 holds the class label and the remaining
# columns a flattened 64x64x4 image per row -- confirm against the loader.
ki67_exp = ki67[:, 1:]
ki67_exp = np.array(ki67_exp, dtype=np.float32)
ki67_exp.shape  # no-op outside a notebook; kept for interactive inspection

ki67_label = ki67[:, 0]
ki67_label = np.array(ki67_label, dtype=np.int32)
ki67_label.shape  # no-op outside a notebook; kept for interactive inspection

# use oversampling to balance the training set

# use borderlineSMOTE
oversample = BorderlineSMOTE()

# fit and apply the transform
ki67_exp, ki67_label = oversample.fit_resample(ki67_exp, ki67_label)

# Restore image shape: (num_samples, height, width, channels).
ki67_exp = np.reshape(ki67_exp, (-1, 64, 64, 4))

# del array objects to save memory
del df
del df2
del ki67

# batch size and numEpochs -- training hyperparameters used further below
numEpoch = 200
batchSize = 25
LR = 0.001
DECAY = 0.000
EPSILON = 0.99
DROPOUT = 0.5
コード例 #22
0
def borderline_smote(X, y):
    """Resample (X, y) with Borderline-SMOTE using a fixed seed."""
    resampler = BorderlineSMOTE(random_state=42)
    return resampler.fit_resample(X, y)
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    """Rebalance the three one-vs-all (OvA) training splits.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM",
        "OSS", "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK",
        "TOMEK", "ROS", "RUS".
    AP_/PM_/SC_ova_X_train, AP_/PM_/SC_ova_y_train
        Feature matrix and label vector of each OvA split
        (AP = "Add penalty", PM = "Payment",
        SC = "Send for Credit Collection").

    Returns
    -------
    tuple
        ``(AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res)``.

    Raises
    ------
    KeyError
        If ``imb_technique`` is not a supported name (the original code
        instead failed with an UnboundLocalError at the return).
    """
    print(imb_technique)
    # One sampler class per technique; a fresh instance is created per
    # split, matching the original one-instance-per-split behaviour.
    sampler_classes = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    if imb_technique == "NCR":
        # NCR runs on binarized labels: 0 marks the positive class of
        # each OvA split, 1 everything else (as in the original code).
        AP_ova_y_train = [
            0 if i == "Add penalty" else 1 for i in AP_ova_y_train
        ]
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        SC_ova_y_train = [
            0 if i == "Send for Credit Collection" else 1
            for i in SC_ova_y_train
        ]
    make_sampler = sampler_classes[imb_technique]
    # BUG FIX: the original NCR branch instantiated AP_iht/PM_iht/SC_iht
    # but then called undefined AP_ncr/PM_ncr/SC_ncr, raising NameError.
    AP_X_res, AP_y_res = make_sampler().fit_resample(AP_ova_X_train,
                                                     AP_ova_y_train)
    PM_X_res, PM_y_res = make_sampler().fit_resample(PM_ova_X_train,
                                                     PM_ova_y_train)
    SC_X_res, SC_y_res = make_sampler().fit_resample(SC_ova_X_train,
                                                     SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
コード例 #24
0
# Stratified 70/30 split so class proportions are preserved in both sets.
x_train, x_test, y_train, y_test = train_test_split(bank_X,
                                                    bank_y,
                                                    stratify=bank_y,
                                                    train_size=0.7,
                                                    random_state=0)

#classifier = svm.SVC(kernel = 'rbf',C=1000,gamma=0.001)
classifier = LogisticRegression(max_iter=10000, C=0.1)

#easy_ensemble = imblearn.ensemble.EasyEnsembleClassifier(n_estimators=35, base_estimator=classifier, sampling_strategy='majority', n_jobs=-1)

# Oversample only the *training* split (the test set stays untouched):
# Borderline-SMOTE up to a 0.5 minority/majority ratio, then clean
# borderline majority samples with Tomek links.
oversample = BorderlineSMOTE(sampling_strategy=0.5,
                             n_jobs=-1,
                             kind='borderline-1')
x_train, y_train = oversample.fit_resample(x_train, y_train)
tom_lin = TomekLinks(sampling_strategy='majority', n_jobs=-1)
x_train, y_train = tom_lin.fit_resample(x_train, y_train)

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Report metrics and confusion matrices (normalized and raw counts).
h.printResults2(y_test, y_pred)
h.plotConfusionMatrix(y_test, y_pred, norm=True)
h.plotConfusionMatrix(y_test, y_pred, norm=False)

#White-box explanation: global view of the logistic-regression coefficients
feature_names = bank_X.columns.values
interpr.plotFeaturesCoefficientGlobal(classifier, feature_names)

new_x_train = x_train
コード例 #25
0
    def partial_fit(self, X, y, classes=None):
        """Incrementally fit the ensemble on one data chunk.

        Oversamples the chunk according to ``self.oversampler``, prunes
        weak ensemble members, then trains and appends a new candidate
        cloned from the base classifier.
        """
        # Lazily create the base classifier prototype on the first call.
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        # First call: remember the label universe and start a fresh ensemble.
        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []

        self.X_, self.y_ = X, y

        train_X, train_y = X, y

        unique, counts = np.unique(train_y, return_counts=True)

        # SMOTE-family samplers need k_neighbors smaller than the class
        # size; shrink the neighbourhood for tiny chunks.
        # NOTE(review): counts[0] is the count of the *first* label value,
        # assumed here to be the minority class — confirm for this stream.
        k_neighbors = 5
        if counts[0] - 1 < 5:
            k_neighbors = counts[0] - 1

        if self.oversampler == "SMOTE" and k_neighbors > 0:
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            train_X, train_y = smote.fit_resample(train_X, train_y)
        elif self.oversampler == "svmSMOTE" and k_neighbors > 0:
            try:
                svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
                train_X, train_y = svmSmote.fit_resample(train_X, train_y)
            except ValueError:
                # SVMSMOTE can fail on degenerate chunks; fall back to raw data.
                pass
        elif self.oversampler == "borderline1" and k_neighbors > 0:
            borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-1')
            train_X, train_y = borderlineSmote1.fit_resample(train_X, train_y)
        elif self.oversampler == "borderline2" and k_neighbors > 0:
            borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-2')
            train_X, train_y = borderlineSmote2.fit_resample(train_X, train_y)
        elif self.oversampler == "ADASYN" and k_neighbors > 0:
            try:
                adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
                train_X, train_y = adasyn.fit_resample(train_X, train_y)
            except RuntimeError:
                # ADASYN raises when it cannot generate samples; skip it.
                pass
        elif self.oversampler == "SLS" and k_neighbors > 0:
            sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
            train_X, train_y = sls.sample(train_X, train_y)

        # Balanced accuracy of every current member on the raw chunk.
        scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_])

        # Pruning: drop members no better than chance plus the alpha margin.
        if len(self.ensemble_) > 1:
            alpha_good = scores > (0.5 + self.alpha)
            self.ensemble_ = [
                self.ensemble_[i] for i in np.where(alpha_good)[0]
            ]
            # BUG FIX: keep `scores` aligned with the pruned ensemble;
            # previously argmin below indexed the *pre-pruning* scores and
            # could evict the wrong member or go out of range.
            scores = scores[alpha_good]

        # Make room for the incoming candidate by evicting the worst member.
        if len(self.ensemble_) > self.ensemble_size - 1:
            worst = np.argmin(scores)
            del self.ensemble_[worst]

        # Preparing and training new candidate
        self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
コード例 #26
0
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Rebalance the five one-vs-all (OvA) training splits.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM",
        "OSS", "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK",
        "TOMEK", "ROS", "RUS".
    AA_/AI_/AW_/CC_/QA_ova_X_train, AA_/AI_/AW_/CC_/QA_ova_y_train
        Feature matrix and label vector of each OvA split
        (AA = "Accepted/Assigned", AI = "Accepted/In Progress",
        AW = "Accepted/Wait", CC = "Completed/Closed",
        QA = "Queued/Awaiting Assignment").

    Returns
    -------
    tuple
        ``(AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
        CC_X_res, CC_y_res, QA_X_res, QA_y_res)``.

    Raises
    ------
    KeyError
        If ``imb_technique`` is not a supported name (the original code
        instead failed with an UnboundLocalError at the return).
    """
    print(imb_technique)
    # One sampler class per technique; a fresh instance is created per
    # split, matching the original one-instance-per-split behaviour.
    sampler_classes = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    if imb_technique == "NCR":
        # NCR runs on binarized labels: 0 marks the positive class of
        # each OvA split, 1 everything else (as in the original code).
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1
            for i in QA_ova_y_train
        ]
    make_sampler = sampler_classes[imb_technique]
    AA_X_res, AA_y_res = make_sampler().fit_resample(AA_ova_X_train,
                                                     AA_ova_y_train)
    AI_X_res, AI_y_res = make_sampler().fit_resample(AI_ova_X_train,
                                                     AI_ova_y_train)
    AW_X_res, AW_y_res = make_sampler().fit_resample(AW_ova_X_train,
                                                     AW_ova_y_train)
    CC_X_res, CC_y_res = make_sampler().fit_resample(CC_ova_X_train,
                                                     CC_ova_y_train)
    QA_X_res, QA_y_res = make_sampler().fit_resample(QA_ova_X_train,
                                                     QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
コード例 #27
0
label_autism_data.head()

# First 14 columns are features, column 14 is the target label.
X = label_autism_data.iloc[:, 0:14]
y = label_autism_data.iloc[:, 14]

X

y

# Borderline smote
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=42)
X_res_borderline, y_res_bol = sm.fit_resample(X, y)

# BUG FIX: the first print shows the *original* class distribution but
# was labelled "Resampled", making the before/after comparison unreadable.
print('Original dataset shape %s' % Counter(y))
print('Resampled dataset shape %s' % Counter(y_res_bol))

from sklearn.model_selection import train_test_split

# NOTE(review): splitting *after* oversampling leaks synthetic samples
# into the test set; consider resampling only the training split.
X_train, X_test, Y_train, Y_test = train_test_split(X_res_borderline,
                                                    y_res_bol,
                                                    test_size=0.25,
                                                    random_state=0)

len(X_train)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
コード例 #28
0
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from xgboost import XGBClassifier
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
train_x = train.drop(columns='class', axis=1) # new frame with the 'class' column removed
y = train['class'] # target labels (the 'class' column)
test_x = test
# Per-class counts of the (imbalanced) three-class target.
class0 = len(y[y == 0])
class1 = len(y[y == 1])
class2 = len(y[y == 2])
total = len(y)
# Standardize features: fit scaling on train, reuse the same transform for test.
scaler = StandardScaler()
x = scaler.fit_transform(train_x)
TEST = scaler.transform(test_x)
# Borderline-SMOTE with a small neighbourhood (k_neighbors=3) to oversample minorities.
BS = BorderlineSMOTE(random_state=42, n_jobs=-1, k_neighbors=3)
x, y = BS.fit_resample(x, y)
print(pd.value_counts(y))
# NOTE(review): resampling happens before the split, so synthetic samples
# can leak into the held-out evaluation set — confirm this is intended.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1, stratify=y, random_state=42)
evals = [(test_x, test_y)]
xgb = XGBClassifier(n_estimators=600, n_jobs=-1, learning_rate=0.05, subsample=0.65, max_depth=50, objective="multi:softmax", random_state=42)
xgb.fit(train_x, train_y, early_stopping_rounds=30, eval_set=evals)
# ans_pred += forest_pred
# ans_pred += xgb_pred
# ans_pred /= 2.0
print("acc: {}".format(xgb.score(train_x, train_y)))
print("acc: {}".format(xgb.score(test_x, test_y)))
y_pred = np.argmax(xgb.predict_proba(TEST), axis=1)
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission.csv', index=True)
# ~94.6% accuracy on the local test split, but the actual leaderboard score was 92.215%
コード例 #29
0
def test_borderline_smote_wrong_kind(data):
    """fit_resample must reject an unsupported ``kind`` with a ValueError."""
    sampler = BorderlineSMOTE(kind='rand')
    err_pattern = 'The possible "kind" of algorithm'
    with pytest.raises(ValueError, match=err_pattern):
        sampler.fit_resample(*data)
コード例 #30
0
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE
from plotDataset import plot_dataset

# Synthetic 2-D binary dataset: ~99% majority class, no label noise.
X, y = make_classification(n_samples=10000,
                           n_features=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           weights=[0.99],
                           flip_y=0,
                           random_state=1)
# Class distribution before resampling.
counter = Counter(y)
print(counter)
plot_dataset(X, y, counter)

# Oversample with default Borderline-SMOTE settings.
oversample = BorderlineSMOTE()
X, y = oversample.fit_resample(X, y)

# Class distribution after resampling.
counter = Counter(y)
print(counter)
plot_dataset(X, y, counter)
コード例 #31
0
                                                    test_size=0.2)
model_oversample = lr.fit(x_train, y_train)
y_predict = model_oversample.predict(x_test)
print(classification_report(y_test, y_predict))
#SMOTE
from imblearn.over_sampling import SMOTE
# NOTE(review): in every experiment below the full dataset is resampled
# *before* the split, so synthetic samples leak into the test set; the
# reported metrics are optimistic — confirm this is intended.
X_resampled, y_resampled = SMOTE(random_state=2020).fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled,
                                                    test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))
# Same experiment with Borderline-SMOTE.
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=2020)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res,
                                                    y_res,
                                                    test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))
# Same experiment with KMeans-SMOTE.
from imblearn.over_sampling import KMeansSMOTE
sm = KMeansSMOTE(random_state=2020, cluster_balance_threshold=0.1)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res,
                                                    y_res,
                                                    test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))
nhg_exp = np.array(nhg_exp, dtype=np.float32)
# nhg_exp = np.reshape(nhg_exp, (-1, 64, 64, 4))
nhg_exp.shape

# get diagnosis (column 0 of the raw matrix holds the label)
nhg_label = nhg[:, 0]
nhg_label = np.array(nhg_label, dtype=np.int32)

# use oversampling to balance the training set

# use borderlineSMOTE
oversample = BorderlineSMOTE()

# BUG FIX: resample on the *integer* labels first — imblearn's
# fit_resample expects a 1-D class vector and rejects the one-hot
# (multilabel-indicator) matrix that to_categorical produces, so the
# original order (to_categorical before fit_resample) failed at runtime.
nhg_exp, nhg_label = oversample.fit_resample(nhg_exp, nhg_label)

# one-hot encode only after resampling
nhg_label = to_categorical(nhg_label)

# Restore the image shape after resampling; presumably each sample is a
# 64x64 image with 4 channels — TODO confirm.
nhg_exp = np.reshape(nhg_exp, (-1, 64, 64, 4))

# delete array objects to save memory
del nhg1
del nhg2
del nhg

# fix random seed for reproducibility
seed = 1234
np.random.seed(seed)

# batch size and numEpochs
numEpoch = 500
batchSize = 100