Ejemplo n.º 1
0
def test_ada_fit_sample_half():
    """Check ``fit_sample`` when ADASYN is built with ``ratio=0.8``."""
    sampler = ADASYN(ratio=0.8, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays the resampled output is compared against.
    expected_X = np.array([
        [0.11622591, -0.0317206], [0.77481731, 0.60935141],
        [1.25192108, -0.22367336], [0.53366841, -0.30312976],
        [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342], [0.3084254, 0.33299982],
        [0.70472253, -0.73309052], [0.28893132, -0.38761769],
        [1.15514042, 0.0129463], [0.88407872, 0.35454207],
        [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929], [1.70580611, -0.11219234],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Ejemplo n.º 2
0
def test_ada_fit_sample_nn_obj():
    """Check ``fit_sample`` when ``n_neighbors`` is a NearestNeighbors object."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays the resampled output is compared against.
    expected_X = np.array([
        [0.11622591, -0.0317206], [0.77481731, 0.60935141],
        [1.25192108, -0.22367336], [0.53366841, -0.30312976],
        [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342], [0.3084254, 0.33299982],
        [0.70472253, -0.73309052], [0.28893132, -0.38761769],
        [1.15514042, 0.0129463], [0.88407872, 0.35454207],
        [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929], [1.70580611, -0.11219234],
        [0.94899098, -0.30508981], [0.28204936, -0.13953426],
        [1.58028868, -0.04089947], [0.66117333, -0.28009063],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
         0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
Ejemplo n.º 3
0
def test_ada_fit_sample_nn_obj():
    """Check ``fit_sample`` when ``n_neighbors`` is a NearestNeighbors object."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays the resampled output is compared against.
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
         0, 0, 0, 0])

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Ejemplo n.º 4
0
def test_sample_wrong_X():
    """Check that ``sample`` raises ``RuntimeError`` when called with data
    other than the data given to ``fit``."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Random data passed to ``sample`` in place of the fitted ``X``/``Y``.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
Ejemplo n.º 5
0
def test_ada_fit_sample():
    """Compare ``fit_sample`` output with the arrays stored under ``data/``."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ada_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ada_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Ejemplo n.º 6
0
def test_ada_fit():
    """Check the class information stored on the sampler by ``fit``."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority and majority labels.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Sample count recorded for each label.
    for label, expected_count in ((0, 8), (1, 12)):
        assert_equal(sampler.stats_c_[label], expected_count)
Ejemplo n.º 7
0
def oversample(X, y, bal_strategy):
    """Balance ``(X, y)`` with the requested strategy and report the shapes.

    Parameters
    ----------
    X : array-like exposing ``.shape``
        Feature matrix.
    y : array-like exposing ``.shape``
        Labels.
    bal_strategy : str
        ``"SMOTESVN"`` or ``"ALL"`` -> ``SMOTE(kind='svm')``;
        ``"SMOTE"`` -> ``SMOTE(kind='regular')``;
        ``"ADASYN"`` -> ``ADASYN()``;
        ``"NONE"`` -> data returned unchanged.

    Returns
    -------
    tuple
        ``(X_sampled, y_sampled)``.

    Any other ``bal_strategy`` prints an error message and terminates the
    process through ``sys.exit(1)``.
    """
    # "ALL" was listed on every branch of the original elif chain, but only
    # the first branch could ever match it; that mapping is kept as is.
    if bal_strategy in ("SMOTESVN", "ALL"):
        X_sampled, y_sampled = SMOTE(kind='svm').fit_sample(X, y)
    elif bal_strategy == "SMOTE":
        X_sampled, y_sampled = SMOTE(kind='regular').fit_sample(X, y)
    elif bal_strategy == "ADASYN":
        X_sampled, y_sampled = ADASYN().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_strategy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE')
        sys.exit(1)

    # Single-argument print() calls run on both Python 2 and Python 3. The
    # double space reproduces the output of the former print statements.
    print('Shape of X_sampled:  %s' % (X_sampled.shape,))
    print('Shape of y_sampled:  %s' % (y_sampled.shape,))

    return (X_sampled, y_sampled)
Ejemplo n.º 8
0
def main():
    """Load the climate CSV, rebalance the training split with ADASYN and
    print evaluation metrics for every entry of ``n_components_to_test``."""
    climate_data = np.asarray(pd.read_csv(".././DataSets/Lead_10_Hist_10.csv"))
    print(climate_data.shape)

    # The last column is the predictand; all other columns are predictors.
    last_col = climate_data.shape[1] - 1
    X = climate_data[:, :last_col]
    Y = climate_data[:, last_col]

    print("\nSamples of each rainfall class in overall set: ",
          collections.Counter(Y))

    # Unshuffled 80/20 split. Only the training part is resampled below;
    # the test part is left as it is.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=False)
    print("\nSamples in training set: ", collections.Counter(Y_train))

    X_balanced, Y_balanced = ADASYN().fit_resample(X_train, Y_train)
    print("\nADASYN:", sorted(collections.Counter(Y_balanced).items()))

    # One prediction set is read per entry of n_components_to_test.
    Y_pred_all = pipe_selkbest_RF(X_balanced, Y_balanced, X_test)
    for position, comp in enumerate(n_components_to_test):
        print("\n Reduced number of features: ", comp)
        calculateEvaluationMetrics(Y_test, Y_pred_all[position])
def plot_data(X, Y):
    """Scatter-plot the original data next to five resampled versions.

    Every panel plots column 0 against column 1 of the data, coloured by
    label, with the axes and ticks hidden. Panels from left to right:
    original data, SMOTE, Borderline-SMOTE (``borderline-1``),
    EditedNearestNeighbours followed by SMOTE, ADASYN, and
    ``dbscan_based.MultiDbscanBasedOverSample``.

    The grid has one column per panel. The previous layout was 1x5 and
    requested slot 4 twice, so the ENN+SMOTE and ADASYN panels collided.

    :param X: 2-D array-like indexed as ``X[:, 0]`` / ``X[:, 1]``
    :param Y: labels used as scatter colours
    """
    plt.rcParams['figure.figsize'] = (27.0, 5.0)
    fig = plt.figure()

    def _enn_then_smote():
        # Clean with ENN first, then over-sample the cleaned data.
        X_clean, Y_clean = EditedNearestNeighbours().fit_sample(X, Y)
        return SMOTE(k_neighbors=5).fit_sample(X_clean, Y_clean)

    # Callables keep each resampling lazy, so samplers run in the same
    # order as before, interleaved with the drawing of their panel.
    panels = [
        lambda: (X, Y),
        lambda: SMOTE().fit_sample(X, Y),
        lambda: BorderlineSMOTE(kind='borderline-1').fit_sample(X, Y),
        _enn_then_smote,
        lambda: ADASYN(n_neighbors=3).fit_sample(X, Y),
        lambda: dbscan_based.MultiDbscanBasedOverSample(
            eps=0.3, min_pts=5).fit_sample(X, Y),
    ]

    for position, resample in enumerate(panels, start=1):
        X_panel, Y_panel = resample()
        ax = fig.add_subplot(1, len(panels), position)
        ax.scatter(X_panel[:, 0], X_panel[:, 1], c=Y_panel)
        # These pyplot calls act on the axes that was just added.
        plt.axis('off')
        plt.xticks([])
        plt.yticks([])
    plt.show()
Ejemplo n.º 10
0
def test_ada_fit_sample():
    """Check ``fit_sample`` output of a default ADASYN with a fixed seed."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays the resampled output is compared against.
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
         0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
    def use_debug_parameters(self, reduced_selected_features):
        """Return the parameter grid used for debug runs.

        The grid is a list of dicts so that different optimisations could use
        different parameter sets; here it holds a single dict. Only the first
        two entries of ``reduced_selected_features`` are offered for
        ``feat__cols``, and ``model__var_smoothing`` spans 100 log-spaced
        values from 1 down to 1e-9.
        """
        scalers = [StandardScaler()]
        samplers = [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()]
        debug_grid = {
            'scaler': scalers,
            'sampling': samplers,
            'feat__cols': reduced_selected_features[0:2],
            'model__var_smoothing': np.logspace(0, -9, num=100),
        }
        return [debug_grid]
    def use_debug_parameters(self, reduced_selected_features):
        """Return the parameter grid used for debug runs.

        The grid is a list of dicts so that different optimisations could use
        different parameter sets; here it holds a single dict. Only the first
        two entries of ``reduced_selected_features`` are offered for
        ``feat__cols``; the model is varied over ``n_neighbors`` in (3, 5)
        and both ``weights`` options.
        """
        scalers = [StandardScaler()]
        samplers = [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()]
        debug_grid = {
            'scaler': scalers,
            'sampling': samplers,
            'feat__cols': reduced_selected_features[0:2],
            'model__n_neighbors': [3, 5],
            'model__weights': ['uniform', 'distance'],
        }
        return [debug_grid]
    def tackle_data_imbalance(self, X, Y):
        """Rebalance ``(X, Y)`` with ADASYN followed by random under-sampling.

        Every class gets the same ADASYN target: three times the current
        total number of samples, divided evenly over the classes and
        truncated to an int. ``RandomUnderSampler`` with the ``"auto"``
        strategy is applied to the over-sampled data afterwards.

        :return: the resampled ``(X, Y)``
        """
        growth_factor = 3
        class_counts = Counter(Y)

        n_classes = len(class_counts)
        n_samples = sum(class_counts.values())
        per_class_target = int(n_samples * growth_factor / n_classes)

        # Same target for every label seen in Y.
        targets = {label: per_class_target for label in class_counts}

        over_sampler = ADASYN(n_neighbors=1, sampling_strategy=targets)
        under_sampler = RandomUnderSampler(sampling_strategy="auto")

        X, Y = over_sampler.fit_resample(X, Y)
        X, Y = under_sampler.fit_resample(X, Y)
        return X, Y
Ejemplo n.º 14
0
 def initializeSamplers(self):
     """Create the five samplers, run each of them, then load the results.

     Every sampler is built with ``sampling_strategy=0.1`` and
     ``random_state=42``. The samplers and their display names are stored
     on ``self.samplers`` and ``self.names`` in matching order.
     """
     self.makeDataDirectory()

     shared = dict(sampling_strategy=0.1, random_state=42)
     named_samplers = [
         ("RandomSample", RandomOverSampler(**shared)),
         ("SMOTE", SMOTE(**shared)),
         ("ADASYN", ADASYN(**shared)),
         ("SMOTETomek", SMOTETomek(**shared)),
         ("SMOTEEnn", SMOTEENN(**shared)),
     ]
     self.samplers = [sampler for _, sampler in named_samplers]
     self.names = [name for name, _ in named_samplers]

     for sampler, name in zip(self.samplers, self.names):
         self.runSampler(sampler, name)
     self.loadAll()
Ejemplo n.º 15
0
def compare_different_oversample_method(model, sample_method, X, Y):
    """Cross-validate ``model`` with an optional resampling step per fold.

    A 5-fold shuffled ``StratifiedKFold`` is used. In each fold the training
    part is resampled according to ``sample_method``, the model is fitted on
    it, and the metrics returned by ``cal_multi_class_matrics`` for the test
    part are accumulated.

    :param model: estimator exposing ``fit``, ``predict`` and
        ``predict_proba``
    :param sample_method: ``'SMOTE_ENN'``, ``'smote'``,
        ``'borderline_smote'``, ``'adasyn'``, any other truthy object
        exposing ``fit_sample``, or a falsy value for no resampling
    :param X: features, indexable by an index array
    :param Y: labels, indexable by an index array
    :return: the accumulated metrics divided by the number of folds
    """
    n_split = 5
    skf = StratifiedKFold(n_splits=n_split, shuffle=True)
    res_list = np.zeros(4)
    for cnt, (train_indices, test_indices) in enumerate(skf.split(X, Y), start=1):
        print('正在进行第{}次交叉验证'.format(cnt))
        train_X, train_Y = X[train_indices], Y[train_indices]
        test_X, test_Y = X[test_indices], Y[test_indices]

        # Largest usable neighbour count: size of the smallest class minus
        # one. The sizes are the Counter *values*; min() over the Counter
        # itself would return the smallest class label instead.
        min_k_kearest = min(Counter(train_Y).values()) - 1

        if sample_method == 'SMOTE_ENN':
            enn = EditedNearestNeighbours()
            train_X, train_Y = enn.fit_sample(train_X, train_Y)
            # NOTE(review): min_k_kearest was computed before the ENN step
            # and is not refreshed afterwards - confirm that is intended.
            if min_k_kearest > 0:
                smo = SMOTE(k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'smote':
            if min_k_kearest > 0:
                smo = SMOTE(k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'borderline_smote':
            if min_k_kearest > 0:
                smo = BorderlineSMOTE(kind='borderline-1',
                                      k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'adasyn':
            if min_k_kearest > 0:
                ada = ADASYN(n_neighbors=min(2, min_k_kearest))
                train_X, train_Y = ada.fit_sample(train_X, train_Y)
        elif sample_method:
            # Anything else truthy is treated as a ready-made sampler.
            train_X, train_Y = sample_method.fit_sample(train_X, train_Y)

        model.fit(train_X, train_Y)
        y_score = model.predict(test_X)
        y_score_prob = model.predict_proba(test_X)[:, 1]
        res_list += cal_multi_class_matrics(test_Y, y_score, y_score_prob)
    return res_list / n_split
Ejemplo n.º 16
0
def resample_to_csv(X, y, random_state, path, method):
    """Resample a dataset with the chosen sampler and write it to CSV.

    The resampled labels are stored in a ``'BK'`` column added to the
    resampled features before the features are written with ``to_csv``.
    An unrecognised ``method`` does nothing.

    :param X: Original features
    :param y: Original labels
    :param random_state: Seed passed to the sampler
    :param path: Path to output location and name of CSV
    :param method: ``'SMOTE-NN'`` (SMOTEENN), ``'borderline'``
        (BorderlineSMOTE), ``'adasyn'`` (ADASYN) or ``'tomek'`` (SMOTETomek).
    See imbalanced-learn documentation for more information.
    https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.BorderlineSMOTE.html

    :return: none
    """
    if method == 'SMOTE-NN':
        sampler = SMOTEENN(random_state=random_state)
    elif method == 'borderline':
        sampler = BorderlineSMOTE(random_state=random_state)
    elif method == 'adasyn':
        sampler = ADASYN(random_state=random_state)
    elif method == 'tomek':
        sampler = SMOTETomek(random_state=random_state)
    else:
        # Unknown method: nothing is resampled or written.
        return

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    X_resampled['BK'] = y_resampled
    X_resampled.to_csv(path)
Ejemplo n.º 17
0
def over_sample(X, y, sampler="SMOTE"):
    """Over-sample ``(X, y)`` with the sampler selected by name.

    :param X: features
    :param y: labels
    :param sampler: one of ``"RandomOverSampler"``, ``"ADASYN"``,
        ``"SMOTE"``, ``"BorderlineSMOTE"``, ``"SVMSMOTE"``, ``"SMOTENC"``;
        any other key raises ``KeyError``
    :return: ``(X_resampled, y_resampled)``
    """
    # All samplers are instantiated with default settings on every call.
    available = {
        "RandomOverSampler": RandomOverSampler(),
        "ADASYN": ADASYN(),
        "SMOTE": SMOTE(),
        "BorderlineSMOTE": BorderlineSMOTE(),
        "SVMSMOTE": SVMSMOTE(),
        "SMOTENC": SMOTENC(categorical_features=[]),
    }
    chosen = available[sampler]

    X_resampled, y_resampled = chosen.fit_resample(X, y)
    return X_resampled, y_resampled
Ejemplo n.º 18
0
def test_ada_fit_resample():
    """Check ``fit_resample`` output of a default ADASYN with a fixed seed."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Reference arrays the resampled output is compared against.
    expected_X = np.array([
        [0.11622591, -0.0317206], [0.77481731, 0.60935141],
        [1.25192108, -0.22367336], [0.53366841, -0.30312976],
        [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342], [0.3084254, 0.33299982],
        [0.70472253, -0.73309052], [0.28893132, -0.38761769],
        [1.15514042, 0.0129463], [0.88407872, 0.35454207],
        [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929], [1.70580611, -0.11219234],
        [0.88161986, -0.2829741], [0.35681689, -0.18814597],
        [1.4148276, 0.05308106], [0.3136591, -0.31327875],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
         0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
Ejemplo n.º 19
0
 def retrieve_data(self, ml_cfg):
     """Collect all training batches, rebalance them and yield the result.

     ``ml_cfg`` is stored on the instance and forwarded to
     ``self.data_in.retrieve_data``. Every ``(data, meta)`` batch it
     produces is gathered, NaNs in the data are replaced through
     ``np.nan_to_num``, the labels are derived from the ``'target'`` field
     via ``df_to_one_hot`` and the set is over-sampled with ADASYN.

     Yields exactly one ``(data, labels)`` tuple, where the labels are
     re-encoded as rows of a 2x2 identity matrix cast to ``int16``.
     """
     self.ml_cfg = ml_cfg
     data_list = []
     meta_frames = []
     for data, meta in self.data_in.retrieve_data(self.ml_cfg):
         data_list.extend(data)
         meta_frames.append(meta)
     # Report the batches actually consumed; the old counter started at 1
     # and therefore printed one batch too many.
     print("Concatenated {} training batches".format(len(meta_frames)))
     # One concat after the loop avoids re-copying the growing frame on
     # every batch. pd.concat rejects an empty list, so fall back to an
     # empty frame when there were no batches.
     meta_df = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
     data_resp = np.nan_to_num(np.array(data_list))
     meta_resp = df_to_one_hot(meta_df, 'target', 2)
     data_resp, meta_resp = ADASYN().fit_resample(data_resp, meta_resp)
     # Index the identity matrix with the resampled labels to rebuild a
     # one-hot encoding.
     meta_resp = np.squeeze(np.eye(2)[meta_resp].astype('int16'))
     yield data_resp, meta_resp
Ejemplo n.º 20
0
    def synthetic_sampling_ADASYN(self, dataset):
        """Over-sample ``dataset`` with ADASYN and return one DataFrame.

        Features are every column except the last two; labels are the last
        column.
        NOTE(review): the second-to-last column is used neither as a feature
        nor as a label - confirm this is intended.

        :param dataset: object sliced through ``.iloc``
        :return: DataFrame holding the resampled features with the resampled
            labels appended as the last column, or ``None`` when any
            exception is raised (the exception is printed)
        """
        try:
            features = dataset.iloc[:, :-2]
            labels = dataset.iloc[:, -1]
            X_resampled, y_resampled = ADASYN().fit_sample(features, labels)
            frames = [pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)]
            return pd.concat(frames, axis=1)
        except Exception as e:
            print(e)
Ejemplo n.º 21
0
def runRandomUnderSample(train, test, seed):
    """Run five base-model fits, each on a fresh ADASYN resample of ``train``.

    In every round the columns listed in the module-level ``vars`` are
    over-sampled against ``train.wtbz``, the result is rebuilt as a
    DataFrame with a ``'wtbz'`` label column, and ``baseModel`` is called
    with a GradientBoostingClassifier configured from
    ``models['defaultGBM']``.

    :return: ``(train, test)`` exactly as passed in
    """
    for round_idx in range(5):
        X_resampled, y_resampled = ADASYN().fit_sample(train[vars], train.wtbz)
        print(len(X_resampled))

        # Features and labels side by side, labels as the last column.
        stacked = np.concatenate(
            (X_resampled, y_resampled.reshape(-1, 1)), axis=1)
        trained = pd.DataFrame(stacked)
        trained.columns = vars + ['wtbz']

        gbm_cfg = models['defaultGBM']
        classifier = GradientBoostingClassifier(
            n_estimators=gbm_cfg['n_estimators'],
            learning_rate=gbm_cfg['learning_rate'],
            max_depth=gbm_cfg['max_depth'],
            max_features=gbm_cfg['max_features'],
            random_state=seed + 29)
        baseModel(classifier, vars, "rfbase" + str(round_idx), trained,
                  test, seed)
    return train, test
Ejemplo n.º 22
0
def setup_data(data, resample=True):
    """Build training and validation arrays from the ``data`` dictionary.

    For each split the feature matrix is the concatenation, along the last
    axis, of ``split['b']``, ``split['x'][:, 0, :]`` and
    ``split['a'][:, 0, :]``; the target is ``split['ys_seq'][:, 0]``.

    :param data: dict with ``'train'`` and ``'valid'`` entries, each a dict
        holding ``'b'``, ``'x'``, ``'a'`` and ``'ys_seq'`` arrays;
        ``data['valid']`` must also hold ``'ce'``, and ``data['train']`` may
        hold ``'subtype'`` together with ``'subtype_oh'``
    :param resample: when true, the training set is over-sampled with
        ``RandomOverSampler(random_state=0)``
    :return: ``(X, Y, S, S_oh, X_valid, Y_valid, CE_valid)``; ``S`` and
        ``S_oh`` are ``None`` when the training split has no ``'subtype'``
    """
    def _features(split):
        # 'b' as is, plus index 0 along axis 1 of 'x' and of 'a'.
        return np.concatenate(
            [split['b'], split['x'][:, 0, :], split['a'][:, 0, :]], axis=-1)

    X = _features(data['train'])
    Y = data['train']['ys_seq'][:, 0]
    if resample:
        # Only the random over-sampler is used. The SMOTE and ADASYN
        # instances that were created here were never applied, so they
        # are no longer constructed.
        ros = RandomOverSampler(random_state=0)
        print('resampling...')
        X, Y = ros.fit_resample(X, Y)

    X_valid = _features(data['valid'])
    Y_valid = data['valid']['ys_seq'][:, 0]
    CE_valid = data['valid']['ce']

    S, S_oh = None, None
    if 'subtype' in data['train']:
        S = data['train']['subtype']
        S_oh = data['train']['subtype_oh']
    return X, Y, S, S_oh, X_valid, Y_valid, CE_valid
Ejemplo n.º 23
0
 def fit(self, X, y, by, random_state=None, visualize=False):
     '''
     Resample (X, y) with the method named by `by`, then fit
     `self.base_estimator` on the result.

     by: String
         The method used to perform re-sampling
         support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
             'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
             'SMOTETomek', 'ORG']
         'ORG' fits on the data unchanged; any other unlisted value
         raises Error.
     random_state: passed to the samplers that are built with it below
     visualize: not used by this method
     '''
     # Factories are lazy so only the requested sampler is instantiated.
     factories = {
         'RUS': lambda: RandomUnderSampler(random_state=random_state),
         'CNN': lambda: CondensedNearestNeighbour(random_state=random_state),
         'ENN': lambda: EditedNearestNeighbours(),
         'NCR': lambda: NeighbourhoodCleaningRule(),
         'Tomek': lambda: TomekLinks(),
         'ALLKNN': lambda: AllKNN(),
         'OSS': lambda: OneSidedSelection(random_state=random_state),
         'NM': lambda: NearMiss(),
         'CC': lambda: ClusterCentroids(random_state=random_state),
         'SMOTE': lambda: SMOTE(random_state=random_state),
         'ADASYN': lambda: ADASYN(random_state=random_state),
         'BorderSMOTE': lambda: BorderlineSMOTE(random_state=random_state),
         'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
         'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
     }

     if by == 'ORG':
         X_train, y_train = X, y
     elif by in factories:
         X_train, y_train = factories[by]().fit_resample(X, y)
     else:
         raise Error('Unexpected \'by\' type {}'.format(by))
     self.base_estimator.fit(X_train, y_train)
def balance_smote(X, y, logger_ins, freq_dct, method, seed_num, n_cluster=5):
    """Over-sample ``(X, y)`` with SMOTE or ADASYN and log what was done.

    :param X: features
    :param y: labels
    :param logger_ins: logger exposing ``info``
    :param freq_dct: value passed as ``sampling_strategy`` to the sampler
    :param method: ``"SMOTE"`` selects SMOTE; any other value selects ADASYN
    :param seed_num: value passed as ``random_state`` to the sampler
    :param n_cluster: neighbour count (``k_neighbors`` for SMOTE,
        ``n_neighbors`` for ADASYN)
    :return: ``(X_res, y_res)`` as returned by ``fit_resample``
    """
    # Each message is formatted into a single string before logging. Passing
    # extra positional arguments without a placeholder in the message makes
    # the standard logging module report a formatting error instead of
    # emitting the record.
    if method == "SMOTE":
        logger_ins.info("The sampling method is {}".format(method))
        imbl = SMOTE(sampling_strategy=freq_dct,
                     random_state=seed_num,
                     k_neighbors=n_cluster)
    else:
        imbl = ADASYN(sampling_strategy=freq_dct,
                      random_state=seed_num,
                      n_neighbors=n_cluster)
    logger_ins.info(
        "The sampling method is {} using {} as the number of clusters".format(
            method, n_cluster))
    logger_ins.info("frequencies should match {}".format(freq_dct))

    X_res, y_res = imbl.fit_resample(X, y)
    logger_ins.info('Resampled dataset shape %s' % Counter(y_res))
    return X_res, y_res
Ejemplo n.º 25
0
def makeOverSamplesADASYN(X, y):
    """
    Purpose
    ----------
    Increasing the observation of minority class

    Parameters
    ----------
    X: Independent Variable in DataFrame
    y: Dependent Variable in Pandas DataFrame format
    Returns:
    ----------
    Returns Independent and Dependent variable with resampling minority class
    """
    # The docstring has to be the first statement of the function. It used
    # to follow this import, which left ``__doc__`` empty.
    from imblearn.over_sampling import ADASYN
    X_resampled, y_resampled = ADASYN(random_state=7).fit_sample(X, y)
    return (X_resampled, y_resampled)
Ejemplo n.º 26
0
def oversample(X, y, method="smote", pos_neg_frac=0.5, plot=False):
    """Over-sample ``(X, y)`` up to the requested positive/negative ratio.

    The inputs are first passed through ``verify_pandas``. The current ratio
    is computed as ``np.sum(y) / np.sum(~y)``; when it already reaches
    ``pos_neg_frac`` the data is returned as is.

    :param X: features
    :param y: labels; ``~y`` is applied to them
    :param method: ``"smote"``, ``"adasyn"``, ``"randomoversampler"``,
        ``"smoteenn"``, ``"smotetomek"``, or ``None`` for no resampling
    :param pos_neg_frac: value passed as ``sampling_strategy`` to the sampler
    :param plot: when true, ``plot_imbalance`` and ``plot_reduced_dim`` are
        called with the data before and after resampling
    :return: ``(X_res, y_res)`` as a DataFrame with the columns of ``X`` and
        a Series, or the verified ``(X, y)`` when nothing was resampled
    :raises ValueError: for an unknown ``method``
    """
    from imblearn.over_sampling import SMOTE
    from imblearn.over_sampling import ADASYN
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.combine import SMOTEENN
    from imblearn.combine import SMOTETomek

    sampler_classes = {
        "smote": SMOTE,
        "adasyn": ADASYN,
        "randomoversampler": RandomOverSampler,
        "smoteenn": SMOTEENN,
        "smotetomek": SMOTETomek,
    }

    X, y = verify_pandas(X, y)
    current_frac = np.sum(y) / np.sum(~y)
    if pos_neg_frac <= current_frac:
        print(
            "Oversampling isn't need since Pos/Neg current = %.3f is greater than passed Pos/Neg ratio = %.3f"
            % (current_frac, pos_neg_frac))
        return X, y

    cols = X.columns
    if method is None:
        return X, y
    if method not in sampler_classes:
        raise ValueError("Over sampler not found")
    sampler = sampler_classes[method](sampling_strategy=pos_neg_frac)

    # Resample deep copies of the verified inputs.
    X_res, y_res = sampler.fit_resample(X.copy(deep=True), y.copy(deep=True))
    X_res = pd.DataFrame(X_res, columns=cols)
    y_res = pd.Series(y_res)

    if plot:
        print("=" * 100 + "\nPlotting Imbalance and Noise after Oversampling")
        plot_imbalance(y, y_res)
        plot_reduced_dim(X,
                         y,
                         X_res,
                         y_res,
                         title1="Before Oversampling",
                         title2="After Oversampling")
    return X_res, y_res
Ejemplo n.º 27
0
    def sample(self, nb_data_to_load, mode='combine'):
        """Load data, resample it with the sampler for ``mode`` and save it.

        :param nb_data_to_load: forwarded to ``self.load_data``
        :param mode: ``'over'`` (ADASYN), ``'under'`` (TomekLinks) or
            ``'combine'`` (SMOTETomek)
        """
        X, y = self.load_data(nb_data_to_load)

        # All three samplers are built; the one matching ``mode`` is used.
        samplers_by_mode = {
            'over': ADASYN(),
            'under': TomekLinks(),
            'combine': SMOTETomek(),
        }
        chosen = samplers_by_mode.get(mode)
        X_resampled, y_resampled = chosen.fit_resample(X, y)

        # Round datetime and stage (columns 0 and 1) to whole numbers and
        # temperature (column 2) to one decimal.
        for column in (0, 1):
            X_resampled[:, column] = X_resampled[:, column].round()
        X_resampled[:, 2] = X_resampled[:, 2].round(1)

        self.save_data(self.file_to_save, X_resampled, y_resampled)
Ejemplo n.º 28
0
def load_data(mode: str, normalize: bool = True):
    """Load the dataset, optionally resample it and standardise the features.

    The ``'earnings'`` column becomes the integer label vector ``y`` and is
    removed from the frame; the remaining columns form the float matrix
    ``x``.

    :param mode: ``'vanilla'`` (no resampling), ``'smote'``, ``'adasyn'``,
        ``'bordersmote'``, ``'randomover'``, ``'randomunder'``, ``'tomek'``
        or ``'knn'``; any other value also leaves the data unresampled
    :param normalize: when true, the column mean is subtracted from ``x``
        and the result is divided by the column standard deviation
    :return: ``(x, y, hidden)``; for the modes that return sample indices,
        ``hidden`` is re-indexed with them
    """
    df, hidden_df = __load_data_first_time()

    # Split the label column off the feature frame.
    y = np.array(df.pop('earnings').to_numpy(), dtype=int)
    x = np.array(df.to_numpy(), dtype=float)
    hidden = hidden_df.to_numpy()

    # Samplers returning (x, y) only.
    plain = {
        'smote': lambda: SMOTE(),
        'adasyn': lambda: ADASYN(),
        'bordersmote': lambda: BorderlineSMOTE(),
    }
    # Samplers that also return the selected indices, used to keep
    # ``hidden`` aligned with the resampled rows.
    indexed = {
        'randomover': lambda: RandomOverSampler(return_indices=True),
        'randomunder': lambda: RandomUnderSampler(return_indices=True),
        'tomek': lambda: TomekLinks(return_indices=True),
        'knn': lambda: CondensedNearestNeighbour(return_indices=True,
                                                 n_neighbors=3),
    }

    if mode in plain:
        x, y = plain[mode]().fit_sample(x, y)
    elif mode in indexed:
        x, y, idxs = indexed[mode]().fit_sample(x, y)
        hidden = hidden[idxs]

    if normalize:
        x -= np.mean(x, axis=0)
        x /= np.std(x, axis=0)

    return x, y, hidden
def get_oversampling_models():
    """Return default-configured over-samplers and their short names.

    :return: ``(models, names)``, two lists in matching order
    """
    models = [
        RandomOverSampler(),
        SMOTE(),
        BorderlineSMOTE(),
        SVMSMOTE(),
        ADASYN(),
    ]
    names = ['ROS', 'SMOTE', 'BLSMOTE', 'SVMSMOTE', 'ADASYN']
    return models, names
Ejemplo n.º 30
0
    def over_sample(self,
                    features,
                    method="BorderLine",
                    sampling_strategy="minority",
                    random_state=42,
                    k_neighbors=5,
                    n_neighbors=10,
                    kind="borderline-1"):
        """
        Over-sample the stored DataFrame.

        :param features: list, feature column names
        :param method: str, option: ADASYN, BorderLine, Random, SVM
        :param sampling_strategy: str or dict, option: 'minority',
            'not majority', 'all', 'auto', {1:n,0:m}
        :param random_state: int
        :param k_neighbors: int, also used as ``n_neighbors`` for ADASYN
        :param n_neighbors: int, passed as ``m_neighbors`` to
            BorderlineSMOTE and SVMSMOTE
        :param kind: str, borderline-1, borderline-2
        :return: df, a new DataFrame with the resampled features and target;
            for an unsupported ``method`` a message is printed and the
            stored DataFrame is returned unchanged
        """
        X = self._df[features].values
        y = self._df[self._target].values

        print("Original label shape {}".format(Counter(y)))

        # Arguments shared by every supported sampler.
        common = dict(sampling_strategy=sampling_strategy,
                      random_state=random_state)
        if method == "ADASYN":
            overSm = ADASYN(n_neighbors=k_neighbors, **common)
        elif method == "BorderLine":
            overSm = BorderlineSMOTE(k_neighbors=k_neighbors,
                                     m_neighbors=n_neighbors,
                                     kind=kind,
                                     **common)
        elif method == "Random":
            overSm = RandomOverSampler(**common)
        elif method == "SVM":
            overSm = SVMSMOTE(k_neighbors=k_neighbors,
                              m_neighbors=n_neighbors,
                              out_step=0.5,
                              **common)
        else:
            print("不支持{}该抽样方法".format(method))
            return self._df

        X_res, y_res = overSm.fit_resample(X, y)
        print("overSample label shape {}".format(Counter(y_res)))

        # Append the labels as a last column next to the features.
        label_column = y_res.reshape(len(X_res), 1)
        _data = np.concatenate([X_res, label_column], axis=1)
        return pd.DataFrame(data=_data, columns=features + [self._target])
Ejemplo n.º 31
0
def classify(over_sampl, tf_idf, use_idf, pca, alphas, neighbors, slack,
             estimators, portion):
    """
    Load one dataset variant, optionally over-sample the training split, run
    the classifiers and hand their results to ``compare_performance``.

    input:
        over_sampl: string variable to indicate the name of oversampling method
            ("RandomOverSampler", "SMOTE" or "ADASYN"; any other value leaves
            the training data untouched)
        tf_idf: boolean variable to indicate whether to use tf or not
        use_idf: boolean variable to indicate whether to use idf or not
        pca: int variable to indicate whether to use PCA or not (<=0 means no, yes otherwise)
        alphas: NB tuning parameter (not referenced in this function body)
        neighbors: KNN tuning parameter (not referenced in this function body)
        slack: SVM tuning parameter (not referenced in this function body)
        estimators: GradientBoosting, AdaBoost tuning parameter (not referenced
            in this function body)
        portion: which airline data to work with (None means all airlines)

    returns:
        None. The Word2Vec path (``tf_idf`` false) combined with ``pca > 0``
        returns immediately without loading any data.
    """
    if not tf_idf:
        if pca > 0:
            # Word2Vec preprocessing is not combined with PCA here.
            return None
        preprocessing = "Word2Vec"
        pca_part = ""
    else:
        preprocessing = "tf-idf" if use_idf else "tf"
        pca_part = "   PCA dimension = " + str(pca)

    # `portion` is allowed to be None (all airlines), so it must be converted
    # before concatenation; for a str value the message is unchanged.
    message = ("Preprocessing used is " + preprocessing +
               " & Over Sampling method is  " + over_sampl + pca_part +
               "  data portion  " + str(portion))

    # load dataset
    ds = get_dataset()
    X_train, X_test, Y_train, Y_test = ds.load_data(tf_idf=tf_idf,
                                                    use_idf=use_idf,
                                                    use_pca=pca,
                                                    airway_name=portion)

    # Only the training split is over-sampled; the test split stays as loaded.
    if over_sampl == "RandomOverSampler":
        X_train, Y_train = RandomOverSampler().fit_sample(X_train, Y_train)
    elif over_sampl == "SMOTE":
        X_train, Y_train = SMOTE().fit_sample(X_train, Y_train)
    elif over_sampl == "ADASYN":
        X_train, Y_train = ADASYN().fit_sample(X_train, Y_train)

    clas = classifier()
    print(message)
    SVM_result, GB_result, AB_result, KNN_result, NB_result = clas.classify(
        X_train, X_test, Y_train, Y_test)
    compare_performance(SVM_result, GB_result, AB_result, KNN_result,
                        NB_result, message)
Ejemplo n.º 32
0
def get_sampling_technique(random_state=123):
    """
    Build the resampling strategies to compare.

    :param random_state: seed passed to every sampler (default 123)
    :return: ``(sampling_technique, sampling_name)`` - two parallel lists, the
        first holding the sampler / pipeline objects and the second their
        display names, in the same order
    """
    # Each pipeline gets its own sampler instances so no estimator object is
    # shared between two entries of the returned list.
    over_under = Pipeline(steps=[
        ('o', RandomOverSampler(random_state=random_state)),
        ('u', RandomUnderSampler(random_state=random_state)),
    ])
    # The 'smote' step must be a SMOTE instance (it previously received the
    # RandomOverSampler, making this entry a duplicate of the one above).
    smote_under = Pipeline(steps=[
        ('smote', SMOTE(random_state=random_state)),
        ('u', RandomUnderSampler(random_state=random_state)),
    ])

    named_samplers = [
        # Over-sampling
        ('RandomOverSampler', RandomOverSampler(random_state=random_state)),
        ('SMOTE', SMOTE(random_state=random_state)),
        ('ADASYN', ADASYN(random_state=random_state)),
        # Under-sampling
        ('RandomUnderSampler', RandomUnderSampler(random_state=random_state)),
        # Combined over- and under-sampling methods
        ('SMOTEENN', SMOTEENN(random_state=random_state)),
        ('SMOTETomek', SMOTETomek(random_state=random_state)),
        ('Over-Under Resampling Combination', over_under),
        ('SMOTE-Under Resampling Combination', smote_under),
    ]

    sampling_technique = [sampler for _, sampler in named_samplers]
    sampling_name = [name for name, _ in named_samplers]
    return sampling_technique, sampling_name
Ejemplo n.º 33
0
def minority_oversample(X_train, Y_train, algorithm='random_oversample'):
    '''
    Resample a training set with the requested over-sampling algorithm.
    :param X_train: Training set features
    :param Y_train: Training set labels
    :param algorithm: One of {"random_oversample", "smote", "adasyn"}; any
                      other value falls back to random over-sampling
    :return: Tuple (X_resampled, Y_resampled) produced by the sampler
    '''
    # A fresh seed in [0, 1000) is drawn from NumPy's global RNG on every call.
    seed = np.random.randint(0, high=1000)

    if algorithm == 'smote':
        sampler = SMOTE(random_state=seed)
    elif algorithm == 'adasyn':
        sampler = ADASYN(random_state=seed)
    else:
        # Covers 'random_oversample' as well as unrecognised names.
        sampler = RandomOverSampler(random_state=seed)

    X_resampled, Y_resampled = sampler.fit_resample(X_train, Y_train)
    print("Train set shape before oversampling: ", X_train.shape,
          " Train set shape after resampling: ", X_resampled.shape)
    return X_resampled, Y_resampled
Ejemplo n.º 34
0
def get_data_for_prediction(train,
                            test,
                            best_feature,
                            sampling_strategy=0.4,
                            random_state=21):
    """
    Scale the selected features, balance the training data with ADASYN and
    return ``(x_train_bal, y_train_bal, x_test)``.

    The scaler is fitted on the training features only and then reused to
    transform the test features; only the training data is resampled.
    """
    scaler = StandardScaler()

    train_features = train.drop(['user_id', 'is_churned'],
                                axis=1)[best_feature]
    x_train = scaler.fit_transform(train_features)
    y_train = train['is_churned']

    balancer = ADASYN(random_state=random_state,
                      sampling_strategy=sampling_strategy)
    x_train_bal, y_train_bal = balancer.fit_sample(x_train, y_train)

    test_features = test.drop(['user_id'], axis=1)[best_feature]
    x_test = scaler.transform(test_features)
    return x_train_bal, y_train_bal, x_test
Ejemplo n.º 35
0
def get_best_feature(data, n_feature, random_state=21, sampling_strategy=0.4):
    """
    Return the names of the ``n_feature`` most important feature columns.

    The feature columns (everything except 'user_id' and 'is_churned') are
    scaled, balanced with ADASYN, and used to fit an XGBoost classifier; the
    columns are then ranked by the classifier's ``feature_importances_``.

    :param data: DataFrame containing 'user_id', 'is_churned' and the features
    :param n_feature: number of column names to return
    :param random_state: seed for ADASYN and the classifier
    :param sampling_strategy: ADASYN sampling strategy
    :return: list of column names, most important first
    """
    features = data.drop(['user_id', 'is_churned'], axis=1)
    x = StandardScaler().fit_transform(features)
    y = data['is_churned']

    x_bal, y_bal = ADASYN(random_state=random_state,
                          sampling_strategy=sampling_strategy).fit_sample(
                              x, y)

    clf = xgb.XGBClassifier(n_estimators=100,
                            learning_rate=0.1,
                            random_state=random_state,
                            n_jobs=-1)
    clf.fit(x_bal, y_bal)

    # feature_importances_ holds one score per input column, in column order.
    # The previous `importances[0][:n_feature]` sliced a scalar and raised.
    # sorted() is stable, so columns with equal scores keep their order.
    ranked = sorted(zip(features.columns, clf.feature_importances_),
                    key=lambda pair: pair[1],
                    reverse=True)
    return [name for name, _ in ranked[:n_feature]]
def over_sampling_shift(x, y, delta=0.5, mode='smote', n_neighbors=5):
    """
    Under-sample with ``under_sampling_shift`` and then over-sample the result
    back towards the class counts of the original ``y``.

    :param x: feature data
    :param y: labels; their per-class counts become the over-sampling target
    :param delta: forwarded to ``under_sampling_shift``
    :param mode: 'smote' or 'adasyn'
    :param n_neighbors: upper bound on the neighbour count given to the sampler
    :return: tuple (x_resampled, y_resampled)
    """
    assert mode in ('smote', 'adasyn')

    # Class sizes of the original labels, used as the sampling strategy.
    target_counts = Counter(np.squeeze(y))

    x_out, y_out = under_sampling_shift(x, y, delta=delta)

    # The neighbour count is capped at (smallest class size - 1).
    smallest_class = np.min(list(Counter(y_out).values()))
    k = min(n_neighbors, smallest_class - 1)

    if mode == 'smote':
        sampler = SMOTE(sampling_strategy=target_counts, k_neighbors=k)
    elif mode == 'adasyn':
        sampler = ADASYN(sampling_strategy=target_counts, n_neighbors=k)
    else:
        # Only reachable when assertions are disabled.
        return x_out, y_out

    x_out, y_out = sampler.fit_resample(x_out, y_out)
    return x_out, y_out
    def _check_imbalance(self,
                         method: str = 'SMOTE',
                         random_seed: int = 1769) -> dict:
        """
        Over-sample ``self.raw_data`` and report class counts before and after.

        Sets ``self.feature_columns`` to every column of ``self.raw_data``
        other than the output column (in DataFrame order), resamples features
        and output, and pickles both results under ``./data/output/``.
        Currently, we are only using Oversampling.

        :param method: type of sampling to be done: 'SMOTE' or 'ADASYN'; any
            other value (e.g. 'RANDOM') selects RandomOverSampler
        :param random_seed: random state passed to the sampler
        :return: dict with keys 'before_sampling_counter',
            'after_sampling_counter', 'feature_data' and 'output_resampled'
        """
        target = self.output_column
        output = self.raw_data[target]

        # `set("label")` would be a set of characters, which left the target
        # column inside the features. Treat a scalar as one column label and
        # keep supporting a collection of labels.
        if isinstance(target, (list, tuple, set, frozenset)):
            excluded = set(target)
        else:
            excluded = {target}
        self.feature_columns = [
            column for column in self.raw_data.columns
            if column not in excluded
        ]
        features = self.raw_data[self.feature_columns]

        before_sampling = Counter(output)

        if method == 'SMOTE':
            sampler = SMOTE(sampling_strategy='auto',
                            random_state=random_seed,
                            n_jobs=-1)
        elif method == 'ADASYN':
            sampler = ADASYN(sampling_strategy='auto',
                             random_state=random_seed,
                             n_jobs=-1)
        else:
            sampler = RandomOverSampler(sampling_strategy='auto',
                                        random_state=random_seed)

        features_resampled, output_resampled = sampler.fit_resample(
            features, output)

        # Context managers make sure both files are flushed and closed.
        with open("./data/output/feature.pkl", 'wb') as feature_file:
            pickle.dump(features_resampled, feature_file)
        with open("./data/output/output.pkl", 'wb') as output_file:
            pickle.dump(output_resampled, output_file)

        after_sampling = Counter(output_resampled)

        return {
            'before_sampling_counter': before_sampling,
            'after_sampling_counter': after_sampling,
            'feature_data': features_resampled,
            'output_resampled': output_resampled
        }
Ejemplo n.º 38
0
def test_ada_fit_sampling_strategy_error():
    """fit_resample must raise ValueError("No samples will be generated.")
    for the dict sampling strategy {0: 9, 1: 12}."""
    sampler = ADASYN(sampling_strategy={0: 9, 1: 12}, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_resample(X, Y)
Ejemplo n.º 39
0
from imblearn.over_sampling import ADASYN

# Generate an imbalanced two-class dataset (class weights 0.1 / 0.9)
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply ADASYN over-sampling, then project the resampled data with the PCA
# that was fitted on the original data
ada = ADASYN()
X_resampled, y_resampled = ada.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: the resampled data (only class 0 is drawn in this excerpt)
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
Ejemplo n.º 40
0
def test_adasyn_error(adasyn_params, err_msg):
    """An ADASYN built from ``adasyn_params`` must raise a ValueError matching
    ``err_msg`` when fit_resample is called."""
    sampler = ADASYN(**adasyn_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
Ejemplo n.º 41
0
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Define X (every column except 'state') and y (the 'state' column)
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# test_size=0.3 sets the size of the test split
# sklearn.model_selection.train_test_split randomly splits the data into a training set and a test set
# Usage: train_test_split(train_data, train_target, test_size=<number>, random_state=0)

# ADASYN: over-sample the training split only; the test split is left as is
ada = ADASYN()
os_X,os_y = ada.fit_sample(X_train,y_train)
os_X = pd.DataFrame(os_X)
os_y = pd.DataFrame(os_y)

# Logistic regression on the over-sampled training data
# NOTE(review): printing_Kfold_scores presumably returns the best C value - confirm against its definition
best_c = printing_Kfold_scores(os_X,os_y)
clf_l = LogisticRegression(C = best_c, penalty = 'l1')
clf_l.fit(os_X,os_y.values.ravel())
y_pred = clf_l.predict(X_test)
# ravel() is called to turn the single-column label matrix into a 1-D array.
# (Difference between ravel() and flatten())
# Both do the same job (reduce a multi-dimensional array to one dimension);
# they differ in whether a copy or a view is returned:
# numpy flatten() returns a copy, so changes to it do not affect the original matrix,
# while numpy.ravel() returns a view where possible, so changes to it can affect the original matrix.
Ejemplo n.º 42
0
def test_ada_wrong_nn_obj():
    """Passing the string 'rnd' as n_neighbors must make fit_sample raise a
    ValueError matching "has to be one of"."""
    sampler = ADASYN(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
Ejemplo n.º 43
0
def test_ada_fit_ratio_error():
    """fit_sample must raise ValueError("No samples will be generated.") for
    the dict ratio {0: 9, 1: 12}."""
    sampler = ADASYN(ratio={0: 9, 1: 12}, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_sample(X, Y)
Ejemplo n.º 44
0
def test_ada_fit():
    ada = ADASYN(random_state=RND_SEED)
    ada.fit(X, Y)
    assert ada.ratio_ == {0: 4, 1: 0}