def undersampling(X, y, sampling_strategy='auto', n_neighbors=1, n_jobs=36):
    """Under-sample ``(X, y)`` with One-Sided Selection and return copies.

    Parameters
    ----------
    X, y :
        Features and targets, handed unchanged to the sampler.
    sampling_strategy :
        Forwarded to ``OneSidedSelection``.
    n_neighbors :
        Forwarded to ``OneSidedSelection``.
    n_jobs : int, default=36
        Number of parallel jobs forwarded to ``OneSidedSelection``. The
        default keeps the value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(X_us, y_us)``: copies of the resampled features and targets.
    """
    sampler = OneSidedSelection(n_jobs=n_jobs,
                                sampling_strategy=sampling_strategy,
                                n_neighbors=n_neighbors)
    # ``fit_resample`` is the supported entry point for samplers that accept
    # ``sampling_strategy``; ``fit_sample`` was only a legacy alias.
    X_us, y_us = sampler.fit_resample(X, y)

    return X_us.copy(), y_us.copy()
def test_oss_sample_wrong_X():
    """Check that a RuntimeError is raised when the data given to ``sample``
    differ from the data used for fitting"""

    # Fit the sampler on the reference data
    sampler = OneSidedSelection(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data unrelated to what the sampler was fitted on
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_oss_sample_wrong_X():
    """Sampling with data that differ from the fitted data must raise a
    RuntimeError"""

    fitted = OneSidedSelection(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Random features and a balanced binary target, unrelated to the fit
    unseen_X = np.random.random((100, 40))
    unseen_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted.sample, unseen_X, unseen_y)
Beispiel #4
0
def test_oss_fit_sample():
    """Compare the output of ``fit_sample`` with the stored reference arrays"""

    # Resample the data
    sampler = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays are stored in the 'data' folder next to this file
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, 'data', 'oss_x.npy'))
    expected_y = np.load(os.path.join(here, 'data', 'oss_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_oss_fit_sample():
    """The resampled data must match the arrays saved on disk"""

    under_sampler = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = under_sampler.fit_sample(X, Y)

    # Ground truth lives in <this directory>/data
    data_dir = os.path.dirname(os.path.abspath(__file__))
    X_reference = np.load(os.path.join(data_dir, 'data', 'oss_x.npy'))
    y_reference = np.load(os.path.join(data_dir, 'data', 'oss_y.npy'))

    assert_array_equal(X_resampled, X_reference)
    assert_array_equal(y_resampled, y_reference)
def test_oss_fit():
    """Check the statistics stored on the sampler by ``fit``"""

    sampler = OneSidedSelection(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels recorded during the fit
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class counts recorded during the fit
    for label, expected_count in ((0, 6), (1, 9)):
        assert_equal(sampler.stats_c_[label], expected_count)
Beispiel #7
0
def test_oss_fit_resample():
    """Check ``fit_resample`` against hard-coded expected arrays."""
    sampler = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0] * 6 + [1] * 6)
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_oss_fit():
    """Fitting must populate the class statistics of the sampler"""

    fitted = OneSidedSelection(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Each entry pairs the value read from the sampler with its expectation
    checks = (
        (fitted.min_c_, 0),
        (fitted.maj_c_, 1),
        (fitted.stats_c_[0], 6),
        (fitted.stats_c_[1], 9),
    )
    for actual, expected in checks:
        assert_equal(actual, expected)
Beispiel #9
0
def one_sided_selection(X,
                        y,
                        visualize=False,
                        pca2d=True,
                        pca3d=True,
                        tsne=True,
                        pie_evr=True):
    """Resample ``(X, y)`` with One-Sided Selection (``random_state=42``).

    Parameters
    ----------
    X, y :
        Features and targets passed to ``OneSidedSelection.fit_resample``.
    visualize : bool, default=False
        When truthy, plot the resampled data with
        ``hist_over_and_undersampling`` and ``pca_general``.
    pca2d, pca3d, pie_evr : bool
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    oss = OneSidedSelection(random_state=42)
    X_res, y_res = oss.fit_resample(X, y)
    # Plain truthiness test instead of ``== True`` so any truthy flag works.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_multiclass_error():
    """Check that fitting warns (UserWarning) when the target is not of
    binary type."""

    non_binary_targets = (
        np.linspace(0, 1, 15),                    # continuous case
        np.array([0] * 10 + [1] * 3 + [2] * 2),   # multiclass case
    )
    for y in non_binary_targets:
        sampler = OneSidedSelection(random_state=RND_SEED)
        assert_warns(UserWarning, sampler.fit, X, y)
def test_oss_fit_resample():
    """``fit_resample`` must return the hard-coded reference arrays."""
    under_sampler = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = under_sampler.fit_resample(X, Y)

    # Six expected rows for label 0 followed by six for label 1
    rows_label_0 = [[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                    [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                    [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]
    rows_label_1 = [[-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                    [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                    [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]
    X_expected = np.array(rows_label_0 + rows_label_1)
    y_expected = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
    def use_OSSSMOTEENN(self):
        """Rebalance the data at ``self.path`` with OSS followed by SMOTE+ENN.

        Loads ``(X, y)`` through ``preparation(self.path)``, shows a bar
        chart of the label counts before resampling, after One-Sided
        Selection and after SMOTEENN, writes the final data to a CSV and
        plots a 2-component PCA projection of the result.

        Returns
        -------
        str
            Path of the CSV that was written (``self.path`` with ``.csv``
            replaced by ``_OSSSMOTEENN_Final_Test.csv``).
        """
        X, y = preparation(self.path)

        # Label counts before any resampling
        pd.DataFrame(y).value_counts().plot(kind='bar', title='Count(label)')
        plt.show()

        # Step 1: One-Sided Selection on the majority class.
        # ``fit_resample`` is the supported entry point for samplers that
        # accept ``sampling_strategy``; ``fit_sample`` was a legacy alias.
        oss = OneSidedSelection(random_state=42, n_jobs=-1,
                                sampling_strategy="majority")
        X_res, y_res = oss.fit_resample(X, y)

        pd.DataFrame(y_res).value_counts().plot(kind='bar',
                                                title='Count(label)')
        plt.show()

        # Step 2: SMOTE + ENN on the output of step 1
        sme = SMOTEENN(random_state=42, n_jobs=-1)
        X_sme, y_sme = sme.fit_resample(X_res, y_res)

        pd.DataFrame(y_sme).value_counts().plot(kind='bar',
                                                title='Count(label)')
        plt.show()

        # Write features and labels side by side. Computed once so the file
        # written and the path returned cannot drift apart.
        out_path = self.path.replace('.csv', '_OSSSMOTEENN_Final_Test.csv')
        # NOTE(review): pd.concat needs pandas objects, so this presumably
        # relies on ``preparation`` returning a DataFrame for X - confirm.
        df = pd.concat([X_sme, pd.DataFrame(y_sme)], axis=1)
        # NOTE(review): the original author warned that "the first line of
        # data will be deleted" - presumably by readers that treat the first
        # row of this header-less file as a header; confirm.
        df.to_csv(out_path, index=None, header=None, float_format='%.4f')

        # 2-D PCA projection of the final resampled data
        pca = PCA(n_components=2)
        X_projected = pca.fit_transform(X_sme)
        plot_2d_space(X_projected, y_sme, 'SMOTE + ENN')

        return out_path




# if __name__ == '__main__':
#     path ="++Final_Test++_pre.csv"
#     #draw_bar(path)
#     mhi = My_handle_imbalance(path)
#     mhi.use_OSSSMOTEENN()
#
#     #use_SMOTETomek(path)
#     #draw_origin(path)
Beispiel #13
0
def test_oss_fit_sample_with_indices():
    """Check ``fit_sample`` when the sampler also returns indices"""

    # Resample the data, asking for the indices as a third output
    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays are stored in the 'data' folder next to this file
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, 'data', 'oss_x.npy'))
    expected_y = np.load(os.path.join(here, 'data', 'oss_y.npy'))
    expected_idx = np.load(os.path.join(here, 'data', 'oss_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_oss_fit_sample_with_indices():
    """Resampled data and indices must match the arrays saved on disk"""

    under_sampler = OneSidedSelection(return_indices=True,
                                      random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = under_sampler.fit_sample(X, Y)

    # Ground truth lives in <this directory>/data
    base_dir = os.path.dirname(os.path.abspath(__file__))
    X_reference = np.load(os.path.join(base_dir, 'data', 'oss_x.npy'))
    y_reference = np.load(os.path.join(base_dir, 'data', 'oss_y.npy'))
    idx_reference = np.load(os.path.join(base_dir, 'data', 'oss_idx.npy'))

    assert_array_equal(X_resampled, X_reference)
    assert_array_equal(y_resampled, y_reference)
    assert_array_equal(idx_under, idx_reference)
def test_oss_fit_sample_with_indices():
    """Check ``fit_sample`` with ``return_indices=True`` against hard-coded
    expected features, labels and indices."""
    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0] * 6 + [1] * 6)
    expected_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_oss_init():
    """Check the attributes of a sampler built with only a seed."""
    sampler = OneSidedSelection(random_state=RND_SEED)

    # (attribute name, expected value) pairs
    expectations = (
        ('n_seeds_S', 1),
        ('n_jobs', 1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expectations:
        assert_equal(getattr(sampler, attribute), expected)
class ResamplingAlgorithms(Enum):
    """Enumeration of resampling algorithms.

    The value of every member is a ``(display name, sampler instance)``
    tuple; the instances are created once, when the class is defined.
    """

    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ("SMOTE+TL", SMOTETomek(random_state=1))
    SMOTE_ENN = ("SMOTE+ENN", SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ("ENN", EditedNearestNeighbours())
    NCL = ("NCL", NeighbourhoodCleaningRule())
    IHT = ("IHT", InstanceHardnessThreshold(random_state=1))
    RENN = ("RENN", RepeatedEditedNearestNeighbours())
    AllKNN = ("AllKNN", AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the first member whose display name equals ``name``.

        Falls back to ``RO`` when no member matches.
        """
        for algorithm in cls:
            display_name = algorithm.value[0]
            if display_name == name:
                return algorithm
        return cls.RO
Beispiel #18
0
def under_sampling(X, y, method):
    """Under-sample ``(X, y)`` with the sampler named by ``method``.

    Parameters
    ----------
    X, y :
        Features and targets passed to the sampler's ``fit_resample``.
    method : str
        One of 'ClusterCentroids', 'RandomUnderSampler', 'NearMiss',
        'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
        'AllKNN', 'NeighbourhoodCleaningRule' or 'OneSidedSelection'.
        The sampler is built with its default arguments.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError`` on the return statement).
    """
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
    elif method == 'NearMiss':
        model = NearMiss()
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
    elif method == 'AllKNN':
        model = AllKNN()
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
    else:
        raise ValueError(
            'Unknown under-sampling method: {!r}'.format(method))

    # Every sampler exposes the same call, so resample once, after selection.
    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
def test_oss_with_wrong_object():
    """A ValueError is expected when ``n_neighbors`` is an invalid object"""

    # A string is passed where ``n_neighbors`` is expected
    sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors='rnd')
    assert_raises(ValueError, sampler.fit_sample, X, Y)
def resampling(train_data, train_labels, resampling_type, resampling_stragey):
    """Flatten ``train_data`` per sample and resample it with one strategy.

    Parameters
    ----------
    train_data : array
        Array whose first axis indexes samples; all remaining axes are
        flattened into one feature axis before resampling.
    train_labels :
        Object exposing ``.values`` (the labels handed to the sampler).
    resampling_type : str
        'SMOTE', 'over_sampling', 'under_sampling', 'tomelinks' (the
        correctly spelled 'tomeklinks' is accepted too),
        'near_miss_neighbors' or 'one_sided_selection'.
    resampling_stragey :
        ``sampling_strategy`` forwarded to the random over/under samplers
        and to Tomek links. The other types do not use it.

    Returns
    -------
    tuple
        ``(train_data_resampled, train_labels_resampled)``; the data stay
        flattened to two dimensions.

    Raises
    ------
    ValueError
        If ``resampling_type`` is not supported (previously this surfaced
        as an ``UnboundLocalError`` on the return statement).
    """
    # ``-1`` flattens all trailing axes; for 4-D input this equals the former
    # explicit shape[1] * shape[2] * shape[3] product.
    train_data_new = np.reshape(train_data, (train_data.shape[0], -1))

    if resampling_type == 'SMOTE':
        sampler = SMOTE(random_state=42)
    elif resampling_type == 'over_sampling':
        sampler = RandomOverSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'under_sampling':
        sampler = RandomUnderSampler(sampling_strategy=resampling_stragey)
    elif resampling_type in ('tomelinks', 'tomeklinks'):
        # 'tomelinks' is the historical spelling and is kept for callers.
        sampler = TomekLinks(sampling_strategy=resampling_stragey)
    elif resampling_type == 'near_miss_neighbors':
        sampler = NearMiss(version=1, n_neighbors=3)
    elif resampling_type == 'one_sided_selection':
        sampler = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    else:
        raise ValueError(
            'Unknown resampling_type: {!r}'.format(resampling_type))

    train_data_resampled, train_labels_resampled = sampler.fit_resample(
        train_data_new, train_labels.values)
    return train_data_resampled, train_labels_resampled
Beispiel #21
0
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the sampler selected by name.

    Prints the list of available sampler names, the sorted class counts
    before and after resampling, and a completion banner.

    Parameters
    ----------
    X, y :
        Features and targets passed to ``fit_resample``.
    sampler : str, default="RandomUnderSampler"
        Name of the sampler to use; an unknown name raises ``KeyError``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.
    """
    # Names of all samplers, handy when iterating over every one of them
    available_names = [
        'RandomUnderSampler', 'ClusterCentroids', 'NearMiss',
        'InstanceHardnessThreshold', 'CondensedNearestNeighbour',
        'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
        'AllKNN', 'NeighbourhoodCleaningRule', 'OneSidedSelection',
    ]
    print(available_names)

    # All samplers are built with their default parameters; the caller
    # picks one of them by name.
    registry = dict(
        RandomUnderSampler=RandomUnderSampler(),
        ClusterCentroids=ClusterCentroids(),
        NearMiss=NearMiss(),
        InstanceHardnessThreshold=InstanceHardnessThreshold(),
        CondensedNearestNeighbour=CondensedNearestNeighbour(),
        EditedNearestNeighbours=EditedNearestNeighbours(),
        RepeatedEditedNearestNeighbours=RepeatedEditedNearestNeighbours(),
        AllKNN=AllKNN(),
        NeighbourhoodCleaningRule=NeighbourhoodCleaningRule(),
        OneSidedSelection=OneSidedSelection(),
    )
    chosen = registry[sampler]

    # Class counts before resampling
    counts_before = sorted(Counter(y).items())
    print("before", counts_before)

    resampled = chosen.fit_resample(X, y)
    X_resampled, y_resampled = resampled

    # Class counts after resampling
    counts_after = sorted(Counter(y_resampled).items())
    print("after", counts_after)

    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled
def test_oss_sample_wt_fit():
    """Calling ``sample`` on a sampler that was never fitted must raise a
    RuntimeError"""

    # Deliberately skip the call to fit
    unfitted = OneSidedSelection(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
Beispiel #23
0
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    '''
    Train and test the classifier ``clf`` with shuffled, stratified 10-fold
    cross-validation and print the confusion counts summed over all folds.

    ``sampling`` names the resampling applied to each training fold
    ('SMOTE', 'ADASYN', 'ENN', 'Tomek', 'SMOTETomek', 'SMOTEENN', 'NCR' or
    'OSS'); any other value leaves the training fold untouched.
    If ``normalize`` is True, a StandardScaler fitted on the training fold is
    applied to both the training and the test fold.
    Nothing is returned.
    '''
    print('----------{} with {}----------'.format(clf_name, sampling))
    true_pos = false_pos = false_neg = true_neg = 0
    # Only referenced by the disabled decision-tree plotting hook below
    plot_ind = randint(0, 9)
    folds = StratifiedKFold(n_splits=10, shuffle=True)
    for fold_no, (train_index, test_index) in enumerate(
            folds.split(usx, usy)):
        x_train, y_train = usx[train_index], usy[train_index]
        x_test, y_test = usx[test_index], usy[test_index]

        # Build only the resampler that was asked for
        if sampling == 'SMOTE':
            resampler = SMOTE(sampling_strategy=0.3)
        elif sampling == 'ADASYN':
            resampler = ADASYN(sampling_strategy=0.3)
        elif sampling == 'ENN':
            resampler = EditedNearestNeighbours()
        elif sampling == 'Tomek':
            resampler = TomekLinks()
        elif sampling == 'SMOTETomek':
            resampler = SMOTETomek(sampling_strategy=0.3)
        elif sampling == 'SMOTEENN':
            resampler = SMOTEENN(sampling_strategy=0.3)
        elif sampling == 'NCR':
            resampler = NeighbourhoodCleaningRule()
        elif sampling == 'OSS':
            resampler = OneSidedSelection()
        else:
            resampler = None
        if resampler is not None:
            x_train, y_train = resampler.fit_resample(x_train, y_train)

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train, x_test = (scaler.transform(x_train),
                               scaler.transform(x_test))

        clf.fit(x_train, y_train)

        # Disabled hook: plot the tree of one randomly chosen fold
        # if plot_ind == fold_no and clf_name == 'DecisionTreeClassifier':
        #     plot_decision_tree(clf)

        y_predict = clf.predict(x_test)

        # Accumulate the confusion counts of this fold
        for actual, predicted in zip(y_test, y_predict):
            if predicted:
                if actual:
                    true_pos += 1
                else:
                    false_pos += 1
            else:
                if actual:
                    false_neg += 1
                else:
                    true_neg += 1

    print('TOTAL TP: ' + str(true_pos))
    print('TOTAL FP: ' + str(false_pos))
    print('TOTAL FN: ' + str(false_neg))
    print('TOTAL TN: ' + str(true_neg))
Beispiel #24
0
	def under_sampling(self, data, label, n_neighbors=5, method=None):
		"""Under-sample ``data``/``label`` with the requested method.

		Input
			data: 2D array data (im_height*im_width, num of band)
			label: 1D array label(0,1,2...) per each data
			n_neighbors: num of neighbors used in OSS
			method: select under sampling method (OSS)
		Output
			under sampled data, label; both are returned unchanged when
			``method`` is not listed in ``self.under_method``
		"""
		# Guard clause: nothing to do for methods this object does not list
		if method not in self.under_method:
			return data, label

		print("Before sampling label proportion: ", Counter(label))
		if method in ('OSS', 'OneSidedSelection'):
			sampler = OneSidedSelection(n_neighbors=n_neighbors, n_seeds_S=200)
			data, label = sampler.fit_resample(data, label)
		print("After sampling label proportion: ", Counter(label))

		return data, label
def test_oss_fit_single_class():
    """Fitting on a target with a single class must emit a UserWarning"""

    sampler = OneSidedSelection(random_state=RND_SEED)
    # Invalid target: every sample carries the same label
    n_samples = X.shape[0]
    constant_target = np.zeros((n_samples, ))
    assert_warns(UserWarning, sampler.fit, X, constant_target)
def test_oss_init():
    """Check the attributes set when the object is created"""

    under_sampler = OneSidedSelection(random_state=RND_SEED)

    # Each entry pairs the value read from the sampler with its expectation
    checks = (
        (under_sampler.n_seeds_S, 1),
        (under_sampler.n_jobs, 1),
        (under_sampler.random_state, RND_SEED),
    )
    for actual, expected in checks:
        assert_equal(actual, expected)
def test_oss_fit_sample_with_indices():
    """Check ``fit_sample`` when the sampler also returns indices"""

    # Resample the data, asking for the indices as a third output
    under_sampler = OneSidedSelection(return_indices=True,
                                      random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = under_sampler.fit_sample(X, Y)

    # Six expected rows for label 0 followed by six for label 1
    rows_label_0 = [[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                    [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                    [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]
    rows_label_1 = [[-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                    [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                    [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]
    X_expected = np.array(rows_label_0 + rows_label_1)
    y_expected = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    idx_expected = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 7, 10])

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
    assert_array_equal(idx_under, idx_expected)
Beispiel #28
0
    def resample(self, X, y, by, random_state=None, visualize=False):
        '''
        Re-sample ``(X, y)`` and return ``(X_train, y_train)``.

        by: String
            The method used to perform re-sampling
            currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
                'ORG']
            'ORG' returns the data untouched.
        random_state:
            Passed as ``random_state`` to the selected sampler.
        visualize: Boolean
            When true, draw a scatter plot of columns 0 and 1 of the
            returned data, coloured by label.
        '''
        seed = random_state
        # Builders are wrapped in lambdas so that only the requested sampler
        # class is looked up and instantiated.
        builders = (
            ('RUS', lambda: RandomUnderSampler(random_state=seed)),
            ('CNN', lambda: CondensedNearestNeighbour(random_state=seed)),
            ('ENN', lambda: EditedNearestNeighbours(random_state=seed)),
            ('NCR', lambda: NeighbourhoodCleaningRule(random_state=seed)),
            ('Tomek', lambda: TomekLinks(random_state=seed)),
            ('ALLKNN', lambda: AllKNN(random_state=seed)),
            ('OSS', lambda: OneSidedSelection(random_state=seed)),
            ('NM', lambda: NearMiss(random_state=seed)),
            ('CC', lambda: ClusterCentroids(random_state=seed)),
            ('SMOTE', lambda: SMOTE(random_state=seed)),
            ('ADASYN', lambda: ADASYN(random_state=seed)),
            ('BorderSMOTE', lambda: BorderlineSMOTE(random_state=seed)),
            ('SMOTEENN', lambda: SMOTEENN(random_state=seed)),
            ('SMOTETomek', lambda: SMOTETomek(random_state=seed)),
            ('ORG', lambda: None),
        )
        for key, build in builders:
            if by == key:
                sampler = build()
                break
        else:
            raise Error('Unexpected \'by\' type {}'.format(by))

        if by == 'ORG':
            X_train, y_train = X, y
        else:
            X_train, y_train = sampler.fit_resample(X, y)

        if visualize:
            frame = pd.DataFrame(X_train)
            frame['label'] = y_train
            plot_title = '{} training set'.format(by)
            frame.plot.scatter(x=0, y=1, c='label', s=3,
                               colormap='coolwarm', title=plot_title)
        return X_train, y_train
Beispiel #29
0
def get_under_sample_models():
    """Return two parallel lists: under-sampler instances and their labels.

    Every sampler is built with its default arguments.
    """
    labelled_models = [
        (TomekLinks(), 'TomesLinks'),
        (EditedNearestNeighbours(), 'EditedNearestNeighbors'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OneSidedSelection'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    ]
    models = [model for model, _ in labelled_models]
    names = [label for _, label in labelled_models]
    return models, names
def test_oss_with_object():
    """Check ``fit_sample`` when ``n_neighbors`` is a KNN object, then an
    integer; both runs must give the same expected arrays"""

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0] * 6 + [1] * 6)

    # First a classifier object, then the plain integer
    for knn in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, expected_X)
        assert_array_equal(y_resampled, expected_y)
Beispiel #31
0
    def resample(self, X, y, by, random_state=None):
        '''
        Re-sample ``(X, y)`` and return ``(X_train, y_train)``.

        by: String
            The method used to perform re-sampling
            currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'ROS', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
                'SMOTETomek', 'ORG']
            'ORG' returns the data untouched.
        random_state:
            Passed as ``random_state`` to the selected sampler.
        '''
        seed = random_state
        # Builders are wrapped in lambdas so that only the requested sampler
        # class is looked up and instantiated.
        builders = (
            ('RUS', lambda: RandomUnderSampler(random_state=seed)),
            ('CNN', lambda: CondensedNearestNeighbour(random_state=seed)),
            ('ENN', lambda: EditedNearestNeighbours(random_state=seed)),
            ('NCR', lambda: NeighbourhoodCleaningRule(random_state=seed)),
            ('Tomek', lambda: TomekLinks(random_state=seed)),
            ('ALLKNN', lambda: AllKNN(random_state=seed)),
            ('OSS', lambda: OneSidedSelection(random_state=seed)),
            ('NM', lambda: NearMiss(random_state=seed)),
            ('CC', lambda: ClusterCentroids(random_state=seed)),
            ('ROS', lambda: RandomOverSampler(random_state=seed)),
            ('SMOTE', lambda: SMOTE(random_state=seed)),
            ('ADASYN', lambda: ADASYN(random_state=seed)),
            ('BorderSMOTE', lambda: BorderlineSMOTE(random_state=seed)),
            ('SMOTEENN', lambda: SMOTEENN(random_state=seed)),
            ('SMOTETomek', lambda: SMOTETomek(random_state=seed)),
            ('ORG', lambda: None),
        )
        for key, build in builders:
            if by == key:
                sampler = build()
                break
        else:
            raise Error('Unexpected \'by\' type {}'.format(by))

        if by == 'ORG':
            X_train, y_train = X, y
        else:
            X_train, y_train = sampler.fit_resample(X, y)

        return X_train, y_train
Beispiel #32
0
def test_oss_init():
    """Check the attributes exposed right after the object is created"""

    verbose = True
    sampler = OneSidedSelection(random_state=RND_SEED, verbose=verbose)

    # (attribute name, expected value) pairs
    expected_attributes = (
        ('size_ngh', 1),
        ('n_seeds_S', 1),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
        ('verbose', verbose),
        ('min_c_', None),
        ('maj_c_', None),
        ('stats_c_', {}),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def UnderSample(X, Y, method='Random', random_state=42):
    """Under-sample ``(X, Y)`` with the strategy selected by ``method``.

    Parameters
    ----------
    X :
        Feature data exposing ``size`` and ``reshape``. When
        ``X.size == len(X)`` (one value per sample) it is first reshaped to
        a single column.
    Y :
        Targets passed to the sampler.
    method : str, default='Random'
        One of 'Cluster', 'Random', 'NearMiss_1', 'NearMiss_2',
        'NearMiss_3', 'TomekLinks', 'ENN', 'RENN', 'All_KNN', 'CNN',
        'One_SS', 'NCR' or 'IHT'.
    random_state : default=42
        Forwarded to the selected sampler.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError``).
    """
    if X.size == len(X):
        X = X.reshape(-1, 1)

    # Method names are compared with ``==``. The former ``is`` comparison
    # tested object identity and only matched when the caller's string
    # happened to be the very same interned object.
    if method == 'Cluster':  # estimator=None: default (KMeans) estimator
        sampler = ClusterCentroids(ratio='auto',
                                   random_state=random_state,
                                   estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto',
                                     random_state=random_state,
                                     replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':  # kind_sel may be 'all' or 'mode'
        sampler = EditedNearestNeighbours(ratio='auto',
                                          random_state=random_state,
                                          kind_sel='all')
    elif method == 'RENN':  # kind_sel may be 'all' or 'mode'
        sampler = RepeatedEditedNearestNeighbours(ratio='auto',
                                                  random_state=random_state,
                                                  kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto',
                         random_state=random_state,
                         kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto',
                                            random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto',
                                            random_state=random_state,
                                            kind_sel='all',
                                            threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None,
                                            ratio='auto',
                                            random_state=random_state)
    else:
        raise ValueError(
            'Unknown under-sampling method: {!r}'.format(method))

    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
Beispiel #34
0
def under_sampling_algs():
    """Return a list of ``(sampler, short name)`` pairs.

    The first pair is a placeholder for the "no re-sampling" case and holds
    a string instead of a sampler instance.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def equalize_training_dataset_with_OneSidedSel(x_train, y_train):
    """Under-sample a training set with One-Sided Selection.

    ``x_train`` is flattened to two dimensions for the sampler and the
    resampled data are reshaped back to the original trailing shape.
    The sorted class counts of the result are printed.

    Parameters
    ----------
    x_train : array
        Training data; the first axis indexes samples.
    y_train :
        Labels; classes 0..42 are targeted by the sampler.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)``.
    """
    from imblearn.under_sampling import OneSidedSelection

    original_shape = list(x_train.shape)
    # Samplers work on 2-D data: one row per sample
    x_flat = np.reshape(x_train, (x_train.shape[0], -1))

    # Cleaning samplers such as OneSidedSelection take the *list* of classes
    # to clean rather than a {class: n_samples} dict, so the former per-class
    # target of 180 could not be honoured and is no longer passed.
    target_classes = list(range(0, 43))
    sampler = OneSidedSelection(sampling_strategy=target_classes,
                                n_seeds_S=5,
                                n_jobs=8)
    x_resampled, y_resampled = sampler.fit_resample(x_flat, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the per-sample shape; only the number of samples changed
    original_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(original_shape))

    return x_resampled, y_resampled
def get_models():
    """Return two parallel lists: sampler instances and their short names.

    Every sampler is built with its default arguments.
    """
    catalogue = (
        ('TL', TomekLinks),
        ('ENN', EditedNearestNeighbours),
        ('RENN', RepeatedEditedNearestNeighbours),
        ('OSS', OneSidedSelection),
        ('NCR', NeighbourhoodCleaningRule),
    )
    models = [sampler_cls() for _, sampler_cls in catalogue]
    names = [short_name for short_name, _ in catalogue]
    return models, names
def get_samplers():
    """Return a dict mapping short labels to newly built sampler instances.

    Entries are inserted in three groups: under-samplers, over-samplers,
    then combined under/over samplers.
    """
    under_samplers = {
        'RandomUn': RandomUnderSampler(),
        'TL': TomekLinks(),
        # 'ENN' (EditedNearestNeighbours) is currently disabled
        'RENN': RepeatedEditedNearestNeighbours(),
        'OSS': OneSidedSelection(),
        'NCR': NeighbourhoodCleaningRule(),
        'IHT': InstanceHardnessThreshold(),
    }
    over_samplers = {
        'RandomOv': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'SMOTESVM': SVMSMOTE(),
        # 'SMOTEKMeans' (KMeansSMOTE) is currently disabled
        'ADASYN': ADASYN(),
    }
    # The cleaning step of the combined samplers targets the majority class
    majority_enn = EditedNearestNeighbours(sampling_strategy='majority')
    smote_enn = SMOTEENN(enn=majority_enn)
    majority_tomek = TomekLinks(sampling_strategy='majority')
    smote_tomek = SMOTETomek(tomek=majority_tomek)
    combined_samplers = {
        'SMOTEENN': smote_enn,
        'SMOTETomek': smote_tomek,
    }

    samplers = {}
    for group in (under_samplers, over_samplers, combined_samplers):
        samplers.update(group)
    return samplers
# Show the module docstring when the example is run
print(__doc__)

# Generate a two-class dataset of 200 samples with 20 features; the class
# weights are 0.1 / 0.9
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise it inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply One-Sided Selection; with return_indices=True the call yields a third
# value, unpacked here as idx_resampled
oss = OneSidedSelection(return_indices=True)
X_resampled, y_resampled, idx_resampled = oss.fit_resample(X, y)
# Project the resampled data with the PCA fitted on the original data
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Row indices of the original data that do not appear in idx_resampled
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

# Boolean mask selecting the resampled points labelled 0
idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
Beispiel #39
0
# Colour palette used for the scatter plots below
palette = sns.color_palette()


# Generate a two-class dataset of 5000 samples with 20 features; the class
# weights are 0.1 / 0.9
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise it inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply One-Sided Selection
oss = OneSidedSelection()
X_resampled, y_resampled = oss.fit_sample(X, y)
# Project the resampled data with the PCA fitted on the original data
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: class 0 of the resampled data
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
def test_oss_with_wrong_object():
    """A ValueError matching "has to be a int" is expected when a string is
    given as ``n_neighbors``."""
    sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be a int"):
        sampler.fit_resample(X, Y)