def undersampling(X, y, sampling_strategy='auto', n_neighbors=1, n_jobs=36):
    """Under-sample ``(X, y)`` with One-Sided Selection and return copies.

    Parameters
    ----------
    X, y
        Features and targets, passed unchanged to the sampler.
    sampling_strategy
        Forwarded to ``OneSidedSelection``.
    n_neighbors
        Forwarded to ``OneSidedSelection``.
    n_jobs
        Forwarded to ``OneSidedSelection``. Defaults to 36, the value that
        used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(X_us, y_us)``: copies of the resampled features and targets.
    """
    sampler = OneSidedSelection(
        n_jobs=n_jobs,
        sampling_strategy=sampling_strategy,
        n_neighbors=n_neighbors,
    )
    # ``fit_resample`` replaces the deprecated ``fit_sample`` alias; it exists
    # in every imbalanced-learn release that accepts ``sampling_strategy``.
    X_us, y_us = sampler.fit_resample(X, y)
    return X_us.copy(), y_us.copy()
def test_oss_sample_wrong_X():
    """Sampling data that differs from the fitted data must raise."""
    sampler = OneSidedSelection(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Inputs that deliberately do not match what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_oss_fit_sample():
    """Compare ``fit_sample`` output with the stored ground-truth arrays."""
    sampler = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground truth lives in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'oss_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'oss_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_oss_fit():
    """Fitting must populate the class statistics on the sampler."""
    sampler = OneSidedSelection(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels recorded by ``fit``.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class counts recorded by ``fit``, checked in class order.
    for label, expected_count in ((0, 6), (1, 9)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_oss_fit_resample():
    """Check ``fit_resample`` against the expected samples and labels."""
    sampler = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_rows = [
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ]
    expected_X = np.array(expected_rows)
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def one_sided_selection(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Under-sample ``(X, y)`` with One-Sided Selection (``random_state=42``).

    Parameters
    ----------
    X, y
        Features and targets passed to ``OneSidedSelection.fit_resample``.
    visualize
        When truthy, plot the resampled labels with
        ``hist_over_and_undersampling`` and the resampled data with
        ``pca_general``.
    pca2d, pca3d, pie_evr
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by the sampler.
    """
    oss = OneSidedSelection(random_state=42)
    X_res, y_res = oss.fit_resample(X, y)

    # Truthiness check instead of ``== True`` so any truthy flag enables plots.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
def test_multiclass_error():
    """A ``UserWarning`` must be issued when the target is not binary."""
    non_binary_targets = (
        # continuous case
        np.linspace(0, 1, 15),
        # multiclass case
        np.array([0] * 10 + [1] * 3 + [2] * 2),
    )
    for target in non_binary_targets:
        sampler = OneSidedSelection(random_state=RND_SEED)
        assert_warns(UserWarning, sampler.fit, X, target)
def test_oss_fit_resample():
    """Verify the samples and labels kept by ``fit_resample``."""
    X_resampled, y_resampled = OneSidedSelection(
        random_state=RND_SEED
    ).fit_resample(X, Y)

    # Six samples expected for class 0 followed by six for class 1.
    expected_class_0 = [
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ]
    expected_class_1 = [
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ]
    expected_X = np.array(expected_class_0 + expected_class_1)
    expected_y = np.array([0] * 6 + [1] * 6)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def use_OSSSMOTEENN(self):
    """Resample the data at ``self.path`` with OSS followed by SMOTE+ENN.

    Steps performed:

    1. Load ``X, y`` through ``preparation(self.path)``.
    2. Under-sample with ``OneSidedSelection`` (``sampling_strategy="majority"``).
    3. Resample the result with ``SMOTEENN``.
    4. Write the resampled data to a CSV next to the input.
    5. Plot a 2-component PCA projection of the resampled data.

    A bar chart of the label counts is shown after each of steps 1-3.

    Returns
    -------
    str
        Path of the CSV that was written: ``self.path`` with ``'.csv'``
        replaced by ``'_OSSSMOTEENN_Final_Test.csv'``.
    """
    X, y = preparation(self.path)
    # Computed once; used both for writing the CSV and as the return value.
    output_path = self.path.replace('.csv', '_OSSSMOTEENN_Final_Test.csv')

    # Label distribution before any resampling.
    dy = pd.DataFrame(y)
    dy.value_counts().plot(kind='bar', title='Count(label)')
    plt.show()

    # ``fit_resample`` replaces the deprecated ``fit_sample`` alias; it exists
    # in every imbalanced-learn release that accepts ``sampling_strategy``.
    oss = OneSidedSelection(random_state=42, n_jobs=-1, sampling_strategy="majority")
    X_res, y_res = oss.fit_resample(X, y)
    dy_res = pd.DataFrame(y_res)
    dy_res.value_counts().plot(kind='bar', title='Count(label)')
    plt.show()

    sme = SMOTEENN(random_state=42, n_jobs=-1)
    X_sme, y_sme = sme.fit_resample(X_res, y_res)

    # Label distribution after SMOTE + ENN.
    dy_sme = pd.DataFrame(y_sme)
    dy_sme.value_counts().plot(kind='bar', title='Count(label)')
    plt.show()

    # Generate the CSV.
    # NOTE(review): ``pd.concat`` needs ``X_sme`` to be a pandas object, which
    # depends on what ``preparation`` returns - confirm.
    df = pd.concat([X_sme, pd.DataFrame(y_sme)], axis=1)
    # Original author's note: "the first line of data will be delete".
    df.to_csv(output_path, index=None, header=None, float_format='%.4f')

    # Draw the PCA projection; a separate name keeps the full-dimensional
    # ``X_sme`` intact.
    pca = PCA(n_components=2)
    X_sme_2d = pca.fit_transform(X_sme)
    plot_2d_space(X_sme_2d, y_sme, 'SMOTE + ENN')

    return output_path

# if __name__ == '__main__':
#     path ="++Final_Test++_pre.csv"
#     #draw_bar(path)
#     mhi = My_handle_imbalance(path)
#     mhi.use_OSSSMOTEENN()
#
#     #use_SMOTETomek(path)
#     #draw_origin(path)
def test_oss_fit_sample_with_indices():
    """``fit_sample`` with ``return_indices=True`` must match stored arrays."""
    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Ground truth lives in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'oss_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'oss_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'oss_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_oss_fit_sample_with_indices():
    """Check samples, labels and indices returned with ``return_indices=True``."""
    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_rows = [
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ]
    expected_X = np.array(expected_rows)
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    expected_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_oss_init():
    """Constructor arguments and defaults must be stored on the instance."""
    sampler = OneSidedSelection(random_state=RND_SEED)

    expected_attributes = (
        ('n_seeds_S', 1),
        ('n_jobs', 1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
class ResamplingAlgorithms(Enum):
    """Catalogue of resampling algorithms.

    Each member's value is a ``(display name, sampler instance)`` pair.
    """

    RO = ('Random Over-sampling', RandomOverSampler(random_state=1))
    SMOTE = ('Smote', SMOTE(random_state=1))
    ADASYN = ('ADASYN', ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ('SMOTEBoost', smote_boost.SMOTEBoost())
    RU = ('Random Under-sampling', RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ('ClusterCentroids', ClusterCentroids(random_state=1))
    TOMEK_LINKS = ('TomekLinks', TomekLinks())
    NM1 = ('NM1', NearMiss(version=1))
    NM2 = ('NM2', NearMiss(version=2))
    NM3 = ('NM3', NearMiss(version=3))
    CNN = ('CNN', CondensedNearestNeighbour(random_state=1))
    OSS = ('OneSidedSelection', OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Falls back to ``RO`` when no member matches.
        """
        for algorithm in cls:
            if algorithm.value[0] == name:
                return algorithm
        return cls.RO
def under_sampling(X, y, method):
    """Under-sample ``(X, y)`` with the sampler named by ``method``.

    Parameters
    ----------
    X, y
        Features and targets passed to the sampler's ``fit_resample``.
    method : str
        One of ``'ClusterCentroids'``, ``'RandomUnderSampler'``,
        ``'NearMiss'``, ``'EditedNearestNeighbours'``,
        ``'RepeatedEditedNearestNeighbours'``, ``'AllKNN'``,
        ``'NeighbourhoodCleaningRule'`` or ``'OneSidedSelection'``.
        The sampler is built with its default arguments.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. This used to
        surface as an ``UnboundLocalError`` at the return statement.
    """
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
    elif method == 'NearMiss':
        model = NearMiss()
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
    elif method == 'AllKNN':
        model = AllKNN()
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
    else:
        raise ValueError('Unknown under-sampling method: {!r}'.format(method))

    # Every branch resamples the same way, so do it once here.
    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
def test_oss_with_wrong_object():
    """An invalid ``n_neighbors`` object must make ``fit_sample`` raise."""
    invalid_neighbors = 'rnd'
    sampler = OneSidedSelection(
        random_state=RND_SEED, n_neighbors=invalid_neighbors
    )
    assert_raises(ValueError, sampler.fit_sample, X, Y)
def resampling(train_data, train_labels, resampling_type, resampling_stragey):
    """Flatten ``train_data`` per sample and resample it with the chosen method.

    Parameters
    ----------
    train_data
        Array whose first axis indexes samples. All remaining axes are
        flattened into one feature axis before resampling.
    train_labels
        Object exposing ``.values`` (the labels handed to the sampler).
    resampling_type : str
        One of ``'SMOTE'``, ``'over_sampling'``, ``'under_sampling'``,
        ``'tomelinks'``, ``'near_miss_neighbors'`` or
        ``'one_sided_selection'``.
    resampling_stragey
        Passed as ``sampling_strategy`` to the ``'over_sampling'``,
        ``'under_sampling'`` and ``'tomelinks'`` samplers. The other three
        methods ignore it. (Parameter name kept as-is for compatibility.)

    Returns
    -------
    tuple
        ``(train_data_resampled, train_labels_resampled)``; the data is
        returned in its flattened 2-D form.

    Raises
    ------
    ValueError
        If ``resampling_type`` is not recognised. This used to surface as an
        ``UnboundLocalError`` at the return statement.
    """
    if resampling_type == 'SMOTE':
        sampler = SMOTE(random_state=42)
    elif resampling_type == 'over_sampling':
        sampler = RandomOverSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'under_sampling':
        sampler = RandomUnderSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'tomelinks':
        sampler = TomekLinks(sampling_strategy=resampling_stragey)
    elif resampling_type == 'near_miss_neighbors':
        sampler = NearMiss(version=1, n_neighbors=3)
    elif resampling_type == 'one_sided_selection':
        sampler = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    else:
        raise ValueError(
            'Unknown resampling_type: {!r}'.format(resampling_type))

    # One row per sample; the product of the trailing dimensions gives the
    # same result as shape[1] * shape[2] * shape[3] for 4-D input.
    n_samples = train_data.shape[0]
    n_features = int(np.prod(train_data.shape[1:]))
    train_data_new = np.reshape(train_data, (n_samples, n_features))

    train_data_resampled, train_labels_resampled = sampler.fit_resample(
        train_data_new, train_labels.values)
    return train_data_resampled, train_labels_resampled
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the sampler selected by name.

    Prints the list of available sampler names and the class counts before
    and after resampling, then returns ``(X_resampled, y_resampled)``.
    ``sampler`` must be a key of the internal sampler table; the default is
    ``"RandomUnderSampler"``.
    """
    # All sampler names, in case the caller wants to iterate over them.
    available_names = [
        'RandomUnderSampler',
        'ClusterCentroids',
        'NearMiss',
        'InstanceHardnessThreshold',
        'CondensedNearestNeighbour',
        'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours',
        'AllKNN',
        'NeighbourhoodCleaningRule',
        'OneSidedSelection',
    ]
    print(available_names)

    # Name -> sampler instance; every sampler uses its default parameters.
    sampler_by_name = {
        "RandomUnderSampler": RandomUnderSampler(),
        "ClusterCentroids": ClusterCentroids(),
        "NearMiss": NearMiss(),
        "InstanceHardnessThreshold": InstanceHardnessThreshold(),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(),
        "EditedNearestNeighbours": EditedNearestNeighbours(),
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(),
        "AllKNN": AllKNN(),
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(),
        "OneSidedSelection": OneSidedSelection(),
    }
    chosen = sampler_by_name[sampler]

    # Report class counts before and after resampling.
    print("before", sorted(Counter(y).items()))
    resampled = chosen.fit_resample(X, y)
    X_resampled, y_resampled = resampled
    print("after", sorted(Counter(y_resampled).items()))
    print('===' * 4, 'under_sample finished')
    return X_resampled, y_resampled
def test_oss_sample_wt_fit():
    """Calling ``sample`` on an unfitted sampler must raise ``RuntimeError``."""
    unfitted = OneSidedSelection(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    """Train and evaluate ``clf`` with shuffled, stratified 10-fold CV.

    For every fold the training split is optionally resampled and optionally
    standardised, ``clf`` is fitted on it, and the predictions on the test
    split are added to running confusion-matrix totals, which are printed at
    the end.

    Parameters
    ----------
    usx, usy
        Features and labels; indexed with the integer index arrays produced
        by ``StratifiedKFold.split``.
    clf
        Classifier exposing ``fit`` and ``predict``.
    clf_name : str
        Name used in the printed header.
    sampling : str
        Resampling applied to each training split: ``'SMOTE'``, ``'ADASYN'``,
        ``'ENN'``, ``'Tomek'``, ``'SMOTETomek'``, ``'SMOTEENN'``, ``'NCR'``
        or ``'OSS'``. Any other value leaves the training split untouched.
    normalize : bool
        When true, fit a ``StandardScaler`` on the training split and apply
        it to both splits.
    """
    print('----------{} with {}----------'.format(clf_name, sampling))

    # Zero-argument factories: only the requested sampler is ever built.
    sampler_factories = {
        'SMOTE': lambda: SMOTE(sampling_strategy=0.3),
        'ADASYN': lambda: ADASYN(sampling_strategy=0.3),
        'ENN': lambda: EditedNearestNeighbours(),
        'Tomek': lambda: TomekLinks(),
        'SMOTETomek': lambda: SMOTETomek(sampling_strategy=0.3),
        'SMOTEENN': lambda: SMOTEENN(sampling_strategy=0.3),
        'NCR': lambda: NeighbourhoodCleaningRule(),
        'OSS': lambda: OneSidedSelection(),
    }
    make_sampler = sampler_factories.get(sampling)

    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    # ``plot_ind`` and ``j`` only serve the commented-out tree plot below.
    # The ``randint`` call is kept so random-number consumption is unchanged.
    plot_ind = randint(0, 9)
    j = 0
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(usx, usy):
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]

        if make_sampler is not None:
            x_train, y_train = make_sampler().fit_resample(x_train, y_train)

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

        clf.fit(x_train, y_train)
        # if plot_ind == j and clf_name == 'DecisionTreeClassifier':
        #     plot_decision_tree(clf)
        y_predict = clf.predict(x_test)

        # Pair labels positionally; this does not depend on how ``y_test``
        # is indexed. The four outcomes are mutually exclusive.
        for actual, predicted in zip(y_test, y_predict):
            if predicted:
                if actual:
                    totalTP += 1
                else:
                    totalFP += 1
            elif actual:
                totalFN += 1
            else:
                totalTN += 1
        j += 1

    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
def under_sampling(self, data, label, n_neighbors=5, method=None):
    """Under-sample ``data`` / ``label`` when ``method`` is enabled.

    Input
        data: 2D array data (im_height*im_width, num of band)
        label: 1D array label (0, 1, 2, ...) per each data
        n_neighbors: num of neighbors used in OSS
        method: select under sampling method (OSS)
    Output
        The under-sampled ``(data, label)``. Both are returned unchanged
        when ``method`` is not listed in ``self.under_method``.
    """
    # Guard clause: nothing to do for methods that are not enabled.
    if method not in self.under_method:
        return data, label

    print("Before sampling label proportion: ", Counter(label))
    if method in ('OSS', 'OneSidedSelection'):
        sampler = OneSidedSelection(n_neighbors=n_neighbors, n_seeds_S=200)
        data, label = sampler.fit_resample(data, label)
    print("After sampling label proportion: ", Counter(label))
    return data, label
def test_oss_fit_single_class():
    """Fitting on a target with a single class must issue a ``UserWarning``."""
    sampler = OneSidedSelection(random_state=RND_SEED)
    # Degenerate target: every sample carries the same label.
    n_samples = X.shape[0]
    single_class_target = np.zeros((n_samples, ))
    assert_warns(UserWarning, sampler.fit, X, single_class_target)
def test_oss_init():
    """Check the attributes set when the sampler is constructed."""
    sampler = OneSidedSelection(random_state=RND_SEED)

    # (attribute name, expected value) pairs, checked in order.
    checks = [
        ('n_seeds_S', 1),
        ('n_jobs', 1),
        ('random_state', RND_SEED),
    ]
    for attribute, expected in checks:
        assert_equal(getattr(sampler, attribute), expected)
def test_oss_fit_sample_with_indices():
    """Exercise ``fit_sample`` with ``return_indices=True``."""
    X_resampled, y_resampled, idx_under = OneSidedSelection(
        return_indices=True, random_state=RND_SEED
    ).fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    expected_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 7, 10])

    # Compare samples, labels and indices in that order.
    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
def resample(self, X, y, by, random_state=None, visualize=False):
    '''
    Re-sample (X, y) with the method named by `by`.

    by: String
        The method used to perform re-sampling
        currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN',
        'OSS', 'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
        'SMOTETomek', 'ORG']
        'ORG' returns the data unchanged.
    random_state:
        Passed as `random_state` to whichever sampler is built.
    visualize: Boolean
        When true, scatter-plot columns 0 and 1 of the result, coloured by
        label.

    Returns (X_train, y_train).
    '''
    if by == 'ORG':
        # Original data requested: no sampler involved.
        X_train, y_train = X, y
    else:
        sampler_classes = {
            'RUS': RandomUnderSampler,
            'CNN': CondensedNearestNeighbour,
            'ENN': EditedNearestNeighbours,
            'NCR': NeighbourhoodCleaningRule,
            'Tomek': TomekLinks,
            'ALLKNN': AllKNN,
            'OSS': OneSidedSelection,
            'NM': NearMiss,
            'CC': ClusterCentroids,
            'SMOTE': SMOTE,
            'ADASYN': ADASYN,
            'BorderSMOTE': BorderlineSMOTE,
            'SMOTEENN': SMOTEENN,
            'SMOTETomek': SMOTETomek,
        }
        if by not in sampler_classes:
            # NOTE(review): `Error` is not defined in this block - confirm it
            # is importable where this method lives.
            raise Error('Unexpected \'by\' type {}'.format(by))
        sampler = sampler_classes[by](random_state=random_state)
        X_train, y_train = sampler.fit_resample(X, y)

    if visualize:
        frame = pd.DataFrame(X_train)
        frame['label'] = y_train
        frame.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm',
                           title='{} training set'.format(by))
    return X_train, y_train
def get_under_sample_models():
    """Return parallel lists of under-sampling models and their names.

    The i-th name labels the i-th model; every model uses its defaults.
    """
    labelled_models = [
        (TomekLinks(), 'TomesLinks'),
        (EditedNearestNeighbours(), 'EditedNearestNeighbors'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OneSidedSelection'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    ]
    models = [model for model, _ in labelled_models]
    names = [label for _, label in labelled_models]
    return models, names
def test_oss_with_object():
    """``n_neighbors`` given as a KNN estimator or as an int must agree."""
    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    # First an estimator object, then the equivalent plain integer.
    for neighbors in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors=neighbors)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, expected_X)
        assert_array_equal(y_resampled, expected_y)
def resample(self, X, y, by, random_state=None):
    '''
    Re-sample (X, y) with the method named by `by`.

    by: String
        The method used to perform re-sampling
        currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN',
        'OSS', 'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
        'SMOTETomek', 'ORG']
        'ROS' is also handled by the code. 'ORG' returns the data unchanged.
    random_state:
        Passed as `random_state` to whichever sampler is built.

    Returns (X_train, y_train).
    '''
    # Original data requested: no sampler involved.
    if by == 'ORG':
        return X, y

    sampler_classes = {
        'RUS': RandomUnderSampler,
        'CNN': CondensedNearestNeighbour,
        'ENN': EditedNearestNeighbours,
        'NCR': NeighbourhoodCleaningRule,
        'Tomek': TomekLinks,
        'ALLKNN': AllKNN,
        'OSS': OneSidedSelection,
        'NM': NearMiss,
        'CC': ClusterCentroids,
        'ROS': RandomOverSampler,
        'SMOTE': SMOTE,
        'ADASYN': ADASYN,
        'BorderSMOTE': BorderlineSMOTE,
        'SMOTEENN': SMOTEENN,
        'SMOTETomek': SMOTETomek,
    }
    if by not in sampler_classes:
        # NOTE(review): `Error` is not defined in this block - confirm it is
        # importable where this method lives.
        raise Error('Unexpected \'by\' type {}'.format(by))

    sampler = sampler_classes[by](random_state=random_state)
    X_train, y_train = sampler.fit_resample(X, y)
    return X_train, y_train
def test_oss_init():
    """A freshly built sampler must expose the expected attributes."""
    verbose = True
    sampler = OneSidedSelection(random_state=RND_SEED, verbose=verbose)

    # (attribute name, expected value) pairs, checked in order.
    expected_attributes = (
        ('size_ngh', 1),
        ('n_seeds_S', 1),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
        ('verbose', verbose),
        ('min_c_', None),
        ('maj_c_', None),
        ('stats_c_', {}),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def UnderSample(X, Y, method='Random', random_state=42):
    """Under-sample ``(X, Y)`` with the sampler selected by ``method``.

    Parameters
    ----------
    X
        Array of samples. When it holds exactly one value per sample
        (``X.size == len(X)``) it is reshaped to a single column.
    Y
        Targets passed to the sampler.
    method : str
        One of ``'Cluster'``, ``'Random'``, ``'NearMiss_1'``,
        ``'NearMiss_2'``, ``'NearMiss_3'``, ``'TomekLinks'``, ``'ENN'``,
        ``'RENN'``, ``'All_KNN'``, ``'CNN'``, ``'One_SS'``, ``'NCR'`` or
        ``'IHT'``.
    random_state
        Passed as ``random_state`` to the selected sampler.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)`` from the sampler's ``fit_sample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if X.size == len(X):
        X = X.reshape(-1, 1)

    # Names are compared with ``==``. The previous ``is`` comparison tested
    # object identity, which only matches equal strings by accident.
    if method == 'Cluster':
        # estimator=None: per the original note, the default is k-means.
        sampler = ClusterCentroids(ratio='auto', random_state=random_state, estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto', random_state=random_state, replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':
        # kind_sel may be 'all' or 'mode'.
        sampler = EditedNearestNeighbours(ratio='auto', random_state=random_state, kind_sel='all')
    elif method == 'RENN':
        # kind_sel may be 'all' or 'mode'.
        sampler = RepeatedEditedNearestNeighbours(ratio='auto', random_state=random_state, kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto', random_state=random_state, kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto', random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto', random_state=random_state,
                                            kind_sel='all', threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None, ratio='auto', random_state=random_state)
    else:
        # This used to fall through and fail with ``UnboundLocalError``.
        raise ValueError('Unknown under-sampling method: {!r}'.format(method))

    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
def under_sampling_algs():
    """Return the under-sampling algorithms as a list of 2-tuples.

    The first entry is a pair of strings for the "no re-sampling" case; every
    other entry is ``(sampler instance, short label)``.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def equalize_training_dataset_with_OneSidedSel(x_train, y_train, n_classes=43,
                                               samples_per_class=180,
                                               n_seeds_S=5, n_jobs=8):
    """Resample a multi-dimensional training set with One-Sided Selection.

    ``x_train`` is flattened to one row per sample, resampled, and reshaped
    back to its original trailing dimensions.

    Parameters
    ----------
    x_train
        Array whose first axis indexes samples.
    y_train
        Labels passed to the sampler.
    n_classes : int
        Labels ``0 .. n_classes - 1`` are used as keys of the
        ``sampling_strategy`` dict. Defaults to 43, the former constant.
    samples_per_class : int
        Value stored for every class in the ``sampling_strategy`` dict.
        Defaults to 180, the former constant.
    n_seeds_S, n_jobs
        Forwarded to ``OneSidedSelection``. Default to the former constants
        5 and 8.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)``, with ``x_resampled`` in the original
        trailing shape.
    """
    from imblearn.under_sampling import OneSidedSelection

    original_shape = list(x_train.shape)

    # Samplers work on 2-D input, so flatten each sample to one row.
    x_flat = np.reshape(x_train, (x_train.shape[0], -1))

    # NOTE(review): imbalanced-learn appears to reject dict strategies for
    # cleaning samplers such as OneSidedSelection - confirm against the
    # installed version.
    strategy = {label: samples_per_class for label in range(0, n_classes)}
    sampler = OneSidedSelection(sampling_strategy=strategy,
                                n_seeds_S=n_seeds_S, n_jobs=n_jobs)
    x_resampled, y_resampled = sampler.fit_resample(x_flat, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the trailing dimensions; only the sample count has changed.
    original_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(original_shape))
    return x_resampled, y_resampled
def get_models():
    """Return parallel lists of under-sampling models and their short names.

    The i-th name labels the i-th model; every model uses its defaults.
    """
    named_models = [
        ('TL', TomekLinks()),
        ('ENN', EditedNearestNeighbours()),
        ('RENN', RepeatedEditedNearestNeighbours()),
        ('OSS', OneSidedSelection()),
        ('NCR', NeighbourhoodCleaningRule()),
    ]
    models = [model for _, model in named_models]
    names = [name for name, _ in named_models]
    return models, names
def get_samplers():
    """Return a dict mapping short names to sampler instances.

    The dict is filled in three groups - under-samplers, over-samplers and
    combined samplers - in that order.
    """
    samplers = {}

    # Under-samplers
    samplers['RandomUn'] = RandomUnderSampler()
    samplers['TL'] = TomekLinks()
    # 'ENN': EditedNearestNeighbours(),
    samplers['RENN'] = RepeatedEditedNearestNeighbours()
    samplers['OSS'] = OneSidedSelection()
    samplers['NCR'] = NeighbourhoodCleaningRule()
    samplers['IHT'] = InstanceHardnessThreshold()

    # Over-samplers
    samplers['RandomOv'] = RandomOverSampler()
    samplers['SMOTE'] = SMOTE()
    samplers['SMOTESVM'] = SVMSMOTE()
    # 'SMOTEKMeans': KMeansSMOTE(),
    samplers['ADASYN'] = ADASYN()

    # Combined under- and over-samplers; the cleaning step is built with
    # sampling_strategy='majority'.
    majority_enn = EditedNearestNeighbours(sampling_strategy='majority')
    samplers['SMOTEENN'] = SMOTEENN(enn=majority_enn)
    majority_tomek = TomekLinks(sampling_strategy='majority')
    samplers['SMOTETomek'] = SMOTETomek(tomek=majority_tomek)

    return samplers
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply One-Sided Selection oss = OneSidedSelection(return_indices=True) X_resampled, y_resampled, idx_resampled = oss.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) idx_class_0 = y_resampled == 0 plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1], alpha=.8, label='Class #0') plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1], alpha=.8, label='Class #1') plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
palette = sns.color_palette() # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply One-Sided Selection oss = OneSidedSelection() X_resampled, y_resampled = oss.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
def test_oss_with_wrong_object():
    """An invalid ``n_neighbors`` must raise ``ValueError`` on resampling."""
    sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be a int"):
        sampler.fit_resample(X, Y)