def tomek_links():
    """Plot a toy data set after Tomek-link removal under two strategies.

    A small two-class data set is resampled once with
    ``TomekLinks(sampling_strategy='auto')`` and once with
    ``TomekLinks(sampling_strategy='all')``. Each result is drawn in its own
    panel, together with a translucent marker on the last minority sample
    and the second majority sample (labelled 'Tomek link'). The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    # minority class
    X_minority = np.transpose([[1.4, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55],
                               [0.4, 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]])
    # majority class
    X_majority = np.transpose(
        [[2.1, 1.5, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45, 3.00, 3.1, 1.5],
         [1.5, 2.2, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9, 1.00, 2.0, 0.3]])

    # Both samplers receive the same input, so stack it once.
    # Label 0 is the majority class, label 1 the minority class.
    X_all = np.vstack((X_majority, X_minority))
    y_all = np.array([0] * X_majority.shape[0] + [1] * X_minority.shape[0])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
    ax_arr = (ax1, ax2)
    title_arr = ('Removing only majority samples', 'Removing all samples')
    sampler_arr = (TomekLinks(sampling_strategy='auto'),
                   TomekLinks(sampling_strategy='all'))

    for ax, title, sampler in zip(ax_arr, title_arr, sampler_arr):
        X_res, y_res = sampler.fit_resample(X_all, y_all)

        ax.scatter(X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1],
                   label='Minority class', s=200, marker='+')
        ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1],
                   label='Majority class', s=200, marker='_')

        # highlight the samples of interest
        ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
                   [X_minority[-1, 1], X_majority[1, 1]],
                   label='Tomek link', s=200, alpha=0.3)

        ax.set_title(title)
        make_plot_despine(ax)

    fig.tight_layout()
    plt.show()
def undersample(X, y, bal_strategy):
    """Under-sample ``(X, y)`` according to ``bal_strategy``.

    Parameters
    ----------
    X, y : objects exposing ``.shape``
        Features and labels; passed unchanged to the selected sampler.
    bal_strategy : str
        ``"RANDOM"`` or ``"ALL"`` apply ``RandomUnderSampler``;
        ``"TOMEK"`` applies ``TomekLinks``; ``"NONE"`` returns the inputs
        untouched.

    Returns
    -------
    tuple
        ``(X_sampled, y_sampled)``.

    Any other ``bal_strategy`` prints a message and exits the process with
    status 1.
    """
    # Single-argument print calls produce the same text on Python 2 and 3.
    print('Shape of X:  {}'.format(X.shape))
    print('Shape of y_Train:  {}'.format(y.shape))

    if bal_strategy == "RANDOM" or bal_strategy == "ALL":
        # apply random under-sampling
        X_sampled, y_sampled = RandomUnderSampler().fit_sample(X, y)
    elif bal_strategy == "TOMEK":
        # NOTE(review): this branch used to test for "ALL" as well, but that
        # test was unreachable because the first branch already consumes
        # "ALL". Behaviour is kept (random under-sampling only); confirm
        # whether "ALL" was meant to chain both samplers.
        # Apply Tomek Links cleaning
        X_sampled, y_sampled = TomekLinks().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_strategy not in ALL, RANDOM, TOMEK, NONE')
        sys.exit(1)

    print('Shape of X_sampled:  {}'.format(X_sampled.shape))
    print('Shape of y_sampled:  {}'.format(y_sampled.shape))
    return (X_sampled, y_sampled)
def workflow_no_oversampling(self, remove_tomeklinks, model_name):
    """
    Run the classification workflow without any over-sampling.

    The training data from ``self.pre_process(test_data=False)`` is
    re-ordered so that rows labelled 1 come first and rows labelled 0 come
    second. Tomek links are optionally removed, a model is built with
    ``self.build_model`` and scored on the data from
    ``self.pre_process(test_data=True)``.

    :param remove_tomeklinks: Tomek links are removed when this equals ``True``
    :param model_name: forwarded to ``self.build_model``
    :return: ``(f1_score, precision, recall)`` as returned by ``self.eval_model``
    """
    features, labels = self.pre_process(test_data=False)
    positives = features[labels == 1]
    negatives = features[labels == 0]
    print("debug, shape of inos_p_old, inos_n")
    print(positives.shape, negatives.shape)

    # Rebuild the training set: positives first, negatives second.
    x_res = pd.concat([positives, negatives], axis=0)
    y_res = np.concatenate([np.ones(positives.shape[0]),
                            np.zeros(negatives.shape[0])])
    print("debug, shape of training data:")
    print(x_res.shape)
    print(y_res.shape)

    if remove_tomeklinks == True:
        x_res, y_res = TomekLinks().fit_resample(x_res, y_res)
        print("shape of training data after removing tomek links:")
        print(x_res.shape)
        print(y_res.shape)

    fitted_model = self.build_model(x_res, y_res, model_name)

    # evaluates performance
    x_test, y_test_binary = self.pre_process(test_data=True)
    f1_score, precision, recall = self.eval_model(
        fitted_model, x_test, y_test_binary)
    return f1_score, precision, recall
def test_tl_fit_sample():
    """Check ``fit_sample`` against the expected resampled arrays."""
    expected_X = np.array([[0.31230513, 0.1216318],
                           [0.68481731, 0.51935141],
                           [1.34192108, -0.13367336],
                           [0.62366841, -0.21312976],
                           [1.61091956, -0.40283504],
                           [-0.37162401, -2.19400981],
                           [0.74680821, 1.63827342],
                           [0.2184254, 0.24299982],
                           [0.61472253, -0.82309052],
                           [0.19893132, -0.47761769],
                           [0.97407872, 0.44454207],
                           [1.40301027, -0.83648734],
                           [-1.20515198, -1.02689695],
                           [-0.23374509, 0.18370049],
                           [-0.32635887, -0.29299653],
                           [-0.00288378, 0.84259929],
                           [1.79580611, -0.02219234]])
    expected_y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])

    # Resample the module-level fixture data
    cleaner = TomekLinks(random_state=RND_SEED)
    got_X, got_y = cleaner.fit_sample(X, Y)

    assert_array_equal(got_X, expected_X)
    assert_array_equal(got_y, expected_y)
def resampling(train_data, train_labels, resampling_type, resampling_stragey):
    """Flatten ``train_data`` and resample it with the requested technique.

    Parameters
    ----------
    train_data : array
        Array whose first axis indexes samples; all remaining axes are
        flattened into a single feature axis before resampling.
    train_labels : object exposing ``.values``
        Labels; ``train_labels.values`` is passed to the sampler.
    resampling_type : str
        One of ``'SMOTE'``, ``'over_sampling'``, ``'under_sampling'``,
        ``'tomelinks'`` (historical spelling; ``'tomeklinks'`` is accepted
        too), ``'near_miss_neighbors'`` or ``'one_sided_selection'``.
    resampling_stragey : object
        Passed as ``sampling_strategy`` to the random over/under samplers
        and to ``TomekLinks``; ignored by the other techniques.

    Returns
    -------
    tuple
        ``(train_data_resampled, train_labels_resampled)``.

    Raises
    ------
    ValueError
        If ``resampling_type`` is not a known technique. Previously this
        surfaced as an ``UnboundLocalError`` at the return statement.
    """
    strategy = resampling_stragey
    # Lazy factories: only the requested sampler is ever constructed.
    factories = {
        'SMOTE': lambda: SMOTE(random_state=42),
        'over_sampling': lambda: RandomOverSampler(sampling_strategy=strategy),
        'under_sampling': lambda: RandomUnderSampler(
            sampling_strategy=strategy),
        'tomelinks': lambda: TomekLinks(sampling_strategy=strategy),
        'near_miss_neighbors': lambda: NearMiss(version=1, n_neighbors=3),
        'one_sided_selection': lambda: OneSidedSelection(
            n_neighbors=1, n_seeds_S=200),
    }
    # Keep the misspelled key for existing callers, accept the right one too.
    factories['tomeklinks'] = factories['tomelinks']

    if resampling_type not in factories:
        raise ValueError(
            'Unknown resampling_type {!r}; expected one of: {}'.format(
                resampling_type, ', '.join(sorted(factories))))

    n_features = int(np.prod(train_data.shape[1:]))
    train_data_new = np.reshape(train_data,
                                (train_data.shape[0], n_features))

    sampler = factories[resampling_type]()
    train_data_resampled, train_labels_resampled = sampler.fit_resample(
        train_data_new, train_labels.values)
    return train_data_resampled, train_labels_resampled
def undersample_tomek_link(X, y, label='Tomek links under-sampling',
                           plot=False):
    """Remove Tomek links from ``(X, y)`` and optionally plot the result.

    ``X`` must expose ``.columns`` and ``y`` must expose ``.name``; the
    resampled data is wrapped back into a ``DataFrame`` / ``Series`` using
    them. When ``plot`` equals ``True`` the resampled data is projected
    onto two PCA components and scattered per class, titled with ``label``.

    Returns ``(X_tl, y_tl, tl, id_tl)``: the resampled frame and series,
    the sampler object, and the index array returned by ``fit_sample``.
    """
    tl = TomekLinks(return_indices=True, ratio='all')
    X_arr, y_arr, id_tl = tl.fit_sample(X, y)
    X_tl = pd.DataFrame(X_arr, columns=X.columns)
    y_tl = pd.Series(y_arr, name=y.name)

    if plot == True:
        # plotting using pca
        projected = pd.DataFrame(PCA(n_components=2).fit_transform(X_tl))
        palette = ['#1F77B4', '#FF7F0E']
        shapes = ['o', 's']
        for klass, colour, shape in zip(np.unique(y_tl), palette, shapes):
            in_class = y_tl == klass
            plt.scatter(
                projected.loc[in_class, 0],  # pc 1
                projected.loc[in_class, 1],  # pc 2
                c=colour, label=klass, marker=shape)
        plt.title(label)
        plt.legend(loc='upper right')
        plt.show()

    return X_tl, y_tl, tl, id_tl
def _validate_estimator(self):
    """
    Private function to validate the SMOTE and TomekLinks objects.

    Sets ``self.smote_`` and ``self.tomek_``. A user-supplied ``self.smote``
    / ``self.tomek`` is used as is when it has the right type; when it is
    ``None`` a default estimator is created.

    :raises ValueError: if ``self.smote`` is not a ``SMOTE`` instance or
        ``self.tomek`` is not a ``TomekLinks`` instance
    :return: None
    """
    if self.smote is None:
        self.smote_ = SMOTE(ratio=self.ratio, k_neighbors=3,
                            random_state=self.random_state)
    elif isinstance(self.smote, SMOTE):
        self.smote_ = self.smote
    else:
        # The two halves of this message used to be joined without a space.
        raise ValueError('smote needs to be a SMOTE object. '
                         'Got {} instead.'.format(type(self.smote)))

    if self.tomek is None:
        self.tomek_ = TomekLinks(ratio="all",
                                 random_state=self.random_state)
    elif isinstance(self.tomek, TomekLinks):
        self.tomek_ = self.tomek
    else:
        raise ValueError('tomek needs to be a TomekLinks object. '
                         'Got {} instead.'.format(type(self.tomek)))
def _tomek_data(self): """Performs tomek links. Can not handle nominal values.""" if self.cols_nominal.size > 0: print("Skipping Tomek Links. Cannot perform with raw categorical data. Create dummies to use.") return tl = TomekLinks() self.X_train, self.y_train = tl.fit_sample(self.X_train, self.y_train)
def test_tl_sample_wrong_X():
    """Check that sampling data different from the fitted data raises."""
    # Fit on the module-level fixture data
    tl = TomekLinks(random_state=RND_SEED)
    tl.fit(X, Y)

    # Then sample from unrelated random data
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, tl.sample, other_X, other_y)
def get_tomeklinks_under_sampled_dataset():
    """Resample the module-level ``X_train`` / ``y_train`` with Tomek links.

    Uses ``TomekLinks(return_indices=True, ratio='majority')`` and returns
    ``(X_tl, y_tl)`` where ``y_tl`` is read back from ``X_tl[target]``.
    """
    cleaner = TomekLinks(return_indices=True, ratio='majority')
    X_tl, labels_unused, id_tl = cleaner.fit_sample(X_train, y_train)
    # NOTE(review): other code in this file treats these indices as the
    # samples that were kept, not removed -- confirm the wording.
    print('Removed indexes:', id_tl)
    # NOTE(review): the return value of shuffle() is discarded; this only
    # has an effect if shuffle() works in place -- confirm which shuffle
    # is imported.
    shuffle(X_tl)
    # NOTE(review): the labels returned by fit_sample are dropped in favour
    # of X_tl[target]; this assumes X_tl can be indexed by `target`.
    y_tl = X_tl[target]
    return X_tl, y_tl
def get_binary_Tomek_Links_cleaned_data(id_df, X_df, y_df):
    """Drop Tomek-link samples from three row-aligned data frames.

    ``TomekLinks`` is fitted on ``X_df`` and the first column of ``y_df``.
    The sampler's ``sample_indices_`` are then used to select the same rows
    positionally from ``id_df``, ``X_df`` and ``y_df``.

    Returns ``(id_df_cleaned, X_df_cleaned, y_df_cleaned)``.
    """
    tLinks = TomekLinks()
    # Only the fitted attribute is needed; the resampled arrays are ignored.
    tLinks.fit_sample(X_df, y_df.iloc[:, 0])
    sample_indices = tLinks.sample_indices_

    id_df_cleaned = id_df.iloc[sample_indices]
    X_df_cleaned = X_df.iloc[sample_indices]
    y_df_cleaned = y_df.iloc[sample_indices]
    return id_df_cleaned, X_df_cleaned, y_df_cleaned
def sample_all(self, nb_data_to_load):
    """Load label-'0' rows, clean them with Tomek links and save the result.

    Every ``*.txt`` file under ``self.file_to_load`` is read line by line.
    Lines whose first character is ``'0'`` are split on tabs: field 0 is
    the label, the remaining fields are the features (field 2 is replaced
    by the integer value of its second character). The collected data is
    converted to float/int arrays, resampled with ``TomekLinks``, rounded
    (columns 0 and 1 to integers, column 2 to one decimal) and handed to
    ``self.save_data``.

    ``nb_data_to_load`` is not used by this function.

    NOTE(review): the original indentation of this function was lost; the
    nesting below is a reconstruction and should be checked against the
    intended behaviour.
    """
    X = []
    y = []

    # In a single pass, store the NG data and the sampled normal data
    # separately.
    # Fetch the files in the lg_train folder one at a time.
    file_list = glob(self.file_to_load + '/*.txt')
    for filepath in file_list:
        print(filepath)

        # Fetch the data of the lines matching the indices in the list.
        # Store the normal and the defective data in X, y.

        # Load normal data
        index = 0
        with open(filepath, mode='r') as f:
            for i, line in enumerate(f):
                if line[0] == '0':
                    curr_data = line.strip().split('\t')
                    curr_data[2] = int(curr_data[2][1])
                    X.append(curr_data[1:])
                    y.append(curr_data[0])
                    index += 1
                    if index % 1000 == 0:
                        print(index)

    # Load random_700 data
    # NOTE(review): this section looks unfinished and is kept as found. The
    # file is opened but never read; `i` and `line` are leftovers from the
    # last iteration of the loop above (undefined if nothing was read), and
    # `random_index` is not defined inside this function.
    random_700 = '/workspace/peter/sampled/sampled_random700.txt'
    with open(random_700, mode='r') as f:
        if i in random_index:
            if line[0] == '0':
                curr_data = line.strip().split('\t')
                X.append(curr_data[1:])
                y.append(curr_data[0])

    # Possible type conversion required for sampling methods.
    # `np.int` was only an alias of the builtin `int` and no longer exists
    # in current NumPy, so the builtin is used directly.
    X_np = np.array(X).astype(np.float64)
    y_np = np.array(y).astype(int)

    # Undersampling with Tomeklinks
    undersampler = TomekLinks()
    X_resampled, y_resampled = undersampler.fit_resample(X_np, y_np)

    # Round datetime, stage and temperature
    X_resampled[:, 0] = X_resampled[:, 0].round()
    X_resampled[:, 1] = X_resampled[:, 1].round()
    X_resampled[:, 2] = X_resampled[:, 2].round(1)

    self.save_data(self.file_to_save, X_resampled, y_resampled)
def test_tl_fit_sample():
    """Check ``fit_sample`` against the reference arrays stored on disk."""
    # Reference arrays live in the ``data`` folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')

    # Resample the data
    tl = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = tl.fit_sample(X, Y)

    X_gt = np.load(os.path.join(data_dir, 'tl_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'tl_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def undersample_Tomeks_Link(df, debug=True):
    """Remove Tomek links from a data frame whose last column is the label.

    All columns but the last are used as features; the last column is cast
    to ``int`` and used as the target. The result is a new data frame with
    the same column names, the label column appended last. When ``debug``
    is truthy the class counts before and after are printed.
    """
    feature_names = df.columns[:-1]
    label_name = df.columns[-1]

    X = df.values[:, :-1]
    y = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(y))

    X_res, y_res = TomekLinks().fit_resample(X, y)

    df_resampled = pd.DataFrame(X_res, columns=feature_names)
    # Append the label as the last column
    df_resampled.insert(len(df_resampled.columns), label_name, y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
def imbalanced_resampling(method, x, y):
    """Resample ``(x, y)`` with the technique named by ``method``.

    ``"under"`` uses ``TomekLinks``, ``"over"`` uses ``SMOTE`` and
    ``"combined"`` uses ``SMOTETomek``. Any other value returns ``x`` and
    ``y`` unchanged.

    The function already relied on the ``sampling_strategy`` keyword for
    ``TomekLinks``; ``SMOTE`` and the resampling call now use the same API
    generation (``sampling_strategy`` / ``fit_resample``) instead of the
    older ``ratio`` / ``fit_sample`` names.

    Returns ``(X, Y)``, the resampled (or original) data.
    """
    if method == "under":
        sampling = TomekLinks(sampling_strategy="auto")
    elif method == "over":
        sampling = SMOTE(sampling_strategy='auto')
    elif method == "combined":
        sampling = SMOTETomek()
    else:
        return x, y

    X, Y = sampling.fit_resample(x, y)
    return X, Y
def getData(splitData=True, useImbalancer=False, useStratify=False):
    """Load ``DataSource/binary.csv`` and return features and target.

    The target is column 0 of the file. The features are the columns from
    index 1 up to (but excluding) the last one, followed by dummy columns
    built from the ``rank`` column with the first level dropped.

    Parameters
    ----------
    splitData : bool
        When true, split 80/20 with a fixed seed and return
        ``(X_train, X_test, y_train, y_test)``; otherwise return
        ``(X_train, y_train)`` for the whole data set.
    useImbalancer : bool
        When true (and ``splitData`` is true), clean the training part with
        ``TomekLinks(sampling_strategy='majority')``.
    useStratify : bool
        When true, stratify the split on the target.

    Target arrays are returned flattened with ``ravel()``.
    """
    global standard_scaler
    data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv")
    X = data.values[:, 1:-1]
    rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy()
    X = np.concatenate((X, rank_dummy), axis=1)
    # Column vector; flattened again just before returning
    y = data.values[:, 0].reshape(-1, 1)
    if useStratify:
        stratify = y
    else:
        stratify = None
    if splitData:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, shuffle=True, stratify=stratify)
    else:
        X_train = X
        y_train = y
    if useImbalancer and splitData:
        tl = TomekLinks(sampling_strategy='majority')
        X_train, y_train = tl.fit_sample(X=X_train, y=y_train)
        # NOTE(review): the commented-out print below appears garbled in
        # this copy of the file (the ****** runs); it is kept verbatim.
        # Statements may be missing between the passes -- verify against
        # the upstream source.
        # print("After 1st pass: "******"After 2nd pass: "******"After 3rd pass: "******"After 4th pass: "******"After 5th pass: "******"After 6th pass: "******"y_train\n", np.asarray((unique, counts)).T)
    if splitData:
        # Only consumed by the commented-out debug print below
        unique, counts = np.unique(y_test, return_counts=True)
        # print("y_test\n", np.asarray((unique, counts)).T)
    if splitData:
        return X_train, X_test, y_train.ravel(), y_test.ravel()
    else:
        return X_train, y_train.ravel()
def trainModelWithResults(model, X, y, rd_state=None, autoscale=1,
                          usetomeklinks=1):
    """Fit ``model`` on a stratified 75/25 split and print its test scores.

    When ``autoscale`` equals 1 a ``StandardScaler`` fitted on the training
    part is applied to both parts. When ``usetomeklinks`` equals 1 the
    training part is cleaned with ``TomekLinks``. The confusion matrix and
    the classification report for the test part are printed; nothing is
    returned.
    """
    # stratify the split because we have unbalanced target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=rd_state)

    if autoscale == 1:
        scaler = StandardScaler().fit(X_train)
        X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    if usetomeklinks == 1:
        cleaner = TomekLinks(return_indices=False)
        X_train, y_train = cleaner.fit_sample(X_train, y_train)

    fitted = model.fit(X_train, y_train)
    predicted = fitted.predict(X_test)
    print(confusion_matrix(y_test, predicted))
    print(classification_report(y_test, predicted))
def tomeklinks(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True,
               pie_evr=True):
    """Remove Tomek links from ``(X, y)`` and optionally visualise the result.

    When ``visualize`` equals ``True`` the resampled labels are passed to
    ``hist_over_and_undersampling`` and the resampled data to
    ``pca_general`` (with ``pca2d``, ``pca3d`` and ``pie_evr`` forwarded).
    ``tsne`` is accepted but not used by this function.

    Returns ``(X_res, y_res)``.
    """
    X_res, y_res = TomekLinks().fit_resample(X, y)
    if visualize == True:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_tl_fit():
    """Check the class statistics recorded by ``fit``."""
    # Create the object and fit the module-level fixture data
    fitted = TomekLinks(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Minority / majority labels
    assert_equal(fitted.min_c_, 0)
    assert_equal(fitted.maj_c_, 1)
    # Per-class sample counts
    assert_equal(fitted.stats_c_[0], 500)
    assert_equal(fitted.stats_c_[1], 4500)
def test_multiclass_error():
    """Check that fitting a non-binary target emits a ``UserWarning``."""
    non_binary_targets = (
        np.linspace(0, 1, 20),                       # continuous case
        np.array([0] * 3 + [1] * 7 + [2] * 10),      # multiclass case
    )
    for y in non_binary_targets:
        tl = TomekLinks(random_state=RND_SEED)
        assert_warns(UserWarning, tl.fit, X, y)
def oversample(self, train, labels):
    """
    Over-sample the data with SMOTE, then clean it with Tomek links.

    Returns the resampled ``(train_res, labels_res)``.
    """
    # Oversample
    smote = SMOTE(random_state=2)
    oversampled = smote.fit_sample(train, labels)

    # clear noise points that emerged from oversampling
    cleaner = TomekLinks(random_state=42)
    train_res, labels_res = cleaner.fit_sample(*oversampled)
    return train_res, labels_res
def test_tl_fit():
    """Check the class statistics recorded by ``fit``."""
    # Create the object and fit the module-level fixture data
    fitted = TomekLinks(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Minority / majority labels
    assert_equal(fitted.min_c_, 0)
    assert_equal(fitted.maj_c_, 1)
    # Per-class sample counts
    assert_equal(fitted.stats_c_[0], 7)
    assert_equal(fitted.stats_c_[1], 13)
def Tomek_us(X_train, Y_train, seed, sampling_strategy):
    """Under-sample the training data with Tomek links, then shuffle it.

    The class counts before and after resampling are printed. ``seed`` is
    passed both to ``TomekLinks`` (as ``random_state``) and to
    ``shuffle_dataset``.

    Returns the shuffled ``(X_train_resampled, Y_train_resampled)``.
    """
    cleaner = TomekLinks(random_state=seed, n_jobs=-1,
                         sampling_strategy=sampling_strategy)

    print('Before Tomek undersampling : ', sorted(Counter(Y_train).items()))
    X_clean, Y_clean = cleaner.fit_resample(X_train, Y_train)
    print('After Tomek undersampling : ', sorted(Counter(Y_clean).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_clean, Y_clean, seed)
    return X_train_resampled, Y_train_resampled
def test_tl_fit_sample_with_indices():
    """Check ``fit_sample`` with index output against arrays on disk."""
    # Reference arrays live in the ``data`` folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')

    # Resample the data
    tl = TomekLinks(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = tl.fit_sample(X, Y)

    X_gt = np.load(os.path.join(data_dir, 'tl_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'tl_y.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'tl_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def undersampling(df):
    """Return the rows of ``df`` selected by Tomek-link resampling.

    ``add2list`` fills the feature and label lists from ``df``. The index
    array returned by ``fit_sample`` is turned into a boolean row mask,
    and the selected rows are returned with a fresh 0..n-1 index.
    """
    cleaner = TomekLinks(ratio='all', n_jobs=16, return_indices=True)
    features, labels = [], []
    add2list(df, features, labels)
    _, _, idx = cleaner.fit_sample(features, labels)

    # Boolean mask: True for every row position listed in idx
    selected = set(idx)
    criterion = [pos in selected for pos in range(len(df))]
    return df[criterion].reset_index(drop=True)
def dataset_sampling(X, y):
    """Resample ``(X, y)`` with ``TomekLinks`` targeting the majority class.

    Earlier revisions also constructed SMOTE, SMOTETomek, random over/under
    samplers and ClusterCentroids here without using them; those dead
    objects and the commented-out alternatives have been removed.

    Returns ``(X_res, y_res)``; the index array returned by the sampler is
    discarded.
    """
    tl = TomekLinks(return_indices=True, ratio='majority')
    X_res, y_res, _ = tl.fit_sample(X, y)
    return X_res, y_res
def _resample_and_save(sampler, switch, non_switch, switch_path,
                       non_switch_path):
    """Resample one split (label 0 = switch) and save each class to disk."""
    X = np.concatenate((switch, non_switch))
    y = np.concatenate(
        (np.zeros(switch.shape[0]), np.ones(non_switch.shape[0])))
    X_res, y_res = sampler.fit_resample(X, y)

    # Label 0 goes to the switch file, everything else to the other one.
    is_switch = y_res == 0
    np.save(switch_path, X_res[is_switch])
    np.save(non_switch_path, X_res[~is_switch])


def resample():
    """Resample the train and test splits with SMOTETomek and save them.

    Loads the four ``data/*_w_64_f_20.npy`` arrays, resamples the train
    pair and the test pair independently, and writes the results to the
    matching ``*_samp.npy`` files. Returns nothing.

    The test sampler used to be bound to the misspelled name
    ``resampe_test`` while the call used ``resample_test``, so the test
    half always failed with a ``NameError``.

    NOTE(review): resampling the test split changes the distribution the
    model is evaluated on -- confirm that this is intended.
    """
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    resample_train = SMOTETomek(sampling_strategy='all',
                                smote=SMOTE(n_jobs=4),
                                tomek=TomekLinks(n_jobs=4))
    resample_test = SMOTETomek(sampling_strategy='all',
                               smote=SMOTE(n_jobs=4),
                               tomek=TomekLinks(n_jobs=4))

    print('Beginning train resample...')
    _resample_and_save(resample_train, train_switch, train_non_switch,
                       'data/train_switch_w_64_f_20_samp.npy',
                       'data/train_non_switch_w_64_f_20_samp.npy')

    print('Beginning test resample...')
    _resample_and_save(resample_test, test_switch, test_non_switch,
                       'data/test_switch_w_64_f_20_samp.npy',
                       'data/test_non_switch_w_64_f_20_samp.npy')
def under_sample_data(matrix, y_train):
    """Clean the majority class with TomekLinks, then EditedNearestNeighbours.

    The class distribution is logged through ``add_to_log`` before
    resampling and after each of the two steps.

    Returns the resampled ``(X_res, y_res)``.
    """
    add_to_log('Under Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))

    # clean proximity samples using TomeKLinks
    tomek = TomekLinks(random_state=11, sampling_strategy='majority',
                       n_jobs=-1)
    X_res, y_res = tomek.fit_resample(matrix, y_train)
    add_to_log('TomekLinks distribution %s' % Counter(y_res))

    # second pass over the already cleaned data
    edited_nn = EditedNearestNeighbours(random_state=7,
                                        sampling_strategy='majority',
                                        n_jobs=-1)
    X_res, y_res = edited_nn.fit_resample(X_res, y_res)
    add_to_log('EditedNearestNeighbours distribution %s' % Counter(y_res))

    return X_res, y_res
def test_tl_sample_wt_fit():
    """Check that calling ``sample`` before ``fit`` raises ``RuntimeError``."""
    # A freshly created, never fitted sampler
    unfitted = TomekLinks(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    """Build the under-sampler identified by ``name``.

    Parameters
    ----------
    name : str
        ``"rus"`` (RandomUnderSampler), ``"nm"`` (NearMiss), ``"enn"``
        (EditedNearestNeighbours), ``"renn"``
        (RepeatedEditedNearestNeighbours), ``"allknn"`` (AllKNN) or
        ``"tl"`` (TomekLinks).
    ratio : object
        Passed as ``ratio`` to ``"rus"`` and ``"nm"`` only; the other
        samplers do not receive it.
    random_state, return_indices :
        Passed to every sampler.
    **kwargs :
        Extra keyword arguments passed to the selected sampler.

    Returns
    -------
    The constructed sampler object.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known keys. The message now names
        the offending value and the accepted keys.
    """
    common = {"return_indices": return_indices, "random_state": random_state}
    # Lazy builders: only the requested sampler class is touched.
    builders = {
        "rus": lambda: RandomUnderSampler(ratio=ratio, **common, **kwargs),
        "nm": lambda: NearMiss(ratio=ratio, **common, **kwargs),
        "enn": lambda: EditedNearestNeighbours(**common, **kwargs),
        "renn": lambda: RepeatedEditedNearestNeighbours(**common, **kwargs),
        "allknn": lambda: AllKNN(**common, **kwargs),
        "tl": lambda: TomekLinks(**common, **kwargs),
    }
    if name not in builders:
        raise ValueError(
            "Unknown sampler name {!r}; expected one of: {}".format(
                name, ", ".join(sorted(builders))))
    return builders[name]()
def smote_tomek(x_train, y_train):
    """Over-sample with BorderlineSMOTE, then clean the majority class.

    The training data is first over-sampled with ``BorderlineSMOTE``
    (``sampling_strategy=0.5``, kind ``'borderline-1'``) and the result is
    cleaned with ``TomekLinks(sampling_strategy='majority')``.

    Returns the resampled ``(X, y)``.
    """
    borderline = BorderlineSMOTE(sampling_strategy=0.5,
                                 random_state=0,
                                 k_neighbors=5,
                                 m_neighbors=10,
                                 n_jobs=-1,
                                 kind='borderline-1')
    X_over, y_over = borderline.fit_resample(x_train, y_train)

    cleaner = TomekLinks(sampling_strategy='majority', n_jobs=-1)
    X, y = cleaner.fit_resample(X_over, y_over)
    return X, y
def test_tl_fit_sample_with_indices():
    """Check ``fit_sample`` with index output against the expected arrays."""
    expected_X = np.array([[0.31230513, 0.1216318],
                           [0.68481731, 0.51935141],
                           [1.34192108, -0.13367336],
                           [0.62366841, -0.21312976],
                           [1.61091956, -0.40283504],
                           [-0.37162401, -2.19400981],
                           [0.74680821, 1.63827342],
                           [0.2184254, 0.24299982],
                           [0.61472253, -0.82309052],
                           [0.19893132, -0.47761769],
                           [0.97407872, 0.44454207],
                           [1.40301027, -0.83648734],
                           [-1.20515198, -1.02689695],
                           [-0.23374509, 0.18370049],
                           [-0.32635887, -0.29299653],
                           [-0.00288378, 0.84259929],
                           [1.79580611, -0.02219234]])
    expected_y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])
    expected_idx = np.array(
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19])

    cleaner = TomekLinks(return_indices=True)
    got_X, got_y, got_idx = cleaner.fit_sample(X, Y)

    assert_array_equal(got_X, expected_X)
    assert_array_equal(got_y, expected_y)
    assert_array_equal(got_idx, expected_idx)
sampling_strategy = 'not majority'

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by over-sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# With a **cleaning method**, the number of samples in each class will not be
# equalized even if the class is targeted.

sampling_strategy = 'not minority'
# Pass the strategy by keyword so the call does not depend on the position
# of ``sampling_strategy`` in the constructor signature.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This works for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.
def test_deprecation_random_state():
    """Check that passing ``random_state`` triggers a deprecation warning."""
    cleaner = TomekLinks(random_state=0)
    expected_message = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected_message):
        cleaner.fit_resample(X, Y)
label='Majority class', s=200, marker='+') # highlight the samples of interest ax.scatter([X_minority[-1, 0], X_majority[1, 0]], [X_minority[-1, 1], X_majority[1, 1]], label='Tomek link', s=200, alpha=0.3) ax.set_title('Illustration of a Tomek link') make_plot_despine(ax) fig.tight_layout() ############################################################################### # We can run the ``TomekLinks`` sampling to remove the corresponding # samples. If ``ratio='auto'`` only the sample from the majority class will be # removed. If ``ratio='all'`` both samples will be removed. sampler = TomekLinks() fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6)) ax_arr = (ax1, ax2) title_arr = ('Removing only majority samples', 'Removing all samples') for ax, title, sampler in zip(ax_arr, title_arr, [TomekLinks(ratio='auto', random_state=0), TomekLinks(ratio='all', random_state=0)]): X_res, y_res = sampler.fit_sample(np.vstack((X_minority, X_majority)), np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0])) ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1], label='Minority class', s=200, marker='_')
from imblearn.under_sampling import TomekLinks # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Tomek Links cleaning tl = TomekLinks() X_resampled, y_resampled = tl.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
from imblearn.under_sampling import TomekLinks

print(__doc__)

# Synthetic two-class data: 500 samples for class 0 and 50 for class 1,
# the latter shifted by [2, 2] and drawn with a smaller spread
rng = np.random.RandomState(0)
n_samples_1 = 500
n_samples_2 = 50
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2), 0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
X_syn, y_syn = shuffle(X_syn, y_syn)
# NOTE(review): the train/test parts are not used in the visible part of
# this script; the sampler below is applied to the full data set.
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(X_syn, y_syn)

# remove Tomek links
tl = TomekLinks(return_indices=True)
X_resampled, y_resampled, idx_resampled = tl.fit_sample(X_syn, y_syn)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Every index that the sampler did not return is treated as removed
idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]), idx_resampled)

# Plot the kept samples per class, then the removed samples
idx_class_0 = y_resampled == 0
plt.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1], alpha=.8, label='Class #0')
plt.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1], alpha=.8, label='Class #1')
plt.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1], alpha=.8, label='Removed samples')