def test_rest(x, y, c=0, ratio='auto'):
    """Resample (x, y) with the sampler selected by ``c`` and return the result.

    Parameters
    ----------
    x, y : array-like
        Feature matrix and target vector to resample.
    c : int, optional (default=0)
        Sampler selector: 0 random under-sampling, 1 Tomek links,
        2 cluster centroids, 3/4/5 NearMiss versions 1-3,
        6 neighbourhood cleaning rule, 7 random over-sampling,
        8 SMOTE+Tomek, 9 SMOTE+ENN; any other value falls through
        to EasyEnsemble.
    ratio : str or float, optional (default='auto')
        Resampling ratio forwarded to the chosen sampler.

    Returns
    -------
    x, y : the resampled feature matrix and target vector.

    Notes
    -----
    Relies on module-level ``indices_support`` and ``verbose`` globals
    (not visible in this chunk) — TODO confirm they are defined.
    """
    # NOTE(review): the original body started with a no-op `c = c`; removed.
    if c == 0:
        print('Random under-sampling')
        US = UnderSampler(indices_support=indices_support,
                          verbose=verbose, ratio=ratio)
        x, y, idx_tmp = US.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 1:
        print('Tomek links')
        TL = TomekLinks(verbose=verbose, ratio=ratio)
        x, y = TL.fit_transform(x, y)
    elif c == 2:
        print('Clustering centroids')
        CC = ClusterCentroids(verbose=verbose, ratio=ratio)
        x, y = CC.fit_transform(x, y)
    elif c == 3:
        print('NearMiss-1')
        NM1 = NearMiss(version=1, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM1.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 4:
        print('NearMiss-2')
        NM2 = NearMiss(version=2, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM2.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 5:
        print('NearMiss-3')
        NM3 = NearMiss(version=3, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM3.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 6:
        print('Neighboorhood Cleaning Rule')
        NCR = NeighbourhoodCleaningRule(indices_support=indices_support,
                                        verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NCR.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 7:
        print('Random over-sampling')
        OS = OverSampler(verbose=verbose, ratio=ratio)
        x, y = OS.fit_transform(x, y)
    elif c == 8:
        print('SMOTE Tomek links')
        STK = SMOTETomek(verbose=verbose, ratio=ratio)
        x, y = STK.fit_transform(x, y)
    elif c == 9:
        print('SMOTE ENN')
        SENN = SMOTEENN(verbose=verbose, ratio=ratio)
        x, y = SENN.fit_transform(x, y)
    else:
        print('EasyEnsemble')
        EE = EasyEnsemble(verbose=verbose, ratio=ratio)
        x, y = EE.fit_transform(x, y)
    return x, y
def test_rest(x, y):
    """Run every available sampler once over (x, y), printing progress.

    Samplers that support index tracking additionally print the indices
    they kept.  Relies on module-level ``indices_support`` and ``verbose``.
    """
    # (label, lazy constructor, whether fit_transform returns kept indices)
    recipes = [
        ('Random under-sampling',
         lambda: UnderSampler(indices_support=indices_support, verbose=verbose),
         True),
        ('Tomek links', lambda: TomekLinks(verbose=verbose), False),
        ('Clustering centroids', lambda: ClusterCentroids(verbose=verbose), False),
        ('NearMiss-1',
         lambda: NearMiss(version=1, indices_support=indices_support, verbose=verbose),
         True),
        ('NearMiss-2',
         lambda: NearMiss(version=2, indices_support=indices_support, verbose=verbose),
         True),
        ('NearMiss-3',
         lambda: NearMiss(version=3, indices_support=indices_support, verbose=verbose),
         True),
        ('Neighboorhood Cleaning Rule',
         lambda: NeighbourhoodCleaningRule(indices_support=indices_support,
                                           verbose=verbose),
         True),
        ('Random over-sampling', lambda: OverSampler(verbose=verbose), False),
        ('SMOTE Tomek links', lambda: SMOTETomek(verbose=verbose), False),
        ('SMOTE ENN', lambda: SMOTEENN(verbose=verbose), False),
        ('EasyEnsemble', lambda: EasyEnsemble(verbose=verbose), False),
    ]
    for label, build, tracks_indices in recipes:
        print(label)
        # Construct lazily so any constructor side effects follow the label.
        sampler = build()
        if tracks_indices:
            _, _, kept = sampler.fit_transform(x, y)
            print('Indices selected')
            print(kept)
        else:
            sampler.fit_transform(x, y)
def test_tl_fit_transform():
    """Check that fit_transform reproduces the stored ground-truth resampling."""
    tl = TomekLinks(random_state=RND_SEED)
    x_res, y_res = tl.fit_transform(X, Y)
    # Ground-truth arrays live next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_x = np.load(os.path.join(data_dir, 'tl_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'tl_y.npy'))
    assert_array_equal(x_res, expected_x)
    assert_array_equal(y_res, expected_y)
def test_tl_fit():
    """Verify that fitting records the class statistics of (X, Y)."""
    tl = TomekLinks(random_state=RND_SEED)
    tl.fit(X, Y)
    # Class 0 is the minority (500 samples), class 1 the majority (4500).
    assert_equal(tl.min_c_, 0)
    assert_equal(tl.maj_c_, 1)
    for label, count in ((0, 500), (1, 4500)):
        assert_equal(tl.stats_c_[label], count)
def test_tl_fit_transform_with_indices():
    """Check fit_transform with index support against the stored ground truth."""
    tl = TomekLinks(return_indices=True, random_state=RND_SEED)
    x_res, y_res, kept_idx = tl.fit_transform(X, Y)
    # Ground-truth arrays live next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    checks = ((x_res, 'tl_x.npy'),
              (y_res, 'tl_y.npy'),
              (kept_idx, 'tl_idx.npy'))
    for actual, fname in checks:
        assert_array_equal(actual, np.load(os.path.join(data_dir, fname)))
def balance_data_undersampling_tomek_links(self):
    '''Balance the data by removing Tomek links (under-sampling).

    Flattens ``self.y`` to 1-D for the sampler, then stores the
    resampled arrays back on ``self`` with ``y`` restored to a
    column-vector (n, 1) layout.
    '''
    # NOTE(review): the original docstring said "clustering centroids",
    # but the sampler used here is TomekLinks.
    x = self.X
    y = self.y
    # Flatten in place to the (n,) shape the sampler expects.
    # (Original wrote `(len(self.y))`, which is just an int, not a tuple;
    # numpy accepts both — the explicit tuple makes the intent clear.)
    y.shape = (len(self.y),)
    verbose = True
    TL = TomekLinks(verbose=verbose)
    tlx, tly = TL.fit_transform(x, y)
    self.X = tlx
    self.y = tly
    self.y.shape = (len(self.y), 1)  # restore the (n, 1) column layout
def test_tl_transform_wt_fit():
    """Calling transform on an unfitted object must raise a RuntimeError."""
    unfitted = TomekLinks(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.transform, X, Y)
def test_tl_fit_single_class():
    """Fitting on a single-class target must raise a RuntimeError."""
    tl = TomekLinks(random_state=RND_SEED)
    # An all-zero target contains only one class, which is invalid input.
    degenerate_y = np.zeros(X.shape[0])
    assert_raises(RuntimeError, tl.fit, X, degenerate_y)
def test_tl_init():
    """Check the attribute defaults set at construction time."""
    verbose = True
    tl = TomekLinks(random_state=RND_SEED, verbose=verbose)
    # Fit-time statistics start empty; configuration is stored verbatim.
    expectations = (('n_jobs', -1),
                    ('rs_', RND_SEED),
                    ('verbose', verbose),
                    ('min_c_', None),
                    ('maj_c_', None),
                    ('stats_c_', {}))
    for attr, expected in expectations:
        assert_equal(getattr(tl, attr), expected)
def balance_data_oversampling(self, ratio = 2, balance_type = "OverSampler"):
    '''Balance the training data with the sampler named by ``balance_type``.

    Recognised values: "OverSampler", the three SMOTE_* variants; any
    other value falls back to Tomek-link under-sampling.
    '''
    verbose = True
    # Map the SMOTE variants onto the `kind` argument they share.
    smote_kinds = {
        'SMOTE_borderline1': 'borderline1',
        'SMOTE_regular': 'regular',
        'SMOTE_borderline2': 'borderline2',
    }
    if balance_type == "OverSampler":
        sampler = OverSampler(verbose=verbose, ratio=ratio)
    elif balance_type in smote_kinds:
        sampler = SMOTE(kind=smote_kinds[balance_type],
                        verbose=verbose, ratio=ratio)
    else:
        sampler = TomekLinks(verbose=verbose)
    self.train_x, self.train_y = sampler.fit_transform(self.train_x,
                                                       self.train_y)
def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio):
    """Resample (train_x, train_y) with the technique named by ``sample_type``.

    Parameters
    ----------
    sample_type : one of the module-level sampler constants
        (SMOTE_REG, SMOTE_SVM, ..., NEARMISS).
    train_x, train_y : array-like
        Training features and labels to resample.
    ratio : str or float
        Resampling ratio forwarded to samplers that accept one.

    Returns
    -------
    The resampled (x, y) pair, or the original data unchanged when
    ``sample_type`` is not recognised.
    """
    verbose = True
    # NOTE(review): removed the dead `sampler = None` — every branch either
    # assigns `sampler` or returns early.
    if sample_type == SMOTE_REG:
        sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15)
    elif sample_type == SMOTE_SVM:
        # TODO: Make this configurable?
        svm_args = {'class_weight': 'balanced'}
        sampler = SMOTE(kind='svm', ratio=ratio, verbose=verbose, k=15,
                        **svm_args)
    elif sample_type == SMOTE_BORDERLINE_1:
        sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_BORDERLINE_2:
        sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_ENN:
        sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == SMOTE_TOMEK:
        sampler = SMOTETomek(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == UNDERSAMPLER:
        sampler = UnderSampler(ratio=ratio, verbose=verbose,
                               replacement=False, random_state=17)
    elif sample_type == ADASYN_SAMPLER:
        sampler = ADASYN(k=15, imb_threshold=0.6, ratio=ratio)
    elif sample_type == TOMEK_LINKS:
        sampler = TomekLinks()
    elif sample_type == CLUSTER_CENTROIDS:
        sampler = ClusterCentroids(ratio=ratio)
    elif sample_type == NEARMISS:
        sampler = NearMiss(ratio=ratio)
    else:
        # Fixed the typo ("Unrecoqnized") and parenthesised the prints:
        # identical output under Python 2, and now also parses under
        # Python 3, matching the rest of the file.
        print("Unrecognized sample technique: " + sample_type)
        print("Returning original data")
        return train_x, train_y
    return sampler.fit_transform(train_x, train_y)
# NOTE(review): truncated fragment — it opens mid-way through a
# make_classification(...) argument list and ends mid ax1.scatter(...)
# call, so it cannot be reconstructed safely here; left byte-identical.
# A complete version of the same example appears immediately below.
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling tl = TomekLinks() X_resampled, y_resampled = tl.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
from unbalanced_dataset.under_sampling import TomekLinks

# Build a synthetic two-class problem with a 1:9 class imbalance.
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Project to 2-D with PCA purely for visualisation purposes.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Remove Tomek links, then project the resampled set with the same PCA.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# Side-by-side scatter plots: original set on the left.
f, (ax1, ax2) = plt.subplots(1, 2)
scatter_style = dict(alpha=0.5, edgecolor=almost_black, linewidth=0.15)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
            facecolor=palette[0], **scatter_style)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
            facecolor=palette[2], **scatter_style)
ax1.set_title('Original set')
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", facecolor=palette[0], **scatter_style)
#parameters for decision tree param_grid_DT = dict(max_features=["auto","sqrt","log2",None],class_weight=["balanced", None] , presort=[True, False]) #stratified shuffle split X_train, X_test, y_train, y_test = stratify_and_shuffle(scaled_data, target, 0.99,0.01) verbose= True #undersample the scaled data #US = UnderSampler(verbose=True) US= TomekLinks(verbose=True) #US = ClusterCentroids(verbose=True) #US = NearMiss(version=1, verbose=verbose) #US = OneSidedSelection(verbose=verbose) #US = EasyEnsemble(verbose=verbose) #US = NeighbourhoodCleaningRule(verbose=verbose) #US = BalanceCascade(verbose=verbose) #svm_args={'class_weight': 'auto'} #US = SMOTE(kind='svm', verbose=verbose, **svm_args) ##US = SMOTE(kind='regular', verbose=verbose) #US = SMOTETomek(verbose =verbose) usx, usy = US.fit_transform((X_train), y_train) #usx = X_train #usy = y_train