def under_sampling(X, y, method):
    """Under-sample ``(X, y)`` with the sampler selected by ``method``.

    Args:
        X: Feature matrix handed to the sampler's ``fit_resample``.
        y: Target labels handed to the sampler's ``fit_resample``.
        method: Name of the under-sampler to use. One of
            ``'ClusterCentroids'``, ``'RandomUnderSampler'``, ``'NearMiss'``,
            ``'EditedNearestNeighbours'``,
            ``'RepeatedEditedNearestNeighbours'``, ``'AllKNN'``,
            ``'NeighbourhoodCleaningRule'`` or ``'OneSidedSelection'``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by the sampler,
        which is built with its default parameters.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
    elif method == 'NearMiss':
        model = NearMiss()
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
    elif method == 'AllKNN':
        model = AllKNN()
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
    else:
        # Without this branch an unknown name surfaced later as an
        # UnboundLocalError that said nothing about the real cause.
        raise ValueError(
            "Unsupported under-sampling method {!r}".format(method))

    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
def test_allknn_fit_resample_with_indices():
    """Compare AllKNN's resampled data and kept indices with fixed ground truth."""
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])
    expected_idx = np.array([
        6, 13, 32, 39,
        4, 5, 14, 16, 22, 23, 24, 30, 37,
        2, 11, 12, 17, 20, 21, 25, 26, 28, 31, 33, 34, 35, 36,
    ])

    sampler = AllKNN(return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_allclose(y_resampled, expected_y, rtol=R_TOL)
    assert_allclose(idx_under, expected_idx, rtol=R_TOL)
def test_allknn_fit_sample():
    """Check the output of ``fit_sample`` against hard-coded ground truth."""
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    # Resample the module-level fixture with a seeded sampler.
    sampler = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_allclose(y_resampled, expected_y, rtol=R_TOL)
def fit(self, c_data, x_data, y_data):
    """Optionally resample the training data, then fit ``self.clf`` on it.

    ``self.samplesize`` is reset and records ``len(x_data)`` at the start
    and after each enabled preprocessing stage.

    Args:
        c_data: Data forwarded to ``self.mask_cal`` together with ``y_data``.
        x_data: Training features.
        y_data: Training targets.
    """
    # Track how the size of the training sample evolves.
    self.samplesize = [len(x_data)]

    if self.reject_by_calendar:
        # The calendar mask is computed, but rows are deliberately NOT
        # filtered: keeping them might improve the second classifier's
        # training. The call stays in case mask_cal has side effects.
        self.mask_cal(c_data, y_data)
        self.samplesize.append(len(x_data))

    if self.use_resampling:
        # Under-sample first ...
        x_data, y_data = AllKNN().fit_sample(x_data, y_data)
        self.samplesize.append(len(x_data))
        # ... then over-sample.
        x_data, y_data = SMOTEENN().fit_sample(x_data, y_data)
        self.samplesize.append(len(x_data))

    # Train the classifier on the (possibly resampled) data only.
    if self.use_weights:
        try:
            self.clf.fit(x_data, y_data, self.get_weights(y_data))
        except TypeError:
            # print(...) with one string argument behaves the same on
            # Python 2 and Python 3, unlike the old print statement.
            print("The classifier selected does not admit weights for training samples")
            print("Switching to no weights")
            self.use_weights = False
            self.clf.fit(x_data, y_data)
    else:
        self.clf.fit(x_data, y_data)
def test_allknn_sample_wt_fit():
    """Calling ``sample`` on an unfitted AllKNN must raise ``RuntimeError``."""
    unfitted_sampler = AllKNN(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)
def test_allknn_fit_resample_with_nn_object():
    """Check AllKNN in ``'mode'`` selection when given a NearestNeighbors object."""
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-0.35946678, 0.72510189],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [-0.28479268, 0.70459548],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [1.32319756, -0.13181616],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    neighbours = NearestNeighbors(n_neighbors=4)
    sampler = AllKNN(n_neighbors=neighbours, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    """Build the under-sampler identified by the short code ``name``.

    Args:
        name: One of ``"rus"`` (RandomUnderSampler), ``"nm"`` (NearMiss),
            ``"enn"`` (EditedNearestNeighbours), ``"renn"``
            (RepeatedEditedNearestNeighbours), ``"allknn"`` (AllKNN) or
            ``"tl"`` (TomekLinks).
        ratio: Forwarded as ``ratio=`` to ``"rus"`` and ``"nm"`` only; the
            other samplers are built without it.
        random_state: Forwarded to every sampler.
        return_indices: Forwarded to every sampler.
        **kwargs: Extra keyword arguments forwarded to the constructor.

    Returns:
        The newly constructed (unfitted) sampler.

    Raises:
        ValueError: If ``name`` is not one of the supported codes.
    """
    common = dict(return_indices=return_indices,
                  random_state=random_state,
                  **kwargs)

    if name == "rus":
        return RandomUnderSampler(ratio=ratio, **common)
    if name == "nm":
        return NearMiss(ratio=ratio, **common)
    if name == "enn":
        return EditedNearestNeighbours(**common)
    if name == "renn":
        return RepeatedEditedNearestNeighbours(**common)
    if name == "allknn":
        return AllKNN(**common)
    if name == "tl":
        return TomekLinks(**common)

    # Same exception type as before, now saying what went wrong.
    raise ValueError(
        "Unknown sampler name {!r}; expected one of "
        "'rus', 'nm', 'enn', 'renn', 'allknn', 'tl'".format(name))
class ResamplingAlgorithms(Enum):
    """Resampling algorithms, each stored as ``(display name, sampler instance)``."""

    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    # The right-hand side is evaluated before the member name is bound, so
    # ``AllKNN()`` here still refers to the sampler class.
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Members are scanned in definition order; ``RO`` is returned when
        nothing matches.
        """
        for algorithm in cls:
            if algorithm.value[0] == name:
                return algorithm
        return cls.RO
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the named sampler and report class counts.

    Prints the list of available sampler names, the per-class counts of
    ``y`` before and after resampling, and a completion banner.

    Args:
        X: Feature matrix passed to ``fit_resample``.
        y: Target labels passed to ``fit_resample``.
        sampler: Name of the sampler class to use; it is constructed with
            default parameters. Defaults to ``"RandomUnderSampler"``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair from the chosen sampler.

    Raises:
        KeyError: If ``sampler`` is not one of the available names.
    """
    # Single source of truth for the supported samplers. Classes are stored
    # (not instances) so only the selected sampler is ever constructed.
    sampler_classes = (
        ("RandomUnderSampler", RandomUnderSampler),
        ("ClusterCentroids", ClusterCentroids),
        ("NearMiss", NearMiss),
        ("InstanceHardnessThreshold", InstanceHardnessThreshold),
        ("CondensedNearestNeighbour", CondensedNearestNeighbour),
        ("EditedNearestNeighbours", EditedNearestNeighbours),
        ("RepeatedEditedNearestNeighbours", RepeatedEditedNearestNeighbours),
        ("AllKNN", AllKNN),
        ("NeighbourhoodCleaningRule", NeighbourhoodCleaningRule),
        ("OneSidedSelection", OneSidedSelection),
    )

    # List of all samplers, in case the caller wants to iterate over them.
    print([name for name, _ in sampler_classes])

    # Samplers currently take no parameters; an unknown name raises KeyError.
    chosen = dict(sampler_classes)[sampler]()

    # Class counts before and after resampling.
    print("before", sorted(Counter(y).items()))
    X_resampled, y_resampled = chosen.fit_resample(X, y)
    print("after", sorted(Counter(y_resampled).items()))
    print('===' * 4, 'under_sample finished')
    return X_resampled, y_resampled
def getsampler(self, type):
    """Create the sampler registered under the key ``type``.

    ``'none'`` and ``'quality'`` return their sampler untouched. Every
    other sampler gets ``random_state`` set to ``self.random_state`` when
    its parameters include one. An unsupported key prints a message and
    terminates via ``exit(1)``.
    """
    # These two are returned as-is, with no random_state handling.
    if type == 'none':
        return NoSampler()
    if type == 'quality':
        # and self.quality_model_selection_type == 'extended':
        return QualitySampler(self.n_init)

    if type == 'randomunder':
        chosen = RandomUnderSampler()
    elif type == 'nearmiss':
        chosen = NearMiss()
    elif type == 'allknn':
        chosen = AllKNN()
    elif type == 'condensednn':
        chosen = CondensedNearestNeighbour()
    elif type == 'editednn':
        chosen = EditedNearestNeighbours()
    elif type == 'repeatededitednn':
        chosen = RepeatedEditedNearestNeighbours()
    elif type == 'tomeklinks':
        chosen = TomekLinks()
    elif type == 'randomover':
        chosen = RandomOverSampler()
    elif type == 'smote':
        chosen = SMOTE()
    elif type == 'adasyn':
        chosen = ADASYN()
    elif type == 'smotenc':
        chosen = SMOTENC()
    else:
        print("Unsupported sampler %s" % type)
        exit(1)

    if 'random_state' in chosen.get_params():
        chosen.set_params(random_state=self.random_state)
    return chosen
def test_continuous_error():
    """Fitting AllKNN on a continuous target must emit a ``UserWarning``."""
    # 40 evenly spaced values in [0, 1] stand in for a continuous target.
    continuous_target = np.linspace(0, 1, 40)
    sampler = AllKNN(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)
def test_allknn_init():
    """Check the attributes of an AllKNN built with only ``random_state``."""
    allknn = AllKNN(random_state=RND_SEED)
    expected_attributes = (
        ('n_neighbors', 3),
        ('kind_sel', 'all'),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(allknn, attribute), expected)
def test_all_knn_allow_minority():
    """``allow_minority=True`` must leave fewer samples than the default."""
    X, y = make_classification(
        n_samples=10000,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=3,
        n_clusters_per_class=1,
        weights=[0.2, 0.3, 0.5],
        class_sep=0.4,
        random_state=0,
    )

    _, y_minority_allowed = AllKNN(allow_minority=True).fit_resample(X, y)
    _, y_default = AllKNN().fit_resample(X, y)

    assert len(y_minority_allowed) < len(y_default)
def Resampling(train_x, train_y, resampling_method):
    """Label-encode ``train_y.data`` and resample the training set in place.

    The sampler is chosen by ``resampling_method``. On success
    ``train_x.data`` and ``train_y.data`` are overwritten with the resampled
    features and the (label-encoded) resampled targets. Nothing is returned.

    Args:
        train_x: Object whose ``data`` attribute holds the features.
        train_y: Object whose ``data`` attribute holds the targets.
        resampling_method: Sampler name. Under-samplers:
            ``"ClusterCentroids"``, ``"CondensedNearestNeighbour"``,
            ``"EditedNearestNeighbours"``,
            ``"RepeatedEditedNearestNeighbours"``, ``"AllKNN"``,
            ``"NearMiss"``, ``"NeighbourhoodCleaningRule"``,
            ``"RandomUnderSampler"``, ``"TomekLinks"``. Over-samplers:
            ``"BorderlineSMOTE"``, ``"KMeansSMOTE"``,
            ``"RandomOverSampler"``, ``"SMOTE"``.

    Raises:
        ValueError: If ``resampling_method`` is not a supported name; the
            inputs are left untouched in that case.
    """
    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True, n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)
    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        # This branch used to test "RandomUnderSampler" again, which
        # replaced the under-sampler with an over-sampler and made
        # "RandomOverSampler" unreachable.
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)
    else:
        raise ValueError(
            "Unsupported resampling method {!r}".format(resampling_method))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Summarize distribution: uncomment the line below to display the pie
    # chart of the class distribution before resampling.
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # Transform the dataset.
    train_x.data, train_y.data = resample.fit_resample(train_x.data, train_y.data)
def test_allknn_fit_single_class():
    """Fitting AllKNN on a target with a single class must emit a ``UserWarning``."""
    sampler = AllKNN(random_state=RND_SEED)
    # Deliberately degenerate target: every sample belongs to class 0.
    all_zero_target = np.zeros(X.shape[0])
    assert_warns(UserWarning, sampler.fit, X, all_zero_target)
def test_allknn_sample_wrong_X():
    """``sample`` must raise ``RuntimeError`` for data that differs from the fitted data."""
    fitted_sampler = AllKNN(random_state=RND_SEED)
    fitted_sampler.fit(X, Y)

    # Unrelated random features with a balanced binary target.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted_sampler.sample, other_X, other_y)
def test_allknn_init():
    """Verify the attribute values of a freshly constructed AllKNN."""
    sampler = AllKNN(random_state=RND_SEED)
    observed = (sampler.n_neighbors, 'n_neighbors'), (sampler.kind_sel, 'kind_sel')
    assert_equal(observed[0][0], 3)
    assert_equal(observed[1][0], 'all')
    assert_equal(sampler.n_jobs, -1)
    assert_equal(sampler.random_state, RND_SEED)
def resample(self, X, y, by, random_state=None, visualize=False):
    '''
    Re-sample ``(X, y)`` with the method named by ``by``.

    by: String
        The re-sampling method. Currently supported:
        ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS', 'NM', 'CC',
         'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek', 'ORG']
        'ORG' returns the data unchanged.
    random_state:
        Passed as ``random_state`` to the selected sampler.
    visualize: Boolean
        When truthy, scatter-plot columns 0 and 1 of the result,
        coloured by label.

    Returns the (possibly re-sampled) ``X_train, y_train`` pair.
    '''
    if by == 'ORG':
        # Original data: nothing to fit.
        X_train, y_train = X, y
    else:
        if by == 'RUS':
            sampler_cls = RandomUnderSampler
        elif by == 'CNN':
            sampler_cls = CondensedNearestNeighbour
        elif by == 'ENN':
            sampler_cls = EditedNearestNeighbours
        elif by == 'NCR':
            sampler_cls = NeighbourhoodCleaningRule
        elif by == 'Tomek':
            sampler_cls = TomekLinks
        elif by == 'ALLKNN':
            sampler_cls = AllKNN
        elif by == 'OSS':
            sampler_cls = OneSidedSelection
        elif by == 'NM':
            sampler_cls = NearMiss
        elif by == 'CC':
            sampler_cls = ClusterCentroids
        elif by == 'SMOTE':
            sampler_cls = SMOTE
        elif by == 'ADASYN':
            sampler_cls = ADASYN
        elif by == 'BorderSMOTE':
            sampler_cls = BorderlineSMOTE
        elif by == 'SMOTEENN':
            sampler_cls = SMOTEENN
        elif by == 'SMOTETomek':
            sampler_cls = SMOTETomek
        else:
            raise Error("Unexpected 'by' type {}".format(by))
        fitted = sampler_cls(random_state=random_state)
        X_train, y_train = fitted.fit_resample(X, y)

    if visualize:
        frame = pd.DataFrame(X_train)
        frame['label'] = y_train
        frame.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm',
                           title='{} training set'.format(by))
    return X_train, y_train
def resample(self):
    """Resample ``self.X`` / ``self.y`` in place using AllKNN and then SMOTE.

    AllKNN under-sampling is applied first and its output is then
    over-sampled with SMOTE. Both steps overwrite ``self.X`` and ``self.y``.
    """
    print("Sampling data...")

    # Under-sampling.
    # NOTE(review): imbalanced-learn's cleaning samplers are documented as
    # not accepting a dict-valued sampling_strategy - confirm that
    # {28: 565} is accepted by the installed version.
    under_sampler = AllKNN(sampling_strategy={28: 565})
    self.X, self.y = under_sampler.fit_resample(self.X, self.y)

    # Over-sampling. ``sampling_strategy`` replaces the deprecated ``ratio``
    # keyword, matching the AllKNN call above.
    over_sampler = SMOTE(sampling_strategy="all")
    self.X, self.y = over_sampler.fit_resample(self.X, self.y)
def __init__(self, name):
    """Store ``name`` and build the matching resampler in ``self.strategie``.

    ``self.strategie`` stays ``None`` when ``name`` is not one of:
    ``"enn"``, ``"allknn"``, ``"renn"``, ``"tomek"``, ``"smote"``,
    ``"bdsmote"``, ``"adasyn"``, ``"smoteenn"``, ``"smotetomek"``.
    """
    self.strategie = None
    self.name = name

    # Keyword sets shared by several constructors below.
    knn_cleaning = dict(sampling_strategy='auto', n_neighbors=3,
                        kind_sel='all', n_jobs=-1)
    seeded = dict(n_jobs=-1, random_state=42)

    chosen = None
    if name == "enn":
        chosen = EditedNearestNeighbours(**knn_cleaning)
    elif name == "allknn":
        chosen = AllKNN(allow_minority=False, **knn_cleaning)
    elif name == "renn":
        chosen = RepeatedEditedNearestNeighbours(max_iter=100, **knn_cleaning)
    elif name == "tomek":
        chosen = TomekLinks(sampling_strategy='auto', n_jobs=-1)
    elif name == "smote":
        chosen = SMOTE(sampling_strategy='auto', k_neighbors=5, **seeded)
    elif name == "bdsmote":
        chosen = BorderlineSMOTE(**seeded)
    elif name == "adasyn":
        chosen = ADASYN(sampling_strategy='auto', n_neighbors=5, **seeded)
    elif name == "smoteenn":
        chosen = SMOTEENN(sampling_strategy='auto', smote=None, enn=None,
                          **seeded)
    elif name == "smotetomek":
        chosen = SMOTETomek(sampling_strategy='auto', smote=None, tomek=None,
                            **seeded)
    self.strategie = chosen
def all_KNN(X, Y):
    """Run AllKNN on ``(X, Y)`` and return a 0/1 mask of the retained samples.

    Returns:
        A ``(True, mask)`` tuple where ``mask`` is an integer array of
        length ``len(Y)`` holding 1 at every index listed in the fitted
        sampler's ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import AllKNN

    allknn = AllKNN()
    allknn.fit_resample(X, Y)

    mask = np.zeros(len(Y), dtype=int)
    # One vectorised assignment replaces a per-element ``i in indexes``
    # scan, which cost O(len(Y) * len(indexes)).
    mask[allknn.sample_indices_] = 1
    return True, mask
def test_allknn_fit_sample_mode():
    """Check ``fit_sample`` with ``kind_sel='mode'`` against stored ground truth."""
    sampler = AllKNN(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground-truth arrays live in the ``data`` folder beside this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x_mode.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y_mode.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
def undersampling(type):
    """Under-sample the module-level ``train`` / ``label`` data.

    Args:
        type: Which under-sampler to use: ``'random'``
            (RandomUnderSampler), ``'knn'`` (AllKNN) or ``'centroids'``
            (ClusterCentroids). All are built with ``ratio='majority'``.

    Returns:
        An ``(x, y)`` pair of DataFrames: ``x`` has the columns of
        ``train`` and ``y`` has the single column ``'is_attributed'``.

    Raises:
        ValueError: If ``type`` is not one of the supported values.
    """
    if type == 'random':
        und = RandomUnderSampler(ratio='majority', random_state=42)
    elif type == 'knn':
        und = AllKNN(ratio='majority', random_state=42, n_jobs=4)
    elif type == 'centroids':
        und = ClusterCentroids(ratio='majority', n_jobs=-1)
    else:
        # Previously an unknown value failed later with UnboundLocalError.
        raise ValueError(
            "Unknown undersampling type {!r}; expected 'random', 'knn' "
            "or 'centroids'".format(type))

    x, y = und.fit_sample(train, label)
    x = pd.DataFrame(x, columns=train.columns.values)
    y = pd.DataFrame(y, columns=['is_attributed'])
    return x, y
def all_KNN(X, Y):
    """Run AllKNN on ``(X, Y)`` and return a 0/1 mask of the retained samples.

    Returns:
        A ``(True, mask)`` tuple where ``mask`` is an integer array of
        length ``len(X)`` holding 1 at every index listed in the fitted
        sampler's ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import AllKNN

    allknn = AllKNN()
    allknn.fit_resample(X, Y)

    # Vectorised replacement for the per-row ``i in indexes`` membership
    # loop, which rescanned the index array for every row of X.
    mask = np.zeros(len(X), dtype=int)
    mask[allknn.sample_indices_] = 1
    return True, mask
def test_allknn_fit_sample():
    """Check ``fit_sample`` output against the stored ``.npy`` ground truth."""
    sampler = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground-truth arrays live in the ``data`` folder beside this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
def aiiknn(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Under-sample ``(X, y)`` with a default AllKNN and optionally plot the result.

    Args:
        X: Feature matrix passed to ``fit_resample``.
        y: Target labels passed to ``fit_resample``.
        visualize: When truthy, call ``hist_over_and_undersampling`` and
            ``pca_general`` on the resampled data.
        pca2d: Forwarded to ``pca_general`` as ``d2``.
        pca3d: Forwarded to ``pca_general`` as ``d3``.
        tsne: Accepted for interface compatibility; not used by this
            function.
        pie_evr: Forwarded to ``pca_general`` as ``pie_evr``.

    Returns:
        The resampled ``(X_res, y_res)`` pair.
    """
    X_res, y_res = AllKNN().fit_resample(X, y)

    # Plain truthiness test instead of comparing with ``== True``.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_allknn_fit():
    """After ``fit``, AllKNN must expose the expected class statistics."""
    fitted_sampler = AllKNN(random_state=RND_SEED)
    fitted_sampler.fit(X, Y)

    # Minority / majority labels recorded by the fit.
    assert_equal(fitted_sampler.min_c_, 0)
    assert_equal(fitted_sampler.maj_c_, 1)

    # Per-class sample counts recorded by the fit.
    class_counts = fitted_sampler.stats_c_
    assert_equal(class_counts[0], 500)
    assert_equal(class_counts[1], 4500)
def undersample(X_train_org, y_train_org, sampler='AllKNN', size=1000):
    """Undersample the training set data using one of various techniques.

    Args:
        X_train_org: Original training features.
        y_train_org: Original training labels.
        sampler: ``"RandomUnderSampler"`` or ``'AllKNN'`` (default).
        size: Per-class target count (for both the ``True`` and ``False``
            classes) used by ``"RandomUnderSampler"``; ignored by
            ``'AllKNN'``.

    Returns:
        The resampled ``(X_train, y_train)`` pair. The sorted per-class
        counts of ``y_train`` are printed as a side effect.

    Raises:
        ValueError: If ``sampler`` is not a supported name.
    """
    # Select a sampler type.
    if sampler == "RandomUnderSampler":
        samp = RandomUnderSampler(sampling_strategy={True: size, False: size})
    elif sampler == 'AllKNN':
        samp = AllKNN()
    else:
        # Previously an unknown name failed later with UnboundLocalError.
        raise ValueError(
            "Unknown sampler {!r}; expected 'RandomUnderSampler' or "
            "'AllKNN'".format(sampler))

    # Resample the data using the selected sampler.
    X_train, y_train = samp.fit_resample(X_train_org, y_train_org)
    print(sorted(Counter(y_train).items()))
    return X_train, y_train
def fit(self, X, y):
    """Resample ``(X, y)`` with AllKNN and train an XGBoost classifier on it.

    The data is under-sampled with a default ``AllKNN``, split 80/20 with
    ``random_state=1``, and a default ``XGBClassifier`` is fitted on the
    training portion. The held-out portion is not used here.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # Preparation of the arguments for the scikit-learn library methods.
    X_resampled, y_resampled = AllKNN().fit_resample(X, y)
    X_train, _X_test, y_train, _y_test = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=1)

    model = XGBClassifier()
    # The classifier used to be returned without ever being trained,
    # which left the split above as dead code.
    model.fit(X_train, y_train)
    return model
def test_allknn_fit_sample_with_indices():
    """Check ``fit_sample`` data and returned indices against stored ground truth."""
    sampler = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Ground-truth arrays live in the ``data`` folder beside this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'allknn_idx.npy'))

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
    assert_array_almost_equal(idx_under, expected_idx)