def Dados_Balanceados_SMOTE_NearMiss_Sem_Municipio_Orgao():
    """Write three SMOTE + NearMiss resampled copies of the training set.

    The unbalanced one-hot training data (without the municipality and agency
    features) is loaded from an svmlight file. For every configured ratio the
    original data is oversampled with SMOTE, that result is undersampled with
    NearMiss using the same ratio, and the outcome is dumped to its own
    svmlight file.

    Ratio legend left by the original author:
        sampling_strategy = 0.1 ==> 10 x 1
        sampling_strategy = 0.2 ==> 5 x 1
        sampling_strategy = 1   ==> 1 x 1

    NOTE(review): the "5_1" and "1_1" outputs are generated with 0.175 and
    0.5, which do not match the legend above -- confirm this is intended.
    """
    feature_names = Load_Obj('feature_names_onehot_sem_municipio_orgao')
    X_data, y_data = load_svmlight_file(
        'treino_desbalanceado_onehot_sem_municipio_orgao.svm',
        n_features=len(feature_names))  # pylint: disable=unbalanced-tuple-unpacking

    # (sampling ratio, output file), processed in this order.
    runs = (
        (0.1, 'smote_nearmiss_10_1_onehot_sem_municipio_orgao.svm'),
        (0.175, 'smote_nearmiss_5_1_onehot_sem_municipio_orgao.svm'),
        (0.5, 'smote_nearmiss_1_1_onehot_sem_municipio_orgao.svm'),
    )
    for ratio, out_path in runs:
        # Every run starts again from the original, unbalanced data.
        oversampler = SMOTE(random_state=6439, sampling_strategy=ratio)
        X_over, y_over = oversampler.fit_resample(X_data, y_data)

        undersampler = NearMiss(sampling_strategy=ratio)
        X_balanced, y_balanced = undersampler.fit_resample(X_over, y_over)

        dump_svmlight_file(X_balanced, y_balanced, out_path)
def resampling(train_data, train_labels, resampling_type, resampling_stragey):
    """Flatten the training data and rebalance it with an imbalanced-learn sampler.

    Args:
        train_data: Array whose first axis indexes samples. All remaining
            axes are flattened into a single feature axis before resampling.
        train_labels: Labels exposing a ``.values`` attribute; that array is
            what is handed to the sampler.
        resampling_type: One of 'SMOTE', 'over_sampling', 'under_sampling',
            'tomelinks', 'near_miss_neighbors' or 'one_sided_selection'.
        resampling_stragey: ``sampling_strategy`` forwarded to the samplers
            built for 'over_sampling', 'under_sampling' and 'tomelinks'. The
            other types do not use it.

    Returns:
        The ``(train_data_resampled, train_labels_resampled)`` pair produced
        by the sampler's ``fit_resample``.

    Raises:
        ValueError: If ``resampling_type`` is not one of the supported names.
    """
    if resampling_type == 'SMOTE':
        sampler = SMOTE(random_state=42)
    elif resampling_type == 'over_sampling':
        sampler = RandomOverSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'under_sampling':
        sampler = RandomUnderSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'tomelinks':
        sampler = TomekLinks(sampling_strategy=resampling_stragey)
    elif resampling_type == 'near_miss_neighbors':
        sampler = NearMiss(version=1, n_neighbors=3)
    elif resampling_type == 'one_sided_selection':
        sampler = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    else:
        # Previously an unknown type fell through to the return statement and
        # surfaced as an UnboundLocalError.
        raise ValueError(
            "Unknown resampling_type {!r}; expected one of 'SMOTE', "
            "'over_sampling', 'under_sampling', 'tomelinks', "
            "'near_miss_neighbors', 'one_sided_selection'".format(
                resampling_type))

    # Collapse every non-sample axis into one feature axis. Using an explicit
    # product (rather than -1) keeps empty inputs reshaping cleanly.
    n_samples = train_data.shape[0]
    n_features = int(np.prod(train_data.shape[1:]))
    train_data_new = np.reshape(train_data, (n_samples, n_features))

    train_data_resampled, train_labels_resampled = sampler.fit_resample(
        train_data_new, train_labels.values)
    return train_data_resampled, train_labels_resampled
def near_miss2(X, Y):
    """Flag the samples that NearMiss (version 2) undersampling keeps.

    Args:
        X: Feature matrix passed to ``NearMiss.fit_resample``.
        Y: Labels passed to ``NearMiss.fit_resample``; its length sets the
            length of the returned mask.

    Returns:
        ``(True, mask)`` where ``mask`` is an integer array of length
        ``len(Y)`` holding 1 at every index listed in the sampler's
        ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import NearMiss

    sampler = NearMiss(version=2)
    sampler.fit_resample(X, Y)

    # One vectorised assignment replaces the per-index membership test.
    mask = np.zeros(len(Y), dtype=int)
    mask[sampler.sample_indices_] = 1
    return True, mask
def Balancing(self, method=None):
    """Rebalance the training split and store the result on the instance.

    Args:
        method: ``None`` or ``"nearmiss"`` to undersample with ``NearMiss()``,
            ``"smote"`` to oversample with ``SMOTE()``. The value is also
            stored as ``self.method``.

    Returns:
        The ``(self.Xtrainb, self.Ytrainb)`` pair obtained by resampling
        ``self.Xtrain`` / ``self.Y_train``.

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    self.method = method
    if self.method is None or self.method == "nearmiss":
        balance = NearMiss()
    elif self.method == "smote":
        balance = SMOTE()
    else:
        # An unknown method used to leave the balanced attributes untouched
        # (or missing), which only failed later and far from the cause.
        raise ValueError(
            "Unknown balancing method {!r}; expected None, 'nearmiss' or "
            "'smote'".format(method))

    self.Xtrainb, self.Ytrainb = balance.fit_resample(
        self.Xtrain, self.Y_train)
    # The original ended with a bare expression; return the pair explicitly.
    return self.Xtrainb, self.Ytrainb
def near_miss2(X, Y):
    """Return a 0/1 mask of the samples selected by NearMiss (version 2).

    Args:
        X: Feature matrix passed to ``NearMiss.fit_resample``; its length
            sets the length of the returned mask.
        Y: Labels passed to ``NearMiss.fit_resample``.

    Returns:
        ``(True, mask)`` where ``mask`` is an integer array of length
        ``len(X)`` with 1 at the positions found in the sampler's
        ``sample_indices_`` and 0 everywhere else.
    """
    from imblearn.under_sampling import NearMiss

    nm1 = NearMiss(version=2)
    nm1.fit_resample(X, Y)

    # Set the kept positions in one step instead of testing each index
    # against the array of selected indices.
    mask = np.zeros(len(X), dtype=int)
    mask[nm1.sample_indices_] = 1
    return True, mask
def base_model():
    """Train and evaluate the baseline favourites classifier on stored tweets.

    Steps, in order:
        1. Load the "twitter_new" collection through ``MongoHandler`` into a
           DataFrame.
        2. Select the user/tweet feature columns and append a
           ``tweet_per_month`` column.
        3. Bucket the ``favorites`` counts into classes.
        4. Undersample with NearMiss (version 1), then split 70/30.
        5. Run ``k_fold_cv`` on a random forest, fit it, and pass its test
           predictions to ``evaluation``.
        6. Fit a decision tree on the forest's own predictions for the
           training split and hand it to ``tree_feature_importance``.

    Returns:
        None. Results are reported by the helper functions it calls.
    """
    mongo_connect = MongoHandler()
    like_tweets = mongo_connect.retrieve_from_collection("twitter_new")
    df = pd.DataFrame(list(like_tweets))

    # Column / feature selection
    base = df[[
        'user_followers', 'user_friends', 'user_favourites', 'user_months',
        'user_statuses', 'user_verified', 'retweets'
    ]]
    # The +1 on both sides avoids a division by zero for zero-month accounts.
    per_month = round(
        (base['user_statuses'] + 1) / (base['user_months'] + 1), 2)
    per_month = pd.DataFrame(per_month)
    per_month.columns = ['tweet_per_month']
    base = pd.concat([base, per_month], axis=1)
    columns = base.columns.values.tolist()

    # Transform the regression problem into a multi-class classification.
    # Classes: zero, low, medium, high. Values matching no range are left
    # unchanged. All conditions are evaluated on the original counts; the
    # ranges are applied from last to first so that, where two overlap, the
    # earlier range wins exactly as in the original if/elif chain.
    favorites = df['favorites']
    target = favorites.mask(favorites >= 11, 3)
    target = target.mask((favorites > 5) & (favorites < 11), 2)
    target = target.mask((favorites > 0) & (favorites < 6), 1)

    nm1 = NearMiss(version=1)  # Under-sampling technique
    base, target = nm1.fit_resample(base, target)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        base, target, random_state=1, test_size=0.3)

    model = RandomForestClassifier(n_estimators=500, random_state=5)
    k_fold_cv(model, x_train, y_train, False)
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    evaluation(y_test, y_predict)

    # Feature importance / interpretability: the tree is trained on the
    # forest's predictions, not on the true labels.
    new_y_train = model.predict(x_train)
    tree_model = tree.DecisionTreeClassifier(
        criterion="entropy", random_state=5)
    tree_model.fit(x_train, new_y_train)
    # Calls the helper used for interpretable ML.
    tree_feature_importance(tree_model, columns, x_train)
def limpieza(df):
    """Clean the frame and return NearMiss- and SMOTE-balanced train/test splits.

    The "Unnamed: 0" id column is dropped, "misstate" is used as the target
    and every remaining column as a feature. The full data set is resampled
    once with SMOTE and once with NearMiss (version 1); each resampled set is
    then split 70/30 with ``random_state=0``.

    Args:
        df: DataFrame containing the "Unnamed: 0" and "misstate" columns.

    Returns:
        An 8-tuple ``(X_train_nm, X_test_nm, y_train_nm, y_test_nm,
        X_train_sm, X_test_sm, y_train_sm, y_test_sm)``.
    """
    # Required libraries
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from imblearn.under_sampling import NearMiss
    from imblearn.over_sampling import SMOTE

    # Drop the id column, then separate features from the target.
    cleaned = df.drop("Unnamed: 0", axis=1)
    features = cleaned.drop(["misstate"], axis=1)
    labels = cleaned["misstate"]

    # Oversampled (SMOTE) and undersampled (NearMiss) variants of the data.
    X_sm, y_sm = SMOTE().fit_resample(features, labels)
    X_nm, y_nm = NearMiss(version=1).fit_resample(features, labels)

    # Train/test split for each variant.
    smote_split = train_test_split(X_sm, y_sm, test_size=0.30, random_state=0)
    nearmiss_split = train_test_split(
        X_nm, y_nm, test_size=0.30, random_state=0)

    # NearMiss parts first, SMOTE parts second.
    return (*nearmiss_split, *smote_split)
def test_nm_fit_resample_auto():
    """Check NearMiss output for sampling_strategy='auto', per version.

    The expected arrays are indexed by the position of each version in
    ``VERSION_NEARMISS``.
    """
    # Rows shared by the expected output of all three versions.
    shared_head = [[0.91464286, 1.61369212],
                   [-0.80809175, -1.09917302],
                   [-0.20497017, -0.26630228]]
    # Remaining rows for the first two versions (identical expectations).
    tail_first_two = [[-0.05903827, 0.10947647],
                      [0.03142011, 0.12323596],
                      [-0.60413357, 0.24628718],
                      [0.50701028, -0.17636928],
                      [0.4960075, 0.86130762],
                      [0.45713638, 1.31069295]]
    # Remaining rows for the third version.
    tail_third = [[1.17737838, -0.2002118],
                  [-0.60413357, 0.24628718],
                  [0.03142011, 0.12323596],
                  [1.15157493, -1.2981518],
                  [-0.54619583, 1.73009918],
                  [0.99272351, -0.11631728]]

    X_gt = [
        np.array(shared_head + tail_first_two),
        np.array(shared_head + tail_first_two),
        np.array(shared_head + tail_third),
    ]
    y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) for _ in range(3)]

    for version_idx, version in enumerate(VERSION_NEARMISS):
        sampler = NearMiss(sampling_strategy='auto', version=version)
        X_resampled, y_resampled = sampler.fit_resample(X, Y)
        assert_array_equal(X_resampled, X_gt[version_idx])
        assert_array_equal(y_resampled, y_gt[version_idx])
def nearmiss(data, k):
    """Undersample with NearMiss (version 3) and plot before and after.

    Args:
        data: Frame-like object with an 'X' column (one feature vector per
            row) and a 'y' column (labels); both are read via ``.tolist()``.
        k: Target hyper-parameter, forwarded as ``n_neighbors_ver3``.

    Returns:
        A DataFrame with columns 'y' (resampled labels) and 'X' (resampled
        feature vectors as lists).
    """
    features = np.array(data['X'].tolist())
    labels = np.array(data['y'].tolist())

    # Extremes of the first two feature columns of the ORIGINAL data; the
    # same four values are passed to plot_scatter for both plots.
    first_col = features[:, 0]
    second_col = features[:, 1]
    max_X, min_X = np.max(first_col), np.min(first_col)
    max_y, min_y = np.max(second_col), np.min(second_col)

    counter = Counter(labels)
    print("Before undersampling:", counter)
    plot_scatter(features, labels, counter, max_X, min_X, max_y, min_y)

    # Perform the NearMiss sampling.
    sampler = NearMiss(version=3, n_neighbors_ver3=k)
    X_sampled, y_sampled = sampler.fit_resample(features, labels)
    sample_data = pd.DataFrame({'y': y_sampled, 'X': X_sampled.tolist()})

    counter = Counter(y_sampled)
    print("After undersampling:", counter)
    plot_scatter(X_sampled, y_sampled, counter, max_X, min_X, max_y, min_y)
    return sample_data
def test_nm_fit_resample_auto():
    """Compare NearMiss(sampling_strategy='auto') against fixed expectations.

    One expected sample matrix is stored per entry of ``VERSION_NEARMISS``
    and looked up by position; the expected labels are the same every time.
    """
    sampling_strategy = 'auto'

    # The first three expected rows are identical for every version.
    common_rows = [[0.91464286, 1.61369212],
                   [-0.80809175, -1.09917302],
                   [-0.20497017, -0.26630228]]
    # Versions at positions 0 and 1 expect the same remaining rows.
    rows_pos_0_and_1 = common_rows + [[-0.05903827, 0.10947647],
                                      [0.03142011, 0.12323596],
                                      [-0.60413357, 0.24628718],
                                      [0.50701028, -0.17636928],
                                      [0.4960075, 0.86130762],
                                      [0.45713638, 1.31069295]]
    rows_pos_2 = common_rows + [[1.17737838, -0.2002118],
                                [-0.60413357, 0.24628718],
                                [0.03142011, 0.12323596],
                                [1.15157493, -1.2981518],
                                [-0.54619583, 1.73009918],
                                [0.99272351, -0.11631728]]

    X_gt = [np.array(rows)
            for rows in (rows_pos_0_and_1, rows_pos_0_and_1, rows_pos_2)]
    expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    y_gt = [expected_labels, expected_labels, expected_labels]

    for version_idx, version in enumerate(VERSION_NEARMISS):
        nm = NearMiss(sampling_strategy=sampling_strategy, version=version)
        X_resampled, y_resampled = nm.fit_resample(X, Y)
        assert_array_equal(X_resampled, X_gt[version_idx])
        assert_array_equal(y_resampled, y_gt[version_idx])
def NearMiss_us(X_train, Y_train, seed, sampling_strategy, n_neighbors=3,
                n_neighbors_ver3=3, version=1):
    """Undersample the training set with NearMiss, then shuffle it.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        seed: Seed handed to ``shuffle_dataset``. It is not passed to
            NearMiss: ``random_state`` was deprecated for this sampler in
            imbalanced-learn 0.4 and later removed.
        sampling_strategy: A string is forwarded unchanged; any other value
            is first converted by ``compute_sampling_strategy(...,
            'undersampling')``.
        n_neighbors: Forwarded to ``NearMiss``.
        n_neighbors_ver3: Forwarded to ``NearMiss``.
        version: NearMiss version, forwarded to ``NearMiss``.

    Returns:
        ``(X_train_resampled, Y_train_resampled)`` as returned by
        ``shuffle_dataset``.
    """
    if not isinstance(sampling_strategy, str):
        sampling_strategy = compute_sampling_strategy(
            sampling_strategy, Y_train, 'undersampling')

    # `random_state` and `ratio` are intentionally not passed: both were
    # deprecated in imbalanced-learn 0.4 and are rejected by later releases.
    # `ratio=None` was the default anyway, so dropping them does not change
    # the resampling result.
    nm = NearMiss(version=version,
                  n_neighbors=n_neighbors,
                  n_neighbors_ver3=n_neighbors_ver3,
                  n_jobs=-1,
                  sampling_strategy=sampling_strategy)

    print('Before NearMiss version ' + str(version) + ' undersampling : ',
          sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = nm.fit_resample(X_train, Y_train)
    print('After NearMiss version ' + str(version) + ' undersampling : ',
          sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)
    return X_train_resampled, Y_train_resampled
def nearmiss1(X, y):
    """Undersample with NearMiss-1 (3 neighbours), report and plot the result.

    Prints the class counts after resampling under the "NearMiss-1" tag and
    passes the resampled data plus those counts to ``plot_dataset``.
    Nothing is returned.
    """
    sampler = NearMiss(version=1, n_neighbors=3)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    class_counts = Counter(y_resampled)
    print("NearMiss-1", class_counts)
    plot_dataset(X_resampled, y_resampled, class_counts)
def under_sampling(X,Y,ss='not minority'):
    """Undersample ``X``/``Y`` with NearMiss using strategy ``ss``.

    Args:
        X: Feature matrix.
        Y: Label array; it is flattened with ``ravel()`` before use.
        ss: ``sampling_strategy`` for ``NearMiss``. When it is a dict
            (class -> requested sample count), a copy is made and each
            requested count is capped at the number of samples that class
            actually has in ``Y``. The caller's dict is never modified.

    Returns:
        ``(data, labels)`` as returned by ``NearMiss.fit_resample``.
    """
    Y = Y.ravel()
    if isinstance(ss, dict):
        ss = ss.copy()
        for label, count in zip(*np.unique(Y, return_counts=True)):
            # Only cap the classes the caller asked for; a class missing from
            # the dict used to raise KeyError here.
            if label in ss:
                ss[label] = min(ss[label], count)
    sampler = NearMiss(sampling_strategy=ss)
    data, labels = sampler.fit_resample(X, Y)
    return data, labels
def classify(self, X, type: str, classifier: str, test_prop: float, res: None, res_method: None):
    """Train one classifier on ``X`` against ``self.df['class']`` and report on it.

    Args:
        X: Feature matrix aligned with ``self.df``.
        type: 'binary' (labels are ``self.df['class']`` with 0 replaced by 1)
            or 'multi' (labels used unchanged).
        classifier: 'lr', 'svc', 'rf', 'xgb' or 'ada'. The 'rf', 'xgb' and
            'ada' choices prompt on stdin for the number of trees.
        test_prop: Fraction of the data held out as the test split; the
            split is stratified on the labels.
        res: When equal to ``True`` the training split is resampled first.
        res_method: 'down' (NearMiss) or 'up' (ADASYN); only read when
            ``res`` is ``True``.

    Returns:
        The fitted model.

    Raises:
        TypeError: For an unknown ``type``, ``classifier`` or ``res_method``
            (exception type kept from the original for compatibility).
    """
    if type == 'binary':
        y = self.df['class'].replace(0,1)
    elif type == 'multi':
        y = self.df['class']
    else:
        raise TypeError("Choose a proper type of classification")

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=test_prop, stratify=y)

    # Data the model is fitted on: the plain training split unless
    # resampling is requested.
    X_fit, Y_fit = X_train, Y_train
    if res == True:  # noqa: E712 -- equality kept so 1/True behave as before
        if res_method == 'down':
            nm = NearMiss()
            X_fit, Y_fit = nm.fit_resample(X_train, Y_train)
        elif res_method == 'up':
            sm = ADASYN()
            X_fit, Y_fit = sm.fit_resample(X_train, Y_train)
        else:
            raise TypeError("Resampling method not provided. Please use 'up' for oversampling or 'down' for undersampling.")

    if classifier == 'lr':
        model = LogisticRegression(solver='liblinear', class_weight='balanced', C=0.04, penalty='l2')
    elif classifier == 'svc':
        model = LinearSVC(C=0.004, penalty='l2')
    elif classifier == 'rf':
        n_est = int(input("Type in number of trees to estimate from: ").strip())
        model = RandomForestClassifier(n_estimators=n_est, bootstrap=True, max_depth=5)
    elif classifier == 'xgb':
        n_est = int(input("Type in number of trees to estimate from: ").strip())
        # `reg_lambda` was misspelled `reg_lamba`, so the L2 regularisation
        # setting never took effect.
        # NOTE(review): `bootstrap` does not look like an XGBoost parameter;
        # it is kept unchanged here -- confirm and drop if unused.
        model = XGBClassifier(n_estimators=n_est, bootstrap=True, max_depth=5, reg_lambda=0.4)
    elif classifier == 'ada':
        n_est = int(input("Type in number of trees to estimate from: ").strip())
        model = AdaBoostClassifier(n_estimators=n_est, learning_rate=0.005)
    else:
        raise TypeError("Choose a proper classifier. Possible inputs: 'lr', 'svc', 'rf', 'xgb', 'ada' .")

    model.fit(X_fit, Y_fit)
    Y_pred = model.predict(X_test)

    # Accuracy percentage
    print(f"Accuracy is {round(accuracy_score(Y_test, Y_pred), 2)*100}%")
    # Classification report: true labels first, predictions second. The
    # arguments were swapped, which exchanged precision and recall.
    print(classification_report(Y_test, Y_pred))
    # Matthews correlation coefficient
    print(f"Matthew's Correlation Coefficient is {matthews_corrcoef(Y_test, Y_pred)}")
    # Confusion-matrix plot
    plot_confusion_matrix(Y_test, Y_pred, figsize=(10,10))
    return model
def test_nm_wrong_nn_obj():
    """An invalid neighbours argument must make fit_resample raise ValueError.

    Two cases are checked: a bad ``n_neighbors`` and, for version 3, a bad
    ``n_neighbors_ver3`` next to a valid ``n_neighbors`` estimator.
    """
    expected_message = "has to be one of"
    invalid_neighbors = 'rnd'

    # Case 1: n_neighbors itself is invalid.
    # NOTE(review): `version` receives VERSION_NEARMISS as a whole, exactly
    # as in the original test -- confirm this is intended.
    sampler = NearMiss(sampling_strategy='auto',
                       version=VERSION_NEARMISS,
                       return_indices=True,
                       n_neighbors=invalid_neighbors)
    with raises(ValueError, match=expected_message):
        sampler.fit_resample(X, Y)

    # Case 2: valid n_neighbors estimator, invalid n_neighbors_ver3.
    valid_neighbors = NearestNeighbors(n_neighbors=3)
    sampler_v3 = NearMiss(sampling_strategy='auto',
                          version=3,
                          return_indices=True,
                          n_neighbors=valid_neighbors,
                          n_neighbors_ver3=invalid_neighbors)
    with raises(ValueError, match=expected_message):
        sampler_v3.fit_resample(X, Y)
def test_nm_wrong_nn_obj():
    """fit_resample must reject neighbour arguments of an unsupported kind.

    Each configuration below is expected to raise ``ValueError`` with a
    message matching "has to be one of".
    """
    bad_value = 'rnd'
    configurations = (
        # Invalid n_neighbors.
        # NOTE(review): the whole VERSION_NEARMISS object is passed as
        # `version`, mirroring the original test -- confirm intent.
        dict(version=VERSION_NEARMISS, n_neighbors=bad_value),
        # Version 3 with a valid n_neighbors and an invalid n_neighbors_ver3.
        dict(version=3,
             n_neighbors=NearestNeighbors(n_neighbors=3),
             n_neighbors_ver3=bad_value),
    )
    for extra_kwargs in configurations:
        sampler = NearMiss(sampling_strategy='auto',
                           return_indices=True,
                           **extra_kwargs)
        with raises(ValueError, match="has to be one of"):
            sampler.fit_resample(X, Y)
def undersample_NearMiss(df, variant=2, debug=True):
    """Undersample a DataFrame whose last column is the class label.

    Args:
        df: DataFrame; every column but the last is a feature, the last
            column is the label and is cast to ``int``.
        variant: NearMiss version, forwarded as ``version``.
        debug: When truthy, print the class counts before and after.

    Returns:
        A new DataFrame with the same column names and order as ``df``
        holding the resampled rows.
    """
    X = df.values[:, :-1]
    y = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(y))

    # `random_state` is not passed: it was deprecated for NearMiss in
    # imbalanced-learn 0.4 and is rejected by later releases.
    nm = NearMiss(version=variant)
    X_res, y_res = nm.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_res, columns=df.columns[:-1])
    # Put the label back as the last column under its original name.
    df_resampled.insert(len(df_resampled.columns), df.columns[-1], y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
def under_sampling_shift(x, y, delta=0.5):
    """Shrink every class to ``delta`` times its size using NearMiss-2.

    Args:
        x: Feature matrix.
        y: Labels; squeezed before the per-class counts are taken.
        delta: Fraction of each class to request. The requested count per
            class is ``int(delta * class_count)``.

    Returns:
        ``(x_resampled, y_resampled)`` from ``NearMiss.fit_resample``.
    """
    labels = np.unique(y)
    class_sizes = Counter(np.squeeze(y))
    requested = {label: int(delta * class_sizes[label]) for label in labels}

    # Version 3 subsamples while respecting the initial data structure more,
    # but gives less control over the final number of samples (initial
    # resampling phase), so version 2 is used as the default.
    sampler = NearMiss(version=2, sampling_strategy=requested)
    x_resampled, y_resampled = sampler.fit_resample(x, y)
    return x_resampled, y_resampled
def near_miss(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Undersample with default NearMiss and optionally visualise the result.

    Args:
        X: Feature matrix.
        y: Labels.
        visualize: When equal to ``True``, plot the resampled labels with
            ``hist_over_and_undersampling`` and run ``pca_general``.
        pca2d: Forwarded to ``pca_general`` as ``d2``.
        pca3d: Forwarded to ``pca_general`` as ``d3``.
        tsne: Accepted but not used by this function.
        pie_evr: Forwarded to ``pca_general``.

    Returns:
        ``(X_res, y_res)``, the resampled data.
    """
    sampler = NearMiss()
    X_res, y_res = sampler.fit_resample(X, y)

    # Equality (not truthiness) is kept so only True-like values plot.
    wants_plots = visualize == True  # noqa: E712
    if wants_plots:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def Dados_Balanceados_NearMiss_Sem_Municipio_Orgao():
    """Write three NearMiss-undersampled copies of the training set.

    The unbalanced one-hot training data (without the municipality and agency
    features) is loaded from an svmlight file, undersampled with NearMiss
    once per configured ratio, and each result is dumped to its own svmlight
    file.

    Ratio legend:
        sampling_strategy = 0.1 ==> 10 x 1
        sampling_strategy = 0.2 ==> 5 x 1
        sampling_strategy = 1   ==> 1 x 1
    """
    feature_names = Load_Obj('feature_names_onehot_sem_municipio_orgao')
    X_data, y_data = load_svmlight_file(
        'treino_desbalanceado_onehot_sem_municipio_orgao.svm',
        n_features=len(feature_names))  # pylint: disable=unbalanced-tuple-unpacking

    # (sampling ratio, output file), processed in this order.
    runs = (
        (0.1, 'nearmiss_10_1_onehot_sem_municipio_orgao.svm'),
        (0.2, 'nearmiss_5_1_onehot_sem_municipio_orgao.svm'),
        (1, 'nearmiss_1_1_onehot_sem_municipio_orgao.svm'),
    )
    for ratio, out_path in runs:
        undersampler = NearMiss(sampling_strategy=ratio)
        X_balanced, y_balanced = undersampler.fit_resample(X_data, y_data)
        dump_svmlight_file(X_balanced, y_balanced, out_path)
def undersampling(X, y):
    """Balance the data with NearMiss (version 1).

    Args:
        X: Training set without the class target.
        y: Class target of the training set.

    Returns:
        The resampled ``(X, y)`` pair. The shape of the resampled ``X`` is
        printed before returning.
    """
    sampler = NearMiss(version=1)
    X_balanced, y_balanced = sampler.fit_resample(X, y)
    print('after balancing:', X_balanced.shape)
    return X_balanced, y_balanced
def __resample_data_NearMiss(self):
    '''
    Resample the imbalanced training data with the NearMiss algorithm
    (undersampling).

    The training attributes and labels stored on the instance are replaced
    by the resampled data, both wrapped in DataFrames; the attribute frame
    keeps the original column names.

    :param: None
    :return: None
    '''
    name_train = self.__attributes_train.columns
    print("resampling data...")
    # `random_state` is not passed: it was deprecated for NearMiss in
    # imbalanced-learn 0.4 and is rejected by later releases.
    nm = NearMiss()
    X_train_res, y_train_res = nm.fit_resample(self.__attributes_train,
                                               self.__labels_train)
    self.__attributes_train = pd.DataFrame(X_train_res, columns=name_train)
    self.__labels_train = pd.DataFrame(y_train_res)
    print("[respamling finished]")
def clf_corregido(x_orig, y_orig):
    '''Train a classifier on data corrected for class imbalance.

    The data is undersampled with NearMiss (version 2, 3 neighbours,
    sampling_strategy 0.20), a class-weighted logistic regression is fitted
    on the undersampled data and then evaluated on the ORIGINAL data. The
    classification report is printed and the mean of the first two recall
    values is stored in ``DICC_['clf_corregido']``.

    Returns the fitted classifier.
    '''
    sampler = NearMiss(sampling_strategy=0.20, n_neighbors=3, version=2)
    x_us, y_us = sampler.fit_resample(x_orig, y_orig)

    clf_us = LogisticRegression(random_state=0, class_weight='balanced')
    clf_us = clf_us.fit(x_us, y_us)
    predictions = clf_us.predict(x_orig)

    print(''' -------------------------------------------------------------------------------- ''')
    print(classification_report(y_orig, predictions))

    # The second value returned by `score` is used as the per-class recall
    # (presumably precision_recall_fscore_support -- confirm at the import).
    _, recall_per_class, _, _ = score(y_orig, predictions)
    mean_recall = (recall_per_class[0] + recall_per_class[1]) / 2
    DICC_['clf_corregido'] = mean_recall
    return clf_us
def apply_nearMiss(X, y, near_miss_type=3):
    '''
    Undersample the majority class with the NearMiss algorithm so that it
    matches the sample size of the minority class.

    Args:
        X (np.array): features associated with examples of majority and
            minority classes; shape = number of examples x number of features
        y (np.array): labels of each example; shape = number of examples x 1
        near_miss_type (int): forwarded to ``NearMiss`` as ``n_neighbors``.
            NOTE(review): the name suggests the NearMiss version, but the
            version is fixed at 1 here -- confirm which one was intended.

    Returns:
        X_samp (np.array): features associated with undersampled majority
            examples and all minority class examples;
            shape = 2*number of minority examples x number of features
        y_samp (np.array): labels of each example;
            shape = 2*number of minority examples x 1
    '''
    undersample = NearMiss(version=1, n_neighbors=near_miss_type)
    # The result was bound to a misspelled name (`X_smap`), so the return
    # statement raised NameError on every call.
    X_samp, y_samp = undersample.fit_resample(X, y)
    return X_samp, y_samp
# Baseline: model trained through run_model_balanced, reported under the
# 'Penalizacion' label.
model = run_model_balanced(X_train, X_test, y_train, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Penalizacion')

# Strategy: undersampling of the majority class.
# An algorithm is used to shrink the majority class. It works in a way
# similar to k-nearest neighbours to choose which samples to remove. Note the
# drastic reduction: the 199,020 samples of class zero (the majority) become
# 688, and the model is trained on those samples.
us = NearMiss(sampling_strategy=0.5, n_neighbors=3, version=2)
X_train_res, y_train_res = us.fit_resample(X_train, y_train)
print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')
model = run_model(X_train_res, X_test, y_train_res, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Undersampling')

# Strategy: oversampling of the minority class.
# Here new "synthetic" samples of the minority class are created using
# RandomOverSampler. The fraud class goes from 344 samples to 99,510.
# NOTE(review): the name `os` would shadow the standard-library `os` module
# if this file imports it -- confirm before relying on either.
os = RandomOverSampler(sampling_strategy=0.5)
X_train_res, y_train_res = os.fit_resample(X_train, y_train)
print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')
def down_sampling(x_train, y_train):
    """Undersample the training data with NearMiss (version 1).

    Prints a short notice, resamples, and returns the resampled
    ``(x_train, y_train)`` pair.
    """
    print("Down Sampling My friend.....")
    from imblearn.under_sampling import NearMiss

    sampler = NearMiss(version=1)
    x_down, y_down = sampler.fit_resample(x_train, y_train)
    return x_down, y_down
def underSampling(X, Y):
    """Return ``X`` and ``Y`` undersampled with NearMiss (version 1)."""
    sampler = NearMiss(version=1)
    resampled_features, resampled_labels = sampler.fit_resample(X, Y)
    return resampled_features, resampled_labels
def test_deprecation_random_state():
    """Fitting a NearMiss built with random_state must warn about deprecation."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = NearMiss(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_resample(X, Y)
# Demo: undersample an imbalanced dataset with NearMiss-2 and plot the result.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NearMiss
from matplotlib import pyplot
from numpy import where

# Synthetic two-feature dataset; weights=[0.99] makes it heavily imbalanced.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Class distribution before undersampling.
counter = Counter(y)
print(counter)

# Define the undersampling method and transform the dataset.
undersample = NearMiss(version=2, n_neighbors=3)
X, y = undersample.fit_resample(X, y)

# Class distribution after undersampling.
counter = Counter(y)
print(counter)

# Scatter plot of the examples, one series per class label.
for label in counter:
    row_ix = where(y == label)[0]
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()
def test_nearmiss_wrong_version():
    """An unsupported NearMiss version must raise ValueError on fit."""
    sampler = NearMiss(version=1000)
    with raises(ValueError, match="must be 1, 2 or 3"):
        sampler.fit_resample(X, Y)
# Plot the result of the previous resampling step.
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

# --- Oversampling with ADASYN -------------------------------------------
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=0, n_neighbors=5)
X_resampled, y_resampled = ada.fit_resample(X, y)
np.bincount(y_resampled)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

# --- Undersampling with NearMiss (version 1) ----------------------------
from imblearn.under_sampling import NearMiss
nm = NearMiss(version=1)
# The labels were previously bound to a misspelled name (`y_resample`), so
# the bincount and plots below silently used the ADASYN labels.
X_resampled, y_resampled = nm.fit_resample(X, y)
# Indices of the samples NearMiss kept; `ind` was never defined before.
ind = nm.sample_indices_
np.bincount(y_resampled)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

# Overlay the removed samples (faint crosses) on top of the kept ones.
deleted_ind = np.setdiff1d(np.arange(len(X)), ind)
# The closing parenthesis of this call was missing (syntax error).
plt.scatter(X[deleted_ind,0], X[deleted_ind,1], c=y[deleted_ind],
            marker='x', alpha=0.2)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

# --- Undersampling with One-Sided Selection -----------------------------
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0, n_neighbors=1, n_seeds_S=1)
X_resampled, y_resampled = oss.fit_resample(X, y)
'gender', 'label_emotion', 'polarity', 'label_polarity', 'path', 'source' ], axis=1).to_numpy().squeeze() print('y_full after drop: ', y_full[:5]) print('y_full shape after drop: ', y_full.shape) # reshape to undersample X_full_reshape = X_full.reshape((X_full.shape[0], -1)) print('X_full_under shape: ', X_full_reshape.shape) # create undersampler undersample = NearMiss(sampling_strategy="not minority") print('Starting undersampling...') X_under, y_under = undersample.fit_resample(X_full_reshape, y_full) y_counter = Counter(y_under) print('Information after undersampling') print('y_under count: ', y_counter) print('Under set shapes:') print('X_under shape: ', X_under.shape) print('y_under shape: ', y_under.shape) X_train_reshape, X_test_reshape, y_train, y_test = train_test_split( X_under, y_under, test_size=0.1, shuffle=True, random_state=42) print('Shapes after train_test_splits') print('X_train_reshape: ', X_train_reshape.shape) print('y_train: ', y_train.shape)
def test_deprecation_random_state():
    """NearMiss(random_state=...) must emit a DeprecationWarning when fitted."""
    nearmiss_with_seed = NearMiss(random_state=0)
    deprecation_check = warns(
        DeprecationWarning, match="'random_state' is deprecated from 0.4")
    with deprecation_check:
        nearmiss_with_seed.fit_resample(X, Y)
def test_nearmiss_wrong_version():
    """fit_resample must reject a NearMiss version outside 1, 2 and 3."""
    invalid_version = 1000
    expected_message = "must be 1, 2 or 3"
    nm = NearMiss(version=invalid_version)
    with raises(ValueError, match=expected_message):
        nm.fit_resample(X, Y)