def test_nm_wrong_nn_obj():
    """Check that NearMiss rejects neighbour arguments of an unsupported type.

    A plain string is supplied first as ``n_neighbors`` and then as
    ``n_neighbors_ver3``; ``fit_sample`` must raise ``ValueError`` each time.
    """
    expected_message = "has to be one of"
    ratio = 'auto'

    # Case 1: the string is passed as ``n_neighbors``.
    bad_neighbors = NearMiss(ratio=ratio, random_state=RND_SEED,
                             version=VERSION_NEARMISS, return_indices=True,
                             n_neighbors='rnd')
    with raises(ValueError, match=expected_message):
        bad_neighbors.fit_sample(X, Y)

    # Case 2: ``n_neighbors`` is a valid estimator, but the version-3
    # specific argument is the string.
    invalid_ver3 = 'rnd'
    valid_nn = NearestNeighbors(n_neighbors=3)
    bad_neighbors_ver3 = NearMiss(ratio=ratio, random_state=RND_SEED,
                                  version=3, return_indices=True,
                                  n_neighbors=valid_nn,
                                  n_neighbors_ver3=invalid_ver3)
    with raises(ValueError, match=expected_message):
        bad_neighbors_ver3.fit_sample(X, Y)
def test_nm3_fit_sample_nn_obj():
    """Fit-sample when both neighbour arguments are NearestNeighbors objects.

    The resampled features, labels and selected indices are compared
    against hard-coded expected arrays.
    """
    # Expected output of the sampler.
    expected_X = np.array([[0.91464286, 1.61369212],
                           [-0.80809175, -1.09917302],
                           [-0.20497017, -0.26630228],
                           [1.17737838, -0.2002118],
                           [-0.60413357, 0.24628718],
                           [0.03142011, 0.12323596],
                           [1.15157493, -1.2981518],
                           [-0.54619583, 1.73009918],
                           [0.99272351, -0.11631728]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_idx = np.array([3, 10, 11, 0, 2, 3, 5, 1, 4])

    # Sampler configured with explicit estimator objects.
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True,
                       n_neighbors=NearestNeighbors(n_neighbors=3),
                       n_neighbors_ver3=NearestNeighbors(n_neighbors=3))
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y),
                             (idx_under, expected_idx)):
        assert_array_equal(actual, expected)
def test_nm3_fit_sample_half():
    """Fit-sample with a fractional ratio (the value used here is 0.7).

    The resampled features and labels are compared against hard-coded
    expected arrays.
    """
    expected_X = np.array([[0.91464286, 1.61369212],
                           [-0.80809175, -1.09917302],
                           [-0.20497017, -0.26630228],
                           [1.17737838, -0.2002118],
                           [-0.60413357, 0.24628718],
                           [0.03142011, 0.12323596],
                           [-0.05903827, 0.10947647],
                           [1.15157493, -1.2981518],
                           [-0.54619583, 1.73009918],
                           [0.99272351, -0.11631728],
                           [0.45713638, 1.31069295]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

    sampler = NearMiss(ratio=.7, random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def under_sample_near_miss(X, y, k_neighbors: int, under_sampling_mode: int, out_file_name: str):
    """Under-sample a 4-D dataset with NearMiss and write a small report.

    ``X`` is flattened to ``(n_samples, -1)`` for the sampler and reshaped
    back to ``(n_resampled, n_x, n_y, n_z)`` afterwards; ``y`` is flattened
    with ``ravel`` and returned with a trailing axis added.

    Parameters
    ----------
    X : array with four dimensions (its shape is unpacked into four values).
    y : label array; counts of ``0`` and ``1`` are reported.
    k_neighbors : forwarded to ``NearMiss`` as ``n_neighbors_ver3``.
    under_sampling_mode : forwarded to ``NearMiss`` as ``version``.
    out_file_name : path of the text report (overwritten).

    Returns
    -------
    (under_X, under_y) : the resampled features and labels.
    """
    print('Undersampling using "Near Miss"-Method.')
    # Unpacking into four names also rejects input that is not 4-D.
    _, n_x, n_y, n_z = X.shape

    # The context manager closes the report even when the sampler raises.
    with open(out_file_name, 'w') as f:
        f.write('Read y==0 count: ' + str(np.count_nonzero(y == 0)) + '\n')
        f.write('Read y==1 count: ' + str(np.count_nonzero(y == 1)) + '\n')
        f.write('Read X Shape: ' + str(X.shape) + '\n')
        f.write('Read y Shape: ' + str(y.shape) + '\n')
        f.write('Undersampling version: ' + str(under_sampling_mode) + '\n')
        f.write('Undersampling k_neighbors: ' + str(k_neighbors) + '\n')

        under_sampler = NearMiss(version=under_sampling_mode,
                                 n_neighbors_ver3=k_neighbors)
        f.write('Undersampler: ' + str(under_sampler) + '\n')

        under_X = X.reshape(X.shape[0], -1)
        under_y = y.ravel()
        # Previously the local ``y`` was deleted and then passed to
        # ``fit_sample``, which raised UnboundLocalError; the flattened
        # labels are what the sampler needs.
        under_X, under_y = under_sampler.fit_sample(under_X, under_y)

        under_X = under_X.reshape(under_X.shape[0], n_x, n_y, n_z)
        under_y = under_y[:, np.newaxis]

        summary = [
            ('Undersampled y==0 count: ', np.count_nonzero(under_y == 0)),
            ('Undersampled y==1 count: ', np.count_nonzero(under_y == 1)),
            ('Undersampled X Shape: ', under_X.shape),
            ('Undersampled y Shape: ', under_y.shape),
        ]
        for label, value in summary:
            print(label + str(value))
        for label, value in summary:
            f.write(label + str(value) + '\n')

    return under_X, under_y
def test_nm3_fit_sample_nn_obj():
    """Fit-sample with NearestNeighbors objects for both neighbour arguments.

    Verifies the resampled features, the labels and the indices of the
    selected samples against hard-coded expected arrays.
    """
    first_stage_nn = NearestNeighbors(n_neighbors=3)
    second_stage_nn = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=first_stage_nn,
        n_neighbors_ver3=second_stage_nn)

    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected = {
        'X': np.array([[0.91464286, 1.61369212],
                       [-0.80809175, -1.09917302],
                       [-0.20497017, -0.26630228],
                       [1.17737838, -0.2002118],
                       [-0.60413357, 0.24628718],
                       [0.03142011, 0.12323596],
                       [1.15157493, -1.2981518],
                       [-0.54619583, 1.73009918],
                       [0.99272351, -0.11631728]]),
        'y': np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        'idx': np.array([3, 10, 11, 0, 2, 3, 5, 1, 4]),
    }
    assert_array_equal(X_resampled, expected['X'])
    assert_array_equal(y_resampled, expected['y'])
    assert_array_equal(idx_under, expected['idx'])
def test_nm2_fit_sample_nn_obj():
    """Fit-sample with ``n_neighbors`` supplied as a NearestNeighbors object.

    The resampled features, labels and selected indices are compared
    against hard-coded expected arrays.
    """
    expected_X = np.array([[0.91464286, 1.61369212],
                           [-0.80809175, -1.09917302],
                           [-0.20497017, -0.26630228],
                           [-0.05903827, 0.10947647],
                           [0.03142011, 0.12323596],
                           [-0.60413357, 0.24628718],
                           [0.50701028, -0.17636928],
                           [0.4960075, 0.86130762],
                           [0.45713638, 1.31069295]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_idx = np.array([3, 10, 11, 2, 8, 5, 9, 1, 6])

    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True,
                       n_neighbors=NearestNeighbors(n_neighbors=3))
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    for actual, wanted in ((X_resampled, expected_X),
                           (y_resampled, expected_y),
                           (idx_under, expected_idx)):
        assert_array_equal(actual, wanted)
def test_nm_fit_sample_auto():
    """Fit-sample with ``sampling_strategy='auto'`` for each NearMiss version.

    ``VERSION_NEARMISS`` is iterated; the result for the version at position
    ``k`` is compared with the ``k``-th expected feature and label arrays.
    """
    # The first two expected feature arrays are identical.
    shared_X = np.array([[0.91464286, 1.61369212],
                         [-0.80809175, -1.09917302],
                         [-0.20497017, -0.26630228],
                         [-0.05903827, 0.10947647],
                         [0.03142011, 0.12323596],
                         [-0.60413357, 0.24628718],
                         [0.50701028, -0.17636928],
                         [0.4960075, 0.86130762],
                         [0.45713638, 1.31069295]])
    last_X = np.array([[0.91464286, 1.61369212],
                       [-0.80809175, -1.09917302],
                       [-0.20497017, -0.26630228],
                       [1.17737838, -0.2002118],
                       [-0.60413357, 0.24628718],
                       [0.03142011, 0.12323596],
                       [1.15157493, -1.2981518],
                       [-0.54619583, 1.73009918],
                       [0.99272351, -0.11631728]])
    X_gt = [shared_X, shared_X.copy(), last_X]
    y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) for _ in range(3)]

    position = 0
    for version in VERSION_NEARMISS:
        sampler = NearMiss(sampling_strategy='auto', version=version)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, X_gt[position])
        assert_array_equal(y_resampled, y_gt[position])
        position += 1
def test_nm1_fit_sample_nn_obj():
    """Fit-sample with ``n_neighbors`` given as a NearestNeighbors estimator.

    Checks features, labels and the indices of the retained samples
    against hard-coded expected arrays.
    """
    neighbors_estimator = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True,
                       n_neighbors=neighbors_estimator)

    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([[0.91464286, 1.61369212],
                           [-0.80809175, -1.09917302],
                           [-0.20497017, -0.26630228],
                           [-0.05903827, 0.10947647],
                           [0.03142011, 0.12323596],
                           [-0.60413357, 0.24628718],
                           [0.50701028, -0.17636928],
                           [0.4960075, 0.86130762],
                           [0.45713638, 1.31069295]])
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]))
    assert_array_equal(idx_under, np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]))
def test_nm_fit_sample_auto_indices():
    """Fit-sample with ``ratio='auto'`` and ``return_indices=True``.

    For each entry of ``VERSION_NEARMISS`` the resampled features, labels
    and selected indices are compared with the expected arrays stored at
    the same position.
    """
    # Expected features: identical for the first two positions.
    shared_X = np.array([[0.91464286, 1.61369212],
                         [-0.80809175, -1.09917302],
                         [-0.20497017, -0.26630228],
                         [-0.05903827, 0.10947647],
                         [0.03142011, 0.12323596],
                         [-0.60413357, 0.24628718],
                         [0.50701028, -0.17636928],
                         [0.4960075, 0.86130762],
                         [0.45713638, 1.31069295]])
    last_X = np.array([[0.91464286, 1.61369212],
                       [-0.80809175, -1.09917302],
                       [-0.20497017, -0.26630228],
                       [1.17737838, -0.2002118],
                       [-0.60413357, 0.24628718],
                       [0.03142011, 0.12323596],
                       [1.15157493, -1.2981518],
                       [-0.54619583, 1.73009918],
                       [0.99272351, -0.11631728]])
    X_gt = [shared_X, shared_X.copy(), last_X]
    y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) for _ in range(3)]
    idx_gt = [
        np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]),
        np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]),
        np.array([3, 10, 11, 0, 5, 8, 14, 4, 12]),
    ]

    position = 0
    for version in VERSION_NEARMISS:
        sampler = NearMiss(ratio='auto', version=version,
                           return_indices=True)
        X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, X_gt[position])
        assert_array_equal(y_resampled, y_gt[position])
        assert_array_equal(idx_under, idx_gt[position])
        position += 1
def main():
    """Search for the lower target threshold that gives the best score.

    For each candidate ``left`` in ``np.arange(3.5, 5.0, 0.02)`` the samples
    are labelled 0 (``left <= y <= 6.1``), 1 (``y < left``) or 2 (otherwise),
    class 0 is under-sampled to 3000 rows with NearMiss version 2, and
    ``score`` is evaluated on the selected rows. The lowest score and its
    threshold are printed.
    """
    train = pd.read_csv("../data/processed/train.csv")
    train.pop("id")
    target = train.pop("血糖")
    # ``as_matrix()`` was removed in pandas 1.0; ``.values`` returns the same
    # ndarray on every pandas version.
    train_x = train.values
    train_y = target.values

    best_score = 10
    best_left = 0
    for left in np.arange(3.5, 5.0, 0.02):
        # Vectorised form of the per-row labelling; anything that is neither
        # below ``left`` nor <= 6.1 (including NaN) falls through to class 2.
        label_Y = np.where(train_y < left, 1.0,
                           np.where(train_y <= 6.1, 0.0, 2.0))

        nm = NearMiss(ratio={0: 3000,
                             1: int(np.count_nonzero(label_Y == 1)),
                             2: int(np.count_nonzero(label_Y == 2))},
                      random_state=42, return_indices=True, version=2,
                      n_neighbors=10)
        X_res, y_res, index = nm.fit_sample(train_x, label_Y)

        new_x = train_x[index]
        new_y = train_y[index]
        s = score(new_x, new_y)

        # NOTE(review): the nesting of the three trailing prints was
        # reconstructed from flattened text — confirm against the original.
        if s < best_score:
            best_score = s
            best_left = left
            print("greater")
            print(best_score, best_left)
        print(best_score, best_left)
    print(best_score, best_left)
def test_nm1_fit_sample_half():
    """Fit-sample with a fractional ratio (the value used here is 0.7).

    The resampled features and labels are compared against hard-coded
    expected arrays.
    """
    sampler = NearMiss(ratio=.7, random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([[0.91464286, 1.61369212],
                           [-0.80809175, -1.09917302],
                           [-0.20497017, -0.26630228],
                           [-0.05903827, 0.10947647],
                           [0.03142011, 0.12323596],
                           [-0.60413357, 0.24628718],
                           [1.17737838, -0.2002118],
                           [0.50701028, -0.17636928],
                           [0.4960075, 0.86130762],
                           [0.45713638, 1.31069295],
                           [0.99272351, -0.11631728]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

    for actual, wanted in ((X_resampled, expected_X),
                           (y_resampled, expected_y)):
        assert_array_equal(actual, wanted)
def under_sampling(self, x_train, y_train):
    """Under-sample the training data with NearMiss (version 1).

    The inputs are stored on ``self.x_train`` / ``self.y_train``, per-class
    counts for labels 0, 1 and 2 are logged before and after resampling,
    and the resampled data replaces those attributes.

    Returns
    -------
    (x_train, y_train) : the resampled features and labels.

    Raises
    ------
    Exception
        Raised after logging whenever any step inside the resampling
        section fails; the original error is attached as ``__cause__``.
    """
    general_log_path = 'Training_Logs/General_Log.txt'

    def log_class_counts(stage):
        # ``stage`` is 'before' or 'after'; the messages match the
        # established log format exactly.
        for label in (0, 1, 2):
            self.logger_object.log(
                self.file_object,
                'x_train label %d shape %s under-sampling :: %s'
                % (label, stage, str(sum(self.y_train == label))))

    with open(general_log_path, 'a+') as file:
        self.logger_object.log(
            file,
            'Entered under_sampling() method of PreProcessor class of data_preprocessing package'
        )

    self.x_train = x_train
    self.y_train = y_train
    try:
        log_class_counts('before')
        nm = NearMiss(version=1)
        self.x_train, self.y_train = nm.fit_sample(self.x_train,
                                                   self.y_train)
        log_class_counts('after')

        with open(general_log_path, 'a+') as file:
            self.logger_object.log(
                file,
                'Successfully Executed under_sampling() method of PreProcessor class of data_preprocessing package'
            )
        # Return the resampled data; the arguments still hold the
        # original, unbalanced set.
        return self.x_train, self.y_train
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in under_sampling() method of the Preprocessor class. Exception message: '
            + str(e))
        self.logger_object.log(
            self.file_object,
            'Under Sampling Unsuccessful. Exited the under_sampling() method of the Preprocessor class'
        )
        # Same exception type as before, now chained to the real cause.
        raise Exception() from e
def sampling(dataset):
    """Split ``dataset`` into features/labels and under-sample with NearMiss.

    Returns the resampled features and labels as a pair.
    """
    features, labels = split_dataset(dataset)
    print("Under Sampling")
    under_sampler = NearMiss(random_state=42)
    resampled_features, resampled_labels = under_sampler.fit_sample(
        features, labels)
    return resampled_features, resampled_labels
def apply_near_miss(df):
    """Under-sample ``df`` with a default NearMiss sampler.

    The column named ``'y'`` is the target; every other column is a
    feature. Returns the resampled features and labels.
    """
    feature_mask = df.columns != 'y'
    target_mask = df.columns == 'y'
    features = df.iloc[:, feature_mask]
    target = df.iloc[:, target_mask]

    sampler = NearMiss()
    # The sampler receives the target as a flat 1-D array.
    flat_target = target.values.ravel()
    features, labels = sampler.fit_sample(features, flat_target)
    return features, labels
def model_subSampling(X_train, y_train, X_test, y_test):
    """Under-sample the training set with NearMiss-1, then run ``model_reg_log``.

    The class distribution is printed before and after resampling.
    Returns ``None``.
    """
    sampler = NearMiss(version=1)

    counts_before = Counter(y_train)
    print('NearMiss - clase minoritaria = 1 (antes) {}'.format(counts_before))

    X_balanced, y_balanced = sampler.fit_sample(X_train, y_train)

    counts_after = Counter(y_balanced)
    print('NearMiss - clase minoritaria = 1 {}'.format(counts_after))

    model_reg_log(X_balanced, y_balanced, X_test, y_test)
def modeloNaiveBayesSS():
    """Train and report a Gaussian Naive Bayes model on a NearMiss-balanced set.

    Steps: load ``dataset2.csv``, pick 50 features with ``SelectKBest``,
    split 70/30, under-sample the training part with NearMiss-1, fit
    ``GaussianNB`` on the selected features, then print train/test
    accuracy, the confusion matrix and the class distributions.
    """
    # Load the dataset stored as CSV.
    dataset = pd.read_csv('dataset2.csv')

    # Dimensionality reduction via feature selection (sklearn SelectKBest).
    X = dataset.drop(['Plag'], axis=1)
    y = dataset['Plag']
    best = SelectKBest(k=50)
    # Only the fitted support mask is needed; the transformed matrix was
    # never used.
    best.fit(X, y)
    selected = best.get_support(indices=True)
    used_features = X.columns[selected]

    # Split the dataset into training and test sets.
    X_train, X_test = train_test_split(dataset, test_size=0.3, random_state=6)
    y_train = X_train["Plag"]
    y_test = X_test["Plag"]

    # Under-sample the training set with the 'auto' strategy.
    # NOTE(review): X_train still contains the 'Plag' target column when it
    # is passed to the sampler — confirm this is intended.
    us = NearMiss(sampling_strategy='auto', version=1, n_neighbors=3,
                  n_neighbors_ver3=3, n_jobs=1)
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    # The sampler may hand back a plain ndarray; restore the column labels
    # so the selected features can be addressed by name either way.
    X_train_res = pd.DataFrame(X_train_res, columns=X_train.columns)
    X_test_res, y_test_res = (X_test, y_test)

    # Gaussian classifier: always pass plain arrays so fit, predict and
    # score see the same kind of input.
    train_features = X_train_res[used_features].values
    test_features = X_test_res[used_features].values
    gnb = GaussianNB()
    gnb.fit(train_features, y_train_res)
    y_pred = gnb.predict(test_features)

    # Accuracy on both sets.
    print('Precisión en el set de Entrenamiento: {:.2f}'.format(
        gnb.score(train_features, y_train_res)))
    print('Precisión en el set de Test: {:.2f}'.format(
        gnb.score(test_features, y_test_res)))

    # Confusion matrix and class distributions.
    print(confusion_matrix(y_test_res, y_pred))
    print("Distribución inicial de entrenamiento{}".format(Counter(y_train)))
    print("Distribución finalde entrenamiento: {}".format(
        Counter(y_train_res)))
    print("Distribución inicial de test {}".format(Counter(y_test)))
    print("Distribución final de test: {}".format(Counter(y_test_res)))
def under_sampling(input_data_path, out_sample_path):
    """Balance a dataset with NearMiss and save it as a single array.

    Data is read with ``load_data``; the resampled features and labels are
    joined column-wise and written with ``np.save`` to ``out_sample_path``.
    The class counts before and after resampling are printed.
    """
    X, Y = load_data(input_data_path)

    sampler = NearMiss()
    X_res, y_res = sampler.fit_sample(X, Y)

    counts_before = Counter(Y)
    print('Original data shape {}'.format(counts_before))
    counts_after = Counter(y_res)
    print('Resampled data shape {}'.format(counts_after))

    # Labels become the last column of the saved matrix.
    np.save(out_sample_path, np.c_[X_res, y_res])
def resample(self, sampling_type):
    """Resample ``self.xtrain`` / ``self.ytrain`` in place.

    ``'over'`` selects ``RandomOverSampler``, ``'under'`` selects
    ``NearMiss``; any other value selects ``SMOTETomek``.
    """
    over_sampler = RandomOverSampler(random_state=10)
    under_sampler = NearMiss()
    combined_sampler = SMOTETomek(random_state=10)

    if sampling_type == 'over':
        chosen = over_sampler
    elif sampling_type == 'under':
        chosen = under_sampler
    else:
        chosen = combined_sampler

    self.xtrain, self.ytrain = chosen.fit_sample(self.xtrain, self.ytrain)
def treinaModelo(dfPopulacao, model):
    """Fit ``model`` on a NearMiss-balanced version of ``dfPopulacao``.

    Features are every column except ``'portifolio'`` and ``'id'``; the
    target is the ``portifolio`` column. After balancing, a stratified
    split keeps 99% of the rows for training. Returns the fitted model.
    """
    sampler = NearMiss()
    features = dfPopulacao.drop(['portifolio', 'id'], axis=1)
    target = dfPopulacao.portifolio
    X, y = sampler.fit_sample(features, target)

    # Only the training portion of the split is used.
    X_train, _, y_train, _ = train_test_split(
        X, y, random_state=1, test_size=0.01, stratify=y)

    model.fit(X_train, y_train)
    return model
def modelEvaluation(train, test, sampling='normal', n_jobs=None):
    """Grid-search a random forest and append the results to a text file.

    Parameters
    ----------
    train, test : DataFrames with a ``'churned'`` target column. Columns of
        ``category`` dtype (as found in ``train``) are label-encoded in
        place in both frames.
    sampling : ``'smote'`` or ``'nearmiss'`` resamples the training data;
        any other value leaves it untouched.
    n_jobs : forwarded to ``GridSearchCV``.

    Returns
    -------
    (best_params, accuracy, f1) for the best estimator on the test data.
    """
    for column in train.select_dtypes(include='category').columns:
        encoder = LabelEncoder()
        # Fit once on the values of both frames so a category receives the
        # same integer code in train and test (re-fitting on test alone
        # could assign different codes to the same category).
        encoder.fit(np.concatenate([np.asarray(train[column]),
                                    np.asarray(test[column])]))
        train[column] = encoder.transform(train[column])
        test[column] = encoder.transform(test[column])

    # ``axis`` is passed by keyword: the positional form is not accepted by
    # pandas 2.x.
    x_train = np.asarray(train.drop('churned', axis=1))
    y_train = np.asarray(train['churned'])
    x_test = np.asarray(test.drop('churned', axis=1))
    y_test = np.asarray(test['churned'])

    grid_values = {
        'n_estimators': [900, 1200, 1500],
        'criterion': ['gini', 'entropy'],
        'max_depth': [7, 8, 9],
        'max_features': [5, 7, 9]
    }
    model = RandomForestClassifier()

    if sampling == 'smote':
        smote = SMOTE()
        x_train, y_train = smote.fit_sample(x_train, y_train)
    if sampling == 'nearmiss':
        nm = NearMiss()
        x_train, y_train = nm.fit_sample(x_train, y_train)

    grid = GridSearchCV(model, param_grid=grid_values, scoring='f1', cv=3,
                        n_jobs=n_jobs)
    grid.fit(x_train, y_train)
    best_params = grid.best_params_
    predictions = grid.predict(x_test)

    # sklearn metrics take (y_true, y_pred).
    rf_accuraccy = accuracy_score(y_test, predictions)
    rf_f1_score = f1_score(y_test, predictions)

    # NOTE(review): ``name`` is not defined in this function; it is
    # presumably a module-level variable — confirm.
    with open('ValidationResults.txt', 'a+') as f:
        f.write('{}\n'.format(datetime.now()))
        f.write(name.upper() + ':\n')
        f.write(sampling.upper() + ':\n')
        f.write(
            'Parameters found: {}\nAccuracy: {}\nF1_Score :{}\n\n\n'.format(
                best_params, rf_accuraccy, rf_f1_score))

    return best_params, rf_accuraccy, rf_f1_score
def load_dataset(data_file):
    """Load a pickled dataset, balance it with NearMiss-1 and split it.

    The pickle must hold ``(X, Y, embed_mat)``. Sequences are padded or
    truncated to ``MAX_SENTENCE_LEN``, the classes are balanced, and an
    80/20 train/dev split is returned together with ``embed_mat``.
    The sum of the original labels is printed.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only use it
    # on trusted data files.
    with open(data_file, "rb") as handle:
        X, Y, embed_mat = pickle.load(handle)

    X = pad_sequences(X, maxlen=MAX_SENTENCE_LEN, truncating="post")
    Y = np.array(Y)

    sampler = NearMiss(random_state=0, version=1)
    x_resampled, y_resampled = sampler.fit_sample(X, Y)

    x_train, x_dev, y_train, y_dev = train_test_split(
        x_resampled, y_resampled, test_size=0.2, random_state=0)

    print("# neg:", sum(Y))
    return x_train, x_dev, y_train, y_dev, embed_mat
def _fix_imbalance(self):
    '''
    Balance class sizes, storing the result in ``self.fX`` / ``self.fy``.

    When the largest class is more than 1.5 times the smallest, NearMiss
    under-sampling (ratio 0.75) is applied first; SMOTETomek is then
    applied to the data. Resampled rows are named ``smp0``, ``smp1``, ...
    '''
    # NOTE(review): the SMOTETomek stage is assumed to run after the
    # if/else (for both branches); nesting was reconstructed — confirm.

    def sample_names(count):
        return [f'smp{i}' for i in range(count)]

    # FIXME find best ratio
    class_sizes = self.y.iloc[:, 0].value_counts()
    imbalance = class_sizes.max() / class_sizes.min()

    if imbalance > 1.5:
        under_sampler = NearMiss(ratio=0.75, random_state=42)
        fX, fy = under_sampler.fit_sample(self.X.values,
                                          self.y.values.ravel())
        names = sample_names(fX.shape[0])
        self.fX = pd.DataFrame(fX, index=names, columns=self.X.columns)
        self.fy = pd.DataFrame(fy, index=names, columns=self.y.columns)
    else:
        self.fX = self.X
        self.fy = self.y

    combined_sampler = SMOTETomek(random_state=42)
    fX, fy = combined_sampler.fit_sample(self.fX.values,
                                         self.fy.values.ravel())
    names = sample_names(fX.shape[0])
    self.fX = pd.DataFrame(fX, index=names, columns=self.X.columns)
    self.fy = pd.DataFrame(fy, index=names, columns=self.y.columns)

    log.info(f'Keeping {fX.shape[0]} samples, {fX.shape[1]} features')
def test_multiclass_fit_sample():
    """Fit-sample on a three-class target built from ``Y``.

    The first 1000 labels are overwritten with class 2; after resampling
    each class must contain 400 samples.
    """
    multiclass_y = Y.copy()
    multiclass_y[0:1000] = 2

    sampler = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 400), (1, 400), (2, 400)):
        assert_equal(class_counts[label], expected_count)
def test_multiclass_fit_sample():
    """Fit-sample on a three-class target built from ``Y``.

    The first 1000 labels are overwritten with class 2; the resampled
    class sizes must be 400, 166 and 144 for classes 0, 1 and 2.
    """
    multiclass_y = Y.copy()
    multiclass_y[0:1000] = 2

    sampler = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 400), (1, 166), (2, 144)):
        assert_equal(class_counts[label], expected_count)
def test_nm2_fit_sample_half():
    """Fit-sample with ratio 0.5, checked against stored ``.npy`` fixtures.

    Expected arrays are loaded from the ``data`` directory next to this
    file (``nm2_x_05.npy`` and ``nm2_y_05.npy``).
    """
    sampler = NearMiss(ratio=.5, random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    fixture_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'data')
    expected_X = np.load(os.path.join(fixture_dir, 'nm2_x_05.npy'))
    expected_y = np.load(os.path.join(fixture_dir, 'nm2_y_05.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_nm2_fit_sample_half():
    """Fit-sample with ratio 0.5 and compare with the saved reference arrays.

    The references (``nm2_x_05.npy`` / ``nm2_y_05.npy``) live in the
    ``data`` directory beside this test module.
    """
    here = os.path.dirname(os.path.abspath(__file__))

    def reference(filename):
        return np.load(os.path.join(here, 'data', filename))

    sampler = NearMiss(ratio=.5, random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, reference('nm2_x_05.npy'))
    assert_array_equal(y_resampled, reference('nm2_y_05.npy'))
def test_nm2_fit_sample_auto_indices():
    """Fit-sample with ``ratio='auto'`` and ``return_indices=True``.

    Features, labels and selected indices are compared with the stored
    fixtures ``nm2_x.npy``, ``nm2_y.npy`` and ``nm2_idx.npy`` from the
    ``data`` directory next to this file.
    """
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    fixture_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'data')
    expected_X = np.load(os.path.join(fixture_dir, 'nm2_x.npy'))
    expected_y = np.load(os.path.join(fixture_dir, 'nm2_y.npy'))
    expected_idx = np.load(os.path.join(fixture_dir, 'nm2_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_nm2_fit_sample_auto_indices():
    """Fit-sample with automatic ratio, also returning the chosen indices.

    All three outputs are checked against reference arrays saved in the
    ``data`` directory beside this test module.
    """
    here = os.path.dirname(os.path.abspath(__file__))

    def reference(filename):
        return np.load(os.path.join(here, 'data', filename))

    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, reference('nm2_x.npy'))
    assert_array_equal(y_resampled, reference('nm2_y.npy'))
    assert_array_equal(idx_under, reference('nm2_idx.npy'))
def test_nm3_fit_sample_auto():
    """Fit-sample with ``ratio='auto'``; compare features and labels.

    The output is checked against hard-coded expected arrays.
    """
    expected_X = np.array([[0.91464286, 1.61369212],
                           [-0.80809175, -1.09917302],
                           [-0.20497017, -0.26630228],
                           [1.17737838, -0.2002118],
                           [-0.60413357, 0.24628718],
                           [0.03142011, 0.12323596],
                           [1.15157493, -1.2981518],
                           [-0.54619583, 1.73009918],
                           [0.99272351, -0.11631728]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_nm1_fit_sample_auto():
    """Fit-sample with ``ratio='auto'``; compare features and labels.

    The output is checked against hard-coded expected arrays.
    """
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([[0.91464286, 1.61369212],
                           [-0.80809175, -1.09917302],
                           [-0.20497017, -0.26630228],
                           [-0.05903827, 0.10947647],
                           [0.03142011, 0.12323596],
                           [-0.60413357, 0.24628718],
                           [0.50701028, -0.17636928],
                           [0.4960075, 0.86130762],
                           [0.45713638, 1.31069295]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

    for actual, wanted in ((X_resampled, expected_X),
                           (y_resampled, expected_y)):
        assert_array_equal(actual, wanted)
actual_predictions_labels,\ loss, accuracy, f1_score, precision, recall, roc_auc=\ test_run(allfeatures_smote_X, np.array(oh_smote_y), test_allfeatures, np.array(y_test_df['oh_label'].tolist())) test_results.append(('SMOTE',(clfmodel, history, cm,\ test_predictions_labels,\ actual_predictions_labels,\ loss, accuracy, f1_score, precision, recall, roc_auc))) print('SMOTE:', precision, roc_auc) # NearMiss test nr = NearMiss() allfeatures_nearmiss_X, nearmiss_y =\ nr.fit_sample(allfeatures_nosmote_X, tosmote_y) oh_nearmiss_y = [] for slabel in nearmiss_y: ohl = np.zeros((len(nosmote_y[0], ))) ohl[slabel] = 1 oh_nearmiss_y.append(ohl) clfmodel, history, cm,\ test_predictions_labels,\ actual_predictions_labels,\ loss, accuracy, f1_score, precision, recall, roc_auc=\ test_run(allfeatures_nearmiss_X, np.array(oh_nearmiss_y), test_allfeatures, np.array(y_test_df['oh_label'].tolist()))
model_vars = ['Customer Address Country', 'Carrier', 'PPU day', 'FHS day', 'FDA day', 'Delivery day', 'PPU-FHS', 'FHS-FDA', 'FDA-Delivery', 'Delayed'] rel_data = df[model_vars] rel_data_encoded = pd.get_dummies(rel_data) # convert categorical vars into numerical.Yields 56 cols # Separate predictor from target variable x = rel_data_encoded.drop(['Delayed'], axis = 1) # predictor vars y = rel_data_encoded['Delayed'] # target variable # Creating training and test sets x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0) # Undersampling with NearMiss print('Under-sampiling over-represented data...') nm = NearMiss('not minority', random_state=42) nm_x_train, nm_y_train = nm.fit_sample(x_train, y_train) df_y_train = pd.DataFrame(data = nm_y_train, columns= ['Delayed']) print('Length of under-sampled data is', len(nm_x_train)) # 4668 print('Number of delayed shipments in oversampled data is', len(nm_y_train[df_y_train['Delayed']==True])) # 2334 print('Number of on time shipments is', len(nm_y_train[df_y_train['Delayed']==False])) # 2334 # Scale data to feed prediction models print('Scaling data...') scaler = StandardScaler().fit(nm_x_train) nm_x_train = scaler.transform(nm_x_train) x_test = scaler.transform(x_test) """ Building prediction models
# Script section: split the data, under-sample the training part with
# NearMiss version 3, fit a random forest and derive metrics from the
# confusion matrix.
# NOTE(review): the extent of the loop body was reconstructed from flattened
# text; everything after ``for i in A:`` is assumed to belong to the loop.
y = data.loc[:, data.columns == 'Class']
print("data size", collections.Counter(y['Class']))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=0)
print("test data size", collections.Counter(y_test['Class']))
print("original training data size", collections.Counter(y_train['Class']))
# The sampler is given plain arrays.
y_train_arr = np.array(y_train['Class'])
X_train_arr = np.array(X_train)
A = [3, 5, 7]
for i in A:
    # NOTE(review): the loop variable ``i`` is never used; n_neighbors_ver3
    # is fixed at 7, so every iteration repeats the same configuration.
    # Presumably ``n_neighbors_ver3=i`` was intended — confirm.
    nm = NearMiss(version=3, random_state=5, n_neighbors_ver3=7)
    X_train_sampled, y_train_sampled = nm.fit_sample(X_train_arr, y_train_arr)
    print("sampled training data size", collections.Counter(y_train_sampled))
    # Random forest
    clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
    clf.fit(X_train_sampled, y_train_sampled)
    y_pred = clf.predict(X_test)
    # The predictions are passed as the first argument and the true labels
    # as the second; the index names below follow that layout.
    cn_mat = confusion_matrix(y_pred, y_test['Class'])
    print(cn_mat)
    TP = cn_mat[1][1]
    TN = cn_mat[0][0]
    FN = cn_mat[0][1]
    FP = cn_mat[1][0]
    pre1 = TP / (TP + FP)  # precision
    recall1 = TP / (TP + FN)  # recall
    NPV1 = TN / (TN + FN)  # negative predictive value
# Random under-sampling with replacement, so the result may contain
# duplicated rows.
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# Shape of the de-duplicated rows (original note: (181, 2), sampling with
# replacement). ``np.unique(..., axis=0)`` replaces ``np.vstack`` over a
# set, which current NumPy rejects because a set is not a sequence.
print(np.unique(X_resampled, axis=0).shape)

# The string below (kept verbatim) describes the three heuristic rules
# selected through NearMiss's ``version`` parameter.
'''
NearMiss函数则添加了一些启发式(heuristic)的规则来选择样本,通过设定version参数来实现三种启发式的规则.
假设正样本是需要下采样的(多数类样本),负样本是少数类的样本.
NearMiss-1:选择离N个近邻的负样本的平均距离最小的正样本;
NearMiss-2:选择离N个负样本最远的平均距离最小的正样本;
NearMiss-3:是一个两段式的算法.首先,对于每一个负样本,保留它们的M个近邻样本;接着,那些到N个近邻样本平均距离最大的正样本将被选择.
'''
from imblearn.under_sampling import NearMiss
nm1 = NearMiss(random_state=0, version=1)
X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# The string below (kept verbatim) describes Tomek links and how the
# ``ratio`` argument controls which samples of a link are removed.
'''
Cleaning under-sampling techniques
omek’s links
TomekLinks:样本x与样本y来自于不同的类别,满足以下条件,它们之间被称之为TomekLinks;
不存在另外一个样本z,使得d(x,z)<d(x,y)或者 d(y,z)<d(x,y)成立.其中d(.)表示两个样本之间的距离,也就是说两个样本之间互为近邻关系.
这个时候,样本x或样本y很有可能是噪声数据,或者两个样本在边界的位置附近.
TomekLinks函数中的auto参数控制Tomek’s links中的哪些样本被剔除.
默认的ratio='auto'移除多数类的样本,当ratio='all'时,两个样本均被移除.
'''
from imblearn.under_sampling import TomekLinks
tl =TomekLinks(random_state=0,ratio='all')
X_resampled, y_resampled = tl.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# Script section: scale features, under-sample with NearMiss, train an
# XGBoost classifier and round-trip it through a pickle file.
scaler = StandardScaler()
# NOTE(review): ``df_f`` is read here but only assigned further down in this
# excerpt; it is presumably defined earlier in the file — confirm.
scaled_features = scaler.fit_transform(df_f.values)
df_f_scaled = pd.DataFrame(scaled_features, columns=df_f.columns)

# ``df_d`` holds every column except the listed ones; dropping ``df_d``'s
# columns from ``df`` therefore keeps exactly the listed columns.
df_d = df.drop(
    ['V3', 'V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18'],
    axis=1)
df_f = df.drop(df_d, axis=1)
df_t = df['Class']

clf1 = XGBClassifier(random_state=42)

# Implementing Undersampling for Handling Imbalanced
nm = NearMiss(sampling_strategy={0: 100000, 1: 492})
X_res, y_res = nm.fit_sample(df_f, df_t)
df3_train, df3_test, df_t_train, df_t_test = train_test_split(
    X_res, y_res, test_size=.3, stratify=y_res, random_state=42)
clf1.fit(df3_train, df_t_train)

# Saving model to disk. The context manager guarantees the file is flushed
# and closed before it is reopened for reading.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf1, model_file)

# Loading model to compare the results
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
print(model.predict(df3_test))
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SVMSMOTE
#from imblearn.over_sampling import KMeansSMOTE

# Script section: read train.csv, build labels, under-sample with NearMiss
# and write the resampled features and labels to two CSV files.
data_=pd.read_csv(r'train.csv')
data1=np.array(data_)
# Drop the first column of the table.
data=data1[:,1:]
[m1,n1]=np.shape(data)
# Labels are built by position: 4242 ones followed by 71809 zeros.
# NOTE(review): these counts are hard-coded and must match the row order and
# row count of train.csv — confirm.
label1=np.ones((4242,1))#Value can be changed int(m1/2)
label2=np.zeros((71809,1))
label=np.append(label1,label2)
shu=scale(data)
X=shu
y=label#.astype('int64')
nm1 = NearMiss(version=2)#version=2,version=3
X_resampled, y_resampled = nm1.fit_sample(X, y)
shu=X_resampled
# The resampled features are scaled a second time before being saved.
X1=scale(shu)
y1=y_resampled
#shu2 =X_resampled
#shu3 =y_resampled
data_csv = pd.DataFrame(data=X1)
data_csv.to_csv('NearMiss_train.csv')
data_csv = pd.DataFrame(data=y1)
data_csv.to_csv('label_NearMiss_train.csv')
# Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Nearmiss 2 nm2 = NearMiss(version=2) X_resampled, y_resampled = nm2.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
# Creamos un índice que luego nos servirá para mergear el dataset 'All_peptides' # con el dataset balanceado que creemos. copy['ID_Sequence'] = range(len(copy)) A = copy.loc[:, ['Label']] B = copy.loc[:, ['ID_Sequence']] # Categorizamos la variable cualitativa 'Label' a numérica # Lable: Toxic = 1 # NonToxic = 0 A = pd.get_dummies(A, columns=["Label"], drop_first=True) # Realizamos el balanceo: from imblearn.under_sampling import NearMiss nr = NearMiss() B_res, A_res = nr.fit_sample(B, A) DBSCAN_subsampling = pd.merge(B_res, A_res, right_index=True, left_index=True) DBSCAN_subsampling = pd.merge(DBSCAN_subsampling, copy, on='ID_Sequence') # Vemos que el balanceo se ha realizado correctamente, ya que tenemos # 703 péptidos de cada clase. DBSCAN_subsampling.groupby('Label').size() # Exportamos el fichero para poder utilizarlo en los demás algoritmos. DBSCAN_subsampling.to_csv("BBDD/DBSCAN_subsampling.txt", index=None) # MODELO SVM del modelo DBSCAN. # Preparación del dataset para la generación de los modelos predictivos.