def smotenc_over_sampler(X_data, y_data, categorical_features_dims):
    """Oversample a training set with SMOTENC and return it as data frames.

    Args:
        X_data (pandas data frame): features to resample.
        y_data (pandas vector): target values aligned with ``X_data``.
        categorical_features_dims (list): forwarded to SMOTENC as its
            ``categorical_features`` argument.

    Returns:
        Tuple ``(X_smotenc, y_smotenc)``: the resampled features and target,
        each wrapped in a pandas DataFrame whose columns come from
        ``features_engineering.features_list`` and
        ``features_engineering.target_label`` respectively.
    """
    # Record which function is running via the project's logging helper.
    log_line = '{0} :: {1}'.format(smotenc_over_sampler.__module__,
                                   smotenc_over_sampler.__name__)
    utils.save_log(log_line)

    # Seed and parallelism are taken from the project configuration.
    sampler = SMOTENC(
        categorical_features=categorical_features_dims,
        random_state=config.random_seed,
        n_jobs=config.num_jobs,
    )
    resampled_X, resampled_y = sampler.fit_resample(X_data, y_data)

    X_smotenc = pandas.DataFrame(
        resampled_X, columns=features_engineering.features_list)
    y_smotenc = pandas.DataFrame(
        resampled_y, columns=[features_engineering.target_label])
    return X_smotenc, y_smotenc
def balance_data(X, y):
    """Split the data 70/30 and oversample only the training part.

    Args:
        X: feature matrix; its first six columns are declared categorical.
        y: target values; cast to ``int64`` before splitting.

    Returns:
        Tuple ``(X_oversample, y_oversample, xtest, ytest)`` where the first
        two items are the SMOTENC-resampled training split and the last two
        are the untouched test split.
    """
    labels = y.astype('int64')
    # Stratify on the labels so both splits keep the class proportions.
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, labels, test_size=0.3, stratify=labels)

    sampler = SMOTENC(list(range(6)))
    X_oversample, y_oversample = sampler.fit_resample(xtrain, ytrain)
    return X_oversample, y_oversample, xtest, ytest
def test_smotenc_samplers_one_label():
    """Fitting SMOTENC on a single-class target must raise ValueError."""
    X, _, categorical_features = data_heterogneous_unordered()
    single_class_target = np.zeros(30)
    sampler = SMOTENC(random_state=0,
                      categorical_features=categorical_features)

    expected_failure = pytest.raises(
        ValueError, match='needs to have more than 1 class')
    with expected_failure:
        sampler.fit(X, single_class_target)
def get_smotenc(X, y, cat_cols):
    """Upsample mixed categorical/numeric data with SMOTENC.

    Args:
        X: DataFrame, feature data that needs to be upsampled
        y: Series, tags corresponding to the given features
        cat_cols: collection of column names in X to treat as categorical

    Returns:
        us_X: DataFrame, upsampled feature data with X's column names and
            X's original per-column dtypes
        us_y: Series, upsampled target data
    """
    # Remember the input dtypes so they can be re-applied after resampling.
    original_dtypes = X.dtypes

    # SMOTENC is given positional indexes, so translate names to positions.
    categorical_positions = []
    for position, column in enumerate(X):
        if column in cat_cols:
            categorical_positions.append(position)

    sampler = SMOTENC(categorical_positions,
                      random_state=44,
                      n_jobs=-1,
                      k_neighbors=3)
    resampled_X, resampled_y = sampler.fit_resample(X, y)

    # Wrap the resampled output back into pandas containers.
    us_X = pd.DataFrame(resampled_X, columns=X.columns)
    us_y = pd.Series(resampled_y)

    # Cast every column back to the dtype it had in the input frame.
    for column in us_X:
        us_X[column] = us_X[column].astype(original_dtypes[column])
    return us_X, us_y
def train_catboost_model(X, y, seed=0, iterations=100, verbose=False,
                         upsample=True, eval_data=None, eval_labels=None):
    """Train a CatBoost classifier behind a StandardScaler and return both.

    Args:
        X: DataFrame of training features; columns with dtype ``int8`` are
            treated as categorical when upsampling.
        y: training labels.
        seed: random seed used for SMOTENC and CatBoost.
        iterations: number of CatBoost iterations.
        verbose: when true, progress messages are printed.
        upsample: when true, the training data is first resampled with
            SMOTENC.
        eval_data: optional evaluation features, in the same unscaled form
            as ``X``; used as CatBoost's ``eval_set``.
        eval_labels: labels for ``eval_data``.

    Returns:
        A ``Pipeline`` made of the fitted scaler followed by the fitted model.
    """

    def _log(*parts):
        # Progress output is opt-in through ``verbose``.
        if verbose:
            print(*parts)

    if upsample:
        _log('upsampling...')
        categorical_features = [
            i for i, col in enumerate(X.columns) if X[col].dtype == 'int8'
        ]
        smote = SMOTENC(random_state=seed,
                        categorical_features=categorical_features)
        X, y = smote.fit_resample(X, y)

    _log('scaling...')
    scaling = StandardScaler()
    X = scaling.fit_transform(X)

    _log('fitting...')
    _log('iterations:', iterations)
    model = CatBoostClassifier(
        random_state=seed,
        iterations=iterations,
        cat_features=None,
        custom_metric=['Logloss', 'AUC:hints=skip_train~false'])

    # For early stopping
    if eval_data is not None:
        # The model is fitted on scaled features, so the evaluation set has
        # to go through the same fitted scaler; otherwise its metrics are
        # computed on inputs the model never sees at prediction time.
        eval_dataset = Pool(scaling.transform(eval_data), eval_labels)
        model.fit(X, y, eval_set=eval_dataset)
    else:
        model.fit(X, y)

    _log('chaining pipeline...')
    pipe = Pipeline([('scaling', scaling), ('model', model)])
    _log('done.')
    return pipe
def DataAugmentation(data, labels, balance=False):
    """Optionally rebalance the labelled rows of ``data`` with SMOTENC.

    Rows whose label is NaN are set aside before resampling and appended,
    unchanged, after the resampled labelled rows.

    Args:
        data: 2-D array indexed as ``data[:, column]``.
        labels: array castable to ``float32``; NaN marks an unlabelled row.
        balance: when true, and more than two labelled rows exist, the
            labelled rows are oversampled.

    Returns:
        Tuple ``(data, labels)``. When no resampling takes place ``data`` is
        returned as given and ``labels`` is only cast to ``float32``.
    """
    # Boolean flag per column, plus the positions of the categorical ones.
    categorical_features = [
        is_categorical(data[:, inx]) for inx in range(data.shape[1])
    ]
    categorical_features_index = np.where(categorical_features)[0]

    labels = labels.astype('float32')
    na_inx = np.isnan(labels)
    labelled = np.logical_not(na_inx)
    data_na, labels_na = data[na_inx], labels[na_inx]
    data1, labels1 = data[labelled], labels[labelled]

    # The original condition was ``len(labels1 > 2)``, which only tested for
    # a non-empty array; the intended check is on the number of labelled rows.
    if not balance or len(labels1) <= 2:
        return data, labels

    # Replace missing feature values with 0 before resampling.
    data1 = np.nan_to_num(data1, copy=False)
    data1 = pd.DataFrame(data1)
    data1 = data1.fillna(0)

    # Encode categorical columns as integers, keeping each mapping so the
    # encoding can be reversed afterwards.
    mappeds = []
    for ii in categorical_features_index:
        data1[ii], mapped = cat2int(data1[ii])
        mappeds.append(mapped)

    sm = SMOTENC(random_state=42, categorical_features=categorical_features)
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name.
    data1, labels1 = sm.fit_resample(data1, labels1)

    # Decode the categorical columns back to their original values.
    data1 = pd.DataFrame(data1)
    for mapped, ii in zip(mappeds, categorical_features_index):
        data1[ii] = int2cat(data1[ii], mapped)
    data1 = data1.values

    data = np.concatenate([data1, data_na], 0)
    labels = np.concatenate([labels1, labels_na], 0)
    return data, labels
def test_smotenc_fit():
    """After resampling, the sampler exposes ``sampling_strategy_``."""
    X, y, categorical_features = data_heterogneous_unordered()
    sampler = SMOTENC(random_state=0,
                      categorical_features=categorical_features)
    sampler.fit_resample(X, y)

    is_fitted = hasattr(sampler, 'sampling_strategy_')
    assert is_fitted, "No fitted attribute sampling_strategy_"
def train_rf_model(X, y, seed=0, n_estimators=100, verbose=False,
                   upsample=True):
    """Train a random forest behind a StandardScaler and return both.

    Args:
        X: DataFrame of training features; columns with dtype ``int8`` are
            treated as categorical when upsampling.
        y: training labels.
        seed: random seed used for SMOTENC and the forest.
        n_estimators: number of trees.
        verbose: when true, progress messages are printed.
        upsample: when true, the training data is first resampled with
            SMOTENC.

    Returns:
        A ``Pipeline`` made of the fitted scaler followed by the fitted model.
    """
    if upsample:
        if verbose:
            print('upsampling...')
        int8_positions = []
        for position, column in enumerate(X.columns):
            if X[column].dtype == 'int8':
                int8_positions.append(position)
        sampler = SMOTENC(random_state=seed,
                          categorical_features=int8_positions)
        X, y = sampler.fit_resample(X, y)

    if verbose:
        print('scaling...')
    scaling = StandardScaler()
    X = scaling.fit_transform(X)

    if verbose:
        print('fitting...')
        print('n_estimators:', n_estimators)
    model = RandomForestClassifier(random_state=seed,
                                   n_estimators=n_estimators)
    model.fit(X, y)

    if verbose:
        print('chaining pipeline...')
    pipe = Pipeline([('scaling', scaling), ('model', model)])
    if verbose:
        print('done.')
    return pipe
def resample_vals(x_train, y_train):
    """
    Even out the training target labels with SMOTENC before a supervised
    classifier is fitted. SMOTENC is used because several feature columns
    are categorical.

    Parameters:
        x_train: The training dataset features (its ``columns`` are reused).
        y_train: The training dataset targets (its ``columns`` are reused).

    Returns:
        x_train_new: The resampled training dataset features, as a DataFrame.
        y_train_new: The resampled training dataset targets, as a DataFrame.
    """
    # Categorical columns: positions 0, 2, 4 and 10 through 17.
    categorical_positions = [0, 2, 4] + list(range(10, 18))

    # Every class except the majority one is oversampled.
    sampler = SMOTENC(categorical_features=categorical_positions,
                      random_state=42,
                      sampling_strategy='not majority')
    resampled_x, resampled_y = sampler.fit_resample(x_train, y_train)

    feature_names = list(x_train.columns)
    x_train_new = pd.DataFrame(resampled_x, columns=feature_names)
    target_names = list(y_train.columns)
    y_train_new = pd.DataFrame(resampled_y, columns=target_names)
    return x_train_new, y_train_new
def split(self, resample=False, index=None):
    """Select a train/test split and optionally rebalance its training part.

    Args:
        resample: when true, the selected training data is resampled with
            SMOTENC.
        index: key into ``self.splits``. When ``None`` the whole training set
            (``self.X_ttrain`` / ``self.y_train``) is used and, if
            ``resample`` is true, overwritten with its resampled version.

    Returns:
        ``self``, to allow chaining.
    """
    self.sm = SMOTENC(categorical_features=self.categorical_features,
                      sampling_strategy='auto',
                      random_state=self.random_state,
                      k_neighbors=5,
                      n_jobs=1)

    if index is not None:
        # Split first, rebalance second: the test split is deliberately left
        # untouched so performance is evaluated on the unbalanced data.
        train_rows, test_rows = self.splits[index]
        self.X_tttrain, self.y_ttrain = (self.X_ttrain.iloc[train_rows],
                                         self.y_train.iloc[train_rows])
        self.X_tttest, self.y_ttest = (self.X_ttrain.iloc[test_rows],
                                       self.y_train.iloc[test_rows])
        if resample:
            # Rebalance the training split only.
            self.X_tttrain, self.y_ttrain = self.sm.fit_resample(
                self.X_tttrain, self.y_ttrain)
            # Wrap the result back into pandas containers.
            self.X_tttrain = pd.DataFrame(self.X_tttrain,
                                          columns=self.columns)
            self.y_ttrain = pd.Series(self.y_ttrain)
    elif resample:
        # No index given: rebalance the whole training set in place.
        self.X_ttrain, self.y_train = self.sm.fit_resample(
            self.X_ttrain, self.y_train)

    self.resampled = resample
    return self
def SMOTE_cat(DFmain):
    """Run SMOTENC on a training split, print a summary and ask what's next.

    Args:
        DFmain: the data set; passed to ``reshape_data`` and, if the user
            picks the pre-processing menu, to ``showPreProcessMenu``.

    Returns:
        ``STATE_MAIN`` for choice ``a``, ``STATE_PREPROCESS`` for choice
        ``b``, and ``None`` for any other input (previously that case raised
        ``UnboundLocalError`` because ``state`` was never assigned).
    """
    data = DFmain
    X, y = reshape_data(DFmain)
    # Only the training split is used here; the test split is discarded.
    X_train, _, y_train, _ = splitData(X, y, test_size=.33)

    sm = SMOTENC(categorical_features=[1, 2, 3, 4, 5, 6, 7, 8, 9, 14],
                 random_state=1,
                 sampling_strategy='minority')
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name.
    X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train.ravel())

    print("Before SMOTE, counts of label 'yes': {}".format(sum(y_train == 'yes')))
    print("After SMOTE, the shape of X_train: ", X_train_smote.shape)
    print("After SMOTE, the shape of y_train: ", y_train_smote.shape)
    print("After SMOTE, counts of Class attr 'Yes': ", sum(y_train_smote == 'yes'))
    print("After SMOTE, counts of Class attr 'No': ", sum(y_train_smote == 'no'))

    print('\n\na) Go back to main menu')
    print('b) Go back to pre-processing menu')
    print('q) Quit')
    choice = input('What would you like to do next: ').lower()

    # Default covers 'q' and any unrecognised input.
    state = None
    if choice == 'a':
        state = STATE_MAIN
    elif choice == 'b':
        state = STATE_PREPROCESS
        showPreProcessMenu(state, data)
    return state
def _smote_data(self):
    """Oversample ``self.X_train`` / ``self.y_train`` in place.

    SMOTENC is used when ``self.cols_nominal`` is non-empty, with a boolean
    mask marking which training columns are nominal; otherwise plain SMOTE is
    used. In both cases every class except the majority one is resampled.
    """
    if self.cols_nominal.size > 0:
        # Boolean mask over the training columns: True where nominal.
        cats = self.X_train.columns.isin(self.cols_nominal)
        sm = SMOTENC(categorical_features=cats,
                     sampling_strategy='not majority',
                     random_state=self.random_state)
    else:
        sm = SMOTE(sampling_strategy='not majority',
                   random_state=self.random_state)
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name.
    self.X_train, self.y_train = sm.fit_resample(self.X_train, self.y_train)
def SMOTENC_augmentation_cat(self, X_train_selected, y_train, cat_indexes):
    """Resample the given training data with SMOTENC.

    Args:
        X_train_selected: training features.
        y_train: training labels.
        cat_indexes: forwarded to SMOTENC as ``categorical_features``.

    Returns:
        Tuple ``(X_train_selected_aug, y_train_aug)`` with the resampled
        features and labels.
    """
    sampler = SMOTENC(categorical_features=cat_indexes, random_state=41)
    resampled = sampler.fit_resample(X_train_selected, y_train)
    X_train_selected_aug, y_train_aug = resampled
    return X_train_selected_aug, y_train_aug
def process_training_data(X, y_orig, do_simple_duplicate=False, do_smote=True,
                          max_first_feature=0, do_one_hot=True,
                          with_category=False):
    """Balance, shuffle, optionally one-hot encode and normalise the data.

    Args:
        X: 2-D feature array.
        y_orig: 2-D label array (flattened before SMOTE; the duplication
            branch reads its last column).
        do_simple_duplicate: when ``do_smote`` is false, balance classes by
            appending randomly chosen duplicate rows instead.
        do_smote: balance classes with SMOTE (or SMOTENC, see
            ``with_category``).
        max_first_feature: number of categories of the first feature; ``0``
            means "take the maximum found in ``X[:, 0]``".
        do_one_hot: replace the first feature by a one-hot encoding; its
            values are used as 1-based category ids.
        with_category: with ``do_smote``, use SMOTENC treating column 0 as
            categorical.

    Returns:
        Tuple ``(new_x, y)``: the shuffled features, centred on the column
        mean and divided by the column range, and the integer labels as a
        column vector.
    """
    if do_smote:
        if with_category:
            sm = SMOTENC(categorical_features=[0], random_state=42)
        else:
            sm = SMOTE(random_state=42)
        X, y_orig = sm.fit_resample(X, y_orig.reshape(-1))
    elif do_simple_duplicate:
        # Classes ordered from most to least frequent.
        ranked = Counter(y_orig[:, -1]).most_common()
        majority_count = ranked[0][1]
        dup_x = np.zeros((0, X.shape[1]))
        dup_y = np.zeros((0, y_orig.shape[1]))
        for class_, count in ranked:
            # Number of rows this class is short of the majority class.
            missing = majority_count - count
            idx_c = np.argwhere(y_orig == class_)[:, 0].reshape(-1)
            idx_c = np.random.permutation(idx_c)
            if idx_c.shape[0] >= missing:
                idx_c = idx_c[:missing]
            else:
                # Fewer rows than needed: add an extra partial pass first.
                idx_c_ = idx_c[:(missing - idx_c.shape[0])]
                dup_x = np.vstack((dup_x, X[idx_c_, :]))
                dup_y = np.vstack((dup_y, y_orig[idx_c_, :]))
            dup_x = np.vstack((dup_x, X[idx_c, :]))
            dup_y = np.vstack((dup_y, y_orig[idx_c, :]))
        X = np.vstack((X, dup_x))
        y_orig = np.vstack((y_orig, dup_y))

    # Shuffle rows, keeping features and labels aligned.
    idx = np.random.permutation(X.shape[0])
    X = X[idx]
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the same
    # dtype.
    y_orig = y_orig[idx].astype(int)

    if do_one_hot:
        if max_first_feature == 0:
            max_first_feature = np.max(X[:, 0]).astype(int)
        one_hot_first_feature = np.eye(max_first_feature)[
            X[:, 0].reshape(-1).astype(int) - 1]
        X = np.hstack((one_hot_first_feature, X[:, 1:]))

    # Centre on the mean and scale by the range; columns whose range
    # truncates to 0 are divided by 1 to avoid division by zero.
    max_minus_min = np.max(X, axis=0) - np.min(X, axis=0)
    idx = np.argwhere(max_minus_min.astype(int) == 0)
    if idx.shape[0] != 0:
        max_minus_min[idx[:, 0]] = 1
    X = (X - np.mean(X, axis=0)) / max_minus_min

    # The original rebuilt a one-hot/feature stack here and then immediately
    # overwrote it with X, so the normalised X is what has always been
    # returned.
    new_x = X
    return new_x, y_orig.reshape(-1, 1)
def test_smotenc_fit_resample():
    """Every class ends up at least as large as the original majority."""
    X, y, categorical_features = data_heterogneous_unordered()
    majority_count = max(Counter(y).values())

    sampler = SMOTENC(random_state=0,
                      categorical_features=categorical_features)
    y_res = sampler.fit_resample(X, y)[1]

    resampled_counts = Counter(y_res)
    assert all(count >= majority_count
               for count in resampled_counts.values())
def oversampling_cat(X_train, y_train, categorical_features_indices):
    """Resample the training data with SMOTENC and print the class counts.

    Args:
        X_train: training features.
        y_train: training labels.
        categorical_features_indices: forwarded to SMOTENC as
            ``categorical_features``.

    Returns:
        Tuple ``(X_train_res, y_train_res)`` with the resampled data.
    """
    sampler = SMOTENC(categorical_features=categorical_features_indices,
                      random_state=42)
    resampled = sampler.fit_resample(X_train, y_train)
    X_train_res, y_train_res = resampled

    class_counts = Counter(y_train_res)
    print('Resampled dataset shape %s' % class_counts)
    return X_train_res, y_train_res
def test_smotenc_fit_resample_sampling_strategy():
    """A class missing from a dict ``sampling_strategy`` keeps its count."""
    X, y, categorical_features = data_heterogneous_unordered_multiclass()
    untouched_count = Counter(y)[1]

    sampler = SMOTENC(random_state=0,
                      categorical_features=categorical_features)
    # Only classes 2 and 0 are given a target size; class 1 is left out.
    sampler.set_params(sampling_strategy={2: 25, 0: 25})
    _, y_res = sampler.fit_resample(X, y)

    assert Counter(y_res)[1] == untouched_count
def test_heterogeneous_smote_k_custom_nn(heterogeneous_data):
    """SMOTENC accepts a custom nearest-neighbours object as k_neighbors."""
    X, y, categorical_features = heterogeneous_data
    neighbours = _CustomNearestNeighbors(n_neighbors=5)
    sampler = SMOTENC(categorical_features, k_neighbors=neighbours)

    X_res, y_res = sampler.fit_resample(X, y)

    expected_counts = {0: 20, 1: 20}
    assert X_res.shape == (40, 4)
    assert Counter(y_res) == expected_counts
def oversample(dataframe: pd.DataFrame, cat_feats):
    """Rebalance ``dataframe`` on its ``Reservation_Status`` column.

    Args:
        dataframe: data containing a ``Reservation_Status`` target column.
        cat_feats: forwarded to SMOTENC as ``categorical_features``; applies
            to the frame with the target column removed.

    Returns:
        A deep copy of the resampled features with the resampled
        ``Reservation_Status`` column attached.
    """
    features = dataframe.drop("Reservation_Status", axis="columns")
    target = dataframe.loc[:, "Reservation_Status"]

    sampler = SMOTENC(random_state=42, categorical_features=cat_feats)
    X_res, y_res = sampler.fit_resample(features, target)

    out_df = X_res.copy(deep=True)
    out_df["Reservation_Status"] = y_res
    return out_df
def main():
    """Train, evaluate and persist a tuned logistic regression churn model.

    Reads the processed data set, oversamples the training split with
    SMOTENC, grid-searches a logistic regression, then writes the metrics
    report and the fitted search object under ``../../models/`` and plots the
    report.
    """
    logger = logging.getLogger(__name__)
    processed_df = pd.read_csv('../../data/processed/processed.csv')

    id_col = ['customerID']
    target_col = ["Churn"]
    cols = [i for i in processed_df.columns if i not in id_col + target_col]

    # Columns with exactly two distinct values are treated as categorical.
    unique_counts = processed_df.nunique()
    cate_cols = unique_counts[unique_counts == 2].keys().tolist()
    cate_cols = [col for col in cate_cols if col not in target_col]

    smote_X = processed_df[cols]
    smote_Y = processed_df[target_col]
    # SMOTENC is fitted on smote_X, so the categorical positions must be
    # positions within smote_X. Looking them up in processed_df (as before)
    # shifts them whenever an id/target column precedes a feature.
    cate_cols_idx = [
        smote_X.columns.get_loc(col) for col in cate_cols if col in cols
    ]

    smote_train_X, smote_test_X, smote_train_Y, smote_test_Y = train_test_split(
        smote_X, smote_Y, test_size=.25, random_state=111)

    logger.info('Applying SMOTE')
    # Named ``oversampler`` rather than ``os`` to avoid shadowing the stdlib
    # module name.
    oversampler = SMOTENC(categorical_features=cate_cols_idx,
                          sampling_strategy='minority',
                          random_state=0)
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name.
    os_smote_X, os_smote_Y = oversampler.fit_resample(smote_train_X,
                                                      smote_train_Y)
    os_smote_X = pd.DataFrame(data=os_smote_X, columns=cols)
    os_smote_Y = pd.DataFrame(data=os_smote_Y, columns=target_col)

    logger.info('Fitting Logistic Regression and Tuning')
    lr = LogisticRegression(max_iter=500)
    clf = GridSearchCV(estimator=lr, param_grid=LogisticRegression_grid, cv=5)
    best_model = clf.fit(os_smote_X.values, os_smote_Y.values.ravel())
    logger.info(f'Best Parameters: {best_model.best_params_}')

    # Evaluation uses the untouched (not oversampled) test split.
    metrics = create_report(best_model, smote_test_X, smote_test_Y)
    logger.info(f'{metrics}')

    # Context manager guarantees the report file is closed even on error.
    with open('../../models/logistigregression_best_metrics.txt', 'w') as f:
        f.write(metrics)
    joblib.dump(best_model, '../../models/logsticreg_best.pkl', compress=9)
    logger.info('Model and Evaluation saved to "models/"')

    logger.info('Visualising metrics')
    plot_report(processed_df=processed_df,
                algorithm=best_model.best_estimator_,
                test_X=smote_test_X,
                test_Y=smote_test_Y,
                cf='coefficients',
                name='Logistic Regression')
    logger.info('DOWNLOAD PLOT FROM PLOTLY')
    return
def test_smotenc_fit_resample():
    """Resampling brings every class up to at least the majority size."""
    X, y, categorical_features = data_heterogneous_unordered()
    original_counts = Counter(y)
    largest_class = max(original_counts.values())

    sampler = SMOTENC(categorical_features=categorical_features,
                      random_state=0)
    _, y_res = sampler.fit_resample(X, y)

    for class_size in Counter(y_res).values():
        assert class_size >= largest_class
def _smote_data(self):
    """Performs a SMOTE upsampling of the training data, in place.

    If nominal columns are present (``self.cols_nominal`` is non-empty),
    SMOTENC is used with a boolean mask marking the nominal columns of
    ``self.X_train``; otherwise plain SMOTE is used. Every class except the
    majority one is resampled, and the result replaces ``self.X_train`` and
    ``self.y_train``.
    """
    if self.cols_nominal.size > 0:
        # Boolean mask over the training columns: True where nominal.
        cats = self.X_train.columns.isin(self.cols_nominal)
        sm = SMOTENC(categorical_features=cats,
                     sampling_strategy='not majority',
                     random_state=self.random_state)
    else:
        sm = SMOTE(sampling_strategy='not majority',
                   random_state=self.random_state)
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name.
    self.X_train, self.y_train = sm.fit_resample(self.X_train, self.y_train)
def smotenc_generater(sampling_strategy=None):
    """Build SMOTENC-resampled train and plain test CSV files.

    Reads ``/tmp/data/small_train.csv``, label-encodes the sparse features,
    min-max scales the dense features, splits 80/20, resamples the training
    part with SMOTENC and writes both parts under ``/tmp/data/``.

    Args:
        sampling_strategy: optional value forwarded to SMOTENC; when falsy,
            SMOTENC's own default is used.
    """
    data = pd.read_csv('/tmp/data/small_train.csv')
    # The raw 'hour' column is kept as 'time'; 'hour' becomes the substring
    # of 'time' from position 6 onwards.
    data.rename(columns={'hour': 'time'}, inplace=True)
    data['time'] = data['time'].astype('str')
    data['hour'] = data['time'].str[6:]

    # 1. Label-encode the sparse features, min-max scale the dense ones.
    data[sparse_features] = data[sparse_features].fillna('-1', )
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    for feat in dense_features:
        minmax = MinMaxScaler()
        data[feat] = minmax.fit_transform(data[feat].values.reshape(-1, 1))

    # 1.1 smotenc: positions, within sparse_features, of the categorical
    # features (in the order categorical_features lists them).
    categorical_list = []
    for cf in categorical_features:
        for num, sf in enumerate(sparse_features):
            if cf == sf:
                categorical_list.append(num)

    train, test = train_test_split(data, test_size=0.2, random_state=2020)

    # ``pd.np`` was removed in pandas 2.0; take the array from the frame.
    X_train = train[sparse_features].values
    Y_train = list(train['click'])

    if sampling_strategy:
        print("This smotenc generater using " + str(sampling_strategy))
        smote_nc = SMOTENC(categorical_features=categorical_list,
                           random_state=0,
                           sampling_strategy=sampling_strategy)
    else:
        smote_nc = SMOTENC(categorical_features=categorical_list,
                           random_state=0)
    X_smotenc, Y_smotenc = smote_nc.fit_resample(X_train, Y_train)

    # Reassemble features and target into one frame.
    train = pd.DataFrame(X_smotenc, columns=sparse_features)
    train = pd.concat(
        [train, pd.DataFrame(Y_smotenc, columns=['click'])], axis=1)
    for i in categorical_features:
        train[i] = train[i].astype(int)

    print("writing trian file ...")
    train.to_csv('/tmp/data/mayi_smotenc_train_03.csv', index=False)
    print("trian file write done")
    print("writing test file ...")
    test.to_csv('/tmp/data/mayi_smotenc_test_03.csv', index=False)
    print("test file write done")
def test_smotenc_pandas():
    """DataFrame input and array input give the same resampled data."""
    pd = pytest.importorskip("pandas")
    X, y, categorical_features = data_heterogneous_unordered_multiclass()
    frame = pd.DataFrame(X)
    sampler = SMOTENC(random_state=0,
                      categorical_features=categorical_features)

    # Resample once from the DataFrame and once from the raw array.
    X_res_pd, y_res_pd = sampler.fit_resample(frame, y)
    X_res, y_res = sampler.fit_resample(X, y)

    assert_array_equal(X_res_pd.to_numpy(), X_res)
    assert_allclose(y_res_pd, y_res)
def test_smotenc_preserve_dtype():
    """Resampling keeps the (non-default) dtypes of X and y."""
    X, y = make_classification(
        n_samples=50,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    # Use dtypes other than the defaults so a silent upcast would show.
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    sampler = SMOTENC(random_state=0, categorical_features=[1])
    X_res, y_res = sampler.fit_resample(X, y)

    assert X.dtype == X_res.dtype, "X dtype is not preserved"
    assert y.dtype == y_res.dtype, "y dtype is not preserved"
def validation_norm_pipeline(model, cv, X_train, y_train):
    """Cross-validate ``model`` with per-fold SMOTENC and min-max scaling.

    For every fold the training part is oversampled, a MinMaxScaler is fitted
    on the oversampled training part and applied to both parts, the model is
    fitted, and balanced accuracy is computed on the validation part.

    Args:
        model: estimator exposing ``fit`` (returning the fitted estimator).
        cv: splitter exposing ``split(X, y)``.
        X_train: feature array, indexable by arrays of row indices.
        y_train: label array, indexable by arrays of row indices.

    Returns:
        Mean balanced accuracy over the folds actually produced by ``cv``
        (``0.0`` if it produces none). The previous implementation always
        divided by 10, which was only correct for exactly ten folds.
    """
    # Categorical columns: 2, 4 and 7 through 44.
    categorical_features = [2, 4] + list(range(7, 45))
    oversample = SMOTENC(categorical_features=categorical_features,
                         random_state=42)

    cv_scores = []
    folds = cv.split(X_train, y_train)
    for fold_number, (train_index_fold, validation_index_fold) in enumerate(
            folds, start=1):
        # Training data
        X_train_fold = X_train[train_index_fold]
        y_train_fold = y_train[train_index_fold]
        # Validation data
        X_validation_fold = X_train[validation_index_fold]
        y_validation_fold = y_train[validation_index_fold]

        # Oversample the training part only.
        X_upsampled_train_fold, y_upsampled_train_fold = (
            oversample.fit_resample(X_train_fold, y_train_fold))

        # Fit the scaler on the training part, then apply it to both parts.
        scaler = MinMaxScaler()
        scaler.fit(X_upsampled_train_fold)
        X_upsampled_train_fold = scaler.transform(X_upsampled_train_fold)
        X_validation_fold = scaler.transform(X_validation_fold)

        clf = model.fit(X_upsampled_train_fold, y_upsampled_train_fold)
        score = balanced_accuracy_score(y_validation_fold,
                                        clf.predict(X_validation_fold))
        print("Fold " + str(fold_number) + " accuracy: " + str(score))
        cv_scores.append(score)

    if not cv_scores:
        return 0.0
    return sum(cv_scores) / len(cv_scores)
def convert_and_save(self, processed_datapath):
    """Save the converted data to a file.

    Reads ``self.orig_file`` as CSV, fills empty values, converts everything
    to numeric, oversamples with SMOTENC (first column is the label, the
    remaining columns are the inputs), converts the result with
    ``self.convert_dataset``, shuffles it and saves it with ``np.save``.

    Args:
        processed_datapath: output path for the data array; the column names
            are saved next to it with ``.npy`` replaced by
            ``_columnnames.npy``.
    """
    datapath = self.orig_file
    # Context manager so the config file handle is closed right away.
    with open(f"{CONFIGPATH}/column_list.json") as column_file:
        column_type = json.load(column_file)
    df = pd.read_csv(datapath, sep=",", dtype=column_type)

    df = self.fillempty(df, column_type)
    df = df.apply(pd.to_numeric)
    print(f"dfiloc {df.iloc[1, 1:]}")
    print(f"type {df.iloc[1, 2]}")
    print(f"type {type(df.iloc[1, 2])}")

    # smote
    from imblearn.over_sampling import SMOTENC
    # Feature positions 0..234 are declared categorical.
    categorial_list = list(range(235))
    smote = SMOTENC(random_state=42, categorical_features=categorial_list)
    train_input, train_label = smote.fit_resample(df.iloc[:, 1:],
                                                  df.iloc[:, :1])
    print(type(train_label))
    print(type(train_input))
    # Give the labels a trailing axis so they can be concatenated as the
    # first column.
    train_label = np.expand_dims(train_label, axis=-1)
    print(f'train_input.shape is {train_input.shape}')
    print(f'train_label.shape is {train_label.shape}')

    np_smote = np.concatenate([train_label, train_input], axis=-1)
    df_smote = pd.DataFrame(np_smote)
    print(f"df.iloc[0,:] is {df.iloc[0,:]}")
    print(f"df.columns is {df.columns}")
    print(f"df_smote.columns is {df_smote.columns}")
    # Restore the original column names on the resampled frame.
    df_smote.columns = df.columns
    print(f"df_smote.columns is {df_smote.columns}")
    # end of smote

    df, column_names = self.convert_dataset(df_smote, column_type)
    np.random.shuffle(df)
    np.save(processed_datapath, df)
    np.save(processed_datapath.replace(".npy", "_columnnames.npy"),
            column_names)
    print("Saved at %s" % processed_datapath)
def test_smotenc_check_target_type():
    """Continuous and multi-column targets are rejected with ValueError."""
    X, _, categorical_features = data_heterogneous_unordered()
    sampler = SMOTENC(random_state=0,
                      categorical_features=categorical_features)

    # A continuous target.
    continuous_y = np.linspace(0, 1, 30)
    with pytest.raises(ValueError, match="Unknown label type: 'continuous'"):
        sampler.fit_resample(X, continuous_y)

    # A 2-D integer target with three columns.
    rng = np.random.RandomState(42)
    multi_column_y = rng.randint(2, size=(20, 3))
    with pytest.raises(ValueError, match="'y' should encode the multiclass"):
        sampler.fit_resample(X, multi_column_y)
def smote(X, y):
    """Oversample ``X`` together with a continuous target ``y``.

    ``y`` is thresholded at 0.5 to get the class labels that drive SMOTENC,
    while the original ``y`` values travel along as an extra last column of
    the feature matrix so they are resampled too.

    Args:
        X: 2-D feature array; columns 0-9 are declared categorical.
        y: 1-D array of target values.

    Returns:
        Tuple ``(new_x, new_y)``: the resampled features and the resampled
        values of the appended target column.
    """
    class_labels = (y > 0.5).astype(np.int32)
    # Append y as the last column so it is carried through the resampling.
    target_column = np.expand_dims(y, 1)
    augmented = np.concatenate([X, target_column], axis=1)

    sampler = SMOTENC(categorical_features=list(range(10)), random_state=0)
    resampled, _ = sampler.fit_resample(augmented, class_labels)

    return resampled[:, :-1], resampled[:, -1]
def test_smotenc_pandas():
    """DataFrame input and array input give the same resampled data."""
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y, categorical_features = data_heterogneous_unordered_multiclass()
    X_pd = pd.DataFrame(X)
    smote = SMOTENC(categorical_features=categorical_features, random_state=0)
    X_res_pd, y_res_pd = smote.fit_resample(X_pd, y)
    X_res, y_res = smote.fit_resample(X, y)
    # The resampled output for DataFrame input may itself be a DataFrame,
    # which has no ``tolist``; going through ``pd.DataFrame(...).values``
    # works whether the sampler returned a DataFrame or an array.
    assert pd.DataFrame(X_res_pd).values.tolist() == X_res.tolist()
    assert_allclose(y_res_pd, y_res)
def smote(y_name, X_train_keras, y_train_keras):
    """Run SMOTENC on the character features and print a preview.

    Args:
        y_name: name of the target column inside ``y_train_keras``.
        X_train_keras: DataFrame with ``prev_char``, ``curr_char`` and
            ``next_char`` columns. A constant ``spurrious`` column is added
            to it in place.
        y_train_keras: DataFrame holding the target column.

    Returns:
        ``(X_train_keras, y_train_keras)`` — the inputs, not the resampled
        data.
    """
    sm = SMOTENC(categorical_features=[0, 1, 2], random_state=0)
    # Constant numeric column next to the three categorical ones —
    # presumably because SMOTENC needs at least one non-categorical feature.
    X_train_keras['spurrious'] = 0.0
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name.
    X_train_2, y_train_2 = sm.fit_resample(
        X_train_keras[['prev_char', 'curr_char', 'next_char', 'spurrious']],
        y_train_keras[y_name])
    del X_train_2["spurrious"]
    print(X_train_2.head())
    print(y_train_2.head())
    # NOTE(review): the resampled X_train_2 / y_train_2 are only printed; the
    # original inputs are returned. Kept as-is since callers may rely on it —
    # confirm whether the resampled data was meant to be returned.
    return (X_train_keras, y_train_keras)
def smotenc_oversampling(DataFrame, y, cat):
    '''
    Oversample the minority class of a copy of DataFrame with SMOTENC.

    Make sure to drop the dependent variable column first, or else it would
    be considered a categorical variable.

    DataFrame: feature data; it is copied, not modified.
    y: target values.
    cat: accepted but not used; the categorical columns are obtained from
        get_indicator_columns() instead.

    Returns the resampled features and the resampled target.
    '''
    features = DataFrame.copy()
    indicator_columns = get_indicator_columns(features)
    sampler = SMOTENC(
        categorical_features=indicator_columns,
        k_neighbors=6,
        n_jobs=10,
        sampling_strategy="minority",
    )
    resampled_x, resampled_y = sampler.fit_resample(features, y)
    return resampled_x, resampled_y
def test_smotenc_raising_error_all_categorical(categorical_features):
    """SMOTENC refuses data declared as categorical-only."""
    X, y = make_classification(
        n_features=3,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_clusters_per_class=1,
    )
    sampler = SMOTENC(categorical_features=categorical_features)

    err_msg = "SMOTE-NC is not designed to work only with categorical features"
    expected_failure = pytest.raises(ValueError, match=err_msg)
    with expected_failure:
        sampler.fit_resample(X, y)
def test_smote_nc_with_null_median_std():
    """Non-regression test for issue #662.

    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/662
    """
    rows = [
        [1, 2, 1, 'A'],
        [2, 1, 2, 'A'],
        [1, 2, 3, 'B'],
        [1, 2, 4, 'C'],
        [1, 2, 5, 'C'],
    ]
    data = np.array(rows, dtype="object")
    labels = np.array(
        ['class_1', 'class_1', 'class_1', 'class_2', 'class_2'],
        dtype=object)

    sampler = SMOTENC(categorical_features=[3], k_neighbors=1,
                      random_state=0)
    X_res, y_res = sampler.fit_resample(data, labels)

    # The categorical value of the last resampled row must come from the
    # categories seen in the minority class samples rather than being random.
    last_category = X_res[-1, -1]
    assert last_category == "C"
def over_under_sampling(col, strategy):
    """Oversample with SMOTENC, then undersample the majority class.

    Operates on the module-level ``data`` frame.

    Args:
        col: name of the target column in ``data``.
        strategy: forwarded to SMOTENC as ``sampling_strategy``.

    Returns:
        The resampled features and target concatenated column-wise.
    """
    target = data[col]
    features = data.drop([col], axis=1)

    over_sampler = SMOTENC(
        k_neighbors=2,
        categorical_features=[1, 2, 3, 4, 5, 6, 7],
        sampling_strategy=strategy,
        n_jobs=2,
    )
    features, target = over_sampler.fit_resample(features, target)

    # Second pass: randomly undersample the majority class.
    under_sampler = RandomUnderSampler(sampling_strategy='majority')
    features, target = under_sampler.fit_resample(features, target)

    print('Balancing for {} finished. Result:'.format(col))
    print(Counter(target))
    return pd.concat([features, target], axis=1)
def test_smotenc(data):
    """Resampling keeps dtypes and the set of values of categorical columns."""
    X, y, categorical_features = data
    sampler = SMOTENC(categorical_features=categorical_features,
                      random_state=0)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    assert X_resampled.dtype == X.dtype

    # Accept either a boolean mask or a list of positions.
    cat_indices = np.array(categorical_features)
    if cat_indices.dtype == bool:
        cat_indices = np.flatnonzero(cat_indices)

    is_sparse = sparse.issparse(X)
    for cat_idx in cat_indices:
        original_column = X[:, cat_idx]
        resampled_column = X_resampled[:, cat_idx]
        if is_sparse:
            # For sparse input compare the stored values.
            assert set(original_column.data) == set(resampled_column.data)
        else:
            assert set(original_column) == set(resampled_column)
        assert original_column.dtype == resampled_column.dtype
def test_smotenc_error():
    """A categorical index outside the feature range raises ValueError."""
    X, y, _ = data_heterogneous_unordered()
    out_of_range_indices = [0, 10]
    sampler = SMOTENC(categorical_features=out_of_range_indices,
                      random_state=0)

    expected_failure = pytest.raises(ValueError,
                                     match="indices are out of range")
    with expected_failure:
        sampler.fit_resample(X, y)
# Tail of the preceding example section: draw the resampled data on the
# second axis and title it with the sampler's class name.
plot_resampling(X, y, sampler, ax[1])
ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# When dealing with a mix of continuous and categorical features, SMOTE-NC
# is the only method which can handle this case.

# create a synthetic data set with continuous and categorical features
rng = np.random.RandomState(42)
n_samples = 50
# Object dtype so string and numeric columns can share one array.
X = np.empty((n_samples, 3), dtype=object)
# Column 0: categorical, string labels.
X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object)
# Column 1: continuous.
X[:, 1] = rng.randn(n_samples)
# Column 2: categorical, integer codes 0-2.
X[:, 2] = rng.randint(3, size=n_samples)
# Imbalanced target: 20 samples of class 0, 30 of class 1.
y = np.array([0] * 20 + [1] * 30)

print('The original imbalanced dataset')
print(sorted(Counter(y).items()))
print('The first and last columns are containing categorical features:')
print(X[:5])

# Columns 0 and 2 are declared categorical; column 1 stays continuous.
smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
print('Dataset after resampling:')
print(sorted(Counter(y_resampled).items()))
print('SMOTE-NC will generate categories for the categorical features:')
# Show the last five rows of the resampled data.
print(X_resampled[-5:])

plt.show()