Example #1
def smotenc_over_sampler(X_data,
                         y_data,
                         categorical_features_dims):
    """Generate oversampling for training data set using SMOTENC technique.

    Args:
        X_data (pandas data frame):
        y_data (pandas vector):
        categorical_features_dims (list):

    Returns:
        X and Y datasets balanced
    """
    utils.save_log('{0} :: {1}'.format(
        smotenc_over_sampler.__module__,
        smotenc_over_sampler.__name__))

    model = SMOTENC(categorical_features=categorical_features_dims,
                    random_state=config.random_seed,
                    n_jobs=config.num_jobs)

    X, y = model.fit_resample(X_data, y_data)

    X_smotenc = pandas.DataFrame(X,
                                 columns=features_engineering.features_list)
    y_smotenc = pandas.DataFrame(y,
                                 columns=[features_engineering.target_label])

    return X_smotenc, y_smotenc
Example #2
def balance_data(X, y):
    y = y.astype('int64')
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=y)
    smotenc = SMOTENC(categorical_features=[0, 1, 2, 3, 4, 5])
    X_oversample, y_oversample = smotenc.fit_resample(xtrain, ytrain)

    return X_oversample, y_oversample, xtest, ytest
Example #3
def test_smotenc_samplers_one_label():
    X, _, categorical_features = data_heterogneous_unordered()
    y = np.zeros(30)
    smote = SMOTENC(categorical_features=categorical_features,
                    random_state=0)
    with pytest.raises(ValueError, match='needs to have more than 1 class'):
        smote.fit(X, y)
Example #4
def get_smotenc(X, y, cat_cols):
    """Upsamples categorical and non-categorical data using SMOTENC package
    Args:
        X: DataFrame, feature data that needs to be upsampled
        y: Series, tags corresponding to the given features
    
    Returns:
        us_X: DataFrame, upsampled feature data
        us_y: Series, upsampled target data
    """
    # Finding which indexes are categorical
    categorical_mask = [
        index for index, col in enumerate(X) if col in cat_cols
    ]
    X_dtypes = X.dtypes

    smote = SMOTENC(categorical_mask,
                    random_state=44,
                    n_jobs=-1,
                    k_neighbors=3)

    upsampled_data, upsampled_results = smote.fit_resample(X, y)

    # Converting the numpy arrays back to dataframes and series objects
    us_X = pd.DataFrame(upsampled_data, columns=X.columns)
    us_y = pd.Series(upsampled_results)

    # The data types are all defaulted to 'object' so I am fixing that
    for col in us_X:
        us_X[col] = us_X[col].astype(X_dtypes[col])
    return us_X, us_y
Example #5
def train_catboost_model(X, y, seed=0, iterations=100, verbose=False, upsample=True, eval_data=None, eval_labels=None):
    if upsample:
        verbose and print('upsampling...')
        categorical_features = [i for i, col in enumerate(X.columns) if X[col].dtype == 'int8']
        smote = SMOTENC(random_state=seed, categorical_features=categorical_features)
        X, y = smote.fit_resample(X, y)

    verbose and print('scaling...')
    scaling = StandardScaler()
    X = scaling.fit_transform(X)

    verbose and print('fitting...')
    verbose and print('iterations:', iterations)
    model = CatBoostClassifier(random_state=seed, iterations=iterations, cat_features=None, 
                               custom_metric=['Logloss', 'AUC:hints=skip_train~false'])

    # For early stopping
    if eval_data is not None:
        eval_dataset = Pool(eval_data, eval_labels)
        model.fit(X, y, eval_set=eval_dataset)
    else:
        model.fit(X, y)

    verbose and print('chaining pipeline...')
    pipe = Pipeline([('scaling', scaling), ('model', model)])
    verbose and print('done.')
    return pipe
Example #6
def DataAugmentation(data, labels, balance=False):
    #     ipdb.set_trace()
    categorical_features = [
        is_categorical(data[:, inx]) for inx in range(data.shape[1])
    ]
    categorical_features_index = np.where(categorical_features)[0]
    labels = labels.astype('float32')
    na_inx = np.isnan(labels)
    data_na, labels_na = data[na_inx], labels[na_inx]
    data1, labels1 = data[np.logical_not(na_inx)], labels[np.logical_not(
        na_inx)]

    if len(labels1) > 2:
        if balance:
            data1 = np.nan_to_num(data1, copy=False)
            data1 = pd.DataFrame(data1)
            data1 = data1.fillna(0)
            mappeds = []
            for ii in categorical_features_index:
                data1[ii], mapped = cat2int(data1[ii])
                mappeds.append(mapped)
            # oversample with SMOTE-NC
            sm = SMOTENC(random_state=42,
                         categorical_features=categorical_features)
            #         sm = SMOTETomek(ratio='auto')
            data1, labels1 = sm.fit_resample(data1, labels1)
            data1 = pd.DataFrame(data1)
            for mapped, ii in zip(mappeds, categorical_features_index):
                data1[ii] = int2cat(data1[ii], mapped)
            data1 = data1.values

        data = np.concatenate([data1, data_na], 0)
        labels = np.concatenate([labels1, labels_na], 0)

    return data, labels
Example #7
def test_smotenc_fit():
    X, y, categorical_features = data_heterogneous_unordered()
    smote = SMOTENC(categorical_features=categorical_features,
                    random_state=0)
    smote.fit_resample(X, y)
    assert hasattr(smote, 'sampling_strategy_'), \
        "No fitted attribute sampling_strategy_"
Example #8
def train_rf_model(X,
                   y,
                   seed=0,
                   n_estimators=100,
                   verbose=False,
                   upsample=True):
    if upsample:
        verbose and print('upsampling...')
        categorical_features = [
            i for i, col in enumerate(X.columns) if X[col].dtype == 'int8'
        ]
        smote = SMOTENC(random_state=seed,
                        categorical_features=categorical_features)
        X, y = smote.fit_resample(X, y)

    verbose and print('scaling...')
    scaling = StandardScaler()
    X = scaling.fit_transform(X)

    verbose and print('fitting...')
    verbose and print('n_estimators:', n_estimators)
    model = RandomForestClassifier(random_state=seed,
                                   n_estimators=n_estimators)
    model.fit(X, y)

    verbose and print('chaining pipeline...')
    pipe = Pipeline([('scaling', scaling), ('model', model)])
    verbose and print('done.')
    return pipe
Example #9
def resample_vals(x_train, y_train):
    """
    Prior to running a supervised classification algorithm,
    we will need to even our training target labels through
    resampling. Since many of our values are categorical,
    we will use SMOTENC.

    Parameters:
    x_train: The training dataset features.
    y_train: The training dataset targets.

    Returns:
    x_train_new: The resampled training dataset features.
    y_train_new: The resampled training dataset targets.
    """
    #   Specify categorical variables
    cats = [0, 2, 4]
    cats += list(range(10, 18))

    #   Resample all non-majority categories
    sm_alg = SMOTENC(categorical_features=cats,
                     random_state=42,
                     sampling_strategy='not majority')
    x_array, y_array = sm_alg.fit_resample(x_train, y_train)
    x_train_new = pd.DataFrame(x_array, columns=list(x_train.columns))
    y_train_new = pd.DataFrame(y_array, columns=list(y_train.columns))
    return x_train_new, y_train_new
Example #10
    def split(self, resample=False, index=None):
        self.sm = SMOTENC(categorical_features=self.categorical_features,
                          sampling_strategy='auto',
                          random_state=self.random_state,
                          k_neighbors=5,
                          n_jobs=1)

        if index is None:
            if resample:
                # If no index is provided transform the whole training set
                self.X_ttrain, self.y_train = self.sm.fit_resample(self.X_ttrain, self.y_train)
        else:
            # If an index is provided, transform the corresponding train split (on X_ttrain).
            # Note that the test split (on X_ttest) is not transformed, so performance can be
            # evaluated accurately on the unbalanced dataset. As a consequence, splitting
            # has to occur before rebalancing.

            index_train, index_test = self.splits[index]
            self.X_tttrain, self.y_ttrain = self.X_ttrain.iloc[index_train], self.y_train.iloc[index_train]
            self.X_tttest, self.y_ttest = self.X_ttrain.iloc[index_test], self.y_train.iloc[index_test]

            if resample:
                # Transform train split
                self.X_tttrain, self.y_ttrain = self.sm.fit_resample(self.X_tttrain, self.y_ttrain)

                # Match back to pandas data types
                self.X_tttrain = pd.DataFrame(self.X_tttrain, columns=self.columns)
                self.y_ttrain = pd.Series(self.y_ttrain)
            
        self.resampled = resample
        return self
Example #11
def SMOTE_cat(DFmain):
    data = DFmain
    X, y = reshape_data(DFmain)
    
    X_train, X_test, y_train, y_test = splitData(X,y, test_size= .33)
        
    sm = SMOTENC(categorical_features=[1, 2, 3, 4, 5, 6, 7, 8, 9, 14],
                 random_state=1,
                 sampling_strategy='minority')
    X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train.ravel())
    
    
    print("Before SMOTE, counts of label 'yes': {}".format(sum(y_train 
                                                                 == 'yes')))
    print("After SMOTE, the shape of X_train: ", X_train_smote.shape) 
    print("After SMOTE, the shape of y_train: ", y_train_smote.shape)  
    print("After SMOTE, counts of Class attr 'Yes': ", sum(y_train_smote 
                                                           == 'yes'))
    print("After SMOTE, counts of Class attr 'No': ", sum(y_train_smote 
                                                          == 'no'))
    
    print('\n\na) Go back to main menu')
    print('b) Go back to pre-processing menu')
    print('q) Quit')
    
    getInput = input('What would you like to do next: ')  
    
    if(getInput.lower() == 'a'):
        state = STATE_MAIN
    elif(getInput.lower() == 'b'):
        state = STATE_PREPROCESS
        showPreProcessMenu(state,data)
        
    return state
Example #12
    def _smote_data(self):
        if self.cols_nominal.size > 0:
            cats = self.X_train.columns.isin(self.cols_nominal)
            sm = SMOTENC(categorical_features=cats, sampling_strategy='not majority', random_state=self.random_state)
        else:
            sm = SMOTE(sampling_strategy='not majority', random_state=self.random_state)
        self.X_train, self.y_train = sm.fit_resample(self.X_train, self.y_train)
Example #13
    def SMOTENC_augmentation_cat(self, X_train_selected, y_train, cat_indexes):

        sm = SMOTENC(random_state=41, categorical_features=cat_indexes)
        X_train_selected_aug, y_train_aug = sm.fit_resample(
            X_train_selected, y_train)

        return X_train_selected_aug, y_train_aug
Example #14
def process_training_data(X,
                          y_orig,
                          do_simple_duplicate=False,
                          do_smote=True,
                          max_first_feature=0,
                          do_one_hot=True,
                          with_category=False):
    if (do_smote):
        if (with_category):
            sm = SMOTENC(categorical_features=[0], random_state=42)
        else:
            sm = SMOTE(random_state=42)
        X, y_orig = sm.fit_resample(X, y_orig.reshape(-1))
    elif (do_simple_duplicate):
        c = Counter(y_orig[:, -1])
        mc = c.most_common()[0]
        dup_num = []
        for cc in c.most_common():
            dup_num.append(mc[1] - cc[1])
        dup_x = np.zeros((0, X.shape[1]))
        dup_y = np.zeros((0, y_orig.shape[1]))
        for i in range(len(c)):
            class_ = c.most_common()[i][0]
            idx_c = np.argwhere(y_orig == class_)[:, 0].reshape(-1)
            idx_c = np.random.permutation(idx_c)
            if (idx_c.shape[0] >= dup_num[i]):
                idx_c = idx_c[:dup_num[i]]
            elif (idx_c.shape[0] < dup_num[i]):
                idx_c_ = idx_c[:(dup_num[i] - idx_c.shape[0])]
                dup_x = np.vstack((dup_x, X[idx_c_, :]))
                dup_y = np.vstack((dup_y, y_orig[idx_c_, :]))
            dup_x = np.vstack((dup_x, X[idx_c, :]))
            dup_y = np.vstack((dup_y, y_orig[idx_c, :]))
        X = np.vstack((X, dup_x))
        y_orig = np.vstack((y_orig, dup_y))
        idx = [i for i in range(X.shape[0])]
        idx = np.random.permutation(idx)
        X = X[idx]
        y_orig = y_orig[idx].astype(int)

    if (do_one_hot):
        if (max_first_feature == 0):
            max_first_feature = np.max(X[:, 0]).astype(int)
        one_hot_first_feature = np.eye(max_first_feature)[
            X[:, 0].reshape(-1).astype(int) - 1]
        X = np.hstack((one_hot_first_feature, X[:, 1:]))
    max_minus_min = np.max(X, axis=0) - np.min(X, axis=0)
    idx = np.argwhere(max_minus_min.astype(int) == 0)
    if (idx.shape[0] != 0):
        max_minus_min[idx[:, 0]] = 1
    X = (X - np.mean(X, axis=0)) / max_minus_min
    # X = normalize(X, axis=0)
    # X[:, -1] = normalize(X[:, -1].reshape(-1, 1), axis=0)

    # Note: when do_one_hot is set, X already contains the one-hot columns from above,
    # so it can be used directly here.
    new_x = X

    return new_x, y_orig.reshape(-1, 1)
Example #15
def test_smotenc_fit_resample():
    X, y, categorical_features = data_heterogneous_unordered()
    target_stats = Counter(y)
    smote = SMOTENC(categorical_features=categorical_features, random_state=0)
    _, y_res = smote.fit_resample(X, y)
    _ = Counter(y_res)
    n_samples = max(target_stats.values())
    assert all(value >= n_samples for value in Counter(y_res).values())
Example #16
def oversampling_cat(X_train, y_train, categorical_features_indices):
    # over_sampler = RandomOverSampler()
    # X_train_res, y_train_res = over_sampler.fit_sample(X_train, y_train)
    sm = SMOTENC(random_state=42,
                 categorical_features=categorical_features_indices)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    print('Resampled dataset shape %s' % Counter(y_train_res))
    return X_train_res, y_train_res
Example #17
def test_smotenc_fit_resample_sampling_strategy():
    X, y, categorical_features = data_heterogneous_unordered_multiclass()
    expected_stat = Counter(y)[1]
    smote = SMOTENC(categorical_features=categorical_features, random_state=0)
    sampling_strategy = {2: 25, 0: 25}
    smote.set_params(sampling_strategy=sampling_strategy)
    X_res, y_res = smote.fit_resample(X, y)
    assert Counter(y_res)[1] == expected_stat
Example #18
def test_heterogeneous_smote_k_custom_nn(heterogeneous_data):
    X, y, categorical_features = heterogeneous_data
    smote = SMOTENC(categorical_features,
                    k_neighbors=_CustomNearestNeighbors(n_neighbors=5))
    X_res, y_res = smote.fit_resample(X, y)

    assert X_res.shape == (40, 4)
    assert Counter(y_res) == {0: 20, 1: 20}
Example #19
def oversample(dataframe: pd.DataFrame, cat_feats):
    X = dataframe.drop("Reservation_Status", axis="columns")
    y = dataframe.loc[:, "Reservation_Status"]
    smote_enc = SMOTENC(categorical_features=cat_feats, random_state=42)
    X_res, y_res = smote_enc.fit_resample(X, y)
    out_df = X_res.copy(deep=True)
    out_df["Reservation_Status"] = y_res
    return out_df
Example #20
def main():
    logger = logging.getLogger(__name__)

    processed_df = pd.read_csv(f'../../data/processed/processed.csv')

    id_col = ['customerID']
    target_col = ["Churn"]
    cols = [i for i in processed_df.columns if i not in id_col + target_col]

    cate_cols = processed_df.nunique()[processed_df.nunique() ==
                                       2].keys().tolist()
    cate_cols = [col for col in cate_cols if col not in target_col]
    cate_cols_idx = [processed_df.columns.get_loc(col) for col in cate_cols]

    smote_X = processed_df[cols]
    smote_Y = processed_df[target_col]

    smote_train_X, smote_test_X, smote_train_Y, smote_test_Y = train_test_split(
        smote_X, smote_Y, test_size=.25, random_state=111)
    logger.info(f'Applying SMOTE')

    os = SMOTENC(categorical_features=cate_cols_idx,
                 sampling_strategy='minority',
                 random_state=0)
    os_smote_X, os_smote_Y = os.fit_resample(smote_train_X, smote_train_Y)
    os_smote_X = pd.DataFrame(data=os_smote_X, columns=cols)
    os_smote_Y = pd.DataFrame(data=os_smote_Y, columns=target_col)

    logger.info(f'Fitting Logistic Regression and Tuning')

    lr = LogisticRegression(max_iter=500)

    clf = GridSearchCV(estimator=lr, param_grid=LogisticRegression_grid, cv=5)

    best_model = clf.fit(os_smote_X.values, os_smote_Y.values.ravel())

    logger.info(f'Best Parameters: {best_model.best_params_}')

    metrics = create_report(best_model, smote_test_X, smote_test_Y)
    logger.info(f'{metrics}')
    f = open(f'../../models/logistigregression_best_metrics.txt', 'w')
    f.write(metrics)
    f.close()
    joblib.dump(best_model, f'../../models/logsticreg_best.pkl', compress=9)
    logger.info(f'Model and Evaluation saved to "models/"')

    logger.info('Visualising metrics')

    plot_report(processed_df=processed_df,
                algorithm=best_model.best_estimator_,
                test_X=smote_test_X,
                test_Y=smote_test_Y,
                cf='coefficients',
                name='Logistic Regression')

    logger.info('DOWNLOAD PLOT FROM PLOTLY')

    return
Example #21
def test_smotenc_fit_resample():
    X, y, categorical_features = data_heterogneous_unordered()
    target_stats = Counter(y)
    smote = SMOTENC(categorical_features=categorical_features,
                    random_state=0)
    X_res, y_res = smote.fit_resample(X, y)
    target_stats_res = Counter(y_res)
    n_samples = max(target_stats.values())
    assert all(value >= n_samples for value in Counter(y_res).values())
Example #22
def test_smotenc_fit_resample_sampling_strategy():
    X, y, categorical_features = data_heterogneous_unordered_multiclass()
    expected_stat = Counter(y)[1]
    smote = SMOTENC(categorical_features=categorical_features,
                    random_state=0)
    sampling_strategy = {2: 25, 0: 25}
    smote.set_params(sampling_strategy=sampling_strategy)
    X_res, y_res = smote.fit_resample(X, y)
    assert Counter(y_res)[1] == expected_stat
Example #23
    def _smote_data(self):
        """Performs a SMOTE upsampling of the data. If nominal columns are
        detected, it switches to the SMOTE-NC variant."""

        if self.cols_nominal.size > 0:
            cats = self.X_train.columns.isin(self.cols_nominal)
            sm = SMOTENC(categorical_features=cats, sampling_strategy='not majority', random_state=self.random_state)
        else:
            sm = SMOTE(sampling_strategy='not majority', random_state=self.random_state)
        self.X_train, self.y_train = sm.fit_resample(self.X_train, self.y_train)
Example #24
def smotenc_generater(sampling_strategy=None):
    data = pd.read_csv('/tmp/data/small_train.csv')
    data.rename(columns={'hour': 'time'}, inplace=True)
    data['time'] = data['time'].astype('str')
    data['hour'] = data['time'].str[6:]

    # 1. Label encoding for sparse features, and a simple transformation of dense features
    data[sparse_features] = data[sparse_features].fillna('-1', )
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    for feat in dense_features:
        minmax = MinMaxScaler()
        data[feat] = minmax.fit_transform(data[feat].values.reshape(-1, 1))

    # 1.1 smotenc
    # categorical_features = ['hour', 'C1', 'banner_pos',
    #                         'site_category', 'app_category',  # 'device_ip',
    #                         'device_type', 'device_conn_type', 'C15', 'C16', 'C18']

    categorical_list = []
    for cf in categorical_features:
        for num, sf in enumerate(sparse_features):
            if cf == sf:
                categorical_list.append(num)

    train, test = train_test_split(data, test_size=0.2, random_state=2020)

    X_train = train[sparse_features].values
    Y_train = list(train['click'])

    if sampling_strategy:
        print("This smotenc generater using " + str(sampling_strategy))
        smote_nc = SMOTENC(categorical_features=categorical_list,
                           random_state=0,
                           sampling_strategy=sampling_strategy)
    else:
        smote_nc = SMOTENC(categorical_features=categorical_list,
                           random_state=0)

    X_smotenc, Y_smotenc = smote_nc.fit_resample(X_train, Y_train)

    train = pd.DataFrame(X_smotenc, columns=sparse_features)
    train = pd.concat(
        [train, pd.DataFrame(Y_smotenc, columns=['click'])], axis=1)

    for i in categorical_features:
        train[i] = train[i].astype(int)

    print("writing trian file ...")
    train.to_csv('/tmp/data/mayi_smotenc_train_03.csv', index=False)
    print("trian file write done")

    print("writing test file ...")
    test.to_csv('/tmp/data/mayi_smotenc_test_03.csv', index=False)
    print("test file write done")
Example #25
def test_smotenc_pandas():
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y, categorical_features = data_heterogneous_unordered_multiclass()
    X_pd = pd.DataFrame(X)
    smote = SMOTENC(categorical_features=categorical_features, random_state=0)
    X_res_pd, y_res_pd = smote.fit_resample(X_pd, y)
    X_res, y_res = smote.fit_resample(X, y)
    assert_array_equal(X_res_pd.to_numpy(), X_res)
    assert_allclose(y_res_pd, y_res)
Example #26
def test_smotenc_preserve_dtype():
    X, y = make_classification(n_samples=50, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    # Cast X and y to not default dtype
    X = X.astype(np.float32)
    y = y.astype(np.int32)
    smote = SMOTENC(categorical_features=[1], random_state=0)
    X_res, y_res = smote.fit_resample(X, y)
    assert X.dtype == X_res.dtype, "X dtype is not preserved"
    assert y.dtype == y_res.dtype, "y dtype is not preserved"
Example #27
def validation_norm_pipeline(model, cv, X_train, y_train):

    oversample = SMOTENC(categorical_features=[
        2, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        42, 43, 44
    ],
                         random_state=42)

    cv_scores = []
    i = 1
    for train_index_fold, validation_index_fold in cv.split(X_train, y_train):

        #Training Data
        X_train_fold, y_train_fold = X_train[train_index_fold], y_train[
            train_index_fold]

        #Validation Data
        X_validation_fold, y_validation_fold = X_train[
            validation_index_fold], y_train[validation_index_fold]

        #Apply SMOTE to upsample training data
        X_upsampled_train_fold, y_upsampled_train_fold = oversample.fit_resample(
            X_train_fold, y_train_fold)

        #Apply Min-Max Normalization
        scaler = MinMaxScaler()

        #Fit on training set
        scaler.fit(X_upsampled_train_fold)

        #scale on training set
        X_upsampled_train_fold = scaler.transform(X_upsampled_train_fold)

        #scale the validation dataset
        X_validation_fold = scaler.transform(X_validation_fold)

        #Fit the model
        clf = model.fit(X_upsampled_train_fold, y_upsampled_train_fold)

        #Compute score on validation set
        score = balanced_accuracy_score(y_validation_fold,
                                        clf.predict(X_validation_fold))
        print("Fold " + str(i) + " accuracy: " + str(score))
        i += 1
        cv_scores.append(score)

    mean = sum(cv_scores) / len(cv_scores)

    return mean
Example #28
    def convert_and_save(self, processed_datapath):
        """변환한 데이터를 파일로 저장."""
        datapath = self.orig_file
        column_type = json.load(open(f"{CONFIGPATH}/column_list.json"))

        df = pd.read_csv(datapath, sep=",", dtype=column_type)

        #print (df)
        df = self.fillempty(df, column_type)
        df = df.apply(pd.to_numeric)

        print(f"dfiloc {df.iloc[1, 1:]}")
        #print("df nan number =" + df.isna().sum())
        print(f"type {df.iloc[1, 2]}")
        print(f"type {type(df.iloc[1, 2])}")

        # smote
        from imblearn.over_sampling import SMOTENC
        categorial_list = [x for x in range(235)]
        smote = SMOTENC(random_state=42, categorical_features=categorial_list)
        train_input, train_label = smote.fit_resample(df.iloc[:, 1:],
                                                      df.iloc[:, :1])

        print(type(train_label))
        print(type(train_input))

        train_label = np.expand_dims(train_label, axis=-1)
        print(f'train_input.shape is {train_input.shape}')
        print(f'train_label.shape is {train_label.shape}')

        np_smote = np.concatenate([train_label, train_input], axis=-1)
        df_smote = pd.DataFrame(np_smote)

        print(f"df.iloc[0,:] is {df.iloc[0,:]}")

        print(f"df.columns is {df.columns}")

        print(f"df_smote.columns is {df_smote.columns}")
        df_smote.columns = df.columns
        print(f"df_smote.columns is {df_smote.columns}")

        #df_train_input = pd.DataFrame(train_input)
        #df_train_label = pd.DataFrame(train_label)

        #df[:, 1:] = df_train_input
        #df[:, :1] = df_train_label
        # end of SMOTE

        df, column_names = self.convert_dataset(df_smote, column_type)

        np.random.shuffle(df)
        np.save(processed_datapath, df)
        np.save(processed_datapath.replace(".npy", "_columnnames.npy"),
                column_names)
        print("Saved at %s" % processed_datapath)
Example #29
def test_smotenc_check_target_type():
    X, _, categorical_features = data_heterogneous_unordered()
    y = np.linspace(0, 1, 30)
    smote = SMOTENC(categorical_features=categorical_features,
                    random_state=0)
    with pytest.raises(ValueError, match="Unknown label type: 'continuous'"):
        smote.fit_resample(X, y)
    rng = np.random.RandomState(42)
    y = rng.randint(2, size=(20, 3))
    with pytest.raises(ValueError, match="'y' should encode the multiclass"):
        smote.fit_resample(X, y)
Example #30
def smote(X, y):

    y_cat = (y > 0.5).astype(np.int32)

    X = np.concatenate([X, np.expand_dims(y, 1)], axis=1)

    smote_nc = SMOTENC(categorical_features=np.arange(0, 10).tolist(), random_state=0)
    X_resampled, y_resampled = smote_nc.fit_resample(X, y_cat)
    new_x = X_resampled[:, :-1]
    new_y = X_resampled[:, -1]
    return new_x, new_y
Example #31
def test_smotenc_pandas():
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y, categorical_features = data_heterogneous_unordered_multiclass()
    X_pd = pd.DataFrame(X)
    smote = SMOTENC(categorical_features=categorical_features,
                    random_state=0)
    X_res_pd, y_res_pd = smote.fit_resample(X_pd, y)
    X_res, y_res = smote.fit_resample(X, y)
    assert X_res_pd.tolist() == X_res.tolist()
    assert_allclose(y_res_pd, y_res)
Example #32
def smote(y_name, X_train_keras, y_train_keras):
    #    sm = SMOTENC(categorical_features=['prev_char', 'curr_char', 'next_char'], random_state=0, sampling_strategy=0.6)
    sm = SMOTENC(categorical_features=[0, 1, 2], random_state=0)
    X_train_keras['spurrious'] = 0.0
    X_train_2, y_train_2 = sm.fit_resample(
        X_train_keras[['prev_char', 'curr_char', 'next_char', 'spurrious']],
        y_train_keras[y_name])
    del X_train_2["spurrious"]
    print(X_train_2.head())
    print(y_train_2.head())
    return (X_train_2, y_train_2)
Example #33
def smotenc_oversampling(DataFrame, y, cat):
    '''
    Make sure to drop the dependent variable column first, or else it will be
    treated as a categorical variable.
    '''
    dataset = DataFrame.copy()
    cat_list = get_indicator_columns(dataset)
    se = SMOTENC(
        categorical_features=cat_list, k_neighbors=6, n_jobs=10, sampling_strategy="minority"
    )
    resampled_x, resampled_y = se.fit_resample(dataset, y)
    return resampled_x, resampled_y
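A minimal usage sketch for the helper above, under assumed names (a hypothetical frame df with a target column "label"; get_indicator_columns comes from the surrounding project). The point from the docstring is that the target is dropped from the features before the call, so it is not picked up as a categorical column:

# Hypothetical usage; df, "label" and the unused cat argument are placeholders.
features = df.drop(columns=["label"])
resampled_x, resampled_y = smotenc_oversampling(features, df["label"], cat=None)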
Example #34
def test_smotenc_raising_error_all_categorical(categorical_features):
    X, y = make_classification(
        n_features=3,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_clusters_per_class=1,
    )
    smote = SMOTENC(categorical_features=categorical_features)
    err_msg = "SMOTE-NC is not designed to work only with categorical features"
    with pytest.raises(ValueError, match=err_msg):
        smote.fit_resample(X, y)
Example #35
def test_smote_nc_with_null_median_std():
    # Non-regression test for #662
    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/662
    data = np.array([[1, 2, 1, 'A'], [2, 1, 2, 'A'], [1, 2, 3, 'B'],
                     [1, 2, 4, 'C'], [1, 2, 5, 'C']],
                    dtype="object")
    labels = np.array(['class_1', 'class_1', 'class_1', 'class_2', 'class_2'],
                      dtype=object)
    smote = SMOTENC(categorical_features=[3], k_neighbors=1, random_state=0)
    X_res, y_res = smote.fit_resample(data, labels)
    # check that the categorical feature is not random but correspond to the
    # categories seen in the minority class samples
    assert X_res[-1, -1] == "C"
Example #36
def test_smotenc_preserve_dtype():
    X, y = make_classification(n_samples=50,
                               n_classes=3,
                               n_informative=4,
                               weights=[0.2, 0.3, 0.5],
                               random_state=0)
    # Cast X and y to not default dtype
    X = X.astype(np.float32)
    y = y.astype(np.int32)
    smote = SMOTENC(categorical_features=[1], random_state=0)
    X_res, y_res = smote.fit_resample(X, y)
    assert X.dtype == X_res.dtype, "X dtype is not preserved"
    assert y.dtype == y_res.dtype, "y dtype is not preserved"
Example #37
def over_under_sampling(col, strategy):
    y = data[col]
    X = data.drop([col], axis=1)
    sampler = SMOTENC(k_neighbors=2,
                      categorical_features=[1, 2, 3, 4, 5, 6, 7],
                      sampling_strategy=strategy,
                      n_jobs=2)
    X, y = sampler.fit_resample(X, y)
    under_sampler = RandomUnderSampler(sampling_strategy='majority')
    X, y = under_sampler.fit_resample(X, y)
    print('Balancing for {} finished. Result:'.format(col))
    print(Counter(y))
    return pd.concat([X, y], axis=1)
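The example above wires SMOTENC over-sampling and random under-sampling together by hand. A sketch of the same idea using imblearn's own Pipeline is shown below; the categorical indices and the final classifier are placeholder choices, not taken from the example:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier

# Placeholder indices and estimator; the samplers are applied only during fit.
balanced_clf = Pipeline(steps=[
    ('over', SMOTENC(categorical_features=[1, 2, 3], k_neighbors=2)),
    ('under', RandomUnderSampler(sampling_strategy='majority')),
    ('clf', DecisionTreeClassifier(random_state=0)),
])
# balanced_clf.fit(X_train, y_train)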
Example #38
def test_smotenc(data):
    X, y, categorical_features = data
    smote = SMOTENC(random_state=0, categorical_features=categorical_features)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    assert X_resampled.dtype == X.dtype

    categorical_features = np.array(categorical_features)
    if categorical_features.dtype == bool:
        categorical_features = np.flatnonzero(categorical_features)
    for cat_idx in categorical_features:
        if sparse.issparse(X):
            assert set(X[:, cat_idx].data) == set(X_resampled[:, cat_idx].data)
            assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype
        else:
            assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx])
            assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype
Example #39
def test_smotenc_error():
    X, y, _ = data_heterogneous_unordered()
    categorical_features = [0, 10]
    smote = SMOTENC(random_state=0, categorical_features=categorical_features)
    with pytest.raises(ValueError, match="indices are out of range"):
        smote.fit_resample(X, y)
Example #40
###############################################################################
# When dealing with a mix of continuous and categorical features, SMOTE-NC
# is the only method which can handle this case.

# create a synthetic data set with continuous and categorical features
rng = np.random.RandomState(42)
n_samples = 50
X = np.empty((n_samples, 3), dtype=object)
X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object)
X[:, 1] = rng.randn(n_samples)
X[:, 2] = rng.randint(3, size=n_samples)
y = np.array([0] * 20 + [1] * 30)

print('The original imbalanced dataset')
print(sorted(Counter(y).items()))
print('The first and last columns contain categorical features:')
print(X[:5])

smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
print('Dataset after resampling:')
print(sorted(Counter(y_resampled).items()))
print('SMOTE-NC will generate categories for the categorical features:')
print(X_resampled[-5:])

plt.show()
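Several of the snippets above derive categorical_features from column dtypes rather than hard-coding indices. A minimal, self-contained sketch of that pattern, using toy data invented purely for illustration:

import pandas as pd
from imblearn.over_sampling import SMOTENC

# Toy frame with one numeric and one categorical column (invented for illustration).
X = pd.DataFrame({'amount': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                  'color': ['r', 'g', 'r', 'b', 'r', 'r']})
y = [0, 0, 0, 0, 1, 1]

# Derive categorical column positions from dtypes, as several snippets above do.
cat_idx = [i for i, dtype in enumerate(X.dtypes)
           if dtype == object or str(dtype) == 'category']

sm = SMOTENC(categorical_features=cat_idx, k_neighbors=1, random_state=0)
X_res, y_res = sm.fit_resample(X, y)
print(sorted(pd.Series(y_res).value_counts().items()))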