Esempio n. 1
0
def SMOTE_cat(DFmain):
    """Oversample the training split with SMOTE-NC and prompt for the next menu.

    The frame is turned into features/labels by ``reshape_data`` and split by
    ``splitData`` (both defined elsewhere in this project). Only the training
    portion is resampled; class counts before and after are printed.

    Parameters
    ----------
    DFmain : the dataset handed to ``reshape_data`` and, when the user goes
        back to the pre-processing menu, to ``showPreProcessMenu``.

    Returns
    -------
    ``STATE_MAIN`` for choice 'a', ``STATE_PREPROCESS`` for choice 'b', and
    ``None`` for 'q' or any unrecognised input.
    NOTE(review): the original raised ``UnboundLocalError`` in the latter
    case; confirm that the caller treats ``None`` as "quit".
    """
    X, y = reshape_data(DFmain)

    X_train, X_test, y_train, y_test = splitData(X, y, test_size=.33)

    # Column positions 1-9 and 14 are treated as categorical by SMOTE-NC.
    sm = SMOTENC(categorical_features=[1, 2, 3, 4, 5, 6, 7, 8, 9, 14],
                 random_state=1,
                 sampling_strategy='minority')
    # ``fit_sample`` was a legacy alias that recent imbalanced-learn releases
    # removed; ``fit_resample`` exists in every release that ships SMOTENC.
    X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train.ravel())

    print("Before SMOTE, counts of label 'yes': {}".format(
        sum(y_train == 'yes')))
    print("After SMOTE, the shape of X_train: ", X_train_smote.shape)
    print("After SMOTE, the shape of y_train: ", y_train_smote.shape)
    print("After SMOTE, counts of Class attr 'Yes': ",
          sum(y_train_smote == 'yes'))
    print("After SMOTE, counts of Class attr 'No': ",
          sum(y_train_smote == 'no'))

    print('\n\na) Go back to main menu')
    print('b) Go back to pre-processing menu')
    print('q) Quit')

    choice = input('What would you like to do next: ').lower()

    # Default covers 'q' and anything unexpected so the return below can
    # never hit an unbound local.
    state = None
    if choice == 'a':
        state = STATE_MAIN
    elif choice == 'b':
        state = STATE_PREPROCESS
        showPreProcessMenu(state, DFmain)

    return state
Esempio n. 2
0
 def _smote_data(self):
     """Oversample ``self.X_train`` / ``self.y_train`` in place.

     Uses SMOTE-NC when ``self.cols_nominal`` is non-empty (the nominal
     columns are passed as a boolean mask over ``self.X_train.columns``),
     plain SMOTE otherwise. Every class except the majority is resampled.
     """
     if self.cols_nominal.size > 0:
         cats = self.X_train.columns.isin(self.cols_nominal)
         sm = SMOTENC(categorical_features=cats, sampling_strategy='not majority', random_state=self.random_state)
     else:
         sm = SMOTE(sampling_strategy='not majority', random_state=self.random_state)
     # ``fit_sample`` was a legacy alias removed from recent imbalanced-learn
     # releases; ``fit_resample`` is the supported spelling.
     self.X_train, self.y_train = sm.fit_resample(self.X_train, self.y_train)
Esempio n. 3
0
def DataAugmentation(data, labels, balance=False):
    """Optionally balance the labelled rows with SMOTE-NC, keeping unlabelled rows.

    Rows whose label is NaN are set aside, the labelled rows are (when
    ``balance`` is true) oversampled, and the unlabelled rows are appended
    back at the end.

    Parameters
    ----------
    data : 2-D numpy array of features (indexed as ``data[:, i]``).
    labels : numpy array of labels; cast to ``float32`` so NaN marks
        "unlabelled".
    balance : when true, oversample the labelled rows with SMOTE-NC.

    Returns
    -------
    (data, labels) : when more than two labelled rows exist, the labelled
        rows (possibly oversampled) followed by the unlabelled rows;
        otherwise the input data with the float32-cast labels, in the
        original order.

    Notes
    -----
    ``is_categorical``, ``cat2int`` and ``int2cat`` are project helpers
    defined elsewhere; presumably they detect categorical columns and map
    categories to/from integer codes -- verify against their definitions.
    """
    categorical_features = [
        is_categorical(data[:, inx]) for inx in range(data.shape[1])
    ]
    categorical_features_index = np.where(categorical_features)[0]
    labels = labels.astype('float32')
    na_inx = np.isnan(labels)
    labelled = np.logical_not(na_inx)
    data_na, labels_na = data[na_inx], labels[na_inx]
    data1, labels1 = data[labelled], labels[labelled]

    # The original tested ``len(labels1 > 2)`` -- the length of a boolean
    # array, i.e. merely "non-empty". The intended guard is on the count.
    if len(labels1) > 2:
        if balance:
            # Crude imputation: SMOTE cannot handle missing values.
            data1 = np.nan_to_num(data1, copy=False)
            data1 = pd.DataFrame(data1)
            data1 = data1.fillna(0)
            mappeds = []
            for ii in categorical_features_index:
                data1[ii], mapped = cat2int(data1[ii])
                mappeds.append(mapped)
            sm = SMOTENC(random_state=42,
                         categorical_features=categorical_features)
            # ``fit_sample`` was a legacy alias removed from recent
            # imbalanced-learn releases.
            data1, labels1 = sm.fit_resample(data1, labels1)
            data1 = pd.DataFrame(data1)
            # Restore the original category values on the resampled rows.
            for mapped, ii in zip(mappeds, categorical_features_index):
                data1[ii] = int2cat(data1[ii], mapped)
            data1 = data1.values

        data = np.concatenate([data1, data_na], 0)
        labels = np.concatenate([labels1, labels_na], 0)

    return data, labels
Esempio n. 4
0
def main():
    """Train, tune and persist a logistic-regression churn model on SMOTE-NC data.

    Reads the processed CSV, splits it 75/25, oversamples the minority class
    of the training part, grid-searches ``LogisticRegression`` with 5-fold
    CV, then writes the metrics report and the pickled search object under
    ``../../models/`` and plots the report.

    ``LogisticRegression_grid``, ``create_report`` and ``plot_report`` are
    defined elsewhere in the project.
    """
    logger = logging.getLogger(__name__)

    processed_df = pd.read_csv('../../data/processed/processed.csv')

    id_col = ['customerID']
    target_col = ["Churn"]
    cols = [i for i in processed_df.columns if i not in id_col + target_col]

    # Binary columns are treated as categorical. The positions must be
    # relative to the feature frame given to SMOTENC (``cols``), not to
    # ``processed_df``, whose id/target columns would shift the indices.
    n_unique = processed_df.nunique()
    cate_cols_idx = [
        pos for pos, col in enumerate(cols) if n_unique[col] == 2
    ]

    smote_X = processed_df[cols]
    smote_Y = processed_df[target_col]

    smote_train_X, smote_test_X, smote_train_Y, smote_test_Y = train_test_split(
        smote_X, smote_Y, test_size=.25, random_state=111)
    logger.info('Applying SMOTE')

    # Named ``oversampler`` rather than ``os`` to avoid shadowing the stdlib
    # module name.
    oversampler = SMOTENC(categorical_features=cate_cols_idx,
                          sampling_strategy='minority',
                          random_state=0)
    # ``fit_sample`` was a legacy alias removed from recent imbalanced-learn
    # releases.
    os_smote_X, os_smote_Y = oversampler.fit_resample(smote_train_X,
                                                      smote_train_Y)
    os_smote_X = pd.DataFrame(data=os_smote_X, columns=cols)
    os_smote_Y = pd.DataFrame(data=os_smote_Y, columns=target_col)

    logger.info('Fitting Logistic Regression and Tuning')

    lr = LogisticRegression(max_iter=500)

    clf = GridSearchCV(estimator=lr, param_grid=LogisticRegression_grid, cv=5)

    best_model = clf.fit(os_smote_X.values, os_smote_Y.values.ravel())

    logger.info('Best Parameters: %s', best_model.best_params_)

    metrics = create_report(best_model, smote_test_X, smote_test_Y)
    logger.info('%s', metrics)
    # Context manager guarantees the handle is closed even if write() fails.
    with open('../../models/logistigregression_best_metrics.txt', 'w') as f:
        f.write(metrics)
    joblib.dump(best_model, '../../models/logsticreg_best.pkl', compress=9)
    logger.info('Model and Evaluation saved to "models/"')

    logger.info('Visualising metrics')

    plot_report(processed_df=processed_df,
                algorithm=best_model.best_estimator_,
                test_X=smote_test_X,
                test_Y=smote_test_Y,
                cf='coefficients',
                name='Logistic Regression')

    logger.info('DOWNLOAD PLOT FROM PLOTLY')
    def _smote_data(self):
        """Oversample ``self.X_train`` / ``self.y_train`` in place.

        SMOTE-NC is used when ``self.cols_nominal`` is non-empty (the nominal
        columns are passed as a boolean mask over ``self.X_train.columns``);
        otherwise plain SMOTE is used. Every class except the majority is
        resampled.
        """
        if self.cols_nominal.size > 0:
            cats = self.X_train.columns.isin(self.cols_nominal)
            sm = SMOTENC(categorical_features=cats, sampling_strategy='not majority', random_state=self.random_state)
        else:
            sm = SMOTE(sampling_strategy='not majority', random_state=self.random_state)
        # ``fit_sample`` was a legacy alias removed from recent
        # imbalanced-learn releases; ``fit_resample`` is the supported name.
        self.X_train, self.y_train = sm.fit_resample(self.X_train, self.y_train)
Esempio n. 6
0
def smote(y_name, X_train_keras, y_train_keras):
    """Run SMOTE-NC on the three character columns and print a preview.

    Parameters
    ----------
    y_name : column of ``y_train_keras`` used as the class label.
    X_train_keras : DataFrame with ``prev_char``, ``curr_char`` and
        ``next_char`` columns. A constant ``spurrious`` column is added to
        it in place.
    y_train_keras : DataFrame holding the label column.

    Returns
    -------
    The *original* ``(X_train_keras, y_train_keras)`` pair.
    NOTE(review): the resampled frames are only printed, never returned, and
    the caller's frame keeps the extra ``spurrious`` column. This looks like
    an unfinished experiment; behaviour is kept as-is so existing callers
    are unaffected -- confirm whether the resampled data should be returned.
    """
    sm = SMOTENC(categorical_features=[0, 1, 2], random_state=0)
    # SMOTE-NC refuses input made only of categorical features, so a
    # constant numeric column is added as a placeholder.
    X_train_keras['spurrious'] = 0.0
    # ``fit_sample`` was a legacy alias removed from recent imbalanced-learn
    # releases.
    X_train_2, y_train_2 = sm.fit_resample(
        X_train_keras[['prev_char', 'curr_char', 'next_char', 'spurrious']],
        y_train_keras[y_name])
    del X_train_2["spurrious"]
    print(X_train_2.head())
    print(y_train_2.head())
    return (X_train_keras, y_train_keras)
Esempio n. 7
0
def custom_smote(df, cat_cols, random_state=1234):
    """
    Creates a synthetic DataFrame resembling the input DataFrame.

    The shuffled input is used twice: three stacked copies form an
    artificial "majority" class (``__flag_value`` = 0) and one copy forms
    the "minority" class (``__flag_value`` = 1). SMOTENC then oversamples
    the minority class, and only the rows flagged as minority (the original
    rows plus the synthetic ones) are returned, with the helper column
    removed and the input dtypes restored.

    Parameters
    ----------
    df : pd.DataFrame
    cat_cols : list
        Categorical columns, forwarded to SMOTENC's ``categorical_features``
    random_state : int
        Seed for SMOTENC (the initial shuffle is not seeded)

    Returns
    -------
    output : pd.DataFrame
    """
    df_dtypes = df.dtypes.astype('str').to_dict()
    logging.debug("shuffling the data just in case if the datafile "
                  "itself has majority and minority grouped together")
    df = df.sample(frac=1)
    df_dtypes.update({'__flag_value': 'int8'})

    minority = df.copy()
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` builds
    # the same stacked frames.
    majority = pd.concat([df, df, df], ignore_index=True)
    minority['__flag_value'] = 1
    majority['__flag_value'] = 0
    df_x = pd.concat([majority, minority], ignore_index=True)
    y = df_x['__flag_value']

    logging.info("Performing Smote operation on the DataFrame")
    sm = SMOTENC(categorical_features=cat_cols,
                 random_state=random_state,
                 k_neighbors=6)
    # ``fit_sample`` was a legacy alias removed from recent imbalanced-learn
    # releases. The flag column stays in the feature frame so the minority
    # rows can be picked out of the result below.
    resampled_x, _ = sm.fit_resample(df_x, y)

    logging.info("Creating DataFrame from synthetic results, "
                 "and casting Input DataFrame column names and dtypes")
    output = pd.DataFrame(resampled_x,
                          columns=list(df_dtypes.keys())).astype(df_dtypes)
    output = output[output['__flag_value'] != 0]
    return output.drop('__flag_value', axis='columns')
Esempio n. 8
0
def apply_smote(df, index, cat_vars=False):
    """Oversample ``df`` with SMOTE (or SMOTE-NC) and return the resampled features.

    Parameters
    ----------
    df : DataFrame to be oversampled; it should not have been transformed
        beforehand. NOTE: a ``Class`` column is added to it in place.
    index : boolean values, one per row; True marks the minority group and
        False the majority group.
    cat_vars : list of categorical column names, or a falsy value (default)
        when every column is numerical.

    Returns
    -------
    DataFrame with the resampled features (the ``Class`` column is not
    included). When ``cat_vars`` is given, the remaining columns are cast
    to float.
    """
    from imblearn.over_sampling import SMOTENC, SMOTE
    import pandas as pd
    import numpy as np

    # Assign class 1 to the minority class and 0 to the majority class.
    df['Class'] = index
    df['Class'] = df['Class'].replace({True: 1, False: 0})

    # Split the dataframe into labels and features.
    y = df['Class'].values.flatten()
    X = df.drop('Class', axis=1)

    # ``not cat_vars`` also routes an empty list / None to plain SMOTE,
    # which the original ``cat_vars == False`` test did not.
    if not cat_vars:
        sampler = SMOTE(random_state=12)
    else:
        # Positions are resolved against X, the frame SMOTENC actually sees.
        ind = [np.where(X.columns == x)[0][0] for x in cat_vars]
        sampler = SMOTENC(categorical_features=ind, random_state=12)

    # Both branches use ``fit_resample``; the ``fit_sample`` alias the
    # SMOTENC branch relied on was removed from recent imbalanced-learn.
    X_new, _ = sampler.fit_resample(X, y)

    new_df = pd.DataFrame(X_new, columns=X.columns)

    # Convert numerical columns to float.
    if cat_vars:
        for c in new_df.columns:
            if c not in cat_vars:
                new_df[c] = new_df[c].astype('float')

    return new_df
imputed_df_1s.index = scaled_1s.index

X = imputed_df_1s.drop(['_1s_side'], axis=1)
y = imputed_df_1s['_1s_side']

# shuffle=False keeps the original row order: the first 70% of rows form the
# training set and the last 30% the test set.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0,
                                                    shuffle=False)

# In[17]:

from imblearn.over_sampling import SMOTENC
# Column position 6 is the only categorical feature.
sm = SMOTENC(random_state=42, categorical_features=[6])
# ``fit_sample`` was a legacy alias removed from recent imbalanced-learn
# releases.
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# In[18]:

# ``pd.Series`` accepts both the ndarray older imbalanced-learn returns and
# the named Series newer releases return, where ``DataFrame(...)[0]`` fails.
y_resampled_labels = pd.Series(y_resampled)

for label in (0, 1, 2):
    print("Before OverSampling, counts of label '{}': {}".format(
        label, sum(y_train == label)))
for label in (0, 1, 2):
    print("After OverSampling, counts of label '{}': {}".format(
        label, sum(y_resampled_labels == label)))
test_data = pd.read_csv("./datasets/testset.csv")

# Splitting into x-y
train_y = training_data["ClaimAmount"]
train_x = training_data.drop(["rowIndex", "ClaimAmount"], axis=1)

test_x = test_data.drop("rowIndex", axis=1)

# Binary target: 1 when there was a claim (non-zero amount), 0 otherwise.
train_y_categorical = train_y.astype('bool').astype('int')

# Adding synthetic samples
categorical_feats = [2, 3, 4, 6, 8, 10, 12, 13, 14, 15, 16, 17]
sm = SMOTENC(categorical_features=categorical_feats, random_state=27, sampling_strategy=1.0)
# ``fit_sample`` was a legacy alias removed from recent imbalanced-learn
# releases.
train_x_synthetic, train_y_categorical_synthetic = sm.fit_resample(train_x, train_y_categorical)

# Works whether the sampler returned an ndarray or a DataFrame; the original
# ``[0:,0:]`` slice is only valid for ndarrays.
train_x_synthetic = pd.DataFrame(train_x_synthetic)
train_x_synthetic.columns = test_x.columns

# Non-inplace drop: dropping in place on a filtered slice triggers pandas'
# SettingWithCopy warning.
train_x_claims_only = training_data[training_data.ClaimAmount != 0].drop(
    ["rowIndex", "ClaimAmount"], axis=1)

train_y_claims_only = train_y[train_y != 0]

# rf_train_errors = []
# rf_cv_errors = []


# for i in range(25, 30):
#     model = RandomForestClassifier(n_estimators=20, random_state=0, n_jobs=-1, max_depth=28, bootstrap=False)
def z_dataset_preprocessing_pipeline(X_train,
                                     X_test,
                                     y_train=None,
                                     scaler=StandardScaler(),
                                     drop=None,
                                     oversampling=True,
                                     return_pipeline_object=False):
    """ ######## Work in progress. Code works well enough.
    Takes X_train and X_test DataFrames. Then separates each DataFrame into categorical and numerical columns, and performs OneHotEncoding (with drop control) on categorical columns and scaling on numerical columns; the user can select the scaler.
    Returns transformed DataFrames.

    All transforming steps are done using scikit-learn preprocessing, pipeline, and compose objects; and DataFrame creation is done with pandas.

    :::: MAKE SURE EVERY FEATURE HAS CORRECT DATA TYPE; EITHER CATEGORICAL OR NUMERICAL :::

    Parameters:
    ===========

    X_train = pandas.DataFrame object; no default,
                training split of the DataFrame.
    X_test  = pandas.DataFrame object; no default,
                testing split of the DataFrame.
    y_train = array-like or `None`; default: None,
                training labels; required when `oversampling` is True.
    scaler  = `sklearn scaler object` or `None`; default: StandardScaler(),
                *** IMPORT desired scaler before using. ***
                *** OR call with this module. all of them are imported and ready
                to use inside this module.***
                Available options:
                - StandardScaler: removes the mean and scales the data to
                    unit variance.
                - MinMaxScaler: rescales the data set such that all feature
                    values are in the range [0, 1]
                - RobustScaler: is based on percentiles and is therefore not
                    influenced by a small number of very large marginal outliers.
                - QuantileTransformer: applies a non-linear transformation
                    such that the probability density function of each feature
                    will be mapped to a uniform or Gaussian distribution.
                - PowerTransformer: applies a power transformation to each
                    feature to make the data more Gaussian-like in order to
                    stabilize variance and minimize skewness.
                - MaxAbsScaler: is similar to `MinMaxScaler` except that the
                    absolute values are mapped in the range [0, 1]
                - Normalizer: rescales the vector for each sample to have
                    unit norm, independently of the distribution of the samples.
                - None: does not scale data. #::: NOT TESTED :::#
    drop    = str or `None`; default: None.
                Option to control OneHotEncoder dropping.
                - None : retain all features (the default).
                - 'first' : drop the first category in each feature. If only one
                  category is present, the feature will be dropped entirely.
                - 'if_binary' : drop the first category in each feature with two
                  categories. Features with 1 or more than 2 categories are
                  left intact.
                - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
                  should be dropped.
    oversampling = boolean; default: True,
                    turn oversampling on or off;
                - `True` oversamples.
                - `False` no oversampling.
    return_pipeline_object= boolean; default: False, {not sure how it might be useful though #::: NOT TESTED :::#}
                    control object return.
                - `True` returns object.
                - `False` does not return object.

    Returns:
    ========
    `(train, X_test)` or, with `return_pipeline_object`, `(preprocessor, train, X_test)`.
    `train` is the transformed X_train DataFrame when `oversampling` is False;
    when it is True, `train` is the `(X_resampled, y_resampled)` tuple
    produced by SMOTENC.

    Raises:
    =======
    ValueError if `oversampling` is True and `y_train` is None.

    NOTE:
        - possible error if test data has unseen category; creating new
          DataFrame will fail.
        - Source can be modified to add more preprocessing steps.

    Stage: Coding

    Next steps:
    - use OOP to make this a class.
    - Add oversampling method changing option.
    - add imputer in the pipeline.
    - add and remove steps in pipeline option.

    ---version 0.0.1 beta---
    """
    # Fail early with a clear message instead of deep inside the sampler.
    if oversampling and y_train is None:
        raise ValueError("y_train is required when oversampling=True")

    # isolating numerical features
    nume_cols = X_train.select_dtypes('number').columns.to_list()
    # isolating categorical features
    cate_cols = X_train.select_dtypes('category').columns.to_list()

    # pipeline for processing categorical features
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` (old name
    # removed in 1.4); fall back to the old keyword on older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False, drop=drop)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, drop=drop)
    pipe_cate = Pipeline([('ohe', encoder)])
    # pipeline for processing numerical features
    pipe_nume = Pipeline([('scaler', scaler)])

    # Column transformer
    preprocessor = ColumnTransformer([
        ('numerical_features', pipe_nume, nume_cols),
        ('categorical_features', pipe_cate, cate_cols)
    ])

    transformed_train = preprocessor.fit_transform(X_train)

    # Names of the one-hot columns, computed once after fitting.
    # ``get_feature_names`` was replaced by ``get_feature_names_out`` in
    # newer scikit-learn; use whichever the installed version offers.
    if cate_cols:
        fitted_ohe = preprocessor.named_transformers_[
            'categorical_features'].named_steps['ohe']
        names_fn = getattr(fitted_ohe, 'get_feature_names_out', None)
        if names_fn is None:
            names_fn = fitted_ohe.get_feature_names
        ohe_cols = list(names_fn(cate_cols))
    else:
        # With no categorical column the encoder is never fitted.
        ohe_cols = []
    out_cols = nume_cols + ohe_cols

    # creating a pandas.DataFrame with appropriate header
    # creating modified X_train
    ret_X_train = pd.DataFrame(transformed_train, columns=out_cols)

    # creating modified X_test
    ## NOTE: possible error if test data has unseen category, in this step.
    ## for debugging such error modify this, and its processing steps `in pipe_cate`.
    ret_X_test = pd.DataFrame(preprocessor.transform(X_test),
                              columns=out_cols)

    X_train_oversampled = None
    if oversampling:
        # Mask is True for CATEGORICAL columns: the scaled numeric columns
        # come first, followed by the one-hot columns. The original mask was
        # inverted, telling SMOTENC the numeric columns were categorical.
        smotenc_features = [False] * len(nume_cols) + [True] * len(ohe_cols)
        oversampling_ = SMOTENC(categorical_features=smotenc_features,
                                n_jobs=-1)
        # ``fit_sample`` was a legacy alias removed from recent
        # imbalanced-learn releases. The (X, y) tuple is returned as one
        # value, matching the original return shape.
        X_train_oversampled = oversampling_.fit_resample(ret_X_train, y_train)

    train_out = X_train_oversampled if oversampling else ret_X_train
    if return_pipeline_object:
        return preprocessor, train_out, ret_X_test
    return train_out, ret_X_test
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

x.dtypes

"""## TP"""

from imblearn.over_sampling import SMOTENC
# NOTE(review): every column of ``x`` is declared categorical here, and
# SMOTE-NC rejects input that has no numerical feature -- confirm which
# columns are really categorical (or use a purely categorical sampler).
sm = SMOTENC(categorical_features=x.columns,random_state=0)
# ``fit_sample`` was a legacy alias removed from recent imbalanced-learn
# releases.
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

cols_to_check = ['agent', 'cntr','referer']
# True only for rows where all three columns are missing.
data['is_na'] = data[cols_to_check].isnull().all(axis=1)
data.head()
print(data['is_na'].value_counts())

# ``DataFrame.info`` prints its report itself and returns None, so wrapping
# it in print() only added a stray "None" line.
data.info()

cols = data.copy()

"""Handle Missing Values for agent, country, referer 

Not using address & bank account thus ignored
"""