def SMOTE_cat(DFmain):
    """Oversample the training split with SMOTE-NC, report class counts, then prompt for the next menu.

    The data set is converted to features/labels by ``reshape_data`` and split
    by ``splitData`` with a third held out. Only the training part is
    resampled (``sampling_strategy='minority'``). The resampled arrays are
    only reported on stdout; they are not returned or stored.

    Parameters
    ----------
    DFmain
        The working data set; handed to ``reshape_data`` and, for choice
        ``b``, to ``showPreProcessMenu``.

    Returns
    -------
    ``STATE_MAIN`` for choice ``a``; ``STATE_PREPROCESS`` for choice ``b``
    (after the pre-processing menu has been shown); ``None`` for ``q`` or any
    other input.
    """
    data = DFmain
    X, y = reshape_data(DFmain)
    X_train, _, y_train, _ = splitData(X, y, test_size=.33)

    # Positional indices of the categorical feature columns in X.
    sm = SMOTENC(categorical_features=[1, 2, 3, 4, 5, 6, 7, 8, 9, 14],
                 random_state=1,
                 sampling_strategy='minority')
    # fit_resample is the supported name; fit_sample was only a deprecated alias.
    X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train.ravel())

    print("Before SMOTE, counts of label 'yes': {}".format(sum(y_train == 'yes')))
    print("After SMOTE, the shape of X_train: ", X_train_smote.shape)
    print("After SMOTE, the shape of y_train: ", y_train_smote.shape)
    print("After SMOTE, counts of Class attr 'Yes': ", sum(y_train_smote == 'yes'))
    print("After SMOTE, counts of Class attr 'No': ", sum(y_train_smote == 'no'))

    print('\n\na) Go back to main menu')
    print('b) Go back to pre-processing menu')
    print('q) Quit')
    choice = input('What would you like to do next: ').lower()

    # Default for 'q' and unrecognised input; without it `return state` raised
    # UnboundLocalError whenever the user picked anything other than a/b.
    # NOTE(review): if the menu loop has a dedicated quit state, return it here.
    state = None
    if choice == 'a':
        state = STATE_MAIN
    elif choice == 'b':
        state = STATE_PREPROCESS
        showPreProcessMenu(state, data)
    return state
def _smote_data(self):
    """Oversample the training split in place with SMOTE or SMOTE-NC.

    SMOTE-NC is used when ``self.cols_nominal`` is non-empty, plain SMOTE
    otherwise. Both run with ``sampling_strategy='not majority'`` and
    ``self.random_state``. ``self.X_train`` and ``self.y_train`` are
    replaced by the resampled data.
    """
    if self.cols_nominal.size > 0:
        # Boolean mask over the training columns: True where the column is nominal.
        nominal_mask = self.X_train.columns.isin(self.cols_nominal)
        sampler = SMOTENC(categorical_features=nominal_mask,
                          sampling_strategy='not majority',
                          random_state=self.random_state)
    else:
        sampler = SMOTE(sampling_strategy='not majority',
                        random_state=self.random_state)
    # fit_resample is the supported name; fit_sample was only a deprecated alias.
    self.X_train, self.y_train = sampler.fit_resample(self.X_train, self.y_train)
def DataAugmentation(data, labels, balance=False):
    """Put labelled samples first and optionally balance them with SMOTE-NC.

    Parameters
    ----------
    data
        2-D array, indexed as ``data[:, column]``.
    labels
        Array of labels; it is cast to float32 and NaN marks an unlabelled
        sample.
    balance : bool
        When true and more than two labelled samples exist, the labelled
        samples are oversampled with SMOTENC. Columns flagged by
        ``is_categorical`` are passed through ``cat2int`` before and
        ``int2cat`` after resampling (presumably an integer encode/decode
        round trip -- confirm against those helpers).

    Returns
    -------
    (data, labels)
        The labelled rows (resampled when balancing ran) followed by the
        unlabelled rows, with labels as float32.
    """
    categorical_features = [
        is_categorical(data[:, inx]) for inx in range(data.shape[1])
    ]
    categorical_features_index = np.where(categorical_features)[0]

    labels = labels.astype('float32')
    na_inx = np.isnan(labels)
    labelled = np.logical_not(na_inx)
    data_na, labels_na = data[na_inx], labels[na_inx]
    data1, labels1 = data[labelled], labels[labelled]

    # The guard used to read `len(labels1 > 2)`, which is the length of a
    # boolean array and therefore true for any non-empty input.
    if balance and len(labels1) > 2:
        # Imputation: the sampler cannot handle missing values.
        data1 = np.nan_to_num(data1, copy=False)
        data1 = pd.DataFrame(data1).fillna(0)

        mappeds = []
        for ii in categorical_features_index:
            data1[ii], mapped = cat2int(data1[ii])
            mappeds.append(mapped)

        sm = SMOTENC(random_state=42,
                     categorical_features=categorical_features)
        # fit_resample is the supported name; fit_sample was only a deprecated alias.
        data1, labels1 = sm.fit_resample(data1, labels1)

        data1 = pd.DataFrame(data1)
        for mapped, ii in zip(mappeds, categorical_features_index):
            data1[ii] = int2cat(data1[ii], mapped)
        data1 = data1.values

    data = np.concatenate([data1, data_na], 0)
    labels = np.concatenate([labels1, labels_na], 0)
    return data, labels
def main():
    """Train, evaluate and persist a logistic-regression churn model on SMOTE-NC-balanced data.

    Steps: read ``processed.csv``; split features/target 75/25; oversample
    the minority class of the training part with SMOTENC (columns with
    exactly two distinct values are treated as categorical); grid-search a
    ``LogisticRegression`` with 5-fold CV; write the report from
    ``create_report`` and the fitted search object to ``../../models/``; then
    call ``plot_report``.
    """
    logger = logging.getLogger(__name__)
    processed_df = pd.read_csv('../../data/processed/processed.csv')

    id_col = ['customerID']
    target_col = ["Churn"]
    cols = [i for i in processed_df.columns if i not in id_col + target_col]

    # Positions must be relative to the feature matrix given to SMOTENC
    # (processed_df[cols]). Taking them from processed_df.columns shifts every
    # index that follows the dropped id/target columns.
    unique_counts = processed_df.nunique()
    cate_cols_idx = [
        position for position, col in enumerate(cols)
        if unique_counts[col] == 2
    ]

    smote_X = processed_df[cols]
    smote_Y = processed_df[target_col]
    smote_train_X, smote_test_X, smote_train_Y, smote_test_Y = train_test_split(
        smote_X, smote_Y, test_size=.25, random_state=111)

    logger.info('Applying SMOTE')
    # Named `oversampler` rather than `os` to avoid shadowing the stdlib module name.
    oversampler = SMOTENC(categorical_features=cate_cols_idx,
                          sampling_strategy='minority',
                          random_state=0)
    # fit_resample is the supported name; fit_sample was only a deprecated alias.
    os_smote_X, os_smote_Y = oversampler.fit_resample(smote_train_X, smote_train_Y)
    os_smote_X = pd.DataFrame(data=os_smote_X, columns=cols)
    os_smote_Y = pd.DataFrame(data=os_smote_Y, columns=target_col)

    logger.info('Fitting Logistic Regression and Tuning')
    lr = LogisticRegression(max_iter=500)
    clf = GridSearchCV(estimator=lr, param_grid=LogisticRegression_grid, cv=5)
    best_model = clf.fit(os_smote_X.values, os_smote_Y.values.ravel())
    logger.info(f'Best Parameters: {best_model.best_params_}')

    metrics = create_report(best_model, smote_test_X, smote_test_Y)
    logger.info(f'{metrics}')

    # Context manager closes the report file even if the write fails.
    with open('../../models/logistigregression_best_metrics.txt', 'w') as f:
        f.write(metrics)
    joblib.dump(best_model, '../../models/logsticreg_best.pkl', compress=9)
    logger.info('Model and Evaluation saved to "models/"')

    logger.info('Visualising metrics')
    plot_report(processed_df=processed_df,
                algorithm=best_model.best_estimator_,
                test_X=smote_test_X,
                test_Y=smote_test_Y,
                cf='coefficients',
                name='Logistic Regression')
    logger.info('DOWNLOAD PLOT FROM PLOTLY')
def _smote_data(self):
    """Performs a SMOTE upsampling of the training data, in place.

    If nominal columns are present (``self.cols_nominal`` is non-empty) the
    SMOTE-NC variant is used so those columns are treated as categorical;
    otherwise plain SMOTE is used. Both run with
    ``sampling_strategy='not majority'`` and ``self.random_state``.
    ``self.X_train`` and ``self.y_train`` are replaced by the resampled data.
    """
    if self.cols_nominal.size > 0:
        # Boolean mask over the training columns: True where the column is nominal.
        nominal_mask = self.X_train.columns.isin(self.cols_nominal)
        sampler = SMOTENC(categorical_features=nominal_mask,
                          sampling_strategy='not majority',
                          random_state=self.random_state)
    else:
        sampler = SMOTE(sampling_strategy='not majority',
                        random_state=self.random_state)
    # fit_resample is the supported name; fit_sample was only a deprecated alias.
    self.X_train, self.y_train = sampler.fit_resample(self.X_train, self.y_train)
def smote(y_name, X_train_keras, y_train_keras):
    """Run SMOTE-NC on the three character columns and print a preview of the result.

    Parameters
    ----------
    y_name
        Key of the target inside ``y_train_keras``.
    X_train_keras
        Frame holding at least the ``prev_char``, ``curr_char`` and
        ``next_char`` columns. A constant ``spurrious`` column is added to it
        in place.
    y_train_keras
        Container of targets, indexed with ``y_name``.

    Returns
    -------
    tuple
        ``(X_train_keras, y_train_keras)`` -- the objects that were passed
        in, not the resampled data.
    """
    sm = SMOTENC(categorical_features=[0, 1, 2], random_state=0)

    # Constant numeric filler column. Presumably added because SMOTENC rejects
    # input in which every feature is categorical -- confirm.
    X_train_keras['spurrious'] = 0.0

    feature_cols = ['prev_char', 'curr_char', 'next_char', 'spurrious']
    # fit_resample is the supported name; fit_sample was only a deprecated alias.
    X_train_2, y_train_2 = sm.fit_resample(X_train_keras[feature_cols],
                                           y_train_keras[y_name])
    del X_train_2["spurrious"]

    print(X_train_2.head())
    print(y_train_2.head())

    # NOTE(review): the resampled X_train_2 / y_train_2 are only printed; the
    # caller gets the original inputs back (X now carrying the 'spurrious'
    # column). Left as is because the shape callers expect is not visible
    # here -- confirm whether the resampled data should be returned instead.
    return (X_train_keras, y_train_keras)
def custom_smote(df, cat_cols, random_state=1234):
    """
    Creates a synthetic DataFrame that resembles the input DataFrame.

    The shuffled input is stacked three times and flagged 0 (artificial
    "majority"); one more copy is flagged 1 ("minority"). The flag column is
    used both as a feature and as the target, so SMOTENC generates new
    flag-1 rows until the two groups are balanced. The rows of the resampled
    data whose flag is not 0 are returned without the flag column; these
    should be the shuffled input rows plus the synthetic ones.

    Parameters
    ----------
    df : pd.DataFrame
    cat_cols : list
        Categorical columns, passed unchanged as SMOTENC's
        ``categorical_features`` (check which forms -- indices, mask, names --
        the installed imbalanced-learn version accepts).
    random_state : int
        Seeds both the shuffle and SMOTENC so repeated calls give the same
        result.

    Returns
    -------
    output : pd.DataFrame
        Same columns as ``df``, cast back to ``df``'s dtypes.
    """
    df_dtypes = df.dtypes.astype('str').to_dict()
    logging.debug("shuffling the data just in case if the datafile "
                  "itself has majority and minority grouped together")
    # The shuffle used to ignore random_state, so output varied between runs.
    df = df.sample(frac=1, random_state=random_state)
    df_dtypes.update({'__flag_value': 'int8'})

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    minority = df.copy()
    majority = pd.concat([df, df, df], ignore_index=True)
    minority['__flag_value'] = 1
    majority['__flag_value'] = 0
    df_x = pd.concat([majority, minority], ignore_index=True)
    y = df_x['__flag_value']

    logging.info("Performing Smote operation on the DataFrame")
    sm = SMOTENC(categorical_features=cat_cols,
                 random_state=random_state,
                 k_neighbors=6)
    # fit_resample is the supported name; fit_sample was only a deprecated alias.
    resampled_x, _ = sm.fit_resample(df_x, y)

    logging.info("Creating DataFrame from synthetic results, "
                 "and casting Input DataFrame column names and dtypes")
    output = pd.DataFrame(resampled_x,
                          columns=list(df_dtypes.keys())).astype(df_dtypes)
    output = output[output['__flag_value'] != 0]
    # Non-inplace drop: the filtered frame above is a slice, and dropping in
    # place on it triggers pandas' SettingWithCopyWarning.
    return output.drop('__flag_value', axis='columns')
def apply_smote(df, index, cat_vars=False):
    """Oversample ``df`` with SMOTE (or SMOTE-NC) and return the resampled features.

    Parameters
    ----------
    df : pd.DataFrame
        Untransformed feature frame. It is not modified; an existing
        ``Class`` column is ignored and replaced by ``index``.
    index
        Per-row class indicator: True for the minority group, False for the
        majority group (mapped to 1 / 0).
    cat_vars : list of column names, or a falsy value
        Categorical columns. When given, SMOTENC is used and every other
        column of the result is cast to float; when falsy (the default, or
        an empty list), plain SMOTE is used.

    Returns
    -------
    pd.DataFrame
        The resampled feature columns, without the class column.
    """
    from imblearn.over_sampling import SMOTENC, SMOTE
    import pandas as pd
    import numpy as np

    # Work on a copy so the caller's frame does not gain a 'Class' column.
    df = df.copy()
    # assign class 1 to the minority class and 0 to the majority class
    df['Class'] = index
    df['Class'] = df['Class'].replace({True: 1, False: 0})

    # split dataframe into class labels and feature columns
    y = df['Class'].values.flatten()
    X = df.drop('Class', axis=1)

    if not cat_vars:
        sampler = SMOTE(random_state=12)
    else:
        # Positions are looked up in X, the frame the sampler actually sees.
        ind = [np.where(X.columns == name)[0][0] for name in cat_vars]
        sampler = SMOTENC(categorical_features=ind, random_state=12)
    # One spelling for both samplers; fit_sample was only a deprecated alias.
    X_new, _ = sampler.fit_resample(X, y)
    new_df = pd.DataFrame(X_new, columns=X.columns)

    # convert numerical columns to float
    if cat_vars:
        for c in list(new_df.columns):
            if c not in cat_vars:
                new_df[c] = new_df[c].astype('float')
    return new_df
imputed_df_1s.index = scaled_1s.index

# Features / target for the '_1s_side' label, split chronologically
# (shuffle=False keeps the last 30% of rows as the test set).
X = imputed_df_1s.drop(['_1s_side'], axis=1)
y = imputed_df_1s['_1s_side']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, shuffle=False)


# In[17]:


from imblearn.over_sampling import SMOTENC

# Column position 6 of X is the only feature declared categorical.
sm = SMOTENC(random_state=42, categorical_features=[6])
# fit_resample is the supported name; fit_sample was only a deprecated alias.
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)


# In[18]:


# pd.Series() accepts both an ndarray and a (named) Series, so the counts work
# whichever container the sampler returns; indexing a DataFrame built from a
# named Series with [0] raised KeyError.
for label in (0, 1, 2):
    print("Before OverSampling, counts of label '{}': {}".format(
        label, (pd.Series(y_train) == label).sum()))

for label in (0, 1, 2):
    print("After OverSampling, counts of label '{}': {}".format(
        label, (pd.Series(y_resampled) == label).sum()))
test_data = pd.read_csv("./datasets/testset.csv")

# Splitting into x-y
train_y = training_data["ClaimAmount"]
train_x = training_data.drop(["rowIndex", "ClaimAmount"], axis=1)
test_x = test_data.drop("rowIndex", axis=1)

# Binary target: 1 where ClaimAmount is non-zero, 0 otherwise.
train_y_categorical = train_y.astype('bool').astype('int')

# Adding synthetic samples
categorical_feats = [2, 3, 4, 6, 8, 10, 12, 13, 14, 15, 16, 17]
sm = SMOTENC(categorical_features=categorical_feats, random_state=27, sampling_strategy=1.0)
# fit_resample is the supported name; fit_sample was only a deprecated alias.
train_x_synthetic, train_y_categorical_synthetic = sm.fit_resample(train_x, train_y_categorical)
# Relabel by position with the test-set column names. Works for an ndarray and
# for a DataFrame result; the old `[0:, 0:]` slice only worked on an ndarray.
train_x_synthetic = pd.DataFrame(train_x_synthetic)
train_x_synthetic.columns = test_x.columns

# Rows with an actual claim. drop() returns a new frame, which avoids the
# in-place drop on a filtered slice (SettingWithCopyWarning).
train_x_claims_only = training_data[training_data.ClaimAmount != 0].drop(
    ["rowIndex", "ClaimAmount"], axis=1)
train_y_claims_only = train_y[train_y != 0]

# rf_train_errors = []
# rf_cv_errors = []
# for i in range(25, 30):
#     model = RandomForestClassifier(n_estimators=20, random_state=0, n_jobs=-1, max_depth=28, bootstrap=False)
def z_dataset_preprocessing_pipeline(X_train,
                                     X_test,
                                     y_train=None,
                                     scaler=StandardScaler(),
                                     drop=None,
                                     oversampling=True,
                                     return_pipeline_object=False):
    """
    ######## Work in progress. Code works well enough.

    Takes the X_train and X_test DataFrames, separates their columns into
    categorical and numerical ones, one-hot encodes the categorical columns
    (with control over category dropping) and scales the numerical columns
    with a user-selectable scaler. Optionally oversamples the transformed
    training data with SMOTENC. Returns the transformed DataFrames.

    All transforming steps are done using scikit-learn preprocessing,
    pipeline, and compose objects; DataFrame creation is done with pandas.

    :::: MAKE SURE EVERY FEATURE HAS THE CORRECT DATA TYPE; EITHER CATEGORICAL OR NUMERICAL :::
    Columns are selected with select_dtypes('number') and
    select_dtypes('category'); columns of any other dtype are passed to
    neither pipeline.

    Parameters:
    ===========
    X_train = pandas.DataFrame object; no default,
                training split of the DataFrame.
    X_test  = pandas.DataFrame object; no default,
                testing split of the DataFrame.
    y_train = training labels; default: None.
                Required when `oversampling` is True (the default).
    scaler  = `sklearn scaler object` or `None`; default: StandardScaler(),
                *** IMPORT the desired scaler before using. ***
                *** OR call it through this module; all of them are imported
                and ready to use inside this module. ***
                Available options:
                - StandardScaler: removes the mean and scales the data to
                    unit variance.
                - MinMaxScaler: rescales the data set such that all feature
                    values are in the range [0, 1].
                - RobustScaler: is based on percentiles and is therefore not
                    influenced by a small number of very large marginal
                    outliers.
                - QuantileTransformer: applies a non-linear transformation
                    such that the probability density function of each
                    feature is mapped to a uniform or Gaussian distribution.
                - PowerTransformer: applies a power transformation to each
                    feature to make the data more Gaussian-like in order to
                    stabilize variance and minimize skewness.
                - MaxAbsScaler: is similar to `MinMaxScaler` except that each
                    feature is divided by its maximum absolute value, so
                    values are mapped into the range [-1, 1].
                - Normalizer: rescales the vector for each sample to have
                    unit norm, independently of the distribution of the
                    samples.
                - None: does not scale data. #::: NOT TESTED :::#
    drop    = str or `None`; default: None.
                Option to control OneHotEncoder dropping.
                - None : retain all features (the default).
                - 'first' : drop the first category in each feature. If only
                    one category is present, the feature will be dropped
                    entirely.
                - 'if_binary' : drop the first category in each feature with
                    two categories. Features with 1 or more than 2
                    categories are left intact.
                - array : ``drop[i]`` is the category in feature ``X[:, i]``
                    that should be dropped.
    oversampling = boolean; default: True,
                turn oversampling on or off;
                - `True` oversamples.
                - `False` no oversampling.
    return_pipeline_object = boolean; default: False,
                {not sure how it might be useful though #::: NOT TESTED :::#}
                control object return.
                - `True` returns the fitted ColumnTransformer as first element.
                - `False` does not return the object.

    Returns:
    ========
    (X_train_out, X_test_out), or (preprocessor, X_train_out, X_test_out)
    when `return_pipeline_object` is True.
        - X_test_out is the transformed test DataFrame.
        - X_train_out is the transformed training DataFrame when
          `oversampling` is False. When `oversampling` is True it is the
          pair returned by SMOTENC.fit_resample, i.e.
          (resampled X, resampled y).

    Raises:
    =======
    ValueError if `oversampling` is True and `y_train` is None.

    NOTE:
        - possible error if test data has an unseen category; creating the
          new DataFrame will fail.
        - Source can be modified to add more preprocessing steps.

    Stage: Coding

    Next steps:
        - use OOP to make this a class.
        - Add oversampling method changing option.
        - add imputer in the pipeline.
        - add and remove steps in pipeline option.

    ---version 0.0.1 beta---
    """
    # Fail early with a clear message instead of deep inside the sampler.
    if oversampling and y_train is None:
        raise ValueError(
            "y_train is required when oversampling=True; pass the training "
            "labels or call with oversampling=False.")

    # isolating numerical features
    nume_cols = X_train.select_dtypes('number').columns.to_list()
    # isolating categorical features
    cate_cols = X_train.select_dtypes('category').columns.to_list()

    # pipeline for processing categorical features
    # NOTE(review): `sparse=` and `get_feature_names` are the spellings used by
    # older scikit-learn releases -- confirm against the pinned version.
    pipe_cate = Pipeline([('ohe', OneHotEncoder(sparse=False, drop=drop))])
    # pipeline for processing numerical features
    pipe_nume = Pipeline([('scaler', scaler)])

    # Column transformer
    preprocessor = ColumnTransformer([
        ('numerical_features', pipe_nume, nume_cols),
        ('categorical_features', pipe_cate, cate_cols)
    ])

    # Fit once, then read the one-hot column names a single time; they are
    # shared by the train frame, the test frame and the SMOTENC mask.
    train_values = preprocessor.fit_transform(X_train)
    ohe_cols = preprocessor.named_transformers_['categorical_features'].\
        named_steps['ohe'].get_feature_names(cate_cols).tolist()
    out_cols = nume_cols + ohe_cols

    # creating modified X_train with appropriate header
    ret_X_train = pd.DataFrame(train_values, columns=out_cols)

    # creating modified X_test
    ## NOTE: possible error if test data has unseen category, in this step.
    ## for debugging such error modify this, and its processing steps in `pipe_cate`.
    ret_X_test = pd.DataFrame(preprocessor.transform(X_test), columns=out_cols)

    if oversampling:
        # Mask is True for CATEGORICAL columns: the one-hot columns come after
        # the numerical ones. The previous mask was inverted.
        smotenc_features = [False] * len(nume_cols) + [True] * len(ohe_cols)
        oversampling_ = SMOTENC(categorical_features=smotenc_features,
                                n_jobs=-1)
        # fit_resample is the supported name; fit_sample was only a deprecated
        # alias. The result is the (X, y) pair and is returned as such.
        X_train_oversampled = oversampling_.fit_resample(ret_X_train, y_train)
        train_out = X_train_oversampled
    else:
        train_out = ret_X_train

    if return_pipeline_object:
        return preprocessor, train_out, ret_X_test
    return train_out, ret_X_test
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

x.dtypes

"""## TP"""

from imblearn.over_sampling import SMOTENC

# NOTE(review): every column of x is declared categorical here. SMOTENC
# expects a mix of categorical and numerical features, so this probably needs
# to be narrowed to the truly categorical columns -- confirm against x.dtypes.
sm = SMOTENC(categorical_features=x.columns,random_state=0)
# fit_resample is the supported name; fit_sample was only a deprecated alias.
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

cols_to_check = ['agent', 'cntr','referer']
# True only for rows in which all of the checked columns are missing.
data['is_na'] = data[cols_to_check].isnull().all(axis=1)
data.head()

print(data['is_na'].value_counts())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line.
data.info()

cols = data.copy()

"""Handle Missing Values for agent, country, referer Not using address & bank account thus ignored """