def Data_prep(df, flag):
    """One-hot encode the predictor columns of *df* and impute missing values.

    The first column is always excluded and, unless ``flag == 1``, the last
    column is excluded as well. Per the original notes these are the loan ID
    and the loan status, which must not go through encoding or imputation.

    Args:
        df: Input DataFrame whose first column (and, when ``flag != 1``,
            last column) is not a predictor.
        flag: ``1`` keeps the last column as a predictor; any other value
            drops it and prints a short trace message.

    Returns:
        A new DataFrame with the one-hot encoded, imputed predictors. It
        carries the dummy-encoded column names and the same row index as
        *df*, so columns taken from *df* can be assigned back by label.
    """
    # Slice by position instead of np.delete: stays a pandas Index and does
    # not raise on very narrow frames.
    predictors = df.columns[1:]
    if flag != 1:
        predictors = predictors[:-1]
        print("train module flag 0")

    # One-Hot Encoding
    # dummy_na=True adds an explicit indicator column for missing values.
    df_dummy = pd.get_dummies(df[predictors], dummy_na=True)

    # Data Imputations
    # Kept as a function-scope import, as in the original.
    from fancyimpute import IterativeImputer

    imputed_values = IterativeImputer().fit_transform(df_dummy)

    # Re-wrap the imputer output with the encoded column names AND the
    # original row index; without the index the result is renumbered 0..n-1
    # and label-based assignment from *df* would silently misalign.
    return pd.DataFrame(
        imputed_values,
        columns=df_dummy.columns,
        index=df_dummy.index,
    )
# One-Hot Encoding
# ########################################################
# NOTE: `train` and `predictors` are expected to be defined earlier in the
# script. dummy_na=True adds an explicit indicator column for missing values.
train_dummy = pd.get_dummies(train[predictors], dummy_na=True)
newcols = train_dummy.columns

# Data Imputations
# ########################################################
from fancyimpute import IterativeImputer

train_imputed = IterativeImputer().fit_transform(train_dummy)
# Keep the original row index: the Loan_Status assignment below aligns by
# label, so a renumbered 0..n-1 index would misalign (or produce NaN) whenever
# `train` does not have a default RangeIndex.
train_imputed = pd.DataFrame(
    train_imputed,
    columns=newcols,
    index=train_dummy.index,
)

# Adding Loan Status back again to Data (encoded as Y -> 1, N -> 0)
# ########################################################
train_imputed['Loan_Status'] = train['Loan_Status'].map({'Y': 1, 'N': 0})
#train_imputed.to_csv("train_imputed.csv",sep=',')

# Split DataFrame into Train and Test sets
# ########################################################
# Random ~80/20 row mask; no seed is set in this section, so the split
# differs from run to run.
msk = np.random.rand(len(train_imputed)) < 0.8
data_train = train_imputed[msk]
data_test = train_imputed[~msk]