def seperate_cat_num_var(self, df):
    """Split the features of ``df`` into numerical and categorical lists.

    Calls ``f.distinct_feats(df)`` to obtain the two lists, prints their
    lengths, and then drops from the numerical list every entry of
    ``self.lst`` that is found in it.

    Args:
        df: Data passed unchanged to ``f.distinct_feats``.

    Returns:
        A ``(num_feats, cat_feats)`` tuple. For each entry of ``self.lst``
        present in ``num_feats``, one occurrence has been removed
        (``list.remove`` drops the first match only).
    """
    num_feats, cat_feats = f.distinct_feats(df)
    # The sizes are reported before the exclusions below are applied.
    print(len(num_feats), len(cat_feats))

    # Iterate over the names directly rather than indexing via range(len(...)).
    for name in self.lst:
        if name in num_feats:
            num_feats.remove(name)
    return num_feats, cat_feats
# Number of missing values per column, largest count first.
val = (
    x_df.isna()
    .sum()
    .sort_values(ascending=False)
)
# NOTE(review): (70, 10) is presumably the figure size and 30 a label/tick
# setting -- confirm against FunctionLib.plot_bar.
f.plot_bar(val.index, (70, 10), val, 30)


# In[6]:


f.get_missing_value_feats(x_df)


# In[5]:


# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
# TARGET is taken out of the numerical list before the type conversion below.
num_feats.remove('TARGET')


# In[6]:


# The return value is unused, so this presumably converts x_df in place
# (third argument 10 is passed positionally) -- TODO confirm.
f.change_type(x_df, num_feats, 10)


# In[7]:


# Separate the categorical and numerical features again after the conversion
num_feats, cat_feats = f.distinct_feats(x_df)
# Load the cleaned training data and the two bureau datasets.
train_clean_df = pd.read_csv(train_clean)
bureau_df = pd.read_csv(bureau_dataset)
bureau_balance_df = pd.read_csv(bureau_balance_dataset)

# Keep only the bureau rows whose SK_ID_CURR appears in the training data.
# A boolean mask replaces the original `.loc[<filtered>.index]` round trip;
# both select the same rows for the unique default index read_csv creates.
# reset_index returns a new frame, so its result has to be assigned
# (the original call discarded it and left the index untouched).
b_df = bureau_df[
    bureau_df['SK_ID_CURR'].isin(train_clean_df['SK_ID_CURR'])
].reset_index(drop=True)

# Keep only the balance rows that belong to the retained bureau records.
bal_df = bureau_balance_df[
    bureau_balance_df['SK_ID_BUREAU'].isin(b_df['SK_ID_BUREAU'])
].reset_index(drop=True)

# The unfiltered frames are no longer needed.
del bureau_df, bureau_balance_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
print(len(num_feats_b), len(cat_feats_b))

#num_feats_bal,cat_feats_bal = f.distinct_feats(bal_df)
#print(len(num_feats_bal),len(cat_feats_bal))

# Change the datatype of categorical and numerical values.
# The return value is unused, so this presumably works in place -- TODO confirm.
f.change_type(b_df, num_feats_b, count_threshold=5)

# Separate the categorical and numerical features again after the conversion
# Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
# The two ID columns are taken out of the numerical feature list.
for i in ['SK_ID_BUREAU', 'SK_ID_CURR']:
    num_feats_b.remove(i)
print(len(num_feats_b), len(cat_feats_b))
# Load the working dataset
train_df = pd.read_csv(train_dataset)

# Draw a reproducible 10% sample of the training data (random_state=1)
# and give it a fresh index.
x_df = (
    train_df
    .sample(frac=0.1, random_state=1)
    .reset_index(drop=True)
)

# Drop the full dataset and continue with the sample to free memory.
del train_df

################################ CHANGING THE DATA TYPES ################################

# Observe the features with missing values
f.get_missing_value_feats(x_df)

# Separate the categorical and numerical features; the sizes are printed
# before TARGET and SK_ID_CURR are taken out of the numerical list.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
for _excluded in ('TARGET', 'SK_ID_CURR'):
    num_feats.remove(_excluded)

# Change the datatype of categorical and numerical values.
# The return value is unused, so this presumably works in place -- TODO confirm.
f.change_type(x_df, num_feats, count_threshold=5)

# Split again after the conversion and exclude the same two columns.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
for _excluded in ('TARGET', 'SK_ID_CURR'):
    num_feats.remove(_excluded)

# Parameter tables for the numerical and categorical features at the start.
par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)

############################# IDENTIFYING MISSING FEATS #########################
# Import user libraries and data pre-processing scripts
import os

import Model.FunctionLib as f
#import Model.Preprocessing_app_train
#import Model.Preprocessing_app_test
#import Model.Preprocessing_bureau

################################ IMPORT LATEST DATASET ################################

# Build the path with os.path.join instead of hard-coded "\\" separators so
# the script also finds the file on non-Windows systems.
train_df = pd.read_csv(
    os.path.join(wd, "Output", "application_train_bureau_clean.csv")
)
# Drop every column whose name contains 'Unnamed'.
train_df.drop(train_df.filter(like='Unnamed').columns, axis=1, inplace=True)

# Change the datatype of categorical and numerical values (NOT REQUIRED)
#f.change_type(train_df,num_feats,count_threshold=5)

# Separate the categorical and numerical features; the sizes are printed
# before TARGET and SK_ID_CURR are taken out of the numerical list.
num_feats, cat_feats = f.distinct_feats(train_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Get the list of attributes and their properties to start
par_num_df_start, par_cat_df_start = f.get_params(train_df, num_feats, cat_feats)

############# FEATURE CORRELATIONS ##########
# Code block to find correlated features, including correlations for each
# individual category.
# This can be used to derive/impute NA values with sklearn's IterativeImputer
# when the correlations with other features are strong.
# Not using this approach for now as there are no strong correlations with
# the missing-value columns.
x_df_dum = pd.get_dummies(train_df)
# Rows of the dummy-encoded data where TARGET equals 1.
x_df_Default_dum = x_df_dum[x_df_dum['TARGET'] == 1]
import Model.FunctionLib as f

# Import working dataset
train_clean_bureau_df = pd.read_csv(train_clean_bureau)
pos_cash_df = pd.read_csv(pos_cash_dataset)

# Keep only the POS-cash rows whose SK_ID_CURR appears in the training data.
# A boolean mask replaces the original `.loc[<filtered>.index]` round trip;
# both select the same rows for the unique default index read_csv creates.
# reset_index returns a new frame, so its result has to be assigned
# (the original call discarded it and left the index untouched).
p_df = pos_cash_df[
    pos_cash_df['SK_ID_CURR'].isin(train_clean_bureau_df['SK_ID_CURR'])
].reset_index(drop=True)

# The unfiltered frame is no longer needed.
del pos_cash_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
print(len(num_feats_p), len(cat_feats_p))

# Change the datatype of categorical and numerical values.
# The return value is unused, so this presumably works in place -- TODO confirm.
f.change_type(p_df, num_feats_p, count_threshold=5)

# Separate the categorical and numerical features again after the conversion
# Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
# The two ID columns are taken out of the numerical feature list.
for i in ['SK_ID_PREV', 'SK_ID_CURR']:
    num_feats_p.remove(i)
print(len(num_feats_p), len(cat_feats_p))

# Parameter tables for the numerical and categorical features at the start.
par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p, cat_feats_p)
# Import working dataset
train_bureau_poscash_instpmt_clean_df = pd.read_csv(
    train_bureau_poscash_instpmt_clean)
cc_bal_df = pd.read_csv(cc_bal_dataset)

# Keep only the credit-card balance rows whose SK_ID_CURR appears in the
# training data. A boolean mask replaces the original
# `.loc[<filtered>.index]` round trip; both select the same rows for the
# unique default index read_csv creates.
# reset_index returns a new frame, so its result has to be assigned
# (the original call discarded it and left the index untouched).
c_df = cc_bal_df[
    cc_bal_df['SK_ID_CURR'].isin(
        train_bureau_poscash_instpmt_clean_df['SK_ID_CURR'])
].reset_index(drop=True)

# The unfiltered frame is no longer needed.
del cc_bal_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
print(len(num_feats_c), len(cat_feats_c))

# Change the datatype of categorical and numerical values.
# The return value is unused, so this presumably works in place -- TODO confirm.
f.change_type(c_df, num_feats_c, count_threshold=5)

# Separate the categorical and numerical features again after the conversion
# Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
# The two ID columns are taken out of the numerical feature list.
for i in ['SK_ID_CURR', 'SK_ID_PREV']:
    num_feats_c.remove(i)
print(len(num_feats_c), len(cat_feats_c))

# Parameter tables for the numerical and categorical features at the start.
par_num_df_start, par_cat_df_start = f.get_params(c_df, num_feats_c, cat_feats_c)