def define_dataset(self):
    """Prepare ``self.ds1_df`` and return its starting parameter tables.

    The steps run in this order:

    1. ``f.get_missing_value_feats`` is called on the dataset; its return
       value is not used here.
    2. The columns are split into numerical and categorical features with
       ``self.seperate_cat_num_var``.
    3. ``f.change_type`` is called with the numerical features and
       ``count_threshold=5``. Its return value is ignored, so it presumably
       changes the column dtypes in place -- TODO confirm against ``f``.
    4. The columns are split again, so the lists passed on reflect the
       dataset after the type change.
    5. ``f.get_params`` builds the two parameter tables.

    Returns:
        tuple: ``(par_num_df_start, par_cat_df_start)`` exactly as returned
        by ``f.get_params``.
    """
    # Observe the features with missing values
    f.get_missing_value_feats(self.ds1_df)

    # Separate the categorical and numerical features; only the numerical
    # ones are needed for the type change below.
    num_feats, _ = self.seperate_cat_num_var(self.ds1_df)

    # Change the datatype of categorical and numerical values
    f.change_type(self.ds1_df, num_feats, count_threshold=5)

    # Separate the features again now that the dtypes may have changed
    num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)

    par_num_df_start, par_cat_df_start = f.get_params(
        self.ds1_df, num_feats, cat_feats)
    return par_num_df_start, par_cat_df_start
def define_dataset(self, df=None, ch_type=False, cnt_threshold=2):
    """Build the starting parameter tables for a dataset.

    Args:
        df: Dataset to analyse. When omitted (``None``), ``self.ds1_df``
            is used.
        ch_type: When truthy, ``f.change_type`` is called on the numerical
            features before the parameter tables are built.
        cnt_threshold: Forwarded to ``f.change_type`` as
            ``count_threshold``; only used when ``ch_type`` is truthy.

    Returns:
        tuple: ``(par_num_df_start, par_cat_df_start)`` -- the two tables
        from ``self.define_params(df)``, each left-joined with
        ``f.feature_stats(df)``.
    """
    # An identity check is required here: ``df == None`` on a DataFrame is an
    # element-wise comparison, and using its result in ``if`` raises
    # "The truth value of a DataFrame is ambiguous".
    if df is None:
        df = self.ds1_df

    # Observe the features with missing values
    f.get_missing_value_feats(df)

    # Separate the categorical and numerical features; only the numerical
    # ones are needed for the optional type change.
    num_feats, _ = self.seperate_cat_num_var(df)

    # Change the datatype of categorical and numerical values
    if ch_type:
        f.change_type(df, num_feats, count_threshold=cnt_threshold)

    # Build the parameter tables and enrich both with the feature statistics
    par_num_df_start, par_cat_df_start = self.define_params(df)
    stats_df = f.feature_stats(df)
    par_num_df_start = par_num_df_start.join(stats_df, how='left')
    par_cat_df_start = par_cat_df_start.join(stats_df, how='left')
    return par_num_df_start, par_cat_df_start
# Inspect which features of x_df contain missing values
f.get_missing_value_feats(x_df)

# In[5]:

# First pass: split the columns into numerical and categorical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(f"{len(num_feats)} {len(cat_feats)}")
# Keep TARGET out of the list handed to the type conversion below
num_feats.remove('TARGET')

# In[6]:

# Re-type the remaining numerical features (third argument: 10)
f.change_type(x_df, num_feats, 10)

# In[7]:

# Second pass: split again now that the dtypes may have changed
num_feats, cat_feats = f.distinct_feats(x_df)
print(f"{len(num_feats)} {len(cat_feats)}")

# In[10]:

# One-hot encode, then keep only the rows where TARGET equals 1
x_df_dum = pd.get_dummies(x_df)
x_df_Default_dum = x_df_dum.loc[x_df_dum['TARGET'] == 1]
# Keep only the balance rows whose SK_ID_BUREAU also appears in b_df.
# The boolean mask selects the same rows as the former
# ``.loc[<filtered frame>.index]`` round-trip (for a unique index) without
# building an intermediate frame. ``reset_index`` returns a NEW frame, so its
# result must be assigned back -- calling it as a bare statement has no effect.
bal_df = bureau_balance_df[
    bureau_balance_df['SK_ID_BUREAU'].isin(b_df.SK_ID_BUREAU)
].reset_index(drop=True)

# Drop the full source frames to free memory
del bureau_df, bureau_balance_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
print(len(num_feats_b), len(cat_feats_b))
#num_feats_bal,cat_feats_bal = f.distinct_feats(bal_df)
#print(len(num_feats_bal),len(cat_feats_bal))

# Change the datatype of categorical and numerical values
f.change_type(b_df, num_feats_b, count_threshold=5)

# Separate the categorical and numerical features again after the type change
# Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
# The identifier columns are not features, so keep them out of the parameters
for i in ['SK_ID_BUREAU', 'SK_ID_CURR']:
    num_feats_b.remove(i)
print(len(num_feats_b), len(cat_feats_b))

par_num_df_start, par_cat_df_start = f.get_params(b_df, num_feats_b, cat_feats_b)

############################# FEATURE TREATMENT AND EXTRACTION #########################

# As the features are expected to be extracted and grouped at SK_ID_CURR level
# to synchronise at the Loan Application Client level, we need to extract
# aggregated information at Client level out of the dataset.
# This means that treatment to individual columns would not be generalised as
# The sample (x_df) is used from here on; drop the original frame to free
# some space for processing.
del train_df

################################ CHANGING THE DATA TYPES ################################

# Inspect which features contain missing values
f.get_missing_value_feats(x_df)

# First pass: split the columns into numerical and categorical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(f"{len(num_feats)} {len(cat_feats)}")
# TARGET and SK_ID_CURR are kept out of the list used for the type change
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values
f.change_type(x_df, num_feats, count_threshold=5)

# Second pass: split again now that the dtypes may have changed, and drop the
# same two columns before computing the parameters
num_feats, cat_feats = f.distinct_feats(x_df)
print(f"{len(num_feats)} {len(cat_feats)}")
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)

############################# IDENTIFYING MISSING FEATS #########################

# Collect the features in which NA values exist
missing_value_feats = f.get_missing_value_feats(x_df)
# Bare expression: shows the value when run as a notebook cell
missing_value_feats

# Calculate Missing Value percentage and Visualize
pos_cash_df = pd.read_csv(pos_cash_dataset)

# Keep only the rows whose SK_ID_CURR also appears in train_clean_bureau_df.
# The boolean mask selects the same rows as the former
# ``.loc[<filtered frame>.index]`` round-trip without building an intermediate
# frame. ``reset_index`` returns a NEW frame, so its result must be assigned
# back -- calling it as a bare statement has no effect.
p_df = pos_cash_df[
    pos_cash_df['SK_ID_CURR'].isin(train_clean_bureau_df.SK_ID_CURR)
].reset_index(drop=True)

# Drop the full source frame to free memory
del pos_cash_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
print(len(num_feats_p), len(cat_feats_p))

# Change the datatype of categorical and numerical values
f.change_type(p_df, num_feats_p, count_threshold=5)

# Separate the categorical and numerical features again after the type change
# Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
# The identifier columns are not features, so keep them out of the parameters
for i in ['SK_ID_PREV', 'SK_ID_CURR']:
    num_feats_p.remove(i)
print(len(num_feats_p), len(cat_feats_p))

par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p, cat_feats_p)

############################# FEATURE TREATMENT AND EXTRACTION #########################

# As the features are expected to be extracted and grouped at SK_ID_CURR level
# to synchronise at the Loan Application Client level, we need to extract
# aggregated information at Client level out of the dataset.
cc_bal_df = pd.read_csv(cc_bal_dataset)

# Keep only the rows whose SK_ID_CURR also appears in
# train_bureau_poscash_instpmt_clean_df. The boolean mask selects the same
# rows as the former ``.loc[<filtered frame>.index]`` round-trip without
# building an intermediate frame. ``reset_index`` returns a NEW frame, so its
# result must be assigned back -- calling it as a bare statement has no effect.
c_df = cc_bal_df[
    cc_bal_df['SK_ID_CURR'].isin(
        train_bureau_poscash_instpmt_clean_df.SK_ID_CURR)
].reset_index(drop=True)

# Drop the full source frame to free memory
del cc_bal_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
print(len(num_feats_c), len(cat_feats_c))

# Change the datatype of categorical and numerical values
f.change_type(c_df, num_feats_c, count_threshold=5)

# Separate the categorical and numerical features again after the type change
# Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
# The identifier columns are not features, so keep them out of the parameters
for i in ['SK_ID_CURR', 'SK_ID_PREV']:
    num_feats_c.remove(i)
print(len(num_feats_c), len(cat_feats_c))

par_num_df_start, par_cat_df_start = f.get_params(c_df, num_feats_c, cat_feats_c)

############################# FEATURE TREATMENT AND EXTRACTION #########################

# As the features are expected to be extracted and grouped at SK_ID_CURR level
# to synchronise at the Loan Application Client level, we need to extract
# aggregated information at Client level out of the dataset.