def missing_value_treatment(self, min_threshold, row_na_threshold=30):
    """Report, visualise and directly impute missing values in ``self.ds1_df``.

    The method prints the features that contain NA values, plots the
    missing-value percentages, replaces ``self.ds1_df`` with the result of
    ``f.impute_values`` (index reset afterwards), and finally prints the
    number of rows that are still mostly empty together with the features
    that still contain NA values.

    Args:
        min_threshold: Passed straight through to ``f.impute_values``.
            NOTE(review): per the original author's comment this is
            presumably the lower percentage bound (e.g. 5%) below which
            records are dropped -- confirm against the helper.
        row_na_threshold: Passed to ``f.get_rowcnt_most_missing_val`` and
            echoed in the printed message. Defaults to 30, the value that
            was previously hard-coded.

    Returns:
        None. ``self.ds1_df`` is reassigned as a side effect.
    """
    # Identify which features contain NA values and list them.
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)

    # Calculate the missing-value percentage per feature and visualise it,
    # largest first. Column 0 of the helper's result holds the values plotted.
    missing_values_perc_df = f.missing_val_perc(missing_value_feats, self.ds1_df)
    val = missing_values_perc_df[0].sort_values(ascending=False)
    f.plot_bar(val.index, (50, 10), val)

    # Direct imputation. Original author's note: remove the records for
    # attributes which contain less than 5% of null values, or remove
    # attributes which contain more than 65% of null values.
    # NOTE(review): the exact thresholds live inside f.impute_values -- confirm.
    self.ds1_df = f.impute_values(
        self.ds1_df, missing_value_feats, min_threshold, action=True
    )
    # reset_index returns a new frame unless inplace=True, so the result has
    # to be assigned back; the original call discarded it and left the index
    # untouched.
    self.ds1_df = self.ds1_df.reset_index(drop=True)

    # How many rows in the dataframe have more than x% NaN values.
    na_row_cnt = f.get_rowcnt_most_missing_val(self.ds1_df, row_na_threshold)
    print(
        'No of rows having more than {}% NA Values'.format(row_na_threshold),
        na_row_cnt,
    )

    # Re-check which features still contain NA values after the treatment.
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)
# The full training frame is no longer needed once the sample exists;
# drop the reference so its memory can be reclaimed before further processing.
del train_df


# In[4]:

color_list = [
    'green',
    'blue',
    'orange',
    'yellow',
    'red',
    'violet',
    'cyan',
]


# In[5]:

# Count NA values per column, order the counts from largest to smallest,
# then draw them as a bar chart.
na_counts = x_df.isna().sum()
val = na_counts.sort_values(ascending=False)
f.plot_bar(val.index, (70, 10), val, 30)


# In[6]:

# Bare expression: in a notebook cell its value is displayed as the cell output.
f.get_missing_value_feats(x_df)


# In[5]:

# Separate the categorical and numerical features, and report how many of
# each were found.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))

# 'TARGET' is taken out of the numerical feature list.
num_feats.remove('TARGET')