def missing_value_treatment(self, min_threshold):
    """Report missing values in ``self.ds1_df`` and apply direct removals.

    The method prints the features that contain missing values, plots
    their missing-value percentages, replaces ``self.ds1_df`` with the
    result of ``f.impute_values`` and finally prints what is still
    missing afterwards.

    Args:
        min_threshold: Forwarded as the third positional argument of
            ``f.impute_values``. NOTE(review): presumably a null-percentage
            cut-off used to decide what gets dropped -- confirm against
            the helper.

    Returns:
        None. ``self.ds1_df`` is rebound to the treated frame.
    """
    # Identify which features contain NA values and collect them in a list.
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)

    # Calculate the missing-value percentage per feature and visualize it,
    # largest first (column 0 of the helper's result holds the values).
    missing_values_perc_df = f.missing_val_perc(missing_value_feats, self.ds1_df)
    val = missing_values_perc_df[0].sort_values(ascending=False)
    f.plot_bar(val.index, (50, 10), val)

    # Direct treatment: drop records for attributes with few nulls, or drop
    # attributes with too many nulls.
    # NOTE(review): the exact rule lives in f.impute_values -- confirm there.
    self.ds1_df = f.impute_values(
        self.ds1_df, missing_value_feats, min_threshold, action=True
    )
    # reset_index returns a new frame rather than modifying in place, so
    # the result has to be assigned back for the re-indexing to take effect.
    self.ds1_df = self.ds1_df.reset_index(drop=True)

    # Count the rows that have more than 30% NaN values.
    na_row_cnt = f.get_rowcnt_most_missing_val(self.ds1_df, 30)
    print('No of rows having more than 30% NA Values', na_row_cnt)

    # Re-identify the features that still contain NA values.
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)
# Separate the categorical and numerical features.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
# Exclude these two columns from the numerical feature list.
# NOTE(review): presumably the label and the row identifier -- confirm.
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')
par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)

############################# IDENTIFYING MISSING FEATS #########################
# Identify which features contain NA values and collect them in a list.
missing_value_feats = f.get_missing_value_feats(x_df)
# A bare expression only displays in a notebook; print so a script shows it too.
print(missing_value_feats)

# Calculate the missing-value percentage per feature and visualize it,
# largest first (column 0 of the helper's result holds the values).
missing_values_perc_df = f.missing_val_perc(missing_value_feats, x_df)
val = missing_values_perc_df[0].sort_values(ascending=False)
f.plot_bar(val.index, (50, 10), val)

#################### REMOVING THE VALUES DIRECTLY ##########################
# Direct treatment: drop records for attributes which contain less than 5% of
# null values, or drop attributes which contain more than 65% of null values.
# NOTE(review): the exact rule lives in f.impute_values -- confirm there.
imp_df = f.impute_values(x_df, missing_value_feats, 65, action=True)
# reset_index returns a new frame rather than modifying in place, so the
# result has to be assigned back for the re-indexing to take effect.
imp_df = imp_df.reset_index(drop=True)

# Count the rows that have more than 30% NaN values.
na_row_cnt = f.get_rowcnt_most_missing_val(imp_df, 30)

# Re-identify the features that still contain NA values.
missing_value_feats = f.get_missing_value_feats(imp_df)
print(missing_value_feats)