コード例 #1
0
    def missing_value_treatment(self, min_threshold):
        # Identify na values exist and add them to a list

        missing_value_feats = f.get_missing_value_feats(self.ds1_df)
        print(missing_value_feats)
        # Calculate Missing Value percentage and Visualize
        missing_values_perc_df = f.missing_val_perc(missing_value_feats,
                                                    self.ds1_df)
        val = missing_values_perc_df[0].sort_values(ascending=False)
        f.plot_bar(val.index, (50, 10), val)

        # Check direct imputations such as remove the records for attributes which contain less than 5% of null values or remove
        # attributes which contain more than 65% of null values.
        self.ds1_df = f.impute_values(self.ds1_df,
                                      missing_value_feats,
                                      min_threshold,
                                      action=True)
        self.ds1_df.reset_index(drop=True)

        # How row in dataframe having more than x% NaN values
        na_row_cnt = f.get_rowcnt_most_missing_val(self.ds1_df, 30)
        print('No of rows having more than 30% NA Values', na_row_cnt)

        # Identify na values exist and add them to a list
        missing_value_feats = f.get_missing_value_feats(self.ds1_df)
        print(missing_value_feats)
# Seperate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)
############################# IDENTIFYING MISSING FEATS #########################

# Identify na values exist and add them to a list
missing_value_feats = f.get_missing_value_feats(x_df)
missing_value_feats

# Calculate Missing Value percentage and Visualize
missing_values_perc_df = f.missing_val_perc(missing_value_feats, x_df)
val = missing_values_perc_df[0].sort_values(ascending=False)
f.plot_bar(val.index, (50, 10), val)

#################### REMOVING THE VALUES DIRECTLY ##########################
# Check direct imputations such as remove the records for attributes which contain less than 5% of null values or remove
# attributes which contain more than 65% of null values.
imp_df = f.impute_values(x_df, missing_value_feats, 65, action=True)
imp_df.reset_index(drop=True)

# How row in dataframe having more than x% NaN values
na_row_cnt = f.get_rowcnt_most_missing_val(imp_df, 30)

# Identify na values exist and add them to a list
missing_value_feats = f.get_missing_value_feats(imp_df)
missing_value_feats