def missing_value_imputations(self):
    """Impute the missing values of ``self.ds1_df`` in place.

    Steps, in order:

    1. Split the columns into numerical and categorical feature lists via
       ``self.seperate_cat_num_var``.
    2. Fill missing values in the numerical columns with each column's mean.
    3. For every feature still reported by ``f.get_missing_value_feats``:
       fill with the ``MODE`` recorded in ``par_cat_df`` when its
       ``MODE_PERCENTAGE`` is above 80, otherwise fill with the values
       produced by ``f.impute_knn_classifier``.

    A line is printed for every feature handled in step 3, naming the method
    used and the mode percentage or the imputation score.

    Returns:
        The ``(par_num_df, par_cat_df)`` pair produced by ``f.get_params``,
        computed after the numerical imputation and before the categorical
        one.
    """
    # Original rationale: the numerical univariate distributions are
    # symmetrical at this point (no difference between median and mean), so
    # the mean is used for all numerical missing values.
    #
    # NOTE: disabled bookkeeping of the missing-value mask, kept for reference:
    # indicator = MissingIndicator(missing_values=np.nan)
    # mask_missing_values_only = indicator.fit_transform(self.ds1_df)
    # mask_missing_values_only.shape
    num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
        self.ds1_df)

    # Numerical missing values: fill each column with its own mean.
    self.ds1_df[num_feats_imp_df] = self.ds1_df[num_feats_imp_df].fillna(
        value=self.ds1_df[num_feats_imp_df].mean())

    # The features still missing are expected to be categorical.
    missing_feats_cat = f.get_missing_value_feats(self.ds1_df)
    par_num_df, par_cat_df = f.get_params(self.ds1_df, num_feats_imp_df,
                                          cat_feats_imp_df)

    # Categorical features whose mode frequency exceeds the threshold are
    # filled with the mode; the others go through the KNN imputer.
    mode_threshold = 80
    for feature in missing_feats_cat:
        feature_params = par_cat_df.loc[feature]
        mode_percentage = feature_params['MODE_PERCENTAGE']
        if mode_percentage > mode_threshold:
            # Assign the filled column back instead of calling
            # ``fillna(inplace=True)`` on ``df[col]``: the chained in-place
            # form acts on a temporary object and does not update the frame
            # under pandas Copy-on-Write.
            self.ds1_df[feature] = self.ds1_df[feature].fillna(
                value=feature_params['MODE'])
            print("Method : MODE , Feature : {} , Mode_Percentage : {}".
                  format(feature, mode_percentage))
        else:
            # NOTE(review): the third argument (5) is presumably the number
            # of neighbours -- confirm against f.impute_knn_classifier.
            imp_list, score = f.impute_knn_classifier(
                self.ds1_df, feature, 5)
            self.ds1_df[feature] = self.ds1_df[feature].fillna(
                value=imp_list)
            print(
                "Method : KNN , Feature : {} , Imputation Accuracy Score : {}"
                .format(feature, score))
    return par_num_df, par_cat_df
# Impute the features of ``imp_df`` that still contain missing values, then
# rebuild the numerical / categorical feature lists from the imputed frame.
missing_feats_cat = f.get_missing_value_feats(imp_df)
par_num_df, par_cat_df = f.get_params(imp_df, num_feats_imp_df,
                                      cat_feats_imp_df)

# Categorical features whose mode frequency exceeds the threshold are filled
# with the mode; the others go through the KNN imputer.
mode_threshold = 80
for feature in missing_feats_cat:
    feature_params = par_cat_df.loc[feature]
    mode_percentage = feature_params['MODE_PERCENTAGE']
    if mode_percentage > mode_threshold:
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on ``imp_df[col]``: the chained in-place
        # form acts on a temporary object and does not update the frame under
        # pandas Copy-on-Write.
        imp_df[feature] = imp_df[feature].fillna(
            value=feature_params['MODE'])
        print("Method : MODE , Feature : {} , Mode_Percentage : {}".format(
            feature, mode_percentage))
    else:
        # NOTE(review): the third argument (75) is presumably a KNN setting
        # such as the neighbour count -- confirm against
        # f.impute_knn_classifier.
        imp_list, score = f.impute_knn_classifier(imp_df, feature, 75)
        imp_df[feature] = imp_df[feature].fillna(value=imp_list)
        print("Method : KNN , Feature : {} , Imputation Accuracy Score : {}".
              format(feature, score))

# Create numerical features information dataframe.
# CHECK DIFF from param_df and check the mean / median diff.
#
# Disabled reload of the cleaned dataset, kept for reference:
# imp_df = pd.read_csv(wd+"\\Output\\application_train_clean.csv")
# imp_df.drop(imp_df[['Unnamed: 0']],axis=1,inplace=True)
num_feats_imp_df, cat_feats_imp_df = f.distinct_feats(imp_df)

# Drop the two columns that must not be treated as numerical features here;
# list.remove raises ValueError if either name is absent.
num_feats_imp_df.remove('SK_ID_CURR')
num_feats_imp_df.remove('TARGET')
print(len(num_feats_imp_df), len(cat_feats_imp_df))