def outlier_treatment(self, normalized_feats) -> None:
    """Clean known anomalies in ``self.ds1_df`` and cap numerical outliers.

    Steps, all applied to ``self.ds1_df``:

    1. Replace the abnormal value ``365243`` in ``DAYS_EMPLOYED`` and the
       placeholder ``"XNA"`` in ``ORGANIZATION_TYPE`` with ``np.nan`` so
       that they can be imputed at a later stage.
    2. Pass every numerical feature that is not listed in
       ``normalized_feats`` through ``f.log_transform`` and then drop the
       column named after the original feature.
    3. For every numerical feature present afterwards, overwrite the rows
       reported by ``f.TurkyOutliers`` with the bounds it returns,
       rounded to 3 decimals.

    Args:
        normalized_feats: Container of feature names that are excluded
            from the log transform (only membership tests are performed).

    Returns:
        None. ``self.ds1_df`` is modified, and rebound to whatever
        ``f.log_transform`` returns.
    """
    # Split the columns into numerical / categorical; only the numerical
    # names are needed here.
    num_feats, _ = self.seperate_cat_num_var(self.ds1_df)
    other_feats = [x for x in num_feats if x not in normalized_feats]

    # Anomalies and data correction.
    # Assign the cleaned column back instead of writing through chained
    # indexing (``df[col].loc[rows] = ...``): the chained form writes to a
    # temporary Series and is not guaranteed to reach the DataFrame.
    # ``mask`` also upcasts an integer column to float as needed for NaN.
    feature = 'DAYS_EMPLOYED'
    days_employed = self.ds1_df[feature]
    self.ds1_df[feature] = days_employed.mask(days_employed == 365243)

    # "XNA" marks a missing ORGANIZATION_TYPE; turn it into NaN. A chained
    # ``replace(..., inplace=True)`` has the same temporary-object problem.
    self.ds1_df['ORGANIZATION_TYPE'] = (
        self.ds1_df['ORGANIZATION_TYPE'].replace("XNA", np.nan)
    )

    # Log-transform the numerical features that were not already
    # normalized, to reduce skew before outlier capping.
    # NOTE(review): assumes f.log_transform stores the transformed values
    # under a new column name, which is why the raw column is dropped
    # afterwards - confirm against the helper.
    for feature in other_feats:
        print('log_transform', feature)
        self.ds1_df = f.log_transform(self.ds1_df, feature)
        self.ds1_df.drop(columns=[feature], inplace=True)

    # The column set changed above, so recompute the numerical features.
    num_feats, _ = self.seperate_cat_num_var(self.ds1_df)
    for col in num_feats:
        print(col)
        # NOTE(review): out_l / out_r are used as row labels of the
        # low-side / high-side outliers and lower / upper as the capping
        # values - confirm against f.TurkyOutliers.
        out_l, out_r, lower, upper = f.TurkyOutliers(
            self.ds1_df, col, drop=False)
        if len(out_l) > 0 or len(out_r) > 0:
            self.ds1_df.loc[out_l, col] = round(lower, 3)
            self.ds1_df.loc[out_r, col] = round(upper, 3)
for feature in other_feats: print(feature) imp_df = f.log_transform(imp_df, feature) imp_df.drop(imp_df[[feature]], axis=1, inplace=True) #normalized_num_feats_imp_df = [x for x in normalized_feats if x in num_feats_imp_df] num_feats_imp_df, cat_feats_imp_df = f.distinct_feats(imp_df) num_feats_imp_df.remove('TARGET') num_feats_imp_df.remove('SK_ID_CURR') print(len(num_feats_imp_df), len(cat_feats_imp_df)) for i in num_feats_imp_df: print(i) #i = 'AMT_REQ_CREDIT_BUREAU_YEAR_log' out_l, out_r, min, max = f.TurkyOutliers(imp_df, i, drop=False) if (len(out_l) | len(out_r)) > 0: imp_df[i].loc[out_l] = round(min, 3) imp_df[i].loc[out_r] = round(max, 3) #################################### MISSING VALUES ############################# # Since the numerical univariate distribution are symmetrical now with no difference # between median and mean. Lets impute all the numerical missing values with mean # Record missing values for further validations: indicator = MissingIndicator(missing_values=np.nan) mask_missing_values_only = indicator.fit_transform(imp_df) mask_missing_values_only.shape # Num missing values imputations imp_df[num_feats_imp_df] = imp_df[num_feats_imp_df].fillna(