コード例 #1
0
    def outlier_treatment(self, normalized_feats):
        """Clean known anomalies in ``self.ds1_df`` and cap numeric outliers.

        The work is done on ``self.ds1_df`` in three steps:

        1. The value ``365243`` in ``DAYS_EMPLOYED`` and the string ``"XNA"``
           in ``ORGANIZATION_TYPE`` are replaced by NaN so they can be
           imputed at a later stage.
        2. Every numeric feature that is not listed in ``normalized_feats``
           is passed to ``f.log_transform`` and the original column is then
           dropped.
        3. Every remaining numeric feature is passed to ``f.TurkyOutliers``;
           the rows it reports are overwritten with the two bounds it
           returns, rounded to 3 decimals.

        Args:
            normalized_feats: Column names to exclude from the log transform
                (only membership tests are performed on it).

        Returns:
            None. ``self.ds1_df`` is updated.
        """
        # Split the columns so we know which numeric ones still need a
        # log transform.
        num_feats_imp_df, _ = self.seperate_cat_num_var(self.ds1_df)
        other_feats = [
            x for x in num_feats_imp_df if x not in normalized_feats
        ]

        # Anomalies and data correction.
        # DAYS_EMPLOYED has the abnormal value 365243, which is changed to NaN
        # for imputation at a later stage. The whole column is reassigned
        # instead of writing through ``df[col].loc[...]``: that chained form
        # assigns to an intermediate Series and may never reach the frame.
        feature = 'DAYS_EMPLOYED'
        employed = self.ds1_df[feature]
        self.ds1_df[feature] = employed.mask(employed == 365243, np.nan)

        # XNA values exist in ORGANIZATION_TYPE; replace them by NaN so they
        # get imputed. Reassigned for the same reason as above (an inplace
        # replace on a selected column may act on a temporary).
        self.ds1_df['ORGANIZATION_TYPE'] = (
            self.ds1_df['ORGANIZATION_TYPE'].replace("XNA", np.nan))

        # Log transformation of all numerical, non-normalized, highly skewed
        # features to reduce the effect of outliers.
        # NOTE(review): f.log_transform presumably adds a transformed column
        # next to the original one -- confirm against its implementation.
        for feature in other_feats:
            print('log_transform', feature)
            self.ds1_df = f.log_transform(self.ds1_df, feature)
            self.ds1_df.drop(columns=[feature], inplace=True)

        # The column set changed above, so recompute the numeric features.
        num_feats_imp_df, _ = self.seperate_cat_num_var(self.ds1_df)

        for col in num_feats_imp_df:
            print(col)
            # NOTE(review): out_l / out_r are assumed to be row indexers
            # (labels or a mask) of the low / high outliers and the two
            # scalars the values to cap them at -- verify in f.TurkyOutliers.
            out_l, out_r, lower_cap, upper_cap = f.TurkyOutliers(
                self.ds1_df, col, drop=False)
            if len(out_l) > 0 or len(out_r) > 0:
                # Single .loc[rows, col] call so the write hits the frame.
                self.ds1_df.loc[out_l, col] = round(lower_cap, 3)
                self.ds1_df.loc[out_r, col] = round(upper_cap, 3)
# Numeric features that are not in normalized_feats; these are the ones that
# get log-transformed below.
other_feats = [x for x in num_feats_imp_df if x not in normalized_feats]

# Anomalies and data correction.
# DAYS_EMPLOYED has the abnormal value 365243, which is changed to NaN for
# imputation at a later stage. The whole column is reassigned instead of
# writing through ``imp_df[col].loc[...]``: that chained form assigns to an
# intermediate Series and may never reach the frame.
feature = 'DAYS_EMPLOYED'
imp_df[feature] = imp_df[feature].mask(imp_df[feature] == 365243, np.nan)

# XNA values exist in ORGANIZATION_TYPE; replace them by NaN so they get
# imputed. Reassigned for the same reason as above (an inplace replace on a
# selected column may act on a temporary).
imp_df['ORGANIZATION_TYPE'] = imp_df['ORGANIZATION_TYPE'].replace(
    "XNA", np.nan)

# Log transformation of all numerical, non-normalized, highly skewed features
# to reduce the effect of outliers.
# NOTE(review): f.log_transform presumably adds a transformed column next to
# the original one -- confirm against its implementation.
for feature in other_feats:
    print(feature)
    imp_df = f.log_transform(imp_df, feature)
    imp_df.drop(columns=[feature], inplace=True)

# The column set changed above, so recompute the feature lists. TARGET and
# SK_ID_CURR are removed from the numeric list so they are not capped below.
num_feats_imp_df, cat_feats_imp_df = f.distinct_feats(imp_df)
num_feats_imp_df.remove('TARGET')
num_feats_imp_df.remove('SK_ID_CURR')
print(len(num_feats_imp_df), len(cat_feats_imp_df))

for i in num_feats_imp_df:
    print(i)
    # NOTE(review): out_l / out_r are assumed to be row indexers (labels or a
    # mask) of the low / high outliers and the two scalars the values to cap
    # them at -- verify in f.TurkyOutliers.
    # The bounds are deliberately not named min/max: at module level that
    # would shadow the builtins for all code that runs afterwards.
    out_l, out_r, lower_cap, upper_cap = f.TurkyOutliers(imp_df, i, drop=False)
    if len(out_l) > 0 or len(out_r) > 0:
        # Single .loc[rows, col] call so the write hits the frame.
        imp_df.loc[out_l, i] = round(lower_cap, 3)
        imp_df.loc[out_r, i] = round(upper_cap, 3)