def remove_outliers(self, df, outlier_removal_col):
    '''Return the rows of self._data_frame that lie strictly inside the outlier bounds.

    The bounds are the second and third values returned by
    Stats.detect_outliers_z for ``outlier_removal_col`` on self._data_frame.
    A row is kept only when its value is strictly greater than the lower
    bound AND strictly less than the upper bound. self._data_frame is not
    reassigned; the filtered frame is only returned.

    NOTE: how this behaves when applied to multiple columns still needs to
    be checked (carried over from the original note).

    :param df: accepted for interface compatibility but not used; the filter
        always runs on self._data_frame.
        NOTE(review): confirm whether the filter was meant to run on ``df``.
    :param outlier_removal_col: name of the column to test against the bounds.
    :return: the filtered data frame.
    '''
    _, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
        self._data_frame, outlier_removal_col)
    column = self._data_frame[outlier_removal_col]
    # Both bounds go into a single predicate. Previously the second filter
    # restarted from the unfiltered frame, so the lower-bound filter was
    # silently thrown away and only the upper bound was enforced.
    return self._data_frame.filter(
        (column > ol_lower_range) & (column < ol_upper_range))
def cap_outliers(self, outlier_replacement_col):
    '''Clamp the values of a column to the bounds given by Stats.detect_outliers_z.

    Values below the lower bound are replaced by the lower bound, then
    values above the upper bound are replaced by the upper bound. The
    result is stored back on self._data_frame after each step.

    :param outlier_replacement_col: name of the column to clamp.
    :return: the updated self._data_frame.
    '''
    target = outlier_replacement_col
    _, floor_value, ceiling_value = Stats.detect_outliers_z(
        self._data_frame, target)

    # Step 1: raise everything below the lower bound up to the lower bound.
    source = self._data_frame
    raised = when(source[target] < floor_value, floor_value).otherwise(
        source[target])
    self._data_frame = source.withColumn(target, raised)

    # Step 2: on the already-raised frame, pull everything above the upper
    # bound down to the upper bound.
    raised_frame = self._data_frame
    lowered = when(raised_frame[target] > ceiling_value, ceiling_value).otherwise(
        raised_frame[target])
    self._data_frame = raised_frame.withColumn(target, lowered)

    return self._data_frame
def mode_impute_outliers(self, outlier_imputation_col):
    '''Replace out-of-bounds values of a column with the value from self.get_mode.

    Bounds come from Stats.detect_outliers_z on self._data_frame. Any value
    strictly below the lower bound or strictly above the upper bound is
    replaced; all other values are left as they are. The result is stored
    back on self._data_frame.

    :param outlier_imputation_col: name of the column to impute.
    :return: the updated self._data_frame.
    '''
    target = outlier_imputation_col
    _, lower_bound, upper_bound = Stats.detect_outliers_z(
        self._data_frame, target)

    trimmed = self.remove_outliers(self._data_frame, target)
    # NOTE(review): get_mode receives the full frame together with a column
    # taken from the trimmed frame; confirm against get_mode that this
    # combination is intended.
    fill_value = self.get_mode(self._data_frame, trimmed[target])

    current = self._data_frame
    values = current[target]
    is_outlier = (values < lower_bound) | (values > upper_bound)
    self._data_frame = current.withColumn(
        target, when(is_outlier, fill_value).otherwise(values))
    return self._data_frame
def mean_impute_outliers(self, outlier_imputation_col):
    '''Replace out-of-bounds values of a column with an average.

    The average is taken over the column in the frame returned by
    self.remove_outliers. Bounds come from Stats.detect_outliers_z on
    self._data_frame. Any value strictly below the lower bound or strictly
    above the upper bound is replaced; all other values are left as they
    are. The result is stored back on self._data_frame.

    :param outlier_imputation_col: name of the column to impute.
    :return: the updated self._data_frame.
    '''
    target = outlier_imputation_col
    _, lower_bound, upper_bound = Stats.detect_outliers_z(
        self._data_frame, target)

    trimmed = self.remove_outliers(self._data_frame, target)
    # The aggregation yields a single row whose first field is the average.
    average_row = trimmed.agg(avg(target)).first()
    fill_value = average_row[0]

    current = self._data_frame
    values = current[target]
    is_outlier = (values < lower_bound) | (values > upper_bound)
    self._data_frame = current.withColumn(
        target, when(is_outlier, fill_value).otherwise(values))
    return self._data_frame