Exemple #1
0
 def select(self):
     """
     Keep only the rows whose value lies outside the whisker bounds.

     The bounds are read from ``self.whiskers()`` and compared against the
     column ``self.col_name`` of ``self.df``.

     :return: the result of ``df.rows.select`` for the condition
         "value > upper_bound or value < lower_bound"
     """
     frame = self.df
     column = self.col_name
     bounds = self.whiskers()
     upper, lower = dict_filter(bounds, ["upper_bound", "lower_bound"])

     # A row is an outlier when it is strictly above the upper bound
     # or strictly below the lower bound.
     above = frame[column] > upper
     below = frame[column] < lower
     return frame.rows.select(above | below)
Exemple #2
0
 def select(self):
     """
     Keep only the rows whose value lies outside the whisker bounds.

     The condition is expressed with ``F.col`` on ``self.col_name`` and
     handed to ``self.df.rows.select``.

     :return: the result of ``self.df.rows.select`` for the condition
         "value > upper_bound or value < lower_bound"
     """
     column = self.col_name
     wanted = ["upper_bound", "lower_bound"]
     upper, lower = dict_filter(self.whiskers(), wanted)

     # Strict comparisons: values equal to a bound are not selected.
     too_high = F.col(column) > upper
     too_low = F.col(column) < lower
     return self.df.rows.select(too_high | too_low)
Exemple #3
0
    def info(self):
        """
        Summarise the outlier analysis as a dictionary.

        :return: dict with the keys ``count_outliers``,
            ``count_non_outliers``, ``lower_bound`` and ``upper_bound``
        """
        bounds = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
        upper, lower = bounds

        # Keyword order mirrors the key order of the returned dictionary.
        return dict(
            count_outliers=self.count(),
            count_non_outliers=self.non_outliers_count(),
            lower_bound=lower,
            upper_bound=upper,
        )
Exemple #4
0
    def info(self, output: str = "dict"):
        """
        Summarise the outlier analysis, including per-bound counts.

        :param output: when equal to ``"json"`` the summary is passed through
            ``dump_json``; any other value returns the plain dictionary
        :return: dict (or the ``dump_json`` result) with the keys
            ``count_outliers``, ``count_non_outliers``, ``lower_bound``,
            ``lower_bound_count``, ``upper_bound`` and ``upper_bound_count``
        """
        wanted = ["upper_bound", "lower_bound"]
        upper, lower = dict_filter(self.whiskers(), wanted)

        summary = {
            "count_outliers": self.count(),
            "count_non_outliers": self.non_outliers_count(),
            "lower_bound": lower,
            "lower_bound_count": self.count_lower_bound(lower),
            "upper_bound": upper,
            "upper_bound_count": self.count_upper_bound(upper),
        }
        return dump_json(summary) if output == "json" else summary
Exemple #5
0
    def __init__(self,
                 df,
                 col_name,
                 threshold: int,
                 relative_error: int = RELATIVE_ERROR):
        """
        Store the settings, compute the bounds and initialise the parent.

        :param df: dataframe to analyse
        :param col_name: name of the column inspected for outliers
        :param threshold: kept on the instance as ``self.threshold``
        :param relative_error: kept on the instance as
            ``self.relative_error``; defaults to ``RELATIVE_ERROR``
        """
        self.df, self.col_name = df, col_name
        self.threshold, self.relative_error = threshold, relative_error

        # whiskers() is called only once the attributes above are in place.
        bounds = self.whiskers()
        upper, lower = dict_filter(bounds, ["upper_bound", "lower_bound"])
        self.upper_bound = upper
        self.lower_bound = lower

        # The parent constructor takes the lower bound before the upper one.
        super().__init__(df, col_name, lower, upper)
Exemple #6
0
 def drop(self):
     """
     Drop the rows whose value lies outside the whisker bounds.

     :return: the result of ``self.df.rows.drop`` for the condition
         "value > upper_bound or value < lower_bound"
     """
     column = self.col_name
     wanted = ["upper_bound", "lower_bound"]
     upper, lower = dict_filter(self.whiskers(), wanted)

     # Strict comparisons: values equal to a bound are kept.
     too_high = F.col(column) > upper
     too_low = F.col(column) < lower
     return self.df.rows.drop(too_high | too_low)
Exemple #7
0
    def __init__(self, df, col_name):
        """
        Store the inputs, compute the whisker statistics and initialise the
        parent.

        :param df: Spark Dataframe
        :param col_name: column name
        """
        self.df = df
        self.col_name = col_name

        # whiskers() is called only once df and col_name are set.
        stats = self.whiskers()
        wanted = ["lower_bound", "upper_bound", "q1", "median", "q3", "iqr"]
        (self.lower_bound,
         self.upper_bound,
         self.q1,
         self.median,
         self.q3,
         self.iqr) = dict_filter(stats, wanted)

        super().__init__(df, col_name, self.lower_bound, self.upper_bound)