Code example #1
File: columns.py  Project: marcelomata/Optimus
    def count_na(columns):
        """
        Return the NAN and Null count in a Column
        :param columns: '*', list of columns names or a single column name.
        :return:
        """

        columns = parse_columns(self, columns)
        df = self
        expr = []

        for col_name in columns:
            # If the column type is Struct or Boolean, cast to String first:
            # isnan/isNull cannot handle Struct/Boolean columns
            if is_(df.cols.schema_dtype(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")

            if is_(df.cols.schema_dtype(col_name), (float, int)):
                expr.append(F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)).alias(col_name))

            elif is_(df.cols.schema_dtype(col_name), NullType):
                expr.append(F.count(col_name).alias(col_name))

            else:
                expr.append(F.count(F.when(F.col(col_name).isNull(), col_name)).alias(col_name))

        result = format_dict(df.select(*expr).to_json())
        return result
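A minimal usage sketch, not from the project: it assumes Optimus 2.x, where instantiating Optimus patches Spark DataFrames with the .cols accessor, and op.create.df(cols, rows) builds a test frame as shown in the project README; verify both against your version. The data is hypothetical.

    from optimus import Optimus

    op = Optimus()
    # Hypothetical test frame: (name, type, nullable) column specs plus rows
    df = op.create.df(
        [("name", "str", True), ("height", "float", True)],
        [("Alice", 1.70), (None, float("nan")), ("Bob", None)])

    # NaN and null are counted together for numeric columns
    print(df.cols.count_na("*"))  # e.g. {'name': 1, 'height': 2}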
Code example #2
File: columns.py  Project: marcelomata/Optimus
    def percentile(columns, values=None, error=1):
        """
        Return the percentile of a dataframe
        :param columns:  '*', list of columns names or a single column name.
        :param values: list of percentiles to be calculated
        :param error:
        :return: percentiles per columns
        """

        if values is None:
            values = [0.05, 0.25, 0.5, 0.75, 0.95]

        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

        # Get percentiles
        percentile_results = []
        for c in columns:
            percentile_per_col = self \
                .rows.drop_na(c) \
                .cols.cast(c, "double") \
                .approxQuantile(c, values, error)

            percentile_results.append(dict(zip(values, percentile_per_col)))

        percentile_results = dict(zip(columns, percentile_results))

        return format_dict(percentile_results)
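A hedged usage sketch under the same assumptions as the sketch after example #1. The error argument is forwarded to Spark's DataFrame.approxQuantile as the relative error, so the default of 1 trades accuracy for speed.

    from optimus import Optimus

    op = Optimus()
    df = op.create.df(
        [("height", "float", True)],
        [(1.60,), (1.70,), (1.75,), (1.80,), (1.95,)])

    # Default percentiles: 0.05, 0.25, 0.5, 0.75, 0.95
    print(df.cols.percentile("height"))
    # A single, tighter median estimate
    print(df.cols.percentile("height", values=[0.5], error=0.01))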
Code example #3
File: columns.py  Project: marcelomata/Optimus
    def mad(columns, more=None):
        """
        Return the Median Absolute Deviation
        :param columns: Column to be processed
        :param more: Return some extra computed values (Median).
        :return:
        """
        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        result = {}
        for col_name in columns:

            _mad = {}

            # MAD = median(|x - median(x)|)
            median_value = self.cols.median(col_name)

            mad_value = self.select(col_name) \
                .withColumn(col_name, F.abs(F.col(col_name) - median_value)) \
                .cols.median(col_name)

            if more:
                _mad = {"mad": mad_value, "median": median_value}
            else:
                _mad = {"mad": mad_value}

            result[col_name] = _mad

        return format_dict(result)
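A hedged usage sketch under the same assumptions as the sketch after example #1. Both medians come from Spark's approximate quantiles, so results on small data may differ slightly from the exact MAD.

    from optimus import Optimus

    op = Optimus()
    df = op.create.df([("x", "int", True)], [(1,), (2,), (3,), (100,)])

    # MAD = median(|x - median(x)|)
    print(df.cols.mad("x"))             # e.g. 1.0
    print(df.cols.mad("x", more=True))  # e.g. {'mad': 1.0, 'median': 2.0}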
Code example #4
File: columns.py  Project: xuliangleon/Optimus
    def count_na(columns):
        """
        Return the NAN and Null count in a Column
        :param columns: '*', list of columns names or a single column name.
        :param type: Accepts integer, float, string or None
        :return:
        """

        columns = parse_columns(self, columns)

        df = self
        expr = []
        for col_name in columns:
            # If the column type is Struct or Boolean, cast to String first:
            # isnan/isNull cannot handle those types
            if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")
            expr.append(
                F.count(
                    F.when(
                        F.isnan(col_name) | F.col(col_name).isNull(),
                        col_name)).alias(col_name))

        result = format_dict(collect_as_dict(df.select(*expr).collect()))

        return result
Code example #5
File: columns.py  Project: rAashutosh/Optimus
    def percentile(columns, values=None, error=1):
        """
        Return the percentile of a dataframe
        :param columns:  '*', list of columns names or a single column name.
        :param values: list of percentiles to be calculated
        :return: percentiles per columns
        """
        start_time = timeit.default_timer()

        if values is None:
            values = [0.05, 0.25, 0.5, 0.75, 0.95]

        columns = parse_columns(self, columns)

        # Get percentiles
        percentile_results = []
        for c in columns:
            percentile_per_col = self \
                .rows.drop_na(c) \
                .cols.cast(c, "double") \
                .approxQuantile(c, values, error)

            percentile_results.append(dict(zip(values, percentile_per_col)))

        percentile_results = dict(zip(columns, percentile_results))

        logging.info("percentile")
        logging.info(timeit.default_timer() - start_time)
        return format_dict(percentile_results)
Code example #6
    def _exprs(funcs, columns):
        """
        Helper function to apply multiple columns expression to multiple columns
        :param funcs: Aggregation functions from Apache Spark
        :param columns: list or string of columns names or a .
        :return:
        """
        def parse_col_names_funcs_to_keys(data):
            """
            Helper function that returns a formatted json with function:value pairs nested under each column. Transforms from
            {'max_antiguedad_anos': 15,
            'max_m2_superficie_construida': 1800000,
            'min_antiguedad_anos': 2,
            'min_m2_superficie_construida': 20}

            to

            {'m2_superficie_construida': {'min': 20, 'max': 1800000}, 'antiguedad_anos': {'min': 2, 'max': 15}}

            :param data: json data
            :return: json
            """
            functions_array = [
                "min", "max", "stddev", "kurtosis", "mean", "skewness", "sum",
                "variance", "approx_count_distinct", "na", "zeros",
                "percentile"
            ]
            result = {}
            if is_dict(data):
                for k, v in data.items():
                    for f in functions_array:
                        temp_func_name = f + "_"
                        if k.startswith(temp_func_name):
                            _col_name = k[len(temp_func_name):]
                            result.setdefault(_col_name, {})[f] = v
                return result
            else:
                return data

        columns = parse_columns(self, columns)

        # Ensure that funcs is a list
        funcs = val_to_list(funcs)

        df = self

        # The cast of the columns to float is left disabled below; note that
        # Spark will run some aggregations on string columns and give
        # unexpected results
        # df = df.cols.cast(columns, "float")

        # Create a Column Expression for every column
        exprs = []
        for col_name in columns:
            for func in funcs:
                exprs.append(
                    func(col_name).alias(func.__name__ + "_" + col_name))

        return (parse_col_names_funcs_to_keys(
            format_dict(df.agg(*exprs).to_json())))
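_exprs itself is internal (the public reducers such as df.cols.min and df.cols.max appear to delegate to it), but the key reshaping done by parse_col_names_funcs_to_keys can be shown standalone. The snippet below replays the same prefix logic in plain Python on the dict from the docstring:

    flat = {"max_antiguedad_anos": 15, "max_m2_superficie_construida": 1800000,
            "min_antiguedad_anos": 2, "min_m2_superficie_construida": 20}

    result = {}
    for k, v in flat.items():
        for f in ("min", "max"):  # subset of functions_array above
            prefix = f + "_"
            if k.startswith(prefix):
                result.setdefault(k[len(prefix):], {})[f] = v

    print(result)
    # {'antiguedad_anos': {'max': 15, 'min': 2},
    #  'm2_superficie_construida': {'max': 1800000, 'min': 20}}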
Code example #7
File: columns.py  Project: marcelomata/Optimus
 def schema_dtype(columns):
     """
     Return the column(s) data type as Type
     :param columns: Columns to be processed
     :return:
     """
     columns = parse_columns(self, columns)
     return format_dict([self.schema[col_name].dataType for col_name in columns])
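A hedged usage sketch under the same assumptions as the sketch after example #1; format_dict appears to unwrap single-element results, so one column yields a bare Spark type rather than a one-item list.

    from optimus import Optimus

    op = Optimus()
    df = op.create.df(
        [("name", "str", True), ("age", "int", True)],
        [("Alice", 30)])

    print(df.cols.schema_dtype("name"))  # e.g. StringType
    print(df.cols.schema_dtype("*"))     # e.g. [StringType, IntegerType]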
Code example #8
 def schema_dtypes(columns):
     """
     Return the columns data type as Type
     :param columns:
     :return:
     """
     columns = parse_columns(self, columns)
     return format_dict(
         [self.schema[col_name].dataType for col_name in columns])
Code example #9
File: columns.py  Project: marcelomata/Optimus
 def count_zeros(columns):
     """
     Return the NAN and Null count in a Column
     :param columns: '*', list of columns names or a single column name.
     :param type: Accepts integer, float, string or None
     :return:
     """
     columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
     df = self
     return format_dict(df.select([F.count(F.when(F.col(c) == 0, c)).alias(c) for c in columns]).to_json())
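A hedged usage sketch under the same assumptions as the sketch after example #1; parse_columns filters to PYSPARK_NUMERIC_TYPES here, so non-numeric columns are skipped, and nulls are not counted as zeros.

    from optimus import Optimus

    op = Optimus()
    df = op.create.df([("n", "int", True)], [(0,), (1,), (0,), (None,)])

    print(df.cols.count_zeros("n"))  # 2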
Code example #10
File: columns.py  Project: marcelomata/Optimus
    def dtypes(columns):
        """
        Return the column(s) data type as string
        :param columns: Columns to be processed
        :return:
        """

        columns = parse_columns(self, columns)
        data_types = tuple_to_dict(self.dtypes)

        return format_dict({c: data_types[c] for c in columns})
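A hedged usage sketch under the same assumptions as the sketch after example #1; unlike schema_dtype, this returns Spark's string type names, taken from DataFrame.dtypes.

    from optimus import Optimus

    op = Optimus()
    df = op.create.df(
        [("name", "str", True), ("height", "float", True)],
        [("Alice", 1.70)])

    print(df.cols.dtypes("*"))  # e.g. {'name': 'string', 'height': 'float'}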
Code example #11
File: columns.py  Project: xuliangleon/Optimus
 def count_zeros(columns):
     """
     Return the NAN and Null count in a Column
     :param columns: '*', list of columns names or a single column name.
     :param type: Accepts integer, float, string or None
     :return:
     """
     columns = parse_columns(self, columns)
     df = self
     return format_dict(collect_as_dict(df.select([F.count(F.when(F.col(c) == 0, c)).alias(c) for c in columns]) \
                                        .collect()))