Example #1
def to_json(self):
    """
    Return a JSON-like list of dictionaries from a Spark DataFrame
    :param self:
    :return:
    """
    return collect_as_dict(self.collect())
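A note on the helper: collect_as_dict is an Optimus utility, not part of the Spark API. A minimal sketch of the conversion it presumably performs, using only plain PySpark and Row.asDict():

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])

# Row.asDict() converts each pyspark.sql.Row into a plain Python dict,
# which is presumably what collect_as_dict does for the collected rows
rows_as_dicts = [row.asDict() for row in df.collect()]
print(rows_as_dicts)  # [{'id': 1, 'letter': 'a'}, {'id': 2, 'letter': 'b'}]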
Example #2
    def count_na(columns):
        """
        Return the NaN and null value count for the given columns
        :param columns: '*', a list of column names or a single column name
        :return:
        """

        columns = parse_columns(self, columns)

        df = self
        expr = []
        for col_name in columns:
            # If the column type is Struct or Boolean, cast it to string; isnan/isNull cannot handle those types

            if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")
            expr.append(
                F.count(
                    F.when(
                        F.isnan(col_name) | F.col(col_name).isNull(),
                        col_name)).alias(col_name))

        result = format_dict(collect_as_dict(df.select(*expr).collect()))

        return result
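The core technique is a single select with one F.count(F.when(...)) expression per column. A standalone sketch with plain PySpark (hypothetical data and column names, not the Optimus wrapper itself):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1.0, "x"), (float("nan"), None), (None, "y")],
                           ["price", "name"])

exprs = []
for col_name, dtype in df.dtypes:
    if dtype in ("float", "double"):
        # Numeric columns can hold both NaN and null
        cond = F.isnan(col_name) | F.col(col_name).isNull()
    else:
        # isnan() is not defined for non-numeric types, so only check for null
        cond = F.col(col_name).isNull()
    exprs.append(F.count(F.when(cond, col_name)).alias(col_name))

df.select(*exprs).show()  # price: 2, name: 1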
Example #3
def glom(self):
    """

    :param self: Dataframe
    :return:
    """
    return collect_as_dict(self.rdd.glom().collect()[0])
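rdd.glom() turns each partition into a Python list of rows; the snippet above keeps only the first one. A short plain-PySpark sketch of that behavior:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.range(6).repartition(2)

partitions = df.rdd.glom().collect()   # one Python list of Rows per partition
first_partition = [row.asDict() for row in partitions[0]]
print(len(partitions), first_partition)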
Example #4
def to_dict(self):
    """
    Return a Python object (list of dictionaries) from a Spark DataFrame
    :param self:
    :return:
    """
    return collect_as_dict(self)
Example #5
    def hist(columns, min_value, max_value, buckets=10):
        """
        Get the histogram of a column in JSON format
        :param columns: Columns to be processed
        :param min_value: Min value used to calculate the buckets
        :param max_value: Max value used to calculate the buckets
        :param buckets: Number of buckets
        :return:
        """

        columns = parse_columns(self, columns)
        for col_name in columns:
            # Create splits
            splits = create_buckets(min_value, max_value, buckets)

            # Create buckets in the dataFrame
            df = bucketizer(self, col_name, splits=splits)

            counts = (collect_as_dict(
                df.groupBy(col_name + "_buckets").agg(
                    F.count(col_name + "_buckets").alias("count")).cols.rename(
                        col_name + "_buckets",
                        "value").sort(F.asc("value")).collect()))

            hist = []
            for x, y in zip(counts, splits):
                # if x["value"] is not None and x["count"] != 0:
                hist.append({
                    "lower": y["lower"],
                    "upper": y["upper"],
                    "value": x["count"]
                })

        return hist
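create_buckets and bucketizer are Optimus helpers. A rough equivalent of the bucketing step can be sketched with pyspark.ml.feature.Bucketizer (hypothetical data and column name):

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1.0,), (2.5,), (4.0,), (7.5,), (9.0,)], ["price"])

min_value, max_value, buckets = 0.0, 10.0, 5
step = (max_value - min_value) / buckets
splits = [min_value + i * step for i in range(buckets + 1)]   # [0.0, 2.0, ..., 10.0]

bucketed = Bucketizer(splits=splits, inputCol="price",
                      outputCol="price_buckets").transform(df)
(bucketed.groupBy("price_buckets")
         .agg(F.count("price_buckets").alias("count"))
         .orderBy("price_buckets")
         .show())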
Example #6
def to_json(self):
    """
    Return a JSON string from a Spark DataFrame
    :param self:
    :return:
    """
    return json.dumps(collect_as_dict(self), ensure_ascii=False, default=json_converter)
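json.dumps needs a default converter because collected Spark rows can contain values, such as dates, that are not JSON serializable. json_converter is an Optimus helper; the fallback below is a hypothetical stand-in:

import datetime
import json

def fallback_converter(obj):
    # Serialize dates and datetimes as ISO-8601 strings; fail loudly on anything else
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    raise TypeError("%s is not JSON serializable" % type(obj))

rows = [{"id": 1, "created": datetime.date(2019, 1, 1)}]
print(json.dumps(rows, ensure_ascii=False, default=fallback_converter))
# {"id": 1, "created": "2019-01-01"}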
Example #7
def table(self, limit=100, columns=None):
    """
    Return an HTML table with the dataframe cols, data types and values
    :param self:
    :param columns: Columns to be printed
    :param limit: how many rows will be printed
    :return:
    """

    columns = parse_columns(self, columns)

    data = collect_as_dict(self.select(columns).limit(limit).collect())

    # Load template
    path = os.path.dirname(os.path.abspath(__file__))
    template_loader = jinja2.FileSystemLoader(searchpath=path +
                                              "//../templates")
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the columns and data types needed
    dtypes = list(filter(lambda x: x[0] in columns, self.dtypes))

    total = self.count()
    if total < limit:
        limit = total

    # Print table
    output = template.render(cols=dtypes, data=data, limit=limit, total=total)
    display(HTML(output))
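The rendering step is ordinary Jinja2. A self-contained sketch with an inline template instead of the table.html file shipped with the library:

import jinja2

template = jinja2.Environment(autoescape=True).from_string(
    "<table>"
    "{% for row in data %}<tr>"
    "{% for value in row.values() %}<td>{{ value }}</td>{% endfor %}"
    "</tr>{% endfor %}"
    "</table>")

data = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
print(template.render(data=data))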
Example #8
    def _exprs(funcs, columns):
        """
        Helper function to apply multiple columns expression to multiple columns
        :param funcs: Aggregation functions from Apache Spark
        :param columns: '*', a list of column names or a single column name
        :return:
        """
        def parse_col_names_funcs_to_keys(data):
            """
            Helper function that returns a formatted JSON with function:value pairs per column. Transforms from
            {'max_antiguedad_anos': 15,
            'max_m2_superficie_construida': 1800000,
            'min_antiguedad_anos': 2,
            'min_m2_superficie_construida': 20}

            to

            {'m2_superficie_construida': {'min': 20, 'max': 1800000}, 'antiguedad_anos': {'min': 2, 'max': 15}}

            :param data: json data
            :return: json
            """
            functions_array = [
                "min", "max", "stddev", "kurtosis", "mean", "skewness", "sum",
                "variance", "approx_count_distinct", "na", "zeros",
                "percentile"
            ]
            result = {}
            if is_dict(data):
                for k, v in data.items():
                    for f in functions_array:
                        temp_func_name = f + "_"
                        if k.startswith(temp_func_name):
                            _col_name = k[len(temp_func_name):]
                            result.setdefault(_col_name, {})[f] = v
                return result
            else:
                return data

        columns = parse_columns(self, columns)

        # Ensure that funcs is a list
        funcs = val_to_list(funcs)

        df = self

        # Cast the columns to float. Spark will happily run some aggregations on string columns,
        # which gives unexpected results
        # df = df.cols.cast(columns, "float")

        # Create a Column Expression for every column
        exprs = []
        for col_name in columns:
            for func in funcs:
                exprs.append(
                    func(col_name).alias(func.__name__ + "_" + col_name))

        return (parse_col_names_funcs_to_keys(
            format_dict(collect_as_dict(df.agg(*exprs).collect()))))
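The idea is to build one aliased aggregation expression per (function, column) pair and then regroup the flat result by column name. A plain-PySpark sketch (the simple key split below only works for function names without underscores):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(2, 20.0), (15, 1800000.0)],
                           ["antiguedad_anos", "m2_superficie_construida"])

funcs = [F.min, F.max]
exprs = [func(c).alias(func.__name__ + "_" + c) for c in df.columns for func in funcs]
flat = df.agg(*exprs).collect()[0].asDict()   # {'min_antiguedad_anos': 2, ...}

nested = {}
for key, value in flat.items():
    func_name, col_name = key.split("_", 1)   # safe here: 'min'/'max' contain no '_'
    nested.setdefault(col_name, {})[func_name] = value
print(nested)  # {'antiguedad_anos': {'min': 2, 'max': 15}, 'm2_superficie_construida': {...}}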
Example #9
 def count_zeros(columns):
     """
     Return the count of zero values in a column
     :param columns: '*', a list of column names or a single column name
     :return:
     """
     columns = parse_columns(self, columns)
     df = self
     return format_dict(collect_as_dict(df.select([F.count(F.when(F.col(c) == 0, c)).alias(c) for c in columns]) \
                                        .collect()))
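Same pattern as count_na: F.when() yields null for values that are not zero and F.count() only counts non-null values. A standalone sketch:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(0, 5), (3, 0), (0, 0)], ["a", "b"])

df.select([F.count(F.when(F.col(c) == 0, c)).alias(c) for c in df.columns]).show()
# a: 2, b: 2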
Example #10
    def frequency(columns, buckets=10):
        """
        Output value frequencies in JSON format
        :param columns: Columns to be processed
        :param buckets: Number of buckets
        :return:
        """
        columns = parse_columns(self, columns)
        df = self
        for col_name in columns:
            df = df.groupBy(col_name).count().rows.sort([
                ("count", "desc"), (col_name, "desc")
            ]).limit(buckets).cols.rename(col_name, "value")

        return collect_as_dict(df.collect())
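A plain-PySpark sketch of the frequency computation for a single column (the .rows/.cols accessors used above are Optimus extensions):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("a",), ("c",), ("a",), ("b",)], ["letter"])

freq = (df.groupBy("letter").count()
          .orderBy(F.desc("count"), F.desc("letter"))
          .limit(10)
          .withColumnRenamed("letter", "value"))
print([row.asDict() for row in freq.collect()])
# [{'value': 'a', 'count': 3}, {'value': 'b', 'count': 2}, {'value': 'c', 'count': 1}]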
Example #11
    def columns(df, columns, buckets=10):
        """
        Return statistical information about specific columns in JSON format
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets: Number of buckets used for the histogram
        :return: JSON object with the profiling information for each column
        """

        columns = parse_columns(df, columns)

        # Get just a sample to infer the column data type
        # sample_size_number = sample_size(rows_count, 95.0, 2.0)
        # fraction = sample_size_number / rows_count
        # sample = df.sample(False, fraction, seed=1)

        # Initialize Objects
        column_info = {}
        column_info['columns'] = {}

        rows_count = df.count()
        column_info['rows_count'] = rows_count

        count_dtypes = Profiler.count_data_types(df, columns)

        column_info["count_types"] = count_dtypes["count_types"]
        column_info['size'] = human_readable_bytes(df.size())

        def na(col_name):
            return F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))

        def zeros(col_name):
            return F.count(F.when(F.col(col_name) == 0, col_name))

        # Cast every column to a specific type to ensure the correct profiling
        # For example if we calculate the min or max of a string column with numeric values the result will be incorrect
        for col_name in columns:
            dtype = count_dtypes["columns"][col_name]['dtype']
            # Do not force date type conversion; we cannot trust that it will be representative
            if dtype in ["string", "float", "int", "bool"]:
                df = df.cols.cast(col_name, dtype)

        stats = df.cols._exprs(
            [F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct, na,
             zeros],
            columns)

        for col_name in columns:
            logging.info("Processing column '" + col_name + "'...")

            col_info = {}
            col_info["stats"] = {}
            column_info['columns'][col_name] = {}

            column_type = count_dtypes["columns"][col_name]['type']
            col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype']

            na = stats[col_name]["na"]
            max_value = stats[col_name]["max"]
            min_value = stats[col_name]["min"]

            col_info['name'] = col_name
            col_info['column_type'] = column_type

            # Numeric Column
            if column_type == "numeric" or column_type == "date":
                # Merge
                col_info["stats"] = stats[col_name]

            # Missing
            col_info['stats']['missing_count'] = round(na, 2)
            col_info['stats']['p_missing'] = round(na / rows_count * 100, 2)
            col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details']

            if column_type == "categorical" or column_type == "numeric" or column_type == "date" or column_type == "bool":
                # Frequency

                col_info['frequency'] = collect_as_dict(df.groupBy(col_name)
                                                        .count()
                                                        .rows.sort([("count", "desc"), (col_name, "desc")])
                                                        .limit(10)
                                                        .withColumn("percentage",
                                                                    F.round((F.col("count") / rows_count) * 100,
                                                                            3))
                                                        .cols.rename(col_name, "value")
                                                        .collect())
                # Uniques
                uniques = stats[col_name].pop("approx_count_distinct")
                col_info['stats']["uniques_count"] = uniques
                col_info['stats']["p_uniques"] = round(uniques / rows_count * 100, 3)

            if column_type == "numeric":
                # Additional Stats
                # Percentile cannot be computed as a normal sql.functions expression; approxQuantile needs an extra pass
                # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function
                max_value = fast_float(max_value)
                min_value = fast_float(min_value)
                col_info['stats']['quantile'] = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95])
                col_info['stats']['range'] = max_value - min_value
                col_info['stats']['median'] = col_info['stats']['quantile'][0.5]
                col_info['stats']['interquartile_range'] = col_info['stats']['quantile'][0.75] - \
                                                           col_info['stats']['quantile'][0.25]
                col_info['stats']['coef_variation'] = round((col_info['stats']['stddev'] / col_info['stats']['mean']),
                                                            5)
                col_info['stats']['mad'] = round(df.cols.mad(col_name), 5)

                col_info["hist"] = df.cols.hist(col_name, min_value, max_value, buckets)

            column_info['columns'][col_name] = col_info

        return column_info
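The percentile block above needs an extra pass over the data. With plain PySpark the same quantile, median and interquartile range figures can be sketched via DataFrame.approxQuantile:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(float(x),) for x in range(1, 101)], ["price"])

probs = [0.05, 0.25, 0.5, 0.75, 0.95]
# A relative error of 0.0 gives exact quantiles at the cost of a full pass over the data
quantiles = dict(zip(probs, df.approxQuantile("price", probs, 0.0)))

median = quantiles[0.5]
interquartile_range = quantiles[0.75] - quantiles[0.25]
print(quantiles, median, interquartile_range)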
Example #12
def table_html(self,
               limit=10,
               columns=None,
               title=None,
               full=False,
               truncate=True):
    """
    Return an HTML table with the dataframe cols, data types and values
    :param self:
    :param columns: Columns to be printed
    :param limit: How many rows will be printed
    :param title: Table title
    :param full: Include html header and footer
    :param truncate: Truncate the row information

    :return:
    """

    columns = parse_columns(self, columns)

    if limit is None:
        limit = 10

    if limit == "all":
        data = collect_as_dict(self.cols.select(columns))
    else:
        data = collect_as_dict(self.cols.select(columns).limit(limit))

    # Load the Jinja template
    template_loader = jinja2.FileSystemLoader(
        searchpath=absolute_path("/templates/out"))
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the column and data type info needed
    dtypes = []
    for i, j in zip(self.dtypes, self.schema):
        if i[1].startswith("array<struct"):
            dtype = "array<struct>"
        elif i[1].startswith("struct"):
            dtype = "struct"
        else:
            dtype = i[1]

        dtypes.append((i[0], dtype, j.nullable))

    # Remove not selected columns
    final_columns = []
    for i in dtypes:
        for j in columns:
            if i[0] == j:
                final_columns.append(i)

    total_rows = self.rows.approx_count()

    if limit == "all":
        limit = total_rows
    elif total_rows < limit:
        limit = total_rows

    total_rows = humanize.intword(total_rows)

    total_cols = self.cols.count()
    total_partitions = self.partitions()

    output = template.render(cols=final_columns,
                             data=data,
                             limit=limit,
                             total_rows=total_rows,
                             total_cols=total_cols,
                             partitions=total_partitions,
                             title=title,
                             truncate=truncate)

    if full is True:
        output = HEADER + output + FOOTER
    return output
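The dtype/nullability extraction only needs df.dtypes and df.schema. A plain-PySpark sketch of that loop (hypothetical schema):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, [(1, "a")])],
                           "id int, items array<struct<n:int,s:string>>")

dtypes = []
for (name, dtype), field in zip(df.dtypes, df.schema):
    if dtype.startswith("array<struct"):
        dtype = "array<struct>"        # collapse long nested type names for display
    elif dtype.startswith("struct"):
        dtype = "struct"
    dtypes.append((name, dtype, field.nullable))
print(dtypes)  # [('id', 'int', True), ('items', 'array<struct>', True)]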
Example #13
def to_json(self):
    return collect_as_dict(self.collect())
Example #14
        def _count_data_types(col_name):
            """
            Determine whether the values registered in a column are float, int or string.
            :param col_name:
            :return:
            """

            # If the column is a string, process the data to try to infer which data type is inside. This is a kind of
            # optimization: we do not need to analyze the data if the column data type is already integer, boolean, etc.

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            # Parse dtype
            if col_data_type == "smallint" or col_data_type == "tinyint":
                col_data_type = "int"
            elif col_data_type == "float" or col_data_type == "double":
                col_data_type = "decimal"
            elif col_data_type.find("array") >= 0:
                col_data_type = "array"

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":
                logger.print("Processing column '" + col_name + "'...")
                types = collect_as_dict(df
                                        .h_repartition(col_name=col_name)
                                        .withColumn(temp, fbdt(col_name, get_type=True))
                                        .groupBy(temp).count()
                                        )

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                # boolean columns do not support count_na
                if "count_na" in stats[col_name]:
                    nulls = stats[col_name]["count_na"]
                    count_by_data_type[col_data_type] = int(df_count) - nulls
                    count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Keep the null and empty-string (missing) counts alongside the per-type counts
            null_missed_count = {"null": count_by_data_type['null'],
                                 "missing": count_empty_strings,
                                 }
            # Get the greatest count by column data type
            greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

            if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
                cat = "categorical"
            elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "array":
                cat = "array"
            elif greatest_data_type_count == "binary":
                cat = "binary"
            elif greatest_data_type_count == "null":
                cat = "null"
            else:
                cat = None

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**count_by_data_type, **null_missed_count}

            return col
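fbdt() is Optimus's type-inference helper. A much-simplified, standalone sketch of the same step, with a hypothetical infer_type() UDF standing in for it:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("1",), ("2.5",), ("abc",), ("",), (None,)], ["value"])

@F.udf(returnType=StringType())
def infer_type(value):
    # Very rough inference; the real helper also covers dates, booleans, arrays, etc.
    if value is None:
        return "null"
    if value == "":
        return "missing"
    try:
        int(value)
        return "int"
    except ValueError:
        pass
    try:
        float(value)
        return "decimal"
    except ValueError:
        return "string"

df.withColumn("value_type", infer_type("value")).groupBy("value_type").count().show()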