Example 1
    def nest(input_cols, output_col, shape="string", separator=""):
        """
        Concatenate multiple columns into one with the specified format
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: character used as the separator when concatenating
        :param shape: final data type, 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """

        df = self

        if has_(input_cols, F.Column):
            # Transform non-Column values into literal columns
            columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
        else:
            columns = parse_columns(self, input_cols)

        if shape == "vector":
            columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

            vector_assembler = VectorAssembler(
                inputCols=columns,
                outputCol=output_col)
            df = vector_assembler.transform(df)

        elif shape == "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape == "string":
            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIt.value_error(shape, ["vector", "array", "string"])

        return df
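Below is a minimal plain-PySpark sketch of the same idea, assuming a running SparkSession named spark and two illustrative numeric columns "a" and "b" (names are not taken from the original code):

    from pyspark.sql import SparkSession, functions as F
    from pyspark.ml.feature import VectorAssembler

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["a", "b"])

    # shape="string": concatenate the columns with a separator
    df_str = df.withColumn("nested", F.concat_ws("-", "a", "b"))

    # shape="array": pack the columns into an array column
    df_arr = df.withColumn("nested", F.array("a", "b"))

    # shape="vector": assemble the numeric columns into an ML vector
    df_vec = VectorAssembler(inputCols=["a", "b"], outputCol="nested").transform(df)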
Example 2
    def _mad(self, action):
        """

               :type action:
               :return:
               """

        df = self.df
        columns = self.columns
        threshold = self.threshold

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)
        for c in columns:
            mad_value = df.cols.mad(c, more=True)
            lower_bound = mad_value["median"] - threshold * mad_value["mad"]
            upper_bound = mad_value["median"] + threshold * mad_value["mad"]

            if action == "select":
                df = df.rows.select((F.col(c) > upper_bound)
                                    | (F.col(c) < lower_bound))
            elif action == "drop":
                df = df.rows.drop((F.col(c) > upper_bound)
                                  | (F.col(c) < lower_bound))
        return df
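For reference, a self-contained sketch of the MAD bounds in plain PySpark, with an illustrative column "x" and an arbitrary threshold:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1.0,), (2.0,), (2.5,), (100.0,)], ["x"])
    threshold = 3

    # median and MAD = median(|x - median|), both via approxQuantile with zero relative error
    median = df.approxQuantile("x", [0.5], 0)[0]
    mad = df.withColumn("dev", F.abs(F.col("x") - F.lit(median))).approxQuantile("dev", [0.5], 0)[0]

    lower_bound, upper_bound = median - threshold * mad, median + threshold * mad
    outliers = df.filter((F.col("x") > upper_bound) | (F.col("x") < lower_bound))   # action == "select"
    cleaned = df.filter((F.col("x") >= lower_bound) & (F.col("x") <= upper_bound))  # action == "drop"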
Example 3
    def decision_tree(df, columns, input_col, **kargs):
        """
        Runs a decision tree classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted decision tree model.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        assert isinstance(input_col,
                          str), "Error, input column must be a string"

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = DecisionTreeClassifier(**kargs)

        df = df.cols.rename([(input_col + "_index", "label")])

        dt_model = model.fit(df)
        df_model = dt_model.transform(df)
        return df_model, dt_model
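The string_to_index and vector_assembler helpers appear to wrap standard Spark ML stages; a minimal sketch of the same flow with the stock API, using made-up column names:

    from pyspark.sql import SparkSession
    from pyspark.ml.feature import StringIndexer, VectorAssembler
    from pyspark.ml.classification import DecisionTreeClassifier

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0, "yes"), (3.0, 4.0, "no"), (5.0, 1.0, "yes")],
                               ["f1", "f2", "target"])

    # Index the label column and assemble the feature vector expected by the classifier
    df = StringIndexer(inputCol="target", outputCol="label").fit(df).transform(df)
    df = VectorAssembler(inputCols=["f1", "f2"], outputCol="features").transform(df)

    dt_model = DecisionTreeClassifier().fit(df)   # uses "features"/"label" by default
    df_model = dt_model.transform(df)             # adds the prediction columns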
Example 4
    def hist(columns, min_value, max_value, buckets=10):
        """
        Get the histogram of a column in JSON format
        :param columns: Columns to be processed
        :param min_value: Min value used to calculate the buckets
        :param max_value: Max value used to calculate the buckets
        :param buckets: Number of buckets
        :return:
        """

        columns = parse_columns(self, columns)
        for col_name in columns:
            # Create splits
            splits = create_buckets(min_value, max_value, buckets)

            # Create buckets in the dataFrame
            df = bucketizer(self, col_name, splits=splits)

            counts = (df.groupBy(col_name + "_buckets").agg(
                F.count(col_name + "_buckets").alias("count")).cols.rename(
                    col_name + "_buckets",
                    "value").sort(F.asc("value")).to_json())

            hist = []
            for x, y in zip(counts, splits):
                # if x["value"] is not None and x["count"] != 0:
                hist.append({
                    "lower": y["lower"],
                    "upper": y["upper"],
                    "count": x["count"]
                })

        return hist
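A rough equivalent with Spark ML's Bucketizer, assuming equal-width splits and an illustrative column "x":

    from pyspark.sql import SparkSession, functions as F
    from pyspark.ml.feature import Bucketizer

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1.0,), (2.0,), (7.0,), (9.0,)], ["x"])

    # Equal-width splits between the min and max values, then count rows per bucket
    splits = [0.0, 2.5, 5.0, 7.5, 10.0]
    bucketed = Bucketizer(splits=splits, inputCol="x", outputCol="x_buckets").transform(df)
    rows = bucketed.groupBy("x_buckets").agg(F.count("*").alias("count")) \
                   .orderBy("x_buckets").collect()

    hist = [{"lower": splits[int(r["x_buckets"])],
             "upper": splits[int(r["x_buckets"]) + 1],
             "count": r["count"]} for r in rows]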
Example 5
    def apply_by_dtypes(columns,
                        func,
                        func_return_type,
                        args=None,
                        func_type=None,
                        data_type=None):
        """
        Apply a function using pandas udf or udf if apache arrow is not available
        :param columns: Columns in which the function is going to be applied
        :param func: Function to be applied to the columns
        :param func_return_type: function return type. This is required by UDF and Pandas UDF.
        :param args: Arguments to be passed to the function
        :param func_type: pandas_udf or udf. If None, try to use pandas_udf (PyArrow needed)
        :param data_type: only apply the function to values of this data type
        :return:
        """
        columns = parse_columns(self, columns)

        for c in columns:
            df = self.cols.apply(c,
                                 func,
                                 func_return_type,
                                 args=args,
                                 func_type=func_type,
                                 when=fbdt(c, data_type))
        return df
Example 6
def table_html(self, limit=100, columns=None):
    """
    Return an HTML table with the dataframe columns, data types and values
    :param self:
    :param columns: Columns to be printed
    :param limit: how many rows will be printed
    :return:
    """

    columns = parse_columns(self, columns)

    data = self.select(columns).limit(limit).to_json()

    # Load template
    path = os.path.dirname(os.path.abspath(__file__))
    template_loader = jinja2.FileSystemLoader(searchpath=path +
                                              "//../templates")
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the column and data type info that is needed
    dtypes = list(filter(lambda x: x[0] in columns, self.dtypes))

    total_rows = self.count()
    if total_rows < limit:
        limit = total_rows

    # Print table
    output = template.render(cols=dtypes,
                             data=data,
                             limit=limit,
                             total_rows=total_rows,
                             total_cols=self.cols.count())
    return output
Example 7
    def nest(input_cols, output_col, shape=None, separator=" "):
        """
        Concatenate multiple columns into one with the specified format
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: character used as the separator when concatenating
        :param shape: final data type, 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """
        columns = parse_columns(self, input_cols)
        df = self

        if shape == "vector":
            vector_assembler = VectorAssembler(inputCols=input_cols,
                                               outputCol=output_col)
            df = vector_assembler.transform(self)

        elif shape == "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape == "string":

            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIfNot.value_error(shape, ["vector", "array", "string"])

        return df
Example 8
    def percentile(columns, values=None, error=1):
        """
        Return the percentile of a dataframe
        :param columns:  '*', list of columns names or a single column name.
        :param values: list of percentiles to be calculated
        :param error: relative error passed to approxQuantile
        :return: percentiles per columns
        """
        start_time = timeit.default_timer()

        if values is None:
            values = [0.05, 0.25, 0.5, 0.75, 0.95]

        columns = parse_columns(self, columns)

        # Get percentiles
        percentile_results = []
        for c in columns:
            percentile_per_col = self \
                .rows.drop_na(c) \
                .cols.cast(c, "double") \
                .approxQuantile(c, values, error)

            percentile_results.append(dict(zip(values, percentile_per_col)))

        percentile_results = dict(zip(columns, percentile_results))

        logging.info("percentile")
        logging.info(timeit.default_timer() - start_time)
        return format_dict(percentile_results)
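The heavy lifting is done by DataFrame.approxQuantile(column, probabilities, relativeError); a minimal standalone example with an illustrative column:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(float(i),) for i in range(1, 101)], ["x"])

    values = [0.05, 0.25, 0.5, 0.75, 0.95]
    # The third argument is the relative error; 0 gives exact (but more expensive) quantiles
    quantiles = df.approxQuantile("x", values, 0.01)
    result = dict(zip(values, quantiles))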
Example 9
    def _iqr(self, action):
        """
        Select or drop outliers
        :param action: 'select' or 'drop'
        :return:
        """
        df = self.df
        columns = self.columns

        if not is_dataframe(self.df):
            raise TypeError("Spark Dataframe expected")

        columns = parse_columns(self.df, columns)

        for col_name in columns:
            iqr = df.cols.iqr(col_name, more=True)
            lower_bound = iqr["q1"] - (iqr["iqr"] * 1.5)
            upper_bound = iqr["q3"] + (iqr["iqr"] * 1.5)

            if action == "drop":
                df = df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
            elif action == "select":
                df = df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))

        return df
Example 10
    def z_score(df, columns, threshold=None):
        """
        Delete outliers using the z-score
        :param df: Spark DataFrame
        :param columns: columns to be processed
        :param threshold: z-score value above which a row is considered an outlier
        :return:
        """

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)

        for c in columns:
            # the z-score column name is always the prefix "z_col_" plus the column name
            z_col = "z_col_" + c

            df = df.cols.z_score(c) \
                .rows.drop(F.col(z_col) > threshold) \
                .cols.drop(z_col)

        return df
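A plain-PySpark sketch of the same z-score filtering, with an illustrative column "x":

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1.0,), (2.0,), (2.5,), (100.0,)], ["x"])
    threshold = 2

    stats = df.select(F.mean("x").alias("mu"), F.stddev("x").alias("sigma")).first()
    df = df.withColumn("z_col_x", F.abs((F.col("x") - stats["mu"]) / stats["sigma"]))

    # Drop rows whose z-score exceeds the threshold, then drop the helper column
    df = df.filter(F.col("z_col_x") <= threshold).drop("z_col_x")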
Example 11
    def random_forest(df, columns, input_col, **kargs):
        """
        Runs a random forest classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted random forest model.
        """

        columns = parse_columns(df, columns)

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = RandomForestClassifier(**kargs)

        df = df.cols.rename([(input_col + "_index", "label")])

        rf_model = model.fit(df)
        df_model = rf_model.transform(df)
        return df_model, rf_model
Example 12
    def _z_score(self, action):
        """
        Select or drop outliers using the z-score
        :param action: 'select' or 'drop'
        :return:
        """
        df = self.df
        columns = self.columns
        threshold = self.threshold

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_numeric(threshold):
            raise TypeError("Numeric expected")

        columns = parse_columns(df, columns)

        for col_name in columns:
            # the column with the z_col value is always the string z_col plus the name of column
            z_col_name = _z_score_col_name(col_name)

            if action == "drop":
                df = df.cols.z_score(col_name, z_col_name) \
                    .rows.drop(F.col(z_col_name) > threshold) \
                    .cols.drop(z_col_name)

            elif action == "select":
                df = df.cols.z_score(col_name, z_col_name) \
                    .rows.select(F.col(z_col_name) > threshold) \
                    .cols.drop(z_col_name)

        return df
Example 13
    def parquet(path, mode="overwrite", num_partitions=1):
        """
        Save data frame to a parquet file
        :param path: path where the dataframe will be saved.
        :param mode: Specifies the behavior of the save operation when data already exists.
                    "append": Append contents of this DataFrame to existing data.
                    "overwrite" (default case): Overwrite existing data.
                    "ignore": Silently ignore this operation if data already exists.
                    "error": Throw an exception if data already exists.
        :param num_partitions: the number of partitions of the DataFrame
        :return:
        """
        # These characters are invalid in Parquet column names
        invalid_character = [
            " ", ",", ";", "{", "}", "(", ")", "\n", "\t", "="
        ]

        def func(col_name):
            for i in invalid_character:
                col_name = col_name.replace(i, "_")
            return col_name

        df = self.cols.rename(func)

        columns = parse_columns(self, "*", filter_by_column_dtypes=["null"])
        df = df.cols.cast(columns, "str")

        try:
            df.coalesce(num_partitions) \
                .write \
                .mode(mode) \
                .parquet(path)
        except IOError as e:
            logger.print(e)
            raise
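A standalone sketch of the same sanitize-then-write pattern, using a hypothetical output path:

    import re
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "a")], ["col one", "col;two"])

    # Replace the characters Parquet rejects in column names with underscores
    sanitized = [re.sub(r"[ ,;{}()\n\t=]", "_", c) for c in df.columns]
    df = df.toDF(*sanitized)

    df.coalesce(1).write.mode("overwrite").parquet("/tmp/example_parquet")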
Example 14
    def csv(path, header="true", mode="overwrite", sep=",", num_partitions=1):
        """
        Save data frame to a CSV file.
        :param path: path where the dataframe will be saved.
        :param header: True or False to include header
        :param mode: Specifies the behavior of the save operation when data already exists.
                    "append": Append contents of this DataFrame to existing data.
                    "overwrite" (default case): Overwrite existing data.
                    "ignore": Silently ignore this operation if data already exists.
                    "error": Throw an exception if data already exists.
        :param sep: sets the single character as a separator for each field and value. If None is set,
        it uses the default value.
        :param num_partitions: the number of partitions of the DataFrame
        :return: Writes the dataframe in CSV format to the specified path.
        """

        try:
            df = self
            columns = parse_columns(self,
                                    "*",
                                    filter_by_column_dtypes=[
                                        "date", "array", "vector", "binary",
                                        "null"
                                    ])
            df = df.cols.cast(columns, "str").repartition(num_partitions)

            # Save to csv
            df.write.options(header=header).mode(mode).csv(path, sep=sep)
        except IOError as error:
            logger.print(error)
            raise
Example 15
def fingerprint_cluster(df, columns):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df:
    :param columns: Columns to be processed
    :return:
    """
    # df = self.df
    columns = parse_columns(df, columns)

    for col_name in columns:
        output_col = col_name + "_FINGERPRINT"
        # Instead of applying the fingerprint to the whole data set, we group by values first
        df = (
            df.groupBy(col_name).count().select('count', col_name).repartition(
                1)  # Needed for optimization in a single machine
            .cache())
        # Calculate the fingerprint
        df = fingerprint(df, col_name)

        # Create cluster
        df = df.groupby(output_col).agg(
            F.collect_set(col_name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(col_name).alias("recommended"),
            F.size(F.collect_set(col_name)).alias("cluster_size")
        ) \
            .select("cluster_size", "cluster", "count", "recommended")
    return df
Example 16
    def count_na(columns):
        """
        Return the NaN and null count in a column
        :param columns: '*', list of columns names or a single column name.
        :return:
        """

        columns = parse_columns(self, columns)

        df = self
        expr = []
        for col_name in columns:
            # If the column type is Struct or Boolean, cast to string; isnan/isNull cannot handle them

            if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")
            expr.append(
                F.count(
                    F.when(
                        F.isnan(col_name) | F.col(col_name).isNull(),
                        col_name)).alias(col_name))

        result = format_dict(collect_as_dict(df.select(*expr).collect()))

        return result
Example 17
def fingerprint(df, columns):
    """
    Create the fingerprint for a column
    :param df:
    :param columns:
    :return:
    """
    def _split_sort_remove_join(value, args):
        """
        Helper function to split, remove duplicates, sort and join back together
        :param value:
        :param args:
        :return:
        """
        # Split into whitespace-separated tokens
        split_key = value.split()

        # Sort and remove duplicated tokens
        split_key = sorted(set(split_key))

        # Join the tokens back together
        return "".join(split_key)

    columns = parse_columns(df, columns)
    for col_name in columns:
        output_col = col_name + "_FINGERPRINT"
        df = (df.withColumn(output_col, F.col(col_name)).cols.trim(
            output_col).cols.lower(output_col).cols.remove_special_chars(
                output_col).cols.remove_accents(output_col).cols.apply(
                    output_col, _split_sort_remove_join,
                    "string").repartition(1).cache())
    return df
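The chained calls above boil down to a normalization key; a rough standalone equivalent with a plain UDF (the helper below is illustrative, not the library's implementation):

    import re
    import unicodedata
    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.types import StringType

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("  Café du Monde ",), ("cafe du MONDE",)], ["name"])

    @F.udf(StringType())
    def fingerprint_key(value):
        # Trim, lowercase, strip accents and special characters,
        # then split, dedupe, sort and join the tokens back together
        value = value.strip().lower()
        value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode()
        value = re.sub(r"[^a-z0-9 ]", "", value)
        return "".join(sorted(set(value.split())))

    # Both rows above end up with the same fingerprint, so they cluster together
    df = df.withColumn("name_FINGERPRINT", fingerprint_key("name"))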
Example 18
def n_gram_fingerprint_cluster(df, columns, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df:
    :param columns:
    :param n_size:
    :return:
    """
    columns = parse_columns(df, columns)
    for col_name in columns:
        n_gram_col = col_name + "_ngram_fingerprint"

        # Group first so we don't need to apply the fingerprint to the whole data set
        df = (
            df.select(col_name).groupBy(col_name).count().select(
                'count', col_name).repartition(
                    1)  # Needed for optimization in a single machine
            .cache())

        df = n_gram_fingerprint(df, col_name, n_size)
        # df.table()
        df = df.groupby(n_gram_col).agg(
            F.collect_set(col_name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(col_name).alias("recommended"),
            F.size(F.collect_set(col_name)).alias("cluster_size")).select(
                "cluster_size", "cluster", "count", "recommended")

    return df
Example 19
    def count_na(columns):
        """
        Return the NaN and null count in a column
        :param columns: '*', list of columns names or a single column name.
        :return:
        """

        columns = parse_columns(self, columns)
        df = self
        expr = []

        for col_name in columns:
            # If the column type is Struct or Boolean, cast to string; isnan/isNull cannot handle them
            if is_(df.cols.schema_dtype(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")

            if is_(df.cols.schema_dtype(col_name), (float, int)):
                expr.append(F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)).alias(col_name))

            elif is_(df.cols.schema_dtype(col_name), (NullType)):
                expr.append(F.count(col_name).alias(col_name))

            else:
                expr.append(F.count(F.when(F.col(col_name).isNull(), col_name)).alias(col_name))

        result = format_dict(df.select(*expr).to_json())
        return result
Example 20
    def years_between(columns, date_format):
        """
        Compute the age in years based on a birth date column.
        :param columns: Name of the column containing the birth dates.
        :param date_format: Date string format of the provided column.
        """

        # Assert that the columns are in the dataframe
        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NOT_ARRAY_TYPES)

        # Output date format
        format_dt = "yyyy-MM-dd"  # Some SimpleDateFormat string

        def _years_between(_new_col_name, attr):
            _date_format = attr[0]
            _col_name = attr[1]

            return F.format_number(
                F.abs(
                    F.months_between(
                        F.date_format(
                            F.unix_timestamp(
                                _col_name,
                                _date_format).cast("timestamp"),
                            format_dt),
                        F.current_date()) / 12), 4) \
                .alias(
                _new_col_name)

        df = self
        for col_name in columns:
            new_col_name = col_name + "_years_between"
            df = df.cols.apply_expr(new_col_name, _years_between, [date_format, col_name]).cols.cast(new_col_name,
                                                                                                     "float")
        return df
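The same computation with built-in date functions, assuming an illustrative birth_date column in dd/MM/yyyy format:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("25/02/1990",), ("14/07/2005",)], ["birth_date"])
    date_format = "dd/MM/yyyy"

    # Parse the string into a date, then divide the month difference by 12
    df = df.withColumn(
        "birth_date_years_between",
        F.round(F.abs(F.months_between(F.current_date(),
                                       F.to_date("birth_date", date_format))) / 12, 4))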
Example 21
    def gbt(df, columns, input_col, **kargs):
        """
        Runs a gradient boosting tree classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted gradient boosting tree model.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = GBTClassifier(**kargs)

        df = df.cols.rename([(input_col + "_index", "label")])

        gbt_model = model.fit(df)
        df_model = gbt_model.transform(df)
        return df_model, gbt_model
Example 22
    def mode(columns):
        """
        Return the column mode
        :param columns: '*', list of columns names or a single column name.
        :return:
        """

        columns = parse_columns(self, columns)
        mode_result = []

        for col_name in columns:
            cnts = self.groupBy(col_name).count()
            mode_df = cnts.join(
                cnts.agg(F.max("count").alias("max_")), F.col("count") == F.col("max_")
            )

            # if none of the values is repeated, there is no mode
            mode_list = (mode_df
                         .rows.select(mode_df["count"] > 1)
                         .cols.select(col_name)
                         .collect())

            mode_result.append({col_name: filter_list(mode_list)})

        return mode_result
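A condensed version of the same groupBy/count approach in plain PySpark, with an illustrative column "x":

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("a",), ("a",), ("b",), ("c",)], ["x"])

    counts = df.groupBy("x").count()
    max_count = counts.agg(F.max("count").alias("max_")).first()["max_"]

    # The mode is any value whose count equals the maximum and appears more than once
    mode_values = [r["x"] for r in
                   counts.filter((F.col("count") == max_count) & (F.col("count") > 1)).collect()]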
Example 23
    def date_transform(columns, current_format, output_format):
        """
        Transform a column's date format
        :param columns: Columns to be transformed.
        :param current_format: current string date format of the specified columns. All specified columns must share
                               this format; otherwise the function will return null values, because the transformation
                               fails for columns with a different format.
        :param output_format: expected output date string format.
        """

        def _date_transform(_new_col_name, attr):
            _col_name = attr[0]
            _current_format = attr[1]
            _output_format = attr[2]

            return F.date_format(F.unix_timestamp(_col_name, _current_format).cast("timestamp"), _output_format).alias(
                _new_col_name)

        # Assert that the columns are in the dataframe
        columns = parse_columns(self, columns)
        df = self

        for col_name in columns:
            new_col_name = col_name + "_data_transform"
            df = df.cols.apply_expr(new_col_name, _date_transform, [col_name, current_format, output_format])

        return df
Example 24
    def mad(columns, more=None):
        """
        Return the Median Absolute Deviation
        :param columns: Column to be processed
        :param more: Return some extra computed values (Median).
        :return:
        """
        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        result = {}
        for col_name in columns:

            _mad = {}

            # MAD = median(|value - median(values)|)
            median_value = self.cols.median(col_name)

            mad_value = self.select(col_name) \
                .withColumn(col_name, F.abs(F.col(col_name) - median_value)) \
                .cols.median(col_name)

            if more:
                _mad = {"mad": mad_value, "median": median_value}
            else:
                _mad = {"mad": mad_value}

            result[col_name] = _mad

        return format_dict(result)
Example 25
    def percentile(columns, values=None, error=1):
        """
        Return the percentile of a dataframe
        :param columns:  '*', list of columns names or a single column name.
        :param values: list of percentiles to be calculated
        :param error: relative error passed to approxQuantile
        :return: percentiles per columns
        """

        if values is None:
            values = [0.05, 0.25, 0.5, 0.75, 0.95]

        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

        # Get percentiles
        percentile_results = []
        for c in columns:
            percentile_per_col = self \
                .rows.drop_na(c) \
                .cols.cast(c, "double") \
                .approxQuantile(c, values, error)

            percentile_results.append(dict(zip(values, percentile_per_col)))

        percentile_results = dict(zip(columns, percentile_results))

        return format_dict(percentile_results)
Example 26
    def apply(columns, func, func_return_type, args=None, func_type=None, when=None, filter_col_by_dtypes=None,
              verbose=True):
        """
        Apply a function using pandas udf or udf if apache arrow is not available
        :param columns: Columns in which the function is going to be applied
        :param func: Function to be applied to the columns. Its declaration must always have 2 params:
            def func(value, args):
        :param func_return_type: function return type. This is required by UDF and Pandas UDF.
        :param args: Arguments to be passed to the function
        :param func_type: pandas_udf or udf. If None, try to use pandas_udf (PyArrow needed)
        :param when: An expression giving finer control over when the function is applied
        :param filter_col_by_dtypes: Only apply the function to columns of the given types: integer, float, string or bool
        :param verbose: Print additional information
        :return: DataFrame
        """

        columns = parse_columns(self, columns, filter_by_column_dtypes=filter_col_by_dtypes, accepts_missing_cols=True)

        df = self

        def expr(_when):
            main_query = audf(c, func, func_return_type, args, func_type, verbose=verbose)
            if when is not None:
                # Use the data type to filter the query
                main_query = F.when(_when, main_query).otherwise(F.col(c))

            return main_query

        for c in columns:
            df = df.withColumn(c, expr(when))
        return df
Example 27
    def apply_expr(columns, func=None, args=None, filter_col_by_dtypes=None, verbose=True):
        """
        Apply an expression to a column.
        :param columns: Columns in which the function is going to be applied
        :param func: function to be applied
        :type func: A plain expression or a function
        :param args: Argument passed to the function
        :param filter_col_by_dtypes: Only apply the function to columns of the given types: integer, float, string or bool
        :param verbose: Print additional information
        :return: Dataframe
        """

        # Handle whether the func param is a plain expression or a function returning an expression
        def func_col_exp(col_name, attr):
            return func

        if is_(func, F.Column):
            _func = func_col_exp
        else:
            _func = func

        columns = parse_columns(self, columns, filter_by_column_dtypes=filter_col_by_dtypes, accepts_missing_cols=True)

        df = self
        for col_name in columns:
            df = df.withColumn(col_name, audf(col_name, _func, attrs=args, func_type="column_exp", verbose=verbose))
        return df
Example 28
    def _exprs(funcs, columns):
        """
        Helper function to apply multiple columns expression to multiple columns
        :param funcs: Aggregation functions from Apache Spark
        :param columns: list of column names or a single column name.
        :return:
        """
        def parse_col_names_funcs_to_keys(data):
            """
            Helper function that returns formatted json with function:value pairs per column. Transforms from
            {'max_antiguedad_anos': 15,
            'max_m2_superficie_construida': 1800000,
            'min_antiguedad_anos': 2,
            'min_m2_superficie_construida': 20}

            to

            {'m2_superficie_construida': {'min': 20, 'max': 1800000}, 'antiguedad_anos': {'min': 2, 'max': 15}}

            :param data: json data
            :return: json
            """
            functions_array = [
                "min", "max", "stddev", "kurtosis", "mean", "skewness", "sum",
                "variance", "approx_count_distinct", "na", "zeros",
                "percentile"
            ]
            result = {}
            if is_dict(data):
                for k, v in data.items():
                    for f in functions_array:
                        temp_func_name = f + "_"
                        if k.startswith(temp_func_name):
                            _col_name = k[len(temp_func_name):]
                            result.setdefault(_col_name, {})[f] = v
                return result
            else:
                return data

        columns = parse_columns(self, columns)

        # Ensure that it is a list
        funcs = val_to_list(funcs)

        df = self

        # Parse the columns to float. It seems Spark can run some aggregations on string columns,
        # which gives unexpected results
        # df = df.cols.cast(columns, "float")

        # Create a Column Expression for every column
        exprs = []
        for col_name in columns:
            for func in funcs:
                exprs.append(
                    func(col_name).alias(func.__name__ + "_" + col_name))

        return (parse_col_names_funcs_to_keys(
            format_dict(df.agg(*exprs).to_json())))
Example 29
def correlation(self,
                columns,
                method="pearson",
                strategy="mean",
                output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column to float where necessary and impute
    missing values
    :param self:
    :param columns: Columns to be processed
    :param method: Method used to calculate the correlation
    :param strategy: Imputing strategy
    :param output: array or json
    :return:
    """
    columns = parse_columns(self, columns)
    # try to parse the select column to float and create a vector

    df = self
    for col_name in columns:
        df = df.cols.cast(col_name, "float")
        logging.info(
            "Casting {col_name} to float...".format(col_name=col_name))

    # Impute missing values
    imputed_cols = [c + "_imputed" for c in columns]
    df = df.cols.impute(columns, imputed_cols, strategy)
    logging.info("Imputing {columns}, Using '{strategy}'...".format(
        columns=columns, strategy=strategy))

    # Create Vector necessary to calculate the correlation
    df = df.cols.nest(imputed_cols, "features", "vector")

    corr = Correlation.corr(df, "features", method).head()[0].toArray()

    if output == "array":
        result = corr

    elif output == "json":

        # Parse result to json
        col_pair = []
        for col_name in columns:
            for col_name_2 in columns:
                col_pair.append({"between": col_name, "an": col_name_2})

        # flat array
        values = corr.flatten('F').tolist()

        result = []
        for n, v in zip(col_pair, values):
            # Skip the correlation of a column with itself
            if n["between"] != n["an"]:
                n["value"] = v
                result.append(n)

        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return result
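The core of the method is pyspark.ml.stat.Correlation over a vector column; a minimal standalone sketch with illustrative columns:

    from pyspark.sql import SparkSession
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.stat import Correlation

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0), (2.0, 4.1), (3.0, 6.2)], ["a", "b"])

    # Assemble the numeric columns into a single vector column, then correlate
    df = VectorAssembler(inputCols=["a", "b"], outputCol="features").transform(df)
    corr_matrix = Correlation.corr(df, "features", "pearson").head()[0].toArray()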
Example 30
    def variance(columns):
        """
        Return the column variance
        :param columns: '*', list of columns names or a single column name.
        :return:
        """
        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        return _exprs(F.variance, columns)