    def hist(self, col_name):
        # buckets = 20
        # min_value, max_value = self.df.cols.range(col_name)
        #
        # create_buckets(min_value, self.lower_bound, buckets)
        #
        # create_buckets(self.lower_bound, self.upper_bound, buckets)
        #
        # create_buckets(self.upper_bound, max_value, buckets)

        # lower bound
        lower_bound_hist = self.df.rows.select(
            self.df[col_name] < self.lower_bound).cols.hist(col_name, 20)

        # upper bound
        upper_bound_hist = self.df.rows.select(
            self.df[col_name] > self.upper_bound).cols.hist(col_name, 20)

        # Non outliers
        non_outlier_hist = self.df.rows.select(
            (F.col(col_name) >= self.lower_bound)
            & (F.col(col_name) <= self.upper_bound)).cols.hist(col_name, 20)

        result = {}
        if lower_bound_hist is not None:
            result.update(lower_bound_hist)
        if upper_bound_hist is not None:
            result.update(upper_bound_hist)
        if non_outlier_hist is not None:
            result.update(non_outlier_hist)

        return dump_json(result)
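A minimal usage sketch for the hist() method above. It assumes the method lives on an Optimus Tukey-style outlier helper reachable roughly as df.outliers.tukey(...); that accessor, the dataframe and the column name are assumptions for illustration, not part of the snippet.

# Hypothetical usage -- the outlier accessor, dataframe and column are assumptions.
outliers = df.outliers.tukey("price")   # assumed Tukey outlier helper bound to df
hist_json = outliers.hist("price")      # JSON string merging the lower-bound, upper-bound and in-range histograms
print(hist_json)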
Example #2
    def info(self, output: str = "dict"):
        """
        Get whiskers, iqrs and outliers and non outliers count
        :return:
        """

        lower_bound = self.lower_bound
        upper_bound = self.upper_bound

        q1 = self.q1
        median = self.median
        q3 = self.q3
        iqr = self.iqr

        result = {
            "count_outliers": self.count(),
            "count_non_outliers": self.non_outliers_count(),
            "lower_bound": lower_bound,
            "lower_bound_count": self.count_lower_bound(lower_bound),
            "upper_bound": upper_bound,
            "upper_bound_count": self.count_upper_bound(upper_bound),
            "q1": q1,
            "median": median,
            "q3": q3,
            "iqr": iqr
        }

        if output == "json":
            result = dump_json(result)
        return result
    def select_lower_bound(self):
        col_name = self.col_name
        sample = {
            "columns": [{
                "title": cols
            } for cols in val_to_list(self.col_name)],
            "value":
            self.df.rows.select(self.df[col_name] < self.lower_bound).limit(
                100).rows.to_list(col_name)
        }
        return dump_json(sample)
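The whisker bounds used throughout this example (lower_bound and upper_bound in info() and select_lower_bound()) are typically the Tukey fences built from the quartiles. A minimal sketch of that relationship follows; the 1.5 multiplier is the standard Tukey value and an assumption here, since the snippet only exposes the resulting bounds.

def tukey_fences(q1: float, q3: float, k: float = 1.5):
    # Standard Tukey rule (assumed): the whiskers sit k * IQR beyond the quartiles.
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

lower, upper = tukey_fences(q1=10.0, q3=30.0)   # -> (-20.0, 60.0)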
Example #4
def base_clustering_function(df, input_cols, output, func=None, args=None):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :return:
    """

    # df = self.df
    input_cols = parse_columns(df, input_cols)
    args = args if args is not None else []
    result = {}
    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)
        count_col = name_col(input_col, COUNT_COL)
        # Instead of applying the fingerprint to the whole dataset, group by value and count occurrences.
        # Use a per-column dataframe so the original df is preserved across iterations.
        df_col = (df.groupBy(input_col).agg(
            F.count(input_col).alias(count_col)).select(
                count_col, input_col)).h_repartition(1)

        # Column names for the clustering results
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        cluster_sum_col = name_col(input_col, CLUSTER_SUM_COL)
        cluster_count_col = name_col(input_col, CLUSTER_COUNT_COL)

        # Apply the clustering function
        df_col = func(df_col, *args)

        df_col = df_col.withColumn(cluster_col, F.create_map([input_col, count_col]))

        df_col = df_col.groupBy(output_col).agg(
            F.max(F.struct(F.col(count_col),
                           F.col(input_col))).alias(recommended_col),
            F.collect_list(cluster_col).alias(cluster_col),
            F.count(output_col).alias(cluster_count_col),
            F.sum(count_col).alias(cluster_sum_col))
        df_col = df_col.select(recommended_col + "." + input_col, cluster_col,
                               cluster_count_col, cluster_sum_col)

        for row in df_col.collect():
            _row = list(row.asDict().values())
            # Flatten the list of single-entry dicts into one dict
            flatted_dict = {
                k: v
                for element in _row[1] for k, v in element.items()
            }
            result[_row[0]] = {
                "similar": flatted_dict,
                "count": _row[2],
                "sum": _row[3]
            }

    if output == "json":
        result = dump_json(result)
    return result
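A hedged invocation sketch for base_clustering_function. keycollision.fingerprint is the grouping function referenced later in these examples; the dataframe, the column name, and exactly how fingerprint expects its arguments here are assumptions for illustration.

# Hypothetical call -- df and "name" are placeholders; args is forwarded as func(df, *args).
clusters = base_clustering_function(df, "name", output="dict",
                                    func=keycollision.fingerprint, args=["name"])
# clusters -> {recommended_value: {"similar": {...}, "count": n, "sum": m}, ...}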
Example #5
    def profile(self,
                columns="*",
                bins: int = MAX_BUCKETS,
                output: str = None,
                flush: bool = False,
                size=False):
        """
        Return a dict the profile of the dataset
        :param columns:
        :param bins:
        :param output:
        :param flush:
        :param size: get the dataframe size in memory. Use with caution this could be slow for big data frames.
        :return:
        """

        df = self.root

        meta = self.meta
        profile = Meta.get(meta, "profile")

        if not profile:
            flush = True

        if columns or flush:
            columns = parse_columns(df, columns) if columns else []
            transformations = Meta.get(meta, "transformations")

            calculate = False

            if flush or len(transformations):
                calculate = True

            else:
                for col in columns:
                    if col not in profile["columns"]:
                        calculate = True

            if calculate:
                df = df[columns].calculate_profile("*", bins, flush, size)
                profile = Meta.get(df.meta, "profile")
                self.meta = df.meta
            profile["columns"] = {
                key: profile["columns"][key]
                for key in columns
            }

        if output == "json":
            profile = dump_json(profile)

        return profile
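A usage sketch for profile(), assuming it is exposed directly on the dataframe; the accessor location, dataframe and column names are assumptions.

# Hypothetical usage -- accessor location, dataframe and columns are assumptions.
profile_dict = df.profile(columns=["price", "quantity"], bins=33)
print(list(profile_dict["columns"].keys()))                          # only the requested columns
profile_json = df.profile(columns="*", output="json", flush=True)    # force recalculation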
Example #6
    def info(self, output: str = "dict"):
        """
        Get whiskers, iqrs and outliers and non outliers count
        :return:
        """
        upper_bound, lower_bound = dict_filter(self.whiskers(),
                                               ["upper_bound", "lower_bound"])

        result = {
            "count_outliers": self.count(),
            "count_non_outliers": self.non_outliers_count(),
            "lower_bound": lower_bound,
            "lower_bound_count": self.count_lower_bound(lower_bound),
            "upper_bound": upper_bound,
            "upper_bound_count": self.count_upper_bound(upper_bound)
        }
        if output == "json":
            result = dump_json(result)
        return result
Example #7
    def hist(self, col_name: str):
        df = self.df
        # lower bound
        lower_bound_hist = df.rows.select(df[col_name] < self.lower_bound).cols.hist(col_name, 20)

        # upper bound
        upper_bound_hist = df.rows.select(df[col_name] > self.upper_bound).cols.hist(col_name, 20)

        # Non outliers
        non_outlier_hist = df.rows.select(
            (df[col_name] >= self.lower_bound) & (df[col_name] <= self.upper_bound)).cols.hist(col_name, 20)

        result = {}
        if lower_bound_hist is not None:
            result.update(lower_bound_hist)
        if upper_bound_hist is not None:
            result.update(upper_bound_hist)
        if non_outlier_hist is not None:
            result.update(non_outlier_hist)

        return dump_json(result)
def levenshtein_cluster(df,
                        input_col,
                        threshold: int = None,
                        output: str = "dict"):
    """
    Output the levenshtein distance in json format
    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :param threshold: number
    :param output: "dict" or "json"
    :return:
    """
    # Create fingerprint
    df_fingerprint = keycollision.fingerprint(df, input_col)

    # Names
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    count = "count"

    # Prepare the columns to calculate the cross join
    fingerprint_count = df_fingerprint.select(input_col, fingerprint_col).groupby(input_col) \
        .agg(F.first(input_col).alias(temp_col_1), F.first(fingerprint_col).alias(temp_col_2),
             F.count(input_col).alias(count)) \
        .select(temp_col_1, temp_col_2, count).collect()

    df = df_fingerprint.select(
        input_col,
        F.col(fingerprint_col).alias(temp_col_1),
        F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the Levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # Keep only, for each string, the pairs at the minimum distance
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    if threshold is None:
        where = ((F.col(distance_col) == 0) &
                 (F.col(temp_col_1) != F.col(temp_col_2)))
    else:
        where = (F.col(distance_col) == 0) | (F.col(distance_col) > threshold)

    df_r = (
        df.rows.drop(where).cols.replace(
            distance_col, 0, None,
            search_by="numeric").groupby(temp_col_1).agg(
                F.min(distance_col).alias(distance_r_col))
        # .cols.rename(distance_col, distance_r_col)
        .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2),
                                     F.count(temp_col_2)))

    # Map each fingerprint back to its original string and count
    kv_dict = {}
    for row in fingerprint_count:
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = {_row[0]: _row[2]}

    result = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        d = {}
        for i in _row[1]:
            key = list(kv_dict[i].keys())[0]
            value = list(kv_dict[i].values())[0]
            d[key] = value
        key = list(kv_dict[_row[0]].keys())[0]
        value = list(kv_dict[_row[0]].values())[0]
        d.update({key: value})
        result[key] = d

    # Calculate count and sum
    f = {}
    for k, v in result.items():
        _sum = 0
        for x, y in v.items():
            _sum = _sum + y
        f[k] = {"similar": v, "count": len(v), "sum": _sum}

    result = f
    if output == "json":
        result = dump_json(result)
    return result
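A usage sketch based on the signature above; the dataframe and column name are placeholders.

# Placeholders: df is a Spark dataframe with a "product" column.
clusters = levenshtein_cluster(df, "product", threshold=2, output="dict")
for recommended, info in clusters.items():
    print(recommended, info["count"], info["sum"], info["similar"])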
Example #9
    def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True,
                sample=10000, stats=True, format="dict", mismatch=None, advanced_stats=False):
        """
        Return the profiling data in json format
        :param df: Dataframe to be processed
        :param columns: column to calculate the histogram
        :param buckets: buckets on the histogram
        :param infer:
        :param relative_error:
        :param approx_count:
        :param sample: numbers of rows to retrieve with random sampling
        :param stats: calculate stats, if not only data table returned
        :param format: dict or json
        :param mismatch:
        :param advanced_stats:
        :return: dict or json
        """
        output_columns = self.output_columns
        cols_to_profile = self.cols_needs_profiling(df, columns)

        # Get the stats for all the columns
        if stats is True:
            # Are there columns to process?
            if cols_to_profile or not self.is_cached():
                rows_count = df.count()
                self.rows_count = rows_count
                self.cols_count = cols_count = len(df.columns)
                updated_columns = self.columns_stats(df, cols_to_profile, buckets, infer, relative_error, approx_count,
                                                     mismatch, advanced_stats)

                output_columns = update_dict(output_columns, updated_columns)

                assign(output_columns, "name", df.get_name(), dict)
                assign(output_columns, "file_name", df.get_meta("file_name"), dict)

                # Add the General data summary to the output
                data_set_info = {'cols_count': cols_count,
                                 'rows_count': rows_count,
                                 'size': humanize.naturalsize(df.size()),
                                 'sample_size': sample}

                assign(output_columns, "summary", data_set_info, dict)

                # Nulls
                total_count_na = 0
                for k, v in output_columns["columns"].items():
                    total_count_na = total_count_na + v["stats"]["count_na"]

                assign(output_columns, "summary.missing_count", total_count_na, dict)
                assign(output_columns, "summary.p_missing", round(total_count_na / self.rows_count * 100, 2))

            # TODO: drop, rename and move operation must affect the sample
            sample = {"columns": [{"title": cols} for cols in df.cols.names()],
                      "value": df.sample_n(sample).rows.to_list(columns)}

            assign(output_columns, "sample", sample, dict)

        actual_columns = output_columns["columns"]
        # Order columns
        output_columns["columns"] = dict(OrderedDict(
            {_cols_name: actual_columns[_cols_name] for _cols_name in df.cols.names() if
             _cols_name in list(actual_columns.keys())}))

        df = df.set_meta(value={})
        df = df.columns_meta(df.cols.names())

        # col_names = output_columns["columns"].keys()
        if format == "json":
            result = dump_json(output_columns)
        else:
            result = output_columns

        self.output_columns = output_columns
        df.set_meta("transformations.actions", {})

        return result
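A usage sketch for dataset(). The Profiler() instantiation is an assumption based on the method belonging to a profiler class; the dataframe is a placeholder.

# Hypothetical usage -- the Profiler class name and the dataframe are assumptions.
profiler = Profiler()
summary = profiler.dataset(df, columns="*", buckets=10, sample=500, format="json")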