Example #1
import math

import datasketches
import numpy as np

# ColumnSummary and InferredType are whylogs protobuf types; the import path
# below matches the whylogs v0 layout and is assumed here.
from whylogs.proto import ColumnSummary, InferredType


def entropy_from_column_summary(summary: ColumnSummary, histogram: datasketches.kll_floats_sketch):
    """
    Calculate the estimated entropy for a ColumnProfile, using its ColumnSummary.
    Can be used for both continuous and discrete types of data.

    Parameters
    ----------
    summary : ColumnSummary
        Protobuf summary message
    histogram: datasketches.kll_floats_sketch
        Data sketch for quantiles

    Returns
    -------
    entropy : float
        Estimated entropy value, or np.nan if the inferred data type of the
        column is neither categorical nor numeric

    """

    frequent_items = summary.frequent_items
    unique_count = summary.unique_count.estimate
    inferred_type = summary.schema.inferred_type.type
    total_count = summary.counters.count

    if inferred_type == InferredType.Type.FRACTIONAL:
        # A constant or near-empty column carries no information
        if histogram.get_min_value() == histogram.get_max_value() or histogram.get_n() <= 1:
            return 0
        # Approximate the distribution with a 100-point PMF from the KLL sketch
        bins = np.linspace(histogram.get_min_value(), histogram.get_max_value(), 100)
        pmf = np.array(histogram.get_pmf(bins))
        pmf = pmf[pmf > 0]  # drop empty bins so log() stays finite
        entropy = -np.sum(pmf * np.log(pmf))
        return entropy

    elif inferred_type in (InferredType.Type.INTEGRAL, InferredType.Type.STRING, InferredType.Type.BOOLEAN):
        if total_count == 0:
            return 0

        # Accumulate sum(p * log(p)) over the tracked frequent items
        entropy = 0
        for item in frequent_items.items:
            i_frequency = item.estimate / total_count
            entropy += i_frequency * np.log(i_frequency)

        # Values not captured by the frequent-items sketch are treated as one
        # residual mass of "singleton" values
        frequent_items_count = len(frequent_items.items)
        n_singles = unique_count - frequent_items_count
        if math.isclose(n_singles, 0.0, abs_tol=10e-3):
            return -entropy

        n_singles_frequency = n_singles / total_count
        entropy += n_singles_frequency * np.log(n_singles_frequency)
        return -entropy

    return np.nan
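To exercise the continuous (FRACTIONAL) branch in isolation, the same PMF-based entropy estimate can be run against a raw KLL sketch, with no whylogs ColumnSummary involved. This is an illustrative sketch on synthetic data and assumes the legacy datasketches accessor names (get_min_value / get_max_value) used above:

import numpy as np
from datasketches import kll_floats_sketch

# Feed synthetic data into a KLL sketch
sketch = kll_floats_sketch(256)
for v in np.random.normal(size=10_000):
    sketch.update(float(v))

# Same estimate as the FRACTIONAL branch above: entropy of a 100-point PMF
bins = np.linspace(sketch.get_min_value(), sketch.get_max_value(), 100)
pmf = np.array(sketch.get_pmf(bins))
pmf = pmf[pmf > 0]  # drop empty bins
print(-np.sum(pmf * np.log(pmf)))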
Example #2
import math

import numpy as np
from datasketches import kll_floats_sketch

# HistogramSummary is a whylogs protobuf type; the import path reflects the
# whylogs v0 layout and is assumed here.
from whylogs.proto import HistogramSummary

# Module-level constants from the original whylogs module; the values below
# are assumed defaults, shown only so the snippet is self-contained.
MAX_HIST_BUCKETS = 30
HIST_AVG_NUMBER_PER_BUCKET = 4.0


def _compute_kl_divergence_continuous_distributions(target_distribution: kll_floats_sketch, reference_distribution: kll_floats_sketch):
    """
    Calculates the estimated KL divergence for two continuous distributions.
    Uses the `datasketches.kll_floats_sketch` sketch to calculate the KL divergence based on the PMFs.
    Only applicable to continuous distributions.

    Parameters
    ----------
    target_distribution : datasketches.kll_floats_sketch
        The quantiles summary of the target feature's distribution.
    reference_distribution : datasketches.kll_floats_sketch
        The quantiles summary of the reference feature's distribution.

    Returns
    -------
    kl_divergence : float
        The estimated KL divergence between two continuous features.

    """
    almost_zero_probability_of_event = 10e-5
    # Discretize the target's range into 100 points and evaluate both PMFs on the same bins
    bins_target = np.linspace(target_distribution.get_min_value(), target_distribution.get_max_value(), 100)
    pmf_target = np.array(target_distribution.get_pmf(bins_target))

    # Floor empty reference bins to avoid division by zero in the ratio below
    pmf_reference = np.array(reference_distribution.get_pmf(bins_target))
    pmf_reference[pmf_reference == 0] = almost_zero_probability_of_event

    # KL(target || reference), skipping bins where the target has no mass
    kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))
    # Return a lightweight object (a dynamically created class) exposing .kl_divergence
    return type("Object", (), {"kl_divergence": kl_divergence})
def histogram_from_sketch(sketch: kll_floats_sketch,
                          max_buckets: int = None,
                          avg_per_bucket: int = None):
    """
    Generate a summary of a kll_floats_sketch, including a histogram

    Parameters
    ----------
    sketch : kll_floats_sketch
        Data sketch
    max_buckets : int
        Override the default maximum number of buckets
    avg_per_bucket : int
        Override the default target number of items per bucket.

    Returns
    -------
    histogram : HistogramSummary
        Protobuf histogram message
    """
    n = sketch.get_n()
    start = sketch.get_min_value()
    max_val = sketch.get_max_value()
    end = max_val
    if max_buckets is None:
        max_buckets = MAX_HIST_BUCKETS
    if avg_per_bucket is None:
        avg_per_bucket = HIST_AVG_NUMBER_PER_BUCKET

    if (n < 2) or (start == end):
        # Degenerate sketch (empty, single item, or constant column): emit one bucket
        dx = abs(start) * 1e-7
        end = start + dx
        bins = [start, end]
        counts = [n]
    else:
        # Include the max value in the right-most bin
        end += abs(end) * (1e-7)
        # Honor the resolved overrides rather than the module-level defaults
        n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
        width = (end - start) / n_buckets
        # Bin edges, including the right edge
        bins = [start + i * width for i in range(n_buckets + 1)]
        # Convert the sketch's Probability Mass Function into counts
        pmf = sketch.get_pmf(bins)
        counts = [round(p * n) for p in pmf]
        # get_pmf also reports the mass below the first and above the last edge;
        # both are ~0 here, so drop them
        counts = counts[1:-1]

    return HistogramSummary(
        start=start,
        end=end,
        width=0,
        counts=counts,
        max=max_val,
        min=start,
        bins=bins,
        n=n,
    )
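Hypothetical usage of histogram_from_sketch, assuming the whylogs HistogramSummary protobuf and the constants referenced above are importable in the local environment:

import numpy as np
from datasketches import kll_floats_sketch

sketch = kll_floats_sketch(256)
for v in np.random.exponential(2.0, 1_000):
    sketch.update(float(v))

hist = histogram_from_sketch(sketch, max_buckets=20)
print(hist.n, hist.min, hist.max)
print(list(hist.counts))  # at most max_buckets bucket counts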