Example no. 1
def _compute_kl_divergence_continuous_distributions(target_distribution: kll_floats_sketch, reference_distribution: kll_floats_sketch):
    """
    Calculates the estimated KL divergence for two continuous distributions.
    Uses `datasketches.kll_floats_sketch` summaries to calculate the KL divergence based on the estimated PMFs.
    Only applicable to continuous distributions.

    Parameters
    ----------
    target_distribution : datasketches.kll_floats_sketch
        The quantiles summary of the target feature's distribution.
    reference_distribution : datasketches.kll_floats_sketch
        The quantiles summary of the reference feature's distribution.

    Returns
    -------
    kl_divergence : float
        The estimated KL divergence between two continuous features.

    """
    almost_zero_probability_of_event = 10e-5
    bins_target = np.linspace(target_distribution.get_min_value(), target_distribution.get_max_value(), 100)
    pmf_target = np.array(target_distribution.get_pmf(bins_target))

    pmf_reference = np.array(reference_distribution.get_pmf(bins_target))
    pmf_reference[pmf_reference == 0] = almost_zero_probability_of_event

    kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))
    return type("Object", (), {"kl_divergence": kl_divergence})
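A minimal usage sketch, assuming NumPy, the datasketches Python bindings, and the function above in scope; the sample data and the sketch parameter k=256 are illustrative only.

import numpy as np
from datasketches import kll_floats_sketch

# Build two quantile sketches from illustrative samples (assumed data).
target_sketch = kll_floats_sketch(256)
reference_sketch = kll_floats_sketch(256)
for value in np.random.normal(0.0, 1.0, 10_000):
    target_sketch.update(float(value))
for value in np.random.normal(0.5, 1.2, 10_000):
    reference_sketch.update(float(value))

result = _compute_kl_divergence_continuous_distributions(target_sketch, reference_sketch)
print(result.kl_divergence)  # larger values indicate a larger shift between the two distributions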
def histogram_from_sketch(sketch: kll_floats_sketch,
                          max_buckets: int = None,
                          avg_per_bucket: int = None):
    """
    Generate a summary of a kll_floats_sketch, including a histogram

    Parameters
    ----------
    sketch : kll_floats_sketch
        Data sketch
    max_buckets : int
        Override the default maximum number of buckets
    avg_per_bucket : int
        Override the default target number of items per bucket.

    Returns
    -------
    histogram : HistogramSummary
        Protobuf histogram message
    """
    n = sketch.get_n()
    start = sketch.get_min_value()
    max_val = sketch.get_max_value()
    end = max_val
    if max_buckets is None:
        max_buckets = MAX_HIST_BUCKETS
    if avg_per_bucket is None:
        avg_per_bucket = HIST_AVG_NUMBER_PER_BUCKET

    if (n < 2) or (start == end):
        dx = abs(start) * 1e-7
        end = start + dx
        bins = [start, end]
        counts = [n]
    else:
        # Include the max value in the right-most bin
        end += abs(end) * (1e-7)
        # Include the right edge in the bin edges
        n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
        width = (end - start) / n_buckets
        # Calculate histograms from the Probability Mass Function
        bins = [start + i * width for i in range(n_buckets + 1)]
        pmf = sketch.get_pmf(bins)
        counts = [round(p * n) for p in pmf]
        counts = counts[1:-1]

    return HistogramSummary(
        start=start,
        end=end,
        width=0,
        counts=counts,
        max=max_val,
        min=start,
        bins=bins,
        n=n,
    )
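The core step is converting the sketch's PMF estimate into per-bucket counts. Below is a self-contained sketch of that step, using only the datasketches bindings; the two constants are illustrative stand-ins for MAX_HIST_BUCKETS and HIST_AVG_NUMBER_PER_BUCKET, which are defined elsewhere in the module.

import math
import numpy as np
from datasketches import kll_floats_sketch

MAX_HIST_BUCKETS = 30              # illustrative stand-in for the module constant
HIST_AVG_NUMBER_PER_BUCKET = 4.0   # illustrative stand-in for the module constant

sketch = kll_floats_sketch(256)
for value in np.random.uniform(0.0, 10.0, 1_000):   # illustrative data
    sketch.update(float(value))

n = sketch.get_n()
start, end = sketch.get_min_value(), sketch.get_max_value()
end += abs(end) * 1e-7  # include the max value in the right-most bin

n_buckets = min(math.ceil(n / HIST_AVG_NUMBER_PER_BUCKET), MAX_HIST_BUCKETS)
width = (end - start) / n_buckets
bins = [start + i * width for i in range(n_buckets + 1)]

# get_pmf with n_buckets + 1 split points returns n_buckets + 2 masses (below the first
# edge, one per bucket, above the last edge); dropping the first and last leaves one
# estimated count per bucket.
counts = [round(p * n) for p in sketch.get_pmf(bins)][1:-1]
print(n_buckets, len(counts), sum(counts))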
Example no. 3
def entropy_from_column_summary(summary: ColumnSummary, histogram: datasketches.kll_floats_sketch):
    """
    Calculate the estimated entropy for a ColumnProfile, using the ColumnSummary
    Can be used for both continuous and discrete types of data.

    Parameters
    ----------
    summary : ColumnSummary
        Protobuf summary message
    histogram: datasketches.kll_floats_sketch
        Data sketch for quantiles

    Returns
    -------
    entropy : float
        Estimated entropy value,
        or np.nan if the inferred data type of the column is not categorical or numeric

    """

    frequent_items = summary.frequent_items
    unique_count = summary.unique_count.estimate
    inferred_type = summary.schema.inferred_type.type
    total_count = summary.counters.count

    if inferred_type == InferredType.Type.FRACTIONAL:
        if histogram.get_min_value() == histogram.get_max_value() or histogram.get_n() <= 1:
            return 0
        bins = np.linspace(histogram.get_min_value(), histogram.get_max_value(), 100)
        pmf = histogram.get_pmf(bins)
        pmf = list(filter(lambda x: x > 0, pmf))
        entropy = -np.sum(pmf * np.log(pmf))
        return entropy

    elif inferred_type in (InferredType.Type.INTEGRAL, InferredType.Type.STRING, InferredType.Type.BOOLEAN):
        if total_count == 0:
            return 0

        entropy = 0
        for item in frequent_items.items:
            i_frequency = item.estimate / total_count
            entropy += i_frequency * np.log(i_frequency)

        frequent_items_count = len(frequent_items.items)
        n_singles = unique_count - frequent_items_count
        if math.isclose(n_singles, 0.0, abs_tol=10e-3):
            return -entropy

        n_singles_frequency = n_singles / total_count
        entropy += n_singles_frequency * np.log(n_singles_frequency)
        return -entropy

    return np.nan
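A minimal sketch of the continuous (FRACTIONAL) branch in isolation, assuming only NumPy and the datasketches bindings; the discrete branch needs a populated ColumnSummary protobuf and is not reproduced here.

import numpy as np
from datasketches import kll_floats_sketch

histogram = kll_floats_sketch(256)
for value in np.random.normal(0.0, 1.0, 5_000):   # illustrative data
    histogram.update(float(value))

# Same recipe as the FRACTIONAL branch above: estimate the PMF over 100 evenly spaced
# split points, drop empty buckets, then apply the discrete entropy formula -sum(p*log(p)).
bins = np.linspace(histogram.get_min_value(), histogram.get_max_value(), 100)
pmf = np.array([p for p in histogram.get_pmf(bins) if p > 0])
entropy = -np.sum(pmf * np.log(pmf))
print(entropy)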
Example no. 4
def quantiles_from_sketch(sketch: kll_floats_sketch, quantiles=None):
    """
    Calculate quantiles from a data sketch

    Parameters
    ----------
    sketch : kll_floats_sketch
        Data sketch
    quantiles : list-like
        Override the default quantiles.  Should be a list of values from
        0 to 1 inclusive.

    Returns
    -------
    summary : QuantileSummary
        Protobuf quantile summary message
    """
    if quantiles is None:
        quantiles = QUANTILES
    qvals = sketch.get_quantiles(quantiles)
    return QuantileSummary(
        quantiles=quantiles,
        quantile_values=qvals,
    )
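The heavy lifting is a single get_quantiles call on the sketch. A self-contained usage sketch follows, with an illustrative quantile list standing in for the module's QUANTILES default and the QuantileSummary wrapper omitted.

import numpy as np
from datasketches import kll_floats_sketch

sketch = kll_floats_sketch(256)
for value in np.random.exponential(1.0, 2_000):   # illustrative data
    sketch.update(float(value))

# Equivalent to quantiles_from_sketch(sketch, quantiles=[0.25, 0.5, 0.75]),
# minus the QuantileSummary protobuf wrapper.
quantiles = [0.25, 0.5, 0.75]
qvals = sketch.get_quantiles(quantiles)
print(dict(zip(quantiles, qvals)))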
Example no. 5
def single_quantile_from_sketch(sketch: kll_floats_sketch, quantile: float):
    """
    Calculate the specified quantile from a data sketch

    Parameters
    ----------
    sketch : kll_floats_sketch
        Data sketch
    quantile : float
        The single quantile to compute.  Should be a value from
        0 to 1 inclusive.

    Returns
    -------
    Anonymous object with one field, `quantile`, equal to the quantile value
    """
    if quantile is None:
        raise ValueError("The quantile value is required and should be of type float")
    qval = sketch.get_quantiles([quantile])
    return type("Object", (), {"quantile": qval[0]})
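A short usage sketch (the sketch contents are illustrative); note that the return value is an anonymous class whose quantile attribute holds the requested value.

from datasketches import kll_floats_sketch

sketch = kll_floats_sketch(256)
for value in range(1, 101):          # illustrative data: 1..100
    sketch.update(float(value))

median = single_quantile_from_sketch(sketch, 0.5)
print(median.quantile)  # roughly 50 for this data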
Example no. 6
def ks_test_compute_p_value(target_distribution: kll_floats_sketch, reference_distribution: kll_floats_sketch):
    """
    Compute the Kolmogorov-Smirnov test p-value of two continuous distributions.
    Uses the quantile values and the corresponding CDFs to calculate the approximate KS statistic.
    Only applicable to continuous distributions.
    The null hypothesis expects the samples to come from the same distribution.

    Parameters
    ----------
    target_distribution : datasketches.kll_floats_sketch
        A kll_floats_sketch (quantiles sketch) from the target distribution's values
    reference_distribution : datasketches.kll_floats_sketch
        A kll_floats_sketch (quantiles sketch) from the reference (expected) distribution's values
        Can be generated from a theoretical distribution, or another sample for the same feature.

    Returns
    -------
    p_value : float
        The estimated p-value from the parametrized KS test, applied on the target and reference distributions'
        kll_floats_sketch summaries

    """

    D_max = 0
    target_quantile_values = target_distribution.get_quantiles(QUANTILES)
    ref_quantile_values = reference_distribution.get_quantiles(QUANTILES)

    num_quantiles = len(QUANTILES)
    i, j = 0, 0
    while i < num_quantiles and j < num_quantiles:

        if target_quantile_values[i] < ref_quantile_values[j]:
            current_quantile = target_quantile_values[i]
            i += 1
        else:
            current_quantile = ref_quantile_values[j]
            j += 1

        cdf_target = target_distribution.get_cdf([current_quantile])[0]
        cdf_ref = reference_distribution.get_cdf([current_quantile])[0]
        D = abs(cdf_target - cdf_ref)
        if D > D_max:
            D_max = D

    while i < num_quantiles:
        cdf_target = target_distribution.get_cdf([target_quantile_values[i]])[0]
        cdf_ref = reference_distribution.get_cdf([target_quantile_values[i]])[0]
        D = abs(cdf_target - cdf_ref)
        if D > D_max:
            D_max = D
        i += 1

    while j < num_quantiles:
        cdf_target = target_distribution.get_cdf([ref_quantile_values[j]])[0]
        cdf_ref = reference_distribution.get_cdf([ref_quantile_values[j]])[0]
        D = abs(cdf_target - cdf_ref)
        if D > D_max:
            D_max = D
        j += 1

    m, n = sorted([target_distribution.get_n(), reference_distribution.get_n()], reverse=True)
    en = m * n / (m + n)

    p_value = stats.distributions.kstwo.sf(D_max, np.round(en))

    return type("Object", (), {"ks_test": p_value})
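A usage sketch that compares the sketch-based approximation with scipy's exact two-sample test on the same illustrative samples; it assumes the module's QUANTILES constant is in scope for the function above.

import numpy as np
from datasketches import kll_floats_sketch
from scipy import stats

target_sample = np.random.normal(0.0, 1.0, 20_000)      # illustrative data
reference_sample = np.random.normal(0.3, 1.0, 20_000)   # deliberately shifted

target_sketch, reference_sketch = kll_floats_sketch(256), kll_floats_sketch(256)
for value in target_sample:
    target_sketch.update(float(value))
for value in reference_sample:
    reference_sketch.update(float(value))

approx = ks_test_compute_p_value(target_sketch, reference_sketch)
exact = stats.ks_2samp(target_sample, reference_sample)
print(approx.ks_test, exact.pvalue)  # both p-values should be very small for shifted samples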