def _compute_kl_divergence_continuous_distributions(target_distribution: kll_floats_sketch, reference_distribution: kll_floats_sketch):
    """
    Calculates the estimated KL divergence for two continuous distributions.
    Uses the `datasketches.kll_floats_sketch` sketch to calculate the KL divergence based on the PMFs.
    Only applicable to continuous distributions.

    Parameters
    ----------
    target_distribution : datasketches.kll_floats_sketch
        The quantiles summary of the target feature's distribution.
    reference_distribution : datasketches.kll_floats_sketch
        The quantiles summary of the reference feature's distribution.

    Returns
    -------
    kl_divergence : float
        The estimated KL divergence between two continuous features.
    """
    # 10e-5 == 1e-4: floor substituted for empty reference bins so the ratio
    # inside the log never divides by zero.
    almost_zero_probability_of_event = 10e-5

    # Bin edges are derived from the *target* distribution's range only; mass of
    # the reference outside that range lands in the open-ended edge bins of get_pmf.
    bins_target = np.linspace(target_distribution.get_min_value(), target_distribution.get_max_value(), 100)
    pmf_target = np.array(target_distribution.get_pmf(bins_target))
    pmf_reference = np.array(reference_distribution.get_pmf(bins_target))

    # Replace zero reference probabilities so log(p/q) stays finite.
    pmf_reference[pmf_reference == 0] = almost_zero_probability_of_event

    # BUG FIX: the previous np.where(...) form still evaluated
    # pmf_target * np.log(pmf_target / pmf_reference) for bins where
    # pmf_target == 0, producing 0 * log(0) = nan in the discarded branch and
    # emitting spurious RuntimeWarnings. Masking first gives the identical sum
    # (terms with zero target mass contribute 0 to KL) without the warnings.
    nonzero = pmf_target != 0
    kl_divergence = np.sum(pmf_target[nonzero] * np.log(pmf_target[nonzero] / pmf_reference[nonzero]))

    # Preserve the original (unusual) contract: return an anonymous class whose
    # `kl_divergence` attribute carries the result.
    return type("Object", (), {"kl_divergence": kl_divergence})
def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
    """
    Generate a summary of a kll_floats_sketch, including a histogram

    Parameters
    ----------
    sketch : kll_floats_sketch
        Data sketch
    max_buckets : int
        Override the default maximum number of buckets
    avg_per_bucket : int
        Override the default target number of items per bucket.

    Returns
    -------
    histogram : HistogramSummary
        Protobuf histogram message
    """
    n = sketch.get_n()
    start = sketch.get_min_value()
    max_val = sketch.get_max_value()
    end = max_val
    if max_buckets is None:
        max_buckets = MAX_HIST_BUCKETS
    if avg_per_bucket is None:
        avg_per_bucket = HIST_AVG_NUMBER_PER_BUCKET

    if (n < 2) or (start == end):
        # Degenerate sketch (empty, single item, or constant values): emit a
        # single tiny bin so downstream consumers always see a valid histogram.
        dx = abs(start) * 1e-7
        end = start + dx
        bins = [start, end]
        counts = [n]
    else:
        # Include the max value in the right-most bin by nudging the right edge
        end += abs(end) * (1e-7)
        # BUG FIX: previously this used the module constants
        # HIST_AVG_NUMBER_PER_BUCKET / MAX_HIST_BUCKETS directly, silently
        # ignoring the caller's max_buckets / avg_per_bucket overrides.
        n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
        width = (end - start) / n_buckets
        # Calculate histogram counts from the Probability Mass Function
        bins = [start + i * width for i in range(n_buckets + 1)]
        pmf = sketch.get_pmf(bins)
        counts = [round(p * n) for p in pmf]
        # get_pmf returns len(bins)+1 masses; drop the open-ended buckets
        # below `start` and above `end`.
        counts = counts[1:-1]

    return HistogramSummary(
        start=start,
        end=end,
        width=0,
        counts=counts,
        max=max_val,
        min=start,
        bins=bins,
        n=n,
    )
def entropy_from_column_summary(summary: ColumnSummary, histogram: datasketches.kll_floats_sketch):
    """
    Calculate the estimated entropy for a ColumnProfile, using the ColumnSummary
    Can be used for both continuous and discrete types of data.

    Parameters
    ----------
    summary : ColumnSummary
        Protobuf summary message
    histogram: datasketches.kll_floats_sketch
        Data sketch for quantiles

    Returns
    -------
    entropy : float
        Estimated entropy value,
        np.nan if the inferred data type of the column is not categorical or numeric
    """
    freq_items = summary.frequent_items
    n_unique = summary.unique_count.estimate
    dtype = summary.schema.inferred_type.type
    n_total = summary.counters.count

    if dtype == InferredType.Type.FRACTIONAL:
        # Continuous case: differential-style entropy over a 100-edge PMF.
        # A constant or near-empty sketch carries no information.
        if histogram.get_min_value() == histogram.get_max_value() or histogram.get_n() <= 1:
            return 0
        edges = np.linspace(histogram.get_min_value(), histogram.get_max_value(), 100)
        probs = [p for p in histogram.get_pmf(edges) if p > 0]
        return -np.sum(probs * np.log(probs))

    if dtype in (InferredType.Type.INTEGRAL, InferredType.Type.STRING, InferredType.Type.BOOLEAN):
        # Discrete case: Shannon entropy from the frequent-items summary, with
        # every remaining unique value treated as a singleton occurrence.
        if n_total == 0:
            return 0
        acc = sum((item.estimate / n_total) * np.log(item.estimate / n_total) for item in freq_items.items)
        n_singles = n_unique - len(freq_items.items)
        if math.isclose(n_singles, 0.0, abs_tol=10e-3):
            return -acc
        singles_freq = n_singles / n_total
        acc += singles_freq * np.log(singles_freq)
        return -acc

    # Unsupported inferred type: entropy is undefined here.
    return np.nan