def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
    """
    Generate a summary of a kll_floats_sketch, including a histogram

    Parameters
    ----------
    sketch : kll_floats_sketch
        Data sketch
    max_buckets : int
        Override the default maximum number of buckets
    avg_per_bucket : int
        Override the default target number of items per bucket.

    Returns
    -------
    histogram : HistogramSummary
        Protobuf histogram message
    """
    n = sketch.get_n()
    start = sketch.get_min_value()
    max_val = sketch.get_max_value()
    end = max_val
    if max_buckets is None:
        max_buckets = MAX_HIST_BUCKETS
    if avg_per_bucket is None:
        avg_per_bucket = HIST_AVG_NUMBER_PER_BUCKET
    if (n < 2) or (start == end):
        # Degenerate sketch (empty, single item, or constant values):
        # synthesize one tiny bin so the histogram message stays well-formed.
        dx = abs(start) * 1e-7
        end = start + dx
        bins = [start, end]
        counts = [n]
    else:
        # Include the max value in the right-most bin
        end += abs(end) * (1e-7)  # Include the right edge in the bin edges
        # BUG FIX: honor the max_buckets / avg_per_bucket overrides resolved
        # above; previously the module-level defaults were always used here,
        # making both parameters dead.
        n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
        width = (end - start) / n_buckets
        # Calculate histograms from the Probability Mass Function
        bins = [start + i * width for i in range(n_buckets + 1)]
        pmf = sketch.get_pmf(bins)
        counts = [round(p * n) for p in pmf]
        # get_pmf returns mass below the first edge and above the last;
        # drop those two out-of-range entries.
        counts = counts[1:-1]
    return HistogramSummary(
        start=start,
        end=end,
        width=0,
        counts=counts,
        max=max_val,
        min=start,
        bins=bins,
        n=n,
    )
def entropy_from_column_summary(summary: ColumnSummary, histogram: datasketches.kll_floats_sketch):
    """
    Calculate the estimated entropy for a ColumnProfile, using the ColumnSummary
    Can be used for both continuous and discrete types of data.

    Parameters
    ----------
    summary : ColumnSummary
        Protobuf summary message
    histogram: datasketches.kll_floats_sketch
        Data sketch for quantiles

    Returns
    -------
    entropy : float
        Estimated entropy value,
        np.nan if the inferred data type of the column is not categorical or numeric
    """
    inferred_type = summary.schema.inferred_type.type
    total_count = summary.counters.count

    if inferred_type == InferredType.Type.FRACTIONAL:
        # Continuous data: entropy from the sketch's probability mass function
        # over 100 evenly spaced split points.
        if histogram.get_min_value() == histogram.get_max_value() or histogram.get_n() <= 1:
            return 0
        bins = np.linspace(histogram.get_min_value(), histogram.get_max_value(), 100)
        probabilities = [p for p in histogram.get_pmf(bins) if p > 0]
        return -np.sum(probabilities * np.log(probabilities))

    if inferred_type in (InferredType.Type.INTEGRAL, InferredType.Type.STRING, InferredType.Type.BOOLEAN):
        if total_count == 0:
            return 0
        frequent_items = summary.frequent_items
        # Accumulate -sum(p * log p) over the tracked frequent items.
        entropy = sum(
            (item.estimate / total_count) * np.log(item.estimate / total_count)
            for item in frequent_items.items
        )
        # Items not tracked as frequent are treated as singletons sharing the
        # leftover unique-count mass; skip when that leftover is ~zero.
        n_singles = summary.unique_count.estimate - len(frequent_items.items)
        if not math.isclose(n_singles, 0.0, abs_tol=10e-3):
            n_singles_frequency = n_singles / total_count
            entropy += n_singles_frequency * np.log(n_singles_frequency)
        return -entropy

    return np.nan
def ks_test_compute_p_value(target_distribution: kll_floats_sketch, reference_distribution: kll_floats_sketch):
    """
    Compute the Kolmogorov-Smirnov test p-value of two continuous distributions.

    Uses the quantile values and the corresponding CDFs to calculate the approximate KS statistic.
    Only applicable to continuous distributions.
    The null hypothesis expects the samples to come from the same distribution.

    Parameters
    ----------
    target_distribution : datasketches.kll_floats_sketch
        A kll_floats_sketch (quantiles sketch) from the target distribution's values
    reference_distribution : datasketches.kll_floats_sketch
        A kll_floats_sketch (quantiles sketch) from the reference (expected) distribution's values
        Can be generated from a theoretical distribution, or another sample for the same feature.

    Returns
    -------
    p_value : float
        The estimated p-value from the parametrized KS test, applied on the target and reference
        distributions' kll_floats_sketch summaries
    """
    target_quantile_values = target_distribution.get_quantiles(QUANTILES)
    ref_quantile_values = reference_distribution.get_quantiles(QUANTILES)
    num_quantiles = len(QUANTILES)

    def cdf_gap(value):
        # |F_target(value) - F_reference(value)| estimated from the two sketches.
        cdf_target = target_distribution.get_cdf([value])[0]
        cdf_ref = reference_distribution.get_cdf([value])[0]
        return abs(cdf_target - cdf_ref)

    D_max = 0
    i = j = 0
    # Merge-walk both sorted quantile lists, tracking the largest CDF gap
    # observed at each evaluation point.
    while i < num_quantiles and j < num_quantiles:
        if target_quantile_values[i] < ref_quantile_values[j]:
            current_quantile = target_quantile_values[i]
            i += 1
        else:
            current_quantile = ref_quantile_values[j]
            j += 1
        D_max = max(D_max, cdf_gap(current_quantile))
    # Drain whichever quantile list still has unvisited points.
    while i < num_quantiles:
        D_max = max(D_max, cdf_gap(target_quantile_values[i]))
        i += 1
    while j < num_quantiles:
        D_max = max(D_max, cdf_gap(ref_quantile_values[j]))
        j += 1

    # Effective sample size for the two-sample KS distribution.
    m, n = sorted([target_distribution.get_n(), reference_distribution.get_n()], reverse=True)
    en = m * n / (m + n)
    p_value = stats.distributions.kstwo.sf(D_max, np.round(en))
    return type("Object", (), {"ks_test": p_value})