def test_string_as_arrays_does_not_throw():
    """Tracking an array-looking string must not raise and must infer UNKNOWN.

    Fix: removed a stray no-op expression statement (``InferredType.Type``)
    that evaluated an attribute and discarded it.
    """
    c = ColumnProfile("col")
    data = "[0,0]"  # this string will be parsed as an array
    c.track(data)
    summary: ColumnSummary = c.to_summary()
    assert summary.schema.inferred_type.type == InferredType.Type.UNKNOWN
def test_summary():
    """An integer column's summary converts to the expected protobuf dict."""
    profile = ColumnProfile("col")
    for value in (1, 2, 3):
        profile.track(value)
    actual_val = message_to_dict(profile.to_summary())

    expected_val = {
        "counters": {
            "count": "3",
        },
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }

    # The top-level unique-count sketch is only approximately reproducible,
    # so compare it separately with a tolerance.
    expected_unique = {
        "estimate": 3.000000014901161,
        "lower": 3.0,
        "upper": 3.0001498026537594,
    }
    assert actual_val.pop("uniqueCount") == pytest.approx(expected_unique, 0.0001)

    # frequentItems ordering is ambiguous, so compare as unordered pairs.
    frequent = actual_val.pop("frequentItems")
    assert set(frequent.keys()) == {"items"}
    expected = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(frequent["items"]) == len(expected)
    observed = [(item["jsonValue"], item["estimate"]) for item in frequent["items"]]
    assert set(observed) == set(expected)

    # Everything that remains must match exactly.
    assert actual_val == expected_val
def test_mostly_nulls_inferred_type_not_null():
    """A column that is mostly null but has one real value must not infer NULL."""
    profile = ColumnProfile("col")
    values = [None, np.nan, None] * 3 + ["not a null val!"]
    for value in values:
        profile.track(value)
    summary = profile.to_summary()
    assert summary.schema.inferred_type.type != InferredType.Type.NULL
def test_null_count_and_inferred_type(data, nulls_expected, expected_type):
    """Tracked data yields the expected null count and inferred type.

    Fixes: (1) renamed from ``test_all_nulls_inferred_type_null`` — a second,
    parameterless function of that name exists in this module, so one of the
    two definitions shadowed the other and was never collected by pytest;
    (2) removed a stray no-op expression statement (``InferredType.Type``).

    NOTE(review): the (data, nulls_expected, expected_type) parameters are
    presumably supplied by a ``@pytest.mark.parametrize`` decorator — confirm
    it is present above this function in the full file.
    """
    c = ColumnProfile("col")
    for val in data:
        c.track(val)
    summary: ColumnSummary = c.to_summary()
    assert summary.counters.null_count.value == nulls_expected
    assert summary.schema.inferred_type.type == expected_type
def test_fallback_fallbacks_to_number_counter():
    """With an empty cardinality tracker, the unique-count estimate falls back
    to the number summary's unique count."""
    profile = ColumnProfile("test")
    for value in ("a", "b", 1.0, 2.0):
        profile.track(value)
    # Swap in a fresh (empty) HLL sketch after tracking to force the fallback path.
    profile.cardinality_tracker = HllSketch()
    summary = profile.to_summary()
    assert summary.unique_count.estimate == summary.number_summary.unique_count.estimate
def test_fallback_number_counter():
    """Mixed numeric/text data still falls back to the number summary's
    unique count when the cardinality tracker is empty."""
    profile = ColumnProfile("test")
    for value in (1, 1.0, 2, 3, 4, 5, 6, 6.0, "text"):
        profile.track(value)
    # Swap in a fresh (empty) HLL sketch after tracking to force the fallback path.
    profile.cardinality_tracker = HllSketch()
    summary = profile.to_summary()
    assert summary.unique_count.estimate == summary.number_summary.unique_count.estimate
def test_all_nulls_inferred_type_null():
    """A column containing only null-like values (None, NaN) infers type NULL."""
    import numpy as np
    from whylogs.proto import InferredType

    profile = ColumnProfile("col")
    for value in [None, np.nan, None] * 3:
        profile.track(value)
    summary = profile.to_summary()
    assert summary.schema.inferred_type.type == InferredType.Type.NULL
def test_copy_counters_null_count_in_schema_tracker():
    """Null counts are kept in the schema tracker, and a legacy protobuf
    ``null_count`` counter is merged into it on deserialization."""
    profile = ColumnProfile("test")
    for value in ("a", "b", None, "d", pd.NA, "f", 1.0, 2.0):
        profile.track(value)
    assert profile.schema_tracker.get_count(InferredType.Type.NULL) == 2
    # ensuring we can still access the value in summary mode
    assert profile.to_summary().counters.null_count.value == 2

    # Mimic a legal protobuf with null_count set
    msg: ColumnMessage = profile.to_protobuf()
    msg.counters.null_count.value = 2
    roundtrip = ColumnProfile.from_protobuf(msg)
    # The legacy counter (2) is folded into the tracker's own count (2) on load.
    assert roundtrip.schema_tracker.get_count(InferredType.Type.NULL) == 4
def test_summary_with_frequent_numbers():
    """An integer column's summary matches the expected dict, including the
    order-ambiguous frequentNumbers sketch.

    Fix: renamed from ``test_summary`` — this module defines ``test_summary``
    twice, so the duplicate definition shadowed the other and only one of the
    two tests was ever collected by pytest. Also consolidated the separate
    lookup-then-``pop`` of ``frequentNumbers`` into a single ``pop``.
    """
    c = ColumnProfile("col")
    for n in [1, 2, 3]:
        c.track(n)
    summary = c.to_summary()
    actual_val = message_to_dict(summary)
    expected_val = {
        "counters": {
            "count": "3",
        },
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }

    # Top-level unique count needs to be approximately equal
    expected_unique = {
        "estimate": 3.000000014901161,
        "lower": 3.0,
        "upper": 3.0001498026537594,
    }
    actual_unique = actual_val.pop("uniqueCount")
    assert actual_unique == pytest.approx(expected_unique, 0.0001)

    # Cannot do a straightforward comparison of frequent number counts, since
    # their orders can vary
    actual_freq = actual_val["numberSummary"].pop("frequentNumbers")
    counts = []
    for num_list in (actual_freq["longs"], actual_freq["doubles"]):
        for xi in num_list:
            val = xi["value"]
            if isinstance(val, str):
                # Parse JSON encoded int64
                val = json.loads(val)
            count = xi["estimate"]
            if isinstance(count, str):
                # Parse JSON encoded int64
                count = json.loads(count)
            counts.append((val, count))
    expected_counts = {(1, 1), (2, 1), (3, 1)}
    assert len(counts) == len(expected_counts)
    assert set(counts) == expected_counts

    # Cannot do a straightforward frequentItems count since order is ambiguous
    actual_freq = actual_val.pop("frequentItems")
    assert set(actual_freq.keys()) == {"items"}
    expected = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(actual_freq["items"]) == len(expected)
    counts = [(v["jsonValue"], v["estimate"]) for v in actual_freq["items"]]
    assert set(counts) == set(expected)

    # Compare the messages, excluding the frequent numbers counters
    assert actual_val == expected_val