# Example 1
def test_string_as_arrays_does_not_throw():
    """Tracking a string that parses as a JSON array must not raise.

    The value "[0,0]" is parseable as an array; the profile should absorb
    it and report the inferred type as UNKNOWN instead of crashing.
    """
    # NOTE: removed a leftover bare `InferredType.Type` expression that was a
    # no-op statement (likely a vestigial alias assignment).
    c = ColumnProfile("col")
    data = "[0,0]"  # this string will be parsed as an array
    c.track(data)
    summary: ColumnSummary = c.to_summary()
    assert summary.schema.inferred_type.type == InferredType.Type.UNKNOWN
# Example 2
def test_summary():
    """A three-integer column produces the expected summary message."""
    profile = ColumnProfile("col")
    for value in (1, 2, 3):
        profile.track(value)
    actual = message_to_dict(profile.to_summary())
    expected = {
        "counters": {
            "count": "3",
        },
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }

    # The top-level unique count is a sketch estimate, so compare approximately.
    assert actual.pop("uniqueCount") == pytest.approx(
        {
            "estimate": 3.000000014901161,
            "lower": 3.0,
            "upper": 3.0001498026537594,
        },
        0.0001,
    )

    # frequentItems ordering is ambiguous, so compare (value, count) pairs as sets.
    freq = actual.pop("frequentItems")
    assert set(freq.keys()) == {"items"}
    expected_pairs = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(freq["items"]) == len(expected_pairs)
    observed_pairs = [(item["jsonValue"], item["estimate"]) for item in freq["items"]]
    assert set(observed_pairs) == set(expected_pairs)

    # Everything that remains must match exactly.
    assert actual == expected
# Example 3
def test_mostly_nulls_inferred_type_not_null():
    """A column that is mostly null but has one real value is not NULL-typed."""
    profile = ColumnProfile("col")
    values = [None, np.nan, None] * 3 + ["not a null val!"]
    for item in values:
        profile.track(item)
    result = profile.to_summary()
    assert result.schema.inferred_type.type != InferredType.Type.NULL
# Example 4
def test_all_nulls_inferred_type_null(data, nulls_expected, expected_type):
    """Parametrized: tracked values yield the expected null count and type.

    Args (supplied by a pytest parametrize fixture):
        data: iterable of values to track.
        nulls_expected: expected `counters.null_count.value`.
        expected_type: expected inferred `InferredType.Type`.
    """
    # NOTE: removed a leftover bare `InferredType.Type` expression that was a
    # no-op statement (likely a vestigial alias assignment).
    c = ColumnProfile("col")
    for val in data:
        c.track(val)
    summary: ColumnSummary = c.to_summary()
    assert summary.counters.null_count.value == nulls_expected
    assert summary.schema.inferred_type.type == expected_type
# Example 5
def test_fallback_fallbacks_to_number_counter():
    """With an empty cardinality sketch, unique count falls back to numbers."""
    profile = ColumnProfile("test")
    for value in ("a", "b", 1.0, 2.0):
        profile.track(value)
    # Swap in a fresh, empty sketch to force the fallback code path.
    profile.cardinality_tracker = HllSketch()

    result = profile.to_summary()
    assert result.unique_count.estimate == result.number_summary.unique_count.estimate
# Example 6
def test_fallback_number_counter():
    """Mixed numeric/text input still falls back to the number counter."""
    profile = ColumnProfile("test")
    for value in (1, 1.0, 2, 3, 4, 5, 6, 6.0, "text"):
        profile.track(value)
    # Swap in a fresh, empty sketch to force the fallback code path.
    profile.cardinality_tracker = HllSketch()

    result = profile.to_summary()
    assert result.unique_count.estimate == result.number_summary.unique_count.estimate
# Example 7
def test_all_nulls_inferred_type_null():
    """A column containing only None/NaN values is inferred as NULL."""
    import numpy as np
    from whylogs.proto import InferredType

    profile = ColumnProfile("col")
    for value in [None, np.nan, None] * 3:
        profile.track(value)
    result = profile.to_summary()
    assert result.schema.inferred_type.type == InferredType.Type.NULL
# Example 8
def test_copy_counters_null_count_in_schema_tracker():
    """A legacy protobuf null_count is merged into the schema tracker on load."""
    profile = ColumnProfile("test")
    for value in ("a", "b", None, "d", pd.NA, "f", 1.0, 2.0):
        profile.track(value)
    assert profile.schema_tracker.get_count(InferredType.Type.NULL) == 2

    # The null count must still be visible through the summary view.
    assert profile.to_summary().counters.null_count.value == 2

    # Mimic a legal protobuf with null_count set.
    msg: ColumnMessage = profile.to_protobuf()
    msg.counters.null_count.value = 2

    # Deserializing adds the message's null_count on top of the tracker's own.
    restored = ColumnProfile.from_protobuf(msg)
    assert restored.schema_tracker.get_count(InferredType.Type.NULL) == 4
# Example 9
def test_summary():
    """Full summary of an integer column, including frequent-number checks."""
    profile = ColumnProfile("col")
    for value in (1, 2, 3):
        profile.track(value)
    actual = message_to_dict(profile.to_summary())
    expected = {
        "counters": {"count": "3"},
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }

    # The top-level unique count is a sketch estimate, so compare approximately.
    assert actual.pop("uniqueCount") == pytest.approx(
        {
            "estimate": 3.000000014901161,
            "lower": 3.0,
            "upper": 3.0001498026537594,
        },
        0.0001,
    )

    # Frequent numbers come back in arbitrary order; normalize each entry to a
    # (value, count) pair, decoding JSON-encoded int64 strings along the way.
    def _decode(raw):
        # protobuf serializes int64 fields as JSON strings.
        return json.loads(raw) if isinstance(raw, str) else raw

    freq_numbers = actual["numberSummary"].pop("frequentNumbers")
    number_pairs = [
        (_decode(entry["value"]), _decode(entry["estimate"]))
        for bucket in (freq_numbers["longs"], freq_numbers["doubles"])
        for entry in bucket
    ]
    expected_number_pairs = {(1, 1), (2, 1), (3, 1)}
    assert len(number_pairs) == len(expected_number_pairs)
    assert set(number_pairs) == expected_number_pairs

    # frequentItems ordering is likewise ambiguous; compare pairs as sets.
    freq_items = actual.pop("frequentItems")
    assert set(freq_items.keys()) == {"items"}
    expected_item_pairs = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(freq_items["items"]) == len(expected_item_pairs)
    item_pairs = [
        (entry["jsonValue"], entry["estimate"]) for entry in freq_items["items"]
    ]
    assert set(item_pairs) == set(expected_item_pairs)

    # Everything that remains must match exactly.
    assert actual == expected