Beispiel #1
0
def test_message_to_dict_returns_default_values():
    """Zero-valued fields appear in the dict whether they were set explicitly or left unset."""
    from_explicit_zeros = protobuf.message_to_dict(DoublesMessage(min=0, max=0, sum=0, count=10))
    from_unset_fields = protobuf.message_to_dict(DoublesMessage(count=10))

    # "count" is expected as a string while the other fields are floats.
    expected = {"min": 0.0, "max": 0.0, "sum": 0.0, "count": "10"}

    assert from_explicit_zeros == expected
    assert from_unset_fields == expected
Beispiel #2
0
def test_summary():
    """Check the StringTracker summary for a small sample that includes ``None`` records.

    The frequent items are compared as sorted data frames because their order
    in the summary should be treated as arbitrary.
    """
    import pandas as pd

    def _aligned(frame):
        # Drop the index left over from sorting and fix the column order so
        # that only the contents are compared.
        return frame.reset_index(drop=True).sort_index(axis=1)

    tracker = StringTracker()
    for record in ("one", "two", "three", "one", "one", "One", "six", None, None):
        tracker.update(record)

    sort_keys = ["value", "estimate"]

    expected = {
        "uniqueCount": {"estimate": 5.0, "upper": 5.0, "lower": 5.0},
        "frequent": {
            "items": [
                {"value": "one", "estimate": 3.0},
                {"value": "three", "estimate": 1.0},
                {"value": "six", "estimate": 1.0},
                {"value": "One", "estimate": 1.0},
                {"value": "two", "estimate": 1.0},
            ]
        },
    }
    # Take the items out of the dict so they can be compared order-insensitively.
    expected_items = pd.DataFrame(expected["frequent"].pop("items")).sort_values(sort_keys)

    actual = message_to_dict(tracker.to_summary())
    actual_items = pd.DataFrame(actual["frequent"].pop("items")).sort_values(sort_keys)

    assert expected == actual
    pd.testing.assert_frame_equal(_aligned(actual_items), _aligned(expected_items))
Beispiel #3
0
def test_summary():
    """Check the summary of a ColumnProfile that tracked the integers 1, 2 and 3.

    The top-level unique count is compared approximately and the frequent
    items are compared as a set; whatever remains must match exactly.
    """
    profile = ColumnProfile("col")
    for number in (1, 2, 3):
        profile.track(number)
    actual_val = message_to_dict(profile.to_summary())

    expected_val = {
        "counters": {"count": "3"},
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }

    # The top-level unique count only needs to be approximately equal.
    unique_reference = {
        "estimate": 3.000000014901161,
        "lower": 3.0,
        "upper": 3.0001498026537594,
    }
    assert actual_val.pop("uniqueCount") == pytest.approx(unique_reference, 0.0001)

    # The order of the frequent items is ambiguous, so compare them as a set.
    frequent = actual_val.pop("frequentItems")
    assert set(frequent.keys()) == {"items"}
    expected_pairs = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(frequent["items"]) == len(expected_pairs)
    observed_pairs = {(item["jsonValue"], item["estimate"]) for item in frequent["items"]}
    assert observed_pairs == set(expected_pairs)

    # With the two fields above removed, the rest must match exactly.
    assert actual_val == expected_val
def test_all_zeros_returns_summary_with_stats():
    """Profiling an all-zero array must still report min, max, stddev and mean.

    Also checks that the dict conversion of the summary agrees with its
    parsed JSON conversion.

    Raises:
        RuntimeError: if any of the required stats is absent from the number
            summary of column "0".
    """
    required_stats = ("min", "max", "stddev", "mean")
    zeros = np.zeros([100, 1])

    summary_msg = array_profile(zeros).to_summary()
    as_dict = message_to_dict(summary_msg)
    via_json = json.loads(message_to_json(summary_msg))

    number_summary = as_dict["columns"]["0"]["numberSummary"]
    missing_stats = [name for name in required_stats if name not in number_summary]
    if missing_stats:
        raise RuntimeError(f"Stats missing from number summary: {missing_stats}")

    assert as_dict == via_json
def test_summary():
    """Check the summary of a ColumnProfile that tracked the integers 1, 2 and 3.

    The top-level unique count is compared approximately; the frequent numbers
    and frequent items are compared as sets because their order can vary.
    Whatever remains must match exactly.
    """

    def _decode(raw):
        # int64 values may arrive JSON-encoded as strings; parse those and
        # pass everything else through unchanged.
        return json.loads(raw) if isinstance(raw, str) else raw

    profile = ColumnProfile("col")
    for number in (1, 2, 3):
        profile.track(number)
    actual_val = message_to_dict(profile.to_summary())

    expected_val = {
        "counters": {"count": "3"},
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }

    # The top-level unique count only needs to be approximately equal.
    unique_reference = {
        "estimate": 3.000000014901161,
        "lower": 3.0,
        "upper": 3.0001498026537594,
    }
    assert actual_val.pop("uniqueCount") == pytest.approx(unique_reference, 0.0001)

    # The frequent number counts cannot be compared directly because their
    # order can vary; collect (value, estimate) pairs instead.
    frequent_numbers = actual_val["numberSummary"].pop("frequentNumbers")
    number_pairs = []
    for bucket in (frequent_numbers["longs"], frequent_numbers["doubles"]):
        for entry in bucket:
            number_pairs.append((_decode(entry["value"]), _decode(entry["estimate"])))
    expected_counts = {(1, 1), (2, 1), (3, 1)}
    assert len(number_pairs) == len(expected_counts)
    assert set(number_pairs) == expected_counts

    # The order of the frequent items is ambiguous as well.
    frequent_items = actual_val.pop("frequentItems")
    assert set(frequent_items.keys()) == {"items"}
    expected_pairs = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(frequent_items["items"]) == len(expected_pairs)
    item_pairs = {(item["jsonValue"], item["estimate"]) for item in frequent_items["items"]}
    assert item_pairs == set(expected_pairs)

    # With the order-dependent fields removed, the rest must match exactly.
    assert actual_val == expected_val
Beispiel #6
0
def test_message_to_dict_equals_message_to_json():
    """The dict conversion of a message must equal its parsed JSON conversion."""
    message = DoublesMessage(min=0, max=1.0, sum=2.0, count=10)

    as_dict = protobuf.message_to_dict(message)
    via_json = json.loads(protobuf.message_to_json(message))

    assert as_dict == via_json
Beispiel #7
0
def test_summary():
    """Check the full StringTracker summary for a small sample that includes ``None`` records.

    Besides the unique count and frequent items, the expected summary covers
    the "length", "tokenLength" and "charPosTracker" sections. The frequent
    items are compared as sorted data frames, and the per-character
    "frequentNumbers" entries are removed from both sides before comparing.
    """
    import pandas as pd

    quantile_points = [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0]

    def _constant_stats(count, value, hist_end, freq_value):
        # Expected number summary for a series whose observations all equal
        # ``value``: ``count`` and ``freq_value`` are the string-encoded count
        # and integer value, ``hist_end`` is the upper histogram edge.
        return {
            "count": count,
            "min": value,
            "max": value,
            "mean": value,
            "stddev": 0.0,
            "isDiscrete": False,
            "histogram": {
                "start": value,
                "end": hist_end,
                "counts": [count],
                "max": value,
                "min": value,
                "bins": [value, hist_end],
                "n": count,
                "width": 0.0,
            },
            "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
            "quantiles": {
                "quantiles": list(quantile_points),
                "quantileValues": [value] * len(quantile_points),
            },
            "frequentNumbers": {
                "longs": [{"estimate": count, "value": freq_value, "rank": 0}],
                "doubles": [],
            },
        }

    def _aligned(frame):
        # Drop the index left over from sorting and fix the column order so
        # that only the contents are compared.
        return frame.reset_index(drop=True).sort_index(axis=1)

    tracker = StringTracker()
    for record in ("one", "two", "three", "one", "one", "One", "six", None, None):
        tracker.update(record)

    # NOTE: the order of the frequent "items" below should really be arbitrary.
    expected = {
        "uniqueCount": {"estimate": 5.0, "upper": 5.0, "lower": 5.0},
        "frequent": {
            "items": [
                {"value": "one", "estimate": 3.0},
                {"value": "three", "estimate": 1.0},
                {"value": "six", "estimate": 1.0},
                {"value": "One", "estimate": 1.0},
                {"value": "two", "estimate": 1.0},
            ]
        },
        "length": {
            "count": "7",
            "min": 3.0,
            "max": 5.0,
            "mean": 3.2857142857142856,
            "stddev": 0.7559289460184544,
            "isDiscrete": False,
            "histogram": {
                "start": 3.0,
                "end": 5.0000005,
                "counts": ["6", "1"],
                "max": 5.0,
                "min": 3.0,
                "bins": [3.0, 4.000000249999999, 5.0000005],
                "n": "7",
                "width": 0.0,
            },
            "uniqueCount": {"estimate": 2.0, "upper": 2.0, "lower": 2.0},
            "quantiles": {
                "quantiles": list(quantile_points),
                "quantileValues": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 5.0, 5.0],
            },
            "frequentNumbers": {
                "longs": [
                    {"estimate": "6", "value": "3", "rank": 0},
                    {"estimate": "1", "value": "5", "rank": 1},
                ],
                "doubles": [],
            },
        },
        "tokenLength": _constant_stats("7", 1.0, 1.0000001, "1"),
        "charPosTracker": {
            "characterList": "!#$%&()*+,-./0123456789?@[]^_abcdefghijklmnopqrstuvwyz{}",
            "charPosMap": {
                "i": _constant_stats("1", 1.0, 1.0000001, "1"),
                "t": _constant_stats("2", 0.0, 0.0, "0"),
                "s": _constant_stats("1", 0.0, 0.0, "0"),
                "n": _constant_stats("4", 1.0, 1.0000001, "1"),
                "h": _constant_stats("1", 1.0, 1.0000001, "1"),
                "o": {
                    "count": "5",
                    "min": 0.0,
                    "max": 2.0,
                    "mean": 0.4,
                    "stddev": 0.894427190999916,
                    "isDiscrete": False,
                    "histogram": {
                        "start": 0.0,
                        "end": 2.0000002,
                        "counts": ["4", "1"],
                        "max": 2.0,
                        "min": 0.0,
                        "bins": [0.0, 1.0000001, 2.0000002],
                        "n": "5",
                        "width": 0.0,
                    },
                    "uniqueCount": {"estimate": 2.0, "upper": 2.0, "lower": 2.0},
                    "quantiles": {
                        "quantiles": list(quantile_points),
                        "quantileValues": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0],
                    },
                    "frequentNumbers": {
                        "longs": [
                            {"estimate": "4", "value": "0", "rank": 0},
                            {"estimate": "1", "value": "2", "rank": 1},
                        ],
                        "doubles": [],
                    },
                },
                "NITL": _constant_stats("1", 2.0, 2.0000002, "2"),
                "w": _constant_stats("1", 1.0, 1.0000001, "1"),
                "e": {
                    "count": "6",
                    "min": 2.0,
                    "max": 4.0,
                    "mean": 2.5,
                    "stddev": 0.8366600265340756,
                    "isDiscrete": False,
                    "histogram": {
                        "start": 2.0,
                        "end": 4.0000004,
                        "counts": ["5", "1"],
                        "max": 4.0,
                        "min": 2.0,
                        "bins": [2.0, 3.0000002, 4.0000004],
                        "n": "6",
                        "width": 0.0,
                    },
                    "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
                    "quantiles": {
                        "quantiles": list(quantile_points),
                        "quantileValues": [2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 4.0, 4.0],
                    },
                    "frequentNumbers": {
                        "longs": [
                            {"estimate": "4", "value": "2", "rank": 0},
                            {"estimate": "1", "value": "4", "rank": 1},
                            {"estimate": "1", "value": "3", "rank": 2},
                        ],
                        "doubles": [],
                    },
                },
                "r": _constant_stats("1", 2.0, 2.0000002, "2"),
            },
        },
    }

    sort_keys = ["value", "estimate"]
    # Take the items out of the dict so they can be compared order-insensitively.
    expected_items = pd.DataFrame(expected["frequent"].pop("items")).sort_values(sort_keys)

    # Remove entries that, due to their statistical nature, differ between
    # systems. TODO: investigate whether the seeds can be fixed so the values
    # do not change from macOS to Ubuntu.
    for char_stats in expected["charPosTracker"]["charPosMap"].values():
        char_stats.pop("frequentNumbers")

    actual = message_to_dict(tracker.to_summary())
    actual_items = pd.DataFrame(actual["frequent"].pop("items")).sort_values(sort_keys)

    # Same removal as above, applied to the observed summary.
    for char_stats in actual["charPosTracker"]["charPosMap"].values():
        char_stats.pop("frequentNumbers")

    assert expected == actual
    pd.testing.assert_frame_equal(_aligned(actual_items), _aligned(expected_items))