Esempio n. 1
0
def test_character_pos_tracker():
    """Update a StringTracker with three strings and a null, then check its sub-trackers.

    Covers the frequent-items sketch, the length and token-length trackers,
    the theta sketch, the overall count and the per-character position map.
    """
    records = ["abc abc", "93341-1", "912254", None]
    present = [rec for rec in records if rec is not None]
    expected_count = len(present)
    expected_unique = len(set(present))

    tracker = StringTracker()
    for rec in records:
        tracker.update(rec)

    # Only the non-null records are expected to reach the frequent-items sketch.
    freq_items = tracker.items
    assert freq_items.get_num_active_items() == expected_unique
    assert freq_items.get_total_weight() == expected_count

    # 6.666 is consistent with the mean of the string lengths 7, 7 and 6.
    length_mean = tracker.length.ints.mean()
    assert pytest.approx(length_mean, 0.001) == 6.666

    # 1.333 is consistent with token counts of 2, 1 and 1 for the three strings.
    token_mean = tracker.token_length.ints.mean()
    assert pytest.approx(token_mean, 0.001) == 1.333

    unique_estimate = tracker.theta_sketch.get_result().get_estimate()
    assert unique_estimate == float(expected_unique)
    assert tracker.count == expected_count

    assert tracker.char_pos_tracker is not None
    assert tracker.length.count == 3

    # Position statistics recorded for individual characters.
    positions = tracker.char_pos_tracker.char_pos_map
    assert positions["a"].histogram.get_min_value() == 0
    assert positions["a"].count == 2
    assert positions["-"].histogram.get_min_value() == 5
Esempio n. 2
0
def test_unicode_character_pos():
    """Track a multi-code-point emoji string with a one-character ``character_list``.

    Only the fencer emoji is listed, so it gets its own entry in the position
    map; the remaining five positions are expected under the "NITL" key.
    """
    tracker = StringTracker()
    tracker.update("👷‍♀️ 🤺", character_list="🤺")

    positions = tracker.char_pos_tracker.char_pos_map
    fencer = positions["🤺"]

    assert positions["NITL"].count == 5
    assert fencer.count == 1
    assert fencer.histogram.get_min_value() == 5
Esempio n. 3
0
def test_merge_character_pos():
    """Merge two StringTrackers and check the merged character-position stats.

    Each tracker is first checked on its own, then ``x.merge(y)`` is expected
    to report position ranges and counts that cover both inputs.
    """
    first_batch = ["abc abc", "93341-1", "912254", "bac tralalala"]
    second_batch = [
        "geometric inference ",
        "93341-1",
        "912254",
        "bac tralalala",
        "😀 this is a sale! a ❄️ sale!",
        "this is a long sentence that ends in an A",
        None,
    ]

    left = StringTracker()
    right = StringTracker()
    for rec in first_batch:
        left.update(rec)
    for rec in second_batch:
        right.update(rec)

    # Stats of each tracker before merging.
    left_map = left.char_pos_tracker.char_pos_map
    assert left_map["NITL"].count == 2
    assert left_map["NITL"].histogram.get_max_value() == 3
    assert left_map["NITL"].histogram.get_min_value() == 3

    assert left_map["a"].histogram.get_max_value() == 12
    assert left_map["a"].histogram.get_min_value() == 0

    right_map = right.char_pos_tracker.char_pos_map
    assert right_map["NITL"].count == 22
    assert right_map["NITL"].histogram.get_max_value() == 39

    merged = left.merge(right)
    merged_map = merged.char_pos_tracker.char_pos_map

    assert merged_map["a"].histogram.get_max_value() == 40

    # The merged "NITL" count is the sum of both inputs (2 + 22).
    merged_nitl = merged_map["NITL"]
    assert merged_nitl.histogram.get_max_value() == 39
    assert merged_nitl.histogram.get_min_value() == 0
    assert merged_nitl.count == 24
Esempio n. 4
0
def test_merge_mod_character_lists():
    """Merge two StringTrackers that were updated with different ``character_list`` values.

    One tracker is fed with ``character_list="ab"`` and the other with
    ``character_list="a"``; the merged tracker must still expose combined
    position stats for "a" and for the "NITL" key.
    """
    first_batch = ["abc abc", "93341-1", "912254", "bac tralalala"]
    second_batch = [
        "geometric inference ",
        "93341-1",
        "912254",
        "bac tralalala",
        "😀 this is a sale! a ❄️ sale!",
        "this is a long sentence that ends in an A",
        None,
    ]

    left = StringTracker()
    right = StringTracker()
    for rec in first_batch:
        left.update(rec, character_list="ab")
    for rec in second_batch:
        right.update(rec, character_list="a")

    # Stats of each tracker before merging.
    left_map = left.char_pos_tracker.char_pos_map
    assert left_map["NITL"].count == 23
    assert left_map["NITL"].histogram.get_max_value() == 11
    assert left_map["a"].histogram.get_max_value() == 12

    right_map = right.char_pos_tracker.char_pos_map
    assert right_map["NITL"].count == 102
    assert right_map["a"].histogram.get_max_value() == 40
    assert right_map["NITL"].histogram.get_max_value() == 39

    merged = left.merge(right)
    merged_map = merged.char_pos_tracker.char_pos_map

    # The merged "NITL" count is the sum of both inputs (23 + 102).
    assert merged_map["a"].histogram.get_max_value() == 40
    assert merged_map["NITL"].histogram.get_max_value() == 39
    assert merged_map["NITL"].count == 125
    assert merged.token_length.histogram.get_max_value() == 10
Esempio n. 5
0
def test_tracking():
    """Check counts, uniqueness and frequent items for a stream with repeats and nulls."""
    records = ["one", "two", "three", "one", "one", "One", "six", None, None]
    present = [rec for rec in records if rec is not None]
    expected_count = len(present)
    expected_unique = len(set(present))

    tracker = StringTracker()
    for rec in records:
        tracker.update(rec)

    freq_items = tracker.items
    assert freq_items.get_num_active_items() == expected_unique
    assert freq_items.get_total_weight() == expected_count

    # With a threshold of 2 only "one" (seen three times) should be reported.
    error_type = datasketches.frequent_items_error_type.NO_FALSE_NEGATIVES
    heavy_hitters = tracker.items.get_frequent_items(error_type, 2)
    assert [("one", 3, 3, 3)] == heavy_hitters

    unique_estimate = tracker.theta_sketch.get_result().get_estimate()
    assert unique_estimate == float(expected_unique)
    assert tracker.count == expected_count

    # Character positions are tracked case-insensitively: the "O" in "One"
    # is counted together with the four lowercase "o" occurrences.
    assert tracker.char_pos_tracker.char_pos_map["o"].count == 5
Esempio n. 6
0
def test_string_tracker_merge():
    """Merging two StringTrackers must combine their token-length ranges."""
    short_records = ["one", "two", "three", "one", "one", "One test", "six", None, None]
    short_tracker = StringTracker()
    for rec in short_records:
        short_tracker.update(rec)

    short_tokens = short_tracker.token_length.histogram
    assert short_tokens.get_max_value() == 2
    assert short_tokens.get_min_value() == 1

    # A single ten-token sentence: min and max token length coincide.
    long_tracker = StringTracker()
    long_tracker.update("this is a long sentence that ends in an A")

    long_tokens = long_tracker.token_length.histogram
    assert long_tokens.get_max_value() == 10
    assert long_tokens.get_min_value() == 10

    # The merged histogram spans the minimum of one input and the maximum of the other.
    merged = long_tracker.merge(short_tracker)
    merged_tokens = merged.token_length.histogram
    assert merged_tokens.get_max_value() == 10
    assert merged_tokens.get_min_value() == 1
Esempio n. 7
0
def test_protobuf():
    """Round-trip a StringTracker through protobuf and compare key statistics."""
    records = ["one", "two", "three", "one", "one", "One", "six", None, None]
    original = StringTracker()
    for rec in records:
        original.update(rec)

    restored = StringTracker.from_protobuf(original.to_protobuf())

    assert original.count == restored.count
    assert original.items.get_total_weight() == restored.items.get_total_weight()

    original_estimate = original.theta_sketch.get_result().get_estimate()
    restored_estimate = restored.theta_sketch.get_result().get_estimate()
    assert original_estimate == restored_estimate
Esempio n. 8
0
    def from_protobuf(message):
        """
        Rebuild a ColumnProfile from its protobuf message.

        Parameters
        ----------
        message
            Protobuf message; its ``name``, ``schema``, ``counters``,
            ``numbers``, ``strings``, ``frequent_items`` and
            ``cardinality_tracker`` fields are read.

        Returns
        -------
        column_profile : ColumnProfile
        """
        # The schema tracker also receives the null count stored on the
        # counters message, passed as ``legacy_null_count``.
        legacy_nulls = message.counters.null_count.value
        schema = SchemaTracker.from_protobuf(
            message.schema, legacy_null_count=legacy_nulls)

        column_name = message.name
        counters = CountersTracker.from_protobuf(message.counters)
        numbers = NumberTracker.from_protobuf(message.numbers)
        strings = StringTracker.from_protobuf(message.strings)
        frequent = FrequentItemsSketch.from_protobuf(message.frequent_items)
        cardinality = HllSketch.from_protobuf(message.cardinality_tracker)

        return ColumnProfile(
            column_name,
            counters=counters,
            schema_tracker=schema,
            number_tracker=numbers,
            string_tracker=strings,
            frequent_items=frequent,
            cardinality_tracker=cardinality,
        )
Esempio n. 9
0
    def __init__(
        self,
        name: str,
        number_tracker: NumberTracker = None,
        string_tracker: StringTracker = None,
        schema_tracker: SchemaTracker = None,
        counters: CountersTracker = None,
        frequent_items: FrequentItemsSketch = None,
        cardinality_tracker: HllSketch = None,
        constraints: ValueConstraints = None,
    ):
        # Handle default values
        if counters is None:
            counters = CountersTracker()
        if number_tracker is None:
            number_tracker = NumberTracker()
        if string_tracker is None:
            string_tracker = StringTracker()
        if schema_tracker is None:
            schema_tracker = SchemaTracker()
        if frequent_items is None:
            frequent_items = FrequentItemsSketch()
        if cardinality_tracker is None:
            cardinality_tracker = HllSketch()
        if constraints is None:
            constraints = ValueConstraints()

        # Assign values
        self.column_name = name
        self.number_tracker = number_tracker
        self.string_tracker = string_tracker
        self.schema_tracker = schema_tracker
        self.counters = counters
        self.frequent_items = frequent_items
        self.cardinality_tracker = cardinality_tracker
        self.constraints = constraints
Esempio n. 10
0
def test_summary():
    """Compare ``StringTracker.to_summary()`` against a golden dictionary.

    The summary is converted with ``message_to_dict`` and compared to
    ``expected``. Two parts are handled separately before the comparison:

    * the frequent ``items`` lists are popped from both sides and compared
      as sorted DataFrames, so their order does not matter;
    * ``frequentNumbers`` is removed from every ``charPosMap`` entry on
      both sides because it is not stable across platforms.
    """
    import pandas as pd

    x = StringTracker()
    data = ["one", "two", "three", "one", "one", "One", "six", None, None]
    for record in data:
        x.update(record)
    # Golden value for the full summary output.  NOTE: the order of the
    # "items" list below is arbitrary; it is compared order-insensitively
    # further down.
    expected = {
        "uniqueCount": {"estimate": 5.0, "upper": 5.0, "lower": 5.0},
        "frequent": {
            "items": [
                {"value": "one", "estimate": 3.0},
                {"value": "three", "estimate": 1.0},
                {"value": "six", "estimate": 1.0},
                {"value": "One", "estimate": 1.0},
                {"value": "two", "estimate": 1.0},
            ]
        },
        "length": {
            "count": "7",
            "min": 3.0,
            "max": 5.0,
            "mean": 3.2857142857142856,
            "stddev": 0.7559289460184544,
            "histogram": {
                "start": 3.0,
                "end": 5.0000005,
                "counts": ["6", "1"],
                "max": 5.0,
                "min": 3.0,
                "bins": [3.0, 4.000000249999999, 5.0000005],
                "n": "7",
                "width": 0.0,
            },
            "uniqueCount": {"estimate": 2.0, "upper": 2.0, "lower": 2.0},
            "quantiles": {"quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0], "quantileValues": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 5.0, 5.0]},
            "frequentNumbers": {"longs": [{"estimate": "6", "value": "3", "rank": 0}, {"estimate": "1", "value": "5", "rank": 1}], "doubles": []},
            "isDiscrete": False,
        },
        "tokenLength": {
            "count": "7",
            "min": 1.0,
            "max": 1.0,
            "mean": 1.0,
            "histogram": {"start": 1.0, "end": 1.0000001, "counts": ["7"], "max": 1.0, "min": 1.0, "bins": [1.0, 1.0000001], "n": "7", "width": 0.0},
            "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
            "quantiles": {"quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0], "quantileValues": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]},
            "frequentNumbers": {"longs": [{"estimate": "7", "value": "1", "rank": 0}], "doubles": []},
            "stddev": 0.0,
            "isDiscrete": False,
        },
        "charPosTracker": {
            "characterList": "!#$%&()*+,-./0123456789?@[]^_abcdefghijklmnopqrstuvwyz{}",
            "charPosMap": {
                "i": {
                    "count": "1",
                    "min": 1.0,
                    "max": 1.0,
                    "mean": 1.0,
                    "histogram": {"start": 1.0, "end": 1.0000001, "counts": ["1"], "max": 1.0, "min": 1.0, "bins": [1.0, 1.0000001], "n": "1", "width": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "1", "value": "1", "rank": 0}], "doubles": []},
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
                "t": {
                    "count": "2",
                    "histogram": {"counts": ["2"], "bins": [0.0, 0.0], "n": "2", "start": 0.0, "end": 0.0, "width": 0.0, "max": 0.0, "min": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "2", "value": "0", "rank": 0}], "doubles": []},
                    "min": 0.0,
                    "max": 0.0,
                    "mean": 0.0,
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
                "s": {
                    "count": "1",
                    "histogram": {"counts": ["1"], "bins": [0.0, 0.0], "n": "1", "start": 0.0, "end": 0.0, "width": 0.0, "max": 0.0, "min": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "1", "value": "0", "rank": 0}], "doubles": []},
                    "min": 0.0,
                    "max": 0.0,
                    "mean": 0.0,
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
                "n": {
                    "count": "4",
                    "min": 1.0,
                    "max": 1.0,
                    "mean": 1.0,
                    "histogram": {"start": 1.0, "end": 1.0000001, "counts": ["4"], "max": 1.0, "min": 1.0, "bins": [1.0, 1.0000001], "n": "4", "width": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "4", "value": "1", "rank": 0}], "doubles": []},
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
                "h": {
                    "count": "1",
                    "min": 1.0,
                    "max": 1.0,
                    "mean": 1.0,
                    "histogram": {"start": 1.0, "end": 1.0000001, "counts": ["1"], "max": 1.0, "min": 1.0, "bins": [1.0, 1.0000001], "n": "1", "width": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "1", "value": "1", "rank": 0}], "doubles": []},
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
                "o": {
                    "count": "5",
                    "max": 2.0,
                    "mean": 0.4,
                    "stddev": 0.894427190999916,
                    "histogram": {
                        "end": 2.0000002,
                        "counts": ["4", "1"],
                        "max": 2.0,
                        "bins": [0.0, 1.0000001, 2.0000002],
                        "n": "5",
                        "start": 0.0,
                        "width": 0.0,
                        "min": 0.0,
                    },
                    "uniqueCount": {"estimate": 2.0, "upper": 2.0, "lower": 2.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "4", "value": "0", "rank": 0}, {"estimate": "1", "value": "2", "rank": 1}], "doubles": []},
                    "min": 0.0,
                    "isDiscrete": False,
                },
                "NITL": {
                    "count": "1",
                    "min": 2.0,
                    "max": 2.0,
                    "mean": 2.0,
                    "histogram": {"start": 2.0, "end": 2.0000002, "counts": ["1"], "max": 2.0, "min": 2.0, "bins": [2.0, 2.0000002], "n": "1", "width": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "1", "value": "2", "rank": 0}], "doubles": []},
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
                "w": {
                    "count": "1",
                    "min": 1.0,
                    "max": 1.0,
                    "mean": 1.0,
                    "histogram": {"start": 1.0, "end": 1.0000001, "counts": ["1"], "max": 1.0, "min": 1.0, "bins": [1.0, 1.0000001], "n": "1", "width": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "1", "value": "1", "rank": 0}], "doubles": []},
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
                "e": {
                    "count": "6",
                    "min": 2.0,
                    "max": 4.0,
                    "mean": 2.5,
                    "stddev": 0.8366600265340756,
                    "histogram": {
                        "start": 2.0,
                        "end": 4.0000004,
                        "counts": ["5", "1"],
                        "max": 4.0,
                        "min": 2.0,
                        "bins": [2.0, 3.0000002, 4.0000004],
                        "n": "6",
                        "width": 0.0,
                    },
                    "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 4.0, 4.0],
                    },
                    "frequentNumbers": {
                        "longs": [
                            {"estimate": "4", "value": "2", "rank": 0},
                            {"estimate": "1", "value": "4", "rank": 1},
                            {"estimate": "1", "value": "3", "rank": 2},
                        ],
                        "doubles": [],
                    },
                    "isDiscrete": False,
                },
                "r": {
                    "count": "1",
                    "min": 2.0,
                    "max": 2.0,
                    "mean": 2.0,
                    "histogram": {"start": 2.0, "end": 2.0000002, "counts": ["1"], "max": 2.0, "min": 2.0, "bins": [2.0, 2.0000002], "n": "1", "width": 0.0},
                    "uniqueCount": {"estimate": 1.0, "upper": 1.0, "lower": 1.0},
                    "quantiles": {
                        "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                        "quantileValues": [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
                    },
                    "frequentNumbers": {"longs": [{"estimate": "1", "value": "2", "rank": 0}], "doubles": []},
                    "stddev": 0.0,
                    "isDiscrete": False,
                },
            },
        },
    }
    # Pull the frequent items out of the golden dict so they can be compared
    # as sorted DataFrames instead of as an ordered list.
    expected_items = pd.DataFrame(expected["frequent"]["items"]).sort_values(["value", "estimate"])
    expected["frequent"].pop("items")

    # Remove entries that, due to their statistical nature, differ between systems.
    # TODO: check whether the seeds can be fixed so the values don't change from macOS to Ubuntu.

    for char, value in expected["charPosTracker"]["charPosMap"].items():
        value.pop("frequentNumbers")

    actual = message_to_dict(x.to_summary())
    actual_items = pd.DataFrame(actual["frequent"]["items"]).sort_values(["value", "estimate"])
    actual["frequent"].pop("items")

    # Same as above: drop the platform-dependent "frequentNumbers" entries from
    # the actual summary so both sides have the same shape.
    for char, value in actual["charPosTracker"]["charPosMap"].items():
        value.pop("frequentNumbers")

    assert expected == actual
    # Row order and column order are normalised before comparing the items.
    pd.testing.assert_frame_equal(actual_items.reset_index(drop=True).sort_index(axis=1), expected_items.reset_index(drop=True).sort_index(axis=1))