def test_protobuf():
    """Round-trip a small numeric column through protobuf.

    Tracks three integers, serializes the profile, rebuilds it with
    ``ColumnProfile.from_protobuf`` and checks the rebuilt profile's name
    and trackers. Finally re-serializes the rebuilt profile as a smoke check.

    NOTE(review): another ``test_protobuf`` is visible in this file; if both
    live in the same module the later definition shadows this one -- confirm.
    """
    original = ColumnProfile("col")
    for value in [1, 2, 3]:
        original.track(value)

    restored = ColumnProfile.from_protobuf(original.to_protobuf())

    # The name must survive the round trip unchanged.
    assert restored.column_name == original.column_name == "col"

    for tracker_attr in ("number_tracker", "string_tracker"):
        assert hasattr(restored, tracker_attr)

    # Only numbers were tracked, so the string length tracker is expected to
    # exist but to have counted nothing.
    length_tracker = restored.string_tracker.length
    assert length_tracker is not None
    assert length_tracker.count == 0

    # 56 is the expected size of the restored character list -- presumably
    # the tracker's default character set; TODO confirm.
    assert len(restored.string_tracker.char_pos_tracker.character_list) == 56

    # The rebuilt profile must itself be serializable.
    restored.to_protobuf()
def test_copy_counters_null_count_in_schema_tracker():
    """Check how a protobuf ``counters.null_count`` feeds the schema tracker.

    Eight values are tracked, two of which are expected to be counted as
    NULL (presumably ``None`` and ``pd.NA``). The serialized message then has
    ``counters.null_count`` set to 2, and the profile rebuilt from it is
    expected to report 4 NULLs in its schema tracker.
    """
    null_type = InferredType.Type.NULL

    profile = ColumnProfile("test")
    samples = ["a", "b", None, "d", pd.NA, "f", 1.0, 2.0]
    for sample in samples:
        profile.track(sample)

    assert profile.schema_tracker.get_count(null_type) == 2

    # ensuring we can still access the value in summary mode
    summary = profile.to_summary()
    assert summary.counters.null_count.value == 2

    # Mimic a legal protobuf with null_count set
    msg: ColumnMessage = profile.to_protobuf()
    msg.counters.null_count.value = 2

    restored = ColumnProfile.from_protobuf(msg)
    restored_nulls = restored.schema_tracker.get_count(null_type)
    assert restored_nulls == 4
def test_protobuf():
    """Round-trip a numeric column through protobuf and compare frequent items.

    Tracks three integers, serializes the profile, rebuilds it with
    ``ColumnProfile.from_protobuf`` and checks that the rebuilt profile keeps
    the column name, has a number tracker, and reports the same frequent
    items as the original.
    """
    original = ColumnProfile("col")
    for value in [1, 2, 3]:
        original.track(value)

    original_msg = original.to_protobuf()
    restored = ColumnProfile.from_protobuf(original_msg)

    assert restored.column_name == original.column_name == "col"
    assert hasattr(restored, "number_tracker")

    restored_msg = restored.to_protobuf()

    # We cannot do a straight equality comparison for serialized frequent
    # strings objects, so compare the decoded frequent items instead.
    restored_items = restored.number_tracker.frequent_numbers.get_frequent_items()
    original_items = original.number_tracker.frequent_numbers.get_frequent_items()
    compare_frequent_items(restored_items, original_items)

    # Blank the serialized sketches in both messages.
    # NOTE(review): nothing in the visible block compares the two messages
    # after this -- confirm whether a message-equality assertion is missing.
    original_msg.numbers.frequent_numbers.sketch = bytes()
    restored_msg.numbers.frequent_numbers.sketch = bytes()