def from_protobuf(message: DatasetProfileMessage) -> "DatasetProfile":
    """
    Build a dataset profile from its protobuf representation.

    The profile name is taken from the message tags: the ``"name"`` tag is
    tried first, then ``"Name"``, and an empty string is used when neither
    holds a truthy value.

    Parameters
    ----------
    message : DatasetProfileMessage
        The protobuf message.  Should match the output of
        `DatasetProfile.to_protobuf()`

    Returns
    -------
    dataset_profile : DatasetProfile
    """
    props: DatasetProperties = message.properties

    # Missing/empty tags are treated as an empty mapping everywhere below.
    tag_map = props.tags or {}
    profile_name = tag_map.get("name", None) or tag_map.get("Name", None) or ""

    session_ts = from_utc_ms(props.session_timestamp)
    dataset_ts = from_utc_ms(props.data_timestamp)

    column_profiles = {}
    for col_name, col_msg in message.columns.items():
        column_profiles[col_name] = ColumnProfile.from_protobuf(col_msg)

    tags_copy = dict(tag_map)
    metadata_copy = dict(props.metadata or {})

    # `modeProfile` is the attribute name read from the message as-is.
    model = ModelProfile.from_protobuf(message.modeProfile)

    return DatasetProfile(
        name=profile_name,
        session_id=props.session_id,
        session_timestamp=session_ts,
        dataset_timestamp=dataset_ts,
        columns=column_profiles,
        tags=tags_copy,
        metadata=metadata_copy,
        model_profile=model,
    )
def from_protobuf(message: DatasetProfileMessage):
    """
    Build a dataset profile from its protobuf representation.

    The profile name is read from the ``"Name"`` entry of the message tags.

    Parameters
    ----------
    message : DatasetProfileMessage
        The protobuf message.  Should match the output of
        `DatasetProfile.to_protobuf()`

    Returns
    -------
    dataset_profile : DatasetProfile
    """
    props = message.properties

    # NOTE(review): the name lookup is intentionally kept ahead of the tags
    # copy below; if the tags container populates missing keys on access
    # (as protobuf map fields may), the order affects the copied tags.
    profile_name = props.tags["Name"]
    session_id = props.session_id
    session_ts = from_utc_ms(props.session_timestamp)
    data_ts = from_utc_ms(props.data_timestamp)

    column_profiles = {}
    for col_name, col_msg in message.columns.items():
        column_profiles[col_name] = ColumnProfile.from_protobuf(col_msg)

    return DatasetProfile(
        name=profile_name,
        session_id=session_id,
        session_timestamp=session_ts,
        data_timestamp=data_ts,
        columns=column_profiles,
        tags=dict(props.tags),
        metadata=dict(props.metadata),
    )
def test_protobuf():
    """Round-trip a numeric column through protobuf and inspect the result.

    Checks that the restored profile keeps its column name, exposes both
    number and string trackers, has an empty string-length tracker, and can
    be serialized again.
    """
    original = ColumnProfile("col")
    for number in (1, 2, 3):
        original.track(number)

    restored = ColumnProfile.from_protobuf(original.to_protobuf())

    assert restored.column_name == original.column_name == "col"
    assert hasattr(restored, "number_tracker")
    assert hasattr(restored, "string_tracker")

    # Only numbers were tracked, so the string length tracker must exist
    # but have seen nothing.
    string_tracker = restored.string_tracker
    assert string_tracker.length is not None
    assert string_tracker.length.count == 0
    assert len(string_tracker.char_pos_tracker.character_list) == 56

    # The restored profile must itself be serializable.
    restored.to_protobuf()
def test_copy_counters_null_count_in_schema_tracker():
    """Check how a message-level ``null_count`` shows up after deserialization.

    Two of the tracked values are nulls (``None`` and ``pd.NA``).  After
    setting ``counters.null_count`` to 2 on the serialized message, the
    round-tripped profile is expected to report 4 NULL entries in its schema
    tracker.
    """
    profile = ColumnProfile("test")
    for item in ("a", "b", None, "d", pd.NA, "f", 1.0, 2.0):
        profile.track(item)

    null_type = InferredType.Type.NULL
    assert profile.schema_tracker.get_count(null_type) == 2

    # ensuring we can still access the value in summary mode
    summary = profile.to_summary()
    assert summary.counters.null_count.value == 2

    # Mimic a legal protobuf with null_count set
    message: ColumnMessage = profile.to_protobuf()
    message.counters.null_count.value = 2

    restored = ColumnProfile.from_protobuf(message)
    # presumably from_protobuf adds counters.null_count to the schema
    # tracker's own NULL count (2 + 2) -- confirm against ColumnProfile
    assert restored.schema_tracker.get_count(null_type) == 4
def test_protobuf():
    """Round-trip a numeric column through protobuf and compare frequent items.

    The restored profile must keep its column name and number tracker, and
    its frequent-number items must match those of the original profile.
    """
    source = ColumnProfile("col")
    for number in (1, 2, 3):
        source.track(number)

    first_msg = source.to_protobuf()
    restored = ColumnProfile.from_protobuf(first_msg)

    assert restored.column_name == source.column_name == "col"
    assert hasattr(restored, "number_tracker")

    second_msg = restored.to_protobuf()

    # We cannot do a straight equality comparison for serialized frequent
    # strings objects
    restored_items = restored.number_tracker.frequent_numbers.get_frequent_items()
    source_items = source.number_tracker.frequent_numbers.get_frequent_items()
    compare_frequent_items(restored_items, source_items)

    # Blank out the serialized sketches on both messages.
    # NOTE(review): nothing in this block compares the two messages
    # afterwards -- a final assertion may be missing; confirm.
    first_msg.numbers.frequent_numbers.sketch = b""
    second_msg.numbers.frequent_numbers.sketch = b""