Example #1
0
    def from_protobuf(message: DatasetProfileMessage) -> "DatasetProfile":
        """
        Build a DatasetProfile from its protobuf representation.

        Parameters
        ----------
        message : DatasetProfileMessage
            Protobuf message to read.  Expected to have the layout produced
            by `DatasetProfile.to_protobuf()`.

        Returns
        -------
        dataset_profile : DatasetProfile
            Profile populated from the message's properties, columns and
            model profile.
        """
        props: DatasetProperties = message.properties

        # Prefer the lowercase "name" tag, then "Name"; fall back to an empty
        # string when neither tag yields a non-empty value.
        profile_name = (props.tags or {}).get("name", None)
        if not profile_name:
            profile_name = (props.tags or {}).get("Name", None)
        if not profile_name:
            profile_name = ""

        session_id = props.session_id
        session_ts = from_utc_ms(props.session_timestamp)
        dataset_ts = from_utc_ms(props.data_timestamp)

        # Rebuild one ColumnProfile per serialized column, keyed as in the
        # message.
        column_profiles = {}
        for col_name, col_msg in message.columns.items():
            column_profiles[col_name] = ColumnProfile.from_protobuf(col_msg)

        # Copy tags/metadata into plain dicts, treating a falsy (e.g. empty)
        # map as an empty dict.
        tag_map = dict(props.tags or {})
        meta_map = dict(props.metadata or {})

        # NOTE(review): the field is spelled `modeProfile` (not
        # `modelProfile`); presumably this matches the message schema --
        # confirm before renaming.
        model = ModelProfile.from_protobuf(message.modeProfile)

        return DatasetProfile(
            name=profile_name,
            session_id=session_id,
            session_timestamp=session_ts,
            dataset_timestamp=dataset_ts,
            columns=column_profiles,
            tags=tag_map,
            metadata=meta_map,
            model_profile=model,
        )
Example #2
0
    def from_protobuf(message: DatasetProfileMessage):
        """
        Build a DatasetProfile from its protobuf representation.

        Parameters
        ----------
        message : DatasetProfileMessage
            Protobuf message to read.  Expected to have the layout produced
            by `DatasetProfile.to_protobuf()`.

        Returns
        -------
        dataset_profile : DatasetProfile
            Profile populated from the message's properties and columns.
        """
        props = message.properties

        # The profile name is read from the "Name" tag by subscript.
        # NOTE(review): what happens when the tag is absent depends on the
        # protobuf map implementation -- confirm against callers.
        profile_name = props.tags["Name"]

        session_ts = from_utc_ms(props.session_timestamp)
        data_ts = from_utc_ms(props.data_timestamp)

        # Rebuild one ColumnProfile per serialized column, keyed as in the
        # message.
        column_profiles = {}
        for col_name, col_msg in message.columns.items():
            column_profiles[col_name] = ColumnProfile.from_protobuf(col_msg)

        return DatasetProfile(
            name=profile_name,
            session_id=props.session_id,
            session_timestamp=session_ts,
            data_timestamp=data_ts,
            columns=column_profiles,
            # Copy the protobuf maps into plain dicts.
            tags=dict(props.tags),
            metadata=dict(props.metadata),
        )
Example #3
0
def test_protobuf():
    """
    Round-trip a numeric ColumnProfile through protobuf and check the
    restored profile's name and trackers.
    """
    expected_name = "col"
    original = ColumnProfile(expected_name)
    for number in [1, 2, 3]:
        original.track(number)

    serialized = original.to_protobuf()
    restored = ColumnProfile.from_protobuf(serialized)

    assert restored.column_name == original.column_name == expected_name

    # Both trackers must exist on the restored profile.
    for tracker_attr in ("number_tracker", "string_tracker"):
        assert hasattr(restored, tracker_attr)

    assert restored.string_tracker.length is not None

    # Only numbers were tracked, so the string length tracker is expected
    # to report a zero count.
    assert restored.string_tracker.length.count == 0
    assert len(restored.string_tracker.char_pos_tracker.character_list) == 56

    # Smoke check: the restored profile can be serialized again.
    restored.to_protobuf()
Example #4
0
def test_copy_counters_null_count_in_schema_tracker():
    """
    Check the NULL count reported by the schema tracker before and after a
    protobuf round trip in which the message's null_count counter is set.
    """
    profile = ColumnProfile("test")
    # Eight values; the assertions below expect two of them (None and
    # pd.NA) to be counted as NULL.
    for value in ["a", "b", None, "d", pd.NA, "f", 1.0, 2.0]:
        profile.track(value)

    null_type = InferredType.Type.NULL
    assert profile.schema_tracker.get_count(null_type) == 2

    # The null count must also be reachable through the summary view.
    summary = profile.to_summary()
    assert summary.counters.null_count.value == 2

    # Mimic a legal protobuf message that has null_count set.
    column_msg: ColumnMessage = profile.to_protobuf()
    column_msg.counters.null_count.value = 2

    # After deserializing, the schema tracker is expected to report 4 NULLs
    # (presumably the tracked 2 plus the counter's 2).
    restored = ColumnProfile.from_protobuf(column_msg)
    assert restored.schema_tracker.get_count(null_type) == 4
def test_protobuf():
    """
    Round-trip a numeric ColumnProfile through protobuf and compare the
    frequent-number items of the original and the restored profile.
    """
    c = ColumnProfile("col")
    for val in [1, 2, 3]:
        c.track(val)
    # Serialize the profile, then rebuild a second profile from the message.
    msg = c.to_protobuf()
    c1 = ColumnProfile.from_protobuf(msg)
    assert c1.column_name == c.column_name == "col"
    assert hasattr(c1, "number_tracker")
    msg2 = c1.to_protobuf()
    # We cannot do a straight equality comparison for serialized frequent
    # numbers sketches, so compare the frequent items they report instead.
    compare_frequent_items(
        c1.number_tracker.frequent_numbers.get_frequent_items(),
        c.number_tracker.frequent_numbers.get_frequent_items(),
    )
    # Blank out the serialized sketch bytes on both messages.
    # NOTE(review): presumably so the two messages can be compared without
    # the sketch payload; no such comparison is visible here -- confirm.
    msg.numbers.frequent_numbers.sketch = bytes()
    msg2.numbers.frequent_numbers.sketch = bytes()