def test_merge_different_columns():
    """Merge two profiles that tracked disjoint columns.

    Both profiles share a name, session id, session timestamp and tags but
    carry different metadata.  The merged profile is expected to keep the
    shared identity fields, expose both columns with one tracked value each,
    report a ``"Name"`` tag next to the user-supplied tag, and keep the
    metadata of the profile ``merge`` was called on (``x1``).
    """
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.  test_merge_same_columns builds its timestamp the same way.
    now = datetime.datetime.now(datetime.timezone.utc)
    shared_session_id = uuid4().hex
    x1 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )
    x1.track("col1", "value")
    x2 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x2"},
    )
    x2.track("col2", "value")

    merged = x1.merge(x2)

    assert merged.name == "test"
    assert merged.session_id == shared_session_id
    assert merged.session_timestamp == now
    assert set(merged.columns.keys()) == {"col1", "col2"}
    assert merged.columns["col1"].counters.count == 1
    assert merged.columns["col2"].counters.count == 1
    assert merged.tags == {"Name": "test", "key": "value"}
    # Metadata is expected to come from x1, not x2.
    assert merged.metadata == {"key": "x1"}
def test_protobuf_round_trip():
    """Serialize a profile to protobuf and rebuild it from the message.

    The rebuilt profile must serialize back to an equal message and agree
    with the original on name, session id, session timestamp (compared via
    ``to_utc_ms``), tracked columns, tag names and metadata.
    """
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)
    tags = {"k1": "rock", "k2": "scissors", "k3": "paper"}
    original = DatasetProfile(
        name="test",
        dataset_timestamp=now,
        tags=tags,
    )
    original.track("col1", "value")
    original.track("col2", "value")

    msg = original.to_protobuf()
    roundtrip = DatasetProfile.from_protobuf(msg)

    assert roundtrip.to_protobuf() == msg
    assert roundtrip.name == "test"
    assert roundtrip.session_id == original.session_id
    assert to_utc_ms(roundtrip.session_timestamp) == to_utc_ms(
        original.session_timestamp)
    assert set(roundtrip.columns.keys()) == {"col1", "col2"}
    assert roundtrip.columns["col1"].counters.count == 1
    assert roundtrip.columns["col2"].counters.count == 1

    # The profile is expected to report a "Name" tag in addition to the
    # supplied ones.  Build the expectation without mutating the input dict.
    # NOTE(review): set() over a mapping yields its keys, so only tag names
    # are compared here, not tag values - confirm that is intended.
    expected_tag_names = {*tags, "Name"}
    assert set(roundtrip.tags) == expected_tag_names
    assert roundtrip.metadata == original.metadata
def test_write_delimited_multiple():
    """Parse several delimited profiles concatenated into one buffer.

    One profile is serialized with ``serialize_delimited`` and the output is
    repeated back to back; ``parse_delimited`` must return one entry per copy,
    each matching the original's session id, session timestamp, tags and
    metadata.
    """
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)
    copies = 5

    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    original.track("col1", "value")

    output_bytes = original.serialize_delimited()

    # Sequence repetition builds the same buffer as appending in a loop.
    multiple_entries = output_bytes * copies

    entries = DatasetProfile.parse_delimited(multiple_entries)
    assert len(entries) == copies

    for entry in entries:
        assert entry.session_id == original.session_id
        # Timestamps are compared through to_utc_ms rather than directly
        # because the parsed and original values differ in precision.
        assert time.to_utc_ms(entry.session_timestamp) == time.to_utc_ms(
            original.session_timestamp)
        assert entry.tags == original.tags
        assert entry.metadata == original.metadata
def test_errors():
    """``track`` must raise ``TypeError`` when ``columns`` is an integer."""
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)

    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    with pytest.raises(TypeError):
        original.track(columns=1, data=34)
# Example #5
# 0
def test_viz():
    """Smoke-test ``ProfileVisualizer`` with a single tracked profile.

    No results are asserted: the test passes as long as constructing the
    visualizer, listing the available plots and registering the profile do
    not raise.
    """
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)
    session_id = uuid4().hex
    x1 = DatasetProfile(
        name="test",
        session_id=session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    x1.track("col1", "value")

    viz = ProfileVisualizer()
    # Return value intentionally ignored; only the call itself is exercised.
    viz.available_plots()

    viz.set_profiles([x1])
def test_track():
    """Smoke-test ``track`` with a dict passed as ``columns``.

    No results are asserted: the test passes as long as tracking a dict with
    an integer value and a string value does not raise.
    """
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)

    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    data = {
        "rows": 1,
        "names": "roger roger",
    }
    original.track(columns=data)
def test_chunk_iterator():
    """Every item yielded by ``chunk_iterator`` must be non-``None``.

    NOTE(review): the loop body never runs if the iterator is empty, so this
    test does not verify that any chunk is actually produced.
    """
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)

    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    data = {
        "rows": 1,
        "names": "roger roger",
    }
    original.track(columns=data)

    for chunk in original.chunk_iterator():
        assert chunk is not None
def test_write_delimited_single():
    """Round-trip one profile through the delimited serialization.

    ``parse_delimited_single`` returns a pair; the second element is the
    parsed profile, which must match the original's session id, session
    timestamp (compared via ``to_utc_ms``), tags and metadata.
    """
    # Timezone-aware "now": ``utcnow()`` is deprecated and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)

    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    original.track("col1", "value")

    output_bytes = original.serialize_delimited()
    # The first element of the pair is not checked by this test.
    _pos, roundtrip = DatasetProfile.parse_delimited_single(output_bytes)

    assert roundtrip.session_id == original.session_id
    # Python datetimes carry sub-millisecond (microsecond) precision, so both
    # sides are normalized through to_utc_ms before comparing.
    assert time.to_utc_ms(roundtrip.session_timestamp) == time.to_utc_ms(
        original.session_timestamp)
    assert roundtrip.tags == original.tags
    assert roundtrip.metadata == original.metadata
def test_merge_same_columns():
    """Merge two profiles that both tracked ``col1``.

    The second profile additionally tracks ``col2``.  After merging, the
    shared identity fields must be preserved, ``col1`` must report two
    tracked values and ``col2`` one.
    """
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    session = uuid4().hex

    def build_profile():
        # Fresh tag/metadata dicts on every call, as in two separate literals.
        return DatasetProfile(
            name="test",
            session_id=session,
            session_timestamp=timestamp,
            tags={"key": "value"},
            metadata={"key": "value"},
        )

    left = build_profile()
    left.track("col1", "value1")

    right = build_profile()
    right.track("col1", "value1")
    right.track("col2", "value")

    combined = left.merge(right)

    # Identity fields carry over from the inputs.
    assert combined.name == "test"
    assert combined.session_id == session
    assert combined.session_timestamp == timestamp

    # Column set first, then per-column counts in the original order.
    expected_counts = {"col1": 2, "col2": 1}
    assert set(combined.columns.keys()) == set(expected_counts)
    for column_name, expected in expected_counts.items():
        assert combined.columns[column_name].counters.count == expected
def test_track_null_item():
    """Tracking ``None`` increments the column's null count in the summary.

    A first profile tracks a non-null value and is then discarded; the
    assertions run against a second, fresh profile that only sees ``None``.
    """
    discarded = DatasetProfile("name")
    discarded.track("column_name", 1)

    profile = DatasetProfile("name")

    def summary():
        # Recomputed on every access, one flat_summary() call per assertion.
        return profile.flat_summary()["summary"]

    profile.track("column_name", None)
    assert summary()["column"][0] == "column_name"
    assert summary()["null_count"][0] == 1

    profile.track("column_name", None)
    assert summary()["null_count"][0] == 2
    assert summary()["column"][0] == "column_name"