def test_array():
    """``track_array`` must raise ``ValueError`` for a random 1-D array of length 3."""
    session_time = datetime.datetime.utcnow()
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=session_time,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    with pytest.raises(ValueError):
        profile.track_array(np.random.rand(3))
def test_merge_lhs_no_profile():
    """Merge a profile without a model profile with one that carries a ``ModelProfile``.

    The merged result must keep the shared name, session id and session
    timestamp, contain no columns, and have a non-``None`` model profile.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex

    def build(**extra):
        # Fresh tag/metadata dicts on every call so the two profiles share no state.
        return DatasetProfile(
            name="test",
            session_id=session,
            session_timestamp=timestamp,
            tags={"key": "value"},
            metadata={"key": "value"},
            **extra,
        )

    without_model = build()
    with_model = build(model_profile=ModelProfile())

    combined = without_model.merge(with_model)

    assert combined.name == "test"
    assert combined.session_id == session
    assert combined.session_timestamp == timestamp
    assert combined.columns == {}
    assert combined.model_profile is not None
Beispiel #3
0
def test_mismatched_tags_merge_succeeds():
    """Merging profiles whose ``"key"`` tag differs keeps the left-hand value ``"foo"``."""
    timestamp = datetime.datetime.utcnow()
    left = DatasetProfile("test", timestamp, tags={"key": "foo"})
    right = DatasetProfile("test2", timestamp, tags={"key": "bar"})

    merged_profile = left.merge(right)

    assert merged_profile.tags.get("key") == "foo"
def test_errors():
    """``track`` must raise ``TypeError`` when called with ``columns=1, data=34``."""
    session_time = datetime.datetime.utcnow()
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=session_time,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    with pytest.raises(TypeError):
        profile.track(columns=1, data=34)
Beispiel #5
0
def test_viz():
    """Smoke test for ``ProfileVisualizer``.

    Lists the available plots and then hands the visualizer a single profile
    that has one tracked value; the test passes if nothing raises.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex
    profile = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    profile.track("col1", "value")

    visualizer = ProfileVisualizer()
    visualizer.available_plots()
    visualizer.set_profiles([profile])
def test_flat_summary():
    """``flat_summary`` of a profile with no tracked data is not ``None`` and has length 4."""
    session_time = datetime.datetime.utcnow()
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=session_time,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    first_summary = profile.flat_summary()
    assert first_summary is not None

    # The summary is requested a second time on purpose, as the test always did.
    second_summary = profile.flat_summary()
    assert len(second_summary) == 4
def test_mismatched_tags_raises_assertion_error():
    """Merging same-named profiles with conflicting ``"key"`` tags must raise ``AssertionError``."""
    now = datetime.datetime.utcnow()
    x1 = DatasetProfile("test", now, tags={"key": "foo"})
    x2 = DatasetProfile("test", now, tags={"key": "bar"})

    # pytest.raises reports a proper test failure if no AssertionError is raised,
    # instead of erroring out with a hand-rolled RuntimeError.
    with pytest.raises(AssertionError):
        x1.merge(x2)
def test_merge_different_columns():
    """Merge two profiles that each tracked a different column.

    The merged profile must contain both columns with a count of 1 each,
    carry the ``Name`` tag alongside the shared tag, and keep the left-hand
    profile's metadata.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex

    left = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )
    left.track("col1", "value")

    right = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x2"},
    )
    right.track("col2", "value")

    combined = left.merge(right)

    assert combined.name == "test"
    assert combined.session_id == session
    assert combined.session_timestamp == timestamp
    assert set(combined.columns.keys()) == {"col1", "col2"}
    for column_name in ("col1", "col2"):
        assert combined.columns[column_name].counters.count == 1
    assert combined.tags == {"Name": "test", "key": "value"}
    assert combined.metadata == {"key": "x1"}
def test_verify_schema_version():
    """``to_properties`` must report schema major version 1 and minor version 1."""
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=datetime.datetime.now(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    properties = profile.to_properties()

    assert properties.schema_major_version == 1
    assert properties.schema_minor_version == 1
def test_track():
    """``track`` accepts a dict of column name to value; passes if nothing raises."""
    session_time = datetime.datetime.utcnow()
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=session_time,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    row = {"rows": 1, "names": "roger roger"}
    profile.track(columns=row)
def test_parse_delimited_from_java_single():
    """``parse_delimited_single`` returns a non-``None`` result for both Java-produced fixtures."""
    here = os.path.dirname(os.path.realpath(__file__))
    fixtures = (
        "output_from_java_08242020.bin",
        "output_from_java_01212021.bin",
    )

    for fixture_name in fixtures:
        with open(os.path.join(here, fixture_name), "rb") as stream:
            payload = stream.read()
            assert DatasetProfile.parse_delimited_single(payload) is not None
def test_empty_valid_datasetprofiles_empty():
    """Merging two identical profiles with no tracked data yields a profile with no columns."""
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex

    # Two independently constructed but identical profiles.
    first, second = (
        DatasetProfile(
            name="test",
            session_id=session,
            session_timestamp=timestamp,
            tags={"key": "value"},
            metadata={"key": "value"},
        )
        for _ in range(2)
    )

    combined = first.merge(second)

    assert combined.name == "test"
    assert combined.session_id == session
    assert combined.session_timestamp == timestamp
    assert combined.columns == {}
def test_chunk_iterator():
    """Every chunk yielded by ``chunk_iterator`` after tracking one row must be non-``None``."""
    session_time = datetime.datetime.utcnow()
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=session_time,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    profile.track(columns={"rows": 1, "names": "roger roger"})

    for chunk in profile.chunk_iterator():
        assert chunk is not None
Beispiel #14
0
def test_track_PIL_img():
    """Track an already-opened PIL image and check the resulting columns.

    The profile must end up with one column per image feature plus one per
    default metadata attribute. ``Saturation`` is expected to hold 67500
    tracked numbers and ``BitsPerSample`` a count of 3.
    """
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    num_image_features = len(_IMAGE_FEATURES)
    num_metadata_features = len(_METADATA_DEFAULT_ATTRIBUTES)

    test_image_path = os.path.join(TEST_DATA_PATH, "images", "flower2.jpg")

    total_default_features = num_image_features + num_metadata_features
    profile_1 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    # Image.open reads pixel data lazily, so the file stays open until tracking
    # has finished; the with-block then closes the handle instead of leaking it.
    with open(test_image_path, "rb") as image_file:
        img = Image.open(image_file)
        trackImage = TrackImage(img=img)
        trackImage(profile_1)

    columns = profile_1.columns
    assert len(columns) == total_default_features
    assert columns["Saturation"].number_tracker.count == 67500
    assert columns["BitsPerSample"].counters.count == 3
Beispiel #15
0
def test_track_json_annotation():
    """Track bounding-box annotations supplied as already-parsed JSON objects.

    Each line of ``yolo_bounding_box.jsonl`` is parsed with ``json.loads`` and
    the resulting list is passed to ``TrackBB`` via ``obj=``. The profile must
    end up with one column per entry in ``BB_ATTRIBUTES``, and
    ``annotation_count`` is expected to hold 100 tracked numbers.
    """
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    num_bb_features = len(BB_ATTRIBUTES)

    test_annotation_path = os.path.join(TEST_DATA_PATH, "files",
                                        "yolo_bounding_box.jsonl")

    profile_1 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    # Close the annotation file deterministically instead of leaking the handle.
    with open(test_annotation_path, "r") as annotation_file:
        objs = [json.loads(eachline) for eachline in annotation_file]
    trackbb = TrackBB(obj=objs)

    trackbb(profile_1)
    columns = profile_1.columns
    assert len(columns) == num_bb_features
    assert columns["annotation_count"].number_tracker.count == 100
Beispiel #16
0
def test_track_bb_annotation():
    """Track bounding-box annotations loaded by ``TrackBB`` from a JSONL file path.

    Every attribute in ``BB_ATTRIBUTES`` must be present as a column.
    ``annotation_count``, ``area_coverage`` and ``annotation_density`` are
    expected to hold 100 tracked numbers; every other attribute 4183.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex
    annotation_file = os.path.join(
        TEST_DATA_PATH, "files", "yolo_bounding_box.jsonl"
    )

    bb_profile = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )
    tracker = TrackBB(annotation_file)
    tracker(bb_profile)

    tracked = bb_profile.columns
    assert len(tracked) == len(BB_ATTRIBUTES)

    hundred_count_attributes = (
        "annotation_count",
        "area_coverage",
        "annotation_density",
    )
    for attribute in BB_ATTRIBUTES:
        assert tracked.get(attribute) is not None
        expected_count = 100 if attribute in hundred_count_attributes else 4183
        assert tracked[attribute].number_tracker.count == expected_count
Beispiel #17
0
def profile_lending_club():
    """Build a ``DatasetProfile`` from the ``lending_club_1000.csv`` test data.

    Returns:
        The profile named ``"test"`` after the whole dataframe has been
        tracked with ``track_dataframe``.
    """
    import datetime
    from uuid import uuid4

    created_at = datetime.datetime.utcnow()
    session = uuid4().hex

    csv_path = os.path.join(
        _MY_DIR, os.pardir, "testdata", "lending_club_1000.csv"
    )
    frame = pd.read_csv(csv_path)

    lending_profile = DatasetProfile(
        name="test", session_id=session, session_timestamp=created_at
    )
    lending_profile.track_dataframe(frame)
    return lending_profile
def tests_timestamp():
    """``session_timestamp_ms`` must equal the session timestamp, read as UTC, in epoch milliseconds."""
    # Use ONE timestamp for both the profile and the expected value. Taking two
    # separate now() readings made the test flaky whenever they fell on either
    # side of a millisecond boundary.
    now = datetime.datetime.now()
    dp = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    expected_ms = int(
        now.replace(tzinfo=datetime.timezone.utc).timestamp() * 1000.0
    )
    assert dp.session_timestamp_ms == expected_ms
Beispiel #19
0
def test_track_PIL_img():
    """Track an already-opened PIL image and check the per-image summary columns.

    Every name in ``_METADATA_DEFAULT_ATTRIBUTES`` must appear as a column, and
    the ``Hue.mean``, ``Saturation.mean``, ``Brightness.mean``,
    ``ImagePixelWidth`` and ``ImagePixelHeight`` columns must each have a
    count of 1.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    shared_session_id = uuid4().hex
    total_default_features = _METADATA_DEFAULT_ATTRIBUTES

    test_image_path = os.path.join(TEST_DATA_PATH, "images", "flower2.jpg")

    profile_1 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    # Image.open reads pixel data lazily, so the file stays open until tracking
    # has finished; the with-block then closes the handle instead of leaking it.
    with open(test_image_path, "rb") as image_file:
        img = Image.open(image_file)
        trackImage = TrackImage(img=img)
        trackImage(profile_1)

    columns = profile_1.columns
    for feature_name in total_default_features:
        assert feature_name in columns, f"{feature_name} not in {columns}"
    assert columns["Hue.mean"].number_tracker.count == 1
    assert columns["Saturation.mean"].number_tracker.count == 1
    assert columns["Brightness.mean"].number_tracker.count == 1
    assert columns["ImagePixelWidth"].counters.count == 1
    assert columns["ImagePixelHeight"].counters.count == 1
def test_dataframe_profile():
    """``dataframe_profile`` must agree with manual ``track_dataframe`` tracking.

    Compares the variance tracker's mean for column ``"A"`` between a manually
    tracked profile and profiles built by ``dataframe_profile`` with a name
    and datetime timestamp, with defaults only, and with an integer timestamp.
    """
    timestamp = datetime.datetime.now()
    frame = util.testing.makeDataFrame()

    manual = DatasetProfile("test", timestamp)
    manual.track_dataframe(frame)

    def variance_mean(candidate):
        return candidate.columns["A"].number_tracker.variance.mean

    with_name_and_time = dataframe_profile(frame, name="test", timestamp=timestamp)
    assert variance_mean(with_name_and_time) == variance_mean(manual)

    with_defaults = dataframe_profile(frame)
    assert variance_mean(with_defaults) == variance_mean(manual)

    with_int_timestamp = dataframe_profile(frame, timestamp=103433)
    assert variance_mean(with_int_timestamp) == variance_mean(manual)
def test_track_image():
    """Track the same image file twice into one profile.

    After the first pass the profile must have one column per image feature
    plus one per default metadata attribute, 67500 tracked ``Saturation``
    numbers and a ``BitsPerSample`` count of 3. After the second pass the
    column count is unchanged and the ``Saturation`` count has doubled.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex
    expected_column_total = len(_IMAGE_FEATURES) + len(_METADATA_DEFAULT_ATTRIBUTES)
    image_path = os.path.join(TEST_DATA_PATH, "images", "flower2.jpg")

    image_profile = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )
    pixels_per_image = 67500

    # First pass.
    first_tracker = TrackImage(image_path)
    first_tracker(image_profile)
    tracked = image_profile.columns
    assert len(tracked) == expected_column_total
    assert tracked["Saturation"].number_tracker.count == pixels_per_image
    assert tracked["BitsPerSample"].counters.count == 3

    # Second pass over the same file, using a fresh tracker.
    second_tracker = TrackImage(image_path)
    second_tracker(image_profile)
    tracked = image_profile.columns
    assert len(tracked) == expected_column_total
    assert tracked["Saturation"].number_tracker.count == 2 * pixels_per_image
def test_parse_delimited_from_java_multiple():
    """``parse_delimited`` returns two entries when a Java-produced fixture is concatenated with itself."""
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(here, "output_from_java_08242020.bin")

    with open(fixture_path, "rb") as stream:
        payload = stream.read()
        doubled = payload + payload
        parsed = DatasetProfile.parse_delimited(doubled)
        assert len(parsed) == 2
def test_write_delimited_multiple():
    """Five concatenated delimited serializations parse back into five matching profiles.

    Each parsed entry must match the source profile's session id, tags and
    metadata; session timestamps are compared via ``time.to_utc_ms``.
    """
    created_at = datetime.datetime.utcnow()
    source = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=created_at,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    source.track("col1", "value")

    single = source.serialize_delimited()
    # Five copies of the same serialized message, back to back.
    stream = single * 5

    parsed = DatasetProfile.parse_delimited(stream)
    assert len(parsed) == 5

    for item in parsed:
        assert item.session_id == source.session_id
        # Python time precisions are different
        parsed_ms = time.to_utc_ms(item.session_timestamp)
        source_ms = time.to_utc_ms(source.session_timestamp)
        assert parsed_ms == source_ms
        assert item.tags == source.tags
        assert item.metadata == source.metadata
def test_protobuf_round_trip():
    """A profile survives ``to_protobuf`` followed by ``from_protobuf``.

    The restored profile must serialize to an equal message and keep the
    name, session id, session timestamp (in UTC milliseconds), both columns
    with a count of 1, the tag keys including ``Name``, and the metadata.
    """
    created_at = datetime.datetime.utcnow()
    tags = {"k1": "rock", "k2": "scissors", "k3": "paper"}
    source = DatasetProfile(name="test", dataset_timestamp=created_at, tags=tags)
    for column_name in ("col1", "col2"):
        source.track(column_name, "value")

    message = source.to_protobuf()
    restored = DatasetProfile.from_protobuf(message)

    assert restored.to_protobuf() == message
    assert restored.name == "test"
    assert restored.session_id == source.session_id
    restored_ms = to_utc_ms(restored.session_timestamp)
    source_ms = to_utc_ms(source.session_timestamp)
    assert restored_ms == source_ms
    assert set(restored.columns.keys()) == {"col1", "col2"}
    assert restored.columns["col1"].counters.count == 1
    assert restored.columns["col2"].counters.count == 1

    # Only the tag KEYS are compared here, after adding the expected "Name" key.
    tags["Name"] = "test"
    assert set(restored.tags) == set(tags)
    assert restored.metadata == source.metadata
def test_non_string_tag_raises_assert_error():
    """``validate`` passes with string tags and raises ``AssertionError`` once a non-string tag is injected."""
    now = datetime.datetime.utcnow()
    tags = {"key": "value"}
    x = DatasetProfile("test", now, tags=tags)
    # Sanity check: a profile with only string tags validates cleanly.
    x.validate()

    # Include a non-string tag
    x._tags["number"] = 1

    # pytest.raises reports a proper test failure if no AssertionError is raised,
    # instead of erroring out with a hand-rolled RuntimeError.
    with pytest.raises(AssertionError):
        x.validate()
Beispiel #26
0
def test_serde_without_dataset_timezone():
    """Round-trip a logged profile whose dataset timestamp is a naive ``datetime``.

    A small CSV is logged through a session logger, the resulting profile is
    serialized with ``serialize_delimited``, parsed back, and both profiles
    are handed to ``profiles_eq``.
    """
    session = Session("project", "pipeline", writers=[])
    naive_timestamp = datetime.datetime.fromtimestamp(1634939335, tz=None)
    csv_logger = session.logger("", dataset_timestamp=naive_timestamp)

    csv_text = """a,b,c
    1,1,1
    1,1,2
    4,4,3
    """
    csv_logger.log_csv(io.StringIO(csv_text))

    logged = csv_logger.profile
    serialized = logged.serialize_delimited()
    # The parsed profile is the element at index 1 of the returned pair.
    restored = DatasetProfile.parse_delimited_single(serialized)[1]
    # NOTE(review): the result of profiles_eq is not asserted here - confirm
    # that it performs its own assertions internally.
    profiles_eq(logged, restored)
def test_write_delimited_single():
    """A single delimited serialization parses back into a matching profile.

    The restored profile must match the source's session id, tags and
    metadata; session timestamps are compared via ``time.to_utc_ms``.
    """
    created_at = datetime.datetime.utcnow()
    source = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=created_at,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    source.track("col1", "value")

    serialized = source.serialize_delimited()
    # The first element of the returned pair is not needed here.
    _, restored = DatasetProfile.parse_delimited_single(serialized)

    assert restored.session_id == source.session_id
    # Python time precision includes nanoseconds
    restored_ms = time.to_utc_ms(restored.session_timestamp)
    source_ms = time.to_utc_ms(source.session_timestamp)
    assert restored_ms == source_ms
    assert restored.tags == source.tags
    assert restored.metadata == source.metadata
def test_track_null_item():
    """Tracking ``None`` increments the column's ``null_count`` in the flat summary."""
    # A first profile tracks a non-null value and is then discarded, exactly
    # as the test has always done.
    discarded = DatasetProfile("name")
    discarded.track("column_name", 1)

    prof = DatasetProfile("name")

    def first_summary_value(field):
        # Re-query the flat summary on every lookup and read row 0 of the field.
        return prof.flat_summary()["summary"][field][0]

    prof.track("column_name", None)
    assert first_summary_value("column") == "column_name"
    assert first_summary_value("null_count") == 1

    prof.track("column_name", None)
    assert first_summary_value("null_count") == 2
    assert first_summary_value("column") == "column_name"
def test_parse_from_protobuf():
    """``read_protobuf`` loads the Java-produced fixture next to this file without raising."""
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(here, "output_from_java_08242020.bin")
    DatasetProfile.read_protobuf(fixture_path)
def test_name_always_appear_in_tags():
    """A profile created with only a name exposes that name under the ``"Name"`` tag."""
    named_profile = DatasetProfile(name="test")
    name_tag = named_profile.tags["Name"]
    assert name_tag == "test"