def test_array():
    """Expect ``track_array`` to raise ``ValueError`` for ``np.random.rand(3)``."""
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=datetime.datetime.utcnow(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    values = np.random.rand(3)

    with pytest.raises(ValueError):
        profile.track_array(values)
def test_merge_lhs_no_profile():
    """Merge a profile lacking a model profile with one that carries it.

    The left-hand profile is built without ``model_profile``; the right-hand
    one gets a fresh ``ModelProfile()``.  The merged result must keep the
    shared name, session id and timestamp, have no columns, and expose a
    non-``None`` model profile.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex

    without_model = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    with_model = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "value"},
        model_profile=ModelProfile(),
    )

    combined = without_model.merge(with_model)

    assert combined.name == "test"
    assert combined.session_id == session
    assert combined.session_timestamp == timestamp
    assert combined.columns == {}
    assert combined.model_profile is not None
def test_mismatched_tags_merge_succeeds():
    """Merge profiles with different names and a conflicting ``"key"`` tag.

    The merge must complete and the result must report the left-hand
    profile's tag value.
    """
    timestamp = datetime.datetime.utcnow()
    left = DatasetProfile("test", timestamp, tags={"key": "foo"})
    right = DatasetProfile("test2", timestamp, tags={"key": "bar"})

    merged = left.merge(right)

    assert merged.tags.get("key") == "foo"
def test_errors():
    """Expect ``track(columns=1, data=34)`` to raise ``TypeError``."""
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=datetime.datetime.utcnow(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    with pytest.raises(TypeError):
        profile.track(columns=1, data=34)
def test_viz():
    """Smoke-test ``ProfileVisualizer`` with a single one-column profile.

    No assertions are made; the test passes when ``available_plots`` and
    ``set_profiles`` run without raising.
    """
    profile = DatasetProfile(
        name="test",
        session_id=uuid4().hex,
        session_timestamp=datetime.datetime.utcnow(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    profile.track("col1", "value")

    visualizer = ProfileVisualizer()
    visualizer.available_plots()
    visualizer.set_profiles([profile])
def test_flat_summary():
    """Check that ``flat_summary`` of an empty profile is a 4-item result."""
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=datetime.datetime.utcnow(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    first_summary = profile.flat_summary()
    assert first_summary is not None

    # The summary is requested a second time for the length check.
    second_summary = profile.flat_summary()
    assert len(second_summary) == 4
def test_mismatched_tags_raises_assertion_error():
    """Merging same-named profiles with conflicting tags must fail.

    Both profiles are named ``"test"`` but disagree on the ``"key"`` tag;
    ``merge`` is expected to raise ``AssertionError``.  If it does not, the
    test fails with a ``RuntimeError``.
    """
    timestamp = datetime.datetime.utcnow()
    left = DatasetProfile("test", timestamp, tags={"key": "foo"})
    right = DatasetProfile("test", timestamp, tags={"key": "bar"})

    try:
        left.merge(right)
    except AssertionError:
        pass
    else:
        raise RuntimeError("Assertion error not raised")
def test_merge_different_columns():
    """Merge two profiles that each tracked a different column.

    The merged profile must contain both columns with one count each, carry
    the ``"Name"`` tag alongside the user tag, and keep the left-hand
    profile's metadata.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex

    left = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )
    left.track("col1", "value")

    right = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x2"},
    )
    right.track("col2", "value")

    merged = left.merge(right)

    assert merged.name == "test"
    assert merged.session_id == session
    assert merged.session_timestamp == timestamp

    column_names = set(merged.columns.keys())
    assert column_names == {"col1", "col2"}
    assert merged.columns["col1"].counters.count == 1
    assert merged.columns["col2"].counters.count == 1

    assert merged.tags == {"Name": "test", "key": "value"}
    assert merged.metadata == {"key": "x1"}
def test_verify_schema_version():
    """Check that ``to_properties`` reports schema major and minor version 1."""
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=datetime.datetime.now(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    properties = profile.to_properties()

    assert properties.schema_major_version == 1
    assert properties.schema_minor_version == 1
def test_track():
    """Smoke-test ``track`` with a mapping passed through ``columns=``.

    No assertions are made; the call only has to complete without raising.
    """
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=datetime.datetime.utcnow(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    record = {"rows": 1, "names": "roger roger"}

    profile.track(columns=record)
def test_parse_delimited_from_java_single():
    """Parse each stored Java-produced ``.bin`` fixture as a single entry.

    The fixtures are read from the directory containing this test module and
    ``parse_delimited_single`` must return a non-``None`` result for both.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_names = (
        "output_from_java_08242020.bin",
        "output_from_java_01212021.bin",
    )

    for fixture_name in fixture_names:
        with open(os.path.join(here, fixture_name), "rb") as stream:
            payload = stream.read()
        assert DatasetProfile.parse_delimited_single(payload) is not None
def test_empty_valid_datasetprofiles_empty():
    """Merge two profiles that tracked nothing.

    The result must keep the shared name, session id and timestamp, and must
    have an empty column mapping.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex

    left = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    right = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    merged = left.merge(right)

    assert merged.name == "test"
    assert merged.session_id == session
    assert merged.session_timestamp == timestamp
    assert merged.columns == {}
def test_chunk_iterator():
    """Check that every item yielded by ``chunk_iterator`` is not ``None``."""
    profile = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=datetime.datetime.utcnow(),
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    record = {"rows": 1, "names": "roger roger"}
    profile.track(columns=record)

    for chunk in profile.chunk_iterator():
        assert chunk is not None
def test_track_PIL_img():
    """Track an already-opened PIL image and check the resulting columns.

    The profile must end up with one column per image feature plus one per
    default metadata attribute, with the expected counts for ``Saturation``
    and ``BitsPerSample``.

    NOTE(review): another ``test_track_PIL_img`` is defined later in this
    source; if both live in the same module the later definition shadows
    this one and this test never runs.
    """
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    total_default_features = len(_IMAGE_FEATURES) + len(_METADATA_DEFAULT_ATTRIBUTES)
    test_image_path = os.path.join(TEST_DATA_PATH, "images", "flower2.jpg")

    profile_1 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    # Pillow loads pixel data lazily, so the file stays open until tracking
    # has finished; the context manager then closes the handle.
    with open(test_image_path, "rb") as image_file:
        img = Image.open(image_file)
        trackImage = TrackImage(img=img)
        trackImage(profile_1)

    columns = profile_1.columns
    assert len(columns) == total_default_features
    assert columns["Saturation"].number_tracker.count == 67500
    assert columns["BitsPerSample"].counters.count == 3
def test_track_json_annotation():
    """Track bounding-box annotations supplied as parsed JSON objects.

    Each line of ``yolo_bounding_box.jsonl`` is decoded with ``json.loads``
    and the resulting list is handed to ``TrackBB``.  The profile must end up
    with one column per entry in ``BB_ATTRIBUTES`` and 100 tracked values for
    ``annotation_count``.
    """
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    num_bb_features = len(BB_ATTRIBUTES)
    test_annotation_path = os.path.join(TEST_DATA_PATH, "files", "yolo_bounding_box.jsonl")

    profile_1 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    # Read inside a context manager so the annotation file is closed.
    with open(test_annotation_path, "r") as annotation_file:
        objs = [json.loads(eachline) for eachline in annotation_file]

    trackbb = TrackBB(obj=objs)
    trackbb(profile_1)

    columns = profile_1.columns
    assert len(columns) == num_bb_features
    assert columns["annotation_count"].number_tracker.count == 100
def test_track_bb_annotation():
    """Track bounding-box annotations given as a file path.

    Every name in ``BB_ATTRIBUTES`` must appear as a column.  Three of them
    are expected to hold 100 tracked values; all the others 4183.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex
    annotation_path = os.path.join(TEST_DATA_PATH, "files", "yolo_bounding_box.jsonl")

    profile = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    tracker = TrackBB(annotation_path)
    tracker(profile)

    columns = profile.columns
    assert len(columns) == len(BB_ATTRIBUTES)

    hundred_count_attributes = ("annotation_count", "area_coverage", "annotation_density")
    for attribute in BB_ATTRIBUTES:
        assert columns.get(attribute, None) is not None
        expected_count = 100 if attribute in hundred_count_attributes else 4183
        assert columns[attribute].number_tracker.count == expected_count
def profile_lending_club():
    """Build a profile of the ``lending_club_1000.csv`` test dataset.

    Returns:
        A ``DatasetProfile`` named ``"test"`` with a fresh session id and the
        current UTC time as session timestamp, after tracking the whole
        dataframe.
    """
    import datetime
    from uuid import uuid4

    created_at = datetime.datetime.utcnow()
    new_session_id = uuid4().hex

    csv_path = os.path.join(_MY_DIR, os.pardir, "testdata", "lending_club_1000.csv")
    frame = pd.read_csv(csv_path)

    result = DatasetProfile(
        name="test",
        session_id=new_session_id,
        session_timestamp=created_at,
    )
    result.track_dataframe(frame)
    return result
def tests_timestamp():
    """Check that ``session_timestamp_ms`` is the session time in UTC millis.

    The same naive ``datetime`` is used both to build the profile and to
    compute the expected value.  Reading the clock twice would make the
    comparison fail whenever the two reads land in different milliseconds.
    """
    now = datetime.datetime.now()
    dp = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    # The expected value treats the naive timestamp as if it were UTC.
    expected_ms = int(now.replace(tzinfo=datetime.timezone.utc).timestamp() * 1000.0)

    assert dp.session_timestamp_ms == expected_ms
def test_track_PIL_img():
    """Track an already-opened PIL image and check the summary columns.

    Every name in ``_METADATA_DEFAULT_ATTRIBUTES`` must be present as a
    column, and the hue/saturation/brightness mean columns and the pixel
    width/height columns must each have a count of one.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    shared_session_id = uuid4().hex
    total_default_features = _METADATA_DEFAULT_ATTRIBUTES
    test_image_path = os.path.join(TEST_DATA_PATH, "images", "flower2.jpg")

    profile_1 = DatasetProfile(
        name="test",
        session_id=shared_session_id,
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    # Pillow loads pixel data lazily, so the file stays open until tracking
    # has finished; the context manager then closes the handle.
    with open(test_image_path, "rb") as image_file:
        img = Image.open(image_file)
        trackImage = TrackImage(img=img)
        trackImage(profile_1)

    columns = profile_1.columns
    for feature_name in total_default_features:
        assert feature_name in columns, f"{feature_name} not in {columns}"

    assert columns["Hue.mean"].number_tracker.count == 1
    assert columns["Saturation.mean"].number_tracker.count == 1
    assert columns["Brightness.mean"].number_tracker.count == 1
    assert columns["ImagePixelWidth"].counters.count == 1
    assert columns["ImagePixelHeight"].counters.count == 1
def test_dataframe_profile():
    """Compare ``dataframe_profile`` output against a hand-built profile.

    The factory is called three times (with name and timestamp, with
    defaults, and with an integer timestamp).  Each result must report the
    same variance mean for column ``"A"`` as the reference profile.
    """
    timestamp = datetime.datetime.now()
    frame = util.testing.makeDataFrame()

    reference = DatasetProfile("test", timestamp)
    reference.track_dataframe(frame)

    factory_kwargs = (
        {"name": "test", "timestamp": timestamp},
        {},
        {"timestamp": 103433},
    )
    for kwargs in factory_kwargs:
        built = dataframe_profile(frame, **kwargs)
        assert (
            built.columns["A"].number_tracker.variance.mean
            == reference.columns["A"].number_tracker.variance.mean
        )
def test_track_image():
    """Track the same image file twice into one profile.

    After the first pass the profile must hold one column per image feature
    and default metadata attribute, with 67500 ``Saturation`` values.  After
    the second pass the column count is unchanged and the ``Saturation``
    count has doubled.
    """
    timestamp = datetime.datetime.utcnow()
    session = uuid4().hex
    expected_column_count = len(_IMAGE_FEATURES) + len(_METADATA_DEFAULT_ATTRIBUTES)
    image_path = os.path.join(TEST_DATA_PATH, "images", "flower2.jpg")
    pixels_per_image = 67500

    profile = DatasetProfile(
        name="test",
        session_id=session,
        session_timestamp=timestamp,
        tags={"key": "value"},
        metadata={"key": "x1"},
    )

    # First pass.
    first_tracker = TrackImage(image_path)
    first_tracker(profile)
    tracked = profile.columns
    assert len(tracked) == expected_column_count
    assert tracked["Saturation"].number_tracker.count == pixels_per_image
    assert tracked["BitsPerSample"].counters.count == 3

    # Second pass with a fresh tracker over the same file.
    second_tracker = TrackImage(image_path)
    second_tracker(profile)
    tracked = profile.columns
    assert len(tracked) == expected_column_count
    assert tracked["Saturation"].number_tracker.count == 2 * pixels_per_image
def test_parse_delimited_from_java_multiple():
    """Parse two back-to-back copies of a Java-produced fixture.

    ``parse_delimited`` must return exactly two entries for the doubled
    payload.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(here, "output_from_java_08242020.bin")

    with open(fixture_path, "rb") as stream:
        payload = stream.read()

    doubled = payload + payload
    parsed = DatasetProfile.parse_delimited(doubled)

    assert len(parsed) == 2
def test_write_delimited_multiple():
    """Serialize one profile, repeat it five times, and parse the stream.

    Each of the five parsed entries must match the original's session id,
    session timestamp (at millisecond precision), tags and metadata.
    """
    created_at = datetime.datetime.utcnow()
    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=created_at,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    original.track("col1", "value")

    single_entry = original.serialize_delimited()
    stream = single_entry * 5

    entries = DatasetProfile.parse_delimited(stream)
    assert len(entries) == 5

    for parsed in entries:
        assert parsed.session_id == original.session_id
        # Timestamps are compared in milliseconds because precisions differ.
        parsed_ms = time.to_utc_ms(parsed.session_timestamp)
        original_ms = time.to_utc_ms(original.session_timestamp)
        assert parsed_ms == original_ms
        assert parsed.tags == original.tags
        assert parsed.metadata == original.metadata
def test_protobuf_round_trip():
    """Round-trip a two-column profile through its protobuf message.

    The rebuilt profile must serialize to an equal message and preserve the
    name, session id, session timestamp (in milliseconds), columns, tag keys
    and metadata.
    """
    timestamp = datetime.datetime.utcnow()
    tags = {"k1": "rock", "k2": "scissors", "k3": "paper"}
    original = DatasetProfile(
        name="test",
        dataset_timestamp=timestamp,
        tags=tags,
    )
    for column_name in ("col1", "col2"):
        original.track(column_name, "value")

    message = original.to_protobuf()
    restored = DatasetProfile.from_protobuf(message)

    assert restored.to_protobuf() == message
    assert restored.name == "test"
    assert restored.session_id == original.session_id

    restored_ms = to_utc_ms(restored.session_timestamp)
    original_ms = to_utc_ms(original.session_timestamp)
    assert restored_ms == original_ms

    assert set(restored.columns.keys()) == {"col1", "col2"}
    assert restored.columns["col1"].counters.count == 1
    assert restored.columns["col2"].counters.count == 1

    tags["Name"] = "test"
    # NOTE(review): set() over a mapping compares keys only, so tag values
    # are not verified here - confirm that is intended.
    assert set(restored.tags) == set(tags)
    assert restored.metadata == original.metadata
def test_non_string_tag_raises_assert_error():
    """``validate`` must reject a profile whose tags hold a non-string value.

    The profile validates cleanly first.  After an integer is written into
    ``_tags``, ``validate`` is expected to raise ``AssertionError``; if it
    does not, the test fails with a ``RuntimeError``.
    """
    timestamp = datetime.datetime.utcnow()
    profile = DatasetProfile("test", timestamp, tags={"key": "value"})
    profile.validate()

    # Inject a tag whose value is not a string.
    profile._tags["number"] = 1

    try:
        profile.validate()
    except AssertionError:
        pass
    else:
        raise RuntimeError("validate should raise an AssertionError")
def test_serde_without_dataset_timezone():
    """Round-trip a profile whose dataset timestamp has no timezone.

    A logger is created with a naive ``datetime``, a small CSV is logged, and
    the profile is serialized and parsed back before being compared with the
    original through ``profiles_eq``.
    """
    session = Session("project", "pipeline", writers=[])
    naive_timestamp = datetime.datetime.fromtimestamp(1634939335, tz=None)
    logger = session.logger("", dataset_timestamp=naive_timestamp)

    csv_text = (
        "a,b,c\n"
        "1,1,1\n"
        "1,1,2\n"
        "4,4,3\n"
    )
    logger.log_csv(io.StringIO(csv_text))

    profile = logger.profile
    serialized = profile.serialize_delimited()
    _, deserialized_profile = DatasetProfile.parse_delimited_single(serialized)

    # NOTE(review): the return value of profiles_eq is not checked, so this
    # relies on the helper raising on mismatch - confirm.
    profiles_eq(profile, deserialized_profile)
def test_write_delimited_single():
    """Serialize one profile and parse it back as a single entry.

    The parsed profile must match the original's session id, session
    timestamp (at millisecond precision), tags and metadata.
    """
    created_at = datetime.datetime.utcnow()
    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=created_at,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    original.track("col1", "value")

    serialized = original.serialize_delimited()
    _, restored = DatasetProfile.parse_delimited_single(serialized)

    assert restored.session_id == original.session_id
    # Python datetimes are finer-grained, so compare in milliseconds.
    restored_ms = time.to_utc_ms(restored.session_timestamp)
    original_ms = time.to_utc_ms(original.session_timestamp)
    assert restored_ms == original_ms
    assert restored.tags == original.tags
    assert restored.metadata == original.metadata
def test_track_null_item():
    """Check that tracking ``None`` increments the column's null count.

    A first profile tracks an integer and is then discarded.  A second
    profile tracks ``None`` twice under the same column name, and the flat
    summary must report the column name and a null count of 1, then 2.
    """
    discarded = DatasetProfile("name")
    discarded.track("column_name", 1)

    prof = DatasetProfile("name")

    def first_summary_value(field):
        # Recompute the flat summary on every lookup.
        return prof.flat_summary()["summary"][field][0]

    prof.track("column_name", None)
    assert first_summary_value("column") == "column_name"
    assert first_summary_value("null_count") == 1

    prof.track("column_name", None)
    assert first_summary_value("null_count") == 2
    assert first_summary_value("column") == "column_name"
def test_parse_from_protobuf():
    """Smoke-test ``read_protobuf`` on a stored Java-produced fixture.

    No assertions are made; the call only has to complete without raising.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(here, "output_from_java_08242020.bin")
    DatasetProfile.read_protobuf(fixture_path)
def test_name_always_appear_in_tags():
    """A profile built with only a name must expose it under the ``"Name"`` tag."""
    profile = DatasetProfile(name="test")
    name_tag = profile.tags["Name"]
    assert name_tag == "test"