def test_log_pil_image(tmpdir, image_files):
    """Logging PIL images under 1-minute rotation exposes all expected columns."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with session.logger("image_pil_test", with_rotation_time="1m", cache_size=1) as logger:
        for path in image_files:
            logger.log_image(Image.open(path))
        columns = logger.profile.columns
        for expected in _EXPECTED_COLUMNS:
            assert expected in columns, f"{expected} not found in {columns}"
    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_metrics_with_boolean_labels(tmpdir):
    """Boolean targets/predictions yield a confusion matrix with 2 labels."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    y_true = [True, False, True]
    y_pred = [False, True, False]
    y_score = [0.2, 0.5, 0.6]
    with session.logger("metrics_test") as logger:
        logger.log_metrics(y_true, y_pred, y_score)
        model_profile = logger.profile.model_profile
        assert model_profile is not None
        # Only True/False appear, so exactly two labels are tracked.
        assert len(model_profile.metrics.confusion_matrix.labels) == 2
    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_metrics(tmpdir):
    """String-labeled metrics produce a confusion matrix over all 3 labels."""
    output_path = tmpdir.mkdir("whylogs")
    # FIX: use ignore_errors=True like the sibling tests so directory cleanup
    # can never raise and abort/mask the actual test result.
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)
    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)
    targets = ["class_name1", "class_name2", "class_name3"]
    predictions = ["class_name1", "class_name2", "class_name2"]
    scores = [0.2, 0.5, 0.6]
    # Union of targets and predictions covers three distinct class names.
    num_labels = 3
    with session.logger("metrics_test") as logger:
        logger.log_metrics(targets, predictions, scores)
        profile = logger.profile
        metrics_profile = profile.model_profile
        assert metrics_profile is not None
        assert len(metrics_profile.metrics.confusion_matrix.labels) == num_labels
    shutil.rmtree(output_path, ignore_errors=True)
def test_segments_with_rotation(df_lending_club, tmpdir):
    """Per-second rotation with segments writes 8 files; unknown segment keys raise."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_cfg)
        # NOTE(review): sibling tests pass cache_size=1; `cache=1` here may be
        # a stale kwarg name -- confirm against the logger signature.
        with session.logger(
            "test",
            with_rotation_time='s',
            segments=["home_ownership"],
            profile_full_dataset=True,
            cache=1,
        ) as logger:
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))
            # A frame without the segment column must fail with KeyError.
            with pytest.raises(KeyError):
                logger.log_dataframe(util.testing.makeDataFrame())
    written = [name for _, _, files in os.walk(out_dir) for name in files]
    assert len(written) == 8
    shutil.rmtree(out_dir)
def test_log_rotation_days(tmpdir):
    """Daily rotation across 3 distinct days yields exactly 3 output files."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_cfg)
        with session.logger("test", with_rotation_time='d', cache_size=1) as logger:
            # Day 1: one frame.
            logger.log_dataframe(util.testing.makeDataFrame())
            frozen_time.tick(delta=datetime.timedelta(days=1))
            # Day 2: two frames land in the same rotation window.
            logger.log_dataframe(util.testing.makeDataFrame())
            logger.log_dataframe(util.testing.makeDataFrame())
            frozen_time.tick(delta=datetime.timedelta(days=2))
            # Day 4: one more frame.
            logger.log_dataframe(util.testing.makeDataFrame())
    written = [name for _, _, files in os.walk(out_dir) for name in files]
    assert len(written) == 3
    shutil.rmtree(out_dir)
def test_log_multiple_calls(tmpdir, df_lending_club):
    """Five logger sessions with distinct timestamps write 25 output files."""
    # BUG FIX: os.curdir is the literal string ".", so the old
    # os.chdir(original_dir) at the end was a no-op and never restored the
    # working directory. Capture the absolute cwd instead, and restore it in
    # a finally block so even an assertion failure cannot leak the chdir.
    original_dir = os.getcwd()
    os.chdir(script_dir)
    try:
        p = tmpdir.mkdir("whylogs")
        writer_config = WriterConfig(
            "local", ["protobuf", "flat"], p.realpath(),
            filename_template="dataset_summary-$dataset_timestamp")
        # Exercise the YAML round-trip of the writer configuration.
        yaml_data = writer_config.to_yaml()
        WriterConfig.from_yaml(yaml_data)
        session_config = SessionConfig(
            "project", "pipeline", writers=[writer_config])
        session = session_from_config(session_config)
        now = datetime.datetime.now()
        for i in range(0, 5):
            with session.logger(dataset_timestamp=now + datetime.timedelta(days=i)) as logger:
                logger.log_dataframe(df_lending_club)
        output_files = []
        for root, subdirs, files in os.walk(p):
            output_files += files
        # we run 5 times, so we should have five times more files than the above test
        assert len(output_files) == 25
    finally:
        os.chdir(original_dir)
def test_log_dataframe(tmpdir, df_lending_club):
    """Logging the lending-club frame yields a 151-row summary and 5 files."""
    target_dir = tmpdir.mkdir("whylogs")
    writer_cfg = WriterConfig("local", ["protobuf", "flat"], target_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session = session_from_config(
        SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with session.logger("lendingclub") as logger:
        assert logger is not None
        logger.log_dataframe(df_lending_club)
        profile = logger.profile
        assert profile is not None
        flat_summary = profile.flat_summary()['summary']
        assert len(flat_summary) == 151
    written = [name for _, _, files in os.walk(target_dir) for name in files]
    assert len(written) == 5
def test_no_log_rotation(tmpdir):
    """A Logger built directly (rotation disabled) accepts a log and flushes."""
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    basewriter = writer_from_config(WriterConfig.from_yaml(yaml_data))
    # FIX: renamed the ambiguous single-letter local `l` (PEP 8 / E741) to
    # `logger`; behavior is unchanged.
    logger = Logger(
        session_id="",
        dataset_name="testing",
        writers=[basewriter],
        dataset_timestamp=datetime.datetime.now(tz=timezone.utc),
        with_rotation_time=None,
    )
    logger.log({"quick_test": 3})
    logger.flush()
def test_segments(df_lending_club, tmpdir):
    """Explicit key/value segments produce one tagged profile per segment."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    rent_def = [{"key": "home_ownership", "value": "RENT"}]
    mortgage_def = [{"key": "home_ownership", "value": "MORTGAGE"}]
    with session_from_config(session_cfg) as session:
        with session.logger(
            "test",
            segments=[rent_def, mortgage_def],
            cache_size=1,
        ) as logger:
            logger.log_dataframe(df_lending_club)
            profile = logger.profile
            profiles = logger.segmented_profiles
            mortage_segment = logger.get_segment(mortgage_def)
            # Fully-segmented logging leaves no top-level profile.
            assert profile is None
            assert len(profiles) == 2
            ordered_keys = list(profiles.keys())
            assert profiles[ordered_keys[0]].tags["segment"] == json.dumps(rent_def)
            assert profiles[ordered_keys[1]].tags["segment"] == json.dumps(mortgage_def)
            assert mortage_segment == profiles[ordered_keys[1]]
    shutil.rmtree(out_dir, ignore_errors=True)
def test_segments_keys(df_lending_club, tmpdir):
    """Segmenting by column names yields one profile per value combination."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session = session_from_config(
        SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with session.logger("test", segments=["emp_title", "home_ownership"], cache_size=1) as logger:
        logger.log_dataframe(df_lending_club)
        # 47 distinct (emp_title, home_ownership) pairs in the fixture data.
        assert len(logger.segmented_profiles) == 47
    shutil.rmtree(out_dir, ignore_errors=True)
def test_config_api(tmpdir):
    """The config/session API handles an empty dataframe without error."""
    target_dir = tmpdir.mkdir("whylogs")
    writer_cfg = WriterConfig("local", ["protobuf", "flat"], target_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session = session_from_config(
        SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with session.logger("test_name") as logger:
        logger.log_dataframe(pd.DataFrame())
    session.close()
def test_log_multiple_segments(tmpdir):
    """log_segments splits a frame into one profile per (x, y) combination."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session = session_from_config(
        SessionConfig("project", "pipeline", writers=[writer_cfg]))
    frame = pd.DataFrame(
        data={
            "x": [1, 2, 3, 1, 2, 3, 1, 2, 3],
            "y": [4, 5, 6, 5, 6, 4, 6, 4, 5],
            "z": [0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3],
        }
    )
    with session.logger("image_test", segments=["x", "y"]) as logger:
        logger.log_segments(frame)
        # Every (x, y) pair above is distinct, so 9 segmented profiles result.
        assert len(logger.segmented_profiles) == 9
def get_or_create_session():
    """
    Retrieve the current active global session.

    If no active session exists, attempt to load config and create a new
    session. If an active session exists, return it without loading new
    config.

    Returns
    -------
    session : Session
        The global active session
    """
    global _session
    # Reuse an existing, still-active session rather than rebuilding one.
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug(
            "Active session found, ignoring session kwargs")
        return _session
    config = load_config()
    if config is None:
        # Fall back to a local, write-everything default configuration.
        print("WARN: Missing config")
        writer = WriterConfig(type="local", output_path="output", formats=["all"])
        config = SessionConfig("default-project", "default-pipeline", [writer], False)
    _session = session_from_config(config)
    return _session
def get_or_create_session(path_to_config: Optional[str] = None, report_progress: Optional[bool] = False):
    """
    Retrieve the current active global session.

    If no active session exists, attempt to load config and create a new
    session. If an active session exists, return it without loading new
    config.

    :param path_to_config: optional path to a whylogs config file
    :param report_progress: when not None, overrides the loaded config's
        report_progress flag
    :return: The global active session
    :rtype: Session
    """
    global _session
    # Reuse an existing, still-active session rather than rebuilding one.
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug("Active session found, ignoring session kwargs")
        return _session
    config = load_config(path_to_config)
    if config is None:
        # Fall back to a local, write-everything default configuration.
        print("WARN: Missing config")
        config = SessionConfig(
            "default-project",
            "default-pipeline",
            [WriterConfig(type="local", output_path="output", formats=["all"])],
            MetadataConfig(type="local", output_path="output", input_path=""),
            False,
        )
    if report_progress is not None:
        config.report_progress = report_progress
    _session = session_from_config(config)
    return _session
def test_write_template_path():
    """Path and filename templates expand name/timestamps/session id."""
    dataset_ts = time.from_utc_ms(9999)
    session_ts = time.from_utc_ms(88888)
    writer = writer_from_config(
        WriterConfig(
            "local",
            ["protobuf", "flat"],
            "output",
            "$name-$session_timestamp-$dataset_timestamp-$session_id",
            "dataset-profile-$name",
        )
    )
    dp = DatasetProfile("name", dataset_ts, session_ts, session_id="session")
    assert writer.path_suffix(dp) == "name-88888-9999-session"
    assert writer.file_name(dp, ".txt") == "dataset-profile-name.txt"
def test_log_rotation_hour(tmpdir, df):
    """Hourly rotation across a 3-hour jump yields exactly 2 output files."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        with session_from_config(session_cfg) as session:
            with session.logger("test", with_rotation_time="h", cache_size=1) as logger:
                logger.log_dataframe(df)
                frozen_time.tick(delta=datetime.timedelta(hours=3))
                # Second window: a single value plus another full frame.
                logger.log(feature_name="E", value=4)
                logger.log_dataframe(df)
    written = [name for _, _, files in os.walk(out_dir) for name in files]
    assert len(written) == 2
    shutil.rmtree(out_dir, ignore_errors=True)
def reset_default_session():
    """
    Reset and deactivate the global whylogs logging session.
    """
    global _session
    if _session is not None:
        _session.close()
    config = load_config()
    if config is None:
        # Fall back to a local, write-everything default configuration.
        default_writer = WriterConfig(type="local", output_path="output", formats=["all"])
        config = SessionConfig("default-project", "default-pipeline", [default_writer], False)
    _session = session_from_config(config)
def test_log_image(tmpdir, image_files):
    """Logging image files by path yields the 19 expected profile columns."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    session = session_from_config(
        SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with session.logger("image_test") as logger:
        for path in image_files:
            logger.log_image(path)
        assert len(logger.profile.columns) == 19
    shutil.rmtree(out_dir, ignore_errors=True)
def test_segments(df_lending_club, tmpdir):
    """Segment-prefixed tags on each sub-profile carry the expected values."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    test_segments = [
        [{"key": "home_ownership", "value": "RENT"}],
        [{"key": "home_ownership", "value": "MORTGAGE"}],
    ]
    session_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    with session_from_config(session_cfg) as session:
        with session.logger(
            "test",
            segments=test_segments,
            cache_size=1,
        ) as logger:
            logger.log_dataframe(df_lending_club)
            profile = logger.profile
            profiles = logger.segmented_profiles
            mortage_segment = logger.get_segment(test_segments[1])
            # Fully-segmented logging leaves no top-level profile.
            assert profile is None
            assert len(profiles) == 2
            ordered_keys = list(profiles.keys())
            # Each profile's segment-prefixed tags hold that segment's value
            # ('RENT' for the first, 'MORTGAGE' for the second).
            for idx, profile_key in enumerate(ordered_keys):
                expected_value = test_segments[idx][0][_TAG_VALUE]
                tags = profiles[profile_key].tags
                for tag_key in tags.keys():
                    if tag_key.startswith(_TAG_PREFIX):
                        assert tags[tag_key] == expected_value
            assert mortage_segment == profiles[ordered_keys[1]]
    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_pil_image(tmpdir, image_files):
    """PIL images logged under 1-second rotation produce 19 profile columns."""
    output_path = tmpdir.mkdir("whylogs")
    # FIX: use ignore_errors=True like the sibling tests so directory cleanup
    # can never raise and abort/mask the actual test result.
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    # Exercise the YAML round-trip of the writer configuration.
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)
    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)
    with session.logger("image_pil_test", with_rotation_time="s", cache_size=1) as logger:
        for image_file_path in image_files:
            img = Image.open(image_file_path)
            logger.log_image(img)
        profile = logger.profile
        columns = profile.columns
        assert len(columns) == 19
    shutil.rmtree(output_path, ignore_errors=True)
def test_log_rotation_concurrency(tmpdir):
    """Stress-test aggressive (1s) log rotation while logging large frames:
    every non-empty summary file must report the same feature count.
    """
    log_rotation_interval = "1s"
    sleep_interval = 2

    test_path = tmpdir.mkdir("log_rotation_concurrency_repro")
    writer_config = WriterConfig(
        "local", ["json"], test_path.realpath(),
        filename_template="dataset_summary-$dataset_timestamp")

    # Load the full lending club 1000 csv, to get a chance at hitting the bug.
    csv_path = os.path.join(script_dir, "lending_club_1000.csv")
    full_df = pd.read_csv(csv_path)

    # full_df has shape (1000, 151) so create a test df with 4x size by
    # iteratively concatenating with itself 2 times.
    # FIX: DataFrame.append() was deprecated in pandas 1.4 and removed in
    # pandas 2.0; pd.concat is the supported equivalent.
    for _ in range(2):
        full_df = pd.concat([full_df, full_df])
    TEST_LOGGER.info(f"test dataframe has shape {full_df.shape}")

    # Create a whylogs logging session
    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)

    TEST_LOGGER.info(
        f"Running rotate log test with {log_rotation_interval} flush intervals and {sleep_interval}s pause"
    )
    profiler = cProfile.Profile()
    profiler.enable()
    with session.logger(tags={"datasetId": "model-1"}, with_rotation_time=log_rotation_interval) as ylog:
        # Log a larger dataframe to increase chance of rotation before seeing all columns
        ylog.log_dataframe(full_df)
        sleep(sleep_interval)
        # Log a smaller dataframe to get more features before rotation
        ylog.log_dataframe(full_df.head(n=2))
        sleep(sleep_interval)
    profiler.disable()
    stats = pstats.Stats(profiler).sort_stats("cumulative")
    TEST_LOGGER.info(stats.print_stats(10))

    # Collect the full paths of every summary file the logger wrote.
    # NOTE(review): when `root` holds both subdirectories and files, this
    # joins root/subdir/file for files that actually live in `root`; kept
    # as-is because the logger's output layout never hits that case -- confirm.
    output_files = []
    for root, subdir, file_names in os.walk(test_path):
        if not file_names:
            continue
        if subdir:
            for directory in subdir:
                for file in file_names:
                    output_files.append(os.path.join(root, directory, file))
        else:
            for file in file_names:
                output_files.append(os.path.join(root, file))
    assert len(output_files) > 0, "No output files were generated during stress test"
    TEST_LOGGER.debug(f"Generated {len(output_files)} dataset summary files.")

    # FIX: reuse the computed count instead of calling count_features() twice
    # per file.
    feature_counts = []
    for filename in output_files:
        feature_count = count_features(filename)
        if feature_count > 0:
            feature_counts.append((feature_count, filename))
    # FIX: dropped the stray f-prefix on a message with no placeholders (F541).
    assert len(
        feature_counts
    ) > 0, "feature counts are all empty, we expect some empty files with aggressive log rotation but not all empty!"
    TEST_LOGGER.info(
        f"Feature counts all same, first file with features was {feature_counts[0]}"
    )
    TEST_LOGGER.debug(f"There were {len(feature_counts)} files with features.")
    assert_all_elements_equal(feature_counts)
    rmtree(test_path, ignore_errors=True)
    TEST_LOGGER.debug(f"End cleaning up test directory {test_path}")