Example #1
def test_log_pil_image(tmpdir, image_files):
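    # Logs PIL Image objects through a logger with "1m" time-based rotation and
    # checks that every column in the _EXPECTED_COLUMNS list referenced below
    # (defined elsewhere in the test module) appears in the resulting profile.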
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("image_pil_test",
                        with_rotation_time="1m",
                        cache_size=1) as logger:

        for image_file_path in image_files:
            img = Image.open(image_file_path)
            logger.log_image(img)

        profile = logger.profile
        columns = profile.columns
        for column_name in _EXPECTED_COLUMNS:
            assert column_name in columns, f"{column_name} not found in {columns}"
    shutil.rmtree(output_path, ignore_errors=True)
Example #2
def test_log_metrics_with_boolean_labels(tmpdir):
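    # log_metrics with boolean targets and predictions: the resulting model
    # profile's confusion matrix should contain exactly two labels (True and False).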
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)
    targets = [True, False, True]

    predictions = [False, True, False]
    scores = [0.2, 0.5, 0.6]
    with session.logger("metrics_test") as logger:
        logger.log_metrics(targets, predictions, scores)

        profile = logger.profile
        metrics_profile = profile.model_profile

        assert metrics_profile is not None
        assert len(metrics_profile.metrics.confusion_matrix.labels) == 2
    shutil.rmtree(output_path, ignore_errors=True)
Example #3
def test_s3_writer_metadata(df_lending_club, moto_boto,
                            s3_all_config_metadata_path):
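    # Uses the moto_boto fixture to mock S3: segment estimation should write
    # metadata/segments.json, and after logging a dataframe the bucket contents
    # are compared against object_keys_meta_config (defined elsewhere in the
    # test module).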

    assert os.path.exists(s3_all_config_metadata_path)

    config = load_config(s3_all_config_metadata_path)
    session = session_from_config(config)
    session.estimate_segments(df_lending_club,
                              name="dataset_test",
                              target_field="funded_amnt_inv",
                              max_segments=30)
    client = boto3.client("s3")
    objects = client.list_objects(Bucket="mocked_bucket")

    for idx, each_objc in enumerate(objects["Contents"]):
        assert each_objc["Key"] == "metadata/segments.json"

    with session.logger("dataset_test") as logger:
        logger.log_dataframe(df_lending_club)
    session.close()

    objects = client.list_objects(Bucket="mocked_bucket")
    print(objects)
    for idx, each_objc in enumerate(objects["Contents"]):
        print(each_objc["Key"])
        assert each_objc["Key"] == object_keys_meta_config[idx]
Example #4
def profile_csv(session_config: SessionConfig, project_dir: str) -> str:
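    # Interactive CLI helper: prompts for a CSV path (falling back to the bundled
    # Lending Club demo file), profiles it with a session built from the given
    # SessionConfig, and returns the resolved input path.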
    package_nb_path = os.path.join(os.path.dirname(__file__), "notebooks")
    demo_csv = os.path.join(package_nb_path, LENDING_CLUB_CSV)
    file: io.TextIOWrapper = click.prompt(
        "CSV input path (leave blank to use our demo dataset)",
        type=click.File(mode="rt"),
        default=io.StringIO(),
        show_default=False,
    )
    if type(file) is io.StringIO:
        echo("Using the demo Lending Club Data (1K randomized samples)",
             fg="green")
        destination_csv = os.path.join(project_dir, LENDING_CLUB_CSV)
        echo("Copying the demo file to: %s" % destination_csv)
        shutil.copy(demo_csv, destination_csv)
        full_input = os.path.realpath(destination_csv)
    else:
        file.close()
        full_input = os.path.realpath(file.name)
    echo(f"Input file: {full_input}")
    echo(RUN_PROFILING)
    session = session_from_config(session_config)
    df = pd.read_csv(full_input)
    session.log_dataframe(df)
    session.close()
    return full_input
Example #5
def test_log_metrics(tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)
    targets = ["class_name1", "class_name2", "class_name3"]

    predictions = ["class_name1", "class_name2", "class_name2"]
    scores = [0.2, 0.5, 0.6]
    num_labels = 3
    with session.logger("metrics_test") as logger:

        logger.log_metrics(targets, predictions, scores)

        profile = logger.profile
        metrics_profile = profile.model_profile

        assert metrics_profile is not None
        assert len(
            metrics_profile.metrics.confusion_matrix.labels) == num_labels
    shutil.rmtree(output_path)
Example #6
def test_segments_with_rotation(df_lending_club, tmpdir):
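    # Combines segmentation with per-second log rotation under freeze_time;
    # logging a dataframe without the "home_ownership" segment column should
    # raise a KeyError, and the run should leave 8 rotated files on disk.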
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig(
        "project", "pipeline", writers=[writer_config])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_config)
        with session.logger("test", with_rotation_time='s', segments=["home_ownership"], profile_full_dataset=True, cache=1) as logger:
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))

            df = util.testing.makeDataFrame()
            with pytest.raises(KeyError):
                logger.log_dataframe(df)
    output_files = []
    for root, subdirs, files in os.walk(output_path):
        output_files += files
    assert len(output_files) == 8
    shutil.rmtree(output_path)
Example #7
def test_log_rotation_days(tmpdir):
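    # Daily rotation: with the clock frozen, the logger is ticked across three
    # distinct days, so exactly three rotated protobuf files should be written.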
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_config)
        with session.logger("test", with_rotation_time='d',
                            cache_size=1) as logger:
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
            frozen_time.tick(delta=datetime.timedelta(days=1))
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
            frozen_time.tick(delta=datetime.timedelta(days=2))
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
    output_files = []
    for root, subdirs, files in os.walk(output_path):
        output_files += files
    assert len(output_files) == 3
    shutil.rmtree(output_path)
Example #8
def test_log_multiple_calls(tmpdir, df_lending_club):
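    # Opens a new logger five times, each with a different dataset_timestamp and
    # a filename template keyed on $dataset_timestamp; with both the protobuf and
    # flat writers enabled this should leave 25 output files.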
    original_dir = os.getcwd()  # remember the starting directory so it can be restored below
    os.chdir(script_dir)

    p = tmpdir.mkdir("whylogs")

    writer_config = WriterConfig("local", ["protobuf", "flat"], p.realpath(
    ), filename_template="dataset_summary-$dataset_timestamp")
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig(
        "project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)

    now = datetime.datetime.now()
    for i in range(0, 5):
        with session.logger(dataset_timestamp=now + datetime.timedelta(days=i)) as logger:
            logger.log_dataframe(df_lending_club)

    output_files = []
    for root, subdirs, files in os.walk(p):
        output_files += files
    # we run 5 times, so we should have five times more files than the above test
    assert len(output_files) == 25
    os.chdir(original_dir)
Example #9
def test_log_dataframe(tmpdir, df_lending_club):
    p = tmpdir.mkdir("whylogs")

    writer_config = WriterConfig("local", ["protobuf", "flat"], p.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig(
        "project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)

    with session.logger("lendingclub") as logger:
        assert logger is not None
        logger.log_dataframe(df_lending_club)
        profile = logger.profile
        assert profile is not None

        summary = profile.flat_summary()

        flat_summary = summary['summary']

        assert len(flat_summary) == 151

    output_files = []
    for root, subdirs, files in os.walk(p):
        output_files += files
    assert len(output_files) == 5
Example #10
def test_session_log_dataframe(df):

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    session.log_dataframe(df)

    assert session.logger() is not None

    assert session.logger("default-project").dataset_name == "default-project"
Example #11
def test_session_log_dataframe():

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    df = util.testing.makeDataFrame()
    profile = session.log_dataframe(df)

    assert session.logger() is not None

    assert session.logger("default-project").dataset_name == "default-project"
Example #12
def test_session_profile(df):

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    profile = session.log_dataframe(df)
    assert profile is not None

    summary = profile.flat_summary()

    flat_summary = summary["summary"]
    assert len(flat_summary) == 4
Example #13
def test_session_profile():

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    df = util.testing.makeDataFrame()
    profile = session.log_dataframe(df)
    assert profile is not None

    summary = profile.flat_summary()

    flat_summary = summary['summary']
    assert len(flat_summary) == 4
Example #14
def test_segments(df_lending_club, tmpdir):
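    # Explicit segment definitions: only the RENT and MORTGAGE segments are
    # profiled (logger.profile stays None), and get_segment returns the same
    # profile that segmented_profiles holds for the MORTGAGE tag.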
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])
    with session_from_config(session_config) as session:
        with session.logger(
                "test",
                segments=[
                    [{
                        "key": "home_ownership",
                        "value": "RENT"
                    }],
                    [{
                        "key": "home_ownership",
                        "value": "MORTGAGE"
                    }],
                ],
                cache_size=1,
        ) as logger:
            logger.log_dataframe(df_lending_club)
            profile = logger.profile
            profiles = logger.segmented_profiles
            mortgage_segment = logger.get_segment([{
                "key": "home_ownership",
                "value": "MORTGAGE"
            }])

    assert profile is None
    assert len(profiles) == 2
    assert profiles[list(profiles.keys())[0]].tags["segment"] == json.dumps([{
        "key":
        "home_ownership",
        "value":
        "RENT"
    }])
    assert profiles[list(profiles.keys())[1]].tags["segment"] == json.dumps([{
        "key":
        "home_ownership",
        "value":
        "MORTGAGE"
    }])
    check_segment = profiles[list(profiles.keys())[1]]
    assert mortgage_segment == check_segment

    shutil.rmtree(output_path, ignore_errors=True)
Example #15
def test_config_api(tmpdir):
    p = tmpdir.mkdir("whylogs")

    writer_config = WriterConfig("local", ["protobuf", "flat"], p.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("test_name") as logger:
        logger.log_dataframe(pd.DataFrame())
    session.close()
Example #16
def test_segments_keys(df_lending_club, tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)
    with session.logger("test", segments=["emp_title", "home_ownership"], cache_size=1) as logger:
        logger.log_dataframe(df_lending_club)
        profiles = logger.segmented_profiles
        assert len(profiles) == 47
    shutil.rmtree(output_path, ignore_errors=True)
Example #17
def test_log_multiple_segments(tmpdir):
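    # log_segments on a dataframe segmented by the "x" and "y" columns; the nine
    # distinct (x, y) combinations should yield nine segmented profiles.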
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])

    session = session_from_config(session_config)

    df = pd.DataFrame(data={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3], "y": [4, 5, 6, 5, 6, 4, 6, 4, 5], "z": [0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3]})
    with session.logger("image_test", segments=["x", "y"]) as logger:
        logger.log_segments(df)
        assert len(logger.segmented_profiles) == 9
Example #18
def test_s3_writer(df_lending_club, moto_boto, s3_all_config_path):

    assert os.path.exists(s3_all_config_path)

    config = load_config(s3_all_config_path)
    session = session_from_config(config)

    with session.logger("dataset_test_s3") as logger:
        logger.log_dataframe(df_lending_club)
    session.close()

    client = boto3.client("s3")
    objects = client.list_objects(Bucket="mocked_bucket")
    for idx, each_objc in enumerate(objects["Contents"]):
        assert each_objc["Key"] == object_keys[idx]
Example #19
def test_mlflow_patched(mlflow_config_path):
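    # enable_mlflow(session) should patch mlflow in place, exposing the
    # mlflow.whylogs entry point; disable_mlflow() undoes the patch at the end.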
    import mlflow

    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)

    config = load_config(mlflow_config_path)
    session = session_from_config(config)

    assert whylogs.enable_mlflow(session)
    assert mlflow.whylogs is not None
    print("HEY LISTEN")
    whylogs.mlflow.disable_mlflow()
Example #20
def test_s3_writer(df_lending_club, moto_boto, s3_config_path):

    assert os.path.exists(s3_config_path)

    config = load_config(s3_config_path)
    session = session_from_config(config)

    with session.logger("dataset_test_s3") as logger:
        logger.log_dataframe(df_lending_club)

    client = boto3.client('s3')
    objects = client.list_objects(Bucket="mocked_bucket")

    assert len([each_obj["Key"] for each_obj in objects["Contents"]]) == 1
    assert objects["Contents"][0]["Key"] == "dataset_test_s3/dataset_summary/protobuf/dataset_summary.bin"
    assert "s3:" not in [d.name for d in os.scandir(
        os.getcwd()) if d.is_dir()]
Example #21
def test_profile_viewer(tmpdir, local_config_path):
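    # Logs a handful of values (including nulls), then renders the profile with
    # profile_viewer and checks that the HTML report is written to the requested
    # output path, which is also returned.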

    config = load_config(local_config_path)
    session = session_from_config(config)

    with session.logger("mytestytest",
                        dataset_timestamp=datetime.datetime(2021, 6,
                                                            2)) as logger:
        for _ in range(5):
            logger.log({"uniform_integers": np.random.randint(0, 50)})
            logger.log({"nulls": None})

        profile = logger.profile
    result = profile_viewer(profiles=[profile],
                            output_path=tmpdir + "my_test.html")
    assert os.path.exists(tmpdir + "my_test.html")
    assert result == tmpdir + "my_test.html"
Example #22
def test_patch_multiple_times(mlflow_config_path):
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)

    config = load_config(mlflow_config_path)
    session = session_from_config(config)

    # patch three times
    assert whylogs.enable_mlflow(session)
    assert whylogs.enable_mlflow(session)
    assert whylogs.enable_mlflow(session)

    import mlflow

    assert mlflow.whylogs is not None
    whylogs.mlflow.disable_mlflow()
Example #23
def test_get_run_profiles_shouldReturn_multipleProfiles(
        tmpdir, mlflow_config_path):
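    # Logs to both the default dataset name and "another-profile" across two
    # mlflow runs, then checks that get_run_profiles and get_experiment_profiles
    # return the expected profile counts per dataset name.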
    import mlflow

    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)

    config = load_config(mlflow_config_path)
    session = session_from_config(config)

    set_up_mlflow(mlflow, tmpdir)
    whylogs.enable_mlflow(session)

    with mlflow.start_run():
        mlflow.whylogs.log(features={"a": 1})
        mlflow.whylogs.log(features={"a": 1}, dataset_name="another-profile")

    with mlflow.start_run():
        mlflow.whylogs.log(features={"a": 1}, dataset_name="another-profile")

    runs = whylogs.mlflow.list_whylogs_runs("0")
    default_profiles = whylogs.mlflow.get_run_profiles(run_id=runs[0].run_id)
    another_profile = whylogs.mlflow.get_run_profiles(
        run_id=runs[0].run_id, dataset_name="another-profile")

    assert len(runs) == 2
    # verify the number of profiles for each dataset name
    assert len(
        whylogs.mlflow.get_experiment_profiles("0",
                                               dataset_name="default")) == 2
    assert len(
        whylogs.mlflow.get_experiment_profiles(
            "0", dataset_name="another-profile")) == 2

    # for the first run, verify content
    assert len(default_profiles) == 1
    assert len(another_profile) == 1
    # assert default_profiles[0].name == "default"
    assert default_profiles[0].dataset_timestamp is not None
    assert another_profile[0].dataset_timestamp is not None
Example #24
def test_assert_whylogsrun_close_is_called(tmpdir, mlflow_config_path):
    import mlflow

    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)

    config = load_config(mlflow_config_path)
    session = session_from_config(config)

    set_up_mlflow(mlflow, tmpdir)
    with mock.patch.object(whylogs.mlflow.patcher.WhyLogsRun,
                           "_close") as mock_close:
        whylogs.enable_mlflow(session)
        with mlflow.start_run():
            pass

        mock_close.assert_called_once()
    whylogs.mlflow.disable_mlflow()
Example #25
def test_assert_log_artifact_is_called(tmpdir, mlflow_config_path):
    import mlflow

    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)

    config = load_config(mlflow_config_path)
    session = session_from_config(config)

    set_up_mlflow(mlflow, tmpdir)
    with mock.patch.object(mlflow, "log_artifact") as log_artifact:
        whylogs.enable_mlflow(session)
        with mlflow.start_run():
            mlflow.whylogs.log(features={"a": 1})

        log_artifact.assert_called_once()

    whylogs.mlflow.disable_mlflow()
Example #26
def test_log_rotation_hour(tmpdir, df):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        with session_from_config(session_config) as session:
            with session.logger("test", with_rotation_time="h", cache_size=1) as logger:
                logger.log_dataframe(df)
                frozen_time.tick(delta=datetime.timedelta(hours=3))
                logger.log(feature_name="E", value=4)
                logger.log_dataframe(df)

    output_files = []
    for _, _, files in os.walk(output_path):
        output_files += files
    assert len(output_files) == 2
    shutil.rmtree(output_path, ignore_errors=True)
Example #27
def test_log_image(tmpdir, image_files):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("image_test") as logger:

        for image_file_path in image_files:
            logger.log_image(image_file_path)

        profile = logger.profile
        columns = profile.columns
        assert len(columns) == 19
    shutil.rmtree(output_path, ignore_errors=True)
Example #28
def test_listRuns_shouldReturn_NoRuns(tmpdir, mlflow_config_path):
    import mlflow

    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)

    config = load_config(mlflow_config_path)
    session = session_from_config(config)

    set_up_mlflow(mlflow, tmpdir)
    whylogs.enable_mlflow(session)

    for i in range(0, 10):
        with mlflow.start_run():
            pass

    assert len(mlflow.list_run_infos("0")) == 10
    assert len(whylogs.mlflow.list_whylogs_runs("0")) == 0
    whylogs.mlflow.disable_mlflow()
Example #29
def test_listRuns_shouldReturn_CorrectRunCount(tmpdir, mlflow_config_path):
    import mlflow

    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)

    config = load_config(mlflow_config_path)
    session = session_from_config(config)

    set_up_mlflow(mlflow, tmpdir)
    whylogs.enable_mlflow(session)

    for i in range(0, 10):
        with mlflow.start_run():
            if i % 2 == 0:
                mlflow.whylogs.log(features={"a": 1})
    print("WEIRD")
    assert len(mlflow.list_run_infos("0")) == 10
    assert len(whylogs.mlflow.list_whylogs_runs("0")) == 5
    assert len(whylogs.mlflow.get_experiment_profiles("0")) == 5
    whylogs.mlflow.disable_mlflow()
Example #30
def test_log_pil_image(tmpdir, image_files):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("image_pil_test", with_rotation_time="s",
                        cache_size=1) as logger:

        for image_file_path in image_files:
            img = Image.open(image_file_path)
            logger.log_image(img)

        profile = logger.profile
        columns = profile.columns
        assert len(columns) == 19
    shutil.rmtree(output_path)