Beispiel #1
0
def test_autologging_dedups_multiple_reads_of_same_datasource(
        spark_session, format_to_file_path):
    """Multiple reads of one datasource in a run are logged as a single entry."""
    kiwi.spark.autolog()
    data_format, file_path = next(iter(format_to_file_path.items()))
    df = (spark_session.read.format(data_format)
          .option("header", "true")
          .option("inferSchema", "true")
          .load(file_path))
    with kiwi.start_run():
        first_run_id = kiwi.active_run().info.run_id
        df.collect()
        df.filter("number1 > 0").collect()
        df.limit(2).collect()
        df.collect()
        # Give the async datasource-event subscriber time to log the tag.
        time.sleep(1)
    first_run = kiwi.get_run(first_run_id)
    _assert_spark_data_logged(run=first_run, path=file_path,
                              data_format=data_format)
    # Test context provider flow: reads happen before the run is started.
    df.filter("number1 > 0").collect()
    df.limit(2).collect()
    df.collect()
    with kiwi.start_run():
        second_run_id = kiwi.active_run().info.run_id
    time.sleep(1)
    second_run = kiwi.get_run(second_run_id)
    _assert_spark_data_logged(run=second_run, path=file_path,
                              data_format=data_format)
Beispiel #2
0
def test_autologging_of_datasources_with_different_formats(
        spark_session, format_to_file_path):
    """Datasource info is logged for every supported format and derived view."""
    kiwi.spark.autolog()
    for data_format, file_path in format_to_file_path.items():
        base_df = (spark_session.read.format(data_format)
                   .option("header", "true")
                   .option("inferSchema", "true")
                   .load(file_path))
        base_df.createOrReplaceTempView("temptable")
        from_table = spark_session.table("temptable")
        from_sql = spark_session.sql(
            "SELECT number1, number2 from temptable LIMIT 5")
        # Derived DataFrames should all trace back to the same source file.
        derived_frames = [
            base_df,
            from_table,
            from_sql,
            base_df.filter("number1 > 0"),
            base_df.select("number1"),
            base_df.limit(2),
            base_df.filter("number1 > 0").select("number1").limit(2),
        ]

        for frame in derived_frames:
            with kiwi.start_run():
                run_id = kiwi.active_run().info.run_id
                frame.collect()
                # Allow the async logging subscriber to record the tag.
                time.sleep(1)
            logged_run = kiwi.get_run(run_id)
            _assert_spark_data_logged(run=logged_run,
                                      path=file_path,
                                      data_format=data_format)
def _fit_keras_model_no_active_run(pandas_df, epochs):
    """Fit a Keras model with no active run; return the run autologging created."""
    run_ids_before = set(kiwi.search_runs()['run_id'])
    _fit_keras(pandas_df, epochs)
    run_ids_after = set(kiwi.search_runs()['run_id'])
    # Exactly one new run must have been created by autologging.
    assert len(run_ids_after) == len(run_ids_before) + 1
    new_run_id = (run_ids_after - run_ids_before).pop()
    return kiwi.get_run(new_run_id)
Beispiel #4
0
def test_autologging_multiple_reads_same_run(spark_session,
                                             format_to_file_path):
    """All datasources read inside a single run land in one newline-joined tag."""
    kiwi.spark.autolog()
    with kiwi.start_run():
        for data_format, file_path in format_to_file_path.items():
            run_id = kiwi.active_run().info.run_id
            spark_session.read.format(data_format).load(file_path).collect()
            # Let the async subscriber record each datasource before the next read.
            time.sleep(1)
        run = kiwi.get_run(run_id)
        assert _SPARK_TABLE_INFO_TAG_NAME in run.data.tags
        expected_tag = "\n".join(
            _get_expected_table_info_row(path, data_format)
            for data_format, path in format_to_file_path.items()
        )
        assert run.data.tags[_SPARK_TABLE_INFO_TAG_NAME] == expected_tag
Beispiel #5
0
def test_fetch_create_and_log(tmpdir):
    """Fetching a project, loading it, and creating a run sets the expected tags."""
    entry_point_name = "entry_point"
    parameters = {
        "method_name": "string",
    }
    entry_point = _project_spec.EntryPoint(entry_point_name, parameters,
                                           "run_model.sh")
    mock_fetched_project = _project_spec.Project(
        None, {entry_point_name: entry_point}, None, "my_project")
    experiment_id = kiwi.create_experiment("test_fetch_project")
    expected_dir = tmpdir
    project_uri = "http://someuri/myproject.git"
    user_param = {"method_name": "newton"}
    # Stub out both the project fetch and the project-spec load.
    with mock.patch("mlflow.projects.utils._fetch_project",
                    return_value=expected_dir), \
            mock.patch("mlflow.projects._project_spec.load_project",
                       return_value=mock_fetched_project):
        work_dir = fetch_and_validate_project("", "", entry_point_name,
                                              user_param)
        assert load_project(work_dir) == mock_fetched_project
        assert work_dir == expected_dir
        # Create a run
        active_run = get_or_create_run(run_id=None,
                                       uri=project_uri,
                                       experiment_id=experiment_id,
                                       work_dir=work_dir,
                                       version=None,
                                       entry_point=entry_point_name,
                                       parameters=user_param)

        # check tags
        run = kiwi.get_run(active_run.info.run_id)
        tags = run.data.tags
        assert MLFLOW_PROJECT_ENTRY_POINT in tags
        assert MLFLOW_SOURCE_NAME in tags
        assert tags[MLFLOW_PROJECT_ENTRY_POINT] == entry_point_name
        assert tags[MLFLOW_SOURCE_NAME] == project_uri
        assert run.data.params == user_param
Beispiel #6
0
def test_autologging_slow_api_requests(spark_session, format_to_file_path):
    """Datasource logging survives slow backend API requests without dropping data."""
    import kiwi.utils.rest_utils
    real_http_request = kiwi.utils.rest_utils.http_request

    def _slow_api_req_mock(*args, **kwargs):
        # Delay only POSTs (the logging writes) to simulate a slow backend.
        if kwargs.get("method") == "POST":
            print("Sleeping, %s, %s" % (args, kwargs))
            time.sleep(1)
        return real_http_request(*args, **kwargs)

    kiwi.spark.autolog()
    with kiwi.start_run():
        # Mock slow API requests to log Spark datasource information
        with mock.patch(
                'mlflow.utils.rest_utils.http_request') as http_request_mock:
            http_request_mock.side_effect = _slow_api_req_mock
            run_id = kiwi.active_run().info.run_id
            for data_format, file_path in format_to_file_path.items():
                frame = (spark_session.read.format(data_format)
                         .option("header", "true")
                         .option("inferSchema", "true")
                         .load(file_path))
                frame.collect()
        # Sleep a bit prior to ending the run to guarantee that the Python process can pick up on
        # datasource read events (simulate the common case of doing work, e.g. model training,
        # on the DataFrame after reading from it)
        time.sleep(1)

    # Python subscriber threads should pick up the active run at the time they're notified
    # & make API requests against that run, even if those requests are slow.
    time.sleep(5)
    run = kiwi.get_run(run_id)
    assert _SPARK_TABLE_INFO_TAG_NAME in run.data.tags
    expected_tag = "\n".join(
        _get_expected_table_info_row(path, data_format)
        for data_format, path in format_to_file_path.items()
    )
    assert run.data.tags[_SPARK_TABLE_INFO_TAG_NAME] == expected_tag
def _fit_keras_model_with_active_run(pandas_df, epochs):
    """Fit a Keras model under the caller's already-active run.

    Captures the active run id before fitting (autologging may end/modify run
    state) and returns the refreshed Run so assertions see the logged data.
    """
    run_id = kiwi.active_run().info.run_id
    _fit_keras(pandas_df, epochs)
    # Removed a leftover no-op self-assignment (`run_id = run_id`).
    return kiwi.get_run(run_id)