def test_autologging_dedups_multiple_reads_of_same_datasource(
        spark_session, format_to_file_path):
    """Repeated reads of one datasource produce a single logged entry per run,
    both for reads inside an active run and for reads picked up by the
    context-provider flow (reads performed before the run starts)."""
    kiwi.spark.autolog()
    fmt = list(format_to_file_path.keys())[0]
    path = format_to_file_path[fmt]
    df = (spark_session.read.format(fmt)
          .option("header", "true")
          .option("inferSchema", "true")
          .load(path))

    # Multiple reads during an active run should be deduplicated.
    with kiwi.start_run():
        first_run_id = kiwi.active_run().info.run_id
        df.collect()
        df.filter("number1 > 0").collect()
        df.limit(2).collect()
        df.collect()
        # Allow the async datasource-event subscriber to deliver its tags.
        time.sleep(1)
    first_run = kiwi.get_run(first_run_id)
    _assert_spark_data_logged(run=first_run, path=path, data_format=fmt)

    # Test context provider flow: reads happen before the next run starts.
    df.filter("number1 > 0").collect()
    df.limit(2).collect()
    df.collect()
    with kiwi.start_run():
        second_run_id = kiwi.active_run().info.run_id
        time.sleep(1)
    second_run = kiwi.get_run(second_run_id)
    _assert_spark_data_logged(run=second_run, path=path, data_format=fmt)
def test_autologging_of_datasources_with_different_formats(
        spark_session, format_to_file_path):
    """Datasource info is logged for every format, including DataFrames
    derived via temp views, SQL, filters, projections, and limits."""
    kiwi.spark.autolog()
    for fmt, path in format_to_file_path.items():
        base_df = (spark_session.read.format(fmt)
                   .option("header", "true")
                   .option("inferSchema", "true")
                   .load(path))
        base_df.createOrReplaceTempView("temptable")
        via_table = spark_session.table("temptable")
        via_sql = spark_session.sql(
            "SELECT number1, number2 from temptable LIMIT 5")
        # Each derived DataFrame should still trace back to the base datasource.
        derived = [
            base_df,
            via_table,
            via_sql,
            base_df.filter("number1 > 0"),
            base_df.select("number1"),
            base_df.limit(2),
            base_df.filter("number1 > 0").select("number1").limit(2),
        ]
        for df in derived:
            with kiwi.start_run():
                run_id = kiwi.active_run().info.run_id
                df.collect()
                # Allow the async datasource-event subscriber to deliver tags.
                time.sleep(1)
            run = kiwi.get_run(run_id)
            _assert_spark_data_logged(run=run, path=path, data_format=fmt)
def _fit_keras_model_no_active_run(pandas_df, epochs):
    """Fit a keras model with no run active and return the run that
    autologging created for it (exactly one new run must appear)."""
    ids_before = set(kiwi.search_runs()['run_id'])
    _fit_keras(pandas_df, epochs)
    ids_after = set(kiwi.search_runs()['run_id'])
    # Autologging must have created exactly one additional run.
    assert len(ids_after) == len(ids_before) + 1
    created_run_id = (ids_after - ids_before).pop()
    return kiwi.get_run(created_run_id)
def test_autologging_multiple_reads_same_run(spark_session, format_to_file_path):
    """Reading several distinct datasources within one run accumulates one
    table-info row per datasource in the run's spark table-info tag."""
    kiwi.spark.autolog()
    with kiwi.start_run():
        for fmt, path in format_to_file_path.items():
            run_id = kiwi.active_run().info.run_id
            spark_session.read.format(fmt).load(path).collect()
            # Allow the async datasource-event subscriber to deliver tags.
            time.sleep(1)
        run = kiwi.get_run(run_id)
        assert _SPARK_TABLE_INFO_TAG_NAME in run.data.tags
        expected = "\n".join([
            _get_expected_table_info_row(path, data_format)
            for data_format, path in format_to_file_path.items()
        ])
        assert run.data.tags[_SPARK_TABLE_INFO_TAG_NAME] == expected
def test_fetch_create_and_log(tmpdir):
    """Fetching/validating a project and creating a run for it records the
    expected entry-point, source-name tags, and user parameters."""
    entry_point_name = "entry_point"
    entry_point = _project_spec.EntryPoint(
        entry_point_name, {"method_name": "string"}, "run_model.sh")
    mock_fetched_project = _project_spec.Project(
        None, {entry_point_name: entry_point}, None, "my_project")
    experiment_id = kiwi.create_experiment("test_fetch_project")
    expected_dir = tmpdir
    project_uri = "http://someuri/myproject.git"
    user_param = {"method_name": "newton"}

    with mock.patch("mlflow.projects.utils._fetch_project",
                    return_value=expected_dir), \
            mock.patch("mlflow.projects._project_spec.load_project",
                       return_value=mock_fetched_project):
        work_dir = fetch_and_validate_project("", "", entry_point_name,
                                              user_param)
        project = load_project(work_dir)
        assert mock_fetched_project == project
        assert expected_dir == work_dir
        # Create a run against the fetched project.
        active_run = get_or_create_run(
            run_id=None, uri=project_uri, experiment_id=experiment_id,
            work_dir=work_dir, version=None, entry_point=entry_point_name,
            parameters=user_param)
        # Verify the recorded tags and parameters.
        run = kiwi.get_run(active_run.info.run_id)
        assert MLFLOW_PROJECT_ENTRY_POINT in run.data.tags
        assert MLFLOW_SOURCE_NAME in run.data.tags
        assert entry_point_name == run.data.tags[MLFLOW_PROJECT_ENTRY_POINT]
        assert project_uri == run.data.tags[MLFLOW_SOURCE_NAME]
        assert user_param == run.data.params
def test_autologging_slow_api_requests(spark_session, format_to_file_path):
    """Datasource tags still land on the right run even when the logging API
    requests are slow, because subscriber threads capture the active run at
    notification time."""
    import kiwi.utils.rest_utils
    real_http_request = kiwi.utils.rest_utils.http_request

    def _slow_api_req_mock(*args, **kwargs):
        # Delay only mutating (POST) requests to simulate a slow backend.
        if kwargs.get("method") == "POST":
            print("Sleeping, %s, %s" % (args, kwargs))
            time.sleep(1)
        return real_http_request(*args, **kwargs)

    kiwi.spark.autolog()
    with kiwi.start_run():
        # Mock slow API requests to log Spark datasource information
        with mock.patch(
                'mlflow.utils.rest_utils.http_request') as http_request_mock:
            http_request_mock.side_effect = _slow_api_req_mock
            run_id = kiwi.active_run().info.run_id
            for fmt, path in format_to_file_path.items():
                df = (spark_session.read.format(fmt)
                      .option("header", "true")
                      .option("inferSchema", "true")
                      .load(path))
                df.collect()
                # Sleep a bit prior to ending the run to guarantee that the
                # Python process can pick up on datasource read events
                # (simulate the common case of doing work, e.g. model
                # training, on the DataFrame after reading from it)
                time.sleep(1)

    # Python subscriber threads should pick up the active run at the time
    # they're notified & make API requests against that run, even if those
    # requests are slow.
    time.sleep(5)
    run = kiwi.get_run(run_id)
    assert _SPARK_TABLE_INFO_TAG_NAME in run.data.tags
    expected = "\n".join([
        _get_expected_table_info_row(path, data_format)
        for data_format, path in format_to_file_path.items()
    ])
    assert run.data.tags[_SPARK_TABLE_INFO_TAG_NAME] == expected
def _fit_keras_model_with_active_run(pandas_df, epochs):
    """Fit a keras model while a run is already active and return that run.

    Captures the active run's id before fitting so the same run can be
    re-fetched afterward. (Removed a dead ``run_id = run_id``
    self-assignment that had no effect.)
    """
    run_id = kiwi.active_run().info.run_id
    _fit_keras(pandas_df, epochs)
    return kiwi.get_run(run_id)