Beispiel #1
0
def test_autologging_dedups_multiple_reads_of_same_datasource(
        spark_session, format_to_file_path):
    """Collect one datasource many times and verify both runs via the helper.

    The DataFrame loaded from the first ``(format, path)`` entry of
    ``format_to_file_path`` is collected repeatedly (directly and through
    ``filter`` / ``limit``), first while a run is active and then again with
    no run active before a second run is opened. Each run is fetched back and
    handed to ``_assert_spark_data_logged``.
    """
    mlflow.spark.autolog()
    # Only the first entry of the fixture mapping is exercised here.
    data_format, file_path = list(format_to_file_path.items())[0]
    reader = spark_session.read.format(data_format)
    reader = reader.option("header", "true").option("inferSchema", "true")
    df = reader.load(file_path)

    # Phase 1: all reads happen inside an active run.
    with mlflow.start_run():
        first_run_id = mlflow.active_run().info.run_id
        df.collect()
        df.filter("number1 > 0").collect()
        df.limit(2).collect()
        df.collect()
        # NOTE(review): presumably leaves time for the datasource info to be
        # recorded before the run ends -- confirm.
        time.sleep(1)
    first_run = mlflow.get_run(first_run_id)
    _assert_spark_data_logged(
        run=first_run, path=file_path, data_format=data_format)

    # Phase 2 (context provider flow): the reads happen with no active run,
    # and an otherwise empty run is started afterwards.
    df.filter("number1 > 0").collect()
    df.limit(2).collect()
    df.collect()
    with mlflow.start_run():
        second_run_id = mlflow.active_run().info.run_id
    time.sleep(1)
    second_run = mlflow.get_run(second_run_id)
    _assert_spark_data_logged(
        run=second_run, path=file_path, data_format=data_format)
Beispiel #2
0
def test_autologging_of_datasources_with_different_formats(
        spark_session, format_to_file_path):
    """Collect several views of each datasource, one MLflow run per view.

    For every ``(format, path)`` entry of ``format_to_file_path`` the data is
    loaded, registered as the temp view ``temptable``, and a set of DataFrames
    derived from it (table lookup, SQL query, filter, select, limit and a
    combination of those) is collected. Each collect runs in its own run,
    which is then passed to ``_assert_spark_data_logged``.
    """
    mlflow.spark.autolog()
    for data_format, file_path in format_to_file_path.items():
        reader = spark_session.read.format(data_format)
        reader = reader.option("header", "true").option("inferSchema", "true")
        source_df = reader.load(file_path)

        # Views reached through the catalog rather than the reader.
        source_df.createOrReplaceTempView("temptable")
        whole_table_df = spark_session.table("temptable")
        sql_df = spark_session.sql(
            "SELECT number1, number2 from temptable LIMIT 5")

        # Views derived directly from the loaded DataFrame.
        filtered_df = source_df.filter("number1 > 0")
        projected_df = source_df.select("number1")
        limited_df = source_df.limit(2)
        combined_df = (
            source_df.filter("number1 > 0").select("number1").limit(2))

        frames = (
            source_df,
            whole_table_df,
            sql_df,
            filtered_df,
            projected_df,
            limited_df,
            combined_df,
        )
        for frame in frames:
            with mlflow.start_run():
                current_run_id = mlflow.active_run().info.run_id
                frame.collect()
                time.sleep(1)
            finished_run = mlflow.get_run(current_run_id)
            _assert_spark_data_logged(
                run=finished_run, path=file_path, data_format=data_format)
def test_enabling_autologging_before_spark_session_works(disable):
    """Check autologging that is configured before the Spark session exists.

    ``mlflow.spark.autolog`` is called first and the Spark session is created
    only afterwards. A one-row dataset is written as CSV into a temporary
    directory, read back, and collected inside an MLflow run.

    The temporary directory is removed and the Spark session stopped in
    ``finally`` clauses, so a failing assertion or Spark error does not leak
    either of them.

    :param disable: Forwarded to ``mlflow.spark.autolog(disable=...)``. When
                    truthy the run is checked with
                    ``_assert_spark_data_not_logged``; otherwise with
                    ``_assert_spark_data_logged`` for the CSV path.
                    NOTE(review): presumably supplied by a parametrize
                    decorator or fixture not visible here -- confirm.
    """
    mlflow.spark.autolog(disable=disable)

    # creating spark session AFTER autolog was enabled
    spark_session = _get_or_create_spark_session()
    try:
        rows = [Row(100)]
        schema = StructType([StructField("number2", IntegerType())])
        rdd = spark_session.sparkContext.parallelize(rows)
        df = spark_session.createDataFrame(rdd, schema)

        tempdir = tempfile.mkdtemp()
        try:
            filepath = os.path.join(tempdir, "test-data")
            df.write.option("header", "true").format("csv").save(filepath)

            read_df = (spark_session.read.format("csv").option(
                "header", "true").option("inferSchema", "true").load(filepath))

            with mlflow.start_run():
                run_id = mlflow.active_run().info.run_id
                read_df.collect()
                time.sleep(1)

            run = mlflow.get_run(run_id)
            if disable:
                _assert_spark_data_not_logged(run=run)
            else:
                _assert_spark_data_logged(
                    run=run, path=filepath, data_format="csv")
        finally:
            # Always remove the scratch data, even when an assertion fails.
            shutil.rmtree(tempdir)
    finally:
        # Always stop the session this test created.
        spark_session.stop()
Beispiel #4
0
def test_spark_autologging_with_sklearn_autologging(spark_session, data_format,
                                                    file_path):
    """Enable Spark and sklearn autologging together and fit a model.

    The ``number1`` / ``number2`` columns of the datasource are converted to
    pandas and passed to ``_fit_sklearn_model``; the run it returns is checked
    with ``_assert_spark_data_logged``. No run may be active before or after.
    """
    assert mlflow.active_run() is None
    mlflow.spark.autolog()
    mlflow.sklearn.autolog()

    reader = spark_session.read.format(data_format)
    reader = reader.option("header", "true").option("inferSchema", "true")
    features_df = reader.load(file_path).select("number1", "number2")
    pandas_df = features_df.toPandas()

    fitted_run = _fit_sklearn_model(pandas_df)
    _assert_spark_data_logged(fitted_run, file_path, data_format)
    assert mlflow.active_run() is None
Beispiel #5
0
def test_autologging_multiple_runs_same_data(spark_session,
                                             format_to_file_path):
    """Collect a datasource once, then check two consecutive runs.

    The DataFrame is collected a single time while no run is active. Two runs
    are then started one after the other; each is fetched while still open
    and passed to ``_assert_spark_data_logged``.
    """
    mlflow.spark.autolog()
    # Only the first entry of the fixture mapping is exercised here.
    data_format, file_path = list(format_to_file_path.items())[0]
    reader = spark_session.read.format(data_format)
    reader = reader.option("header", "true").option("inferSchema", "true")
    source_df = reader.load(file_path)
    source_df.collect()

    for _ in range(2):
        with mlflow.start_run():
            time.sleep(1)
            active_run_id = mlflow.active_run().info.run_id
            fetched_run = mlflow.get_run(active_run_id)
            _assert_spark_data_logged(
                run=fetched_run, path=file_path, data_format=data_format)
Beispiel #6
0
def test_autologging_disabled_then_enabled(spark_session, format_to_file_path):
    """Toggle Spark autologging from disabled to enabled between two runs.

    The first run collects the DataFrame after ``autolog(disable=True)`` and
    is checked with ``_assert_spark_data_not_logged``. The second run collects
    a filtered view after ``autolog(disable=False)`` and is checked with
    ``_assert_spark_data_logged``.
    """
    mlflow.spark.autolog(disable=True)
    # Only the first entry of the fixture mapping is exercised here.
    data_format, file_path = list(format_to_file_path.items())[0]
    reader = spark_session.read.format(data_format)
    reader = reader.option("header", "true").option("inferSchema", "true")
    source_df = reader.load(file_path)

    # First run: autologging is switched off.
    with mlflow.start_run():
        disabled_run_id = mlflow.active_run().info.run_id
        source_df.collect()
        time.sleep(1)
    disabled_run = mlflow.get_run(disabled_run_id)
    _assert_spark_data_not_logged(run=disabled_run)

    # Second run: autologging is switched back on.
    mlflow.spark.autolog(disable=False)
    with mlflow.start_run():
        enabled_run_id = mlflow.active_run().info.run_id
        source_df.filter("number1 > 0").collect()
        time.sleep(1)
    enabled_run = mlflow.get_run(enabled_run_id)
    _assert_spark_data_logged(
        run=enabled_run, path=file_path, data_format=data_format)
Beispiel #7
0
def test_spark_sklearn_autologging_context_provider(spark_session, data_format,
                                                    file_path):
    """Fit sklearn models in two explicit runs fed from one Spark datasource.

    The first run uses a pandas frame produced before any run was started;
    the second converts a filtered view to pandas inside the run. Both runs
    returned by ``_fit_sklearn_model`` are checked with
    ``_assert_spark_data_logged`` and must have different run ids.
    """
    mlflow.spark.autolog()
    mlflow.sklearn.autolog()

    reader = spark_session.read.format(data_format)
    reader = reader.option("header", "true").option("inferSchema", "true")
    features_df = reader.load(file_path).select("number1", "number2")
    # Converted while no run is active.
    pandas_df = features_df.toPandas()

    # The datasource info is expected on this first run: it should have been
    # handed to the context provider by the toPandas() call above and then
    # logged once the run starts.
    with mlflow.start_run():
        run = _fit_sklearn_model(pandas_df)
    _assert_spark_data_logged(run, file_path, data_format)

    # Second run: the Spark read happens inside the run itself.
    with mlflow.start_run():
        filtered_pandas_df = features_df.filter("number1 > 0").toPandas()
        run2 = _fit_sklearn_model(filtered_pandas_df)
    assert run2.info.run_id != run.info.run_id
    _assert_spark_data_logged(run2, file_path, data_format)

    time.sleep(1)
    assert mlflow.active_run() is None