def autolog(disable=False, silent=False):  # pylint: disable=unused-argument
    """
    Enables (or disables) and configures logging of Spark datasource paths, versions
    (if applicable), and formats when they are read. This method is not thread-safe and assumes a
    `SparkSession
    <https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession>`_
    already exists with the
    `mlflow-spark JAR
    <http://mlflow.org/docs/latest/tracking.html#automatic-logging-from-spark-experimental>`_
    attached. It should be called on the Spark driver, not on the executors (i.e., do not call
    this method within a function parallelized by Spark). This API requires Spark 3.0 or above.

    Datasource information is cached in memory and logged to all subsequent MLflow runs,
    including the active MLflow run (if one exists when the data is read). Note that autologging
    of Spark ML (MLlib) models is not currently supported via this API. Datasource autologging is
    best-effort, meaning that if Spark is under heavy load or MLflow logging fails for any reason
    (e.g., if the MLflow server is unavailable), logging may be dropped.

    For any unexpected issues with autologging, check the Spark driver and executor logs in
    addition to stderr & stdout generated from your MLflow code - datasource information is
    pulled from Spark, so logs relevant to debugging may show up amongst the Spark logs.

    .. code-block:: python
        :caption: Example

        import os
        import tempfile

        import mlflow.spark
        from pyspark.sql import SparkSession

        # Create and persist some dummy data.
        # Note: On environments like Databricks with pre-created SparkSessions,
        # ensure that the org.mlflow:mlflow-spark:1.11.0 JAR is attached as a
        # library to your cluster.
        spark = (
            SparkSession.builder
            .config("spark.jars.packages", "org.mlflow:mlflow-spark:1.11.0")
            .master("local[*]")
            .getOrCreate()
        )
        df = spark.createDataFrame(
            [
                (4, "spark i j k"),
                (5, "l m n"),
                (6, "spark hadoop spark"),
                (7, "apache hadoop"),
            ],
            ["id", "text"],
        )
        tempdir = tempfile.mkdtemp()
        df.write.csv(os.path.join(tempdir, "my-data-path"), header=True)

        # Enable Spark datasource autologging.
        mlflow.spark.autolog()
        loaded_df = spark.read.csv(
            os.path.join(tempdir, "my-data-path"), header=True, inferSchema=True
        )
        # Call toPandas() to trigger a read of the Spark datasource. Datasource info
        # (path and format) is logged to the current active run, or to the
        # next-created MLflow run if no run is currently active.
        with mlflow.start_run() as active_run:
            pandas_df = loaded_df.toPandas()

    :param disable: If ``True``, disables the Spark datasource autologging integration.
                    If ``False``, enables the Spark datasource autologging integration.
    :param silent: If ``True``, suppress all event logs and warnings from MLflow during Spark
                   datasource autologging. If ``False``, show all events and warnings during
                   Spark datasource autologging.
    """
    from mlflow._spark_autologging import _listen_for_spark_activity
    from mlflow.utils._spark_utils import _get_active_spark_session
    from pyspark import SparkContext
    from pyspark.sql import SparkSession

    def __init__(original, self, *args, **kwargs):
        original(self, *args, **kwargs)

        # Register the datasource event listener on the new session's
        # SparkContext so that subsequent reads are captured.
        _listen_for_spark_activity(self._sc)

    # Patch SparkSession.__init__ so that sessions created after autologging is
    # enabled register the listener automatically. manage_run=False because
    # constructing a SparkSession should not itself create an MLflow run.
    safe_patch(FLAVOR_NAME, SparkSession, "__init__", __init__, manage_run=False)

    active_session = _get_active_spark_session()
    if active_session is not None:
        # We know SparkContext exists here already, so get it
        sc = SparkContext.getOrCreate()
        _listen_for_spark_activity(sc)
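
# Illustrative sketch (not part of the MLflow API): the patching pattern above
# replaces SparkSession.__init__ with a wrapper that first calls the original
# constructor and then registers a listener on the new session. The commented
# snippet below reproduces that pattern in plain Python with hypothetical names
# (`Session`, `_on_session_created`); it is a minimal sketch of the technique,
# not MLflow code, and is kept commented out so this module stays side-effect
# free on import.
#
#     _created = []
#
#     def _on_session_created(session):
#         # Stand-in for _listen_for_spark_activity(session._sc).
#         _created.append(session)
#
#     class Session:
#         def __init__(self, name):
#             self.name = name
#
#     _original_init = Session.__init__
#
#     def _patched_init(self, *args, **kwargs):
#         # Run the real constructor first, then fire the hook, mirroring the
#         # `__init__` wrapper passed to safe_patch above.
#         _original_init(self, *args, **kwargs)
#         _on_session_created(self)
#
#     Session.__init__ = _patched_init
#     Session("demo")
#     assert len(_created) == 1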