Example #1
import time

import mlflow
from mlflow.tracking.client import MlflowClient


def register_best_model(**kwargs):
    """Take the best-performing model, register it under the BestModel name, and ship it to prod."""
    run_id = kwargs["ti"].xcom_pull(task_ids="get_best_model", key="best_model_run_id")
    model_uri = f"runs:/{run_id}/model"
    # Note: this doesn't push the model to prod; it only adds a new version under the registered model.
    model_details = mlflow.register_model(model_uri, "BestModel")

    # This is what promotes it to prod, but it probably shouldn't be automated without testing and human review.
    client = MlflowClient()
    client.transition_model_version_stage(name=model_details.name, version=model_details.version, stage="Production")
    # Give the registry a moment to complete the transition.
    time.sleep(5)
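
# A hedged sketch of how this callable might be scheduled as an Airflow task. The DAG id,
# schedule, and the upstream "get_best_model" task are assumptions inferred from the
# xcom_pull call above, not part of the original pipeline (Airflow 2.x imports).
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id="promote_best_model", start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    register_best_model_task = PythonOperator(
        task_id="register_best_model",
        python_callable=register_best_model,  # Airflow passes the task context (incl. "ti") via **kwargs
    )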

# COMMAND ----------

# MAGIC %md ### Perform a model stage transition
# MAGIC
# MAGIC The MLflow Model Registry defines several model stages: **None**, **Staging**, **Production**, and **Archived**. Each stage has a unique meaning. For example, **Staging** is meant for model testing, while **Production** is for models that have completed the testing or review processes and have been deployed to applications.

# COMMAND ----------

from mlflow.tracking.client import MlflowClient
client = MlflowClient()

client.transition_model_version_stage(
    name=modelDetails.name,
    version=modelDetails.version,
    stage='Production',
)

# COMMAND ----------

# MAGIC %md The MLflow Model Registry allows multiple model versions to share the same stage. When referencing a model by stage, the Model Registry will use the latest model version (the model version with the largest version ID). The `MlflowClient.get_latest_versions()` function fetches the latest model version for a given stage or set of stages. The following cell uses this function to print the latest version of the power forecasting model that is in the `Production` stage.

# COMMAND ----------

latestVersionInfo = client.get_latest_versions(modelName,
                                               stages=["Production"])
latestVersion = latestVersionInfo[0].version
print("The latest production version of the model '%s' is '%s'." %
      (modelName, latestVersion))
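
# Because a stage reference resolves to the latest version in that stage, the same model can
# also be loaded directly by stage URI (a minimal sketch; assumes the model has a pyfunc flavor).
import mlflow.pyfunc

latest_production_model = mlflow.pyfunc.load_model(f"models:/{modelName}/Production")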

# COMMAND ----------

model_name = "linear-regression-model"
artifact_path = "best_model"
model_uri = "runs:/{run_id}/{artifact_path}".format(run_id=run_id, artifact_path=artifact_path)
registered_model = mlflow.register_model(model_uri=model_uri, name=model_name, await_registration_for=120)

# Add a description to the newly registered model version in the Model Registry
client.update_model_version(
  name=registered_model.name,
  version=registered_model.version,
  description="This predicts the age of a customer using transaction history."
)

#Transition a model version to Staging/Prod/Archived
client.transition_model_version_stage(
  name=registered_model.name,
  version=registered_model.version,
  stage='Staging',
)

#Get model version details
model_version = client.get_model_version(
  name=registered_model.name,
  version=registered_model.version,
)

#Load a specific model version from the Model Registry
model_uri = "models:/{model_name}/staging".format(model_name=model_name)

spark_model = mlflow.spark.load_model(model_uri)
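
# Hypothetical scoring with the loaded Spark model: `feature_df` stands in for a DataFrame
# with the same feature columns used at training time (it is not defined in this snippet).
predictions = spark_model.transform(feature_df)
predictions.show(5)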

# COMMAND ----------
        f"Bad current stage '{current_stage}' for model version {model_version}. Should be None or Staging."
    )

# COMMAND ----------

# MAGIC %md ## Make predictions on test data

# COMMAND ----------

from sklearn.metrics import mean_squared_error

# `model_udf` is assumed to be a Spark UDF wrapping the candidate model
# (e.g. created earlier with mlflow.pyfunc.spark_udf).
data = spark.read.format("delta").load(input_data_path)
preds = data.withColumn(
    "prediction", model_udf(*data.drop("quality").columns)).select(
        "quality",
        "prediction").toPandas()  # collecting to pandas is fine here since the DataFrame is small
mse = mean_squared_error(preds["quality"], preds["prediction"])
print(f"MSE: {mse}")

# COMMAND ----------

# MAGIC %md ## Test succeeded: transition the model to Staging

# COMMAND ----------

client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage="Staging",
)

# COMMAND ----------
Example #5
client.update_registered_model(
  name=model_name,
  description="This model forecasts the wine quality based on the characteristics."
)

client.update_model_version(
  name=model_name,
  version=model_version,
  description="This model version was built using sklearn."
)

# COMMAND ----------

client.transition_model_version_stage(
  name=model_name,
  version=model_version,
  stage=stage,
  archive_existing_versions=True
)

model_version_details = client.get_model_version(
  name=model_name,
  version=model_version,
)
print("The current model stage is: '{stage}'".format(stage=model_version_details.current_stage))

latest_version_info = client.get_latest_versions(model_name, stages=[stage])
latest_production_version = latest_version_info[0].version
print("The latest production version of the model '%s' is '%s'." % (model_name, latest_production_version))

# COMMAND ----------
Example #6
# MAGIC 
# MAGIC If you have permission to transition a model to a particular stage, you can make the transition directly by using the `MlflowClient.transition_model_version_stage()` function. If you do not have permission, you can request a stage transition using the REST API; for example:
# MAGIC 
# MAGIC ```
# MAGIC %sh curl -i -X POST -H "X-Databricks-Org-Id: <YOUR_ORG_ID>" -H "Authorization: Bearer <YOUR_ACCESS_TOKEN>" https://<YOUR_DATABRICKS_WORKSPACE_URL>/api/2.0/preview/mlflow/transition-requests/create -d '{"comment": "Please move this model into production!", "model_version": {"version": 1, "registered_model": {"name": "power-forecasting-model"}}, "stage": "Production"}'
# MAGIC ```

# COMMAND ----------
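
# MAGIC %md A minimal Python sketch of the same transition request, using `requests` instead of `curl` (same placeholder workspace URL, org ID, and token as above):

# COMMAND ----------

import requests

response = requests.post(
    "https://<YOUR_DATABRICKS_WORKSPACE_URL>/api/2.0/preview/mlflow/transition-requests/create",
    headers={
        "X-Databricks-Org-Id": "<YOUR_ORG_ID>",
        "Authorization": "Bearer <YOUR_ACCESS_TOKEN>",
    },
    json={
        "comment": "Please move this model into production!",
        "model_version": {"version": 1, "registered_model": {"name": "power-forecasting-model"}},
        "stage": "Production",
    },
)
print(response.status_code, response.text)

# COMMAND ----------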

# MAGIC %md Now that you've learned about stage transitions, transition the model to the `Production` stage.

# COMMAND ----------

client.transition_model_version_stage(
  name=model_details.name,
  version=model_details.version,
  stage='Production',
)

# COMMAND ----------

# MAGIC %md Use the `MlflowClient.get_model_version()` function to fetch the model's current stage.

# COMMAND ----------

model_version_details = client.get_model_version(
  name=model_details.name,
  version=model_details.version,
)
print("The current model stage is: '{stage}'".format(stage=model_version_details.current_stage))

# COMMAND ----------

# The helper functions used below (dates_definitions, time_series_cleaning, pre_processing,
# define_1dcnn_model) and the `spark`, `dbutils`, and `environment` objects are assumed to be
# provided by the surrounding notebook (e.g. via the %run ./utils cell further down).
import time as tm
import traceback

import mlflow
import mlflow.tensorflow
import tensorflow as tf
from mlflow.entities.model_registry.model_version_status import ModelVersionStatus
from mlflow.tracking.client import MlflowClient
from pyspark.sql import functions as F


def train(data_conf, model_conf, **kwargs):

    try:
        print("-----------------------------------")
        print("Starting Cashflow DL Model Training")
        print("-----------------------------------")
        print()

        # ==============================
        # 0. Main parameters definitions
        # ==============================

        # Size of X and y arrays definition
        N_days_X, N_days_y = int(data_conf['number_of_historical_days']), int(
            data_conf['number_of_predicted_days'])  #365, 92
        print('Number of days used for prediction (X): ', N_days_X)
        print('Number of days predicted (y): ', N_days_y)
        print()

        # Date range definition
        start_date, end_date = data_conf['start_date'], data_conf['end_date']
        start_date_dt, end_date_dt, start_date_prediction, end_date_prediction, end_date_plusOneDay, end_date_minus_6month = dates_definitions(
            start_date, end_date, N_days_X, N_days_y)
        print('Date range: ', start_date, end_date)
        print()

        model_name = model_conf['model_name']

    except Exception as e:
        print("Errored on initialization")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.1 Pre-processing before model training
        # ========================================

        # Loading dataset
        table_in = data_conf[environment]['table_to_train_on']
        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        # Cleaning of the time series
        ts_balance = ts_balance.withColumn(
            'balance', ts_balance.balance.cast("array<float>"))

        ts_balance = ts_balance.withColumn(
            'keep_ts',
            F.udf(lambda x, y: time_series_cleaning(x, y), "int")('balance',
                                                                  F.lit(20))
        )  # keep only time series with at least 20 transactions for use in training

        ts_balance = ts_balance.where('keep_ts == 1')

        # Creating the dataset on which we train (and test and validate) the model
        ts_balance_model = ts_balance.sample(
            False, 0.7,
            seed=0)  #now 0.7, but in real case would be 0.1 at best... or 0.05
        print('ts_balance_model.count()', ts_balance_model.count())

        # Pre-processing before model training
        ts_balance_model = pre_processing(ts_balance_model,
                                          end_date,
                                          spark,
                                          serving=False)
        ts_balance_model.show(3)

        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())
        ts_balance_model.show(3)

        # Saving prepared dataset
        table_out = 'cashflow_training_step1'
        #ts_balance_model.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance_model.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

    except Exception as e:
        print("Errored on step T.1: pre-processing before model training")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.2 Generating TRAIN, VAL, TEST datasets
        # ========================================

        # Loading datasets
        table_model = 'cashflow_training_step1'
        #ts_balance_model = spark.read.parquet("/mnt/test/{0}.parquet".format(table_model)).cache()
        ts_balance_model = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_model)).cache()
        ts_balance_model.show(3)

        print('ts_balance_model.count()', ts_balance_model.count())
        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())

        train_set, val_set, test_set = ts_balance_model.randomSplit(
            [0.6, 0.2, 0.2], seed=12345)
        train_set.show(3)
        print(
            'train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(), test_set.rdd.getNumPartitions()',
            train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(),
            test_set.rdd.getNumPartitions())

        # Saving prepared datasets (train, val, test sets to parquet)
        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        table_test = data_conf[environment][
            'table_test_for_performance']  #'cashflow_test'

        train_set.select('X',
                         'y').write.format("delta").mode("overwrite").save(
                             "/mnt/delta/{0}".format(table_train))
        val_set.select('X', 'y').write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_val))
        test_set.select('primaryaccountholder','transactiondate','balance')\
            .write.format("delta").mode("overwrite").save("/mnt/delta/{0}".format(table_test))

    except Exception as e:
        print("Errored on step T.2: pre-processings")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ==============================
        # T.3 MODEL DEFINITION AND TRAIN
        # ==============================

        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        #table_train = spark.read.parquet("/mnt/test/{0}.parquet".format(table_train))
        table_train = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_train))
        #table_val = spark.read.parquet("/mnt/test/{0}.parquet".format(table_val))
        table_val = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_val))
        table_train_count = table_train.count()
        table_val_count = table_val.count()
        #table_train_count, table_val_count

        from pyspark.sql.functions import col
        from petastorm.spark import SparkDatasetConverter, make_spark_converter

        # Set a cache directory on DBFS FUSE for intermediate data.
        spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
                       "file:///dbfs/tmp/petastorm/cache")
        converter_train = make_spark_converter(table_train)
        converter_val = make_spark_converter(table_val)

        print(f"train: {len(converter_train)}, val: {len(converter_val)}")

        def get_compiled_model(N_days_X, N_days_y, model_conf):  #lr=0.001
            #model = get_model(lr=lr)
            model = define_1dcnn_model(N_days_X, N_days_y, model_conf)

            hyperparameters = model_conf['hyperParameters']

            opt = tf.keras.optimizers.Adam()

            # Model compilation
            model.compile(optimizer=opt, loss=hyperparameters['loss'])

            return model

        # Enable auto-logging to MLflow to capture TensorBoard metrics.
        mlflow.tensorflow.autolog(every_n_iter=1)

        model_name = model_conf['model_name']
        mlflow_model_name = model_name
        model_dir = "/tmp/" + model_name
        try:
            dbutils.fs.rm(model_dir, recurse=True)
        except OSError:
            pass

        with mlflow.start_run():

            NUM_EPOCHS = model_conf['hyperParameters']['epochs']  #5
            BATCH_SIZE = model_conf['hyperParameters']['batch_size']  #500

            def train_and_evaluate(N_days_X, N_days_y, model_conf):  #lr=0.001
                model = get_compiled_model(N_days_X, N_days_y, model_conf)  #lr

                with converter_train.make_tf_dataset(batch_size=BATCH_SIZE) as train_dataset, \
                     converter_val.make_tf_dataset(batch_size=BATCH_SIZE) as val_dataset:

                    #train_dataset = train_dataset.map(lambda x: (x.features, x.label_index))
                    train_dataset = train_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    steps_per_epoch = len(converter_train) // BATCH_SIZE

                    #val_dataset = val_dataset.map(lambda x: (x.features, x.label_index))
                    val_dataset = val_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    validation_steps = max(1, len(converter_val) // BATCH_SIZE)

                    print(
                        f"steps_per_epoch: {steps_per_epoch}, validation_steps: {validation_steps}"
                    )

                    hist = model.fit(train_dataset,
                                     steps_per_epoch=steps_per_epoch,
                                     epochs=NUM_EPOCHS,
                                     validation_data=val_dataset,
                                     validation_steps=validation_steps,
                                     verbose=2)
                    return model, hist

            model, hist = train_and_evaluate(N_days_X, N_days_y, model_conf)
            print(hist.history['val_loss'][-1])

            #MLflow logging
            #mlflow.log_artifact(cwd + "data.json")
            #mlflow.log_artifact(cwd + "config.json")
            mlflow.log_param("model_name", str(model_name))
            mlflow.log_param("N_days_X", N_days_X)
            mlflow.log_param("N_days_y", N_days_y)
            mlflow.log_param("start_date", start_date)
            mlflow.log_param("end_date", end_date)
            mlflow.log_param("num_epochs", str(NUM_EPOCHS))
            mlflow.log_param("batch_size", str(BATCH_SIZE))
            #mlflow.log_param("steps_per_epoch", str(steps_per_epoch)) #validation_steps

            # saving using tf.keras.models.save_model
            tf.keras.models.save_model(model, filepath=model_dir +
                                       '/model')  #SavedModel format
            #model.save(filepath=model_dir+'model', save_format="h5")      #H5 format (todo, and look how to register that)

            # Saving with mlflow.tensorflow.save_model (this neither logs nor registers the model, and does not overwrite an existing path)
            #mlflow.tensorflow.save_model(tf_saved_model_dir=model_dir+'/model',
            #                             tf_meta_graph_tags=[tf.compat.v1.saved_model.tag_constants.SERVING],
            #                             tf_signature_def_key='serving_default',
            #                             path = 'model')

            # logging already saved model
            mlflow.tensorflow.log_model(
                tf_saved_model_dir=model_dir + '/model',
                tf_meta_graph_tags=[
                    tf.compat.v1.saved_model.tag_constants.SERVING
                ],
                tf_signature_def_key='serving_default',
                registered_model_name=model_name,
                artifact_path='model')

            # Getting the version number of the newly registered MLflow model (useful for next steps)
            mlflow_model_version = 0
            client_current_model = MlflowClient()
            for mv in client_current_model.search_model_versions(
                    "name='{0}'".format(mlflow_model_name)):
                #if int(dict(mv)['version']) == mlflow_model_version:
                if int(dict(mv)['version']) >= mlflow_model_version:  # find the most recently registered version
                    mlflow_model_version = int(dict(mv)['version'])
                    model_dict = dict(mv)

            # Update 2020-07-17: to grab the latest model version, we could also do it like this (TO BE TESTED!):
            #model_version_infos = client_current_model.search_model_versions(f"name = '{model_name}'")
            #mlflow_model_version = max([model_version_info.version for model_version_info in model_version_infos])

            # Wait until the model is ready
            def wait_until_model_ready(model_name, model_version):
                client = MlflowClient()
                for _ in range(20):
                    model_version_details = client.get_model_version(
                        name=model_name,
                        version=model_version,
                    )
                    status = ModelVersionStatus.from_string(
                        model_version_details.status)
                    print("Model status: %s" %
                          ModelVersionStatus.to_string(status))
                    if status == ModelVersionStatus.READY:
                        break
                    tm.sleep(5)

            wait_until_model_ready(mlflow_model_name, mlflow_model_version)

            # Transition the registered model stage from "None" to "Staging"
            client_current_model.transition_model_version_stage(
                name=mlflow_model_name,
                version=mlflow_model_version,
                stage="Staging",
            )

            # Copy the model files from the driver node to DBFS (so they remain accessible, e.g., after the current cluster terminates):
            dbutils.fs.cp("file:/tmp/{0}/model".format(model_name),
                          "dbfs:/mnt/test/{0}/model".format(model_name),
                          recurse=True)
            print('Model copied here: ',
                  "dbfs:/mnt/test/{0}/model/".format(model_name))

        #mlflow.end_run()

    except Exception as e:
        print("Errored on step T.3: model definition and train")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
    mlflow_model_stage = 'Staging'

    client = MlflowClient()
    for mv in client.search_model_versions(
            "name='{0}'".format(mlflow_model_name)):
        if dict(mv)['current_stage'] == mlflow_model_stage:
            model_dict = dict(mv)

            print('Model extracted run_id: ', model_dict['run_id'])
            print('Model extracted version number: ', model_dict['version'])
            print('Model extracted stage: ', model_dict['current_stage'])

            # Transition the registered model stage from "Staging" to "Production"
            client.transition_model_version_stage(
                name=mlflow_model_name,
                version=model_dict['version'],
                stage="Production",
            )

            print()
            print('Model transitioned to Production')
            break

# COMMAND ----------

# MAGIC %run ./utils

# COMMAND ----------

# MAGIC %md
# MAGIC ### Delete Registered Models

# COMMAND ----------

from mlflow.tracking.client import MlflowClient
client = MlflowClient()

modelName = "Titanic-Model__" + userName
models = client.search_model_versions("name='{}'".format(modelName))

# Loop over all versions of the registered model
for model in models:
  try:
    # Move the version to Archived first (versions in Staging/Production cannot be deleted directly)
    client.transition_model_version_stage(name=modelName, version=model.version, stage='Archived')
  except Exception:
    # Version is already Archived or still in the None stage
    pass
  # Delete this model version
  client.delete_model_version(modelName, model.version)

# delete model
client.delete_registered_model(modelName)

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC &copy; 2020 Databricks, Inc. All rights reserved.<br/>
# MAGIC Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="http://www.apache.org/">Apache Software Foundation</a>.<br/>
# MAGIC <br/>
# MAGIC <a href="https://databricks.com/privacy-policy">Privacy Policy</a> | <a href="https://databricks.com/terms-of-use">Terms of Use</a> | <a href="http://help.databricks.com/">Support</a>