Example 1
def score(data_conf, model_conf, evaluation=False, **kwargs):

    try:
        print()
        print("-----------------------------------")
        print("         Model Serving             ")
        print("-----------------------------------")
        print()

        # ==============================
        # 1.0 Data Loading
        # ==============================

        #         # USING IRIS DATASET:
        #         iris = load_iris()                  #The Iris dataset is available through the scikit-learn API
        #         idx = list(range(len(iris.target)))
        #         np.random.shuffle(idx)              #We shuffle it (important if we want to split in train and test sets)
        #         X = iris.data[idx]
        #         y = iris.target[idx]

        #         # Load data in Pandas dataFrame and then in a Pyspark dataframe
        #         data_pd = pd.DataFrame(data=np.column_stack((X,y)), columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
        #         data_df = spark.createDataFrame(data_pd)

        #if not evaluation: table_in = data_conf[env]['input_to_score'] # for scoring new data
        #if evaluation: table_in = data_conf[env]['input_test'] # for performance evaluation on historical data
        #data_df = spark.table(table_in)
        data_df = spark.read.format("delta").load(
            "/mnt/delta/{0}".format('test_data_spark_rf'))

        data_df.show(5)
        print("Step 1.0 completed: Loaded dataset in Spark")

    except Exception as e:
        print("Errored on 1.0: data loading")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ===================
        # 1.1 Model serving
        # ===================

        # Load model from MLflow model registry #https://www.mlflow.org/docs/latest/model-registry.html
        if env == 'PROD':
            mlflow_model_stage = 'Production'
        else:
            mlflow_model_stage = 'Staging'

        # Find the model version currently in the target stage among those registered in the MLflow Model Registry.
        client = MlflowClient()
        for mv in client.search_model_versions(
                "name='{0}'".format(mlflow_model_name)):
            if dict(mv)['current_stage'] == mlflow_model_stage:
                model_dict = dict(mv)
                break

        print('Model extracted run_id: ', model_dict['run_id'])
        print('Model extracted version number: ', model_dict['version'])
        print('Model extracted stage: ', model_dict['current_stage'])

        def get_local_path_from_dbfs(dbfs_path):
            '''
            Returns the local filesystem view of a DBFS path, i.e. replaces the
            "dbfs:" scheme with "/dbfs", for use with local file APIs.
            '''
            # str.lstrip strips a character *set*, not a prefix, so an explicit
            # prefix check is safer here.
            if dbfs_path.startswith("dbfs:"):
                dbfs_path = dbfs_path[len("dbfs:"):]
            return "/dbfs" + dbfs_path

        mlflow_path = model_dict['source']
        print("mlflow_path: ", mlflow_path)

        # De-serialize the model
        #model = PipelineModel.load("/tmp/rf_model_test")
        model = mlflow.spark.load_model(mlflow_path)

        # Make predictions
        predictions = model.transform(data_df)

        # Select example rows to display.
        predictions.select("prediction", "indexedLabel", "features").show(5)

        # Saving the result of the scoring
        if not evaluation: table_out = data_conf[env]['output_to_score']
        if evaluation: table_out = data_conf[env]['output_test']
        #predictions.write.format("ORC").saveAsTable(table_out, mode='overwrite')

        print("Step 1.1 completed: model loading and data scoring")
        print()

    except Exception as e:
        print("Errored on step 1.1: model serving")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
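
# A more compact alternative (a sketch, assuming an MLflow version with
# "models:/" registry-URI support): a stage-qualified URI resolves to the
# latest version in that stage, replacing the manual registry scan above.
# "my_model" is a placeholder model name.
import mlflow.spark

model = mlflow.spark.load_model("models:/my_model/Staging")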
Example 2
def evaluate(data_conf, model_conf, scoring=True, **kwargs):

    try:
        # ===========================
        # E.1 Scoring of test data
        # ===========================
        if scoring:  # switch to skip scoring when predictions were already computed earlier
            # score() is applied to the test dataset for performance evaluation
            score(data_conf, model_conf, evaluation=True)

    except Exception as e:
        print("Errored on step E.1: scoring of test data")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ===========================
        # E.2 Metrics & Visualization
        # ===========================

        # Load model from MLflow model registry #https://www.mlflow.org/docs/latest/model-registry.html
        if env == 'PROD':
            mlflow_model_stage = 'Production'
        else:
            mlflow_model_stage = 'Staging'

        # Find the model version currently in the target stage among those registered in the MLflow Model Registry.
        client = MlflowClient()
        for mv in client.search_model_versions(
                "name='{0}'".format(mlflow_model_name)):
            if dict(mv)['current_stage'] == mlflow_model_stage:
                model_dict = dict(mv)
                break

        print('Model extracted run_id: ', model_dict['run_id'])
        print('Model extracted version number: ', model_dict['version'])
        print('Model extracted stage: ', model_dict['current_stage'])

        #MLflow logging of metrics for trained model
        mlflow.end_run()  # close any MLflow run that may still be active
        mlflow.start_run(run_id=model_dict['run_id'])

        # Loading dataset
        table_in = data_conf[env]['output_test']
        predictions = spark.table(table_in)

        # Select (prediction, true label) and compute test error
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel",
            predictionCol="prediction",
            metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Accuracy = %g" % (accuracy))

        # Extracting the test set to Pandas
        pred_pd = predictions.toPandas()
        y_test = pred_pd['indexedLabel'].values
        y_pred = pred_pd['prediction'].values

        # Accuracy and Confusion Matrix
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy = ', accuracy)
        print('Confusion matrix:')
        Classes = ['setosa', 'versicolor', 'virginica']
        C = confusion_matrix(y_test, y_pred)
        C_normalized = C / C.astype(float).sum()  # np.float is removed in NumPy >= 1.24
        C_normalized_pd = pd.DataFrame(C_normalized,
                                       columns=Classes,
                                       index=Classes)
        print(C_normalized_pd)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(C, cmap='Blues')
        plt.title('Confusion matrix of the classifier')
        fig.colorbar(cax)
        ax.set_xticklabels([''] + Classes)
        ax.set_yticklabels([''] + Classes)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()
        fig.savefig('/dbfs/mnt/delta/confusion_matrix_spark_rf.png')

        # Tracking performance metrics
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_artifact("/dbfs/mnt/delta/confusion_matrix_spark_rf.png")

        print("Step E.2 completed metrics & visualisation")
        print()

    except Exception as e:
        print("Errored on step E.2: metrics & visualisation")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
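
# Sketch of an equivalent confusion-matrix plot using scikit-learn's built-in
# display (assumes scikit-learn >= 1.0); y_test, y_pred and the class labels
# are as computed in step E.2 above.
from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    display_labels=['setosa', 'versicolor', 'virginica'],
    cmap='Blues')
disp.ax_.set_title('Confusion matrix of the classifier')
disp.figure_.savefig('/dbfs/mnt/delta/confusion_matrix_spark_rf.png')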
Example 3
  )

# COMMAND ----------

# MAGIC %md ### Fetch the new model version ID using MLflow Model Registry Search
# MAGIC 
# MAGIC The `MlflowClient.search_model_versions()` function searches for model versions by model name, MLflow run ID, or artifact source location. All model versions satisfying a particular filter query are returned.
# MAGIC 
# MAGIC The following cell uses this search function to fetch the version ID of the new model, which is assumed to be the largest (i.e., most recent) version ID.

# COMMAND ----------

from mlflow.tracking.client import MlflowClient
client = MlflowClient()

model_version_infos = client.search_model_versions("name = '%s'" % model_name)
# Version numbers are returned as strings; compare them as integers so that
# e.g. version "10" sorts after version "9".
new_model_version = max(int(model_version_info.version) for model_version_info in model_version_infos)
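
# The same filter syntax also supports searching by run ID or source location,
# e.g. (hypothetical run ID):
# client.search_model_versions("run_id = '7ab63490d5a64f8fb0df3a4f46a1ab1c'")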

# COMMAND ----------

# MAGIC %md Wait for the new model version to become ready.

# COMMAND ----------
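
# `wait_until_ready` is defined earlier in the full notebook (not shown in this
# excerpt); a minimal sketch, assuming it polls MlflowClient.get_model_version
# until the version reaches READY status (polling bounds are assumptions):
import time
from mlflow.entities.model_registry.model_version_status import ModelVersionStatus

def wait_until_ready(model_name, model_version, poll_interval_s=1, max_polls=60):
    for _ in range(max_polls):
        mv = client.get_model_version(name=model_name, version=model_version)
        status = ModelVersionStatus.from_string(mv.status)
        print("Model status: %s" % ModelVersionStatus.to_string(status))
        if status == ModelVersionStatus.READY:
            return
        time.sleep(poll_interval_s)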

wait_until_ready(model_name, new_model_version)

# COMMAND ----------

# MAGIC %md ## Add a description to the new model version

# COMMAND ----------
Example 4
    input_data,
    headers=headers)

print("Predicted : ", http_res.text)

# COMMAND ----------

# MAGIC %md ### Cleanup

# COMMAND ----------

# delete AML webservice
svc.delete()

# loop over registered models in MLflow
models = client.search_model_versions("name='{}'".format(model_name))
for model in models:
    try:
        # set model stage to Archive
        client.transition_model_version_stage(name=model_name,
                                              version=model.version,
                                              stage='Archived')
    except Exception:
        # the version may already be in the 'Archived' stage
        pass
    # delete version of model
    client.delete_model_version(model_name, model.version)

# delete model
client.delete_registered_model(model_name)

# COMMAND ----------
def train(data_conf, model_conf, **kwargs):

    try:
        print("-----------------------------------")
        print("Starting Cashflow DL Model Training")
        print("-----------------------------------")
        print()

        # ==============================
        # 0. Main parameters definitions
        # ==============================

        # Size of X and y arrays definition
        N_days_X, N_days_y = int(data_conf['number_of_historical_days']), int(
            data_conf['number_of_predicted_days'])  #365, 92
        print('Number of days used for prediction (X): ', N_days_X)
        print('Number of days predicted (y): ', N_days_y)
        print()

        # Date range definition
        start_date, end_date = data_conf['start_date'], data_conf['end_date']
        start_date_dt, end_date_dt, start_date_prediction, end_date_prediction, end_date_plusOneDay, end_date_minus_6month = dates_definitions(
            start_date, end_date, N_days_X, N_days_y)
        print('Date range: ', start_date, end_date)
        print()

        model_name = model_conf['model_name']

    except Exception as e:
        print("Errored on initialization")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.1 Pre-processing before model training
        # ========================================

        # Loading dataset
        table_in = data_conf[environment]['table_to_train_on']
        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        # Cleaning of the time series
        ts_balance = ts_balance.withColumn(
            'balance', ts_balance.balance.cast("array<float>"))

        ts_balance = ts_balance.withColumn(
            'keep_ts',
            F.udf(lambda x, y: time_series_cleaning(x, y), "int")('balance',
                                                                  F.lit(20))
        )  #at least 20 transactions in the ts, to be used in the training

        ts_balance = ts_balance.where('keep_ts == 1')

        # Creating the dataset on which we train (and test and validate) the model
        ts_balance_model = ts_balance.sample(
            False, 0.7,
            seed=0)  #now 0.7, but in real case would be 0.1 at best... or 0.05
        print('ts_balance_model.count()', ts_balance_model.count())

        # Pre-processing before model training
        ts_balance_model = pre_processing(ts_balance_model,
                                          end_date,
                                          spark,
                                          serving=False)
        ts_balance_model.show(3)

        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())
        ts_balance_model.show(3)

        # Saving prepared dataset
        table_out = 'cashflow_training_step1'
        #ts_balance_model.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance_model.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

    except Exception as e:
        print("Errored on step T.1: pre-processing before model training")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.2 Generating TRAIN, VAL, TEST datasets
        # ========================================

        # Loading datasets
        table_model = 'cashflow_training_step1'
        #ts_balance_model = spark.read.parquet("/mnt/test/{0}.parquet".format(table_model)).cache()
        ts_balance_model = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_model)).cache()
        ts_balance_model.show(3)

        print('ts_balance_model.count()', ts_balance_model.count())
        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())

        train_set, val_set, test_set = ts_balance_model.randomSplit(
            [0.6, 0.2, 0.2], seed=12345)
        train_set.show(3)
        print(
            'train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(), test_set.rdd.getNumPartitions()',
            train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(),
            test_set.rdd.getNumPartitions())

        # Saving prepared datasets (train, val, test sets to parquet)
        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        table_test = data_conf[environment][
            'table_test_for_performance']  #'cashflow_test'

        train_set.select('X',
                         'y').write.format("delta").mode("overwrite").save(
                             "/mnt/delta/{0}".format(table_train))
        val_set.select('X', 'y').write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_val))
        test_set.select('primaryaccountholder','transactiondate','balance')\
            .write.format("delta").mode("overwrite").save("/mnt/delta/{0}".format(table_test))

    except Exception as e:
        print("Errored on step T.2: pre-processings")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ==============================
        # T.3 MODEL DEFINITION AND TRAIN
        # ==============================

        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        #table_train = spark.read.parquet("/mnt/test/{0}.parquet".format(table_train))
        table_train = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_train))
        #table_val = spark.read.parquet("/mnt/test/{0}.parquet".format(table_val))
        table_val = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_val))
        table_train_count = table_train.count()
        table_val_count = table_val.count()
        #table_train_count, table_val_count

        from pyspark.sql.functions import col
        from petastorm.spark import SparkDatasetConverter, make_spark_converter

        # Set a cache directory on DBFS FUSE for intermediate data.
        spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
                       "file:///dbfs/tmp/petastorm/cache")
        converter_train = make_spark_converter(table_train)
        converter_val = make_spark_converter(table_val)

        print(f"train: {len(converter_train)}, val: {len(converter_val)}")

        def get_compiled_model(N_days_X, N_days_y, model_conf):  #lr=0.001
            #model = get_model(lr=lr)
            model = define_1dcnn_model(N_days_X, N_days_y, model_conf)

            hyperparameters = model_conf['hyperParameters']

            opt = tf.keras.optimizers.Adam()

            # Model compilation
            model.compile(optimizer=opt, loss=hyperparameters['loss'])

            return model

        # Enable auto-logging to MLflow to capture TensorBoard metrics.
        mlflow.tensorflow.autolog(every_n_iter=1)

        model_name = model_conf['model_name']
        mlflow_model_name = model_name
        model_dir = "/tmp/" + model_name
        try:
            dbutils.fs.rm(model_dir, recurse=True)
        except OSError:
            pass

        with mlflow.start_run():

            NUM_EPOCHS = model_conf['hyperParameters']['epochs']  #5
            BATCH_SIZE = model_conf['hyperParameters']['batch_size']  #500

            def train_and_evaluate(N_days_X, N_days_y, model_conf):  #lr=0.001
                model = get_compiled_model(N_days_X, N_days_y, model_conf)  #lr

                with converter_train.make_tf_dataset(batch_size=BATCH_SIZE) as train_dataset, \
                     converter_val.make_tf_dataset(batch_size=BATCH_SIZE) as val_dataset:

                    #train_dataset = train_dataset.map(lambda x: (x.features, x.label_index))
                    train_dataset = train_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    steps_per_epoch = len(converter_train) // BATCH_SIZE

                    #val_dataset = val_dataset.map(lambda x: (x.features, x.label_index))
                    val_dataset = val_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    validation_steps = max(1, len(converter_val) // BATCH_SIZE)
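                    # make_tf_dataset repeats the data indefinitely by default, so
                    # steps_per_epoch and validation_steps are what bound each epoch.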

                    print(
                        f"steps_per_epoch: {steps_per_epoch}, validation_steps: {validation_steps}"
                    )

                    hist = model.fit(train_dataset,
                                     steps_per_epoch=steps_per_epoch,
                                     epochs=NUM_EPOCHS,
                                     validation_data=val_dataset,
                                     validation_steps=validation_steps,
                                     verbose=2)
                    return model, hist

            model, hist = train_and_evaluate(N_days_X, N_days_y, model_conf)
            print(hist.history['val_loss'][-1])

            #MLflow logging
            #mlflow.log_artifact(cwd + "data.json")
            #mlflow.log_artifact(cwd + "config.json")
            mlflow.log_param("model_name", str(model_name))
            mlflow.log_param("N_days_X", N_days_X)
            mlflow.log_param("N_days_y", N_days_y)
            mlflow.log_param("start_date", start_date)
            mlflow.log_param("end_date", end_date)
            mlflow.log_param("num_epochs", str(NUM_EPOCHS))
            mlflow.log_param("batch_size", str(BATCH_SIZE))
            #mlflow.log_param("steps_per_epoch", str(steps_per_epoch)) #validation_steps

            # saving using tf.keras.models.save_model
            tf.keras.models.save_model(model, filepath=model_dir +
                                       '/model')  #SavedModel format
            #model.save(filepath=model_dir+'model', save_format="h5")      #H5 format (todo, and look how to register that)

            # saving using mlflow.tensorflow.save_model (this does NOT log nor register the model, and does not overwrite)
            #mlflow.tensorflow.save_model(tf_saved_model_dir=model_dir+'/model',
            #                             tf_meta_graph_tags=[tf.compat.v1.saved_model.tag_constants.SERVING],
            #                             tf_signature_def_key='serving_default',
            #                             path = 'model')

            # logging already saved model
            mlflow.tensorflow.log_model(
                tf_saved_model_dir=model_dir + '/model',
                tf_meta_graph_tags=[
                    tf.compat.v1.saved_model.tag_constants.SERVING
                ],
                tf_signature_def_key='serving_default',
                registered_model_name=model_name,
                artifact_path='model')

            # Getting the version number of the newly registered MLflow model (useful for next steps)
            mlflow_model_version = 0
            client_current_model = MlflowClient()
            for mv in client_current_model.search_model_versions(
                    "name='{0}'".format(mlflow_model_name)):
                #if int(dict(mv)['version']) == mlflow_model_version:
                if int(dict(mv)['version']) >= mlflow_model_version:  # find the latest registered version
                    mlflow_model_version = int(dict(mv)['version'])
                    model_dict = dict(mv)

            #update 2020-07-17: to grab the latest model version, we can also do it like this: (TO BE TESTED!!!)
            #model_version_infos = client_current_model.search_model_versions(f"name = '{model_name}'")
            #mlflow_model_version = max([model_version_info.version for model_version_info in model_version_infos])

            # Wait until the model is ready
            def wait_until_model_ready(model_name, model_version):
                client = MlflowClient()
                for _ in range(20):
                    model_version_details = client.get_model_version(
                        name=model_name,
                        version=model_version,
                    )
                    status = ModelVersionStatus.from_string(
                        model_version_details.status)
                    print("Model status: %s" %
                          ModelVersionStatus.to_string(status))
                    if status == ModelVersionStatus.READY:
                        break
                    tm.sleep(5)

            wait_until_model_ready(mlflow_model_name, mlflow_model_version)

            # Transition the registered model stage from "None" to "Staging"
            client_current_model.transition_model_version_stage(
                name=mlflow_model_name,
                version=mlflow_model_version,
                stage="Staging",
            )

            # Copy the files from the driver node to DBFS (so they remain accessible after the current cluster terminates):
            dbutils.fs.cp("file:/tmp/{0}/model".format(model_name),
                          "dbfs:/mnt/test/{0}/model".format(model_name),
                          recurse=True)
            print('Model copied here: ',
                  "dbfs:/mnt/test/{0}/model/".format(model_name))

        #mlflow.end_run()

    except Exception as e:
        print("Errored on step T.3: model definition and train")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
def evaluate(data_conf, model_conf, scoring=True, **kwargs):

    try:
        print("-------------------------------------")
        print("Starting Cashflow DL Model Evaluation")
        print("-------------------------------------")
        print()

        # ==============================
        # 0. Main parameters definitions
        # ==============================

        # Size of X and y arrays definition
        N_days_X, N_days_y = int(data_conf['number_of_historical_days']), int(
            data_conf['number_of_predicted_days'])  #365, 92
        print('Number of days used for prediction (X): ', N_days_X)
        print('Number of days predicted (y): ', N_days_y)
        print()

        # Date range definition
        start_date, end_date = data_conf['start_date'], data_conf['end_date']
        start_date_dt, end_date_dt, start_date_prediction, end_date_prediction, end_date_plusOneDay, end_date_minus_6month = dates_definitions(
            start_date, end_date, N_days_X, N_days_y)
        print('Date range: ', start_date, end_date)
        print()

        model_name = model_conf['model_name']

    except Exception as e:
        print("Errored on initialization")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ===========================
        # E.1 Scoring of test data
        # ===========================

        #if kwargs['do_we_score'] is True:
        if scoring:  # switch to skip scoring when predictions were already computed earlier
            # score() is applied to the test dataset for performance evaluation
            score(data_conf, model_conf, evaluation=True)

    except Exception as e:
        print("Errored on step E.1: scoring of test data")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ===========================
        # E.2 Metrics & Visualization
        # ===========================

        # Load model from MLflow model registry #https://www.mlflow.org/docs/latest/model-registry.html
        #mlflow_model_name = 'cashflow-poc'
        mlflow_model_name = model_conf['model_name']
        if environment == 'prod':
            mlflow_model_stage = 'Production'
        else:
            mlflow_model_stage = 'Staging'

        # Find the model version currently in the target stage among those registered in the MLflow Model Registry.
        client = MlflowClient()
        for mv in client.search_model_versions(
                "name='{0}'".format(mlflow_model_name)):
            if dict(mv)['current_stage'] == mlflow_model_stage:
                model_dict = dict(mv)
                break

        print('Model extracted run_id: ', model_dict['run_id'])
        print('Model extracted version number: ', model_dict['version'])
        print('Model extracted stage: ', model_dict['current_stage'])

        #MLflow logging of metrics for trained model
        mlflow.end_run()  # close any MLflow run that may still be active
        mlflow.start_run(run_id=model_dict['run_id'])
        #mlflow.start_run()  # specify the runid!!!

        # Loading dataset
        table_in = data_conf[environment]['table_scored']
        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        # Extracting the test set to Pandas
        ts_balance_pd = ts_balance.select(
            'balance', 'X', 'y', 'y_pred',
            'y_pred_rescaled_retrended').toPandas()

        # Extraction of metrics
        R2_all_3month, R2_array_3month, R2_all_1month, R2_array_1month = metric_extraction(
            ts_balance_pd, N_days_y)

        # Visualization of prediction
        #fig1, fig2 = visualization_prediction(ts_balance_pd, start_date, end_date, N_days_X, N_days_y, R2_array_1month, R2_array_3month, serving=False)
        fig1, fig2 = visualization_time_series_pred_only(ts_balance_pd,
                                                         start_date,
                                                         end_date,
                                                         N_days_X,
                                                         N_days_y,
                                                         R2_array_1month,
                                                         R2_array_3month,
                                                         serving=False)
        fig1.savefig('/dbfs/mnt/delta/performance.png')
        fig2.savefig('/dbfs/mnt/delta/performance_R2.png')
        mlflow.log_artifact('/dbfs/mnt/delta/performance.png')
        mlflow.log_artifact('/dbfs/mnt/delta/performance_R2.png')

        # Saving the metric
        print('Test R2 metric (3-months window): {}'.format(R2_all_3month))
        print('Test R2 metric (1-months window): {}'.format(R2_all_1month))
        mlflow.log_metric("R2_all_3month", R2_all_3month)
        mlflow.log_metric("R2_all_1month", R2_all_1month)

        with open("/dbfs/mnt/delta/evaluation.json", "w+") as f:
            json.dump({
                'R2_3month': R2_all_3month,
                'R2_1month': R2_all_1month
            }, f)
        mlflow.log_artifact("/dbfs/mnt/delta/evaluation.json")

        mlflow.end_run()

        ts_balance.unpersist()
        print("Step E.2 completed visualisation")

    except Exception as e:
        print("Errored on step E.2: visualisation")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
def score(data_conf, model_conf, evaluation=False, **kwargs):

    try:
        print("----------------------------------")
        print("Starting Cashflow DL Model Scoring")
        print("----------------------------------")
        print("")

        # ==============================
        # 0. Main parameters definitions
        # ==============================

        # Size of X and y arrays definition
        N_days_X, N_days_y = int(data_conf['number_of_historical_days']), int(
            data_conf['number_of_predicted_days'])  #365, 92
        print('Number of days used for prediction (X): {0}'.format(N_days_X))
        print('Number of days predicted (y): {0}'.format(N_days_y))
        print('')

        # Date range definition
        start_date, end_date = data_conf['start_date'], data_conf['end_date']
        start_date_dt, end_date_dt, start_date_prediction, end_date_prediction, end_date_plusOneDay, end_date_minus_6month = dates_definitions(
            start_date, end_date, N_days_X, N_days_y)
        print('Date range: [{0}, {1}]'.format(start_date, end_date))
        print('')

        model_name = model_conf['model_name']

        #print("Step 0 completed (main parameters definition)")

    except Exception as e:
        print("Errored on initialization")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ==================================
        # S.1 Pre-processings before serving
        # ==================================

        start_time_S1 = time.time()

        # Loading dataset
        table_in = data_conf[environment]['table_to_score']

        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        print('Reading table {0}'.format(table_in))
        #print('Size of table: ',ts_balance.count())
        #print('ts_balance.rdd.getNumPartitions()',ts_balance.rdd.getNumPartitions())

        if not evaluation:
            ts_balance = pre_processing(ts_balance,
                                        end_date,
                                        spark,
                                        serving=True)
        if evaluation:
            ts_balance = pre_processing(ts_balance,
                                        end_date,
                                        spark,
                                        serving=False)
        ts_balance.show(3)

        # Saving prepared dataset
        table_out = data_conf[environment]['cashflow_s1_out_scoring']

        #ts_balance.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

        ts_balance.unpersist()
        spark.catalog.clearCache()
        end_time_S1 = time.time()
        print("Step S.1 completed: pre-processings before serving")
        print("Time spent: ", end_time_S1 - start_time_S1)

    except Exception as e:
        print("Errored on step S.1: pre-processings before serving")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ===================
        # S.2 Model serving
        # ===================

        start_time_S2 = time.time()

        # Loading dataset
        table_in = data_conf[environment]['cashflow_s1_out_scoring']

        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in))
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))
        ts_balance.cache()
        print('Number of partitions: ', ts_balance.rdd.getNumPartitions())

        # Load model from MLflow model registry #https://www.mlflow.org/docs/latest/model-registry.html
        mlflow_model_name = model_conf['model_name']
        if environment == 'prod':
            mlflow_model_stage = 'Production'
        else:
            mlflow_model_stage = 'Staging'

        # Find the model version currently in the target stage among those registered in the MLflow Model Registry.
        client = MlflowClient()
        for mv in client.search_model_versions(
                "name='{0}'".format(mlflow_model_name)):
            if dict(mv)['current_stage'] == mlflow_model_stage:
                model_dict = dict(mv)
                break

        print('Model extracted run_id: ', model_dict['run_id'])
        print('Model extracted version number: ', model_dict['version'])
        print('Model extracted stage: ', model_dict['current_stage'])

        def get_local_path_from_dbfs(dbfs_path):
            '''
            Returns the local filesystem view of a DBFS path, i.e. replaces the
            "dbfs:" scheme with "/dbfs", for use with local file APIs.
            '''
            # os.path.join("/dbfs", ...) did not work here because joining with an
            # absolute second argument discards the first; likewise, str.lstrip
            # strips a character set rather than a prefix.
            if dbfs_path.startswith("dbfs:"):
                dbfs_path = dbfs_path[len("dbfs:"):]
            return "/dbfs" + dbfs_path

        mlflow_path = get_local_path_from_dbfs(
            model_dict['source']) + '/tfmodel'
        print("mlflow_path: ", mlflow_path)

        # List the contents of the saved-model directory (sanity check on the saved .pb model)
        files = os.listdir('/dbfs/mnt/test/{0}/model/'.format(model_name))
        print(files)
        export_dir_saved = "/dbfs/mnt/test/{0}/model/".format(
            model_name)  #+files[0]   # TODO!!! GET THE MODEL FROM MLFLOW !!!!
        print(export_dir_saved)

        #def rdd_scoring(numpy_array):
        #    predictor_fn = tf.contrib.predictor.from_saved_model(export_dir = export_dir_saved)
        #    return predictor_fn({'input': numpy_array.reshape(-1, N_days_X, 1) })

        #@F.udf("array<float>")
        #def udf_scoring(x):
        #    predictor_fn = tf.contrib.predictor.from_saved_model(export_dir = mlflow_path) #export_dir_saved)
        #    return np.around(predictor_fn({'input': np.array(x).reshape(-1, N_days_X, 1) })['output'][0].tolist(), decimals=3).tolist()

        @F.pandas_udf("array<float>")
        def pandas_udf_scoring(x):
            #predictor_fn = tf.contrib.predictor.from_saved_model(export_dir = export_dir_saved) #mlflow_path)
            #return Series([np.around(predictor_fn({'input': np.array(v).reshape(-1, N_days_X, 1)})['output'][0], decimals=3) for v in x])
            new_model = tf.keras.models.load_model(export_dir_saved)
            #new_model = mlflow.tensorflow.load_model(mlflow_path)
            return Series([
                np.around(new_model.predict(
                    np.array(v).reshape(-1, N_days_X, 1)).reshape(N_days_y),
                          decimals=3) for v in x
            ])
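        # Note: tf.keras.models.load_model above runs once per Arrow batch; for
        # large scoring jobs, loading the weights once per executor (e.g. via a
        # broadcast variable) would avoid the repeated deserialization.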

        ts_balance = ts_balance.withColumn('y_pred', pandas_udf_scoring('X'))
        #ts_balance = ts_balance.withColumn('y_pred', udf_scoring('X'))

        print('ts_balance.rdd.getNumPartitions()',
              ts_balance.rdd.getNumPartitions())
        ts_balance.show(3)
        #print('Size of table: ',ts_balance.count())

        # Saving prepared dataset
        table_out = data_conf[environment]['cashflow_s2_out_scoring']

        #ts_balance.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

        ts_balance.unpersist()
        spark.catalog.clearCache()
        end_time_S2 = time.time()
        print("Step S.2 completed: model serving")
        print("Time spent: ", end_time_S2 - start_time_S2)

    except Exception as e:
        print("Errored on step S.2: model serving")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ===================
        # S.3 Post-processing
        # ===================

        start_time_S3 = time.time()

        # Loading dataset
        table_in = data_conf[environment]['cashflow_s2_out_scoring']

        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        ts_balance = post_processing(ts_balance)
        ts_balance.show(3)

        # Saving prepared dataset
        table_out = data_conf[environment]['table_scored']

        #ts_balance.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

        ts_balance.unpersist()
        end_time_S3 = time.time()
        print("Step S.3 completed: post-processing")
        print("Time spent: ", end_time_S3 - start_time_S3)

    except Exception as e:
        print("Errored on step S.3: post-processing")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
# If run in production, the latest model currently in "Staging" in the MLflow Model Registry is transitioned to "Production"
# ---------------------------------------------------------------------------------------------------------------------

from mlflow.tracking.client import MlflowClient

# Define the environment (dev, test or prod)
environment = dbutils.widgets.getArgument("environment")

if environment == 'prod':

    # Detect the latest model version currently in "Staging" in the MLflow model registry.
    mlflow_model_name = 'super_test'
    mlflow_model_stage = 'Staging'

    client = MlflowClient()
    for mv in client.search_model_versions(
            "name='{0}'".format(mlflow_model_name)):
        if dict(mv)['current_stage'] == mlflow_model_stage:
            model_dict = dict(mv)

            print('Model extracted run_id: ', model_dict['run_id'])
            print('Model extracted version number: ', model_dict['version'])
            print('Model extracted stage: ', model_dict['current_stage'])

            # Transition the registered model stage from "Staging" to "Production"
            client.transition_model_version_stage(
                name=mlflow_model_name,
                version=model_dict['version'],
                stage="Production",
            )

            print()
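
# Alternative sketch (assumes an MLflow release providing
# MlflowClient.get_latest_versions): it returns the newest version per
# requested stage directly, avoiding the manual scan above.
if environment == 'prod':
    client = MlflowClient()
    for mv in client.get_latest_versions(mlflow_model_name, stages=['Staging']):
        client.transition_model_version_stage(name=mlflow_model_name,
                                              version=mv.version,
                                              stage='Production')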
def score(data_conf, model_conf, evaluation=False, **kwargs):

    try:
        print()
        print("-----------------------------------")
        print("         Model Serving             ")
        print("-----------------------------------")
        print()

        # ==============================
        # 1.0 Data Loading
        # ==============================

        #if not evaluation: table_in = data_conf[env]['input_to_score'] # for scoring new data
        #if evaluation: table_in = data_conf[env]['input_test'] # for performance evaluation on historical data
        #data_df = spark.table(table_in)
        data_df = spark.read.format("delta").load("/mnt/delta/{0}".format('test_data_sklearn_rf'))  
        data_pd = data_df.toPandas()
        
        # Feature selection
        feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        target       = 'label'   
        
        x_test = data_pd[feature_cols].values
        y_test = data_pd[target].values

        # Creation of train and test datasets
        #x_train, x_test, y_train, y_test = train_test_split(X,y,train_size=0.7, stratify=y) #stratify=y ensures that the same proportion of labels are in both train and test sets!        
         
        print("Step 1.0 completed: Loaded dataset in Spark")      

    except Exception as e:
        print("Errored on 1.0: data loading")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ===================
        # 1.1 Model serving
        # ===================   
        
        # Load model from MLflow model registry #https://www.mlflow.org/docs/latest/model-registry.html        
        if env == 'PROD':
            mlflow_model_stage = 'Production'
        else:
            mlflow_model_stage = 'Staging'
            
        print(mlflow_model_stage)
            
        # Find the model version currently in the target stage among those registered in the MLflow Model Registry.
        client = MlflowClient()
        for mv in client.search_model_versions("name='{0}'".format(mlflow_model_name)):
            if dict(mv)['current_stage'] == mlflow_model_stage:
                model_dict = dict(mv)
                break  
                
        print('Model extracted run_id: ', model_dict['run_id'])
        print('Model extracted version number: ', model_dict['version'])
        print('Model extracted stage: ', model_dict['current_stage'])                

#         def get_local_path_from_dbfs(dbfs_path):
#             '''
#             This get the local version of the dbfs path, i.e. replaces "dbfs:" by "/dbfs", for local APIs use.
#             ''' 
#             return "/dbfs"+dbfs_path.lstrip("dbfs:")  
      
        mlflow_path = model_dict['source']      
        print("mlflow_path: ", mlflow_path)        

        # De-serialize the model
        # model = mlflow.sklearn.load_model(mlflow_path) # works but using the mlflow.sklearn API (not general)
        model = mlflow.pyfunc.load_model(mlflow_path) # Load model as a PyFuncModel.
        
        # Make predictions
        #y_pred = model.predict(x_test)
        y_pred = model.predict(pd.DataFrame(x_test)) # when using the PyFuncModel, the input expected is a Pandas df                     

        # Saving the result of the scoring
        if not evaluation: table_out = data_conf[env]['output_to_score']
        if evaluation: table_out = data_conf[env]['output_test']
        #predictions.write.format("ORC").saveAsTable(table_out, mode='overwrite') 
        pred_pd = pd.DataFrame(data=np.column_stack((y_test,y_pred)), columns=['y_test', 'y_pred'])
        pred_df = spark.createDataFrame(pred_pd)
        pred_df.write.format("delta").mode("overwrite").save("/mnt/delta/{0}".format('prediction_sklearn_rf'))  
        
        # Select example rows to display.
        pred_df.show(5)        

        print("Step 1.1 completed: model loading, data scoring and writing to hive")   
        print()               

    except Exception as e:
        print("Errored on step 1.1: model serving")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
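
# Follow-up sketch: read back the scored Delta table written in step 1.1 above
# and compute the accuracy, mirroring the evaluation step of Example 2.
from sklearn.metrics import accuracy_score

pred_pd = spark.read.format("delta").load(
    "/mnt/delta/{0}".format('prediction_sklearn_rf')).toPandas()
print("Accuracy =", accuracy_score(pred_pd['y_test'], pred_pd['y_pred']))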