Exemple #1
0
def main(argv):
    """Train a DNN regressor on Boston housing data and reload it as a PyFunc.

    Everything happens inside one ``kiwi`` run: the arguments are parsed with the
    module-level ``parser``, a ``tf.estimator.DNNRegressor`` is trained and
    evaluated, the estimator is exported into a throwaway directory, and the run's
    ``model`` artifact is loaded back through ``pyfunc`` so its predictions can be
    printed beside the test labels.

    :param argv: Full argument vector; element 0 (the program name) is skipped.
    """
    with kiwi.start_run():
        args = parser.parse_args(argv[1:])

        (x_train, y_train), (x_test, y_test) = \
            tf.keras.datasets.boston_housing.load_data()

        # All columns of the training matrix are fed in as one dense "features" vector.
        n_features = x_train.shape[1]
        feat_cols = [
            tf.feature_column.numeric_column(key="features",
                                             shape=(n_features, ))
        ]
        feat_spec = {
            "features": tf.placeholder("float",
                                       name="features",
                                       shape=[None, n_features])
        }

        regressor = tf.estimator.DNNRegressor(hidden_units=[50, 20],
                                              feature_columns=feat_cols)

        # Fit on the training split.
        fit_input_fn = tf.estimator.inputs.numpy_input_fn(
            {"features": x_train}, y_train, num_epochs=None, shuffle=True)
        regressor.train(fit_input_fn, steps=args.steps)

        # Score on the held-out split for the same number of steps.
        eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            {"features": x_test}, y_test, num_epochs=None, shuffle=True)
        eval_metrics = regressor.evaluate(eval_input_fn, steps=args.steps)

        # Receiver function describing the raw tensor the exported model accepts.
        serving_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            feat_spec)
        export_dir = tempfile.mkdtemp()
        try:
            # NOTE(review): the export is expected to log the model to the run as a side
            # effect (autologging setup is not visible in this block) -- confirm.
            exported_path = regressor.export_savedmodel(
                export_dir, serving_fn).decode("utf-8")

            # The model is read back from the run's "model" artifact rather than from
            # exported_path.
            reloaded = pyfunc.load_model(kiwi.get_artifact_uri('model'))
            test_frame = pd.DataFrame(data=x_test,
                                      columns=["features"] * n_features)

            # Show the reloaded model's predictions next to the true labels.
            predict_df = reloaded.predict(test_frame)
            predict_df['original_labels'] = y_test
            print(predict_df)
        finally:
            # Always discard the local export, even if loading or predicting failed.
            shutil.rmtree(export_dir)
Exemple #2
0
    def log(cls, artifact_path, flavor, registered_model_name=None, **kwargs):
        """
        Save a model with the given flavor module and log it to the current run. A new run is
        started when none is active.

        :param artifact_path: Run relative path identifying the model.
        :param flavor: Flavor module used to persist the model. It must expose a ``save_model``
                       function that writes a valid MLflow model.
        :param registered_model_name: (Experimental) If given, create a model version under
                                      ``registered_model_name``, also creating a registered model if
                                      one with the given name does not exist.
        :param signature: (Experimental) Not a named parameter of this method; when supplied it
                          travels through ``kwargs`` to ``flavor.save_model``.
                          :py:class:`ModelSignature` describes model input
                          and output :py:class:`Schema <mlflow.types.Schema>`. The model signature
                          can be :py:func:`inferred <infer_signature>` from datasets representing
                          valid model input (e.g. the training dataset) and valid model output
                          (e.g. model predictions generated on the training dataset), for example:

                          .. code-block:: python

                            from mlflow.models.signature import infer_signature
                            train = df.drop_column("target_label")
                            signature = infer_signature(train, model.predict(train))

        :param input_example: (Experimental) Not a named parameter of this method; when supplied
                              it travels through ``kwargs`` to ``flavor.save_model``. Input example
                              provides one or several examples of valid model input. The example
                              can be used as a hint of what data to feed the model. The given
                              example will be converted to a Pandas DataFrame and then serialized
                              to json using the Pandas split-oriented format. Bytes are
                              base64-encoded.

        :param kwargs: Extra args passed to the model flavor.
        """
        fluent = kiwi.tracking.fluent
        with TempDir() as scratch:
            # Stage the model locally, then upload the staged directory as run artifacts.
            staging_path = scratch.path("model")
            model = cls(artifact_path=artifact_path,
                        run_id=fluent._get_or_start_run().info.run_id)
            flavor.save_model(path=staging_path, mlflow_model=model, **kwargs)
            fluent.log_artifacts(staging_path, artifact_path)

            try:
                fluent._record_logged_model(model)
            except MlflowException:
                # Older tracking servers cannot record model metadata; stay backwards
                # compatible by warning instead of failing the whole call.
                _logger.warning(
                    "Logging model metadata to the tracking server has failed, possibly due older "
                    "server version. The model artifacts have been logged successfully under %s. "
                    "In addition to exporting model artifacts, MLflow clients 1.7.0 and above "
                    "attempt to record model metadata to the  tracking store. If logging to a "
                    "mlflow server via REST, consider  upgrading the server version to MLflow "
                    "1.7.0 or above.", kiwi.get_artifact_uri())

            if registered_model_name is None:
                return
            # Register the freshly logged artifacts under the currently active run.
            model_uri = "runs:/%s/%s" % (fluent.active_run().info.run_id,
                                         artifact_path)
            kiwi.register_model(model_uri, registered_model_name)
Exemple #3
0
def test_log_artifact_with_dirs(tmpdir):
    """Check that ``log_artifact`` uploads whole directories, with and without a parent path."""
    # Case 1: a directory holding two files and an empty sub-directory, no parent path.
    parent_dir = tmpdir.mkdir("parent")
    for file_name in ("file0", "file1"):
        parent_dir.join(file_name).write("something")
    parent_dir.mkdir("child")
    with start_run():
        run_root = local_file_uri_to_path(kiwi.get_artifact_uri())
        kiwi.log_artifact(str(parent_dir))
        logged_name = os.path.basename(str(parent_dir))
        assert os.listdir(run_root) == [logged_name]
        logged_dir = os.path.join(run_root, logged_name)
        assert set(os.listdir(logged_dir)) == {'child', 'file0', 'file1'}
        with open(os.path.join(logged_dir, "file0")) as f:
            assert f.read() == "something"

    # Case 2: an empty directory logged beneath a single-level parent path.
    nested_src = tmpdir.mkdir("dir")
    nested_name = os.path.basename(str(nested_src))
    with start_run():
        run_root = local_file_uri_to_path(kiwi.get_artifact_uri())
        kiwi.log_artifact(str(nested_src), "some_parent")
        assert os.listdir(run_root) == ["some_parent"]
        assert os.listdir(os.path.join(run_root, "some_parent")) == [nested_name]

    # Case 3: the same directory, now with a child, logged beneath a two-level parent path.
    inner_dir = nested_src.mkdir("another_dir")
    with start_run():
        run_root = local_file_uri_to_path(kiwi.get_artifact_uri())
        kiwi.log_artifact(str(nested_src), "parent/and_child")
        destination = os.path.join(run_root, "parent", "and_child")
        assert os.listdir(destination) == [nested_name]
        assert os.listdir(os.path.join(destination, nested_name)) == \
            [os.path.basename(str(inner_dir))]
Exemple #4
0
def test_log_artifact():
    """Check ``log_artifact`` / ``log_artifacts`` with and without a parent directory.

    Two small files are created in a temporary source directory. The first is logged
    on its own and compared byte-for-byte with the copy found in the run's artifact
    directory; then the whole source directory is logged and compared with
    ``filecmp.dircmp``.
    """
    artifact_src_dir = tempfile.mkdtemp()
    # Create artifacts. mkstemp returns an already-open OS-level descriptor; writing
    # through it (instead of discarding it and reopening the path) guarantees that the
    # descriptor is closed rather than leaked.
    paths = []
    for i in range(2):
        fd, path = tempfile.mkstemp(dir=artifact_src_dir)
        with os.fdopen(fd, "w") as handle:
            handle.write("%s" % str(i))
        paths.append(path)
    path0 = paths[0]
    # Log an artifact, verify it exists in the directory returned by get_artifact_uri
    # after the run finishes
    artifact_parent_dirs = ["some_parent_dir", None]
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = kiwi.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)
            kiwi.log_artifact(path0, parent_dir)
        expected_dir = os.path.join(run_artifact_dir, parent_dir) \
            if parent_dir is not None else run_artifact_dir
        assert os.listdir(expected_dir) == [os.path.basename(path0)]
        # path0 is absolute, so joining it directly would discard expected_dir and make
        # the comparison below check the source file against itself. Join the base name
        # so the logged copy is what actually gets compared.
        logged_artifact_path = os.path.join(expected_dir,
                                            os.path.basename(path0))
        assert filecmp.cmp(logged_artifact_path, path0, shallow=False)
    # Log multiple artifacts, verify they exist in the directory returned by get_artifact_uri
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = kiwi.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)

            kiwi.log_artifacts(artifact_src_dir, parent_dir)
        # Check that the logged artifacts match
        expected_artifact_output_dir = os.path.join(run_artifact_dir, parent_dir) \
            if parent_dir is not None else run_artifact_dir
        dir_comparison = filecmp.dircmp(artifact_src_dir,
                                        expected_artifact_output_dir)
        assert len(dir_comparison.left_only) == 0
        assert len(dir_comparison.right_only) == 0
        assert len(dir_comparison.diff_files) == 0
        assert len(dir_comparison.funny_files) == 0
Exemple #5
0
def test_get_artifact_uri_appends_to_uri_path_component_correctly(
        artifact_location, expected_uri_format):
    """Check that artifact paths are appended to the experiment's artifact location.

    For each sample path, the URI from ``kiwi.get_artifact_uri`` must match both the
    ``tracking.artifact_utils`` helper and ``expected_uri_format`` filled in with the
    run id and the path stripped of leading slashes.
    """
    experiment_name = "get-artifact-uri-test"
    MlflowClient().create_experiment(experiment_name,
                                     artifact_location=artifact_location)
    kiwi.set_experiment(experiment_name)
    with kiwi.start_run():
        run_id = kiwi.active_run().info.run_id
        for artifact_path in ("path/to/artifact", "/artifact/path", "arty.txt"):
            actual_uri = kiwi.get_artifact_uri(artifact_path)
            assert actual_uri == tracking.artifact_utils.get_artifact_uri(
                run_id, artifact_path)
            expected_uri = expected_uri_format.format(
                run_id=run_id, path=artifact_path.lstrip("/"))
            assert actual_uri == expected_uri
Exemple #6
0
def test_artifact_can_be_downloaded_from_absolute_uri_successfully(tmpdir):
    """Log a text file, download it by its artifact URI, and check the content survives."""
    file_name = "artifact.txt"
    contents = "Sample artifact text"
    source_path = tmpdir.join(file_name).strpath
    with open(source_path, "w") as out:
        out.write(contents)

    logged_subpath = "artifact"
    with kiwi.start_run():
        kiwi.log_artifact(local_path=source_path, artifact_path=logged_subpath)
        artifact_uri = kiwi.get_artifact_uri(artifact_path=logged_subpath)

    download_root = _download_artifact_from_uri(artifact_uri)
    downloaded_path = os.path.join(download_root, file_name)
    # The download must be a distinct copy, not the source file or the bare artifact path.
    assert downloaded_path != source_path
    assert downloaded_path != logged_subpath
    with open(downloaded_path, "r") as f:
        assert f.read() == contents
Exemple #7
0
def test_download_artifact_from_absolute_uri_persists_data_to_specified_output_directory(tmpdir):
    """Download a logged artifact directory into an explicit output path and verify it."""
    file_name = "artifact.txt"
    contents = "Sample artifact text"
    source_path = tmpdir.join(file_name).strpath
    with open(source_path, "w") as out:
        out.write(contents)

    subdir = "logged_artifact"
    with kiwi.start_run():
        kiwi.log_artifact(local_path=source_path, artifact_path=subdir)
        artifact_uri = kiwi.get_artifact_uri(artifact_path=subdir)

    output_root = tmpdir.join("artifact_output").strpath
    os.makedirs(output_root)
    _download_artifact_from_uri(artifact_uri=artifact_uri, output_path=output_root)

    # The logged sub-directory and the file inside it must appear under output_root.
    assert subdir in os.listdir(output_root)
    downloaded_dir = os.path.join(output_root, subdir)
    assert file_name in os.listdir(downloaded_dir)
    with open(os.path.join(downloaded_dir, file_name), "r") as f:
        assert f.read() == contents
        test_loss, correct, len(test_loader.dataset), test_accuracy))
    step = (epoch + 1) * len(train_loader)
    log_scalar('test_loss', test_loss, step)
    log_scalar('test_accuracy', test_accuracy, step)

def log_scalar(name, value, step):
    """Log a scalar value to both MLflow and TensorBoard.

    :param name: Metric name, used as the TensorBoard tag and the tracked metric key.
    :param value: Scalar value to record.
    :param step: Training step the value belongs to.
    """
    writer.add_scalar(name, value, step)
    # Forward the step as well; without it the tracking store records every value
    # at the default step and the metric's step axis is lost.
    kiwi.log_metric(name, value, step=step)

# Top-level training driver: everything below runs inside a single tracked run.
# `args`, `train`, `test` and `SummaryWriter` are defined outside this excerpt.
with kiwi.start_run():
    # Log our parameters into mlflow
    for key, value in vars(args).items():
        kiwi.log_param(key, value)

    # Create a SummaryWriter to write TensorBoard events locally
    # (`writer` is the module-level name that log_scalar() writes through.)
    # NOTE(review): `dirpath` is not used again in the visible code, and the temporary
    # directory is never removed here -- confirm whether cleanup happens elsewhere.
    output_dir = dirpath = tempfile.mkdtemp()
    writer = SummaryWriter(output_dir)
    print("Writing TensorBoard events locally to %s\n" % output_dir)

    # Perform the training
    # Epochs are numbered from 1 through args.epochs inclusive.
    for epoch in range(1, args.epochs + 1):
        train(epoch)
        test(epoch)

    # Upload the TensorBoard event logs as a run artifact
    print("Uploading TensorBoard events as a run artifact...")
    kiwi.log_artifacts(output_dir, artifact_path="events")
    # Tell the user where the uploaded events live so TensorBoard can be pointed at them.
    print("\nLaunch TensorBoard with:\n\ntensorboard --logdir=%s" %
          os.path.join(kiwi.get_artifact_uri(), "events"))
Exemple #9
0
def test_get_artifact_uri_uses_currently_active_run_id():
    """``kiwi.get_artifact_uri`` must resolve against the id of the run that is active."""
    artifact_path = "artifact"
    with kiwi.start_run() as run:
        actual_uri = kiwi.get_artifact_uri(artifact_path=artifact_path)
        expected_uri = tracking.artifact_utils.get_artifact_uri(
            run_id=run.info.run_id, artifact_path=artifact_path)
        assert actual_uri == expected_uri
Exemple #10
0
def test_get_artifact_uri_with_artifact_path_unspecified_returns_artifact_root_dir():
    """With ``artifact_path=None`` the returned URI is the run's own artifact root."""
    with kiwi.start_run() as run:
        root_uri = kiwi.get_artifact_uri(artifact_path=None)
        assert root_uri == run.info.artifact_uri
Exemple #11
0
import tempfile

import kiwi
from kiwi import log_metric, log_param, log_artifacts, get_artifact_uri, active_run,\
    get_tracking_uri, log_artifact

if __name__ == "__main__":
    # Smoke script for the tracking API: log a parameter and a few metrics, read the
    # run back from the tracking server, then upload a small text file as an artifact.
    print("Running {} with tracking URI {}".format(sys.argv[0],
                                                   get_tracking_uri()))
    # No explicit start_run() is visible; presumably the first log_* call creates the
    # active run that active_run() returns below -- confirm.
    log_param("param1", 5)
    log_metric("foo", 5)
    log_metric("foo", 6)
    log_metric("foo", 7)
    log_metric("random_int", random.randint(0, 100))
    run_id = active_run().info.run_id
    # Get run metadata & data from the tracking server
    service = kiwi.tracking.MlflowClient()
    run = service.get_run(run_id)
    print("Metadata & data for run with UUID %s: %s" % (run_id, run))
    # Reuse the run id fetched above instead of asking for the active run again.
    message = "test artifact written during run %s within artifact URI %s\n" \
              % (run_id, get_artifact_uri())
    # The context manager removes the scratch directory on every exit path, including
    # failures between its creation and the first write.
    with tempfile.TemporaryDirectory() as local_dir:
        file_path = os.path.join(local_dir, "some_output_file.txt")
        with open(file_path, "w") as handle:
            handle.write(message)
        # Upload once as a whole directory and once as a single file.
        log_artifacts(local_dir, "some_subdir")
        log_artifact(file_path, "another_dir")
Exemple #12
0
def main(argv):
    """Train an iris DNN classifier, export it, and compare reloaded predictions.

    Inside one ``kiwi`` run this parses ``argv[1:]`` with the module-level ``parser``,
    trains and evaluates a three-class ``tf.estimator.DNNClassifier``, and prints its
    predictions for three sample flowers. It then exports the estimator to a throwaway
    directory, loads the run's ``model`` artifact through ``pyfunc``, and prints the
    original and reloaded predictions side by side.

    :param argv: Full argument vector; element 0 (the program name) is skipped.
    """
    with kiwi.start_run():
        args = parser.parse_args(argv[1:])

        # Fetch the data
        (train_x, train_y), (test_x, test_y) = load_data()

        # One numeric feature column per key of the training features.
        my_feature_columns = [
            tf.feature_column.numeric_column(key=key) for key in train_x.keys()
        ]

        # Two hidden layers of 10 nodes each; the model chooses between 3 classes.
        classifier = tf.estimator.DNNClassifier(
            feature_columns=my_feature_columns,
            hidden_units=[10, 10],
            n_classes=3)

        # Train the Model.
        classifier.train(
            input_fn=lambda: train_input_fn(train_x, train_y, args.batch_size),
            steps=args.train_steps)

        # Evaluate the model.
        eval_result = classifier.evaluate(
            input_fn=lambda: eval_input_fn(test_x, test_y, args.batch_size))
        print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

        # Three sample flowers and the species each is expected to be.
        expected = ['Setosa', 'Versicolor', 'Virginica']
        predict_x = {
            'SepalLength': [5.1, 5.9, 6.9],
            'SepalWidth': [3.3, 3.0, 3.1],
            'PetalLength': [1.7, 4.2, 5.4],
            'PetalWidth': [0.5, 1.5, 2.1],
        }
        predictions = classifier.predict(input_fn=lambda: eval_input_fn(
            predict_x, labels=None, batch_size=args.batch_size))

        # Print each prediction and remember its label for the comparison further down.
        prediction_template = '\nPrediction is "{}" ({:.1f}%), expected "{}"'
        old_predictions = []
        for pred_dict, expec in zip(predictions, expected):
            class_id = pred_dict['class_ids'][0]
            species = SPECIES[class_id]
            confidence = 100 * pred_dict['probabilities'][class_id]
            print(prediction_template.format(species, confidence, expec))
            old_predictions.append(species)

        # One empty float64 tf.Variable per input feature, handed to the raw serving
        # input receiver that defines the exported model's inputs.
        feature_names = ("SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
        feat_specifications = {
            name: tf.Variable([], dtype=tf.float64, name=name)
            for name in feature_names
        }
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            feat_specifications)

        export_dir = tempfile.mkdtemp()
        try:
            # NOTE(review): the export is expected to log the model to the run as a side
            # effect (autologging setup is not visible in this block) -- confirm.
            saved_estimator_path = classifier.export_saved_model(
                export_dir, receiver_fn).decode("utf-8")

            # The model is read back from the run's "model" artifact rather than from
            # saved_estimator_path.
            pyfunc_model = pyfunc.load_model(kiwi.get_artifact_uri('model'))

            # The same three flowers as predict_x, laid out row-wise for a DataFrame.
            predict_data = [[5.1, 3.3, 1.7, 0.5],
                            [5.9, 3.0, 4.2, 1.5],
                            [6.9, 3.1, 5.4, 2.1]]
            df = pd.DataFrame(data=predict_data, columns=list(feature_names))
            predict_df = pyfunc_model.predict(df)

            # Print the original and the reloaded prediction for each flower.
            reload_template = '\nOriginal prediction is "{}", reloaded prediction is "{}"'
            for expec, pred in zip(old_predictions, predict_df['classes']):
                # Index of the first row whose 'classes' value equals this prediction.
                first_match = predict_df.loc[
                    predict_df['classes'] == pred].index[0]
                class_id = predict_df['class_ids'][first_match]
                print(reload_template.format(expec, SPECIES[class_id]))
        finally:
            # Always discard the local export, even if loading or predicting failed.
            shutil.rmtree(export_dir)
Exemple #13
0
def log_model(spark_model,
              artifact_path,
              conda_env=None,
              dfs_tmpdir=None,
              sample_input=None,
              registered_model_name=None,
              signature: ModelSignature = None,
              input_example: ModelInputExample = None):
    """
    Log a Spark MLlib model as an MLflow artifact for the current run. This uses the
    MLlib persistence format and produces an MLflow Model with the Spark flavor.

    Note: If no run is active, it will instantiate a run to obtain a run_id.

    :param spark_model: Spark model to be saved - MLflow can only save descendants of
                        pyspark.ml.Model which implement MLReadable and MLWritable.
    :param artifact_path: Run relative artifact path.
    :param conda_env: Either a dictionary representation of a Conda environment or the path to a
                      Conda environment yaml file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in :func:`get_default_conda_env()`. If `None`, the default
                      :func:`get_default_conda_env()` environment is added to the model.
                      The following is an *example* dictionary representation of a Conda
                      environment::

                        {
                            'name': 'mlflow-env',
                            'channels': ['defaults'],
                            'dependencies': [
                                'python=3.7.0',
                                'pyspark=2.3.0'
                            ]
                        }
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model is written in this
                       destination and then copied into the model's artifact directory. This is
                       necessary as Spark ML models read from and write to DFS if running on a
                       cluster. If this operation completes successfully, all temporary files
                       created on the DFS are removed. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input used to add the MLeap flavor to the model.
                         This must be a PySpark DataFrame that the model can evaluate. If
                         ``sample_input`` is ``None``, the MLeap flavor is not added.
    :param registered_model_name: (Experimental) If given, create a model version under
                                  ``registered_model_name``, also creating a registered model if one
                                  with the given name does not exist.

    :param signature: (Experimental) :py:class:`ModelSignature <mlflow.models.ModelSignature>`
                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
                      from datasets with valid model input (e.g. the training dataset with target
                      column omitted) and valid model output (e.g. model predictions generated on
                      the training dataset), for example:

                      .. code-block:: python

                        from mlflow.models.signature import infer_signature
                        train = df.drop_column("target_label")
                        predictions = ... # compute model predictions
                        signature = infer_signature(train, predictions)
    :param input_example: (Experimental) Input example provides one or several instances of valid
                          model input. The example can be used as a hint of what data to feed the
                          model. The given example will be converted to a Pandas DataFrame and then
                          serialized to json using the Pandas split-oriented format. Bytes are
                          base64-encoded.
    :return: The result of ``Model.log`` when the model is persisted through it (local artifact
             URI, or Spark failing to write to the artifact location); otherwise ``None``.

    .. code-block:: python
        :caption: Example

        from pyspark.ml import Pipeline
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.feature import HashingTF, Tokenizer
        training = spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"])
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training)
        mlflow.spark.log_model(model, "spark-model")
    """
    from py4j.protocol import Py4JJavaError

    _validate_model(spark_model)
    from pyspark.ml import PipelineModel
    # Bare models are wrapped so that everything downstream handles a PipelineModel.
    if not isinstance(spark_model, PipelineModel):
        spark_model = PipelineModel([spark_model])
    run_id = kiwi.tracking.fluent._get_or_start_run().info.run_id
    run_root_artifact_uri = kiwi.get_artifact_uri()

    def _log_via_model_log():
        # Single definition of the Model.log() path, so that both callers below forward
        # the same arguments -- including signature and input_example.
        return Model.log(artifact_path=artifact_path,
                         flavor=kiwi.spark,
                         spark_model=spark_model,
                         conda_env=conda_env,
                         dfs_tmpdir=dfs_tmpdir,
                         sample_input=sample_input,
                         registered_model_name=registered_model_name,
                         signature=signature,
                         input_example=input_example)

    # If the artifact URI is a local filesystem path, defer to Model.log() to persist the model,
    # since Spark may not be able to write directly to the driver's filesystem. For example,
    # writing to `file:/uri` will write to the local filesystem from each executor, which will
    # be incorrect on multi-node clusters - to avoid such issues we just use the Model.log() path
    # here.
    if is_local_uri(run_root_artifact_uri):
        return _log_via_model_log()

    # If Spark cannot write directly to the artifact repo, defer to Model.log() to persist the
    # model
    model_dir = os.path.join(run_root_artifact_uri, artifact_path)
    try:
        spark_model.save(os.path.join(model_dir, _SPARK_MODEL_PATH_SUB))
    except Py4JJavaError:
        return _log_via_model_log()

    # Otherwise, override the default model log behavior and save model directly to artifact repo.
    # Spark has already written the model data; only the metadata still has to be uploaded.
    mlflow_model = Model(artifact_path=artifact_path, run_id=run_id)
    with TempDir() as tmp:
        tmp_model_metadata_dir = tmp.path()
        _save_model_metadata(tmp_model_metadata_dir,
                             spark_model,
                             mlflow_model,
                             sample_input,
                             conda_env,
                             signature=signature,
                             input_example=input_example)
        kiwi.tracking.fluent.log_artifacts(tmp_model_metadata_dir,
                                           artifact_path)
        if registered_model_name is not None:
            kiwi.register_model("runs:/%s/%s" % (run_id, artifact_path),
                                registered_model_name)