Example #1
    def test_model_log(self):
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            model_path = tmp.path("linear.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(self._linear_lr, f)
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                pyfunc.log_model(artifact_path="linear",
                                 data_path=model_path,
                                 loader_module=os.path.basename(__file__)[:-3],
                                 code_path=[__file__])

                run_id = tracking.active_run().info.run_uuid
                path = tracking._get_model_log_dir("linear", run_id)
                m = Model.load(os.path.join(path, "MLmodel"))
                print(m.__dict__)
                x = pyfunc.load_pyfunc("linear", run_id=run_id)
                xpred = x.predict(self._X)
                np.testing.assert_array_equal(self._linear_lr_predict, xpred)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(None)
                # Remove the log directory in order to avoid adding new tests to pytest...
                shutil.rmtree(tracking_dir)
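Note: `pyfunc.log_model` with `loader_module`/`data_path` delegates deserialization to the named module. A minimal sketch of such a loader module is shown below; the `_load_pyfunc(data_path)` entry point is what current MLflow versions look up, so treat the exact hook name as an assumption for the older API used in these examples.

import pickle

def _load_pyfunc(data_path):
    # Called by MLflow when pyfunc.load_pyfunc() reconstructs the logged model;
    # here we simply unpickle the file that was passed as data_path.
    with open(data_path, "rb") as f:
        return pickle.load(f)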
Example #2
def test_start_run_context_manager():
    with temp_directory() as tmp_dir, mock.patch("mlflow.tracking._get_store") as get_store_mock:
        get_store_mock.return_value = FileStore(tmp_dir)
        first_run = tracking.start_run()
        store = first_run.store
        first_uuid = first_run.run_info.run_uuid
        with first_run:
            # Check that start_run() causes the run information to be persisted in the store
            persisted_run = store.get_run(first_uuid)
            assert persisted_run is not None
            assert persisted_run.info == first_run.run_info
        finished_run = store.get_run(first_uuid)
        assert finished_run.info.status == RunStatus.FINISHED
        # Launch a separate run that fails, verify the run status is FAILED and the run UUID is
        # different
        second_run = tracking.start_run()
        assert second_run.run_info.run_uuid != first_uuid
        with pytest.raises(Exception):
            with second_run:
                raise Exception("Failing run!")
        finished_run2 = store.get_run(second_run.run_info.run_uuid)
        assert finished_run2.info == second_run.run_info
        assert finished_run2.info.status == RunStatus.FAILED
        # Nested runs return original run
        with tracking.start_run():
            active_run = tracking._active_run
            new_run = tracking.start_run()
            assert active_run == new_run
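The status transitions exercised above correspond to the public context-manager pattern. A minimal sketch using the modern `mlflow.start_run` entry point (rather than the internal `tracking` module):

import mlflow

# The run is marked FINISHED when the block exits normally and FAILED if it raises.
with mlflow.start_run() as run:
    mlflow.log_param("alpha", 0.5)
    mlflow.log_metric("rmse", 0.72)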
Example #3
def test_log_artifact():
    with temp_directory() as tmp_dir, temp_directory() as artifact_src_dir,\
            mock.patch("mlflow.tracking._get_store") as get_store_mock:
        get_store_mock.return_value = FileStore(tmp_dir)
        # Create artifacts
        _, path0 = tempfile.mkstemp(dir=artifact_src_dir)
        _, path1 = tempfile.mkstemp(dir=artifact_src_dir)
        for i, path in enumerate([path0, path1]):
            with open(path, "w") as handle:
                handle.write("%s" % str(i))
        # Log an artifact, verify it exists in the directory returned by get_artifact_uri
        # after the run finishes
        artifact_parent_dirs = ["some_parent_dir", None]
        for parent_dir in artifact_parent_dirs:
            with tracking.start_run():
                run_artifact_dir = mlflow.get_artifact_uri()
                mlflow.log_artifact(path0, parent_dir)
            expected_dir = os.path.join(run_artifact_dir, parent_dir)\
                if parent_dir is not None else run_artifact_dir
            assert os.listdir(expected_dir) == [os.path.basename(path0)]
            logged_artifact_path = os.path.join(expected_dir, os.path.basename(path0))
            assert filecmp.cmp(logged_artifact_path, path0, shallow=False)
        # Log multiple artifacts, verify they exist in the directory returned by get_artifact_uri
        for parent_dir in artifact_parent_dirs:
            with tracking.start_run():
                run_artifact_dir = mlflow.get_artifact_uri()
                mlflow.log_artifacts(artifact_src_dir, parent_dir)
            # Check that the logged artifacts match
            expected_artifact_output_dir = os.path.join(run_artifact_dir, parent_dir) \
                if parent_dir is not None else run_artifact_dir
            dir_comparison = filecmp.dircmp(artifact_src_dir, expected_artifact_output_dir)
            assert len(dir_comparison.left_only) == 0
            assert len(dir_comparison.right_only) == 0
            assert len(dir_comparison.diff_files) == 0
            assert len(dir_comparison.funny_files) == 0
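For reference, the same artifact APIs in user code, as a minimal sketch (the file and directory names are placeholders):

import mlflow

with mlflow.start_run():
    # Log a single file, optionally under a sub-directory of the run's artifact root.
    mlflow.log_artifact("model.pkl", artifact_path="models")
    # Log an entire local directory.
    mlflow.log_artifacts("plots", artifact_path="figures")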
Example #4
def test_no_nested_run():
    with temp_directory() as tmp_dir, mock.patch(
            "mlflow.tracking._get_store") as get_store_mock:
        get_store_mock.return_value = FileStore(tmp_dir)
        first_run = tracking.start_run()
        with first_run:
            with pytest.raises(Exception):
                tracking.start_run()
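The test above relies on `start_run` raising when a run is already active. Later MLflow releases also support explicit nesting; a sketch assuming `nested=True` is available in the installed version:

import mlflow

with mlflow.start_run() as parent:
    # The child run is attached to the parent instead of raising an exception.
    with mlflow.start_run(nested=True):
        mlflow.log_metric("child_metric", 1.0)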
Example #5
    def test_log_saved_model(self):
        # This tests model logging capabilities on the sklearn.iris dataset.
        iris = datasets.load_iris()
        X = iris.data[:, :2]  # we only take the first two features.
        y = iris.target
        trainingFeatures = {}
        for i in range(0, 2):
            # TensorFlow is fickle about feature names, so we remove offending characters
            iris.feature_names[i] = iris.feature_names[i].replace(" ", "")
            iris.feature_names[i] = iris.feature_names[i].replace("(", "")
            iris.feature_names[i] = iris.feature_names[i].replace(")", "")
            trainingFeatures[iris.feature_names[i]] = iris.data[:, i:i + 1]
        tf_feat_cols = []
        feature_names = iris.feature_names[:2]
        # Creating TensorFlow-specific numeric columns for input.
        for col in iris.feature_names[:2]:
            tf_feat_cols.append(tf.feature_column.numeric_column(col))
        # Creating input training function.
        input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                         y,
                                                         shuffle=False,
                                                         batch_size=1)
        # Creating Deep Neural Network Regressor.
        estimator = tf.estimator.DNNRegressor(feature_columns=tf_feat_cols,
                                              hidden_units=[1])
        # Training and creating expected predictions on training dataset.
        estimator.train(input_train, steps=10)
        # Saving the estimator's prediction on the training data; assume the DNNRegressor
        # produces a single output column named 'predictions'
        pred_col = "predictions"
        estimator_preds = [s[pred_col] for s in estimator.predict(input_train)]
        estimator_preds_df = pd.DataFrame({pred_col: estimator_preds})

        old_tracking_uri = tracking.get_tracking_uri()
        # should_start_run tests whether or not calling log_model() automatically starts a run.
        for should_start_run in [False, True]:
            with TempDir(chdr=True, remove_on_exit=True) as tmp:
                try:
                    # Creating dict of features names (str) to placeholders (tensors)
                    feature_spec = {}
                    for name in feature_names:
                        feature_spec[name] = tf.placeholder("float",
                                                            name=name,
                                                            shape=[150])
                    tracking.set_tracking_uri("test")
                    if should_start_run:
                        tracking.start_run()
                    pyfunc_preds_df = self.helper(
                        feature_spec, tmp, estimator,
                        pandas.DataFrame(data=X, columns=feature_names))

                    # Asserting that the loaded model predictions are as expected.
                    assert estimator_preds_df.equals(pyfunc_preds_df)
                finally:
                    # Restoring the old logging location.
                    tracking.end_run()
                    tracking.set_tracking_uri(old_tracking_uri)
Example #6
    def test_log_saved_model(self):
        # This tests model logging capabilities on the sklearn.iris dataset.
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            iris = datasets.load_iris()
            X = iris.data[:, :2]  # we only take the first two features.
            y = iris.target
            trainingFeatures = {}
            feature_names = iris.feature_names[:2]
            for i in range(0, 2):
                # TensorFlow is fickle about feature names, so we remove offending characters
                iris.feature_names[i] = iris.feature_names[i].replace(" ", "")
                iris.feature_names[i] = iris.feature_names[i].replace("(", "")
                iris.feature_names[i] = iris.feature_names[i].replace(")", "")
                trainingFeatures[iris.feature_names[i]] = iris.data[:, i:i + 1]
            tf_feat_cols = []
            feature_names = iris.feature_names[:2]
            # Creating TensorFlow-specific numeric columns for input.
            for col in iris.feature_names[:2]:
                tf_feat_cols.append(tf.feature_column.numeric_column(col))
            # Creating input training function.
            input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                             y,
                                                             shuffle=False,
                                                             batch_size=1)
            # Creating Deep Neural Network Regressor.
            estimator = tf.estimator.DNNRegressor(feature_columns=tf_feat_cols,
                                                  hidden_units=[1])
            # Training and creating expected predictions on training dataset.
            estimator.train(input_train, steps=100)
            estimator_preds = estimator.predict(input_train)
            # Setting the logging such that it is in the temp folder and deleted after the test.
            old_tracking_dir = tracking.get_tracking_uri()
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                # Creating dict of features names (str) to placeholders (tensors)
                feature_spec = {}
                for name in feature_names:
                    feature_spec[name] = tf.placeholder("float",
                                                        name=name,
                                                        shape=[150])

                saved = [s['predictions'] for s in estimator_preds]

                results = self.helper(
                    feature_spec, tmp, estimator,
                    pandas.DataFrame(data=X, columns=feature_names))

                # Asserting that the loaded model predictions are as expected.
                np.testing.assert_array_equal(saved, results)
            finally:
                # Restoring the old logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_dir)
Example #7
 def test_model_log(self):
     with TempDir(chdr=True, remove_on_exit=True):
         tracking.start_run()
         try:
             sklearn.log_model(sk_model=self._linear_lr,
                               artifact_path="linear")
             x = sklearn.load_model(
                 "linear", run_id=tracking.active_run().info.run_uuid)
             xpred = x.predict(self._X)
             np.testing.assert_array_equal(self._linear_lr_predict, xpred)
         finally:
             tracking.end_run()
Example #8
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data  # use all four iris features
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names,
                                outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Compute predictions on the training data
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid
            x = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = x.predict(pandas_df)
            assert preds1 == preds2
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            preds_df_1 = reloaded_model.transform(spark_df)
            preds3 = [
                x.prediction
                for x in preds_df_1.select("prediction").collect()
            ]
            assert preds1 == preds3
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
Example #9
 def test_model_log(self):
     old_uri = tracking.get_tracking_uri()
     # should_start_run tests whether or not calling log_model() automatically starts a run.
     for should_start_run in [False, True]:
         with TempDir(chdr=True, remove_on_exit=True) as tmp:
             try:
                 tracking.set_tracking_uri("test")
                 if should_start_run:
                     tracking.start_run()
                 sklearn.log_model(sk_model=self._linear_lr, artifact_path="linear")
                 x = sklearn.load_model("linear", run_id=tracking.active_run().info.run_uuid)
                 xpred = x.predict(self._X)
                 np.testing.assert_array_equal(self._linear_lr_predict, xpred)
             finally:
                 tracking.end_run()
                 tracking.set_tracking_uri(old_uri)
Example #10
def main(argv):
    # Builds, trains and evaluates a tf.estimator. Then, exports it for inference, logs the exported model 
    # with MLflow, and loads the fitted model back as a PyFunc to make predictions.
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data()
    # There are 13 features we are using for inference.
    feat_cols = [tf.feature_column.numeric_column(key="features", shape=(x_train.shape[1],))]
    feat_spec = {"features":tf.placeholder("float", name="features", shape=[None, x_train.shape[1]])}
    hidden_units = [50, 20]
    steps = 1000
    regressor = tf.estimator.DNNRegressor(hidden_units=hidden_units, feature_columns=feat_cols)
    train_input_fn = tf.estimator.inputs.numpy_input_fn({"features": x_train}, y_train, num_epochs=None, shuffle=True)
    with tracking.start_run() as tracked_run:
        mlflow.log_param("Hidden Units", hidden_units)
        mlflow.log_param("Steps", steps)
        regressor.train(train_input_fn, steps=steps)
        test_input_fn = tf.estimator.inputs.numpy_input_fn({"features": x_test}, y_test, num_epochs=None, shuffle=True)
        # Compute mean squared error
        mse = regressor.evaluate(test_input_fn, steps=steps)
        mlflow.log_metric("Mean Square Error", mse['average_loss'])
        # Building a receiver function for exporting
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feat_spec)
        temp = tempfile.mkdtemp()
        try:
            saved_estimator_path = regressor.export_savedmodel(temp, receiver_fn).decode("utf-8")
            # Logging the saved model
            tensorflow.log_saved_model(saved_model_dir=saved_estimator_path, signature_def_key="predict", artifact_path="model")
            # Reloading the model
            pyfunc = tensorflow.load_pyfunc(saved_estimator_path)
            df = pd.DataFrame(data=x_test, columns=["features"] * x_train.shape[1])
            # Predicting on the loaded Python Function
            predict_df = pyfunc.predict(df)
            predict_df['original_labels'] = y_test
            print(predict_df)
        finally:
            shutil.rmtree(temp)
Example #11
 def test_log_saved_model(self):
     with TempDir(chdr=False, remove_on_exit=True) as tmp:
         # Setting the logging such that it is in the temp folder and deleted after the test.
         old_tracking_dir = tracking.get_tracking_uri()
         tracking_dir = os.path.abspath(tmp.path("mlruns"))
         tracking.set_tracking_uri("file://%s" % tracking_dir)
         tracking.start_run()
         try:
             # Creating dict of features names (str) to placeholders (tensors)
             feature_spec = {}
             for name in self._feature_names:
                 feature_spec[name] = tf.placeholder("float",
                                                     name=name,
                                                     shape=[150])
             # Creating receiver function for model saving.
             receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
                 feature_spec)
             saved_model_path = tmp.path("model")
             os.makedirs(saved_model_path)
             os.makedirs(tmp.path("hello"))
             # Saving Tensorflow model.
             saved_model_path = self._dnn.export_savedmodel(
                 saved_model_path, receiver_fn).decode("utf-8")
             # Logging the Tensorflow model just saved.
             tensorflow.log_saved_model(saved_model_dir=saved_model_path,
                                        signature_def_key="predict",
                                        artifact_path=tmp.path("hello"))
             # Loading the saved Tensorflow model as a pyfunc.
             x = pyfunc.load_pyfunc(saved_model_path)
             # Predicting on the iris dataset using the pyfunc.
             xpred = x.predict(
                 pandas.DataFrame(data=self._X,
                                  columns=self._feature_names))
             saved = []
             for s in self._dnn_predict:
                 saved.append(s['predictions'])
             loaded = []
             for index, rows in xpred.iterrows():
                 loaded.append(rows)
             # Asserting that the loaded model predictions are as expected.
             np.testing.assert_array_equal(saved, loaded)
         finally:
             # Restoring the old logging location.
             tracking.end_run()
             tracking.set_tracking_uri(old_tracking_dir)
Example #12
def test_log_metric_validation():
    with temp_directory() as tmp_dir, mock.patch("mlflow.tracking._get_store") as get_store_mock:
        get_store_mock.return_value = FileStore(tmp_dir)
        active_run = tracking.start_run()
        run_uuid = active_run.run_info.run_uuid
        with active_run:
            mlflow.log_metric("name_1", "apple")
        finished_run = active_run.store.get_run(run_uuid)
        assert len(finished_run.data.metrics) == 0
Example #13
    def test_model_log(self):
        old_uri = tracking.get_tracking_uri()
        # should_start_run tests whether or not calling log_model() automatically starts a run.
        for should_start_run in [False, True]:
            with TempDir(chdr=True, remove_on_exit=True) as tmp:
                try:
                    tracking.set_tracking_uri("test")
                    if should_start_run:
                        tracking.start_run()
                    mlflow.h2o.log_model(self.gbm, artifact_path="gbm")

                    # Load model
                    gbm_loaded = mlflow.h2o.load_model("gbm",
                                                       run_id=tracking.active_run().info.run_uuid)
                    assert all(gbm_loaded.predict(self.test).as_data_frame() == self.predicted)
                finally:
                    tracking.end_run()
                    tracking.set_tracking_uri(old_uri)
Example #14
def test_start_and_end_run():
    with temp_directory() as tmp_dir, mock.patch("mlflow.tracking._get_store") as get_store_mock:
        get_store_mock.return_value = FileStore(tmp_dir)
        # Use the start_run() and end_run() APIs without a `with` block, verify they work.
        active_run = tracking.start_run()
        mlflow.log_metric("name_1", 25)
        tracking.end_run()
        finished_run = active_run.store.get_run(active_run.run_info.run_uuid)
        # Validate metrics
        assert len(finished_run.data.metrics) == 1
        expected_pairs = {"name_1": 25}
        for metric in finished_run.data.metrics:
            assert expected_pairs[metric.key] == metric.value
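Outside of tests, the same non-`with` pattern looks like the sketch below; `mlflow.end_run` also accepts an explicit terminal status.

import mlflow

run = mlflow.start_run()
mlflow.log_metric("name_1", 25)
mlflow.end_run()              # marks the run FINISHED
# mlflow.end_run("FAILED")    # or end it with an explicit status instead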
Example #15
def _run_project(project, entry_point, work_dir, parameters, use_conda,
                 storage_dir, experiment_id):
    """Locally run a project that has been checked out in `work_dir`."""
    mlflow.set_tracking_uri('..\\')  #added by cliicy
    if storage_dir is not None and not os.path.exists(storage_dir):
        os.makedirs(storage_dir)
    storage_dir_for_run = tempfile.mkdtemp(dir=storage_dir)
    print(
        "=== Created directory %s for downloading remote URIs passed to arguments of "
        "type 'path' ===" % storage_dir_for_run)
    # Try to build the command first in case the user mis-specified parameters
    run_project_command = project.get_entry_point(entry_point).compute_command(
        parameters, storage_dir_for_run)
    commands = []

    # Create a new run and log every provided parameter into it.
    active_run = tracking.start_run(
        experiment_id=experiment_id,
        source_name=project.uri,
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point,
        source_type=SourceType.PROJECT)
    for key, value in parameters.items():
        active_run.log_param(Param(key, value))
    # Add the run id into a magic environment variable that the subprocess will read,
    # causing it to reuse the run.
    exp_id = experiment_id or tracking._get_experiment_id()
    env_map = {
        tracking._RUN_NAME_ENV_VAR: active_run.run_info.run_uuid,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(exp_id),
    }

    commands.append(run_project_command)
    command = " && ".join(commands)
    print("=== Running command: %s ===" % command)
    try:
        command = "python my_train.py 0.4 0.1"
        print("will run command aaaaa " + command + " " + work_dir + " aaaaa ")
        process.exec_cmd(command,
                         cwd=work_dir,
                         stream_output=True,
                         env=env_map)
        #process.exec_cmd([os.environ.get("SHELL", "bash"), "-c", command], cwd=work_dir,
        #                 stream_output=True, env=env_map)
        tracking.end_run()
        print("=== Run succeeded ===")
    except process.ShellCommandException:
        tracking.end_run("FAILED")
        print("=== Run failed ===")
Example #16
def _run_project(project, entry_point, work_dir, parameters, use_conda,
                 storage_dir, experiment_id):
    """Locally run a project that has been checked out in `work_dir`."""
    storage_dir_for_run = _get_storage_dir(storage_dir)
    eprint(
        "=== Created directory %s for downloading remote URIs passed to arguments of "
        "type 'path' ===" % storage_dir_for_run)
    # Try to build the command first in case the user mis-specified parameters
    run_project_command = project.get_entry_point(entry_point)\
        .compute_command(parameters, storage_dir_for_run)
    commands = []
    if use_conda:
        conda_env_path = os.path.abspath(
            os.path.join(work_dir, project.conda_env))
        _maybe_create_conda_env(conda_env_path)
        commands.append("source activate %s" %
                        _get_conda_env_name(conda_env_path))

    # Create a new run and log every provided parameter into it.
    active_run = tracking.start_run(
        experiment_id=experiment_id,
        source_name=project.uri,
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point,
        source_type=SourceType.PROJECT)
    if parameters is not None:
        for key, value in parameters.items():
            active_run.log_param(Param(key, value))
    # Add the run id into a magic environment variable that the subprocess will read,
    # causing it to reuse the run.
    exp_id = experiment_id or tracking._get_experiment_id()
    env_map = {
        tracking._RUN_NAME_ENV_VAR: active_run.run_info.run_uuid,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(exp_id),
    }

    commands.append(run_project_command)
    command = " && ".join(commands)
    eprint("=== Running command: %s ===" % command)
    try:
        process.exec_cmd([os.environ.get("SHELL", "bash"), "-c", command],
                         cwd=work_dir,
                         stream_output=True,
                         env=env_map)
        tracking.end_run()
        eprint("=== Run succeeded ===")
    except process.ShellCommandException:
        tracking.end_run("FAILED")
        eprint("=== Run failed ===")
Example #17
def test_log_metric():
    with temp_directory() as tmp_dir, mock.patch("mlflow.tracking._get_store") as get_store_mock:
        get_store_mock.return_value = FileStore(tmp_dir)
        active_run = tracking.start_run()
        run_uuid = active_run.run_info.run_uuid
        with active_run:
            mlflow.log_metric("name_1", 25)
            mlflow.log_metric("name_2", -3)
            mlflow.log_metric("name_1", 30)
        finished_run = active_run.store.get_run(run_uuid)
        # Validate metrics
        assert len(finished_run.data.metrics) == 2
        expected_pairs = {"name_1": 30, "name_2": -3}
        for metric in finished_run.data.metrics:
            assert expected_pairs[metric.key] == metric.value
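As the assertion shows, a run's metrics report only the latest value per key. The full history remains retrievable through the client API, assuming a reasonably recent MLflow; a sketch:

import mlflow
from mlflow.tracking import MlflowClient

with mlflow.start_run() as run:
    mlflow.log_metric("name_1", 25)
    mlflow.log_metric("name_1", 30)   # the run's metrics dict keeps only this value

# Every logged point is still available as history.
history = MlflowClient().get_metric_history(run.info.run_id, "name_1")
print([m.value for m in history])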
Example #18
def test_log_param():
    with temp_directory() as tmp_dir, mock.patch("mlflow.tracking._get_store") as get_store_mock:
        get_store_mock.return_value = FileStore(tmp_dir)
        active_run = tracking.start_run()
        run_uuid = active_run.run_info.run_uuid
        with active_run:
            mlflow.log_param("name_1", "a")
            mlflow.log_param("name_2", "b")
            mlflow.log_param("name_1", "c")
        finished_run = active_run.store.get_run(run_uuid)
        # Validate params
        assert len(finished_run.data.params) == 2
        expected_pairs = {"name_1": "c", "name_2": "b"}
        for param in finished_run.data.params:
            assert expected_pairs[param.key] == param.value
Example #19
def test_model_log(model, data, predicted):
    x, y = data
    old_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri("test")
                if should_start_run:
                    tracking.start_run()
                mlflow.keras.log_model(model, artifact_path="keras_model")

                # Load model
                model_loaded = mlflow.keras.load_model(
                    "keras_model", run_id=tracking.active_run().info.run_uuid)
                assert all(model_loaded.predict(x) == predicted)

                # Loading pyfunc model
                pyfunc_loaded = mlflow.pyfunc.load_pyfunc(
                    "keras_model", run_id=tracking.active_run().info.run_uuid)
                assert all(pyfunc_loaded.predict(x).values == predicted)
            finally:
                tracking.end_run()
    tracking.set_tracking_uri(old_uri)
Example #20
    def test_categorical_columns(self):
        """
        This tests logging capabilities on datasets with categorical columns.
        See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/get_started/\
        regression/imports85.py
        for reference code.
        """
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            path = os.path.abspath("tests/data/uci-autos-imports-85.data")
            # Order is important for the csv-readers, so we use an OrderedDict here.
            defaults = collections.OrderedDict([("body-style", [""]),
                                                ("curb-weight", [0.0]),
                                                ("highway-mpg", [0.0]),
                                                ("price", [0.0])])

            types = collections.OrderedDict(
                (key, type(value[0])) for key, value in defaults.items())
            df = pandas.read_csv(path,
                                 names=types.keys(),
                                 dtype=types,
                                 na_values="?")
            df = df.dropna()

            # Extract the label from the features dataframe.
            y_train = df.pop("price")

            # Creating the input training function required.
            trainingFeatures = {}

            for i in df:
                trainingFeatures[i] = df[i].values

            input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                             y_train.values,
                                                             shuffle=False,
                                                             batch_size=1)

            # Creating the feature columns required for the DNNRegressor.
            body_style_vocab = [
                "hardtop", "wagon", "sedan", "hatchback", "convertible"
            ]
            body_style = tf.feature_column.categorical_column_with_vocabulary_list(
                key="body-style", vocabulary_list=body_style_vocab)
            feature_columns = [
                tf.feature_column.numeric_column(key="curb-weight"),
                tf.feature_column.numeric_column(key="highway-mpg"),
                # Since this is a DNN model, convert categorical columns from sparse
                # to dense.
                # Wrap them in an `indicator_column` to create a
                # one-hot vector from the input.
                tf.feature_column.indicator_column(body_style)
            ]

            # Build a DNNRegressor, with 2x20-unit hidden layers, with the feature columns
            # defined above as input.
            estimator = tf.estimator.DNNRegressor(
                hidden_units=[20, 20], feature_columns=feature_columns)

            # Training the estimator.
            estimator.train(input_fn=input_train, steps=10)
            # Saving the estimator's prediction on the training data; assume the DNNRegressor
            # produces a single output column named 'predictions'
            pred_col = "predictions"
            estimator_preds = [
                s[pred_col] for s in estimator.predict(input_train)
            ]
            estimator_preds_df = pd.DataFrame({pred_col: estimator_preds})
            # Setting the logging such that it is in the temp folder and deleted after the test.
            old_tracking_dir = tracking.get_tracking_uri()
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                # Creating dict of features names (str) to placeholders (tensors)
                feature_spec = {}
                feature_spec["body-style"] = tf.placeholder("string",
                                                            name="body-style",
                                                            shape=[None])
                feature_spec["curb-weight"] = tf.placeholder(
                    "float", name="curb-weight", shape=[None])
                feature_spec["highway-mpg"] = tf.placeholder(
                    "float", name="highway-mpg", shape=[None])

                pyfunc_preds_df = self.helper(feature_spec, tmp, estimator, df)
                # Asserting that the loaded model predictions are as expected. Allow for some
                # imprecision as this is expected with TensorFlow.
                pandas.testing.assert_frame_equal(pyfunc_preds_df,
                                                  estimator_preds_df,
                                                  check_less_precise=6)
            finally:
                # Restoring the old logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_dir)
Example #21
    def test_categorical_columns(self):
        """
        This tests logging capabilities on datasets with categorical columns.
        See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/get_started/regression/imports85.py
        for reference code.
        """
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            # Downloading the data into a pandas DataFrame.
            URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
            path = tf.contrib.keras.utils.get_file(URL.split("/")[-1], URL)
            # Order is important for the csv-readers, so we use an OrderedDict here.
            defaults = collections.OrderedDict([("body-style", [""]),
                                                ("curb-weight", [0.0]),
                                                ("highway-mpg", [0.0]),
                                                ("price", [0.0])])

            types = collections.OrderedDict(
                (key, type(value[0])) for key, value in defaults.items())
            df = pandas.read_csv(path,
                                 names=types.keys(),
                                 dtype=types,
                                 na_values="?")
            df = df.dropna()

            # Extract the label from the features dataframe.
            y_train = df.pop("price")

            # Creating the input training function required.
            trainingFeatures = {}

            for i in df:
                trainingFeatures[i] = df[i].values

            input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                             y_train.values,
                                                             shuffle=False,
                                                             batch_size=1)

            # Creating the feature columns required for the DNNRegressor.
            body_style_vocab = [
                "hardtop", "wagon", "sedan", "hatchback", "convertible"
            ]
            body_style = tf.feature_column.categorical_column_with_vocabulary_list(
                key="body-style", vocabulary_list=body_style_vocab)
            feature_columns = [
                tf.feature_column.numeric_column(key="curb-weight"),
                tf.feature_column.numeric_column(key="highway-mpg"),
                # Since this is a DNN model, convert categorical columns from sparse
                # to dense.
                # Wrap them in an `indicator_column` to create a
                # one-hot vector from the input.
                tf.feature_column.indicator_column(body_style),
            ]

            # Build a DNNRegressor, with 2x20-unit hidden layers, with the feature columns
            # defined above as input.
            estimator = tf.estimator.DNNRegressor(
                hidden_units=[20, 20], feature_columns=feature_columns)

            # Training the estimator.
            estimator.train(input_fn=input_train, steps=100)
            # Saving the estimator's prediction on the training data.
            estimator_preds = estimator.predict(input_train)
            # Setting the logging such that it is in the temp folder and deleted after the test.
            old_tracking_dir = tracking.get_tracking_uri()
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                # Creating dict of features names (str) to placeholders (tensors)
                feature_spec = {}
                feature_spec["body-style"] = tf.placeholder("string",
                                                            name="body-style",
                                                            shape=[None])
                feature_spec["curb-weight"] = tf.placeholder(
                    "float", name="curb-weight", shape=[None])
                feature_spec["highway-mpg"] = tf.placeholder(
                    "float", name="highway-mpg", shape=[None])

                saved = [s['predictions'] for s in estimator_preds]

                results = self.helper(feature_spec, tmp, estimator, df)

                # Asserting that the loaded model predictions are as expected.
                # TensorFlow is known to have precision errors, hence the almost_equal.
                np.testing.assert_array_almost_equal(saved, results, decimal=2)
            finally:
                # Restoring the old logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_dir)
Example #22
def _run_project(project, entry_point, work_dir, parameters, use_conda,
                 storage_dir, experiment_id):
    """Locally run a project that has been checked out in `work_dir`."""
    if storage_dir is not None and not os.path.exists(storage_dir):
        os.makedirs(storage_dir)
    storage_dir_for_run = tempfile.mkdtemp(dir=storage_dir)
    eprint(
        "=== Created directory %s for downloading remote URIs passed to arguments of "
        "type 'path' ===" % storage_dir_for_run)
    # Try to build the command first in case the user mis-specified parameters
    run_project_command = project.get_entry_point(entry_point)\
        .compute_command(parameters, storage_dir_for_run)
    commands = []
    if use_conda:
        with open(os.path.join(work_dir, project.conda_env)) as conda_env_file:
            conda_env_sha = hashlib.sha1(
                conda_env_file.read().encode("utf-8")).hexdigest()
        conda_env = "mlflow-%s" % conda_env_sha
        (exit_code, _, stderr) = process.exec_cmd(["conda", "--help"],
                                                  throw_on_error=False)
        if exit_code != 0:
            eprint(
                'conda is not installed properly. Please follow the instructions on '
                'https://conda.io/docs/user-guide/install/index.html')
            eprint(stderr)
            sys.exit(1)
        (_, stdout,
         stderr) = process.exec_cmd(["conda", "env", "list", "--json"])
        env_names = [
            os.path.basename(env) for env in json.loads(stdout)['envs']
        ]

        conda_action = 'create'
        if conda_env not in env_names:
            eprint('=== Creating conda environment %s ===' % conda_env)
            process.exec_cmd([
                "conda", "env", conda_action, "-n", conda_env, "--file",
                project.conda_env
            ],
                             cwd=work_dir,
                             stream_output=True)
        commands.append("source activate %s" % conda_env)

    # Create a new run and log every provided parameter into it.
    active_run = tracking.start_run(
        experiment_id=experiment_id,
        source_name=project.uri,
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point,
        source_type=SourceType.PROJECT)
    for key, value in parameters.items():
        active_run.log_param(Param(key, value))
    # Add the run id into a magic environment variable that the subprocess will read,
    # causing it to reuse the run.
    exp_id = experiment_id or tracking._get_experiment_id()
    env_map = {
        tracking._RUN_NAME_ENV_VAR: active_run.run_info.run_uuid,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(exp_id),
    }

    commands.append(run_project_command)
    command = " && ".join(commands)
    eprint("=== Running command: %s ===" % command)
    try:
        process.exec_cmd([os.environ.get("SHELL", "bash"), "-c", command],
                         cwd=work_dir,
                         stream_output=True,
                         env=env_map)
        tracking.end_run()
        eprint("=== Run succeeded ===")
    except process.ShellCommandException:
        tracking.end_run("FAILED")
        eprint("=== Run failed ===")
Example #23
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data,
                             columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Compute predictions on the training data
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =",
                  dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                tracking.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    tracking.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                if dfs_tmp_dir:
                    sparkm.log_model(artifact_path=artifact_path,
                                     spark_model=model,
                                     dfs_tmpdir=dfs_tmp_dir)
                else:
                    sparkm.log_model(artifact_path=artifact_path,
                                     spark_model=model)
                run_id = tracking.active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path,
                                                   run_id=run_id)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [
                    x.prediction
                    for x in preds_df_1.select("prediction").collect()
                ]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # make sure we did not leave any temp files behind
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert not os.listdir(x)
                shutil.rmtree(x)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
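The `score_model_as_udf` helper referenced above is not shown; below is a hedged sketch of the underlying idea, reusing `spark_session`, `spark_df`, `feature_names`, `run_id`, and `artifact_path` from the example and assuming a modern MLflow where models are addressed by a `runs:/` URI.

import mlflow.pyfunc

# Wrap the logged model as a Spark UDF and apply it to the feature columns.
udf = mlflow.pyfunc.spark_udf(spark_session, "runs:/%s/%s" % (run_id, artifact_path))
scored_df = spark_df.withColumn("prediction", udf(*feature_names))
preds = [row.prediction for row in scored_df.select("prediction").collect()]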