Example #1
    def test_model_log(self):
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            model_path = tmp.path("linear.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(self._linear_lr, f)
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                pyfunc.log_model(artifact_path="linear",
                                 data_path=model_path,
                                 loader_module=os.path.basename(__file__)[:-3],
                                 code_path=[__file__])

                run_id = tracking.active_run().info.run_uuid
                path = tracking._get_model_log_dir("linear", run_id)
                m = Model.load(os.path.join(path, "MLmodel"))
                print(m.__dict__)
                x = pyfunc.load_pyfunc("linear", run_id=run_id)
                xpred = x.predict(self._X)
                np.testing.assert_array_equal(self._linear_lr_predict, xpred)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(None)
                # Remove the log directory in order to avoid adding new tests to pytest...
                shutil.rmtree(tracking_dir)
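Example #1 passes the test module itself as the pyfunc loader_module, so that module has to expose the loader hook MLflow calls when the model is reloaded. A minimal sketch of such a hook, assuming the convention that the loader module defines a _load_pyfunc(path) function returning an object with a predict() method (the exact hook name depends on the MLflow version):

import pickle


def _load_pyfunc(path):
    # Hypothetical loader hook: MLflow passes the logged data_path, which in
    # Example #1 is the pickled scikit-learn model, so unpickling it yields
    # an object that exposes predict().
    with open(path, "rb") as f:
        return pickle.load(f)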
Example #2
def test_log_model(sequential_model, data, sequential_predicted):
    old_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri(tmp.path("test"))
                if should_start_run:
                    mlflow.start_run()

                artifact_path = "pytorch"
                mlflow.pytorch.log_model(sequential_model,
                                         artifact_path=artifact_path)
                model_uri = "runs:/{run_id}/{artifact_path}".format(
                    run_id=mlflow.active_run().info.run_id,
                    artifact_path=artifact_path)

                # Load model
                sequential_model_loaded = mlflow.pytorch.load_model(
                    model_uri=model_uri)

                test_predictions = _predict(sequential_model_loaded, data)
                np.testing.assert_array_equal(test_predictions,
                                              sequential_predicted)
            finally:
                mlflow.end_run()
                tracking.set_tracking_uri(old_uri)
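The _predict helper and the sequential_model/data fixtures used above are not shown. A minimal sketch of what such a helper might look like for a torch.nn.Sequential model, assuming data is an (x, y) pair of pandas DataFrames as in the later PyTorch example:

import torch


def _predict(model, data):
    # Hypothetical helper: run the loaded model over the feature frame and
    # return the predictions as a flat NumPy array for comparison.
    x, _ = data
    with torch.no_grad():
        return model(torch.tensor(x.values, dtype=torch.float32)).numpy().flatten()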
Example #3
def test_set_experiment():
    with pytest.raises(TypeError):
        mlflow.set_experiment()

    with pytest.raises(Exception):
        mlflow.set_experiment(None)

    with pytest.raises(Exception):
        mlflow.set_experiment("")

    try:
        with TempDir() as tracking_uri:
            tracking.set_tracking_uri(tracking_uri.path())
            name = "random_exp"
            exp_id = mlflow.create_experiment(name)
            mlflow.set_experiment(name)
            run = start_run()
            assert run.info.experiment_id == exp_id
            end_run()

            another_name = "another_experiment"
            mlflow.set_experiment(another_name)
            exp_id2 = mlflow.tracking.MlflowClient().get_experiment_by_name(
                another_name)
            another_run = start_run()
            assert another_run.info.experiment_id == exp_id2.experiment_id
            end_run()
    finally:
        # Need to do this to clear active experiment to restore state
        mlflow.tracking.fluent._active_experiment_id = None
Example #4
def test_dnn():
    old_uri = tracking.get_tracking_uri()
    try:
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            diamonds = tmp.path("diamonds")
            estimator = tmp.path("estimator")
            artifacts = tmp.path("artifacts")
            os.mkdir(diamonds)
            os.mkdir(estimator)
            os.mkdir(artifacts)
            tracking.set_tracking_uri(artifacts)
            # Download the diamonds dataset via mlflow run
            run(".",
                entry_point="main",
                version=None,
                parameters={"dest-dir": diamonds},
                experiment_id=tracking._get_experiment_id(),
                mode="local",
                cluster_spec=None,
                git_username=None,
                git_password=None,
                use_conda=True,
                storage_dir=None)

            # Run the main dnn app via mlflow
            run("apps/dnn-regression",
                entry_point="main",
                version=None,
                parameters={
                    "model-dir": estimator,
                    "train": os.path.join(diamonds, "train_diamonds.parquet"),
                    "test": os.path.join(diamonds, "test_diamonds.parquet"),
                    "hidden-units": "30,30",
                    "label-col": "price",
                    "steps": 5000,
                    "batch-size": 128
                },
                experiment_id=tracking._get_experiment_id(),
                mode="local",
                cluster_spec=None,
                git_username=None,
                git_password=None,
                use_conda=True,
                storage_dir=None)

            # Loading the saved model as a pyfunc.
            pyfunc = tensorflow.load_pyfunc(
                os.path.join(estimator,
                             os.listdir(estimator)[0]))

            df = pandas.read_parquet(
                os.path.join(diamonds, "test_diamonds.parquet"))

            predict_df = pyfunc.predict(df)
            assert 'predictions' in predict_df
            assert isinstance(predict_df['predictions'][0][0], numpy.float32)
    finally:
        tracking.set_tracking_uri(old_uri)
Example #5
    def test_log_saved_model(self):
        # This tests model logging capabilities on the sklearn.iris dataset.
        iris = datasets.load_iris()
        X = iris.data[:, :2]  # we only take the first two features.
        y = iris.target
        trainingFeatures = {}
        for i in range(0, 2):
            # TensorFlow is fickle about feature names, so we remove offending characters
            iris.feature_names[i] = iris.feature_names[i].replace(" ", "")
            iris.feature_names[i] = iris.feature_names[i].replace("(", "")
            iris.feature_names[i] = iris.feature_names[i].replace(")", "")
            trainingFeatures[iris.feature_names[i]] = iris.data[:, i:i + 1]
        tf_feat_cols = []
        feature_names = iris.feature_names[:2]
        # Creating TensorFlow-specific numeric columns for input.
        for col in iris.feature_names[:2]:
            tf_feat_cols.append(tf.feature_column.numeric_column(col))
        # Creating input training function.
        input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                         y,
                                                         shuffle=False,
                                                         batch_size=1)
        # Creating Deep Neural Network Regressor.
        estimator = tf.estimator.DNNRegressor(feature_columns=tf_feat_cols,
                                              hidden_units=[1])
        # Training and creating expected predictions on training dataset.
        estimator.train(input_train, steps=10)
        # Saving the estimator's prediction on the training data; assume the DNNRegressor
        # produces a single output column named 'predictions'
        pred_col = "predictions"
        estimator_preds = [s[pred_col] for s in estimator.predict(input_train)]
        estimator_preds_df = pd.DataFrame({pred_col: estimator_preds})

        old_tracking_uri = tracking.get_tracking_uri()
        # should_start_run tests whether or not calling log_model() automatically starts a run.
        for should_start_run in [False, True]:
            with TempDir(chdr=True, remove_on_exit=True) as tmp:
                try:
                    # Creating dict of features names (str) to placeholders (tensors)
                    feature_spec = {}
                    for name in feature_names:
                        feature_spec[name] = tf.placeholder("float",
                                                            name=name,
                                                            shape=[150])
                    tracking.set_tracking_uri("test")
                    if should_start_run:
                        tracking.start_run()
                    pyfunc_preds_df = self.helper(
                        feature_spec, tmp, estimator,
                        pandas.DataFrame(data=X, columns=feature_names))

                    # Asserting that the loaded model predictions are as expected.
                    assert estimator_preds_df.equals(pyfunc_preds_df)
                finally:
                    # Restoring the old logging location.
                    tracking.end_run()
                    tracking.set_tracking_uri(old_tracking_uri)
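Several of the TensorFlow tests in this listing call a self.helper(...) method that is not shown. Example #15 below performs the same steps inline, so a plausible reconstruction, sketched from that flow rather than taken from the source (and reusing the same tf, tensorflow, pyfunc, and os imports as the surrounding tests), would export the estimator as a SavedModel, log it with MLflow, reload it as a pyfunc, and return its predictions on the given DataFrame:

    def helper(self, feature_spec, tmp, estimator, df):
        # Sketch reconstructed from Example #15: export, log, reload, predict.
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            feature_spec)
        saved_model_path = tmp.path("model")
        os.makedirs(saved_model_path)
        saved_model_path = estimator.export_savedmodel(
            saved_model_path, receiver_fn).decode("utf-8")
        tensorflow.log_saved_model(saved_model_dir=saved_model_path,
                                   signature_def_key="predict",
                                   artifact_path="model")
        pyfunc_model = pyfunc.load_pyfunc(saved_model_path)
        return pyfunc_model.predict(df)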
Example #6
def test_no_nested_run():
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        first_run = start_run()
        with first_run:
            with pytest.raises(Exception):
                start_run()
    finally:
        tracking.set_tracking_uri(None)
Example #7
    def test_log_saved_model(self):
        # This tests model logging capabilities on the sklearn.iris dataset.
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            iris = datasets.load_iris()
            X = iris.data[:, :2]  # we only take the first two features.
            y = iris.target
            trainingFeatures = {}
            feature_names = iris.feature_names[:2]
            for i in range(0, 2):
                # TensorFlow is fickle about feature names, so we remove offending characters
                iris.feature_names[i] = iris.feature_names[i].replace(" ", "")
                iris.feature_names[i] = iris.feature_names[i].replace("(", "")
                iris.feature_names[i] = iris.feature_names[i].replace(")", "")
                trainingFeatures[iris.feature_names[i]] = iris.data[:, i:i + 1]
            tf_feat_cols = []
            feature_names = iris.feature_names[:2]
            # Creating TensorFlow-specific numeric columns for input.
            for col in iris.feature_names[:2]:
                tf_feat_cols.append(tf.feature_column.numeric_column(col))
            # Creating input training function.
            input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                             y,
                                                             shuffle=False,
                                                             batch_size=1)
            # Creating Deep Neural Network Regressor.
            estimator = tf.estimator.DNNRegressor(feature_columns=tf_feat_cols,
                                                  hidden_units=[1])
            # Training and creating expected predictions on training dataset.
            estimator.train(input_train, steps=100)
            estimator_preds = estimator.predict(input_train)
            # Setting the logging such that it is in the temp folder and deleted after the test.
            old_tracking_dir = tracking.get_tracking_uri()
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                # Creating dict of features names (str) to placeholders (tensors)
                feature_spec = {}
                for name in feature_names:
                    feature_spec[name] = tf.placeholder("float",
                                                        name=name,
                                                        shape=[150])

                saved = [s['predictions'] for s in estimator_preds]

                results = self.helper(
                    feature_spec, tmp, estimator,
                    pandas.DataFrame(data=X, columns=feature_names))

                # Asserting that the loaded model predictions are as expected.
                np.testing.assert_array_equal(saved, results)
            finally:
                # Restoring the old logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_dir)
Example #8
def test_log_metric_validation():
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        active_run = start_run()
        run_uuid = active_run.info.run_uuid
        with active_run:
            mlflow.log_metric("name_1", "apple")
        finished_run = tracking.MlflowClient().get_run(run_uuid)
        assert len(finished_run.data.metrics) == 0
    finally:
        tracking.set_tracking_uri(None)
Example #9
    def test_model_log(self):
        with TempDir(chdr=True, remove_on_exit=True):
            tracking.set_tracking_uri("mlruns")
            tracking.start_run()
            try:
                sklearn.log_model(sk_model=self._linear_lr, artifact_path="linear")
                x = sklearn.load_model("linear", run_id=tracking.active_run().info.run_uuid)
                xpred = x.predict(self._X)
                np.testing.assert_array_equal(self._linear_lr_predict, xpred)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(None)
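The scikit-learn and pyfunc tests above rely on fixtures (self._linear_lr, self._X, self._linear_lr_predict) that are set up elsewhere. A hedged sketch of what that setup might look like, assuming sklearn.datasets and sklearn.linear_model.LogisticRegression are imported at module level (the estimator choice is illustrative, not from the source):

    def setUp(self):
        # Hypothetical fixture: fit a simple classifier on two iris features
        # and cache its training-set predictions for later comparison.
        iris = datasets.load_iris()
        self._X = iris.data[:, :2]
        self._y = iris.target
        self._linear_lr = LogisticRegression()
        self._linear_lr.fit(self._X, self._y)
        self._linear_lr_predict = self._linear_lr.predict(self._X)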
Example #10
def test_gbt():
    old_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            artifacts = tmp.path("artifacts")
            os.mkdir(diamonds)
            os.mkdir(artifacts)
            tracking.set_tracking_uri(artifacts)
            # Download the diamonds dataset via mlflow run
            run(".", entry_point="main", version=None,
                parameters={"dest-dir": diamonds}, experiment_id=0,
                mode="local", cluster_spec=None, git_username=None, git_password=None,
                use_conda=True, storage_dir=None)

            initial = os.path.join(artifacts, "0")
            dir_list = os.listdir(initial)

            # Run the main gbt app via mlflow
            run("apps/gbt-regression", entry_point="main", version=None,
                parameters={"train": os.path.join(diamonds, "train_diamonds.parquet"),
                            "test": os.path.join(diamonds, "test_diamonds.parquet"),
                            "n-trees": 10,
                            "m-depth": 3,
                            "learning-rate": .1,
                            "loss": "rmse",
                            "label-col": "price"},
                experiment_id=0, mode="local",
                cluster_spec=None, git_username=None, git_password=None, use_conda=True,
                storage_dir=None)

            # Identifying the new run's folder
            main = None
            for item in os.listdir(initial):
                if item not in dir_list:
                    main = item

            pyfunc = load_pyfunc(os.path.join(initial, main, "artifacts/model/model.pkl"))
            df = pandas.read_parquet(os.path.join(diamonds, "test_diamonds.parquet"))

            # Removing the price column from the DataFrame so we can use the features to predict
            df = df.drop(columns="price")

            # Predicting from the saved pyfunc
            predict = pyfunc.predict(df)

            # Make sure the data is of the right type
            assert isinstance(predict[0], numpy.float32)
        finally:
            tracking.set_tracking_uri(old_uri)
Example #11
def test_start_and_end_run():
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        # Use the start_run() and end_run() APIs without a `with` block, verify they work.
        active_run = start_run()
        mlflow.log_metric("name_1", 25)
        end_run()
        finished_run = tracking.MlflowClient().get_run(active_run.info.run_uuid)
        # Validate metrics
        assert len(finished_run.data.metrics) == 1
        expected_pairs = {"name_1": 25}
        for metric in finished_run.data.metrics:
            assert expected_pairs[metric.key] == metric.value
    finally:
        tracking.set_tracking_uri(None)
Example #12
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data  # use all four iris features
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names,
                                outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Print the coefficients and intercept for multinomial logistic regression
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid
            x = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = x.predict(pandas_df)
            assert preds1 == preds2
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            preds_df_1 = reloaded_model.transform(spark_df)
            preds3 = [
                x.prediction
                for x in preds_df_1.select("prediction").collect()
            ]
            assert preds1 == preds3
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
Example #13
    def test_model_log(self):
        old_uri = tracking.get_tracking_uri()
        # should_start_run tests whether or not calling log_model() automatically starts a run.
        for should_start_run in [False, True]:
            with TempDir(chdr=True, remove_on_exit=True) as tmp:
                try:
                    tracking.set_tracking_uri("test")
                    if should_start_run:
                        tracking.start_run()
                    sklearn.log_model(sk_model=self._linear_lr, artifact_path="linear")
                    x = sklearn.load_model("linear", run_id=tracking.active_run().info.run_uuid)
                    xpred = x.predict(self._X)
                    np.testing.assert_array_equal(self._linear_lr_predict, xpred)
                finally:
                    tracking.end_run()
                    tracking.set_tracking_uri(old_uri)
Example #14
def test_create_experiment():
    with pytest.raises(TypeError):
        mlflow.create_experiment()

    with pytest.raises(Exception):
        mlflow.create_experiment(None)

    with pytest.raises(Exception):
        mlflow.create_experiment("")

    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        exp_id = mlflow.create_experiment(
            "Some random experiment name %d" % random.randint(1, 1e6))
        assert exp_id is not None
    finally:
        tracking.set_tracking_uri(None)
Example #15
    def test_log_saved_model(self):
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            # Setting the logging such that it is in the temp folder and deleted after the test.
            old_tracking_dir = tracking.get_tracking_uri()
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                # Creating dict of features names (str) to placeholders (tensors)
                feature_spec = {}
                for name in self._feature_names:
                    feature_spec[name] = tf.placeholder("float",
                                                        name=name,
                                                        shape=[150])
                # Creating receiver function for model saving.
                receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
                    feature_spec)
                saved_model_path = tmp.path("model")
                os.makedirs(saved_model_path)
                os.makedirs(tmp.path("hello"))
                # Saving Tensorflow model.
                saved_model_path = self._dnn.export_savedmodel(
                    saved_model_path, receiver_fn).decode("utf-8")
                # Logging the Tensorflow model just saved.
                tensorflow.log_saved_model(saved_model_dir=saved_model_path,
                                           signature_def_key="predict",
                                           artifact_path=tmp.path("hello"))
                # Loading the saved Tensorflow model as a pyfunc.
                x = pyfunc.load_pyfunc(saved_model_path)
                # Predicting on the iris dataset using the pyfunc.
                xpred = x.predict(
                    pandas.DataFrame(data=self._X,
                                     columns=self._feature_names))
                saved = []
                for s in self._dnn_predict:
                    saved.append(s['predictions'])
                loaded = []
                for index, rows in xpred.iterrows():
                    loaded.append(rows)
                # Asserting that the loaded model predictions are as expected.
                np.testing.assert_array_equal(saved, loaded)
            finally:
                # Restoring the old logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_dir)
Example #16
    def test_model_log(self):
        old_uri = tracking.get_tracking_uri()
        # should_start_run tests whether or not calling log_model() automatically starts a run.
        for should_start_run in [False, True]:
            with TempDir(chdr=True, remove_on_exit=True) as tmp:
                try:
                    tracking.set_tracking_uri("test")
                    if should_start_run:
                        tracking.start_run()
                    mlflow.h2o.log_model(self.gbm, artifact_path="gbm")

                    # Load model
                    gbm_loaded = mlflow.h2o.load_model("gbm",
                                                       run_id=tracking.active_run().info.run_uuid)
                    assert all(gbm_loaded.predict(self.test).as_data_frame() == self.predicted)
                finally:
                    tracking.end_run()
                    tracking.set_tracking_uri(old_uri)
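The H2O test above depends on self.gbm, self.test and self.predicted fixtures that are not shown. A hedged sketch of how they might be created, assuming h2o, pandas (as pd), sklearn.datasets and H2OGradientBoostingEstimator are imported at module level (dataset, column names and hyperparameters are illustrative, not from the source):

    def setUp(self):
        # Hypothetical fixture: train a small GBM on iris and cache the
        # hold-out frame plus its predictions for later comparison.
        h2o.init()
        iris = datasets.load_iris()
        pdf = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])
        pdf["species"] = iris.target
        frame = h2o.H2OFrame(pdf)
        frame["species"] = frame["species"].asfactor()
        train, self.test = frame.split_frame(ratios=[0.7])
        self.gbm = H2OGradientBoostingEstimator(ntrees=10, max_depth=4)
        self.gbm.train(x=["sl", "sw", "pl", "pw"], y="species", training_frame=train)
        self.predicted = self.gbm.predict(self.test).as_data_frame()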
Example #17
def test_log_param():
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        active_run = start_run()
        run_uuid = active_run.info.run_uuid
        with active_run:
            mlflow.log_param("name_1", "a")
            mlflow.log_param("name_2", "b")
            mlflow.log_param("name_1", "c")
            mlflow.log_param("nested/nested/name", 5)
        finished_run = tracking.MlflowClient().get_run(run_uuid)
        # Validate params
        assert len(finished_run.data.params) == 3
        expected_pairs = {"name_1": "c", "name_2": "b", "nested/nested/name": "5"}
        for param in finished_run.data.params:
            assert expected_pairs[param.key] == param.value
    finally:
        tracking.set_tracking_uri(None)
Example #18
def test_log_metric():
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        active_run = start_run()
        run_uuid = active_run.info.run_uuid
        with active_run:
            mlflow.log_metric("name_1", 25)
            mlflow.log_metric("name_2", -3)
            mlflow.log_metric("name_1", 30)
            mlflow.log_metric("nested/nested/name", 40)
        finished_run = tracking.MlflowClient().get_run(run_uuid)
        # Validate metrics
        assert len(finished_run.data.metrics) == 3
        expected_pairs = {"name_1": 30, "name_2": -3, "nested/nested/name": 40}
        for metric in finished_run.data.metrics:
            assert expected_pairs[metric.key] == metric.value
    finally:
        tracking.set_tracking_uri(None)
Example #19
def test_log_model(model, data, predicted):

    old_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri(tmp.path("test"))
                if should_start_run:
                    mlflow.start_run()

                mlflow.pytorch.log_model(model, artifact_path="pytorch")

                # Load model
                run_id = mlflow.active_run().info.run_uuid
                model_loaded = mlflow.pytorch.load_model("pytorch", run_id=run_id)

                test_predictions = _predict(model_loaded, data)
                assert np.all(test_predictions == predicted)
            finally:
                mlflow.end_run()
                tracking.set_tracking_uri(old_uri)
Example #20
def test_log_artifact():
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        artifact_src_dir = tempfile.mkdtemp()
        # Create artifacts
        _, path0 = tempfile.mkstemp(dir=artifact_src_dir)
        _, path1 = tempfile.mkstemp(dir=artifact_src_dir)
        for i, path in enumerate([path0, path1]):
            with open(path, "w") as handle:
                handle.write("%s" % str(i))
        # Log an artifact, verify it exists in the directory returned by get_artifact_uri
        # after the run finishes
        artifact_parent_dirs = ["some_parent_dir", None]
        for parent_dir in artifact_parent_dirs:
            with start_run():
                run_artifact_dir = mlflow.get_artifact_uri()
                mlflow.log_artifact(path0, parent_dir)
            expected_dir = os.path.join(run_artifact_dir, parent_dir) \
                if parent_dir is not None else run_artifact_dir
            assert os.listdir(expected_dir) == [os.path.basename(path0)]
            logged_artifact_path = os.path.join(expected_dir, os.path.basename(path0))
            assert filecmp.cmp(logged_artifact_path, path0, shallow=False)
        # Log multiple artifacts, verify they exist in the directory returned by get_artifact_uri
        for parent_dir in artifact_parent_dirs:
            with start_run():
                run_artifact_dir = mlflow.get_artifact_uri()
                mlflow.log_artifacts(artifact_src_dir, parent_dir)
            # Check that the logged artifacts match
            expected_artifact_output_dir = os.path.join(run_artifact_dir, parent_dir) \
                if parent_dir is not None else run_artifact_dir
            dir_comparison = filecmp.dircmp(artifact_src_dir,
                                            expected_artifact_output_dir)
            assert len(dir_comparison.left_only) == 0
            assert len(dir_comparison.right_only) == 0
            assert len(dir_comparison.diff_files) == 0
            assert len(dir_comparison.funny_files) == 0
    finally:
        tracking.set_tracking_uri(None)
Example #21
def test_start_run_context_manager():
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        first_run = start_run()
        first_uuid = first_run.info.run_uuid
        with first_run:
            # Check that start_run() causes the run information to be persisted in the store
            persisted_run = tracking.MlflowClient().get_run(first_uuid)
            assert persisted_run is not None
            assert persisted_run.info == first_run.info
        finished_run = tracking.MlflowClient().get_run(first_uuid)
        assert finished_run.info.status == RunStatus.FINISHED
        # Launch a separate run that fails, verify the run status is FAILED and the run UUID is
        # different
        second_run = start_run()
        assert second_run.info.run_uuid != first_uuid
        with pytest.raises(Exception):
            with second_run:
                raise Exception("Failing run!")
        finished_run2 = tracking.MlflowClient().get_run(second_run.info.run_uuid)
        assert finished_run2.info.status == RunStatus.FAILED
    finally:
        tracking.set_tracking_uri(None)
Example #22
def test_model_log(model, data, predicted):
    x, y = data
    old_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri("test")
                if should_start_run:
                    tracking.start_run()
                mlflow.keras.log_model(model, artifact_path="keras_model")

                # Load model
                model_loaded = mlflow.keras.load_model(
                    "keras_model", run_id=tracking.active_run().info.run_uuid)
                assert all(model_loaded.predict(x) == predicted)

                # Loading pyfunc model
                pyfunc_loaded = mlflow.pyfunc.load_pyfunc(
                    "keras_model", run_id=tracking.active_run().info.run_uuid)
                assert all(pyfunc_loaded.predict(x).values == predicted)
            finally:
                tracking.end_run()
    tracking.set_tracking_uri(old_uri)
Example #23
    def setup_mlflow_tracking(self, URI, experiment_name, run_name):

        # select URI for server tracking
        set_tracking_uri(uri=URI)
        if is_tracking_uri_set():
            logging.debug('MLFlow URI: ' + str(get_tracking_uri()))

        # CRUD interface
        self.client = MlflowClient(tracking_uri=get_tracking_uri())

        # Experiment setup
        if self.client.get_experiment_by_name(name=experiment_name) is None:
            exp_id = self.client.create_experiment(name=experiment_name)
        else:
            exp = self.client.get_experiment_by_name(name=experiment_name)
            exp_id = exp.experiment_id

        # Run setup
        mlflow.start_run(experiment_id=exp_id, run_name=run_name)
        self.run_id = mlflow.active_run().info.run_id
        data = self.client.get_run(mlflow.active_run().info.run_id).data
        logging.info('MLFlow tracking started - Experiment: ' +
                     str(experiment_name) + " - Run: " +
                     str(data.tags["mlflow.runName"]))
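A hedged usage sketch for the helper above, assuming it belongs to some experiment-runner class (the class name, URI, and logged values are illustrative, not from the source):

# Hypothetical caller: point the helper at a local tracking server,
# start a named run, then log through the fluent API as usual.
runner = ExperimentRunner()  # any class defining setup_mlflow_tracking()
runner.setup_mlflow_tracking(URI="http://localhost:5000",
                             experiment_name="demo_experiment",
                             run_name="baseline")
mlflow.log_param("lr", 0.01)
mlflow.log_metric("loss", 0.42)
mlflow.end_run()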
Example #24
    def test_categorical_columns(self):
        """
        This tests logging capabilities on datasets with categorical columns.
        See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/get_started/\
        regression/imports85.py
        for reference code.
        """
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            path = os.path.abspath("tests/data/uci-autos-imports-85.data")
            # Order is important for the csv-readers, so we use an OrderedDict here.
            defaults = collections.OrderedDict([("body-style", [""]),
                                                ("curb-weight", [0.0]),
                                                ("highway-mpg", [0.0]),
                                                ("price", [0.0])])

            types = collections.OrderedDict(
                (key, type(value[0])) for key, value in defaults.items())
            df = pandas.read_csv(path,
                                 names=types.keys(),
                                 dtype=types,
                                 na_values="?")
            df = df.dropna()

            # Extract the label from the features dataframe.
            y_train = df.pop("price")

            # Creating the input training function required.
            trainingFeatures = {}

            for i in df:
                trainingFeatures[i] = df[i].values

            input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                             y_train.values,
                                                             shuffle=False,
                                                             batch_size=1)

            # Creating the feature columns required for the DNNRegressor.
            body_style_vocab = [
                "hardtop", "wagon", "sedan", "hatchback", "convertible"
            ]
            body_style = tf.feature_column.categorical_column_with_vocabulary_list(
                key="body-style", vocabulary_list=body_style_vocab)
            feature_columns = [
                tf.feature_column.numeric_column(key="curb-weight"),
                tf.feature_column.numeric_column(key="highway-mpg"),
                # Since this is a DNN model, convert categorical columns from sparse
                # to dense.
                # Wrap them in an `indicator_column` to create a
                # one-hot vector from the input.
                tf.feature_column.indicator_column(body_style)
            ]

            # Build a DNNRegressor, with 2x20-unit hidden layers, with the feature columns
            # defined above as input.
            estimator = tf.estimator.DNNRegressor(
                hidden_units=[20, 20], feature_columns=feature_columns)

            # Training the estimator.
            estimator.train(input_fn=input_train, steps=10)
            # Saving the estimator's prediction on the training data; assume the DNNRegressor
            # produces a single output column named 'predictions'
            pred_col = "predictions"
            estimator_preds = [
                s[pred_col] for s in estimator.predict(input_train)
            ]
            estimator_preds_df = pd.DataFrame({pred_col: estimator_preds})
            # Setting the logging such that it is in the temp folder and deleted after the test.
            old_tracking_dir = tracking.get_tracking_uri()
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                # Creating dict of features names (str) to placeholders (tensors)
                feature_spec = {}
                feature_spec["body-style"] = tf.placeholder("string",
                                                            name="body-style",
                                                            shape=[None])
                feature_spec["curb-weight"] = tf.placeholder(
                    "float", name="curb-weight", shape=[None])
                feature_spec["highway-mpg"] = tf.placeholder(
                    "float", name="highway-mpg", shape=[None])

                pyfunc_preds_df = self.helper(feature_spec, tmp, estimator, df)
                # Asserting that the loaded model predictions are as expected. Allow for some
                # imprecision as this is expected with TensorFlow.
                pandas.testing.assert_frame_equal(pyfunc_preds_df,
                                                  estimator_preds_df,
                                                  check_less_precise=6)
            finally:
                # Restoring the old logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_dir)
Example #25
def test_linear():
    old_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            root_tracking_dir = tmp.path("root_tracking_dir")
            os.mkdir(diamonds)
            os.mkdir(root_tracking_dir)
            tracking.set_tracking_uri(root_tracking_dir)
            # Download the diamonds dataset via mlflow run
            mlflow.set_experiment("test-experiment")
            run(".",
                entry_point="main",
                version=None,
                parameters={"dest-dir": diamonds},
                mode="local",
                cluster_spec=None,
                git_username=None,
                git_password=None,
                use_conda=True,
                storage_dir=None)

            # Run the main linear app via mlflow
            submitted_run = run("apps/linear-regression",
                                entry_point="main",
                                version=None,
                                parameters={
                                    "train":
                                    os.path.join(diamonds,
                                                 "train_diamonds.parquet"),
                                    "test":
                                    os.path.join(diamonds,
                                                 "test_diamonds.parquet"),
                                    "alpha":
                                    .001,
                                    "l1-ratio":
                                    .5,
                                    "label-col":
                                    "price"
                                },
                                mode="local",
                                cluster_spec=None,
                                git_username=None,
                                git_password=None,
                                use_conda=True,
                                storage_dir=None)

            pyfunc = load_pyfunc(path="model", run_id=submitted_run.run_id)

            df = pandas.read_parquet(
                os.path.join(diamonds, "test_diamonds.parquet"))

            # Removing the price column from the DataFrame so we can use the features to predict
            df = df.drop(columns="price")

            # Predicting from the saved pyfunc
            predict = pyfunc.predict(df)

            # Make sure the data is of the right type
            assert isinstance(predict[0], numpy.float64)
        finally:
            tracking.set_tracking_uri(old_uri)
Example #26
# %%
start = datetime.now()

prepared_data = prepared_data.spark.persist()
##### 6th change: to use scikit-learn with Koalas, we have to go through MLflow
##### Training will still happen on pandas DataFrames; only prediction can be done with Koalas
prepared_data = prepared_data.to_pandas()

#### So we set up the environment
from mlflow.tracking import MlflowClient, set_tracking_uri
import mlflow.sklearn

from tempfile import mkdtemp
d = mkdtemp("koalas_mlflow")
set_tracking_uri("file:%s" % d)
client = MlflowClient()
exp = mlflow.create_experiment("my_experiment")
mlflow.set_experiment("my_experiment")

# Split Train/Test
from sklearn.model_selection import train_test_split
X = prepared_data.loc[:, prepared_data.columns != 'new_confirmed']
y = prepared_data['new_confirmed'].ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Scale the values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
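The snippet stops after fitting the scaler. A hedged sketch of how the training and logging might continue (the estimator and metric are illustrative, not from the source; logging the fitted model is what later lets Koalas load it back for distributed prediction, e.g. via databricks.koalas.mlflow.load_model):

from sklearn.linear_model import LinearRegression

with mlflow.start_run():
    model = LinearRegression()
    model.fit(scaler.transform(X_train), y_train)
    mlflow.log_metric("r2_test", model.score(scaler.transform(X_test), y_test))
    # Only training is restricted to pandas here; the logged model can
    # afterwards be reloaded for prediction on Koalas DataFrames.
    mlflow.sklearn.log_model(model, "model")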
Example #27
    def test_categorical_columns(self):
        """
        This tests logging capabilities on datasets with categorical columns.
        See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/get_started/regression/imports85.py
        for reference code.
        """
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            # Downloading the data into a pandas DataFrame.
            URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
            path = tf.contrib.keras.utils.get_file(URL.split("/")[-1], URL)
            # Order is important for the csv-readers, so we use an OrderedDict here.
            defaults = collections.OrderedDict([("body-style", [""]),
                                                ("curb-weight", [0.0]),
                                                ("highway-mpg", [0.0]),
                                                ("price", [0.0])])

            types = collections.OrderedDict(
                (key, type(value[0])) for key, value in defaults.items())
            df = pandas.read_csv(path,
                                 names=types.keys(),
                                 dtype=types,
                                 na_values="?")
            df = df.dropna()

            # Extract the label from the features dataframe.
            y_train = df.pop("price")

            # Creating the input training function required.
            trainingFeatures = {}

            for i in df:
                trainingFeatures[i] = df[i].values

            input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                             y_train.values,
                                                             shuffle=False,
                                                             batch_size=1)

            # Creating the feature columns required for the DNNRegressor.
            body_style_vocab = [
                "hardtop", "wagon", "sedan", "hatchback", "convertible"
            ]
            body_style = tf.feature_column.categorical_column_with_vocabulary_list(
                key="body-style", vocabulary_list=body_style_vocab)
            feature_columns = [
                tf.feature_column.numeric_column(key="curb-weight"),
                tf.feature_column.numeric_column(key="highway-mpg"),
                # Since this is a DNN model, convert categorical columns from sparse
                # to dense.
                # Wrap them in an `indicator_column` to create a
                # one-hot vector from the input.
                tf.feature_column.indicator_column(body_style),
            ]

            # Build a DNNRegressor, with 2x20-unit hidden layers, with the feature columns
            # defined above as input.
            estimator = tf.estimator.DNNRegressor(
                hidden_units=[20, 20], feature_columns=feature_columns)

            # Training the estimator.
            estimator.train(input_fn=input_train, steps=100)
            # Saving the estimator's prediction on the training data.
            estimator_preds = estimator.predict(input_train)
            # Setting the logging such that it is in the temp folder and deleted after the test.
            old_tracking_dir = tracking.get_tracking_uri()
            tracking_dir = os.path.abspath(tmp.path("mlruns"))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            tracking.start_run()
            try:
                # Creating dict of features names (str) to placeholders (tensors)
                feature_spec = {}
                feature_spec["body-style"] = tf.placeholder("string",
                                                            name="body-style",
                                                            shape=[None])
                feature_spec["curb-weight"] = tf.placeholder(
                    "float", name="curb-weight", shape=[None])
                feature_spec["highway-mpg"] = tf.placeholder(
                    "float", name="highway-mpg", shape=[None])

                saved = [s['predictions'] for s in estimator_preds]

                results = self.helper(feature_spec, tmp, estimator, df)

                # Asserting that the loaded model predictions are as expected.
                # TensorFlow is known to have precision errors, hence the almost_equal.
                np.testing.assert_array_almost_equal(saved, results, decimal=2)
            finally:
                # Restoring the old logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_dir)
Example #28
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data,
                             columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Print the coefficients and intercept for multinomial logistic regression
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =",
                  dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                tracking.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    tracking.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                if dfs_tmp_dir:
                    sparkm.log_model(artifact_path=artifact_path,
                                     spark_model=model,
                                     dfs_tmpdir=dfs_tmp_dir)
                else:
                    sparkm.log_model(artifact_path=artifact_path,
                                     spark_model=model)
                run_id = tracking.active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path,
                                                   run_id=run_id)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [
                    x.prediction
                    for x in preds_df_1.select("prediction").collect()
                ]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # make sure we did not leave any temp files behind
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert not os.listdir(x)
                shutil.rmtree(x)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
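The score_model_as_udf helper used above is not shown. A hedged reconstruction, assuming the run_id-style pyfunc.spark_udf API that matches the rest of this example:

def score_model_as_udf(model_path, run_id, pandas_df):
    # Hypothetical helper: expose the logged model as a Spark UDF, apply it
    # to the feature columns, and collect the predictions back into Python.
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    feature_cols = [c for c in pandas_df.columns if c != "label"]
    model_udf = pyfunc.spark_udf(spark, model_path, run_id=run_id)
    scored = spark_df.withColumn("prediction", model_udf(*feature_cols))
    return [row.prediction for row in scored.select("prediction").collect()]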