def test_universal_autolog_calls_specific_autologs_correctly(library, mlflow_module):
    integrations_with_additional_config = [xgboost, lightgbm, sklearn]
    args_to_test = {
        "log_models": False,
        "disable": True,
        "exclusive": True,
        "disable_for_unsupported_versions": True,
        "silent": True,
    }
    if library in integrations_with_additional_config:
        args_to_test.update({"log_input_examples": True, "log_model_signatures": True})

    mlflow.autolog(**args_to_test)
    mlflow.utils.import_hooks.notify_module_loaded(library)

    for arg_key, arg_value in args_to_test.items():
        assert (
            get_autologging_config(mlflow_module.autolog.integration_name, arg_key, None)
            == arg_value
        )
Example #2
def test_universal_autolog_attaches_pyspark_import_hook_if_pyspark_isnt_installed(
        config):
    with mock.patch("mlflow.spark.autolog",
                    wraps=mlflow.spark.autolog) as autolog_mock:
        autolog_mock.integration_name = "spark"
        # simulate pyspark not being installed
        autolog_mock.side_effect = ImportError(
            "no module named pyspark blahblah")

        mlflow.autolog(**config)
        autolog_mock.assert_called_once()  # it was called once and failed

        # now the user installs pyspark
        autolog_mock.side_effect = None

        mlflow.utils.import_hooks.notify_module_loaded(pyspark)

        # assert autolog is called again once pyspark is imported
        assert autolog_mock.call_count == 2
        assert autolog_mock.call_args_list[1] == mock.call(**config)
Example #3
def objective_function(params):
    with mlflow.start_run(nested=True):
        mlflow.autolog()

        est = int(params['n_estimators'])
        md = int(params['max_depth'])
        msl = int(params['min_samples_leaf'])
        mss = int(params['min_samples_split'])

        model = RandomForestRegressor(n_estimators=est,
                                      max_depth=md,
                                      min_samples_leaf=msl,
                                      min_samples_split=mss)

        model.fit(X_train, y_train)

        pred = model.predict(split_X_rem)

        # take the square root so the reported loss is actually an RMSE
        rmse = mean_squared_error(split_y_rem, pred) ** 0.5

    return {"loss": rmse, "status": STATUS_OK}
def test_autolog_globally_configured_flag_set_correctly():
    from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS

    AUTOLOGGING_INTEGRATIONS.clear()
    import sklearn  # pylint: disable=unused-import,unused-variable
    import pyspark  # pylint: disable=unused-import,unused-variable
    import pyspark.ml  # pylint: disable=unused-import,unused-variable

    integrations_to_test = ["sklearn", "spark", "pyspark.ml"]
    mlflow.autolog()
    for integration_name in integrations_to_test:
        assert AUTOLOGGING_INTEGRATIONS[integration_name][
            "globally_configured"]

    mlflow.sklearn.autolog()
    mlflow.spark.autolog()
    mlflow.pyspark.ml.autolog()

    for integration_name in integrations_to_test:
        assert "globally_configured" not in AUTOLOGGING_INTEGRATIONS[
            integration_name]
Example #5
def test_universal_autolog_calls_specific_autologs_correctly(
        library, mlflow_module):
    integrations_with_config = [xgboost, lightgbm, sklearn]

    # modify the __signature__ of the mock to contain the needed parameters
    args = ({
        "log_input_examples": bool,
        "log_model_signatures": bool,
        "log_models": bool
    } if library in integrations_with_config else {
        "log_models": bool
    })
    params = [
        inspect.Parameter(param,
                          inspect.Parameter.POSITIONAL_OR_KEYWORD,
                          annotation=type_) for param, type_ in args.items()
    ]
    with mock.patch("mlflow." + mlflow_module + ".autolog",
                    wraps=getattr(mlflow,
                                  mlflow_module).autolog) as autolog_mock:
        autolog_mock.__signature__ = inspect.Signature(params)

        autolog_mock.assert_not_called()

        # this should attach import hooks to each library
        mlflow.autolog(log_input_examples=True,
                       log_model_signatures=True,
                       log_models=True)

        autolog_mock.assert_not_called()

        mlflow.utils.import_hooks.notify_module_loaded(library)

        # after each library is imported, its corresponding autolog function should have been called
        if library in integrations_with_config:
            autolog_mock.assert_called_once_with(log_input_examples=True,
                                                 log_model_signatures=True,
                                                 log_models=True)
        else:
            autolog_mock.assert_called_once_with(log_models=True)
def test_fluent_autolog_with_tf_keras_logs_expected_content(
        random_train_data, random_one_hot_labels):
    """
    Guards against previously-exhibited issues where using the fluent `mlflow.autolog()` API with
    `tf.keras` Models did not work due to conflicting patches set by both the
    `mlflow.tensorflow.autolog()` and the `mlflow.keras.autolog()` APIs.
    """
    mlflow.autolog()

    model = create_tf_keras_model()

    with mlflow.start_run() as run:
        model.fit(random_train_data, random_one_hot_labels, epochs=10)

    client = mlflow.tracking.MlflowClient()
    run_data = client.get_run(run.info.run_id).data
    assert "accuracy" in run_data.metrics
    assert "epochs" in run_data.params

    artifacts = client.list_artifacts(run.info.run_id)
    artifacts = map(lambda x: x.path, artifacts)
    assert "model" in artifacts
Example #7
def test_universal_autolog_attaches_pyspark_import_hook_if_pyspark_isnt_installed(
):
    library = pyspark
    mlflow_module = "spark"

    with mock.patch("mlflow." + mlflow_module + ".autolog",
                    wraps=getattr(mlflow,
                                  mlflow_module).autolog) as autolog_mock:
        # simulate pyspark not being installed
        autolog_mock.side_effect = ImportError(
            "no module named pyspark blahblah")

        mlflow.autolog()
        autolog_mock.assert_called_once()  # it was called once and failed

        # now the user installs pyspark
        autolog_mock.side_effect = None

        mlflow.utils.import_hooks.notify_module_loaded(library)

        # assert autolog is called again once pyspark is imported
        assert autolog_mock.call_count == 2
def test_autolog_success_message_obeys_disabled():
    with mock.patch("mlflow.tracking.fluent._logger.info") as autolog_logger_mock:
        mlflow.autolog(disable=True)
        mlflow.utils.import_hooks.notify_module_loaded(tensorflow)
        autolog_logger_mock.assert_not_called()

        mlflow.autolog()
        mlflow.utils.import_hooks.notify_module_loaded(tensorflow)
        autolog_logger_mock.assert_called()

        autolog_logger_mock.reset_mock()

        mlflow.autolog(disable=False)
        mlflow.utils.import_hooks.notify_module_loaded(tensorflow)
        autolog_logger_mock.assert_called()
def test_universal_autolog_calls_pyspark_immediately():
    mlflow.autolog()
    assert not autologging_is_disabled(mlflow.spark.FLAVOR_NAME)

    mlflow.autolog(disable=True)
    assert autologging_is_disabled(mlflow.spark.FLAVOR_NAME)

    mlflow.autolog(disable=False)
    assert not autologging_is_disabled(mlflow.spark.FLAVOR_NAME)

    with mock.patch("mlflow.spark.autolog", wraps=mlflow.spark.autolog) as autolog_mock:
        # there should be no import hook on pyspark since autologging was already
        # applied to an active spark session
        mlflow.utils.import_hooks.notify_module_loaded(pyspark)
        autolog_mock.assert_not_called()
def test_autolog_respects_silent_mode(tmpdir):
    # Use file-based experiment storage for this test. Otherwise, concurrent experiment creation in
    # multithreaded contexts may fail for other storage backends (e.g. SQLAlchemy)
    mlflow.set_tracking_uri(str(tmpdir))
    mlflow.set_experiment("test_experiment")

    og_showwarning = warnings.showwarning
    stream = StringIO()
    sys.stderr = stream
    logger = logging.getLogger(mlflow.__name__)

    from sklearn import datasets

    iris = datasets.load_iris()

    def train_model():
        import sklearn.utils
        from sklearn import svm
        from sklearn.model_selection import GridSearchCV

        parameters = {"kernel": ("linear", "rbf"), "C": [1, 10]}
        svc = svm.SVC()
        with sklearn.utils.parallel_backend(backend="threading"):
            clf = GridSearchCV(svc, parameters)
            clf.fit(iris.data, iris.target)

        return True

    # Call general and framework-specific autologging APIs to cover a
    # larger surface area for testing purposes
    mlflow.autolog(silent=True)
    mlflow.sklearn.autolog(silent=True, log_input_examples=True)

    executions = []
    with ThreadPoolExecutor(max_workers=50) as executor:
        for _ in range(2):
            e = executor.submit(train_model)
            executions.append(e)

    assert all([e.result() is True for e in executions])
    assert not stream.getvalue()
    # Verify that `warnings.showwarning` was restored to its original value after training
    # and that MLflow event logs are enabled
    assert warnings.showwarning == og_showwarning
    logger.info("verify that event logs are enabled")
    assert "verify that event logs are enabled" in stream.getvalue()

    stream.truncate(0)

    mlflow.sklearn.autolog(silent=False, log_input_examples=True)

    executions = []
    with ThreadPoolExecutor(max_workers=50) as executor:
        for _ in range(100):
            e = executor.submit(train_model)
            executions.append(e)

    assert all([e.result() is True for e in executions])
    assert stream.getvalue()
    # Verify that `warnings.showwarning` was restored to its original value after training
    # and that MLflow event logs are enabled
    assert warnings.showwarning == og_showwarning
    logger.info("verify that event logs are enabled")
    assert "verify that event logs are enabled" in stream.getvalue()

    # TODO: Investigate why this test occasionally leaks a run, which causes the
    # `clean_up_leaked_runs` fixture in `tests/conftest.py` to fail.
    while mlflow.active_run():
        mlflow.end_run()
Example #11
# COMMAND ----------

signature

# COMMAND ----------

display(silverDepDF_1)

# COMMAND ----------

import mlflow
import mlflow.spark
from mlflow.models.signature import infer_signature
# turn on autologging
mlflow.autolog(log_input_examples=True,
               log_model_signatures=True,
               log_models=True)

experiment_name = '/Users/[email protected]/flight_delay/dscc202_group05_experiment'
mlflow.set_experiment(experiment_name)
with mlflow.start_run(experiment_id=70892):
    #with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
    pipelineModel = pipeline.fit(train)

    # Log the best model.

    mlflow.spark.log_model(
        spark_model=pipelineModel,
        artifact_path='best-model-dep-newfeatures',
        signature=signature,
        input_example=train.drop("DEP_DELAY").toPandas().head())
Example #12
# MAGIC ### Model 2: Decision Tree

# COMMAND ----------

model_2 = tree.DecisionTreeClassifier()
model_2.fit(iris.data, iris.target)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Alternative: Enable MLflow Autologging

# COMMAND ----------

import mlflow  # Import MLflow
mlflow.autolog()  # Turn on "autologging"

with mlflow.start_run(run_name="Sklearn Decision Tree"
                      ):  #Pass in run_name using "with" Python syntax
    model_3 = tree.DecisionTreeClassifier(max_depth=5).fit(
        iris.data, iris.target)  #Instantiate and fit model

# COMMAND ----------

# MAGIC %md
# MAGIC ## Make Predictions with Model
# MAGIC After registering your model to the model registry and transitioning it to Stage `Production`, load it back to make predictions

# COMMAND ----------

model_name = "mlflow_101_demo"  #Or replace with your model name
# MAGIC %md
# MAGIC ### Define training function

# COMMAND ----------

import mlflow
import mlflow.shap
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

mlflow.autolog(log_input_examples=True) # Optional
label = "Damaged"
model_name = f"turbine_failure_model_{dbName}"

# Define a method for reuse later
def fit_model(model_feature_lookups, n_iter=10):

  with mlflow.start_run():
    training_set = fs.create_training_set(outputDF,
                                          model_feature_lookups,
                                          label=label,
                                          exclude_columns=key)

    # Convert to pandas Dataframe
    training_pd = training_set.load_df().toPandas()
    X = training_pd.drop(label, axis=1)
Example #14
import mlflow

print("Hello, world!")

mlflow.autolog()

mlflow.log_metric("metric", 42.0)

print("Bye, world...")
Example #15
        '--pytorch-seed',
        type=int,
        default=0,
        help='Random seed of all Pytorch functions',
    )
    parser.add_argument(
        '--log-interval',
        type=int,
        default=100,
        help='log interval of stdout',
    )

    parser = pl.Trainer.add_argparse_args(parent_parser=parser)
    parser = LitsSegmentator.add_model_specific_args(parent_parser=parser)

    mlflow.autolog(log_input_examples=True)
    # log conda env and system information
    MLFCore.log_sys_intel_conda_env()
    # parse cli arguments
    args = parser.parse_args()
    dict_args = vars(args)

    # print("->args:") # take a look at those args
    # print(dict_args)

    # store seed and number of gpus to make linter bit less restrict in terms of naming
    general_seed = dict_args['general_seed']
    pytorch_seed = dict_args['pytorch_seed']
    num_of_gpus = dict_args['gpus']

    # set det.
Example #16
# Get Experiment Details
experiment = mlflow.get_experiment_by_name("Social NLP Experiments")
print(experiment.experiment_id)
print(experiment.artifact_location)
print(experiment.tags)
print(experiment.lifecycle_stage)

# Start a new MLflow run, setting it as the active run under which
# metrics and parameters will be logged.
with mlflow.start_run(run_name="RARENT_RUN") as parent_run:
    mlflow.log_param("parent", "yes")
    with mlflow.start_run(run_name="CHILD_RUN", nested=True) as child_run:
        mlflow.log_param("child", "yes")

mlflow.autolog()  # call before training code
with mlflow.start_run():
    for epoch in range(0, 3):
        mlflow.log_param(key="quality", value=2 * epoch, step=epoch)

# or use library specific auto-log calls
mlflow.tensorflow.autolog()  # call before training code
mlflow.keras.autolog()  # call before training code

# MLflow provides a more detailed Tracking Service API for
# managing experiments and runs directly, which is available through
# client SDK in the mlflow.tracking module

from mlflow.tracking import MlflowClient

client = MlflowClient()
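# A brief illustrative sketch of the lower-level client API mentioned above; the
# experiment name, parameter, and metric values are assumptions, but create_experiment,
# create_run, log_param, log_metric, and set_terminated are all MlflowClient methods.
experiment_id = client.create_experiment("Social NLP Experiments - client demo")
run = client.create_run(experiment_id)
client.log_param(run.info.run_id, "vectorizer", "tf-idf")
client.log_metric(run.info.run_id, "f1", 0.87)
client.set_terminated(run.info.run_id)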
Example #17
def test_autolog_obeys_disabled():
    from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS

    mlflow.autolog(disable=True)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert get_autologging_config("sklearn", "disable")

    mlflow.autolog()
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    mlflow.autolog(disable=True)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert get_autologging_config("sklearn", "disable")

    mlflow.autolog(disable=False)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert not get_autologging_config("sklearn", "disable")
    mlflow.sklearn.autolog(disable=True)
    assert get_autologging_config("sklearn", "disable")

    AUTOLOGGING_INTEGRATIONS.clear()
    mlflow.autolog(disable_for_unsupported_versions=False)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert not get_autologging_config("sklearn", "disable_for_unsupported_versions")
    mlflow.autolog(disable_for_unsupported_versions=True)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert get_autologging_config("sklearn", "disable_for_unsupported_versions")

    mlflow.sklearn.autolog(disable_for_unsupported_versions=False)
    assert not get_autologging_config("sklearn", "disable_for_unsupported_versions")
    mlflow.sklearn.autolog(disable_for_unsupported_versions=True)
    assert get_autologging_config("sklearn", "disable_for_unsupported_versions")
def mlflow_autolog():
    mlflow.autolog()
Example #19
def test_disable_for_unsupported_versions_warning_sklearn_integration():
    log_warn_fn_name = "mlflow.utils.autologging_utils._logger.warning"
    log_info_fn_name = "mlflow.tracking.fluent._logger.info"

    def is_sklearn_warning_fired(log_warn_fn_args):
        return (
            "You are using an unsupported version of" in log_warn_fn_args[0][0]
            and log_warn_fn_args[0][1] == "sklearn"
        )

    def is_sklearn_autolog_enabled_info_fired(log_info_fn_args):
        return (
            "Autologging successfully enabled for " in log_info_fn_args[0][0]
            and log_info_fn_args[0][1] == "sklearn"
        )

    with mock.patch("sklearn.__version__", "0.20.3"):
        AUTOLOGGING_INTEGRATIONS.clear()
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=True)
            assert all(not is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert any(
                is_sklearn_autolog_enabled_info_fired(args) for args in log_info_fn.call_args_list
            )
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=False)
            assert all(not is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert any(
                is_sklearn_autolog_enabled_info_fired(args) for args in log_info_fn.call_args_list
            )

        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=True)
            log_warn_fn.assert_not_called()
        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=False)
            log_warn_fn.assert_not_called()

    with mock.patch("sklearn.__version__", "0.20.2"):
        AUTOLOGGING_INTEGRATIONS.clear()
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=True)
            assert all(not is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert all(
                not is_sklearn_autolog_enabled_info_fired(args)
                for args in log_info_fn.call_args_list
            )
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=False)
            assert any(is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert any(
                is_sklearn_autolog_enabled_info_fired(args) for args in log_info_fn.call_args_list
            )
        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=True)
            log_warn_fn.assert_not_called()
        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=False)
            assert log_warn_fn.call_count == 1 and is_sklearn_warning_fired(log_warn_fn.call_args)
Example #20
def start_training():
    parser = ArgumentParser(description='XGBoost Example')
    parser.add_argument(
        '--max_epochs',
        type=int,
        default=25,
        help='Number of epochs to train',
    )
    parser.add_argument(
        '--general-seed',
        type=int,
        default=0,
        help='General Python, Python random and Numpy seed.',
    )
    parser.add_argument(
        '--xgboost-seed',
        type=int,
        default=0,
        help='XGBoost specific random seed.',
    )
    parser.add_argument(
        '--cuda',
        type=bool,
        default=True,
        help='Enable or disable CUDA support.',
    )
    parser.add_argument(
        '--single-precision-histogram',
        type=bool,
        default=True,
        help='Enable or disable single precision histogram calculation.',
    )
    parser.add_argument(
        '--training-data',
        type=str,
        help='Path to the training data',
    )
    parser.add_argument(
        '--test-data',
        type=str,
        help='Path to the test data',
    )
    avail_gpus = GPUtil.getGPUs()
    args = parser.parse_args()
    dict_args = vars(args)
    use_cuda = dict_args['cuda'] and len(avail_gpus) > 0
    if use_cuda:
        print(f'[bold blue]Using {len(avail_gpus)} GPUs!')
    else:
        print('[bold blue]No GPUs detected. Running on the CPU')

    with mlflow.start_run():
        # Enable the logging of all parameters, metrics and models to mlflow
        mlflow.autolog(log_input_examples=True)

        # Log hardware and software
        MLFCore.log_sys_intel_conda_env()

        # Fetch and prepare data
        training_data, test_data = load_train_test_data(
            dict_args['training_data'], dict_args['test_data'])

        # Enable input data logging
        # MLFCore.log_input_data('data/')

        # Set XGBoost parameters
        param = {
            'objective': 'binary:logistic',
            # argparse already parses this flag with type=bool, so use the parsed value
            # directly rather than comparing it against the string 'True'
            'single_precision_histogram': bool(dict_args['single_precision_histogram']),
            'subsample': 0.7,
            'colsample_bytree': 0.6,
            'learning_rate': 0.2,
            'max_depth': 3,
            'min_child_weight': 1,
            'eval_metric': 'logloss'
        }

        # Set random seeds
        MLFCore.set_general_random_seeds(dict_args["general_seed"])
        MLFCore.set_xgboost_random_seeds(dict_args["xgboost_seed"], param)

        # Set CPU or GPU as training device
        if use_cuda:
            param['tree_method'] = 'gpu_hist'
        else:
            param['tree_method'] = 'hist'

        # Train on the chosen device
        results = {}
        runtime = time.time()
        booster = xgb.train(param,
                            training_data.DM,
                            dict_args['max_epochs'],
                            evals=[(test_data.DM, 'test')],
                            evals_result=results)
        device = 'GPU' if use_cuda else 'CPU'
        if use_cuda:
            print(
                f'[bold green]{device} Run Time: {str(time.time() - runtime)} seconds'
            )

        # Perform some predictions on the test data, evaluate and log them
        print('[bold blue]Performing predictions on test data.')
        test_predictions = np.round(booster.predict(test_data.DM))
        calculate_log_metrics(test_data.y, test_predictions)
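
# calculate_log_metrics is referenced above but not defined in this snippet; a minimal
# sketch, assuming it simply computes classification metrics on the rounded predictions
# and logs them to the active MLflow run (the metric names here are illustrative).
from sklearn.metrics import accuracy_score, precision_score, recall_score

def calculate_log_metrics(y_true, y_pred):
    mlflow.log_metric('test accuracy', accuracy_score(y_true, y_pred))
    mlflow.log_metric('test precision', precision_score(y_true, y_pred))
    mlflow.log_metric('test recall', recall_score(y_true, y_pred))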