def test_universal_autolog_calls_specific_autologs_correctly(library, mlflow_module):
    integrations_with_additional_config = [xgboost, lightgbm, sklearn]
    args_to_test = {
        "log_models": False,
        "disable": True,
        "exclusive": True,
        "disable_for_unsupported_versions": True,
        "silent": True,
    }
    if library in integrations_with_additional_config:
        args_to_test.update({"log_input_examples": True, "log_model_signatures": True})

    mlflow.autolog(**args_to_test)
    mlflow.utils.import_hooks.notify_module_loaded(library)

    for arg_key, arg_value in args_to_test.items():
        assert (
            get_autologging_config(mlflow_module.autolog.integration_name, arg_key, None)
            == arg_value
        )
def test_universal_autolog_attaches_pyspark_import_hook_if_pyspark_isnt_installed(config):
    with mock.patch("mlflow.spark.autolog", wraps=mlflow.spark.autolog) as autolog_mock:
        autolog_mock.integration_name = "spark"
        # simulate pyspark not being installed
        autolog_mock.side_effect = ImportError("no module named pyspark blahblah")

        mlflow.autolog(**config)
        autolog_mock.assert_called_once()  # it was called once and failed

        # now the user installs pyspark
        autolog_mock.side_effect = None

        mlflow.utils.import_hooks.notify_module_loaded(pyspark)

        # assert autolog is called again once pyspark is imported,
        # with the same configuration the user originally supplied
        assert autolog_mock.call_count == 2
        assert autolog_mock.call_args_list[1] == mock.call(**config)
def objective_function(params):
    with mlflow.start_run(nested=True):
        mlflow.autolog()
        est = int(params['n_estimators'])
        md = int(params['max_depth'])
        msl = int(params['min_samples_leaf'])
        mss = int(params['min_samples_split'])
        model = RandomForestRegressor(n_estimators=est,
                                      max_depth=md,
                                      min_samples_leaf=msl,
                                      min_samples_split=mss)
        model.fit(X_train, y_train)
        pred = model.predict(split_X_rem)
        # squared=False so the reported loss is actually RMSE rather than MSE
        rmse = mean_squared_error(split_y_rem, pred, squared=False)
        return {"loss": rmse, "status": STATUS_OK}
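# A minimal sketch (not from the original source) of how an objective like the one above
# is typically driven with hyperopt; the search-space bounds and max_evals below are
# illustrative assumptions, not values from the original notebook.
from hyperopt import Trials, fmin, hp, tpe

search_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 10),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
}

with mlflow.start_run(run_name='rf_hyperopt'):
    # each trial calls objective_function, which opens a nested autologged run
    best_params = fmin(
        fn=objective_function,
        space=search_space,
        algo=tpe.suggest,
        max_evals=20,
        trials=Trials(),
    )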
def test_autolog_globally_configured_flag_set_correctly():
    from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS

    AUTOLOGGING_INTEGRATIONS.clear()
    import sklearn  # pylint: disable=unused-import,unused-variable
    import pyspark  # pylint: disable=unused-import,unused-variable
    import pyspark.ml  # pylint: disable=unused-import,unused-variable

    integrations_to_test = ["sklearn", "spark", "pyspark.ml"]
    mlflow.autolog()
    for integration_name in integrations_to_test:
        assert AUTOLOGGING_INTEGRATIONS[integration_name]["globally_configured"]

    mlflow.sklearn.autolog()
    mlflow.spark.autolog()
    mlflow.pyspark.ml.autolog()

    for integration_name in integrations_to_test:
        assert "globally_configured" not in AUTOLOGGING_INTEGRATIONS[integration_name]
def test_universal_autolog_calls_specific_autologs_correctly(library, mlflow_module):
    integrations_with_config = [xgboost, lightgbm, sklearn]

    # modify the __signature__ of the mock to contain the needed parameters
    args = (
        {"log_input_examples": bool, "log_model_signatures": bool, "log_models": bool}
        if library in integrations_with_config
        else {"log_models": bool}
    )
    params = [
        inspect.Parameter(param, inspect.Parameter.POSITIONAL_OR_KEYWORD, annotation=type_)
        for param, type_ in args.items()
    ]

    with mock.patch(
        "mlflow." + mlflow_module + ".autolog", wraps=getattr(mlflow, mlflow_module).autolog
    ) as autolog_mock:
        autolog_mock.__signature__ = inspect.Signature(params)
        autolog_mock.assert_not_called()

        # this should attach import hooks to each library
        mlflow.autolog(log_input_examples=True, log_model_signatures=True, log_models=True)
        autolog_mock.assert_not_called()

        mlflow.utils.import_hooks.notify_module_loaded(library)

        # after each library is imported, its corresponding autolog function should have been called
        if library in integrations_with_config:
            autolog_mock.assert_called_once_with(
                log_input_examples=True, log_model_signatures=True, log_models=True
            )
        else:
            autolog_mock.assert_called_once_with(log_models=True)
def test_fluent_autolog_with_tf_keras_logs_expected_content(random_train_data, random_one_hot_labels):
    """
    Guards against previously-exhibited issues where using the fluent `mlflow.autolog()` API
    with `tf.keras` Models did not work due to conflicting patches set by both the
    `mlflow.tensorflow.autolog()` and the `mlflow.keras.autolog()` APIs.
    """
    mlflow.autolog()

    model = create_tf_keras_model()

    with mlflow.start_run() as run:
        model.fit(random_train_data, random_one_hot_labels, epochs=10)

    client = mlflow.tracking.MlflowClient()
    run_data = client.get_run(run.info.run_id).data
    assert "accuracy" in run_data.metrics
    assert "epochs" in run_data.params

    artifacts = client.list_artifacts(run.info.run_id)
    artifacts = map(lambda x: x.path, artifacts)
    assert "model" in artifacts
def test_universal_autolog_attaches_pyspark_import_hook_if_pyspark_isnt_installed():
    library = pyspark
    mlflow_module = "spark"

    with mock.patch(
        "mlflow." + mlflow_module + ".autolog", wraps=getattr(mlflow, mlflow_module).autolog
    ) as autolog_mock:
        # simulate pyspark not being installed
        autolog_mock.side_effect = ImportError("no module named pyspark blahblah")

        mlflow.autolog()
        autolog_mock.assert_called_once()  # it was called once and failed

        # now the user installs pyspark
        autolog_mock.side_effect = None

        mlflow.utils.import_hooks.notify_module_loaded(library)

        # assert autolog is called again once pyspark is imported
        assert autolog_mock.call_count == 2
def test_autolog_success_message_obeys_disabled():
    with mock.patch("mlflow.tracking.fluent._logger.info") as autolog_logger_mock:
        mlflow.autolog(disable=True)
        mlflow.utils.import_hooks.notify_module_loaded(tensorflow)
        autolog_logger_mock.assert_not_called()

        mlflow.autolog()
        mlflow.utils.import_hooks.notify_module_loaded(tensorflow)
        autolog_logger_mock.assert_called()

        autolog_logger_mock.reset_mock()

        mlflow.autolog(disable=False)
        mlflow.utils.import_hooks.notify_module_loaded(tensorflow)
        autolog_logger_mock.assert_called()
def test_universal_autolog_calls_pyspark_immediately():
    mlflow.autolog()
    assert not autologging_is_disabled(mlflow.spark.FLAVOR_NAME)

    mlflow.autolog(disable=True)
    assert autologging_is_disabled(mlflow.spark.FLAVOR_NAME)

    mlflow.autolog(disable=False)
    assert not autologging_is_disabled(mlflow.spark.FLAVOR_NAME)

    with mock.patch("mlflow.spark.autolog", wraps=mlflow.spark.autolog) as autolog_mock:
        # there should be no import hook on pyspark since autologging was already
        # applied to an active spark session
        mlflow.utils.import_hooks.notify_module_loaded(pyspark)
        autolog_mock.assert_not_called()
def test_autolog_respects_silent_mode(tmpdir):
    # Use file-based experiment storage for this test. Otherwise, concurrent experiment creation in
    # multithreaded contexts may fail for other storage backends (e.g. SQLAlchemy)
    mlflow.set_tracking_uri(str(tmpdir))
    mlflow.set_experiment("test_experiment")

    og_showwarning = warnings.showwarning
    stream = StringIO()
    sys.stderr = stream
    logger = logging.getLogger(mlflow.__name__)

    from sklearn import datasets

    iris = datasets.load_iris()

    def train_model():
        import sklearn.utils
        from sklearn import svm
        from sklearn.model_selection import GridSearchCV

        parameters = {"kernel": ("linear", "rbf"), "C": [1, 10]}
        svc = svm.SVC()
        with sklearn.utils.parallel_backend(backend="threading"):
            clf = GridSearchCV(svc, parameters)
            clf.fit(iris.data, iris.target)

        return True

    # Call general and framework-specific autologging APIs to cover a
    # larger surface area for testing purposes
    mlflow.autolog(silent=True)
    mlflow.sklearn.autolog(silent=True, log_input_examples=True)

    executions = []
    with ThreadPoolExecutor(max_workers=50) as executor:
        for _ in range(2):
            e = executor.submit(train_model)
            executions.append(e)

    assert all([e.result() is True for e in executions])
    assert not stream.getvalue()
    # Verify that `warnings.showwarning` was restored to its original value after training
    # and that MLflow event logs are enabled
    assert warnings.showwarning == og_showwarning
    logger.info("verify that event logs are enabled")
    assert "verify that event logs are enabled" in stream.getvalue()

    stream.truncate(0)

    mlflow.sklearn.autolog(silent=False, log_input_examples=True)

    executions = []
    with ThreadPoolExecutor(max_workers=50) as executor:
        for _ in range(100):
            e = executor.submit(train_model)
            executions.append(e)

    assert all([e.result() is True for e in executions])
    assert stream.getvalue()
    # Verify that `warnings.showwarning` was restored to its original value after training
    # and that MLflow event logs are enabled
    assert warnings.showwarning == og_showwarning
    logger.info("verify that event logs are enabled")
    assert "verify that event logs are enabled" in stream.getvalue()

    # TODO: Investigate why this test occasionally leaks a run, which causes the
    # `clean_up_leaked_runs` fixture in `tests/conftest.py` to fail.
    while mlflow.active_run():
        mlflow.end_run()
# COMMAND ----------

signature

# COMMAND ----------

display(silverDepDF_1)

# COMMAND ----------

import mlflow
import mlflow.spark
from mlflow.models.signature import infer_signature

# turn on autologging
mlflow.autolog(log_input_examples=True, log_model_signatures=True, log_models=True)

experiment_name = '/Users/[email protected]/flight_delay/dscc202_group05_experiment'
mlflow.set_experiment(experiment_name)

with mlflow.start_run(experiment_id=70892):
    # with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
    pipelineModel = pipeline.fit(train)

    # Log the best model.
    mlflow.spark.log_model(
        spark_model=pipelineModel,
        artifact_path='best-model-dep-newfeatures',
        signature=signature,
        input_example=train.drop("DEP_DELAY").toPandas().head())
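# COMMAND ----------

# A minimal sketch (not in the original notebook) of promoting the model logged above to
# the Model Registry; `logged_run_id` stands in for the ID of the run created above and
# the registry name "flight_delay_dep_model" is an illustrative assumption.
model_uri = f"runs:/{logged_run_id}/best-model-dep-newfeatures"
mlflow.register_model(model_uri=model_uri, name="flight_delay_dep_model")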
# MAGIC ### Model 2: Decision Tree

# COMMAND ----------

model_2 = tree.DecisionTreeClassifier()
model_2.fit(iris.data, iris.target)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Alternative: Enable MLflow Autologging

# COMMAND ----------

import mlflow  # Import MLflow

mlflow.autolog()  # Turn on "autologging"

with mlflow.start_run(run_name="Sklearn Decision Tree"):  # Pass in run_name using "with" Python syntax
    model_3 = tree.DecisionTreeClassifier(max_depth=5).fit(iris.data, iris.target)  # Instantiate and fit model

# COMMAND ----------

# MAGIC %md
# MAGIC ## Make Predictions with Model
# MAGIC After registering your model to the model registry and transitioning it to Stage `Production`, load it back to make predictions

# COMMAND ----------

model_name = "mlflow_101_demo"  # Or replace with your model name
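# COMMAND ----------

# A minimal sketch (not part of the original notebook) of loading the registered model
# back from the Model Registry for inference, assuming it was registered under
# `model_name` and transitioned to the `Production` stage as described above.
import mlflow.pyfunc

loaded_model = mlflow.pyfunc.load_model(f"models:/{model_name}/Production")
predictions = loaded_model.predict(iris.data)  # predict on the same feature matrix used for training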
# MAGIC %md
# MAGIC ### Define training function

# COMMAND ----------

import mlflow
import mlflow.shap
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

mlflow.autolog(log_input_examples=True)  # Optional

label = "Damaged"
model_name = f"turbine_failure_model_{dbName}"

# Define a method for reuse later
def fit_model(model_feature_lookups, n_iter=10):
    with mlflow.start_run():
        training_set = fs.create_training_set(outputDF,
                                              model_feature_lookups,
                                              label=label,
                                              exclude_columns=key)

        # Convert to pandas Dataframe
        training_pd = training_set.load_df().toPandas()
        X = training_pd.drop(label, axis=1)
import mlflow

print("Hello, world!")
mlflow.autolog()
mlflow.log_metric("metric", 42.0)
print("Bye, world...")
    '--pytorch-seed',
    type=int,
    default=0,
    help='Random seed of all Pytorch functions',
)
parser.add_argument(
    '--log-interval',
    type=int,
    default=100,
    help='log interval of stdout',
)
parser = pl.Trainer.add_argparse_args(parent_parser=parser)
parser = LitsSegmentator.add_model_specific_args(parent_parser=parser)

# enable autologging (1 is passed positionally as log_input_examples)
mlflow.autolog(1)

# log conda env and system information
MLFCore.log_sys_intel_conda_env()

# parse cli arguments
args = parser.parse_args()
dict_args = vars(args)
# print("->args:")  # take a look at those args
# print(dict_args)

# store seed and number of gpus to make linter bit less restrict in terms of naming
general_seed = dict_args['general_seed']
pytorch_seed = dict_args['pytorch_seed']
num_of_gpus = dict_args['gpus']

# set det.
# Get Experiment Details
experiment = mlflow.get_experiment_by_name("Social NLP Experiments")
print(experiment.experiment_id)
print(experiment.artifact_location)
print(experiment.tags)
print(experiment.lifecycle_stage)

# Start a new MLflow run, setting it as the active run under which
# metrics and parameters will be logged.
with mlflow.start_run(run_name="PARENT_RUN") as parent_run:
    mlflow.log_param("parent", "yes")
    with mlflow.start_run(run_name="CHILD_RUN", nested=True) as child_run:
        mlflow.log_param("child", "yes")

mlflow.autolog()  # call before training code
with mlflow.start_run():
    for epoch in range(0, 3):
        mlflow.log_metric(key="quality", value=2 * epoch, step=epoch)

# or use library specific auto-log calls
mlflow.tensorflow.autolog()  # call before training code
mlflow.keras.autolog()  # call before training code

# MLflow provides a more detailed Tracking Service API for
# managing experiments and runs directly, which is available through
# client SDK in the mlflow.tracking module
from mlflow.tracking import MlflowClient

client = MlflowClient()
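# A minimal sketch (assumption, not part of the original example) of using the client
# created above to read back what the runs recorded.
parent_data = client.get_run(parent_run.info.run_id).data
print(parent_data.params)  # e.g. {"parent": "yes"}
for artifact in client.list_artifacts(parent_run.info.run_id):
    print(artifact.path)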
def test_autolog_obeys_disabled():
    from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS

    mlflow.autolog(disable=True)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert get_autologging_config("sklearn", "disable")

    mlflow.autolog()
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    mlflow.autolog(disable=True)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert get_autologging_config("sklearn", "disable")

    mlflow.autolog(disable=False)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert not get_autologging_config("sklearn", "disable")
    mlflow.sklearn.autolog(disable=True)
    assert get_autologging_config("sklearn", "disable")

    AUTOLOGGING_INTEGRATIONS.clear()
    mlflow.autolog(disable_for_unsupported_versions=False)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert not get_autologging_config("sklearn", "disable_for_unsupported_versions")
    mlflow.autolog(disable_for_unsupported_versions=True)
    mlflow.utils.import_hooks.notify_module_loaded(sklearn)
    assert get_autologging_config("sklearn", "disable_for_unsupported_versions")

    mlflow.sklearn.autolog(disable_for_unsupported_versions=False)
    assert not get_autologging_config("sklearn", "disable_for_unsupported_versions")
    mlflow.sklearn.autolog(disable_for_unsupported_versions=True)
    assert get_autologging_config("sklearn", "disable_for_unsupported_versions")
def mlflow_autolog():
    mlflow.autolog()
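# The helper above looks like it is intended as a test fixture; a minimal sketch of that
# usage (the @pytest.fixture decorator and the test below are illustrative assumptions,
# not part of the original source).
import pytest


@pytest.fixture
def autolog_enabled():
    mlflow.autolog()
    yield


def test_training_is_autologged(autolog_enabled):
    ...  # train a model here; autologging records its params, metrics, and artifacts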
def test_disable_for_unsupported_versions_warning_sklearn_integration():
    log_warn_fn_name = "mlflow.utils.autologging_utils._logger.warning"
    log_info_fn_name = "mlflow.tracking.fluent._logger.info"

    def is_sklearn_warning_fired(log_warn_fn_args):
        return (
            "You are using an unsupported version of" in log_warn_fn_args[0][0]
            and log_warn_fn_args[0][1] == "sklearn"
        )

    def is_sklearn_autolog_enabled_info_fired(log_info_fn_args):
        return (
            "Autologging successfully enabled for " in log_info_fn_args[0][0]
            and log_info_fn_args[0][1] == "sklearn"
        )

    with mock.patch("sklearn.__version__", "0.20.3"):
        AUTOLOGGING_INTEGRATIONS.clear()
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=True)
            assert all(not is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert any(
                is_sklearn_autolog_enabled_info_fired(args) for args in log_info_fn.call_args_list
            )
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=False)
            assert all(not is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert any(
                is_sklearn_autolog_enabled_info_fired(args) for args in log_info_fn.call_args_list
            )
        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=True)
            log_warn_fn.assert_not_called()
        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=False)
            log_warn_fn.assert_not_called()

    with mock.patch("sklearn.__version__", "0.20.2"):
        AUTOLOGGING_INTEGRATIONS.clear()
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=True)
            assert all(not is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert all(
                not is_sklearn_autolog_enabled_info_fired(args)
                for args in log_info_fn.call_args_list
            )
        with mock.patch(log_warn_fn_name) as log_warn_fn, mock.patch(
            log_info_fn_name
        ) as log_info_fn:
            mlflow.autolog(disable_for_unsupported_versions=False)
            assert any(is_sklearn_warning_fired(args) for args in log_warn_fn.call_args_list)
            assert any(
                is_sklearn_autolog_enabled_info_fired(args) for args in log_info_fn.call_args_list
            )
        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=True)
            log_warn_fn.assert_not_called()
        with mock.patch(log_warn_fn_name) as log_warn_fn:
            mlflow.sklearn.autolog(disable_for_unsupported_versions=False)
            assert log_warn_fn.call_count == 1 and is_sklearn_warning_fired(log_warn_fn.call_args)
def start_training():
    parser = ArgumentParser(description='XGBoost Example')
    parser.add_argument(
        '--max_epochs',
        type=int,
        default=25,
        help='Number of epochs to train',
    )
    parser.add_argument(
        '--general-seed',
        type=int,
        default=0,
        help='General Python, Python random and Numpy seed.',
    )
    parser.add_argument(
        '--xgboost-seed',
        type=int,
        default=0,
        help='XGBoost specific random seed.',
    )
    parser.add_argument(
        '--cuda',
        type=bool,
        default=True,
        help='Enable or disable CUDA support.',
    )
    parser.add_argument(
        '--single-precision-histogram',
        type=bool,
        default=True,
        help='Enable or disable single precision histogram calculation.',
    )
    parser.add_argument(
        '--training-data',
        type=str,
        help='Path to the training data',
    )
    parser.add_argument(
        '--test-data',
        type=str,
        help='Path to the test data',
    )

    avail_gpus = GPUtil.getGPUs()
    args = parser.parse_args()
    dict_args = vars(args)
    use_cuda = True if dict_args['cuda'] and len(avail_gpus) > 0 else False
    if use_cuda:
        print(f'[bold blue]Using {len(avail_gpus)} GPUs!')
    else:
        print('[bold blue]No GPUs detected. Running on the CPU')

    with mlflow.start_run():
        # Enable the logging of all parameters, metrics and models to mlflow
        mlflow.autolog(1)

        # Log hardware and software
        MLFCore.log_sys_intel_conda_env()

        # Fetch and prepare data
        training_data, test_data = load_train_test_data(dict_args['training_data'],
                                                        dict_args['test_data'])

        # Enable input data logging
        # MLFCore.log_input_data('data/')

        # Set XGBoost parameters
        param = {
            'objective': 'binary:logistic',
            'single_precision_histogram': True if dict_args['single_precision_histogram'] == 'True' else False,
            'subsample': 0.7,
            'colsample_bytree': 0.6,
            'learning_rate': 0.2,
            'max_depth': 3,
            'min_child_weight': 1,
            'eval_metric': 'logloss'
        }

        # Set random seeds
        MLFCore.set_general_random_seeds(dict_args["general_seed"])
        MLFCore.set_xgboost_random_seeds(dict_args["xgboost_seed"], param)

        # Set CPU or GPU as training device
        if use_cuda:
            param['tree_method'] = 'gpu_hist'
        else:
            param['tree_method'] = 'hist'

        # Train on the chosen device
        results = {}
        runtime = time.time()
        booster = xgb.train(param,
                            training_data.DM,
                            dict_args['max_epochs'],
                            evals=[(test_data.DM, 'test')],
                            evals_result=results)
        device = 'GPU' if use_cuda else 'CPU'
        if use_cuda:
            print(f'[bold green]{device} Run Time: {str(time.time() - runtime)} seconds')

        # Perform some predictions on the test data, evaluate and log them
        print('[bold blue]Performing predictions on test data.')
        test_predictions = np.round(booster.predict(test_data.DM))
        calculate_log_metrics(test_data.y, test_predictions)
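# The script above relies on a `calculate_log_metrics` helper that is not shown; a minimal
# sketch of what such a helper could look like (the body below is an assumption, not the
# original implementation), logging evaluation metrics to the active MLflow run alongside
# what autologging already captures.
from sklearn.metrics import accuracy_score, f1_score

def calculate_log_metrics(y_true, y_pred):
    # y_pred is already rounded to 0/1 by the caller
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mlflow.log_metric('test accuracy', accuracy)
    mlflow.log_metric('test f1 score', f1)
    print(f'[bold blue]Test accuracy: {accuracy}, F1: {f1}')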