def _log_pretraining_metadata(estimator, *args, **kwargs): # pylint: disable=unused-argument """ Records metadata (e.g., params and tags) for a scikit-learn estimator prior to training. This is intended to be invoked within a patched scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active MLflow run that can be referenced via the fluent Tracking API. :param estimator: The scikit-learn estimator for which to log metadata. :param args: The arguments passed to the scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...). :param kwargs: The keyword arguments passed to the scikit-learn training routine. """ # Deep parameter logging includes parameters from children of a given # estimator. For some meta estimators (e.g., pipelines), recording # these parameters is desirable. For parameter search estimators, # however, child estimators act as seeds for the parameter search # process; accordingly, we avoid logging initial, untuned parameters # for these seed estimators. should_log_params_deeply = not _is_parameter_search_estimator( estimator) # Chunk model parameters to avoid hitting the log_batch API limit for chunk in _chunk_dict( estimator.get_params(deep=should_log_params_deeply), chunk_size=MAX_PARAMS_TAGS_PER_BATCH, ): truncated = _truncate_dict(chunk, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH) try_mlflow_log(mlflow.log_params, truncated) try_mlflow_log(mlflow.set_tags, _get_estimator_info_tags(estimator))
def _log_pretraining_metadata(autologging_client, estimator, *args, **kwargs): # pylint: disable=unused-argument """ Records metadata (e.g., params and tags) for a scikit-learn estimator prior to training. This is intended to be invoked within a patched scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active MLflow run that can be referenced via the fluent Tracking API. :param autologging_client: An instance of `MlflowAutologgingQueueingClient` used for efficiently logging run data to MLflow Tracking. :param estimator: The scikit-learn estimator for which to log metadata. :param args: The arguments passed to the scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...). :param kwargs: The keyword arguments passed to the scikit-learn training routine. """ # Deep parameter logging includes parameters from children of a given # estimator. For some meta estimators (e.g., pipelines), recording # these parameters is desirable. For parameter search estimators, # however, child estimators act as seeds for the parameter search # process; accordingly, we avoid logging initial, untuned parameters # for these seed estimators. should_log_params_deeply = not _is_parameter_search_estimator( estimator) run_id = mlflow.active_run().info.run_id autologging_client.log_params( run_id=mlflow.active_run().info.run_id, params=estimator.get_params(deep=should_log_params_deeply), ) autologging_client.set_tags( run_id=run_id, tags=_get_estimator_info_tags(estimator), )
def _log_posttraining_metadata(estimator, *args, **kwargs): """ Records metadata for a scikit-learn estimator after training has completed. This is intended to be invoked within a patched scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active MLflow run that can be referenced via the fluent Tracking API. :param estimator: The scikit-learn estimator for which to log metadata. :param args: The arguments passed to the scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...). :param kwargs: The keyword arguments passed to the scikit-learn training routine. """ if hasattr(estimator, "score"): try: score_args = _get_args_for_score(estimator.score, estimator.fit, args, kwargs) training_score = estimator.score(*score_args) except Exception as e: msg = ( estimator.score.__qualname__ + " failed. The 'training_score' metric will not be recorded. Scoring error: " + str(e)) _logger.warning(msg) else: try_mlflow_log(mlflow.log_metric, "training_score", training_score) # log common metrics and artifacts for estimators (classifier, regressor) _log_specialized_estimator_content(estimator, mlflow.active_run().info.run_id, args, kwargs) def get_input_example(): # Fetch an input example using the first several rows of the array-like # training data supplied to the training routine (e.g., `fit()`) fit_arg_names = _get_arg_names(estimator.fit) X_var_name, y_var_name = fit_arg_names[:2] input_example = _get_Xy(args, kwargs, X_var_name, y_var_name)[0][:INPUT_EXAMPLE_SAMPLE_ROWS] return input_example def infer_model_signature(input_example): if not hasattr(estimator, "predict"): raise Exception( "the trained model does not specify a `predict` function, " + "which is required in order to infer the signature") return infer_signature(input_example, estimator.predict(input_example)) if log_models: # Will only resolve `input_example` and `signature` if `log_models` is `True`. input_example, signature = resolve_input_example_and_signature( get_input_example, infer_model_signature, log_input_examples, log_model_signatures, _logger, ) try_mlflow_log( log_model, estimator, artifact_path="model", signature=signature, input_example=input_example, ) if _is_parameter_search_estimator(estimator): if hasattr(estimator, "best_estimator_") and log_models: try_mlflow_log( log_model, estimator.best_estimator_, artifact_path="best_estimator", signature=signature, input_example=input_example, ) if hasattr(estimator, "best_score_"): try_mlflow_log(mlflow.log_metric, "best_cv_score", estimator.best_score_) if hasattr(estimator, "best_params_"): best_params = { "best_{param_name}".format(param_name=param_name): param_value for param_name, param_value in estimator.best_params_.items() } try_mlflow_log(mlflow.log_params, best_params) if hasattr(estimator, "cv_results_"): try: # Fetch environment-specific tags (e.g., user and source) to ensure that lineage # information is consistent with the parent run child_tags = context_registry.resolve_tags() child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME}) _create_child_runs_for_parameter_search( cv_estimator=estimator, parent_run=mlflow.active_run(), child_tags=child_tags, ) except Exception as e: msg = ( "Encountered exception during creation of child runs for parameter search." " Child runs may be missing. Exception: {}".format( str(e))) _logger.warning(msg) try: cv_results_df = pd.DataFrame.from_dict( estimator.cv_results_) _log_parameter_search_results_as_artifact( cv_results_df, mlflow.active_run().info.run_id) except Exception as e: msg = ( "Failed to log parameter search results as an artifact." " Exception: {}".format(str(e))) _logger.warning(msg)
def _log_posttraining_metadata(estimator, *args, **kwargs): """ Records metadata for a scikit-learn estimator after training has completed. This is intended to be invoked within a patched scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active MLflow run that can be referenced via the fluent Tracking API. :param estimator: The scikit-learn estimator for which to log metadata. :param args: The arguments passed to the scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...). :param kwargs: The keyword arguments passed to the scikit-learn training routine. """ if hasattr(estimator, "score"): try: score_args = _get_args_for_score(estimator.score, estimator.fit, args, kwargs) training_score = estimator.score(*score_args) except Exception as e: # pylint: disable=broad-except msg = ( estimator.score.__qualname__ + " failed. The 'training_score' metric will not be recorded. Scoring error: " + str(e)) _logger.warning(msg) else: try_mlflow_log(mlflow.log_metric, "training_score", training_score) # log common metrics and artifacts for estimators (classifier, regressor) _log_specialized_estimator_content(estimator, mlflow.active_run().info.run_id, args, kwargs) input_example = None signature = None if hasattr(estimator, "predict"): try: # Fetch an input example using the first several rows of the array-like # training data supplied to the training routine (e.g., `fit()`) SAMPLE_ROWS = 5 fit_arg_names = _get_arg_names(estimator.fit) X_var_name, y_var_name = fit_arg_names[:2] input_example = _get_Xy(args, kwargs, X_var_name, y_var_name)[0][:SAMPLE_ROWS] model_output = estimator.predict(input_example) signature = infer_signature(input_example, model_output) except Exception as e: # pylint: disable=broad-except input_example = None msg = "Failed to infer an input example and model signature: " + str( e) _logger.warning(msg) try_mlflow_log( log_model, estimator, artifact_path="model", signature=signature, input_example=input_example, ) if _is_parameter_search_estimator(estimator): if hasattr(estimator, "best_estimator_"): try_mlflow_log( log_model, estimator.best_estimator_, artifact_path="best_estimator", signature=signature, input_example=input_example, ) if hasattr(estimator, "best_params_"): best_params = { "best_{param_name}".format(param_name=param_name): param_value for param_name, param_value in estimator.best_params_.items() } try_mlflow_log(mlflow.log_params, best_params) if hasattr(estimator, "cv_results_"): try: # Fetch environment-specific tags (e.g., user and source) to ensure that lineage # information is consistent with the parent run environment_tags = context_registry.resolve_tags() _create_child_runs_for_parameter_search( cv_estimator=estimator, parent_run=mlflow.active_run(), child_tags=environment_tags, ) except Exception as e: # pylint: disable=broad-except msg = ( "Encountered exception during creation of child runs for parameter search." " Child runs may be missing. Exception: {}".format( str(e))) _logger.warning(msg) try: cv_results_df = pd.DataFrame.from_dict( estimator.cv_results_) _log_parameter_search_results_as_artifact( cv_results_df, mlflow.active_run().info.run_id) except Exception as e: # pylint: disable=broad-except msg = ( "Failed to log parameter search results as an artifact." " Exception: {}".format(str(e))) _logger.warning(msg)
def _log_posttraining_metadata(autologging_client, estimator, *args, **kwargs): """ Records metadata for a scikit-learn estimator after training has completed. This is intended to be invoked within a patched scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active MLflow run that can be referenced via the fluent Tracking API. :param autologging_client: An instance of `MlflowAutologgingQueueingClient` used for efficiently logging run data to MLflow Tracking. :param estimator: The scikit-learn estimator for which to log metadata. :param args: The arguments passed to the scikit-learn training routine (e.g., `fit()`, `fit_transform()`, ...). :param kwargs: The keyword arguments passed to the scikit-learn training routine. """ def infer_model_signature(input_example): if not hasattr(estimator, "predict"): raise Exception( "the trained model does not specify a `predict` function, " + "which is required in order to infer the signature") return infer_signature(input_example, estimator.predict(input_example)) (X, y_true, sample_weight) = _get_args_for_metrics(estimator.fit, args, kwargs) # log common metrics and artifacts for estimators (classifier, regressor) logged_metrics = _log_estimator_content( autologging_client=autologging_client, estimator=estimator, prefix=_TRAINING_PREFIX, run_id=mlflow.active_run().info.run_id, X=X, y_true=y_true, sample_weight=sample_weight, ) if y_true is None and not logged_metrics: _logger.warning( "Training metrics will not be recorded because training labels were not specified." " To automatically record training metrics, provide training labels as inputs to" " the model training function.") def get_input_example(): # Fetch an input example using the first several rows of the array-like # training data supplied to the training routine (e.g., `fit()`) input_example = X[:INPUT_EXAMPLE_SAMPLE_ROWS] return input_example if log_models: # Will only resolve `input_example` and `signature` if `log_models` is `True`. input_example, signature = resolve_input_example_and_signature( get_input_example, infer_model_signature, log_input_examples, log_model_signatures, _logger, ) log_model( estimator, artifact_path="model", signature=signature, input_example=input_example, ) if _is_parameter_search_estimator(estimator): if hasattr(estimator, "best_estimator_") and log_models: log_model( estimator.best_estimator_, artifact_path="best_estimator", signature=signature, input_example=input_example, ) if hasattr(estimator, "best_score_"): autologging_client.log_metrics( run_id=mlflow.active_run().info.run_id, metrics={"best_cv_score": estimator.best_score_}, ) if hasattr(estimator, "best_params_"): best_params = { "best_{param_name}".format(param_name=param_name): param_value for param_name, param_value in estimator.best_params_.items() } autologging_client.log_params( run_id=mlflow.active_run().info.run_id, params=best_params, ) if hasattr(estimator, "cv_results_"): try: # Fetch environment-specific tags (e.g., user and source) to ensure that lineage # information is consistent with the parent run child_tags = context_registry.resolve_tags() child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME}) _create_child_runs_for_parameter_search( autologging_client=autologging_client, cv_estimator=estimator, parent_run=mlflow.active_run(), max_tuning_runs=max_tuning_runs, child_tags=child_tags, ) except Exception as e: msg = ( "Encountered exception during creation of child runs for parameter search." " Child runs may be missing. Exception: {}".format( str(e))) _logger.warning(msg) try: cv_results_df = pd.DataFrame.from_dict( estimator.cv_results_) _log_parameter_search_results_as_artifact( cv_results_df, mlflow.active_run().info.run_id) except Exception as e: msg = ( "Failed to log parameter search results as an artifact." " Exception: {}".format(str(e))) _logger.warning(msg)