def test_chunk_dict():
    """Check that `_chunk_dict` yields consecutive sub-dicts of at most the requested size."""
    source = {i: i for i in range(10)}

    # A chunk size that does not evenly divide the dict leaves a shorter final chunk
    chunks_of_four = list(_chunk_dict(source, 4))
    assert chunks_of_four == [
        {0: 0, 1: 1, 2: 2, 3: 3},
        {4: 4, 5: 5, 6: 6, 7: 7},
        {8: 8, 9: 9},
    ]

    # A chunk size that evenly divides the dict produces equally sized chunks
    chunks_of_five = list(_chunk_dict(source, 5))
    assert chunks_of_five == [
        {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
        {5: 5, 6: 6, 7: 7, 8: 8, 9: 9},
    ]

    # A chunk size equal to or larger than the dict yields the whole dict as a single chunk
    for chunk_size in (len(source), len(source) + 1):
        assert list(_chunk_dict(source, chunk_size)) == [source]
def _create_child_runs_for_parameter_search(parent_estimator, parent_model, parent_run, child_tags):
    """
    Creates one child run for each entry returned by ``parent_estimator.getEstimatorParamMaps()``.

    Each child run is tagged with the parent run ID and the estimator info tags of the
    corresponding estimator copy, receives that copy's parameters and the metric values at the
    matching index, and is terminated immediately after logging.

    :param parent_estimator: The parameter search estimator; must provide
                             ``getEstimatorParamMaps()`` and ``getEstimator()``.
    :param parent_model: The fitted model, forwarded together with ``parent_estimator`` to
                         ``_get_param_search_metrics_and_best_index``.
    :param parent_run: The parent run; its experiment ID, run ID and start time are reused
                       for every child run.
    :param child_tags: A dictionary of tag keys and values to log for each child run, or a
                       falsy value for no additional tags.
    """
    from itertools import zip_longest

    client = MlflowClient()
    # Use the start time of the parent parameter search run as a rough estimate for the
    # start time of child runs, since we cannot precisely determine when each point
    # in the parameter search space was explored
    child_run_start_time = parent_run.info.start_time
    child_run_end_time = int(time.time() * 1000)

    estimator_param_maps = parent_estimator.getEstimatorParamMaps()
    tuned_estimator = parent_estimator.getEstimator()
    metrics_dict, _ = _get_param_search_metrics_and_best_index(parent_estimator, parent_model)

    for index, param_map in enumerate(estimator_param_maps):
        child_estimator = tuned_estimator.copy(param_map)

        run_tags = dict(child_tags) if child_tags else {}
        run_tags[MLFLOW_PARENT_RUN_ID] = parent_run.info.run_id
        run_tags.update(_get_estimator_info_tags(child_estimator))

        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=child_run_start_time,
            tags=run_tags,
        )
        child_run_id = child_run.info.run_id

        child_params = _get_instance_param_map(
            child_estimator, parent_estimator._autologging_metadata.uid_to_indexed_name_map
        )
        param_chunks = _chunk_dict(child_params, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)
        # Metric values at position `index` belong to this point of the search space
        child_metrics = {name: values[index] for name, values in metrics_dict.items()}

        # All metrics travel with the first batch; any remaining batches carry parameters only
        for param_chunk, metric_chunk in zip_longest(param_chunks, [child_metrics], fillvalue={}):
            # Trim any parameter keys / values and metric keys that exceed the limits
            # imposed by corresponding MLflow Tracking APIs (e.g., LogParam, LogMetric)
            trimmed_params = _truncate_dict(
                param_chunk, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH
            )
            trimmed_metrics = _truncate_dict(metric_chunk, max_key_length=MAX_ENTITY_KEY_LENGTH)

            param_entities = [
                Param(str(name), str(value)) for name, value in trimmed_params.items()
            ]
            metric_entities = [
                Metric(key=str(name), value=value, timestamp=child_run_end_time, step=0)
                for name, value in trimmed_metrics.items()
            ]
            client.log_batch(run_id=child_run_id, params=param_entities, metrics=metric_entities)

        client.set_terminated(run_id=child_run_id, end_time=child_run_end_time)
def _log_estimator_params(param_map):
    """
    Logs the given parameter dictionary to MLflow in batches.

    :param param_map: A dictionary of parameter names to values to log.
    """
    # Split the parameters into batches so that a single call stays within the
    # log_batch API limit
    param_batches = _chunk_dict(param_map, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)
    for param_batch in param_batches:
        # Trim keys / values to the configured maximum lengths before logging
        trimmed_batch = _truncate_dict(param_batch, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
        try_mlflow_log(mlflow.log_params, trimmed_batch)
def _log_pretraining_metadata(estimator, params):
    """
    Logs the parameters and info tags of an estimator before training.

    :param estimator: The estimator whose parameters and tags are logged.
    :param params: Optional parameter overrides; when this is a non-empty dict, the
                   metadata is taken from ``estimator.copy(params)`` instead of ``estimator``.
    """
    use_overrides = params and isinstance(params, dict)
    if use_overrides:
        estimator = estimator.copy(params)

    # Split the parameters into batches so that a single call stays within the
    # log_batch API limit
    instance_params = _get_instance_param_map(estimator)
    for param_batch in _chunk_dict(instance_params, chunk_size=MAX_PARAMS_TAGS_PER_BATCH):
        trimmed_batch = _truncate_dict(param_batch, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
        try_mlflow_log(mlflow.log_params, trimmed_batch)

    info_tags = _get_estimator_info_tags(estimator)
    try_mlflow_log(mlflow.set_tags, info_tags)
def _log_pretraining_metadata(estimator, params):
    """
    Logs the parameters, info tags and (for pipelines) stage hierarchy of an estimator
    before training.

    :param estimator: The estimator whose metadata is logged. When it is a ``Pipeline``, its
                      stage hierarchy is additionally logged as ``pipeline_hierarchy.json``.
    :param params: Optional parameter overrides; when this is a non-empty dict, the
                   metadata is taken from ``estimator.copy(params)`` instead of ``estimator``.
    """
    use_overrides = params and isinstance(params, dict)
    if use_overrides:
        estimator = estimator.copy(params)

    instance_params = _get_instance_param_map(estimator)

    if isinstance(estimator, Pipeline):
        stage_hierarchy = _get_pipeline_stage_hierarchy(estimator)
        try_mlflow_log(
            mlflow.log_dict,
            stage_hierarchy,
            artifact_file="pipeline_hierarchy.json",
        )

    # Split the parameters into batches so that a single call stays within the
    # log_batch API limit
    for param_batch in _chunk_dict(instance_params, chunk_size=MAX_PARAMS_TAGS_PER_BATCH):
        trimmed_batch = _truncate_dict(param_batch, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
        try_mlflow_log(mlflow.log_params, trimmed_batch)

    info_tags = _get_estimator_info_tags(estimator)
    try_mlflow_log(mlflow.set_tags, info_tags)
def _create_child_runs_for_parameter_search(cv_estimator, parent_run, child_tags=None):
    """
    Creates a collection of child runs for a parameter search training session.
    Runs are reconstructed from the `cv_results_` attribute of the specified trained
    parameter search estimator - `cv_estimator`, which provides relevant performance
    metrics for each point in the parameter search space. One child run is created
    for each point in the parameter search space. For additional information, see
    `https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html`_. # noqa: E501

    :param cv_estimator: The trained parameter search estimator for which to create
                         child runs.
    :param parent_run: A py:class:`mlflow.entities.Run` object referring to the parent
                       parameter search run for which child runs should be created.
    :param child_tags: An optional dictionary of MLflow tag keys and values to log
                       for each child run.
    """
    from itertools import zip_longest

    import pandas as pd

    client = MlflowClient()
    # Use the start time of the parent parameter search run as a rough estimate for the
    # start time of child runs, since we cannot precisely determine when each point
    # in the parameter search space was explored
    child_run_start_time = parent_run.info.start_time
    child_run_end_time = int(time.time() * 1000)

    seed_estimator = cv_estimator.estimator
    # In the unlikely case that a seed of a parameter search estimator is,
    # itself, a parameter search estimator, we should avoid logging the untuned
    # parameters of the seed's seed estimator
    should_log_params_deeply = not _is_parameter_search_estimator(seed_estimator)
    # Each row of `cv_results_` only provides parameters that vary across
    # the user-specified parameter grid. In order to log the complete set
    # of parameters for each child run, we fetch the parameters defined by
    # the seed estimator and update them with parameter subset specified
    # in the result row
    base_params = seed_estimator.get_params(deep=should_log_params_deeply)
    cv_results_df = pd.DataFrame.from_dict(cv_estimator.cv_results_)

    # Parameters values are recorded twice in the set of search `cv_results_`:
    # once within a `params` column with dictionary values and once within
    # a separate dataframe column that is created for each parameter. To prevent
    # duplication of parameters, we log the consolidated values from the parameter
    # dictionary column and filter out the other parameter-specific columns with
    # names of the form `param_{param_name}`. Additionally, `cv_results_` produces
    # metrics for each training split, which is fairly verbose; accordingly, we filter
    # out per-split metrics in favor of aggregate metrics (mean, std, etc.)
    excluded_metric_prefixes = ("param", "split")
    # A batch carries params and metrics together, so leave room for a full batch of
    # params while also respecting the per-batch metric limit
    metrics_chunk_size = min(
        MAX_ENTITIES_PER_BATCH - MAX_PARAMS_TAGS_PER_BATCH, MAX_METRICS_PER_BATCH
    )

    for _, result_row in cv_results_df.iterrows():
        tags_to_log = dict(child_tags) if child_tags else {}
        tags_to_log.update({MLFLOW_PARENT_RUN_ID: parent_run.info.run_id})
        tags_to_log.update(_get_estimator_info_tags(seed_estimator))
        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=child_run_start_time,
            tags=tags_to_log,
        )

        params_to_log = dict(base_params)
        params_to_log.update(result_row.get("params", {}))
        param_batches_to_log = _chunk_dict(params_to_log, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)

        # `Series.items()` is used instead of `Series.iteritems()`, which was
        # deprecated in pandas 1.5 and removed in pandas 2.0
        metrics_to_log = {
            key: value
            for key, value in result_row.items()
            if not key.startswith(excluded_metric_prefixes) and isinstance(value, Number)
        }
        metric_batches_to_log = _chunk_dict(metrics_to_log, chunk_size=metrics_chunk_size)

        # Pair up parameter and metric batches; whichever side runs out first is
        # padded with empty dicts so every remaining batch is still logged
        for params_batch, metrics_batch in zip_longest(
            param_batches_to_log, metric_batches_to_log, fillvalue={}
        ):
            # Trim any parameter keys / values and metric keys that exceed the limits
            # imposed by corresponding MLflow Tracking APIs (e.g., LogParam, LogMetric)
            truncated_params_batch = _truncate_dict(
                params_batch, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH
            )
            truncated_metrics_batch = _truncate_dict(
                metrics_batch, max_key_length=MAX_ENTITY_KEY_LENGTH
            )
            client.log_batch(
                run_id=child_run.info.run_id,
                params=[
                    Param(str(key), str(value)) for key, value in truncated_params_batch.items()
                ],
                metrics=[
                    Metric(key=str(key), value=value, timestamp=child_run_end_time, step=0)
                    for key, value in truncated_metrics_batch.items()
                ],
            )

        client.set_terminated(run_id=child_run.info.run_id, end_time=child_run_end_time)