Example #1
def _create_child_runs_for_parameter_search(parent_estimator, parent_model,
                                            parent_run, child_tags):
    from itertools import zip_longest

    client = MlflowClient()
    # Use the start time of the parent parameter search run as a rough estimate for the
    # start time of child runs, since we cannot precisely determine when each point
    # in the parameter search space was explored
    child_run_start_time = parent_run.info.start_time
    child_run_end_time = int(time.time() * 1000)

    estimator_param_maps = parent_estimator.getEstimatorParamMaps()
    tuned_estimator = parent_estimator.getEstimator()

    metrics_dict, _ = _get_param_search_metrics_and_best_index(
        parent_estimator, parent_model)
    for i in range(len(estimator_param_maps)):
        child_estimator = tuned_estimator.copy(estimator_param_maps[i])
        tags_to_log = dict(child_tags) if child_tags else {}
        tags_to_log.update({MLFLOW_PARENT_RUN_ID: parent_run.info.run_id})
        tags_to_log.update(_get_estimator_info_tags(child_estimator))

        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=child_run_start_time,
            tags=tags_to_log,
        )

        params_to_log = _get_instance_param_map(
            child_estimator,
            parent_estimator._autologging_metadata.uid_to_indexed_name_map)
        param_batches_to_log = _chunk_dict(
            params_to_log, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)
        metrics_to_log = {k: v[i] for k, v in metrics_dict.items()}
        for params_batch, metrics_batch in zip_longest(param_batches_to_log,
                                                       [metrics_to_log],
                                                       fillvalue={}):
            # Trim any parameter keys / values and metric keys that exceed the limits
            # imposed by corresponding MLflow Tracking APIs (e.g., LogParam, LogMetric)
            truncated_params_batch = _truncate_dict(params_batch,
                                                    MAX_ENTITY_KEY_LENGTH,
                                                    MAX_PARAM_VAL_LENGTH)
            truncated_metrics_batch = _truncate_dict(
                metrics_batch, max_key_length=MAX_ENTITY_KEY_LENGTH)
            client.log_batch(
                run_id=child_run.info.run_id,
                params=[
                    Param(str(key), str(value))
                    for key, value in truncated_params_batch.items()
                ],
                metrics=[
                    Metric(key=str(key),
                           value=value,
                           timestamp=child_run_end_time,
                           step=0)
                    for key, value in truncated_metrics_batch.items()
                ],
            )
        client.set_terminated(run_id=child_run.info.run_id,
                              end_time=child_run_end_time)
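
A note on the loop above: the metrics dict is wrapped in a single-element list, so `zip_longest(..., fillvalue={})` pairs the first parameter batch with the metrics and every subsequent parameter batch with an empty metrics dict. A small standalone illustration with made-up values:

from itertools import zip_longest

param_batches = [{"p1": "1"}, {"p2": "2"}, {"p3": "3"}]
metric_batches = [{"avg_rmse": 0.5}]  # hypothetical metric name
for params, metrics in zip_longest(param_batches, metric_batches, fillvalue={}):
    print(params, metrics)
# {'p1': '1'} {'avg_rmse': 0.5}
# {'p2': '2'} {}
# {'p3': '3'} {}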
Example #2
    def create_run(
        self,
        experiment_id: str,
        start_time: Optional[int] = None,
        tags: Optional[Dict[str, Any]] = None,
    ) -> PendingRunId:
        """
        Enqueues a CreateRun operation with the specified attributes, returning a `PendingRunId`
        instance that can be used as input to other client logging APIs (e.g. `log_metrics`,
        `log_params`, ...).

        :return: A `PendingRunId` that can be passed as the `run_id` parameter to other client
                 logging APIs, such as `log_params` and `log_metrics`.
        """
        tags = tags or {}
        tags = _truncate_dict(tags,
                              max_key_length=MAX_ENTITY_KEY_LENGTH,
                              max_value_length=MAX_TAG_VAL_LENGTH)
        run_id = PendingRunId()
        self._get_pending_operations(run_id).enqueue(
            create_run=_PendingCreateRun(
                experiment_id=experiment_id,
                start_time=start_time,
                tags=[RunTag(key, str(value)) for key, value in tags.items()],
            ))
        return run_id
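
A minimal usage sketch, assuming the enclosing class is MLflow's `MlflowAutologgingQueueingClient` and that queued operations are sent via its `flush` method; the experiment ID and parameter values are illustrative:

client = MlflowAutologgingQueueingClient()
pending_run_id = client.create_run(
    experiment_id="0",  # illustrative experiment ID
    tags={"estimator_name": "LinearRegression"},
)
client.log_params(run_id=pending_run_id, params={"alpha": 0.01})
client.flush()  # the queued CreateRun and LogParams operations are sent here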
Example #3
 def gen_evaluator_info(self, evaluator):
     """
     Generate evaluator information, including the evaluator class name and params.
     """
     class_name = _get_fully_qualified_class_name(evaluator)
     param_map = _truncate_dict(_get_param_map(evaluator),
                                MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
     return {"evaluator_class": class_name, "params": param_map}
Example #4
def _log_estimator_params(param_map):
    # Chunk model parameters to avoid hitting the log_batch API limit
    for chunk in _chunk_dict(
            param_map,
            chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
    ):
        truncated = _truncate_dict(chunk, MAX_ENTITY_KEY_LENGTH,
                                   MAX_PARAM_VAL_LENGTH)
        try_mlflow_log(mlflow.log_params, truncated)
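
For context, a minimal sketch of what a chunking helper like `_chunk_dict` could look like; this is an illustrative assumption, not necessarily MLflow's implementation:

from itertools import islice

def _chunk_dict(d, chunk_size):
    # Yield successive sub-dicts of at most `chunk_size` items so that each
    # batched logging call stays under the Tracking API's per-request limit.
    items = iter(d.items())
    while True:
        chunk = dict(islice(items, chunk_size))
        if not chunk:
            return
        yield chunk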
Example #5
 def set_tags(self, run_id: Union[str, PendingRunId],
              tags: Dict[str, Any]) -> None:
     """
     Enqueues a collection of Tags to be logged to the run specified by `run_id`.
     """
     tags = _truncate_dict(tags,
                           max_key_length=MAX_ENTITY_KEY_LENGTH,
                           max_value_length=MAX_TAG_VAL_LENGTH)
     tags_arr = [RunTag(key, str(value)) for key, value in tags.items()]
     self._get_pending_operations(run_id).enqueue(tags=tags_arr)
Example #6
 def log_params(self, run_id: Union[str, PendingRunId],
                params: Dict[str, Any]) -> None:
     """
     Enqueues a collection of Parameters to be logged to the run specified by `run_id`.
     """
     params = _truncate_dict(params,
                             max_key_length=MAX_ENTITY_KEY_LENGTH,
                             max_value_length=MAX_PARAM_VAL_LENGTH)
     params_arr = [Param(key, str(value)) for key, value in params.items()]
     self._get_pending_operations(run_id).enqueue(params=params_arr)
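Example #7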
def test_client_truncates_metric_keys():
    client = MlflowAutologgingQueueingClient()
    metrics_to_log = {
        "a" * (MAX_ENTITY_KEY_LENGTH + 5): 1,
        "b" * (MAX_ENTITY_KEY_LENGTH + 50): 2,
    }

    with mlflow.start_run() as run:
        client.log_metrics(run_id=run.info.run_id, metrics=metrics_to_log)
        client.flush()

    run_metrics = get_run_data(run.info.run_id)[1]
    assert run_metrics == _truncate_dict(metrics_to_log, max_key_length=MAX_ENTITY_KEY_LENGTH)
Example #8
def test_client_truncates_tag_keys_and_values():
    client = MlflowAutologgingQueueingClient()
    tags_to_log = {
        "a" * (MAX_ENTITY_KEY_LENGTH + 5): "b" * (MAX_PARAM_VAL_LENGTH + 5),
        "c" * (MAX_ENTITY_KEY_LENGTH + 50): "d" * (MAX_PARAM_VAL_LENGTH + 50),
    }

    with mlflow.start_run() as run:
        client.set_tags(run_id=run.info.run_id, tags=tags_to_log)
        client.flush()

    run_tags = get_run_data(run.info.run_id)[2]
    assert run_tags == _truncate_dict(
        tags_to_log, max_key_length=MAX_ENTITY_KEY_LENGTH, max_value_length=MAX_TAG_VAL_LENGTH,
    )
Example #9
def test_client_truncates_param_keys_and_values():
    client = MlflowAutologgingQueueingClient()
    params_to_log = {
        "a" * (MAX_ENTITY_KEY_LENGTH + 5): "b" * (MAX_PARAM_VAL_LENGTH + 5),
        "a" * (MAX_ENTITY_KEY_LENGTH + 50): "b" * (MAX_PARAM_VAL_LENGTH + 50),
    }

    with mlflow.start_run() as run:
        client.log_params(run_id=run.info.run_id, params=params_to_log)
        client.flush()

    run_params = get_run_data(run.info.run_id)[0]
    assert run_params == _truncate_dict(
        params_to_log, max_key_length=MAX_ENTITY_KEY_LENGTH, max_value_length=MAX_PARAM_VAL_LENGTH,
    )
Example #10
    def _log_pretraining_metadata(estimator, params):

        if params and isinstance(params, dict):
            estimator = estimator.copy(params)

        # Chunk model parameters to avoid hitting the log_batch API limit
        for chunk in _chunk_dict(
                _get_instance_param_map(estimator),
                chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
        ):
            truncated = _truncate_dict(chunk, MAX_ENTITY_KEY_LENGTH,
                                       MAX_PARAM_VAL_LENGTH)
            try_mlflow_log(mlflow.log_params, truncated)

        try_mlflow_log(mlflow.set_tags, _get_estimator_info_tags(estimator))
Example #11
def test_truncate_dict():
    d = {"12345": "12345"}
    length = 5

    with mock.patch("mlflow.utils._logger.warning") as mock_warning:
        max_length = length - 1

        # Truncate keys
        assert _truncate_dict(d, max_key_length=max_length) == {
            "1...": "12345"
        }
        mock_warning.assert_called_once_with("Truncated the key `1...`")
        mock_warning.reset_mock()

        # Truncate values
        assert _truncate_dict(d, max_value_length=max_length) == {
            "12345": "1..."
        }
        mock_warning.assert_called_once_with(
            "Truncated the value of the key `12345`. Truncated value: `1...`")
        mock_warning.reset_mock()

        # Truncate both keys and values
        assert _truncate_dict(d,
                              max_key_length=max_length,
                              max_value_length=max_length) == {
                                  "1...": "1..."
                              }
        assert mock_warning.call_count == 2
        (args1, _), (args2, _) = mock_warning.call_args_list
        assert args1[0] == "Truncated the key `1...`"
        assert args2[
            0] == "Truncated the value of the key `1...`. Truncated value: `1...`"

    assert _truncate_dict(d, max_key_length=length,
                          max_value_length=length) == {
                              "12345": "12345"
                          }
    assert _truncate_dict(d,
                          max_key_length=length + 1,
                          max_value_length=length + 1) == {
                              "12345": "12345"
                          }

    with pytest.raises(
            ValueError,
            match=
            "Must specify at least either `max_key_length` or `max_value_length`"
    ):
        _truncate_dict(d)
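
The test pins down the helper's contract: entries are ellipsized to the given budget (the trailing `...` counts toward the length), a warning is logged per truncation, and at least one limit must be provided. A minimal sketch consistent with the test, assuming a module-level `_logger`; the real MLflow helper may differ in details:

import logging

_logger = logging.getLogger(__name__)

def _truncate_dict(d, max_key_length=None, max_value_length=None):
    if max_key_length is None and max_value_length is None:
        raise ValueError("Must specify at least either `max_key_length` or `max_value_length`")

    def _ellipsize(x, max_length):
        # Keep the first `max_length - 3` characters and append "...".
        return str(x)[: max_length - 3] + "..."

    truncated = {}
    for key, value in d.items():
        new_key, new_value = key, value
        if max_key_length is not None and len(str(key)) > max_key_length:
            new_key = _ellipsize(key, max_key_length)
            _logger.warning(f"Truncated the key `{new_key}`")
        if max_value_length is not None and len(str(value)) > max_value_length:
            new_value = _ellipsize(value, max_value_length)
            _logger.warning(
                f"Truncated the value of the key `{new_key}`. Truncated value: `{new_value}`")
        truncated[new_key] = new_value
    return truncated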
Example #12
 def log_metrics(
     self,
     run_id: Union[str, PendingRunId],
     metrics: Dict[str, float],
     step: Optional[int] = None,
 ) -> None:
     """
     Enqueues a collection of Metrics to be logged to the run specified by `run_id` at the
     step specified by `step`.
     """
     metrics = _truncate_dict(metrics, max_key_length=MAX_ENTITY_KEY_LENGTH)
     timestamp_ms = int(time.time() * 1000)
     metrics_arr = [
         Metric(key, value, timestamp_ms, step or 0)
         for key, value in metrics.items()
     ]
     self._get_pending_operations(run_id).enqueue(metrics=metrics_arr)
Example #13
    def _log_pretraining_metadata(estimator, params):

        if params and isinstance(params, dict):
            estimator = estimator.copy(params)

        param_map = _get_instance_param_map(estimator)
        if isinstance(estimator, Pipeline):
            pipeline_hierarchy = _get_pipeline_stage_hierarchy(estimator)
            try_mlflow_log(mlflow.log_dict,
                           pipeline_hierarchy,
                           artifact_file="pipeline_hierarchy.json")

        # Chunk model parameters to avoid hitting the log_batch API limit
        for chunk in _chunk_dict(
                param_map,
                chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
        ):
            truncated = _truncate_dict(chunk, MAX_ENTITY_KEY_LENGTH,
                                       MAX_PARAM_VAL_LENGTH)
            try_mlflow_log(mlflow.log_params, truncated)

        try_mlflow_log(mlflow.set_tags, _get_estimator_info_tags(estimator))
Example #14
def _create_child_runs_for_parameter_search(cv_estimator,
                                            parent_run,
                                            child_tags=None):
    """
    Creates a collection of child runs for a parameter search training session.
    Runs are reconstructed from the `cv_results_` attribute of the specified trained
    parameter search estimator - `cv_estimator`, which provides relevant performance
    metrics for each point in the parameter search space. One child run is created
    for each point in the parameter search space. For additional information, see
    `https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html`_. # noqa: E501

    :param cv_estimator: The trained parameter search estimator for which to create
                         child runs.
    :param parent_run: A :py:class:`mlflow.entities.Run` object referring to the parent
                       parameter search run for which child runs should be created.
    :param child_tags: An optional dictionary of MLflow tag keys and values to log
                       for each child run.
    """
    import pandas as pd
    from itertools import zip_longest

    client = MlflowClient()
    # Use the start time of the parent parameter search run as a rough estimate for the
    # start time of child runs, since we cannot precisely determine when each point
    # in the parameter search space was explored
    child_run_start_time = parent_run.info.start_time
    child_run_end_time = int(time.time() * 1000)

    seed_estimator = cv_estimator.estimator
    # In the unlikely case that a seed of a parameter search estimator is,
    # itself, a parameter search estimator, we should avoid logging the untuned
    # parameters of the seed's seed estimator
    should_log_params_deeply = not _is_parameter_search_estimator(
        seed_estimator)
    # Each row of `cv_results_` only provides parameters that vary across
    # the user-specified parameter grid. In order to log the complete set
    # of parameters for each child run, we fetch the parameters defined by
    # the seed estimator and update them with the parameter subset specified
    # in the result row
    base_params = seed_estimator.get_params(deep=should_log_params_deeply)

    cv_results_df = pd.DataFrame.from_dict(cv_estimator.cv_results_)
    for _, result_row in cv_results_df.iterrows():
        tags_to_log = dict(child_tags) if child_tags else {}
        tags_to_log.update({MLFLOW_PARENT_RUN_ID: parent_run.info.run_id})
        tags_to_log.update(_get_estimator_info_tags(seed_estimator))
        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=child_run_start_time,
            tags=tags_to_log,
        )

        params_to_log = dict(base_params)
        params_to_log.update(result_row.get("params", {}))
        param_batches_to_log = _chunk_dict(
            params_to_log, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)

        # Parameter values are recorded twice in the search's `cv_results_`:
        # once within a `params` column with dictionary values and once within
        # a separate dataframe column that is created for each parameter. To prevent
        # duplication of parameters, we log the consolidated values from the parameter
        # dictionary column and filter out the other parameter-specific columns with
        # names of the form `param_{param_name}`. Additionally, `cv_results_` produces
        # metrics for each training split, which is fairly verbose; accordingly, we filter
        # out per-split metrics in favor of aggregate metrics (mean, std, etc.)
        excluded_metric_prefixes = ["param", "split"]
        metric_batches_to_log = _chunk_dict(
            {
                key: value
                for key, value in result_row.items() if not any([
                    key.startswith(prefix)
                    for prefix in excluded_metric_prefixes
                ]) and isinstance(value, Number)
            },
            chunk_size=min(MAX_ENTITIES_PER_BATCH - MAX_PARAMS_TAGS_PER_BATCH,
                           MAX_METRICS_PER_BATCH),
        )

        for params_batch, metrics_batch in zip_longest(param_batches_to_log,
                                                       metric_batches_to_log,
                                                       fillvalue={}):
            # Trim any parameter keys / values and metric keys that exceed the limits
            # imposed by corresponding MLflow Tracking APIs (e.g., LogParam, LogMetric)
            truncated_params_batch = _truncate_dict(params_batch,
                                                    MAX_ENTITY_KEY_LENGTH,
                                                    MAX_PARAM_VAL_LENGTH)
            truncated_metrics_batch = _truncate_dict(
                metrics_batch, max_key_length=MAX_ENTITY_KEY_LENGTH)
            client.log_batch(
                run_id=child_run.info.run_id,
                params=[
                    Param(str(key), str(value))
                    for key, value in truncated_params_batch.items()
                ],
                metrics=[
                    Metric(key=str(key),
                           value=value,
                           timestamp=child_run_end_time,
                           step=0)
                    for key, value in truncated_metrics_batch.items()
                ],
            )

        client.set_terminated(run_id=child_run.info.run_id,
                              end_time=child_run_end_time)
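
To make the column filtering above concrete, here is a standalone illustration with a hypothetical `cv_results_` row; the `params` and `param_*` columns and per-split scores are dropped, leaving only numeric aggregate metrics:

from numbers import Number

result_row = {  # hypothetical values
    "params": {"alpha": 0.1},
    "param_alpha": 0.1,
    "split0_test_score": 0.80,
    "mean_test_score": 0.82,
    "std_test_score": 0.01,
}
excluded_metric_prefixes = ["param", "split"]
metrics = {
    key: value
    for key, value in result_row.items()
    if not any(key.startswith(prefix) for prefix in excluded_metric_prefixes)
    and isinstance(value, Number)
}
print(metrics)  # {'mean_test_score': 0.82, 'std_test_score': 0.01}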
Example #15
def truncate_param_dict(d):
    return _truncate_dict(d, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
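
An illustrative call, assuming the MLflow length constants are imported alongside `_truncate_dict`: an over-long key and value are both ellipsized before being logged as a parameter.

oversized = {"k" * (MAX_ENTITY_KEY_LENGTH + 10): "v" * (MAX_PARAM_VAL_LENGTH + 10)}
safe_params = truncate_param_dict(oversized)  # keys/values now fit the API limits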