Example 1
def _create_child_runs_for_parameter_search(parent_estimator, parent_model,
                                            parent_run, child_tags):
    """
    Creates one child run under the parent parameter search run for each point
    in the parameter search space explored by the fitted pyspark tuning
    estimator `parent_estimator` (e.g. a `CrossValidator`), logging each
    point's parameters and evaluation metrics to its child run.
    """
    from itertools import zip_longest

    client = MlflowClient()
    # Use the start time of the parent parameter search run as a rough estimate for the
    # start time of child runs, since we cannot precisely determine when each point
    # in the parameter search space was explored
    child_run_start_time = parent_run.info.start_time
    child_run_end_time = int(time.time() * 1000)

    estimator_param_maps = parent_estimator.getEstimatorParamMaps()
    tuned_estimator = parent_estimator.getEstimator()

    metrics_dict, _ = _get_param_search_metrics_and_best_index(
        parent_estimator, parent_model)
    for i in range(len(estimator_param_maps)):
        child_estimator = tuned_estimator.copy(estimator_param_maps[i])
        tags_to_log = dict(child_tags) if child_tags else {}
        tags_to_log.update({MLFLOW_PARENT_RUN_ID: parent_run.info.run_id})
        tags_to_log.update(_get_estimator_info_tags(child_estimator))

        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=child_run_start_time,
            tags=tags_to_log,
        )

        params_to_log = _get_instance_param_map(
            child_estimator,
            parent_estimator._autologging_metadata.uid_to_indexed_name_map)
        param_batches_to_log = _chunk_dict(
            params_to_log, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)
        metrics_to_log = {k: v[i] for k, v in metrics_dict.items()}
        for params_batch, metrics_batch in zip_longest(param_batches_to_log,
                                                       [metrics_to_log],
                                                       fillvalue={}):
            # Trim any parameter keys / values and metric keys that exceed the limits
            # imposed by corresponding MLflow Tracking APIs (e.g., LogParam, LogMetric)
            truncated_params_batch = _truncate_dict(params_batch,
                                                    MAX_ENTITY_KEY_LENGTH,
                                                    MAX_PARAM_VAL_LENGTH)
            truncated_metrics_batch = _truncate_dict(
                metrics_batch, max_key_length=MAX_ENTITY_KEY_LENGTH)
            client.log_batch(
                run_id=child_run.info.run_id,
                params=[
                    Param(str(key), str(value))
                    for key, value in truncated_params_batch.items()
                ],
                metrics=[
                    Metric(key=str(key),
                           value=value,
                           timestamp=child_run_end_time,
                           step=0)
                    for key, value in truncated_metrics_batch.items()
                ],
            )
        client.set_terminated(run_id=child_run.info.run_id,
                              end_time=child_run_end_time)
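
The same batched-logging pattern can be reproduced with public MLflow client APIs alone. The sketch below is not part of the excerpt: the experiment name, the parameter and metric values, and the literal chunk size of 100 (standing in for MAX_PARAMS_TAGS_PER_BATCH) are assumptions.

import time
from itertools import zip_longest

from mlflow.entities import Metric, Param
from mlflow.tracking import MlflowClient

client = MlflowClient()
# Hypothetical experiment; create_experiment fails if the name already exists
experiment_id = client.create_experiment("param-search-batching-demo")
parent_run = client.create_run(experiment_id=experiment_id)
child_run = client.create_run(
    experiment_id=experiment_id,
    start_time=parent_run.info.start_time,
    tags={"mlflow.parentRunId": parent_run.info.run_id},
)

params_to_log = {"param_%d" % i: i for i in range(250)}
metrics_to_log = {"avg_score": 0.91}

# Stand-in for _chunk_dict: split the params into batches of at most 100
# entries (assumed to mirror MAX_PARAMS_TAGS_PER_BATCH)
items = list(params_to_log.items())
param_batches = [dict(items[i:i + 100]) for i in range(0, len(items), 100)]

child_run_end_time = int(time.time() * 1000)
for params_batch, metrics_batch in zip_longest(param_batches,
                                               [metrics_to_log],
                                               fillvalue={}):
    client.log_batch(
        run_id=child_run.info.run_id,
        params=[Param(str(k), str(v)) for k, v in params_batch.items()],
        metrics=[
            Metric(key=str(k), value=v, timestamp=child_run_end_time, step=0)
            for k, v in metrics_batch.items()
        ],
    )
client.set_terminated(run_id=child_run.info.run_id,
                      end_time=child_run_end_time)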
Example 2
class MlflowAutologgingQueueingClient:
    """
    Efficiently implements a subset of MLflow Tracking's `MlflowClient` and fluent APIs to provide
    automatic batching and async execution of run operations by way of queueing, as well as
    parameter / tag truncation for autologging use cases. Run operations defined by this client,
    such as `create_run` and `log_metrics`, enqueue data for future persistence to MLflow
    Tracking. Data is not persisted until the queue is flushed via the `flush()` method, which
    supports synchronous and asynchronous execution.

    MlflowAutologgingQueueingClient is not threadsafe; none of its APIs should be called
    concurrently.
    """
    def __init__(self, tracking_uri=None):
        self._client = MlflowClient(tracking_uri)
        self._pending_ops_by_run_id = {}

    def __enter__(self):
        """
        Enables `MlflowAutologgingQueueingClient` to be used as a context manager with
        synchronous flushing upon exit, removing the need to call `flush()` for use cases
        where logging completion can be waited upon synchronously.

        Run content is only flushed if the context exited without an exception.
        """
        return self

    def __exit__(self, exc_type, exc, traceback):  # pylint: disable=unused-argument
        """
        Enables `MlflowAutologgingQueueingClient` to be used as a context manager with
        synchronous flushing upon exit, removing the need to call `flush()` for use cases
        where logging completion can be waited upon synchronously.

        Run content is only flushed if the context exited without an exception.
        """
        # NB: Run content is only flushed upon context exit to ensure that we don't elide the
        # original exception thrown by the context (because `flush()` itself may throw). This
        # is consistent with the behavior of a routine that calls `flush()` explicitly: content
        # is not logged if an exception preempts the call to `flush()`
        if exc is None and exc_type is None and traceback is None:
            self.flush(synchronous=True)
        else:
            _logger.debug(
                "Skipping run content logging upon MlflowAutologgingQueueingClient context because"
                " an exception was raised within the context: %s",
                exc,
            )

    def create_run(
        self,
        experiment_id: str,
        start_time: Optional[int] = None,
        tags: Optional[Dict[str, Any]] = None,
    ) -> PendingRunId:
        """
        Enqueues a CreateRun operation with the specified attributes, returning a `PendingRunId`
        instance that can be used as input to other client logging APIs (e.g. `log_metrics`,
        `log_params`, ...).

        :return: A `PendingRunId` that can be passed as the `run_id` parameter to other client
                 logging APIs, such as `log_params` and `log_metrics`.
        """
        tags = tags or {}
        tags = _truncate_dict(tags,
                              max_key_length=MAX_ENTITY_KEY_LENGTH,
                              max_value_length=MAX_TAG_VAL_LENGTH)
        run_id = PendingRunId()
        self._get_pending_operations(run_id).enqueue(
            create_run=_PendingCreateRun(
                experiment_id=experiment_id,
                start_time=start_time,
                tags=[RunTag(key, str(value)) for key, value in tags.items()],
            ))
        return run_id

    def set_terminated(
        self,
        run_id: Union[str, PendingRunId],
        status: Optional[str] = None,
        end_time: Optional[int] = None,
    ) -> None:
        """
        Enqueues an UpdateRun operation with the specified `status` and `end_time` attributes
        for the specified `run_id`.
        """
        self._get_pending_operations(run_id).enqueue(
            set_terminated=_PendingSetTerminated(status=status,
                                                 end_time=end_time))

    def log_params(self, run_id: Union[str, PendingRunId],
                   params: Dict[str, Any]) -> None:
        """
        Enqueues a collection of Parameters to be logged to the run specified by `run_id`.
        """
        params = _truncate_dict(params,
                                max_key_length=MAX_ENTITY_KEY_LENGTH,
                                max_value_length=MAX_PARAM_VAL_LENGTH)
        params_arr = [Param(key, str(value)) for key, value in params.items()]
        self._get_pending_operations(run_id).enqueue(params=params_arr)

    def log_metrics(
        self,
        run_id: Union[str, PendingRunId],
        metrics: Dict[str, float],
        step: Optional[int] = None,
    ) -> None:
        """
        Enqueues a collection of Metrics to be logged to the run specified by `run_id` at the
        step specified by `step`.
        """
        metrics = _truncate_dict(metrics, max_key_length=MAX_ENTITY_KEY_LENGTH)
        timestamp_ms = int(time.time() * 1000)
        metrics_arr = [
            Metric(key, value, timestamp_ms, step or 0)
            for key, value in metrics.items()
        ]
        self._get_pending_operations(run_id).enqueue(metrics=metrics_arr)

    def set_tags(self, run_id: Union[str, PendingRunId],
                 tags: Dict[str, Any]) -> None:
        """
        Enqueues a collection of Tags to be logged to the run specified by `run_id`.
        """
        tags = _truncate_dict(tags,
                              max_key_length=MAX_ENTITY_KEY_LENGTH,
                              max_value_length=MAX_TAG_VAL_LENGTH)
        tags_arr = [RunTag(key, str(value)) for key, value in tags.items()]
        self._get_pending_operations(run_id).enqueue(tags=tags_arr)

    def flush(self, synchronous=True):
        """
        Flushes all queued run operations, resulting in the creation or mutation of runs
        and run data.

        :param synchronous: If `True`, run operations are performed synchronously, and a
                            `RunOperations` result object is only returned once all operations
                            are complete. If `False`, run operations are performed asynchronously,
                            and a `RunOperations` object is returned that represents the ongoing
                            run operations.
        :return: A `RunOperations` instance representing the flushed operations. These operations
                 are already complete if `synchronous` is `True`. If `synchronous` is `False`, these
                 operations may still be inflight. Operation completion can be synchronously waited
                 on via `RunOperations.await_completion()`.
        """
        logging_futures = []
        for pending_operations in self._pending_ops_by_run_id.values():
            future = _AUTOLOGGING_QUEUEING_CLIENT_THREAD_POOL.submit(
                self._flush_pending_operations,
                pending_operations=pending_operations,
            )
            logging_futures.append(future)
        self._pending_ops_by_run_id = {}

        logging_operations = RunOperations(logging_futures)
        if synchronous:
            logging_operations.await_completion()
        return logging_operations

    def _get_pending_operations(self, run_id):
        """
        :return: A `_PendingRunOperations` containing all pending operations for the
                 specified `run_id`.
        """
        if run_id not in self._pending_ops_by_run_id:
            self._pending_ops_by_run_id[run_id] = _PendingRunOperations(
                run_id=run_id)
        return self._pending_ops_by_run_id[run_id]

    def _try_operation(self, fn, *args, **kwargs):
        """
        Attempt to evaluate the specified function, `fn`, on the specified `*args` and `**kwargs`,
        returning either the result of the function evaluation (if evaluation was successful) or
        the exception raised by the function evaluation (if evaluation was unsuccessful).
        """
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            return e

    def _flush_pending_operations(self, pending_operations):
        """
        Synchronously and sequentially flushes the specified list of pending run operations.

        NB: Operations are not parallelized on a per-run basis because MLflow's File Store, which
        is frequently used for local ML development, does not support threadsafe metadata logging
        within a given run.
        """
        if pending_operations.create_run:
            create_run_tags = pending_operations.create_run.tags
            num_additional_tags_to_include_during_creation = (
                MAX_ENTITIES_PER_BATCH - len(create_run_tags)
            )
            if num_additional_tags_to_include_during_creation > 0:
                create_run_tags.extend(
                    pending_operations.tags_queue[
                        :num_additional_tags_to_include_during_creation])
                pending_operations.tags_queue = pending_operations.tags_queue[
                    num_additional_tags_to_include_during_creation:]

            new_run = self._client.create_run(
                experiment_id=pending_operations.create_run.experiment_id,
                start_time=pending_operations.create_run.start_time,
                tags={tag.key: tag.value
                      for tag in create_run_tags},
            )
            pending_operations.run_id = new_run.info.run_id

        run_id = pending_operations.run_id
        assert not isinstance(
            run_id, PendingRunId), "Run ID cannot be pending for logging"

        operation_results = []

        param_batches_to_log = chunk_list(
            pending_operations.params_queue,
            chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
        )
        tag_batches_to_log = chunk_list(
            pending_operations.tags_queue,
            chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
        )
        for params_batch, tags_batch in zip_longest(param_batches_to_log,
                                                    tag_batches_to_log,
                                                    fillvalue=[]):
            metrics_batch_size = min(
                MAX_ENTITIES_PER_BATCH - len(params_batch) - len(tags_batch),
                MAX_METRICS_PER_BATCH,
            )
            metrics_batch_size = max(metrics_batch_size, 0)
            metrics_batch = pending_operations.metrics_queue[:metrics_batch_size]
            pending_operations.metrics_queue = pending_operations.metrics_queue[
                metrics_batch_size:]

            operation_results.append(
                self._try_operation(
                    self._client.log_batch,
                    run_id=run_id,
                    metrics=metrics_batch,
                    params=params_batch,
                    tags=tags_batch,
                ))

        for metrics_batch in chunk_list(pending_operations.metrics_queue,
                                        chunk_size=MAX_METRICS_PER_BATCH):
            operation_results.append(
                self._try_operation(self._client.log_batch,
                                    run_id=run_id,
                                    metrics=metrics_batch))

        if pending_operations.set_terminated:
            operation_results.append(
                self._try_operation(
                    self._client.set_terminated,
                    run_id=run_id,
                    status=pending_operations.set_terminated.status,
                    end_time=pending_operations.set_terminated.end_time,
                ))

        failures = [
            result for result in operation_results
            if isinstance(result, Exception)
        ]
        if failures:
            raise MlflowException(message=(
                f"Failed to perform one or more operations on the run with ID "
                f"{run_id}. Failed operations: {failures}"))
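
Based solely on the APIs defined above, a minimal usage sketch might look as follows; the experiment ID and the logged values are placeholders, and nothing is persisted until the context exits and flush() runs.

with MlflowAutologgingQueueingClient() as client:
    run_id = client.create_run(experiment_id="0",
                               tags={"source": "queueing-client-demo"})
    client.log_params(run_id, {"max_depth": 8, "learning_rate": 0.1})
    client.log_metrics(run_id, {"val_rmse": 0.42}, step=1)
    client.set_terminated(run_id, status="FINISHED")
# On a clean exit, __exit__ invokes flush(synchronous=True), which creates the
# run and persists the queued params, metrics, and terminal status in batches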
Example 3
def _create_child_runs_for_parameter_search(cv_estimator,
                                            parent_run,
                                            child_tags=None):
    """
    Creates a collection of child runs for a parameter search training session.
    Runs are reconstructed from the `cv_results_` attribute of the specified trained
    parameter search estimator - `cv_estimator`, which provides relevant performance
    metrics for each point in the parameter search space. One child run is created
    for each point in the parameter search space. For additional information, see
    `https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html`_. # noqa: E501

    :param cv_estimator: The trained parameter search estimator for which to create
                         child runs.
    :param parent_run: A :py:class:`mlflow.entities.Run` object referring to the parent
                       parameter search run for which child runs should be created.
    :param child_tags: An optional dictionary of MLflow tag keys and values to log
                       for each child run.
    """
    import pandas as pd
    from itertools import zip_longest

    client = MlflowClient()
    # Use the start time of the parent parameter search run as a rough estimate for the
    # start time of child runs, since we cannot precisely determine when each point
    # in the parameter search space was explored
    child_run_start_time = parent_run.info.start_time
    child_run_end_time = int(time.time() * 1000)

    seed_estimator = cv_estimator.estimator
    # In the unlikely case that the seed of a parameter search estimator is,
    # itself, a parameter search estimator, we should avoid logging the untuned
    # parameters of the seed's own seed estimator
    should_log_params_deeply = not _is_parameter_search_estimator(
        seed_estimator)
    # Each row of `cv_results_` only provides parameters that vary across
    # the user-specified parameter grid. In order to log the complete set
    # of parameters for each child run, we fetch the parameters defined by
    # the seed estimator and update them with the parameter subset specified
    # in the result row
    base_params = seed_estimator.get_params(deep=should_log_params_deeply)

    cv_results_df = pd.DataFrame.from_dict(cv_estimator.cv_results_)
    for _, result_row in cv_results_df.iterrows():
        tags_to_log = dict(child_tags) if child_tags else {}
        tags_to_log.update({MLFLOW_PARENT_RUN_ID: parent_run.info.run_id})
        tags_to_log.update(_get_estimator_info_tags(seed_estimator))
        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=child_run_start_time,
            tags=tags_to_log,
        )

        params_to_log = dict(base_params)
        params_to_log.update(result_row.get("params", {}))
        param_batches_to_log = _chunk_dict(
            params_to_log, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)

        # Parameter values are recorded twice in the search `cv_results_`:
        # once within a `params` column with dictionary values and once within
        # a separate dataframe column that is created for each parameter. To prevent
        # duplication of parameters, we log the consolidated values from the parameter
        # dictionary column and filter out the other parameter-specific columns with
        # names of the form `param_{param_name}`. Additionally, `cv_results_` produces
        # metrics for each training split, which is fairly verbose; accordingly, we filter
        # out per-split metrics in favor of aggregate metrics (mean, std, etc.)
        excluded_metric_prefixes = ["param", "split"]
        metric_batches_to_log = _chunk_dict(
            {
                key: value
                # NB: Series.iteritems() was removed in pandas 2.0; items() is
                # the supported equivalent
                for key, value in result_row.items() if not any(
                    key.startswith(prefix)
                    for prefix in excluded_metric_prefixes
                ) and isinstance(value, Number)
            },
            chunk_size=min(MAX_ENTITIES_PER_BATCH - MAX_PARAMS_TAGS_PER_BATCH,
                           MAX_METRICS_PER_BATCH),
        )

        for params_batch, metrics_batch in zip_longest(param_batches_to_log,
                                                       metric_batches_to_log,
                                                       fillvalue={}):
            # Trim any parameter keys / values and metric keys that exceed the limits
            # imposed by corresponding MLflow Tracking APIs (e.g., LogParam, LogMetric)
            truncated_params_batch = _truncate_dict(params_batch,
                                                    MAX_ENTITY_KEY_LENGTH,
                                                    MAX_PARAM_VAL_LENGTH)
            truncated_metrics_batch = _truncate_dict(
                metrics_batch, max_key_length=MAX_ENTITY_KEY_LENGTH)
            client.log_batch(
                run_id=child_run.info.run_id,
                params=[
                    Param(str(key), str(value))
                    for key, value in truncated_params_batch.items()
                ],
                metrics=[
                    Metric(key=str(key),
                           value=value,
                           timestamp=child_run_end_time,
                           step=0)
                    for key, value in truncated_metrics_batch.items()
                ],
            )

        client.set_terminated(run_id=child_run.info.run_id,
                              end_time=child_run_end_time)
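
A hypothetical driver for the helper above, assuming the function and its module-level dependencies are importable; GridSearchCV, make_classification, and the cv_results_ attribute are standard scikit-learn.

import mlflow
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)
search = GridSearchCV(LogisticRegression(max_iter=200),
                      param_grid={"C": [0.1, 1.0, 10.0]}, cv=3)
search.fit(X, y)

with mlflow.start_run() as parent:
    # Fetch the full Run entity for the active run and create one child run
    # per point in the three-point parameter grid
    _create_child_runs_for_parameter_search(
        cv_estimator=search,
        parent_run=mlflow.get_run(parent.info.run_id),
        child_tags={"demo": "true"},
    )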
Example 4
import mlflow
import mlflow.pytorch
import numpy as np
import torch
from mlflow.tracking import MlflowClient
from omegaconf import DictConfig, ListConfig


class MLflowWriter(object):
    def __init__(self, exp_name, save_dir, log_every, **mlflow_cfg):
        mlflow.set_tracking_uri(save_dir)
        self.client = MlflowClient(**mlflow_cfg)
        mlflow.set_experiment(exp_name)
        self.experiment_id = self.client.get_experiment_by_name(
            exp_name).experiment_id
        self.run_id = self.client.create_run(self.experiment_id).info.run_id

        self.log_every = log_every
        self.clear()

    def log_params_from_omegaconf(self, params):
        self._explore_recursive("", params)

    def _explore_recursive(self, parent_name, element):
        if isinstance(element, DictConfig):
            iterator = element.items()
        elif isinstance(element, ListConfig):
            iterator = enumerate(element)
        else:
            # Plain values cannot be iterated; without this guard, `iterator`
            # would be unbound below and raise a NameError
            return

        for k, v in iterator:
            if isinstance(v, DictConfig) or isinstance(v, ListConfig):
                self._explore_recursive(f"{parent_name}{k}.", v)
            else:
                self.client.log_param(
                    self.run_id, f"{parent_name}{k}", v)

    def log_torch_model(self, model, epoch):
        # Attach the model artifact to the run created in __init__
        with mlflow.start_run(run_id=self.run_id):
            mlflow.pytorch.log_model(model, "model_%04d" % epoch)

    def log_metric(self, key, value, is_training):
        if isinstance(value, torch.Tensor):
            # item() detaches, moves to CPU, and converts to a Python float
            value = value.item()

        metric_name = "train/" if is_training else "valid/"
        metric_name += str(key)

        # NB: membership must be checked against the prefixed metric name;
        # checking the raw `key` would reset the history on every call
        if metric_name in self.metrics:
            self.metrics[metric_name].append(value)
        else:
            self.metrics[metric_name] = [value]

    def next_iteration(self):
        self.iterations += 1
        if self.iterations % self.log_every == 0:
            self.toMlflow(nb_data=self.log_every)

    def toMlflow(self, nb_data=0, step=0):
        # Log the mean of the most recent `nb_data` values for each metric
        # (all recorded values when nb_data == 0)
        for key, value in self.metrics.items():
            self.client.log_metric(
                self.run_id, key,
                np.mean(value[-nb_data:]), step=step)

    def get_mean(self, key, is_training):
        metric_name = "train/" if is_training else "valid/"
        metric_name += str(key)

        return np.mean(self.metrics[metric_name])

    def clear(self):
        self.metrics = {}
        self.iterations = 0

    def log_artifact(self, path):
        self.client.log_artifact(self.run_id, local_path=path)

    def terminate(self):
        self.client.set_terminated(self.run_id)
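
A hypothetical training-loop usage of MLflowWriter; the experiment name, save directory, config keys, and the toy model and loss are placeholders.

from omegaconf import OmegaConf

cfg = OmegaConf.create({"optimizer": {"name": "adam", "lr": 1e-3}, "epochs": 2})
writer = MLflowWriter(exp_name="demo", save_dir="./mlruns", log_every=10)
writer.log_params_from_omegaconf(cfg)  # logs optimizer.name, optimizer.lr, epochs

model = torch.nn.Linear(4, 1)
for _ in range(100):
    loss = torch.rand(1)  # stand-in for a real training loss
    writer.log_metric("loss", loss, is_training=True)
    writer.next_iteration()  # flushes mean metrics every `log_every` iterations

writer.log_torch_model(model, epoch=0)
writer.terminate()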