    def _average_training_metrics(
        self, per_batch_metrics: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Average training metrics across GPUs."""
        check.true(self.hvd_config.use,
                   "Can only average training metrics in multi-GPU training.")
        metrics_timeseries = util._list_to_dict(per_batch_metrics)

        # combined_timeseries is: dict[metric_name] -> 2d-array.
        # A measurement is accessed via combined_timeseries[metric_name][process_idx][batch_idx].
        combined_timeseries, _ = self._combine_metrics_across_processes(
            metrics_timeseries, num_batches=len(per_batch_metrics))

        # If the value for a metric is a single-element array, the averaging process will
        # change that into just the element. We record which metrics are single-element
        # arrays so that we can wrap them in an array later (for perfect compatibility
        # with the non-averaging code path).
        array_metrics = []
        for metric_name in per_batch_metrics[0].keys():
            if isinstance(per_batch_metrics[0][metric_name], np.ndarray):
                array_metrics.append(metric_name)

        if self.is_chief:
            combined_timeseries = cast(Dict[str, List[List[Any]]], combined_timeseries)
            num_batches = len(per_batch_metrics)
            num_processes = hvd.size()
            averaged_metrics_timeseries = {}  # type: Dict[str, List]

            for metric_name in combined_timeseries.keys():
                averaged_metrics_timeseries[metric_name] = []
                for batch_idx in range(num_batches):
                    batch = [
                        combined_timeseries[metric_name][process_idx][batch_idx]
                        for process_idx in range(num_processes)
                    ]

                    # Average across processes, skipping ranks that recorded no
                    # value (None) for this batch.
                    np_batch = np.array(batch)
                    batch_avg = np.mean(np_batch[np_batch != None])  # noqa: E711
                    if metric_name in array_metrics:
                        batch_avg = np.array(batch_avg)
                    averaged_metrics_timeseries[metric_name].append(batch_avg)
            per_batch_metrics = util._dict_to_list(averaged_metrics_timeseries)
        return per_batch_metrics
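
The per-batch step above is easier to follow in isolation. Here is a minimal, self-contained sketch of the same averaging logic, assuming a combined timeseries laid out as [process_idx][batch_idx] with None marking batches a rank did not report; the names here are illustrative, not Determined's API.

import numpy as np

# Two processes, two batches; process 1 reported no value for batch 1.
combined = {"loss": [[0.5, 0.3], [0.7, None]]}

averaged = {}
for name, series in combined.items():
    averaged[name] = []
    for batch_idx in range(len(series[0])):
        batch = np.array([proc[batch_idx] for proc in series])
        # Elementwise comparison with None keeps only reported values.
        averaged[name].append(np.mean(batch[batch != None]))  # noqa: E711

assert averaged == {"loss": [0.6, 0.3]}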
def _combine_and_average_training_metrics(
        context: det.core.DistributedContext,
        per_batch_metrics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    assert context.size > 1, "Can only average training metrics in multi-GPU training."
    metrics_timeseries = util._list_to_dict(per_batch_metrics)

    # Gather metrics across ranks onto rank 0 slot.
    # The combined_timeseries is: dict[metric_name] -> 2d-array.
    # A measurement is accessed via combined_timeseries[metric_name][process_idx][batch_idx].
    combined_timeseries, combined_num_batches = _combine_metrics_across_processes(
        context, metrics_timeseries, num_batches=len(per_batch_metrics))

    if context.rank == 0:
        # We can safely cast variables here because this is all happening on the chief, which
        # is where we gather metrics.
        combined_timeseries = cast(Dict[str, List[List[Any]]],
                                   combined_timeseries)
        combined_num_batches = cast(List[int], combined_num_batches)

        per_batch_metrics = _average_training_metrics(combined_timeseries,
                                                      combined_num_batches)
    return per_batch_metrics
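
For context, here is a hedged sketch of what the _combine_metrics_across_processes call might do, built on DistributedContext.gather(), which returns the list of per-rank payloads on the chief and None on other ranks. This is an illustration of the gather step under those assumptions, not Determined's actual implementation.

from typing import Any, Dict, List, Optional, Tuple

import determined as det


def _combine_metrics_across_processes(
    context: det.core.DistributedContext,
    metrics: Dict[str, List[Any]],
    num_batches: int,
) -> Tuple[Optional[Dict[str, List[List[Any]]]], Optional[List[int]]]:
    # Every rank sends its (metrics, num_batches) payload to rank 0.
    gathered = context.gather((metrics, num_batches))
    if gathered is None:
        # Non-chief ranks get nothing back.
        return None, None
    # Stack per-rank timeseries so that combined[name][process_idx][batch_idx]
    # is the value rank process_idx recorded for that batch.
    combined = {
        name: [rank_metrics[name] for rank_metrics, _ in gathered]
        for name in metrics
    }
    return combined, [n for _, n in gathered]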
Example #3
def test_list_to_dict() -> None:
    r = _list_to_dict([{"a": 1}, {"b": 2}, {"a": 2}])
    assert r == {"a": [1, 2], "b": [2]}
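
The helper under test transposes a list of per-batch dicts into a dict of lists. A minimal implementation consistent with this test could look like the sketch below (the real determined.util version may differ in details).

from typing import Any, Dict, List


def _list_to_dict(list_of_dicts: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
    """Transpose a list of dicts into a dict of lists, preserving order."""
    dict_of_lists = {}  # type: Dict[str, List[Any]]
    for d in list_of_dicts:
        for key, value in d.items():
            dict_of_lists.setdefault(key, []).append(value)
    return dict_of_lists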