def _average_training_metrics(
        combined_timeseries: Dict[str, Any],
        combined_num_batches: List[int]) -> List[Dict[str, Any]]:
    """Average combined training metrics across GPUs.

    Args:
        combined_timeseries: Maps each metric name to a nested sequence read as
            ``combined_timeseries[metric_name][process_idx][batch_idx]``. Entries
            that are ``None`` are left out of the average for their batch.
        combined_num_batches: One batch count per process. Only the first entry
            is used as the batch count; the length is used as the process count.

    Returns:
        ``util._dict_to_list`` applied to a dict that maps each metric name to
        its list of per-batch averages.
    """
    num_batches = combined_num_batches[
        0]  # num_batches matches across data parallel ranks.
    num_processes = len(combined_num_batches)

    # If the value for a metric is a single-element array, the averaging process will
    # change that into just the element. We record what metrics are single-element arrays
    # so we can wrap them in an array later (for perfect compatibility with non-averaging
    # codepath).
    #
    # The first batch of the first process is only inspected when there is at least one
    # batch: with zero batches there is nothing to index (and nothing to wrap), so
    # peeking at [0][0] would fail before the (empty) averaging loop even starts.
    if num_batches > 0:
        array_metrics = {
            metric_name
            for metric_name, process_batches in combined_timeseries.items()
            if isinstance(process_batches[0][0], np.ndarray)
        }
    else:
        array_metrics = set()

    averaged_metrics_timeseries = {}  # type: Dict[str, List]

    for metric_name, process_batches in combined_timeseries.items():
        wrap_in_array = metric_name in array_metrics
        batch_averages = []  # type: List[Any]
        for batch_idx in range(num_batches):
            # Gather this batch's value from every process.
            np_batch = np.array([
                process_batches[process_idx][batch_idx]
                for process_idx in range(num_processes)
            ])

            # `!= None` is deliberately an elementwise NumPy comparison: it builds the
            # mask that drops missing (None) entries before taking the mean.
            batch_avg = np.mean(np_batch[np_batch != None])  # noqa: E711
            if wrap_in_array:
                batch_avg = np.array(batch_avg)
            batch_averages.append(batch_avg)
        averaged_metrics_timeseries[metric_name] = batch_averages
    return util._dict_to_list(averaged_metrics_timeseries)
    def _average_training_metrics(
        self, per_batch_metrics: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Average per-batch training metrics across GPUs.

        Every process passes its metrics to ``_combine_metrics_across_processes``.
        Only the chief then replaces them with the cross-process averages; any
        other process returns ``per_batch_metrics`` as it was given.
        """
        check.true(self.hvd_config.use,
                   "Can only average training metrics in multi-GPU training.")
        local_timeseries = util._list_to_dict(per_batch_metrics)

        # Layout of the combined result: dict[metric_name] -> 2d-array, where a single
        # measurement is read as gathered[metric_name][process_idx][batch_idx].
        # The second return value is not needed here.
        gathered, _ = self._combine_metrics_across_processes(
            local_timeseries, num_batches=len(per_batch_metrics))

        # Averaging turns a single-element array into a bare element. Remember which
        # metrics started out as arrays (judged by the first batch) so the averages can
        # be wrapped again, keeping the output compatible with the non-averaging path.
        first_batch = per_batch_metrics[0]
        ndarray_metric_names = [
            name for name, value in first_batch.items()
            if isinstance(value, np.ndarray)
        ]

        if not self.is_chief:
            return per_batch_metrics

        gathered = cast(Dict[str, List[List[Any]]], gathered)
        batch_count = len(per_batch_metrics)
        world_size = hvd.size()
        averaged = {}  # type: Dict[str, List]

        for name, per_process in gathered.items():
            rewrap = name in ndarray_metric_names
            means = []  # type: List[Any]
            for batch_idx in range(batch_count):
                samples = np.array(
                    [per_process[rank][batch_idx] for rank in range(world_size)])
                # Elementwise comparison on purpose: masks out the None entries.
                mean = np.mean(samples[samples != None])  # noqa: E711
                means.append(np.array(mean) if rewrap else mean)
            averaged[name] = means

        return util._dict_to_list(averaged)
Exemple #3
0
def test_dict_to_list() -> None:
    """Check that a dict of equal-length lists becomes one dict per list position."""
    columns = {"a": [1, 2], "b": [3, 4]}
    expected_rows = [{"a": 1, "b": 3}, {"a": 2, "b": 4}]
    assert _dict_to_list(columns) == expected_rows