Example no. 1
    def _batch_all_reduce(self, aggregation, per_device_values):
        """All-reduce across all workers in a batch."""
        if context.executing_eagerly():
            raise ValueError(
                "Eager execution with collective ops is not supported yet.")

        logging.log_first_n(
            logging.INFO,
            "Collective All-reduce invoked with batches size = %d, "
            "num_workers = %d" % (len(per_device_values), self._num_workers),
            10)

        grouped_by_tower = _group_value_by_device(per_device_values)

        grouped_by_var = list(zip(*grouped_by_tower))
        # grouped_by_var is grouped by variables and takes the following format:
        # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..),
        #  ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu2, v1_gpu2) ..),
        #  ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu2, v2_gpu2) ..),
        #  ...
        # ]
        chunked_gv = [
            grouped_by_var[x:x + self._all_reduce_merge_scope] for x in range(
                0, len(grouped_by_var), self._all_reduce_merge_scope)
        ]

        reduced_gv_list = []
        for chunk in chunked_gv:
            with ops.name_scope("allreduce"):
                for grad_and_vars in chunk:
                    scaled_grads = [g for g, _ in grad_and_vars]
                    collective_reduced = cross_tower_utils.build_collective_reduce(
                        scaled_grads, self._num_workers, self._collective_keys,
                        "Add", "Id")
                    result = []
                    for (_, v), g in zip(grad_and_vars, collective_reduced):
                        result.append([g, v])
                    reduced_gv_list.append(result)

        new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
        return _ungroup_and_make_mirrored(
            new_tower_grads,
            per_device_values[0].devices,
            aggregation,
            num_between_graph_workers=self._num_workers)
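
The grouping and chunking steps above can be hard to visualize. Below is a minimal, self-contained sketch in plain Python, with placeholder strings standing in for real gradient tensors and variables; names such as grads_per_device and merge_scope are illustrative only, not TensorFlow API. It shows how the per-device lists are transposed into per-variable groups and then split into merge-scope chunks.

# Two devices, three (gradient, variable) pairs each -- the shape that
# _group_value_by_device produces (hypothetical placeholder data).
grads_per_device = [
    [("g0_gpu0", "v0"), ("g1_gpu0", "v1"), ("g2_gpu0", "v2")],  # device 0
    [("g0_gpu1", "v0"), ("g1_gpu1", "v1"), ("g2_gpu1", "v2")],  # device 1
]

# Transpose: one tuple per variable, holding that variable's pair on every device.
grouped_by_var = list(zip(*grads_per_device))
# [(('g0_gpu0', 'v0'), ('g0_gpu1', 'v0')),
#  (('g1_gpu0', 'v1'), ('g1_gpu1', 'v1')),
#  (('g2_gpu0', 'v2'), ('g2_gpu1', 'v2'))]

# Chunk consecutive variables so each chunk is reduced under one name scope.
merge_scope = 2  # stands in for self._all_reduce_merge_scope
chunked = [grouped_by_var[i:i + merge_scope]
           for i in range(0, len(grouped_by_var), merge_scope)]
assert [len(c) for c in chunked] == [2, 1]
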
Example no. 2
  def _batch_all_reduce(self, aggregation, per_device_values):
    """All-reduce across all workers in a batch."""
    if context.executing_eagerly():
      raise ValueError(
          "Eager execution with collective ops is not supported yet.")

    logging.log_first_n(
        logging.INFO, "Collective All-reduce invoked with batches size = %d, "
        "num_workers = %d" % (len(per_device_values), self._num_workers), 10)

    grouped_by_tower = _group_value_by_device(per_device_values)

    grouped_by_var = list(zip(*grouped_by_tower))
    # grouped_by_var is grouped by variables and takes the following format:
    # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..),
    #  ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu2, v1_gpu2) ..),
    #  ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu2, v2_gpu2) ..),
    #  ...
    # ]
    chunked_gv = [
        grouped_by_var[x:x + self._all_reduce_merge_scope]
        for x in range(0, len(grouped_by_var), self._all_reduce_merge_scope)
    ]

    reduced_gv_list = []
    for chunk in chunked_gv:
      with ops.name_scope("allreduce"):
        for grad_and_vars in chunk:
          scaled_grads = [g for g, _ in grad_and_vars]
          collective_reduced = cross_tower_utils.build_collective_reduce(
              scaled_grads, self._num_workers, self._collective_keys, "Add",
              "Id")
          result = []
          for (_, v), g in zip(grad_and_vars, collective_reduced):
            result.append([g, v])
          reduced_gv_list.append(result)

    new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
    return _ungroup_and_make_mirrored(
        new_tower_grads,
        per_device_values[0].devices,
        aggregation,
        num_between_graph_workers=self._num_workers)
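
For completeness, here is a similarly hedged sketch of the final transpose: reduced_gv_list holds one entry per variable, and zip(*reduced_gv_list) flips it back into one list per device before _ungroup_and_make_mirrored is called. The placeholder values below are illustrative only; after an all-reduce every device holds the same reduced gradient, which is why the pairs repeat across devices.

# One entry per variable; inside, one [reduced_grad, var] pair per device.
reduced_gv_list = [
    [["r0", "v0"], ["r0", "v0"]],  # variable 0 on device 0 and device 1
    [["r1", "v1"], ["r1", "v1"]],  # variable 1 on device 0 and device 1
]

# zip(*...) flips the nesting: one list per device, each covering all variables.
new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
# [[['r0', 'v0'], ['r1', 'v1']],   # device 0: all variables
#  [['r0', 'v0'], ['r1', 'v1']]]   # device 1: all variables
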