def _do_batch_all_reduce(self, reduce_op, dense_values): """Run batch all-reduces.""" logging.log_first_n( logging.INFO, "batch_all_reduce invoked for batches size = %d with " "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and " "agg_small_grads_max_group = %d" % (len(dense_values), self._all_reduce_alg, self._num_packs, self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) destinations = dense_values[0].devices grouped = _group_value_by_device(dense_values) device_grad_packs, tensor_packer = _pack_tensors( grouped, self._num_packs, self._agg_small_grads_max_bytes, self._agg_small_grads_max_group) # The actual aggregation of the repacked gradients. Note that they are # sharded among different aggregation trees. So it is important to strike # the balance on num_splits. if self._all_reduce_alg == "nccl": # TODO(yuefengz): merge this into the all-reduce library. reduced = cross_device_utils.aggregate_gradients_using_nccl( device_grad_packs) else: # TODO(yuefengz): check that gpu ids in `destinations` are in ascending # order. reduced = ( cross_device_utils.aggregate_gradients_using_hierarchical_copy( destinations, device_grad_packs)) reduced = _unpack_tensors(reduced, tensor_packer) return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)
def _do_batch_all_reduce(self, reduce_op, dense_values): """Run batch all-reduces.""" logging.log_first_n( logging.INFO, "batch_all_reduce: %d all-reduces with algorithm = %s," "num_packs = %d, agg_small_grads_max_bytes = %d and " "agg_small_grads_max_group = %d" % (len(dense_values), self._all_reduce_alg, self._num_packs, self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) destinations = dense_values[0].devices grouped = _group_value_by_device(dense_values) device_grad_packs, tensor_packer = _pack_tensors( grouped, self._num_packs, self._agg_small_grads_max_bytes, self._agg_small_grads_max_group) # The actual aggregation of the repacked gradients. Note that they are # sharded among different aggregation trees. So it is important to strike # the balance on num_splits. if self._all_reduce_alg == "nccl": # TODO(yuefengz): merge this into the all-reduce library. reduced = cross_device_utils.aggregate_gradients_using_nccl( device_grad_packs) else: # TODO(yuefengz): check that gpu ids in `destinations` are in ascending # order. reduced = ( cross_device_utils.aggregate_gradients_using_hierarchical_copy( destinations, device_grad_packs)) reduced = _unpack_tensors(reduced, tensor_packer) return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)