def _do_batch_all_reduce_dense(self, reduce_op, per_replica_values):
  """All-reduce across all workers in a batch."""
  logging.log_first_n(
      logging.INFO, "Collective batch_all_reduce: %d all-reduces, "
      "num_workers = %d" % (len(per_replica_values), self._num_workers), 10)

  chunked_gv = self._make_gradient_chunks(per_replica_values,
                                          self._all_reduce_merge_scope)

  reduced_gv_list = []
  for chunk in chunked_gv:
    with ops.name_scope("allreduce"):
      for grad_and_vars in chunk:
        # Gradients for the same variable but from different devices.
        scaled_grads = [g for g, _ in grad_and_vars]
        collective_reduced = cross_device_utils.build_collective_reduce(
            scaled_grads, self._num_workers, self._collective_keys, "Add",
            "Id")
        result = []
        for (_, v), g in zip(grad_and_vars, collective_reduced):
          result.append([g, v])
        reduced_gv_list.append(result)

  new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
  return _ungroup_and_make_mirrored(
      new_device_grads,
      per_replica_values[0],
      reduce_op,
      num_between_graph_workers=self._num_workers)
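# A minimal, standalone sketch of the chunk-and-regroup bookkeeping above, in
# plain Python. The helper name _make_gradient_chunks_sketch and the
# placeholder (grad, var) strings are hypothetical stand-ins for
# self._make_gradient_chunks and real tensors, not the actual TensorFlow
# implementation; the point is only to show the shape of the data flow.

def _make_gradient_chunks_sketch(grouped_by_var, merge_scope):
  """Splits the per-variable list into chunks of at most merge_scope items."""
  return [
      grouped_by_var[i:i + merge_scope]
      for i in range(0, len(grouped_by_var), merge_scope)
  ]

# Two devices, three variables; each inner pair is (grad, var) on one device.
grouped_by_var = [
    [("g0_d0", "v0"), ("g0_d1", "v0")],
    [("g1_d0", "v1"), ("g1_d1", "v1")],
    [("g2_d0", "v2"), ("g2_d1", "v2")],
]
chunked = _make_gradient_chunks_sketch(grouped_by_var, merge_scope=2)
assert len(chunked) == 2  # chunk 1 holds v0 and v1; chunk 2 holds v2

# After each chunk is all-reduced, results are appended per variable; the
# final transpose yields one list of (reduced_grad, var) pairs per device.
reduced_gv_list = [
    [["sum_g0", "v0"], ["sum_g0", "v0"]],
    [["sum_g1", "v1"], ["sum_g1", "v1"]],
    [["sum_g2", "v2"], ["sum_g2", "v2"]],
]
new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
assert len(new_device_grads) == 2  # one entry per device, each with 3 pairs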
def _batch_all_reduce(self, reduce_op, per_replica_values):
  """All-reduce across all workers in a batch."""
  if context.executing_eagerly():
    raise ValueError(
        "Eager execution with collective ops is not supported yet.")

  logging.log_first_n(
      logging.INFO, "Collective All-reduce invoked with batch size = %d, "
      "num_workers = %d" % (len(per_replica_values), self._num_workers), 10)

  grouped_by_device = _group_value_by_device(per_replica_values)

  grouped_by_var = list(zip(*grouped_by_device))
  # grouped_by_var is grouped by variables and takes the following format:
  # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..),
  #  ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu2, v1_gpu2) ..),
  #  ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu2, v2_gpu2) ..),
  #  ...
  # ]
  chunked_gv = [
      grouped_by_var[x:x + self._all_reduce_merge_scope]
      for x in range(0, len(grouped_by_var), self._all_reduce_merge_scope)
  ]

  reduced_gv_list = []
  for chunk in chunked_gv:
    with ops.name_scope("allreduce"):
      for grad_and_vars in chunk:
        # Gradients for the same variable but from different devices.
        scaled_grads = [g for g, _ in grad_and_vars]
        collective_reduced = cross_device_utils.build_collective_reduce(
            scaled_grads, self._num_workers, self._collective_keys, "Add",
            "Id")
        result = []
        for (_, v), g in zip(grad_and_vars, collective_reduced):
          result.append([g, v])
        reduced_gv_list.append(result)

  new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
  return _ungroup_and_make_mirrored(
      new_device_grads,
      per_replica_values[0].devices,
      reduce_op,
      num_between_graph_workers=self._num_workers)
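# A small standalone sketch (plain Python, made-up placeholder strings, not
# real tensors) of the grouping step above: _group_value_by_device arranges
# the values per device, and zip(*...) transposes them into the per-variable
# layout described in the comment, so each all-reduce sees the same variable's
# gradient from every device.

# Two devices (gpu0, gpu1), two variables (v0, v1).
grouped_by_device = [
    [("grad0_gpu0", "v0_gpu0"), ("grad1_gpu0", "v1_gpu0")],  # gpu0
    [("grad0_gpu1", "v0_gpu1"), ("grad1_gpu1", "v1_gpu1")],  # gpu1
]
grouped_by_var = list(zip(*grouped_by_device))
# grouped_by_var[0] holds variable v0 across devices, grouped_by_var[1] v1.
assert grouped_by_var[0] == (("grad0_gpu0", "v0_gpu0"),
                             ("grad0_gpu1", "v0_gpu1"))

# One all-reduce input: the gradients for a single variable from all devices.
scaled_grads = [g for g, _ in grouped_by_var[0]]
assert scaled_grads == ["grad0_gpu0", "grad0_gpu1"]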
def _do_batch_all_reduce_dense(self, reduce_op, per_replica_values,
                               experimental_hints):
  """All-reduce across all workers in a batch."""
  batch_size = len(per_replica_values)
  # Pass self._communication to the runtime as a communication hint.
  communication = self._communication.value
  # For now, we use NCCL only when batch_size > 1.
  # TODO(b/132575814): switch to NCCL for all collectives when communication
  # is NCCL.
  if self._communication == CollectiveCommunication.NCCL and batch_size == 1:
    communication = CollectiveCommunication.AUTO.value

  # Reverse the list so that there is a better chance that values follow the
  # order in which they are calculated (e.g. when they are gradients), so as
  # to overlap calculation with communication. However, this may not be
  # optimal for cases like gradients of complicated non-sequential models.
  #
  # Note that we reverse the list before packing so that the first pack will
  # not be too small, since the first few packs are more likely to have long
  # queuing time due to concurrent intense computation.
  #
  # TODO(b/147393503): explore solutions for optimal ordering.
  packs = cross_device_utils.pack_by_size(
      list(reversed(per_replica_values)), experimental_hints.bytes_per_pack)

  if batch_size > 1:
    logging.info(
        "Collective batch_all_reduce: %d all-reduces, num_devices = %d, "
        "group_size = %d, communication_hint = %s, num_packs = %d", batch_size,
        len(self._devices), self._group_size, communication, len(packs))
  else:
    logging.log_first_n(
        logging.INFO, "Collective batch_all_reduce: %d all-reduces, "
        "num_devices = %d, group_size = %d, communication_hint = %s, "
        "num_packs = %d" % (batch_size, len(self._devices), self._group_size,
                            communication, len(packs)), 10)

  reduced_values = []
  with self._lock:
    for pack in packs:
      # By placing all CollectiveReduce ops in a pack under a single name
      # scope, we ensure they will be picked up by the `ScopedAllocator`
      # grappler optimizer and packed into a single all-reduce.
      with ops.name_scope("allreduce"):
        for per_replica in pack:
          # Add control dependencies per device from the last gradients to the
          # current set, in order to serialize NCCL launches.
          if (communication == CollectiveCommunication.NCCL.value and
              reduced_values):
            control_inputs = list(reduced_values[-1])
          else:
            control_inputs = None
          reduced_values.append(
              cross_device_utils.build_collective_reduce(
                  per_replica.values,
                  self._devices,
                  self._group_size,
                  self._collective_keys,
                  "Add",
                  "Id",
                  communication,
                  control_inputs,
                  executors=self._executors,
                  timeout=experimental_hints.timeout_seconds))

  for e in self._executors:
    e.wait()

  mirrored = []
  # Reverse the order of the reduced values to recover the input order.
  for value in reversed(reduced_values):
    if reduce_op == reduce_util.ReduceOp.MEAN:
      for i, v in enumerate(value):
        with ops.device(v.device):
          value[i] = v / self._group_size
    mirrored.append(
        distribute_utils.regroup(value, wrap_class=value_lib.Mirrored))
  return mirrored
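# A minimal sketch (plain Python, hypothetical numbers, not real tensors) of
# the two post-processing steps at the end of the method above: the collective
# itself always sums ("Add"), so a MEAN reduce_op is realized by dividing each
# summed value by the group size, and the reduced list is iterated in reverse
# to undo the reversal applied before packing.

group_size = 4
# Stand-ins for the summed per-device values, in reversed pack order.
reduced_values = [[8.0, 8.0], [4.0, 4.0], [2.0, 2.0]]

mirrored = []
for value in reversed(reduced_values):
  # MEAN is the sum divided by the total number of replicas in the group.
  value = [v / group_size for v in value]
  mirrored.append(value)

assert mirrored == [[0.5, 0.5], [1.0, 1.0], [2.0, 2.0]]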