def test_horovod_allreduce_grad_cpu(self):
    """Test the correctness of the allreduce gradient on CPU."""
    hvd.init()
    size = hvd.size()

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        with tf.device("/cpu:0"):
            if _executing_eagerly():
                tensor = self.tfe.Variable(
                    self.random_uniform([5] * dim, -100, 100, dtype=dtype))
                with tf.GradientTape() as tape:
                    summed = hvd.allreduce(tensor, average=False)
            else:
                tensor = self.random_uniform([5] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)

            grad_ys = tf.ones([5] * dim)
            if _executing_eagerly():
                grad_out = tape.gradient(summed, tensor, grad_ys)
            else:
                grad = tf.gradients(summed, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

        expected = np.ones([5] * dim) * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def test_horovod_broadcast_grad_gpu(self):
    """Test the correctness of the broadcast gradient on GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    if os.environ.get('HOROVOD_MIXED_INSTALL'):
        # Skip if compiled with CUDA but without HOROVOD_GPU_BROADCAST.
        return

    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        if _executing_eagerly():
            tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
        else:
            tensor = tf.ones([5] * dim) * rank
        if dtype == tf.bool:
            tensor = tensor % 2
        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            with tf.device("/gpu:%d" % local_rank):
                grad_out = tape.gradient(broadcasted_tensor, tensor)
        else:
            tensor = tf.cast(tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            grad_ys = tf.ones([5] * dim)
            with tf.device("/gpu:%d" % local_rank):
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        c = size if rank == root_rank else 0
        expected = np.ones([5] * dim) * c
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def _allreduce(tensor, name=None, op=Sum, prescale_factor=1.0,
               postscale_factor=1.0, ignore_name_scope=False):
    """An op which reduces an input tensor over all the Horovod processes. The
    default reduction is a sum.

    The reduction operation is keyed by the name of the op. The tensor type and
    shape must be the same on all Horovod processes for a given name. The
    reduction will not start until all processes are ready to send and receive
    the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, summed across all
      processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodAllreduce_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_allreduce(tensor, name=name, reduce_op=op,
                                     prescale_factor=prescale_factor,
                                     postscale_factor=postscale_factor,
                                     ignore_name_scope=ignore_name_scope)
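# Usage sketch (not part of the library source): the private _allreduce op is
# normally reached through the public hvd.allreduce wrapper. A minimal example,
# assuming a working Horovod install:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
t = tf.ones([2, 2]) * hvd.rank()
# With N workers, every element becomes 0 + 1 + ... + (N - 1).
summed = hvd.allreduce(t, average=False)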
def _reducescatter(tensor, name=None, op=Sum, ignore_name_scope=False,
                   process_set=global_process_set):
    """An op which sums an input tensor over all the Horovod processes, then
    scatters the result across all the Horovod processes.

    The reduction operation is keyed by the name of the op. The tensor type
    and shape must be the same on all Horovod processes for a given name. The
    reduction will not start until all processes are ready to send and receive
    the tensor.

    Returns:
      A tensor of the same rank and type as `tensor`. The shape is identical
      to the input shape, except for the first dimension, which will be
      divided across the different Horovod processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodReducescatter_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_reducescatter(tensor, name=name, reduce_op=op,
                                         ignore_name_scope=ignore_name_scope,
                                         process_set_id=process_set.process_set_id)
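# Usage sketch (assumption: a public reducescatter wrapper fronting this op,
# as in Horovod releases that ship it). Each of N workers contributes the full
# tensor and receives a 1/N slice of the element-wise sum:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
t = tf.ones([hvd.size() * 2, 4])        # first dim divisible by worker count
part = hvd.reducescatter(t, op=hvd.Sum) # shape [2, 4] on every rank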
def _grouped_allreduce(tensors, name=None, op=Sum, prescale_factor=1.0,
                       postscale_factor=1.0, ignore_name_scope=False):
    """An op which reduces input tensors over all the Horovod processes. The
    default reduction is a sum.

    The reduction operations are keyed by the name of the op. Reductions are
    performed across tensors in the same list position. The tensor type and
    shape must be the same on all Horovod processes for tensors sharing
    positions in the input tensor list. The reduction will not start until all
    processes are ready to send and receive the tensors.

    Returns:
      A list of tensors of the same shape and type as those in `tensors`,
      summed across all processes.
    """
    if name is None and not _executing_eagerly():
        name = _normalize_name('HorovodGroupedAllreduce_%s_%s' %
                               (tensors[0].name, tensors[-1].name))
    return MPI_LIB.horovod_grouped_allreduce(tensors, name=name, reduce_op=op,
                                             prescale_factor=prescale_factor,
                                             postscale_factor=postscale_factor,
                                             ignore_name_scope=ignore_name_scope)
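# Usage sketch (assumption: the public hvd.grouped_allreduce wrapper that
# fronts this op). Grouping fuses several reductions into one request:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
t1 = tf.ones([4]) * hvd.rank()
t2 = tf.ones([2, 2]) * hvd.rank()
# Both tensors are reduced in a single fused operation; position i of the
# output list corresponds to position i of the input list.
s1, s2 = hvd.grouped_allreduce([t1, t2], op=hvd.Sum)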
def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
             device_sparse='', compression=Compression.none,
             sparse_as_dense=False, op=Average, gradient_predivide_factor=1.0,
             backward_passes_per_step=1, average_aggregated_gradients=False):
    if name is None:
        name = "Distributed{}".format(type(optimizer).__name__)
    super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking)

    self._optimizer = optimizer
    self._allreduce_grads = _make_allreduce_grads_fn(
        name, device_dense, device_sparse, compression, sparse_as_dense, op,
        gradient_predivide_factor)

    self._agg_helper = None
    if backward_passes_per_step > 1:
        if _executing_eagerly():
            raise ValueError(
                "backward_passes_per_step > 1 is not yet supported "
                "for _LegacyOptimizer with eager execution."
            )

        self._agg_helper = LocalGradientAggregationHelper(
            backward_passes_per_step=backward_passes_per_step,
            allreduce_func=self._allreduce_grads,
            sparse_as_dense=sparse_as_dense,
            average_aggregated_gradients=average_aggregated_gradients,
            rank=rank(),
            optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY,
        )
def _make_allreduce_grads_fn(name, device_dense, device_sparse, compression,
                             sparse_as_dense, op, gradient_predivide_factor):
    if op == Average:
        # Split average operation across pre/postscale factors.
        # C++ backend will apply additional 1 / size() factor to
        # postscale_factor for op == Average.
        prescale_factor = 1.0 / gradient_predivide_factor
        postscale_factor = gradient_predivide_factor
    else:
        prescale_factor = 1.0
        postscale_factor = 1.0

    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]

            return [_allreduce_cond(grad,
                                    device_dense=device_dense,
                                    device_sparse=device_sparse,
                                    compression=compression,
                                    op=op,
                                    prescale_factor=prescale_factor,
                                    postscale_factor=postscale_factor)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
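# Numeric sanity check of the pre/postscale split above (pure NumPy, no
# Horovod required). With gradient_predivide_factor p and N workers, the
# backend computes postscale * (1/N) * sum(prescale * g_i), which recovers
# the plain mean while keeping the intermediate sum scaled down by 1/p:
import numpy as np

grads = [np.array([2.0]), np.array([4.0])]   # one gradient per "worker"
p, n = 2.0, len(grads)
summed = sum((1.0 / p) * g for g in grads)   # prescale, then allreduce-sum
result = summed * (p / n)                    # postscale plus backend's 1/N
assert np.allclose(result, np.mean(grads, axis=0))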
def alltoall(tensor, splits=None, name=None, ignore_name_scope=False):
    """An op that scatters slices of the input tensor to all other Horovod
    processes and returns a tensor of gathered slices from all other Horovod
    processes.

    The slicing is done on the first dimension, so the input tensors on the
    different processes must have the same rank and shape, except for the
    first dimension, which is allowed to be different.

    Arguments:
        tensor: A tensor to distribute with alltoall.
        splits: A tensor of integers in rank order describing how many
                elements in `tensor` to send to each worker. Splitting is
                applied along the first dimension of `tensor`. If `splits` is
                not provided, the first dimension is split equally by the
                number of Horovod processes.
        name: A name of the alltoall operation.
        ignore_name_scope: If True, ignores any outer name scope applied by
                           TensorFlow in the name used by the Horovod
                           operation.

    Returns:
      A tensor of the same type as `tensor`, concatenated on dimension zero
      across all processes. The shape is identical to the input shape, except
      for the first dimension, which may be greater and is the sum of all
      first dimensions of the gathered tensor slices from different Horovod
      processes.
    """
    # If splits not provided, create empty tensor as placeholder
    splits_ = tf.convert_to_tensor(splits) if splits is not None \
        else tf.constant([], dtype=tf.int32)
    if name is None and not _executing_eagerly():
        name = 'HorovodAlltoall_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_alltoall(tensor, splits=splits_, name=name,
                                    ignore_name_scope=ignore_name_scope)
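# Usage sketch for alltoall (hedged; assumes a working multi-worker Horovod
# setup). With an equal split, row i of each rank's tensor goes to rank i:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
size = hvd.size()
t = tf.ones([size, 4]) * hvd.rank()
# The result on rank r stacks the r-th rows from all workers, so its rows
# take the values 0, 1, ..., size - 1.
gathered = hvd.alltoall(t)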
def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
             device_sparse='', compression=Compression.none,
             sparse_as_dense=False, op=Average, aggregation_frequency=1,
             average_aggregated_gradients=True):
    if name is None:
        name = "Distributed{}".format(type(optimizer).__name__)
    super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking)

    self._optimizer = optimizer
    self._allreduce_grads = _make_allreduce_grads_fn(
        name, device_dense, device_sparse, compression, sparse_as_dense, op)

    if not _executing_eagerly():
        self._agg_helper = LocalGradientAggregationHelper(
            aggregation_frequency=aggregation_frequency,
            allreduce_func=self._allreduce_grads,
            sparse_as_dense=sparse_as_dense,
            grad_updated_sizes_dict=None,
            average_aggregated_gradients=average_aggregated_gradients)
def _make_broadcast_group_fn():
    def broadcast_group(variables, root_rank):
        return [var.assign(broadcast(var, root_rank)) for var in variables]

    if _executing_eagerly():
        return _make_subgraph(broadcast_group)
    else:
        return broadcast_group
def apply_gradients(self, *args, **kwargs):
    """Applies gradients, delegating to the local gradient aggregation helper
    in graph mode and to the wrapped optimizer directly in eager mode."""
    if _executing_eagerly():
        return self._optimizer.apply_gradients(*args, **kwargs)
    else:
        return self._agg_helper.apply_gradients(
            lambda: self._optimizer.apply_gradients(*args, **kwargs),
            *args, **kwargs)
def test_horovod_allreduce_grad_gpu(self):
    """Test the correctness of the allreduce gradient on GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    if os.environ.get('HOROVOD_MIXED_INSTALL'):
        # Skip if compiled with CUDA but without HOROVOD_GPU_ALLREDUCE.
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        with tf.device("/gpu:%d" % local_rank):
            tf.set_random_seed(1234)
            if _executing_eagerly():
                tensor = self.tfe.Variable(
                    tf.random_uniform([5] * dim, -100, 100, dtype=dtype))
                with tf.GradientTape() as tape:
                    summed = hvd.allreduce(tensor, average=False)
            else:
                tensor = tf.random_uniform([5] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)

            grad_ys = tf.ones([5] * dim)
            if _executing_eagerly():
                grad_out = tape.gradient(summed, tensor, grad_ys)
            else:
                grad = tf.gradients(summed, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

        expected = np.ones([5] * dim) * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def evaluate(self, tensors):
    if _executing_eagerly():
        return self._eval_helper(tensors)
    sess = ops.get_default_session()
    if sess is None:
        with self.test_session(config=config) as sess:
            return sess.run(tensors)
    else:
        return sess.run(tensors)
def test_horovod_broadcast_grad_cpu(self):
    """Test the correctness of the broadcast gradient on CPU."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        if _executing_eagerly():
            tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
        else:
            tensor = tf.ones([5] * dim) * rank
        if dtype == tf.bool:
            tensor = tensor % 2
        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            with tf.device("/cpu:0"):
                grad_out = tape.gradient(broadcasted_tensor, tensor)
        else:
            tensor = tf.cast(tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            grad_ys = tf.ones([5] * dim)
            with tf.device("/cpu:0"):
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        c = size if rank == root_rank else 0
        expected = np.ones([5] * dim) * c
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def test_horovod_allgather_grad_cpu(self):
    """Test the correctness of the allgather gradient on CPU."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = self.tfe.Variable(
                    tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank)
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)
            with tf.device("/cpu:0"):
                grad_out = tape.gradient(gathered, tensor, grad_ys)
        else:
            tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            grad_list = []
            for r, tensor_size in enumerate(tensor_sizes):
                g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                grad_list.append(g)
            grad_ys = tf.concat(grad_list, axis=0)

            with tf.device("/cpu:0"):
                grad = tf.gradients(gathered, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        expected = np.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def assign(self, variables, values):
    if _executing_eagerly():
        for var, val in zip(variables, values):
            var.assign(val)
    else:
        sess = ops.get_default_session()
        if sess is None:
            with self.test_session(config=config) as sess:
                for var, val in zip(variables, values):
                    var.load(val, sess)
        else:
            for var, val in zip(variables, values):
                var.load(val, sess)
def _allreduce(tensor, name=None):
    """An op which sums an input tensor over all the Horovod processes.

    The reduction operation is keyed by the name of the op. The tensor type
    and shape must be the same on all Horovod processes for a given name. The
    reduction will not start until all processes are ready to send and receive
    the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, summed across all
      processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodAllreduce_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_allreduce(tensor, name=name)
def _make_broadcast_group_fn():
    if _executing_eagerly():
        # Eager mode requires Tensor
        def broadcast_group(variables, root_rank):
            return [var.assign(broadcast(var, root_rank))
                    for var in variables]

        return _make_subgraph(broadcast_group)
    else:
        # Graph mode requires an Op
        def broadcast_group(variables, root_rank):
            return tf.group(*[var.assign(broadcast(var, root_rank))
                              for var in variables])

        return broadcast_group
def _make_broadcast_group_fn():
    if _executing_eagerly():
        # Eager mode will parallelize independent control flow
        def broadcast_group(variables, root_rank):
            for var in variables:
                var.assign(broadcast(var, root_rank))

        return _make_subgraph(broadcast_group)
    else:
        # Graph mode requires an Op
        def broadcast_group(variables, root_rank):
            return tf.group(*[var.assign(broadcast(var, root_rank))
                              for var in variables])

        return broadcast_group
def broadcast(tensor, root_rank, name=None):
    """An op which broadcasts the input tensor on root rank to the same input
    tensor on all other Horovod processes.

    The broadcast operation is keyed by the name of the op. The tensor type
    and shape must be the same on all Horovod processes for a given name. The
    broadcast will not start until all processes are ready to send and receive
    the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, with the value
      broadcasted from root rank.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodBroadcast_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_broadcast(tensor, name=name, root_rank=root_rank)
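# Usage sketch: broadcast is typically used to synchronize initial state from
# one rank. A minimal example, assuming a working Horovod install:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
t = tf.ones([3]) * hvd.rank()
# After this, every rank holds rank 0's values (all zeros here).
synced = hvd.broadcast(t, root_rank=0)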
def broadcast_global_variables(root_rank):
    """Broadcasts all global variables from root rank to all other processes.

    **NOTE:** deprecated in TensorFlow 2.0.

    Arguments:
        root_rank: rank of the process from which global variables will be
                   broadcasted to all other processes.
    """
    if _executing_eagerly():
        raise RuntimeError(
            "hvd.broadcast_global_variables() does not support eager execution. "
            "Please use `hvd.broadcast_variables(<model/optimizer variables>)` instead."
        )
    return broadcast_variables(_global_variables(), root_rank)
def broadcast_global_variables(root_rank):
    """Broadcasts all global variables from root rank to all other processes.

    **NOTE:** deprecated in TensorFlow 2.0.

    Arguments:
        root_rank: rank of the process from which global variables will be
                   broadcasted to all other processes.
    """
    if _executing_eagerly():
        raise RuntimeError(
            "Eager Execution is not supported by `hvd.BroadcastGlobalVariablesHook`\n"
            "We recommend using `hvd.DistributedGradientTape` instead"
        )
    return broadcast_variables(_global_variables(), root_rank)
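# Typical TF1-style graph-mode usage for the helper above (a sketch; session
# and model setup details vary by application):
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
# ... build model and optimizer here ...
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Ensure all workers start from rank 0's initial variable values.
    sess.run(hvd.broadcast_global_variables(0))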
def allgather(tensor, name=None):
    """An op which concatenates the input tensor with the same input tensor on
    all other Horovod processes.

    The concatenation is done on the first dimension, so the input tensors on
    the different processes must have the same rank and shape, except for the
    first dimension, which is allowed to be different.

    Returns:
      A tensor of the same type as `tensor`, concatenated on dimension zero
      across all processes. The shape is identical to the input shape, except
      for the first dimension, which may be greater and is the sum of all
      first dimensions of the tensors in different Horovod processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodAllgather_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_allgather(tensor, name=name)
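# Usage sketch: allgather permits ragged first dimensions across workers.
# Assuming a working Horovod install:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
# Rank r contributes r + 1 rows; the gathered first dimension is the sum
# 1 + 2 + ... + size over all ranks.
t = tf.ones([hvd.rank() + 1, 3]) * hvd.rank()
gathered = hvd.allgather(t)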
def _make_inplace_broadcast_group_fn():
    if _executing_eagerly():
        # These are just a few calls of broadcast_, no need to aggregate them
        # in a tf.function
        def broadcast_group(variable_lists, root_rank, process_set: ProcessSet):
            for variables in variable_lists:
                broadcast_(variables, root_rank, process_set=process_set)

        return broadcast_group
    else:
        # Graph mode requires an Op
        def broadcast_group(variable_lists, root_rank, process_set: ProcessSet):
            return tf.group(*[broadcast_(variables, root_rank,
                                         process_set=process_set)
                              for variables in variable_lists])

        return broadcast_group
def _make_allreduce_grads_fn(name, device_dense, device_sparse,
                             compression, sparse_as_dense):
    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]
            return [allreduce(grad,
                              device_dense=device_dense,
                              device_sparse=device_sparse,
                              compression=compression)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
def compute_gradients(self, *args, **kwargs):
    """Compute gradients of all trainable variables.

    See Optimizer.compute_gradients() for more info.

    In DistributedOptimizer, compute_gradients() is overridden to also
    allreduce the gradients before returning them.
    """
    gradients = self._optimizer.compute_gradients(*args, **kwargs)
    if size() > 1:
        self.grads, vars = zip(*gradients)
        if _executing_eagerly():
            allreduced_grads = self._allreduce_grads(self.grads)
        else:
            self._agg_helper.init_aggregation_vars(self.grads)
            allreduced_grads = self._agg_helper.compute_gradients(self.grads)
        return list(zip(allreduced_grads, vars))
    else:
        return gradients
def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
             device_sparse='', compression=Compression.none,
             sparse_as_dense=False):
    """Construct a new DistributedOptimizer, which uses another optimizer
    under the hood for computing single-process gradient values and
    applying gradient updates after the gradient values have been averaged
    across all the Horovod ranks.

    Args:
      optimizer:
        Optimizer to use for computing gradients and applying updates.
      name:
        Optional name prefix for the operations created when applying
        gradients. Defaults to "Distributed" followed by the provided
        optimizer type.
      use_locking:
        Whether to use locking when updating variables.
        See Optimizer.__init__ for more info.
      device_dense:
        Device to be used for dense tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_ALLREDUCE.
      device_sparse:
        Device to be used for sparse tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_ALLGATHER.
      compression:
        Compression algorithm used during allreduce to reduce the amount
        of data sent during each parameter update step. Defaults to not
        using compression.
      sparse_as_dense:
        Treat all sparse gradients as dense tensors. This can help improve
        performance and memory utilization if the original sparse gradient
        has high density. Defaults to false.
    """
    if name is None:
        name = "Distributed{}".format(type(optimizer).__name__)

    self._optimizer = optimizer
    self._device_dense = device_dense
    self._device_sparse = device_sparse
    self._compression = compression
    self._sparse_as_dense = sparse_as_dense

    def allreduce_grads(grads):
        with tf.name_scope(self._name + "_Allreduce"):
            if self._sparse_as_dense:
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]
            return [allreduce(grad,
                              device_dense=self._device_dense,
                              device_sparse=self._device_sparse,
                              compression=self._compression)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        self._allreduce_grads = tf.contrib.eager.defun(allreduce_grads)
    else:
        self._allreduce_grads = allreduce_grads

    super(DistributedOptimizer, self).__init__(name=name, use_locking=use_locking)
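# A conventional TF1-era usage sketch for this wrapper (hedged; `loss` and
# the training loop are application-specific placeholders):
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
# Scale the learning rate by the number of workers, as is customary when
# averaging gradients across ranks.
opt = tf.train.GradientDescentOptimizer(0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True)
train_op = opt.minimize(loss)  # `loss` assumed defined by the model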
def to_numpy(v):
    if not _executing_eagerly():
        sess = session or ops.get_default_session()
        return sess.run(v)
    else:
        return v.numpy()
def _make_cached_allreduce_grads_fn(name, device_dense, device_sparse,
                                    compression, sparse_as_dense, op,
                                    gradient_predivide_factor, groups):
    groups = refs_to_vars(groups) if isinstance(groups, tuple) else groups
    if op == Average:
        # Split average operation across pre/postscale factors.
        # C++ backend will apply additional 1 / size() factor to
        # postscale_factor for op == Average.
        prescale_factor = 1.0 / gradient_predivide_factor
        postscale_factor = gradient_predivide_factor
    else:
        prescale_factor = 1.0
        postscale_factor = 1.0

    def allreduce_grads(grads, vars=None):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]

            if groups is not None:
                if isinstance(groups, list):
                    var_name2grad = {}
                    for i in range(len(vars)):
                        var = vars[i]
                        grad = grads[i]
                        if grad is not None:
                            var_name2grad[var.name] = (i, grad)
                    grads_split = []
                    for group in groups:
                        grad_group = []
                        for var in group:
                            if var.name in var_name2grad:
                                grad_group.append(var_name2grad[var.name])
                                del var_name2grad[var.name]
                        grads_split.append(grad_group)
                    for _, grad in var_name2grad.items():
                        grads_split.append([grad])
                elif groups > 0:
                    grads_clean = [(i, grad) for i, grad in enumerate(grads)
                                   if grad is not None]
                    grads_split = split_list(grads_clean, groups)

                reduce_ops = [None] * len(vars)
                for group in grads_split:
                    index_group, grad_group = [list(t) for t in zip(*group)]
                    reduce_ops_group = _grouped_allreduce_cond(
                        grad_group,
                        device_dense=device_dense,
                        device_sparse=device_sparse,
                        compression=compression,
                        op=op,
                        prescale_factor=prescale_factor,
                        postscale_factor=postscale_factor)
                    for i in range(len(index_group)):
                        reduce_ops[index_group[i]] = reduce_ops_group[i]
                return reduce_ops

            return [_allreduce_cond(grad,
                                    device_dense=device_dense,
                                    device_sparse=device_sparse,
                                    compression=compression,
                                    op=op,
                                    prescale_factor=prescale_factor,
                                    postscale_factor=postscale_factor)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
def test_horovod_allgather_grad_gpu(self):
    """Test the correctness of the allgather gradient on GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        self.skipTest("No GPUs available")

    if os.environ.get('HOROVOD_MIXED_INSTALL'):
        # Skip if compiled with CUDA but without HOROVOD_GPU_ALLREDUCE.
        self.skipTest("Not compiled with HOROVOD_GPU_ALLREDUCE")

    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = self.tfe.Variable(
                    tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank)
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)
            with tf.device("/gpu:%d" % local_rank):
                grad_out = tape.gradient(gathered, tensor, grad_ys)
        else:
            tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            grad_list = []
            for r, tensor_size in enumerate(tensor_sizes):
                g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                grad_list.append(g)
            grad_ys = tf.concat(grad_list, axis=0)

            with tf.device("/gpu:%d" % local_rank):
                grad = tf.gradients(gathered, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        expected = np.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))