Example #1
    def test_horovod_allreduce_grad_cpu(self):
        """Test the correctness of the allreduce gradient on CPU."""
        hvd.init()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                if _executing_eagerly():
                    tensor = self.tfe.Variable(
                        self.random_uniform([5] * dim, -100, 100, dtype=dtype))
                    with tf.GradientTape() as tape:
                        summed = hvd.allreduce(tensor, average=False)
                else:
                    tensor = self.random_uniform([5] * dim,
                                                 -100,
                                                 100,
                                                 dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)

                grad_ys = tf.ones([5] * dim)
                if _executing_eagerly():
                    grad_out = tape.gradient(summed, tensor, grad_ys)
                else:
                    grad = tf.gradients(summed, tensor, grad_ys)[0]
                    grad_out = self.evaluate(grad)

            expected = np.ones([5] * dim) * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #2
    def test_horovod_broadcast_grad_gpu(self):
        """Test the correctness of the broadcast gradient on GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        if os.environ.get('HOROVOD_MIXED_INSTALL'):
            # Skip if compiled with CUDA but without HOROVOD_GPU_BROADCAST.
            return

        hvd.init()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            if _executing_eagerly():
                tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
            else:
                tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = tf.cast(tensor, dtype=dtype)
                    broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                with tf.device("/gpu:%d" % local_rank):
                    grad_out = tape.gradient(broadcasted_tensor, tensor)
            else:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                with tf.device("/gpu:%d" % local_rank):
                    grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #3
def _allreduce(tensor,
               name=None,
               op=Sum,
               prescale_factor=1.0,
               postscale_factor=1.0,
               ignore_name_scope=False):
    """An op which reduces an input tensor over all the Horovod processes. The
    default reduction is a sum.

    The reduction operation is keyed by the name of the op. The tensor type and
    shape must be the same on all Horovod processes for a given name. The reduction
    will not start until all processes are ready to send and receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, summed across all
      processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodAllreduce_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_allreduce(tensor,
                                     name=name,
                                     reduce_op=op,
                                     prescale_factor=prescale_factor,
                                     postscale_factor=postscale_factor,
                                     ignore_name_scope=ignore_name_scope)
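For context, a hedged usage sketch of the scaling hooks above: recent Horovod releases forward op, prescale_factor, and postscale_factor from the public hvd.allreduce wrapper to this op, so a sum can be turned into a mean with a single post-reduction scale. Treat the exact keyword availability as version dependent.

# A minimal sketch, assuming a Horovod version whose public hvd.allreduce
# accepts op/prescale_factor/postscale_factor and forwards them to _allreduce.
# Launch with e.g.: horovodrun -np 2 python allreduce_sketch.py
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Each rank contributes a tensor filled with its own rank id.
x = tf.fill([4], float(hvd.rank()))

summed = hvd.allreduce(x, op=hvd.Sum)                # 0 + 1 + ... + (size - 1) in every slot
mean = hvd.allreduce(x, op=hvd.Sum,
                     postscale_factor=1.0 / hvd.size())  # scaled once, after the reduction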
Example #4
def _reducescatter(tensor,
                   name=None,
                   op=Sum,
                   ignore_name_scope=False,
                   process_set=global_process_set):
    """An op which sums an input tensor over all the Horovod processes, then
    scatters the result across all the Horovod processes.

    The reduction operation is keyed by the name of the op. The tensor type and
    shape must be the same on all Horovod processes for a given name. The reduction
    will not start until all processes are ready to send and receive the tensor.

    Returns:
      A tensor of the same rank and type as `tensor`. The shape is identical to the
        input shape, except for the first dimension, which will be divided across
        the different Horovod processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodReducescatter_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_reducescatter(
        tensor,
        name=name,
        reduce_op=op,
        ignore_name_scope=ignore_name_scope,
        process_set_id=process_set.process_set_id)
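The shape contract is the main thing to keep in mind here. A hedged sketch, assuming the public hvd.reducescatter wrapper available in newer Horovod releases and a first dimension that divides evenly by the process count:

# A minimal sketch, assuming hvd.reducescatter is exposed publicly (newer
# Horovod releases) and the first dimension divides evenly by hvd.size().
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
size = hvd.size()

x = tf.ones([4 * size, 8])            # identical shape on every rank
y = hvd.reducescatter(x, op=hvd.Sum)  # each rank keeps a [4, 8] slice of the elementwise sum
print(hvd.rank(), y.shape)            # -> (4, 8); every element equals size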
Example #5
def _grouped_allreduce(tensors,
                       name=None,
                       op=Sum,
                       prescale_factor=1.0,
                       postscale_factor=1.0,
                       ignore_name_scope=False):
    """An op which reduces input tensors over all the Horovod processes. The
    default reduction is a sum.

    The reduction operations are keyed by the name of the op. Reductions are
    performed across tensors in the same list position. The tensor type and
    shape must be the same on all Horovod processes for tensors sharing
    positions in the input tensor list. The reduction will not start until all
    processes are ready to send and receive the tensors.

    Returns:
      A list of tensors of the same shape and type as those in `tensors`,
      summed across all processes.
    """
    if name is None and not _executing_eagerly():
        name = _normalize_name('HorovodGroupedAllreduce_%s_%s' %
                               (tensors[0].name, tensors[-1].name))
    return MPI_LIB.horovod_grouped_allreduce(
        tensors,
        name=name,
        reduce_op=op,
        prescale_factor=prescale_factor,
        postscale_factor=postscale_factor,
        ignore_name_scope=ignore_name_scope)
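A hedged usage sketch of the grouped form via the public hvd.grouped_allreduce wrapper (assumed to forward to the op above): tensors are reduced position by position in one fused Horovod call, and the output list preserves order and shapes.

# A minimal sketch, assuming the public hvd.grouped_allreduce wrapper.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

grads = [tf.ones([3, 3]), tf.ones([5]), tf.ones([2, 4])]
reduced = hvd.grouped_allreduce(grads, op=hvd.Sum)   # one fused op instead of three allreduces
assert all(r.shape == g.shape for r, g in zip(reduced, grads))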
Example #6
        def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
                    device_sparse='', compression=Compression.none,
                    sparse_as_dense=False, op=Average, gradient_predivide_factor=1.0,
                    backward_passes_per_step=1, average_aggregated_gradients=False):
            if name is None:
                name = "Distributed{}".format(type(optimizer).__name__)
            super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking)

            self._optimizer = optimizer
            self._allreduce_grads = _make_allreduce_grads_fn(
                name, device_dense, device_sparse, compression, sparse_as_dense, op,
                gradient_predivide_factor)

            self._agg_helper = None
            if backward_passes_per_step > 1:
                if _executing_eagerly():
                    raise ValueError(
                        "backward_passes_per_step > 1 is not yet supported "
                        "for _LegacyOptimizer with eager execution."
                    )

                self._agg_helper = LocalGradientAggregationHelper(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                    rank=rank(),
                    optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY,
                )
Example #7
def _make_allreduce_grads_fn(name, device_dense, device_sparse, compression,
                             sparse_as_dense, op, gradient_predivide_factor):
    if op == Average:
        # Split average operation across pre/postscale factors
        # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average.
        prescale_factor = 1.0 / gradient_predivide_factor
        postscale_factor = gradient_predivide_factor
    else:
        prescale_factor = 1.0
        postscale_factor = 1.0

    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [
                    tf.convert_to_tensor(grad) if grad is not None
                    and isinstance(grad, tf.IndexedSlices) else grad
                    for grad in grads
                ]

            return [
                _allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor)
                if grad is not None else grad for grad in grads
            ]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
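The pre/postscale split above is worth spelling out: with op == Average and gradient_predivide_factor f, each rank's gradient is scaled by 1/f before the reduction, the result is scaled by f afterwards, and the backend adds its own 1/size factor, so the net result is an average for any f. A small numeric check with hypothetical values:

# Pure-Python check of the factor split; no Horovod needed.
size = 4
grad_sum = 10.0                       # pretend sum of the per-rank gradients

for f in (1.0, 2.0, float(size)):
    prescale = 1.0 / f                # applied to each rank's tensor before the reduce
    postscale = f * (1.0 / size)      # user factor times the backend's extra 1/size
    assert abs(grad_sum * prescale * postscale - grad_sum / size) < 1e-12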
Example #8
def alltoall(tensor, splits=None, name=None, ignore_name_scope=False):
    """An op that scatters slices of the input tensor to all other Horovod processes
    and returns a tensor of gathered slices from all other Horovod processes.

    The slicing is done on the first dimension, so the input tensors on the
    different processes must have the same rank and shape, except for the first
    dimension, which is allowed to be different.

    Arguments:
        tensor: A tensor to distribute with alltoall.
        splits: A tensor of integers in rank order describing how many
                elements in `tensor` to send to each worker.  Splitting is
                applied along the first dimension of `tensor`. If `splits` is
                not provided, the first dimension is split equally by the
                number of Horovod processes.
        name: A name of the alltoall operation.
        ignore_name_scope: If True, ignores any outer name scope applied by
                           TensorFlow in the name used by the Horovod operation.

    Returns:
      A tensor of the same type as `tensor`, concatenated on dimension zero
      across all processes. The shape is identical to the input shape, except for
      the first dimension, which may be greater and is the sum of all first
      dimensions of the gathered tensor slices from different Horovod processes.
    """
    # If splits not provided, create empty tensor as placeholder
    splits_ = tf.convert_to_tensor(
        splits) if splits is not None else tf.constant([], dtype=tf.int32)

    if name is None and not _executing_eagerly():
        name = 'HorovodAlltoall_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_alltoall(tensor,
                                    splits=splits_,
                                    name=name,
                                    ignore_name_scope=ignore_name_scope)
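A hedged usage sketch with explicit splits, assuming two processes: rank r sends splits[i] rows to rank i and receives the matching rows from every peer, concatenated on dimension zero.

# A minimal sketch; launch with: horovodrun -np 2 python alltoall_sketch.py
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
rank = hvd.rank()

x = tf.fill([4, 2], float(rank))      # 4 rows, each filled with this rank's id
splits = tf.constant([1, 3])          # send 1 row to rank 0 and 3 rows to rank 1

result = hvd.alltoall(x, splits=splits)
# Rank 0 receives 1 row from each peer -> shape [2, 2];
# rank 1 receives 3 rows from each peer -> shape [6, 2].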
Example #9
        def __init__(self,
                     optimizer,
                     name=None,
                     use_locking=False,
                     device_dense='',
                     device_sparse='',
                     compression=Compression.none,
                     sparse_as_dense=False,
                     op=Average,
                     aggregation_frequency=1,
                     average_aggregated_gradients=True):
            if name is None:
                name = "Distributed{}".format(type(optimizer).__name__)
            super(_DistributedOptimizer,
                  self).__init__(name=name, use_locking=use_locking)

            self._optimizer = optimizer
            self._allreduce_grads = _make_allreduce_grads_fn(
                name, device_dense, device_sparse, compression,
                sparse_as_dense, op)

            if not _executing_eagerly():
                self._agg_helper = LocalGradientAggregationHelper(
                    aggregation_frequency=aggregation_frequency,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    grad_updated_sizes_dict=None,
                    average_aggregated_gradients=average_aggregated_gradients)
Example #10
def _make_broadcast_group_fn():
    def broadcast_group(variables, root_rank):
        return [var.assign(broadcast(var, root_rank)) for var in variables]

    if _executing_eagerly():
        return _make_subgraph(broadcast_group)
    else:
        return broadcast_group
Example #11
 def apply_gradients(self, *args, **kwargs):
     """Calls this same method from the local gradient aggregation helper."""
     if _executing_eagerly():
         return self._optimizer.apply_gradients(*args, **kwargs)
     else:
         return self._agg_helper.apply_gradients(
             lambda: self._optimizer.apply_gradients(*args, **kwargs),
             *args, **kwargs)
Example #12
    def test_horovod_allreduce_grad_gpu(self):
        """Test the correctness of the allreduce gradient on GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        if os.environ.get('HOROVOD_MIXED_INSTALL'):
            # Skip if compiled with CUDA but without HOROVOD_GPU_ALLREDUCE.
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/gpu:%d" % local_rank):
                tf.set_random_seed(1234)
                if _executing_eagerly():
                    tensor = self.tfe.Variable(
                        tf.random_uniform([5] * dim, -100, 100, dtype=dtype))
                    with tf.GradientTape() as tape:
                        summed = hvd.allreduce(tensor, average=False)
                else:
                    tensor = tf.random_uniform([5] * dim,
                                               -100,
                                               100,
                                               dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)

                grad_ys = tf.ones([5] * dim)
                if _executing_eagerly():
                    grad_out = tape.gradient(summed, tensor, grad_ys)
                else:
                    grad = tf.gradients(summed, tensor, grad_ys)[0]
                    grad_out = self.evaluate(grad)

            expected = np.ones([5] * dim) * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #13
 def evaluate(self, tensors):
     if _executing_eagerly():
         return self._eval_helper(tensors)
     sess = ops.get_default_session()
     if sess is None:
         with self.test_session(config=config) as sess:
             return sess.run(tensors)
     else:
         return sess.run(tensors)
Example #14
    def test_horovod_broadcast_grad_cpu(self):
        """Test the correctness of the broadcast gradient on CPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            if _executing_eagerly():
                tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
            else:
                tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = tf.cast(tensor, dtype=dtype)
                    broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                with tf.device("/cpu:0"):
                    grad_out = tape.gradient(broadcasted_tensor, tensor)
            else:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                with tf.device("/cpu:0"):
                    grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #15
    def test_horovod_allgather_grad_cpu(self):
        """Test the correctness of the allgather gradient on CPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = self.tfe.Variable(
                        tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) *
                        rank)
                    if dtype == tf.bool:
                        tensor = tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
                    gathered = hvd.allgather(tensor)
                    grad_list = []
                    for r, tensor_size in enumerate(tensor_sizes):
                        g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                        grad_list.append(g)
                    grad_ys = tf.concat(grad_list, axis=0)
                with tf.device("/cpu:0"):
                    grad_out = tape.gradient(gathered, tensor, grad_ys)
            else:
                tensor = tf.ones([tensor_sizes[rank]] + [17] *
                                 (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                with tf.device("/cpu:0"):
                    grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            expected = np.ones([tensor_sizes[rank]] + [17] *
                               (dim - 1)) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #16
 def assign(self, variables, values):
     if _executing_eagerly():
         for var, val in zip(variables, values):
             var.assign(val)
     else:
         sess = ops.get_default_session()
         if sess is None:
             with self.test_session(config=config) as sess:
                 for var, val in zip(variables, values):
                     var.load(val, sess)
         else:
             for var, val in zip(variables, values):
                 var.load(val, sess)
Example #17
def _allreduce(tensor, name=None):
    """An op which sums an input tensor over all the Horovod processes.

    The reduction operation is keyed by the name of the op. The tensor type and
    shape must be the same on all Horovod processes for a given name. The reduction
    will not start until all processes are ready to send and receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, summed across all
      processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodAllreduce_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_allreduce(tensor, name=name)
Example #18
def _make_broadcast_group_fn():
    if _executing_eagerly():
        # Eager mode requires Tensor
        def broadcast_group(variables, root_rank):
            return [var.assign(broadcast(var, root_rank)) for var in variables]

        return _make_subgraph(broadcast_group)
    else:
        # Graph mode requires an Op
        def broadcast_group(variables, root_rank):
            return tf.group(
                *[var.assign(broadcast(var, root_rank)) for var in variables])

        return broadcast_group
Example #19
def _make_broadcast_group_fn():
    if _executing_eagerly():
        # Eager mode will parallelize independent control flow
        def broadcast_group(variables, root_rank):
            for var in variables:
                var.assign(broadcast(var, root_rank))

        return _make_subgraph(broadcast_group)
    else:
        # Graph mode requires an Op
        def broadcast_group(variables, root_rank):
            return tf.group(
                *[var.assign(broadcast(var, root_rank)) for var in variables])

        return broadcast_group
Example #20
def broadcast(tensor, root_rank, name=None):
    """An op which broadcasts the input tensor on root rank to the same input tensor
    on all other Horovod processes.

    The broadcast operation is keyed by the name of the op. The tensor type and
    shape must be the same on all Horovod processes for a given name. The broadcast
    will not start until all processes are ready to send and receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, with the value broadcasted
      from root rank.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodBroadcast_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_broadcast(tensor, name=name, root_rank=root_rank)
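A hedged usage sketch: all ranks call broadcast with the same root_rank, and every rank ends up holding root's value.

# A minimal sketch of hvd.broadcast.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

x = tf.constant([1.0, 2.0, 3.0]) * hvd.rank()  # differs per rank
y = hvd.broadcast(x, root_rank=0)              # identical to rank 0's tensor on every rank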
Example #21
    def broadcast_global_variables(root_rank):
        """Broadcasts all global variables from root rank to all other processes.

        **NOTE:** deprecated in TensorFlow 2.0.

        Arguments:
            root_rank: rank of the process from which global variables will be broadcasted
                       to all other processes.
        """
        if _executing_eagerly():
            raise RuntimeError(
                "hvd.broadcast_global_variables() does not support eager execution. "
                "Please use `hvd.broadcast_variables(<model/optimizer variables>)` instead."
            )

        return broadcast_variables(_global_variables(), root_rank)
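In eager mode the error above points to broadcast_variables instead; a hedged sketch of that alternative, syncing model and optimizer state from rank 0 (the tf.keras model here is only for illustration):

# A minimal eager-mode sketch using hvd.broadcast_variables instead of the
# deprecated global-variables broadcast.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.build(input_shape=(None, 4))
optimizer = tf.keras.optimizers.SGD(0.01)

# Typically done once, after the first step so that optimizer slots exist.
hvd.broadcast_variables(model.variables, root_rank=0)
hvd.broadcast_variables(optimizer.variables(), root_rank=0)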
Example #22
    def broadcast_global_variables(root_rank):
        """Broadcasts all global variables from root rank to all other processes.

        **NOTE:** deprecated in TensorFlow 2.0.

        Arguments:
            root_rank: rank of the process from which global variables will be broadcasted
                       to all other processes.
        """
        if _executing_eagerly():
            raise RuntimeError(
                "Eager Execution is not supported by `hvd.BroadcastGlobalVariablesHook`\n"
                "We recommend using `hvd.DistributedGradientTape` instead"
            )

        return broadcast_variables(_global_variables(), root_rank)
Example #23
def allgather(tensor, name=None):
    """An op which concatenates the input tensor with the same input tensor on
    all other Horovod processes.

    The concatenation is done on the first dimension, so the input tensors on the
    different processes must have the same rank and shape, except for the first
    dimension, which is allowed to be different.

    Returns:
      A tensor of the same type as `tensor`, concatenated on dimension zero
      across all processes. The shape is identical to the input shape, except for
      the first dimension, which may be greater and is the sum of all first
      dimensions of the tensors in different Horovod processes.
    """
    if name is None and not _executing_eagerly():
        name = 'HorovodAllgather_%s' % _normalize_name(tensor.name)
    return MPI_LIB.horovod_allgather(tensor, name=name)
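A hedged usage sketch: only the first dimension may differ across ranks, and every rank receives the full concatenation.

# A minimal sketch of hvd.allgather.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
rank, size = hvd.rank(), hvd.size()

x = tf.ones([rank + 1, 3]) * rank   # rank r contributes r + 1 rows
gathered = hvd.allgather(x)
# First dimension on every rank: 1 + 2 + ... + size = size * (size + 1) // 2.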
Example #24
def _make_inplace_broadcast_group_fn():
    if _executing_eagerly():
        # These are just a few calls of broadcast_, no need to aggregate them in a tf.function
        def broadcast_group(variable_lists, root_rank,
                            process_set: ProcessSet):
            for variables in variable_lists:
                broadcast_(variables, root_rank, process_set=process_set)

        return broadcast_group
    else:
        # Graph mode requires an Op
        def broadcast_group(variable_lists, root_rank,
                            process_set: ProcessSet):
            return tf.group(*[
                broadcast_(variables, root_rank, process_set=process_set)
                for variables in variable_lists
            ])

        return broadcast_group
Example #25
def _make_allreduce_grads_fn(name, device_dense, device_sparse,
                             compression, sparse_as_dense):
    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]

            return [allreduce(grad,
                              device_dense=device_dense,
                              device_sparse=device_sparse,
                              compression=compression)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
Example #26
        def compute_gradients(self, *args, **kwargs):
            """Compute gradients of all trainable variables.

            See Optimizer.compute_gradients() for more info.

            In DistributedOptimizer, compute_gradients() is overridden to also
            allreduce the gradients before returning them.
            """
            gradients = self._optimizer.compute_gradients(*args, **kwargs)
            if size() > 1:
                self.grads, vars = zip(*gradients)

                if _executing_eagerly():
                    allreduced_grads = self._allreduce_grads(self.grads)
                else:
                    self._agg_helper.init_aggregation_vars(self.grads)
                    allreduced_grads = self._agg_helper.compute_gradients(
                        self.grads)
                return list(zip(allreduced_grads, vars))
            else:
                return gradients
Example #27
    def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
                 device_sparse='', compression=Compression.none,
                 sparse_as_dense=False):
        """Construct a new DistributedOptimizer, which uses another optimizer
        under the hood for computing single-process gradient values and
        applying gradient updates after the gradient values have been averaged
        across all the Horovod ranks.

        Args:
          optimizer:
            Optimizer to use for computing gradients and applying updates.
          name:
            Optional name prefix for the operations created when applying
            gradients. Defaults to "Distributed" followed by the provided
            optimizer type.
          use_locking:
            Whether to use locking when updating variables.
            See Optimizer.__init__ for more info.
          device_dense:
            Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_ALLREDUCE.
          device_sparse:
            Device to be used for sparse tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_ALLGATHER.
          compression:
            Compression algorithm used during allreduce to reduce the amount
            of data sent during each parameter update step.  Defaults to
            not using compression.
          sparse_as_dense:
            Treat all sparse gradients as dense tensors.  This can help improve
            performance and memory utilization if the original sparse gradient
            has high density.  Defaults to false.
        """
        if name is None:
            name = "Distributed{}".format(type(optimizer).__name__)

        self._optimizer = optimizer
        self._device_dense = device_dense
        self._device_sparse = device_sparse
        self._compression = compression
        self._sparse_as_dense = sparse_as_dense

        def allreduce_grads(grads):
            with tf.name_scope(self._name + "_Allreduce"):
                if self._sparse_as_dense:
                    grads = [tf.convert_to_tensor(grad)
                             if grad is not None and isinstance(grad, tf.IndexedSlices)
                             else grad for grad in grads]

                return [allreduce(grad,
                                  device_dense=self._device_dense,
                                  device_sparse=self._device_sparse,
                                  compression=self._compression)
                        if grad is not None else grad
                        for grad in grads]

        if _executing_eagerly():
            self._allreduce_grads = tf.contrib.eager.defun(allreduce_grads)
        else:
            self._allreduce_grads = allreduce_grads

        super(DistributedOptimizer, self).__init__(
            name=name, use_locking=use_locking)
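A hedged end-to-end sketch of how this legacy-optimizer wrapper is typically used in TF1 graph mode (the tiny model and placeholder feed are only for illustration):

# A minimal TF1 graph-mode sketch; launch with horovodrun.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

x = tf.placeholder(tf.float32, [None, 4])
w = tf.Variable(tf.zeros([4, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - 1.0))

opt = tf.train.GradientDescentOptimizer(0.01 * hvd.size())  # common: scale LR by worker count
opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True)
train_op = opt.minimize(loss)                               # gradients are allreduced here

hooks = [hvd.BroadcastGlobalVariablesHook(0)]               # sync initial state from rank 0
with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    sess.run(train_op, feed_dict={x: [[1.0, 2.0, 3.0, 4.0]]})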
Example #28
 def to_numpy(v):
     if not _executing_eagerly():
         sess = session or ops.get_default_session()
         return sess.run(v)
     else:
         return v.numpy()
Example #29
def _make_cached_allreduce_grads_fn(name, device_dense, device_sparse,
                                    compression, sparse_as_dense, op,
                                    gradient_predivide_factor, groups):
    groups = refs_to_vars(groups) if isinstance(groups, tuple) else groups
    if op == Average:
        # Split average operation across pre/postscale factors
        # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average.
        prescale_factor = 1.0 / gradient_predivide_factor
        postscale_factor = gradient_predivide_factor
    else:
        prescale_factor = 1.0
        postscale_factor = 1.0

    def allreduce_grads(grads, vars=None):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [
                    tf.convert_to_tensor(grad) if grad is not None
                    and isinstance(grad, tf.IndexedSlices) else grad
                    for grad in grads
                ]

            if groups is not None:
                if isinstance(groups, list):
                    var_name2grad = {}
                    for i in range(len(vars)):
                        var = vars[i]
                        grad = grads[i]
                        if grad is not None:
                            var_name2grad[var.name] = (i, grad)
                    grads_split = []
                    for group in groups:
                        grad_group = []
                        for var in group:
                            if var.name in var_name2grad:
                                grad_group.append(var_name2grad[var.name])
                                del var_name2grad[var.name]
                        grads_split.append(grad_group)
                    for _, grad in var_name2grad.items():
                        grads_split.append([grad])
                elif groups > 0:
                    grads_clean = [(i, grad) for i, grad in enumerate(grads)
                                   if grad is not None]
                    grads_split = split_list(grads_clean, groups)

                reduce_ops = [None] * len(vars)
                for group in grads_split:
                    index_group, grad_group = [list(t) for t in zip(*group)]
                    reduce_ops_group = _grouped_allreduce_cond(
                        grad_group,
                        device_dense=device_dense,
                        device_sparse=device_sparse,
                        compression=compression,
                        op=op,
                        prescale_factor=prescale_factor,
                        postscale_factor=postscale_factor)
                    for i in range(len(index_group)):
                        reduce_ops[index_group[i]] = reduce_ops_group[i]
                return reduce_ops

            return [
                _allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor)
                if grad is not None else grad for grad in grads
            ]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
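The groups argument above typically surfaces on the public wrappers; a hedged sketch, assuming a Horovod version whose DistributedOptimizer accepts groups (an integer buckets gradients into that many fused allreduces, while a list of variable lists pins variables into explicit groups):

# A minimal sketch; `groups` availability depends on the Horovod version.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

opt = tf.train.GradientDescentOptimizer(0.01)

# Integer form: fuse gradients into at most 2 grouped allreduce calls.
opt_bucketed = hvd.DistributedOptimizer(opt, groups=2)

# Explicit form: each inner list becomes one fused call.
# (`model` stands in for whatever defines trainable_variables in real code.)
# opt_explicit = hvd.DistributedOptimizer(opt, groups=[model.trainable_variables])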
Example #30
    def test_horovod_allgather_grad_gpu(self):
        """Test the correctness of the allgather gradient on GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            self.skipTest(("No GPUs available"))

        if os.environ.get('HOROVOD_MIXED_INSTALL'):
            # Skip if compiled with CUDA but without HOROVOD_GPU_ALLREDUCE.
            self.skipTest("Not compiled with HOROVOD_GPU_ALLREDUCE")

        hvd.init()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = self.tfe.Variable(
                        tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) *
                        rank)
                    if dtype == tf.bool:
                        tensor = tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
                    gathered = hvd.allgather(tensor)
                    grad_list = []
                    for r, tensor_size in enumerate(tensor_sizes):
                        g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                        grad_list.append(g)
                    grad_ys = tf.concat(grad_list, axis=0)
                with tf.device("/gpu:%d" % local_rank):
                    grad_out = tape.gradient(gathered, tensor, grad_ys)
            else:
                tensor = tf.ones([tensor_sizes[rank]] + [17] *
                                 (dim - 1)) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_list = []
                for r, tensor_size in enumerate(tensor_sizes):
                    g = tf.ones([tensor_size] + [17] * (dim - 1)) * r
                    grad_list.append(g)
                grad_ys = tf.concat(grad_list, axis=0)

                with tf.device("/gpu:%d" % local_rank):
                    grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            expected = np.ones([tensor_sizes[rank]] + [17] *
                               (dim - 1)) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))