Example #1
    def test_bluefog_allreduce_grad_cpu(self):
        """Test the correctness of the allreduce gradient on CPU."""
        size = bf.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                if _executing_eagerly():
                    tensor = self.tfe.Variable(
                        random_uniform([5] * dim, -100, 100, dtype=dtype))
                    with tf.GradientTape() as tape:
                        summed = bf.allreduce(tensor, average=False)
                else:
                    tensor = random_uniform([5] * dim, -100, 100, dtype=dtype)
                    summed = bf.allreduce(tensor, average=False)

                grad_ys = tf.ones([5] * dim)
                if _executing_eagerly():
                    grad_out = tape.gradient(summed, tensor, grad_ys)
                else:
                    grad = tf.gradients(summed, tensor, grad_ys)[0]
                    grad_out = self.evaluate(grad)

            expected = np.ones([5] * dim) * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #2
    def evaluate(self, tensors):
        if _executing_eagerly():
            return self._eval_helper(tensors)
        sess = ops.get_default_session()
        if sess is None:
            # `config` is a session ConfigProto defined at module level in the
            # original test file.
            with self.test_session(config=config) as sess:
                return sess.run(tensors)
        else:
            return sess.run(tensors)
Example #3
    def test_bluefog_broadcast_grad_cpu(self):
        """Test the correctness of the broadcast gradient on CPU."""
        rank = bf.rank()
        size = bf.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            if _executing_eagerly():
                tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
            else:
                tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = tf.cast(tensor, dtype=dtype)
                    broadcasted_tensor = bf.broadcast(tensor, root_rank)
                with tf.device("/cpu:0"):
                    grad_out = tape.gradient(broadcasted_tensor, tensor)
            else:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = bf.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                with tf.device("/cpu:0"):
                    grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #4
def _allreduce(tensor, name=None):
    """An op which reduces an input tensor over all the Bluefog processes. The
    default reduction is a sum.

    The reduction operation is keyed by the name of the op. The tensor type and
    shape must be the same on all Bluefog processes for a given name. The reduction
    will not start until all processes are ready to send and receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, summed across all
      processes.
    """
    if name is None and not _executing_eagerly():
        name = 'BluefogAllreduce_%s' % _normalize_name(tensor.name)
    return MPI_LIB.bluefog_allreduce(tensor, name=name)
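
For reference, here is a minimal usage sketch of the sum reduction above, assuming the package is imported as `bluefog.tensorflow`, that `bf.init()` is the initialization entry point, that TensorFlow runs eagerly, and that the script is launched with a multi-process launcher such as `bfrun` or `mpirun`:

import tensorflow as tf
import bluefog.tensorflow as bf

bf.init()  # assumed initialization entry point

# Each process contributes a tensor filled with its own rank.
local_tensor = tf.ones([4]) * bf.rank()

# average=False keeps the raw sum, exactly as in the test in Example #1.
summed = bf.allreduce(local_tensor, average=False)

# The average can be recovered by dividing the sum by the process count.
averaged = summed / bf.size()
print("rank", bf.rank(), "sum:", summed.numpy(), "avg:", averaged.numpy())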
Example #5
def _make_allreduce_grads_fn(name, device):
    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            ar_grads = [
                allreduce(grad, device=device) if grad is not None else grad
                for grad in grads
            ]
        return ar_grads

    if _executing_eagerly():
        if hasattr(tf, 'function'):
            # TensorFlow 1.14.0+
            return tf.function(allreduce_grads)
        else:
            return tf.contrib.eager.defun(allreduce_grads)
    else:
        return allreduce_grads
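
A hedged sketch of where a factory like this fits: local gradients are averaged across workers before a standard optimizer applies them. The single-variable "model" below is purely illustrative, and `bf.allreduce` with `average=True` is assumed to divide the sum by the number of processes:

import tensorflow as tf
import bluefog.tensorflow as bf

bf.init()  # assumed initialization entry point

w = tf.Variable([1.0, 2.0])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.square(w))
grads = tape.gradient(loss, [w])

# Average the local gradients across all Bluefog processes before applying
# them; this is the role of the allreduce_grads function returned above.
averaged_grads = [
    bf.allreduce(g, average=True) if g is not None else g for g in grads
]
optimizer.apply_gradients(zip(averaged_grads, [w]))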
Example #6
def _make_broadcast_group_fn():
    if _executing_eagerly():
        # Eager mode will parallelize independent control flow
        def broadcast_group(variables, root_rank):
            for var in sorted(variables, key=lambda var: var.name):
                var.assign(broadcast(var, root_rank))

        if hasattr(tf, 'function'):
            # TensorFlow 1.14.0+
            return tf.function(broadcast_group)
        else:
            return tf.contrib.eager.defun(broadcast_group)
    else:
        # Graph mode requires an Op
        def broadcast_group(variables, root_rank):
            return tf.group(
                *[var.assign(broadcast(var, root_rank)) for var in variables])

        return broadcast_group
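
A hedged sketch of the pattern this helper implements, written directly in eager mode: every worker overwrites its variables with rank 0's values so that training starts from a consistent state (the import path and `bf.init()` are assumptions, as before):

import tensorflow as tf
import bluefog.tensorflow as bf

bf.init()  # assumed initialization entry point

# Each worker typically starts from a different random initialization.
variables = [tf.Variable(tf.random.uniform([3])), tf.Variable(tf.zeros([2]))]

# Replace every worker's values with rank 0's values, which is what a
# broadcast_group call over these variables would do.
for var in variables:
    var.assign(bf.broadcast(var, root_rank=0))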
Example #7
def allgather(tensor: tf.Tensor, name: str = None) -> tf.Tensor:
    """An op which concatenates the input tensor with the same input tensor on
    all other Bluefog processes.

    The concatenation is done on the first dimension, so the input tensors on the
    different processes must have the same rank and shape, except for the first
    dimension, which is allowed to be different.

    Arguments:
        tensor: A tensor to allgather.
        name: A name of the allgather operation.

    Returns:
      A tensor of the same type as `tensor`, concatenated on dimension zero
      across all processes. The shape is identical to the input shape, except for
      the first dimension, which may be greater and is the sum of all first
      dimensions of the tensors in different Bluefog processes.
    """
    if name is None and not _executing_eagerly():
        name = 'BluefogAllgather_%s' % _normalize_name(tensor.name)
    return MPI_LIB.bluefog_allgather(tensor, name=name)
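
A short usage sketch of allgather under the same assumptions as the earlier sketches (eager execution, `bf.init()` as the initialization entry point): each rank contributes a different number of rows, and every rank receives the concatenation.

import tensorflow as tf
import bluefog.tensorflow as bf

bf.init()  # assumed initialization entry point

# Rank r contributes r + 1 rows; shapes may differ only in dimension zero.
local = tf.ones([bf.rank() + 1, 3]) * bf.rank()

gathered = bf.allgather(local)

# With two processes the result is a (1 + 2) x 3 tensor on both ranks.
print("rank", bf.rank(), "gathered shape:", gathered.shape)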
Example #8
def broadcast(tensor: tf.Tensor,
              root_rank: int,
              name: str = None) -> tf.Tensor:
    """An op which broadcasts the input tensor on root rank to the same input tensor
    on all other Bluefog processes.

    The broadcast operation is keyed by the name of the op. The tensor type and
    shape must be the same on all Bluefog processes for a given name. The broadcast
    will not start until all processes are ready to send and receive the tensor.

    Arguments:
        tensor: A tensor to broadcast.
        root_rank: The rank to broadcast the value from.
        name: A name of the broadcast operation.

    Returns:
      A tensor of the same shape and type as `tensor`, with the value broadcasted
      from root rank.
    """
    if name is None and not _executing_eagerly():
        name = 'BluefogBroadcast_%s' % _normalize_name(tensor.name)
    return MPI_LIB.bluefog_broadcast(tensor, name=name, root_rank=root_rank)
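
Finally, a small usage sketch of broadcast itself (same assumptions as above): every process starts with a rank-dependent tensor and ends up with rank 0's values.

import tensorflow as tf
import bluefog.tensorflow as bf

bf.init()  # assumed initialization entry point

# Before the broadcast every process holds its own rank ...
local = tf.fill([4], float(bf.rank()))

# ... and afterwards every process holds rank 0's values.
result = bf.broadcast(local, root_rank=0)
print("rank", bf.rank(), "->", result.numpy())  # [0. 0. 0. 0.] on every rank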