Example #1
 def forward(self, data):
     """
     Arguments:
         data:
             Tensor to be reduced across all processes.
     """
     return hvd.allreduce_async(data, name=self.name, op=self.reduction)
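The module above only enqueues the reduction: hvd.allreduce_async returns a handle, and the reduced tensor becomes available only after hvd.synchronize is called on that handle. Below is a minimal caller-side sketch of that pattern; the tensor values and the 'example' name are illustrative, not taken from the original module.

import torch
import horovod.torch as hvd

hvd.init()
data = torch.ones(4)
# Enqueue the reduction; the call returns immediately with a handle.
handle = hvd.allreduce_async(data, name='example', op=hvd.Sum)
# ... other work can overlap with the communication here ...
reduced = hvd.synchronize(handle)  # blocks until the allreduce completes
print(reduced)  # every element equals hvd.size() on every rank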
Example #2
    def test_horovod_allreduce_async_fused(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors
        with Tensor Fusion."""
        hvd.init()
        size = hvd.size()
        dtypes = [
            torch.IntTensor, torch.LongTensor, torch.FloatTensor,
            torch.DoubleTensor
        ]
        if _fp16_supported:
            dtypes += [torch.HalfTensor]
        if torch.cuda.is_available():
            dtypes += [
                torch.cuda.IntTensor, torch.cuda.LongTensor,
                torch.cuda.FloatTensor, torch.cuda.DoubleTensor
            ]
            if _fp16_supported:
                dtypes += [torch.cuda.HalfTensor]
        dims = [1, 2, 3]
        tests = []
        is_hvd_poll_false_once = False
        for dtype, dim in itertools.product(dtypes, dims):
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            tensor = tensor.type(dtype)
            handle = hvd.allreduce_async(tensor, average=False)
            if not hvd.poll(handle):
                is_hvd_poll_false_once = True
            tensor, = self.convert_cpu_fp16_to_fp32(tensor)
            multiplied = tensor * size
            tests.append((dtype, multiplied, handle))

        # Make sure it's an asynchronous operation.
        assert is_hvd_poll_false_once, 'hvd.poll() always returns True, not an async op?'

        for dtype, multiplied, handle in tests:
            summed = hvd.synchronize(handle)
            summed, = self.convert_cpu_fp16_to_fp32(summed)
            max_difference = summed.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [
                    torch.IntTensor, torch.LongTensor, torch.cuda.IntTensor,
                    torch.cuda.LongTensor
            ]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #3
    def test_horovod_allreduce_duplicate_name_error(self):
        """Test that the allreduce raises an error if there are
        two concurrent operations with the same name."""
        hvd.init()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        dims = [17] * 3
        tensor = torch.FloatTensor(*dims)

        hvd.allreduce_async(tensor, name='duplicate_name')
        try:
            for i in range(10):
                hvd.allreduce_async(tensor, name='duplicate_name')
            assert False, 'hvd.allreduce_async did not throw error'
        except (torch.FatalError, ValueError):
            pass
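The error in the test above comes from submitting a second operation under a name that is still in flight. The sketch below (not part of the original test) shows two ways to avoid it: reuse a name only after the outstanding handle has been synchronized, or give each concurrent operation a distinct name.

import torch
import horovod.torch as hvd

hvd.init()
tensor = torch.FloatTensor(17, 17, 17)

# Option 1: reuse a name only after the previous operation has completed.
handle = hvd.allreduce_async(tensor, name='reused_name')
hvd.synchronize(handle)
handle = hvd.allreduce_async(tensor, name='reused_name')  # legal again
hvd.synchronize(handle)

# Option 2: give each in-flight operation a unique name.
handles = [hvd.allreduce_async(tensor, name='op_{}'.format(i)) for i in range(10)]
results = [hvd.synchronize(h) for h in handles]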
Example #4
    def test_parallel(self):
        hvd.init()
        # TODO support non-MPI Adasum operation
        # Only do this test if there are GPUs available.
        if not hvd.mpi_enabled() or not torch.cuda.is_available():
            self.skipTest("No GPUs available")

        device = torch.device('cuda:{}'.format(hvd.local_rank()))
        np.random.seed(2)
        torch.manual_seed(2)
        size = hvd.size()
        local_size = hvd.local_size()
        rank = hvd.rank()

        for data_type in self.data_types:
            all_Ns = [size * 20 - 13, size * 2 + 1, size + 2, 2**19]
            tensors = []
            all_qs = []
            for N in all_Ns:
                a = np.random.normal(0, 1, (N, 1)).astype(np.float64)
                r = np.random.normal(0, 1, (size, 1)).astype(np.float64)
                q = np.dot(a, r.T)
                q = q.astype(data_type)
                all_qs.append(q.astype(np.float64))
                tensors.append(q[:, hvd.rank()])

            tensors = list(
                map(lambda x: torch.from_numpy(x).to(device), tensors))

            handles = [
                hvd.allreduce_async(tensor, op=hvd.Adasum)
                for tensor in tensors
            ]

            reduced_tensors = [hvd.synchronize(h) for h in handles]

            expected = [np.sum(q, axis=1) / size for q in all_qs]
            all_comp = [
                self.are_close(data_type, e,
                               rt.cpu().numpy())
                for e, rt in zip(expected, reduced_tensors)
            ]
            if np.alltrue(all_comp):
                print('Parallel test passed')
            else:
                for c, e, rt in zip(all_comp, expected, reduced_tensors):
                    if c == False:
                        print('computed: ', rt)
                        print('expected: ', e)
                        print('off by: ', self.diff_ratio(e, rt.cpu().numpy()))
            assert np.alltrue(all_comp)
Example #5
def distributed_matmul_tn(left: Tensor, right: Tensor) -> Tensor:
    """
    Multiply two sequence tensors to obtain the result of :math:`A^{T} B`.

    Left and right inputs can be N-dimensional tensors, where the first one
    must be of size :math:`* \times \frac{T}{N} \times T` and the second one
    of size :math:`* \times \frac{T}{N} \times D`, where :math:`T` is the
    total sequence length, :math:`N` is the total number of processes
    available and :math:`D` is the dimension of the sequence. The result of
    this function is a tensor of size :math:`* \times \frac{T}{N} \times D`
    that contains the result chunk of the operation for each process.

    Inputs
    ------
    left: Tensor
        :math:`A` in :math:`A^T B`, must be of size
        :math:`* \times \frac{T}{N} \times T`
    right: Tensor
        :math:`B` in :math:`A^T B`, must be of size
        :math:`* \times \frac{T}{N} \times D`

    Returns
    -------
    result: Tensor
        For each process, this function computes the corresponding segment
        of the operation :math:`A^T B`, of size
        :math:`* \times \frac{T}{N} \times D`
    """
    cols = left.size(-1)
    world_size = get_world_size()
    rank = get_rank()

    split_size = cols // world_size
    splits = left.split(split_size, -1)
    rank_block = None

    synchronize()
    for r in range(world_size):
        rank_split = splits[r]
        rank_multiplication = torch.matmul(rank_split.transpose(-1, -2), right)
        handle = hvd.allreduce_async(rank_multiplication,
                                     name=f'matmul_tn_{r}',
                                     op=hvd.Sum)
        if r == rank:
            rank_block = hvd.synchronize(handle)
    return rank_block.contiguous()
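A hypothetical driver for distributed_matmul_tn follows (not part of the original module). It assumes hvd.init() has already been called and that the module's get_world_size(), get_rank() and synchronize() helpers are backed by Horovod. The shapes follow the docstring: each rank holds a (T/N) x T chunk of A and a (T/N) x D chunk of B, and receives its (T/N) x D block of A^T B.

import torch
import horovod.torch as hvd

hvd.init()
N = hvd.size()
T, D = 8 * N, 16                   # T must be divisible by N
left = torch.randn(T // N, T)      # this rank's chunk of A
right = torch.randn(T // N, D)     # this rank's chunk of B
block = distributed_matmul_tn(left, right)
assert block.shape == (T // N, D)  # this rank's block of A^T B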
Example #6
    def test_horovod_allreduce_async_fused(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors
        with Tensor Fusion."""
        hvd.init()
        size = hvd.size()
        dtypes = [torch.IntTensor, torch.LongTensor,
                  torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                       torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        tests = []
        is_hvd_poll_false_once = False
        for dtype, dim in itertools.product(dtypes, dims):
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            tensor = tensor.type(dtype)
            handle = hvd.allreduce_async(tensor, average=False)
            if not hvd.poll(handle):
                is_hvd_poll_false_once = True
            multiplied = tensor * size
            tests.append((dtype, multiplied, handle))

        # Make sure it's an asynchronous operation.
        assert is_hvd_poll_false_once, 'hvd.poll() always returns True, not an async op?'

        for dtype, multiplied, handle in tests:
            summed = hvd.synchronize(handle)
            max_difference = summed.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                      torch.cuda.IntTensor, torch.cuda.LongTensor]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #7
 def async_send(self, tensors_compressed, ctx):
     # assert only one tensor in tensors_compressed for allreduce
     return allreduce_async(tensors_compressed[0], name=ctx[0], op=Average)