def test_all_gather(self):
        """Check ``all_gather`` for 1-d, C-ordered and F-ordered inputs.

        Every rank contributes the ten ``int32`` values
        ``rank * 10 .. rank * 10 + 9``.  The gathered result is compared
        against ``arange(size * 10)`` reshaped to the layout expected for
        each ``nd_up`` value.  A negative ``nd_up`` (-2) must raise.
        """
        texp = np.arange(self.size * 10, dtype='int32')
        cpu = np.arange(self.rank * 10, self.rank * 10 + 10, dtype='int32')

        # 1-d input: the gathered result is the flat concatenation.
        a = cpu
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, texp)

        # C order, nd_up=0: contributions are stacked along the first axis.
        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((2 * self.size, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, exp)

        # C order, nd_up=1: one new leading axis of length ``size``.
        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((self.size, 2, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
        check_all(resgpu, exp)

        # C order, nd_up=3: three new leading axes, the outermost of
        # length ``size`` and the others of length 1.
        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((self.size, 1, 1, 2, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
        check_all(resgpu, exp)

        # F order cases: the host array is uploaded with an explicit
        # ``order='F'`` so the device array is requested in the same layout
        # as the F-ordered expectation it is compared against.
        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2 * self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, exp)

        # F order, nd_up=1: one new trailing axis of length ``size``.
        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2, self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
        resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
        check_all(resgpu, exp)

        # F order, nd_up=3: three new trailing axes, the last of length
        # ``size`` and the others of length 1.
        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
        resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
        check_all(resgpu, exp)

        # A negative ``nd_up`` is rejected; ``gpu`` is the last F-ordered
        # array uploaded above.
        with self.assertRaises(Exception):
            self.gpucomm.all_gather(gpu, nd_up=-2)
    def test_all_gather(self):
        """Gather ten ``int32`` values per rank and verify every layout.

        The local contribution is ``arange(rank * 10, rank * 10 + 10)`` and
        the reference is ``arange(size * 10)``.  The 1-d case is checked
        first, then a C-ordered ``(2, 5)`` input and an F-ordered ``(5, 2)``
        input are gathered with ``nd_up`` of 0, 1 and 3.  Finally a negative
        ``nd_up`` is expected to raise.
        """
        reference = np.arange(self.size * 10, dtype='int32')
        local = np.arange(self.rank * 10, self.rank * 10 + 10, dtype='int32')

        # Plain 1-d contribution.
        gpu = gpuarray.asarray(local, context=self.ctx)
        gathered = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(gathered, reference)

        # (nd_up, expected shape) pairs for the C-ordered (2, 5) input.
        c_cases = (
            (0, (2 * self.size, 5)),
            (1, (self.size, 2, 5)),
            (3, (self.size, 1, 1, 2, 5)),
        )
        for extra_dims, out_shape in c_cases:
            host = local.reshape((2, 5), order='C')
            wanted = reference.reshape(out_shape, order='C')
            gpu = gpuarray.asarray(host, context=self.ctx)
            gathered = self.gpucomm.all_gather(gpu, nd_up=extra_dims)
            check_all(gathered, wanted)

        # (nd_up, expected shape) pairs for the F-ordered (5, 2) input.
        f_cases = (
            (0, (5, 2 * self.size)),
            (1, (5, 2, self.size)),
            (3, (5, 2, 1, 1, self.size)),
        )
        for extra_dims, out_shape in f_cases:
            host = local.reshape((5, 2), order='F')
            wanted = reference.reshape(out_shape, order='F')
            gpu = gpuarray.asarray(host, context=self.ctx, order='F')
            gathered = self.gpucomm.all_gather(gpu, nd_up=extra_dims)
            check_all(gathered, wanted)

        # ``gpu`` still holds the last F-ordered upload from the loop above.
        with self.assertRaises(Exception):
            gathered = self.gpucomm.all_gather(gpu, nd_up=-2)
    def test_reduce_scatter(self):
        """Check ``reduce_scatter`` with the ``'sum'`` operation.

        Every rank uploads ``arange(n) + rank``.  The expectation
        ``size * arange(n) + sum(range(size))`` is the element-wise sum of
        those inputs over ranks ``0 .. size - 1``, and each rank compares its
        own slice of it.  Both the explicit-destination form and the
        allocating form are exercised.

        NOTE(review): every ``np.reshape`` call in this test discards its
        return value.  ``np.reshape`` returns a new array and does not modify
        its argument, so all uploaded inputs and all expectations remain 1-d.
        The "order c" / "order f" / "same dim" labels describe the apparent
        intent, not what is executed.  Confirm before relying on them.
        """
        n_short = 5 * self.size
        rank_total = sum(range(self.size))
        first = self.rank * 5
        want = (self.size * np.arange(n_short) + rank_total)[first:first + 5]

        # order c
        host = np.arange(n_short) + self.rank
        np.reshape(host, (self.size, 5), order='C')  # result discarded
        dev = gpuarray.asarray(host, context=self.ctx)

        out = gpuarray.empty((5, ), dtype='int64', order='C',
                             context=self.ctx)

        self.gpucomm.reduce_scatter(dev, 'sum', out)
        assert np.allclose(out, want)

        # order f
        host = np.arange(n_short) + self.rank
        np.reshape(host, (5, self.size), order='F')  # result discarded
        dev = gpuarray.asarray(host, context=self.ctx)

        out = gpuarray.empty((5, ), dtype='int64', order='F',
                             context=self.ctx)

        self.gpucomm.reduce_scatter(dev, 'sum', out)
        assert np.allclose(out, want)

        # make result order c (one less dim)
        host = np.arange(n_short) + self.rank
        np.reshape(host, (self.size, 5), order='C')  # result discarded
        dev = gpuarray.asarray(host, context=self.ctx)

        out = self.gpucomm.reduce_scatter(dev, 'sum')
        check_all(out, want)
        assert out.flags['C_CONTIGUOUS'] is True

        # c-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            n_bad = 5 * (self.size + 1)
            host = np.arange(n_bad, dtype='int32') + self.rank
            np.reshape(host, (self.size + 1, 5), order='C')  # result discarded
            dev = gpuarray.asarray(host, context=self.ctx)
            with self.assertRaises(TypeError):
                out = self.gpucomm.reduce_scatter(dev, 'sum')

        # make result order f (one less dim)
        host = np.arange(n_short) + self.rank
        np.reshape(host, (5, self.size), order='F')  # result discarded
        dev = gpuarray.asarray(host, context=self.ctx)

        out = self.gpucomm.reduce_scatter(dev, 'sum')
        check_all(out, want)
        assert out.flags['F_CONTIGUOUS'] is True

        # f-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            n_bad = 5 * (self.size + 1)
            host = np.arange(n_bad, dtype='int32') + self.rank
            np.reshape(host, (5, self.size + 1), order='F')  # result discarded
            dev = gpuarray.asarray(host, context=self.ctx)
            with self.assertRaises(TypeError):
                out = self.gpucomm.reduce_scatter(dev, 'sum')

        # make result order c (same dim - less size)
        n_long = 5 * self.size * 3
        first = self.rank * 15
        full = self.size * np.arange(n_long) + rank_total
        want = full[first:first + 15]
        np.reshape(want, (3, 5), order='C')  # result discarded
        host = np.arange(n_long) + self.rank
        np.reshape(host, (self.size * 3, 5), order='C')  # result discarded
        dev = gpuarray.asarray(host, context=self.ctx)

        out = self.gpucomm.reduce_scatter(dev, 'sum')
        check_all(out, want)
        assert out.flags['C_CONTIGUOUS'] is True

        # make result order f (same dim - less size)
        full = self.size * np.arange(n_long) + rank_total
        want = full[first:first + 15]
        np.reshape(want, (5, 3), order='F')  # result discarded
        host = np.arange(n_long) + self.rank
        np.reshape(host, (5, self.size * 3), order='F')  # result discarded
        dev = gpuarray.asarray(host, context=self.ctx)

        out = self.gpucomm.reduce_scatter(dev, 'sum')
        check_all(out, want)
        assert out.flags['F_CONTIGUOUS'] is True
    def test_reduce_scatter(self):
        """Verify ``reduce_scatter('sum')`` into given and allocated outputs.

        Each rank contributes ``arange(n) + rank``, and the reference is
        ``size * arange(n) + sum(range(size))``, of which a rank checks the
        slice that belongs to it.  The cases run in this order: explicit
        destination (C then F), allocated result with the "cannot be split"
        error check (C then F), and the longer ``n = 15 * size`` input
        (C then F).

        NOTE(review): the ``np.reshape`` calls below are kept exactly as in
        the original and their results are discarded.  ``np.reshape`` does
        not reshape in place, so the arrays actually uploaded and compared
        are all 1-d.  The shapes in the tables document the apparent intent
        only.  Confirm before relying on them.
        """
        rank_total = sum(range(self.size))
        reference = self.size * np.arange(5 * self.size) + rank_total
        wanted = reference[self.rank * 5:self.rank * 5 + 5]

        # Explicit destination buffer: order c, then order f.
        for layout, shape in (('C', (self.size, 5)), ('F', (5, self.size))):
            host = np.arange(5 * self.size) + self.rank
            np.reshape(host, shape, order=layout)  # result discarded
            dev = gpuarray.asarray(host, context=self.ctx)

            dest = gpuarray.empty((5,), dtype='int64', order=layout,
                                  context=self.ctx)

            self.gpucomm.reduce_scatter(dev, 'sum', dest)
            assert np.allclose(dest, wanted)

        # Allocated result (one less dim), each followed by the matching
        # split-problem check: (order, flag, shape, unsplittable shape).
        one_less_dim = (
            ('C', 'C_CONTIGUOUS', (self.size, 5), (self.size + 1, 5)),
            ('F', 'F_CONTIGUOUS', (5, self.size), (5, self.size + 1)),
        )
        for layout, flag, shape, bad_shape in one_less_dim:
            host = np.arange(5 * self.size) + self.rank
            np.reshape(host, shape, order=layout)  # result discarded
            dev = gpuarray.asarray(host, context=self.ctx)

            result = self.gpucomm.reduce_scatter(dev, 'sum')
            check_all(result, wanted)
            assert result.flags[flag] is True

            # split problem (for size == 1, it can always be split)
            if self.size != 1:
                host = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
                np.reshape(host, bad_shape, order=layout)  # result discarded
                dev = gpuarray.asarray(host, context=self.ctx)
                with self.assertRaises(TypeError):
                    result = self.gpucomm.reduce_scatter(dev, 'sum')

        # Allocated result (same dim - less size):
        # (order, flag, expected shape, input shape).
        same_dim = (
            ('C', 'C_CONTIGUOUS', (3, 5), (self.size * 3, 5)),
            ('F', 'F_CONTIGUOUS', (5, 3), (5, self.size * 3)),
        )
        for layout, flag, out_shape, in_shape in same_dim:
            reference = self.size * np.arange(5 * self.size * 3) + rank_total
            wanted = reference[self.rank * 15:self.rank * 15 + 15]
            np.reshape(wanted, out_shape, order=layout)  # result discarded
            host = np.arange(5 * self.size * 3) + self.rank
            np.reshape(host, in_shape, order=layout)  # result discarded
            dev = gpuarray.asarray(host, context=self.ctx)

            result = self.gpucomm.reduce_scatter(dev, 'sum')
            check_all(result, wanted)
            assert result.flags[flag] is True