Example #1
    def test_copy(self):
        from pycuda.curandom import rand as curand
        a_gpu = curand((3,3))

        for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]:
            assert np.allclose(a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step])

        a_gpu = curand((3,1))
        for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]:
            assert np.allclose(a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step])

        a_gpu = curand((3,3,3))
        for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]:
            assert np.allclose(a_gpu[start:stop:step,start:stop:step].get(), a_gpu.get()[start:stop:step,start:stop:step])

        a_gpu = curand((3,3,3)).transpose((1,2,0))
        a = a_gpu.get()
        for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]:
            assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step])

        # 4-d should work as long as only 2 axes are discontiguous
        a_gpu = curand((3,3,3,3))
        a = a_gpu.get()
        for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]:
            assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step])
Example #2
    def test_dot(self):
        from pycuda.curandom import rand as curand
        a_gpu = curand((200000,))
        a = a_gpu.get()
        b_gpu = curand((200000,))
        b = b_gpu.get()

        dot_ab = numpy.dot(a, b)

        dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get()

        assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4
Example #3
    def test_insert_columns(self):
        for _ in range(20):
            dtype = random.choice((np.float32, np.float64))
            N = np.random.randint(100, 1000)
            M = np.random.randint(100, 1000)
            m = np.random.randint(1, M)
            offset = np.random.randint(0, M - m)

            X = curand((N, M), dtype)
            Y = curand((N, m), dtype)
            insert_columns(Y, X, offset)

            self.assertTrue(np.all(X.get()[:, offset:offset+m] == Y.get()))
Example #4
import numpy
import pycuda.autoinit
import pycuda.driver as cuda


def main():
    from pytools import Table
    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    from random import shuffle
    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15,27):
            sz = 1 << ex
            print(sz)

            from pycuda.curandom import rand as curand
            a_gpu = curand((sz,))
            b_gpu = curand((sz,))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_sum_kernel, get_dot_kernel
            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10

            for i in range(cnt):
                krnl(a_gpu, b_gpu,
                        kernel_wrapper=wrap_with_timer)

            bytes = a_gpu.nbytes*2*cnt
            secs = elapsed[0]*1e-3

            tbl.add_row((str(dtype_out), a_gpu.nbytes/(1<<20), elapsed[0]/cnt, bytes/secs/1e9))

    print(tbl)
Example #5
    def test_dot(self):
        from pycuda.curandom import rand as curand

        for l in [2, 3, 4, 5, 6, 7, 31, 32, 33, 127, 128, 129, 255, 256, 257, 16384 - 993, 20000]:
            a_gpu = curand((l,))
            a = a_gpu.get()
            b_gpu = curand((l,))
            b = b_gpu.get()

            dot_ab = np.dot(a, b)

            dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
Example #6
    def test_elwise_kernel(self):
        from pycuda.curandom import rand as curand

        a_gpu = curand((50,))
        b_gpu = curand((50,))

        from pycuda.elementwise import ElementwiseKernel
        lin_comb = ElementwiseKernel(
                "float a, float *x, float b, float *y, float *z",
                "z[i] = a*x[i] + b*y[i]",
                "linear_combination")

        c_gpu = gpuarray.empty_like(a_gpu)
        lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

        assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
Example #7
    def test_subset_minmax(self):
        from pycuda.curandom import rand as curand

        l_a = 200000
        gran = 5
        l_m = l_a - l_a // gran + 1

        if has_double_support():
            dtypes = [np.float64, np.float32, np.int32]
        else:
            dtypes = [np.float32, np.int32]

        for dtype in dtypes:
            a_gpu = curand((l_a,), dtype)
            a = a_gpu.get()

            meaningful_indices_gpu = gpuarray.zeros(l_m, dtype=np.int32)
            meaningful_indices = meaningful_indices_gpu.get()
            j = 0
            for i in range(len(meaningful_indices)):
                meaningful_indices[i] = j
                j = j + 1
                if j % gran == 0:
                    j = j + 1

            meaningful_indices_gpu = gpuarray.to_gpu(meaningful_indices)
            b = a[meaningful_indices]

            min_a = np.min(b)
            min_a_gpu = gpuarray.subset_min(meaningful_indices_gpu, a_gpu).get()

            assert min_a_gpu == min_a
Example #8
    def test_transpose(self):
        import pycuda.gpuarray as gpuarray
        from pycuda.curandom import rand as curand

        a_gpu = curand((10,20,30))
        a = a_gpu.get()

        #assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0))) # not contiguous
        assert np.allclose(a_gpu.T.get(), a.T)
Example #9
    def test_sum(self):
        from pycuda.curandom import rand as curand
        a_gpu = curand((200000,))
        a = a_gpu.get()

        sum_a = np.sum(a)

        sum_a_gpu = gpuarray.sum(a_gpu).get()

        assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
Example #10
    def test_if_positive(self):
        from pycuda.curandom import rand as curand

        l = 20
        a_gpu = curand((l,))
        b_gpu = curand((l,))
        a = a_gpu.get()
        b = b_gpu.get()

        import pycuda.gpuarray as gpuarray

        max_a_b_gpu = gpuarray.maximum(a_gpu, b_gpu)
        min_a_b_gpu = gpuarray.minimum(a_gpu, b_gpu)

        print(max_a_b_gpu)
        print(np.maximum(a, b))

        assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0
        assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
Example #11
    def test_view_and_strides(self):
        from pycuda.curandom import rand as curand

        X = curand((5, 10), dtype=np.float32)
        Y = X[:3, :5]
        y = Y.view()

        assert y.shape == Y.shape
        assert y.strides == Y.strides

        assert np.array_equal(y.get(), X.get()[:3, :5])
Example #12
    def test_sum(self):
        from pycuda.curandom import rand as curand
        a_gpu = curand((200000,))
        a = a_gpu.get()

        sum_a = numpy.sum(a)

        from pycuda.reduction import get_sum_kernel
        sum_a_gpu = gpuarray.sum(a_gpu).get()

        assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
Example #13
    def test_complex_bits(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.complex64, np.complex128]
        else:
            dtypes = [np.complex64]

        n = 20
        for tp in dtypes:
            dtype = np.dtype(tp)
            from pytools import match_precision
            real_dtype = match_precision(np.dtype(np.float64), dtype)

            z = (curand((n,), real_dtype).astype(dtype)
                    + 1j*curand((n,), real_dtype).astype(dtype))

            assert la.norm(z.get().real - z.real.get()) == 0
            assert la.norm(z.get().imag - z.imag.get()) == 0
            assert la.norm(z.get().conj() - z.conj().get()) == 0
Example #14
    def test_astype(self):
        from pycuda.curandom import rand as curand

        if not has_double_support():
            return

        a_gpu = curand((2000, ), dtype=np.float32)

        a = a_gpu.get().astype(np.float64)
        a2 = a_gpu.astype(np.float64).get()

        assert a2.dtype == np.float64
        assert la.norm(a - a2) == 0, (a, a2)

        a_gpu = curand((2000, ), dtype=np.float64)

        a = a_gpu.get().astype(np.float32)
        a2 = a_gpu.astype(np.float32).get()

        assert a2.dtype == np.float32
        assert la.norm(a - a2) / la.norm(a) < 1e-7
Example #15
    def test_newaxis(self):
        import pycuda.gpuarray as gpuarray
        from pycuda.curandom import rand as curand

        a_gpu = curand((10,20,30))
        a = a_gpu.get()

        b_gpu = a_gpu[:,np.newaxis]
        b = a[:,np.newaxis]

        assert b_gpu.shape == b.shape
        assert b_gpu.strides == b.strides
Example #16
    def test_newaxis(self):
        import pycuda.gpuarray as gpuarray
        from pycuda.curandom import rand as curand

        a_gpu = curand((10,20,30))
        a = a_gpu.get()

        b_gpu = a_gpu[:,np.newaxis]
        b = a[:,np.newaxis]

        assert b_gpu.shape == b.shape
        assert b_gpu.strides == b.strides
Example #17
    def test_view_and_strides(self):
        from pycuda.curandom import rand as curand

        X = curand((5, 10), dtype=np.float32)
        Y = X[:3, :5]
        y = Y.view()

        assert y.shape == Y.shape
        assert y.strides == Y.strides

        with pytest.raises(AssertionError):
            assert (y.get() == X.get()[:3, :5]).all()
Example #18
    def test_astype(self):
        from pycuda.curandom import rand as curand

        if not has_double_support():
            return

        a_gpu = curand((2000,), dtype=np.float32)

        a = a_gpu.get().astype(np.float64)
        a2 = a_gpu.astype(np.float64).get()

        assert a2.dtype == np.float64
        assert la.norm(a - a2) == 0, (a, a2)

        a_gpu = curand((2000,), dtype=np.float64)

        a = a_gpu.get().astype(np.float32)
        a2 = a_gpu.astype(np.float32).get()

        assert a2.dtype == np.float32
        assert la.norm(a - a2)/la.norm(a) < 1e-7
Example #19
    def test_view_and_strides(self):
        from pycuda.curandom import rand as curand

        X = curand((5, 10), dtype=np.float32)
        Y = X[:3, :5]
        y = Y.view()

        assert y.shape == Y.shape
        assert y.strides == Y.strides

        import pytest
        with pytest.raises(AssertionError):
            assert (y.get() == X.get()[:3, :5]).all()
Example #20
    def test_random(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.float32, np.float64]
        else:
            dtypes = [np.float32]

        for dtype in dtypes:
            a = curand((10, 100), dtype=dtype).get()

            assert (0 <= a).all()
            assert (a < 1).all()
Example #21
    def test_random(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.float32, np.float64]
        else:
            dtypes = [np.float32]

        for dtype in dtypes:
            a = curand((10, 100), dtype=dtype).get()

            assert (0 <= a).all()
            assert (a < 1).all()
Example #22
    def test_complex_bits(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.complex64, np.complex128]
        else:
            dtypes = [np.complex64]

        n = 20
        for tp in dtypes:
            dtype = np.dtype(tp)
            from pytools import match_precision

            real_dtype = match_precision(np.dtype(np.float64), dtype)

            z = curand((n, ), real_dtype).astype(dtype) + 1j * curand(
                (n, ), real_dtype).astype(dtype)

            assert la.norm(z.get().real - z.real.get()) == 0
            assert la.norm(z.get().imag - z.imag.get()) == 0
            assert la.norm(z.get().conj() - z.conj().get()) == 0

            # verify contiguity is preserved
            for order in ["C", "F"]:
                # test both zero and non-zero value code paths
                z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order)
                z2 = z.reshape(z.shape, order=order)
                for zdata in [z_real, z2]:
                    if order == "C":
                        assert zdata.flags.c_contiguous
                        assert zdata.real.flags.c_contiguous
                        assert zdata.imag.flags.c_contiguous
                        assert zdata.conj().flags.c_contiguous
                    elif order == "F":
                        assert zdata.flags.f_contiguous
                        assert zdata.real.flags.f_contiguous
                        assert zdata.imag.flags.f_contiguous
                        assert zdata.conj().flags.f_contiguous
Example #23
    def test_complex_bits(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.complex64, np.complex128]
        else:
            dtypes = [np.complex64]

        n = 20
        for tp in dtypes:
            dtype = np.dtype(tp)
            from pytools import match_precision
            real_dtype = match_precision(np.dtype(np.float64), dtype)

            z = (curand((n,), real_dtype).astype(dtype)
                    + 1j*curand((n,), real_dtype).astype(dtype))

            assert la.norm(z.get().real - z.real.get()) == 0
            assert la.norm(z.get().imag - z.imag.get()) == 0
            assert la.norm(z.get().conj() - z.conj().get()) == 0

            # verify contiguity is preserved
            for order in ["C", "F"]:
                # test both zero and non-zero value code paths
                z_real = gpuarray.zeros(z.shape, dtype=real_dtype,
                                        order=order)
                z2 = z.reshape(z.shape, order=order)
                for zdata in [z_real, z2]:
                    if order == "C":
                        assert zdata.flags.c_contiguous == True
                        assert zdata.real.flags.c_contiguous == True
                        assert zdata.imag.flags.c_contiguous == True
                        assert zdata.conj().flags.c_contiguous == True
                    elif order == "F":
                        assert zdata.flags.f_contiguous == True
                        assert zdata.real.flags.f_contiguous == True
                        assert zdata.imag.flags.f_contiguous == True
                        assert zdata.conj().flags.f_contiguous == True
Example #24
    def test_reduce_out(self):
        from pycuda.curandom import rand as curand
        a_gpu = curand((10, 200), dtype=np.float32)
        a = a_gpu.get()

        from pycuda.reduction import ReductionKernel
        red = ReductionKernel(np.float32, neutral=0,
                              reduce_expr="max(a,b)",
                              arguments="float *in")
        max_gpu = gpuarray.empty(10, dtype=np.float32)
        for i in range(10):
            red(a_gpu[i], out=max_gpu[i])

        assert np.alltrue(a.max(axis=1) == max_gpu.get())
Example #25
    def test_minimum_maximum_scalar(self):
        from pycuda.curandom import rand as curand

        l = 20
        a_gpu = curand((l,))
        a = a_gpu.get()

        import pycuda.gpuarray as gpuarray

        max_a0_gpu = gpuarray.maximum(a_gpu, 0)
        min_a0_gpu = gpuarray.minimum(0, a_gpu)

        assert la.norm(max_a0_gpu.get() - np.maximum(a, 0)) == 0
        assert la.norm(min_a0_gpu.get() - np.minimum(0, a)) == 0
Example #26
    def test_extract_columns(self):
        for _ in range(20):
            dtype = random.choice((np.float32, np.float64))
            N = np.random.randint(100, 1000)
            M = np.random.randint(100, 1000)
            a = np.random.randint(0, M)
            b = np.random.randint(a + 1, M)
            m = b - a
            assert m > 0

            X = curand((N, M), dtype)
            Y = extract_columns(X, a, b)

            self.assertTrue(np.all(X.get()[:, a:b] == Y.get()))
Example #27
    def test_extract_columns(self):
        for _ in range(20):
            dtype = random.choice((np.float32, np.float64))
            N = np.random.randint(100, 1000)
            M = np.random.randint(100, 1000)
            a = np.random.randint(0, M)
            b = np.random.randint(a + 1, M)
            m = b - a
            assert m > 0

            X = curand((N, M), dtype)
            Y = extract_columns(X, a, b)

            self.assertTrue(np.all(X.get()[:, a:b] == Y.get()))
Example #28
    def test_minimum_maximum_scalar(self):
        from pycuda.curandom import rand as curand

        sz = 20
        a_gpu = curand((sz, ))
        a = a_gpu.get()

        import pycuda.gpuarray as gpuarray

        max_a0_gpu = gpuarray.maximum(a_gpu, 0)
        min_a0_gpu = gpuarray.minimum(0, a_gpu)

        assert la.norm(max_a0_gpu.get() - np.maximum(a, 0)) == 0
        assert la.norm(min_a0_gpu.get() - np.minimum(0, a)) == 0
Example #29
    def test_copy(self):
        from pycuda.curandom import rand as curand

        a_gpu = curand((3, 3))

        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]
            )

        a_gpu = curand((3, 1))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]
            )

        a_gpu = curand((3, 3, 3))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step, start:stop:step].get(),
                a_gpu.get()[start:stop:step, start:stop:step],
            )

        a_gpu = curand((3, 3, 3)).transpose((1, 2, 0))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step, :, start:stop:step].get(),
                a_gpu.get()[start:stop:step, :, start:stop:step],
            )

        # 4-d should work as long as only 2 axes are discontiguous
        a_gpu = curand((3, 3, 3, 3))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step, :, start:stop:step].get(),
                a_gpu.get()[start:stop:step, :, start:stop:step],
            )
Example #30
    def test_slice(self):
        from pycuda.curandom import rand as curand

        l = 20000
        a_gpu = curand((l, ))
        a = a_gpu.get()

        from random import randrange
        for i in range(200):
            start = randrange(l)
            end = randrange(start, l)

            a_gpu_slice = a_gpu[start:end]
            a_slice = a[start:end]

            assert la.norm(a_gpu_slice.get() - a_slice) == 0
Example #31
    def test_slice(self):
        from pycuda.curandom import rand as curand

        l = 20000
        a_gpu = curand((l,))
        a = a_gpu.get()

        from random import randrange
        for i in range(200):
            start = randrange(l)
            end = randrange(start, l)

            a_gpu_slice = a_gpu[start:end]
            a_slice = a[start:end]

            assert la.norm(a_gpu_slice.get()-a_slice) == 0
Example #32
    def test_2d_slice_c(self):
        from pycuda.curandom import rand as curand

        n = 1000
        m = 300
        a_gpu = curand((n, m))
        a = a_gpu.get()

        from random import randrange
        for i in range(200):
            start = randrange(n)
            end = randrange(start, n)

            a_gpu_slice = a_gpu[start:end]
            a_slice = a[start:end]

            assert la.norm(a_gpu_slice.get() - a_slice) == 0
Example #33
    def test_minmax(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.float64, np.float32, np.int32]
        else:
            dtypes = [np.float32, np.int32]

        for what in ["min", "max"]:
            for dtype in dtypes:
                a_gpu = curand((200000, ), dtype)
                a = a_gpu.get()

                op_a = getattr(np, what)(a)
                op_a_gpu = getattr(gpuarray, what)(a_gpu).get()

                assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
Example #34
    def test_minmax(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.float64, np.float32, np.int32]
        else:
            dtypes = [np.float32, np.int32]

        for what in ["min", "max"]:
            for dtype in dtypes:
                a_gpu = curand((200000,), dtype)
                a = a_gpu.get()

                op_a = getattr(np, what)(a)
                op_a_gpu = getattr(gpuarray, what)(a_gpu).get()

                assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
Example #35
    def test_2d_slice_c(self):
        from pycuda.curandom import rand as curand

        n = 1000
        m = 300
        a_gpu = curand((n, m))
        a = a_gpu.get()

        from random import randrange
        for i in range(200):
            start = randrange(n)
            end = randrange(start, n)

            a_gpu_slice = a_gpu[start:end]
            a_slice = a[start:end]

            assert la.norm(a_gpu_slice.get()-a_slice) == 0
Example #36
    def test_2d_slice_f(self):
        from pycuda.curandom import rand as curand
        import pycuda.gpuarray as gpuarray

        n = 1000
        m = 300
        a_gpu = curand((n, m))
        a_gpu_f = gpuarray.GPUArray((m, n), np.float32, gpudata=a_gpu.gpudata, order="F")
        a = a_gpu_f.get()

        from random import randrange

        for i in range(200):
            start = randrange(n)
            end = randrange(start, n)

            a_gpu_slice = a_gpu_f[:, start:end]
            a_slice = a[:, start:end]

            assert la.norm(a_gpu_slice.get() - a_slice) == 0
Example #37
    def test_2d_slice_f(self):
        from pycuda.curandom import rand as curand
        import pycuda.gpuarray as gpuarray

        n = 1000
        m = 300
        a_gpu = curand((n, m))
        a_gpu_f = gpuarray.GPUArray((m, n), np.float32,
                                    gpudata=a_gpu.gpudata,
                                    order="F")
        a = a_gpu_f.get()

        from random import randrange
        for i in range(200):
            start = randrange(n)
            end = randrange(start, n)

            a_gpu_slice = a_gpu_f[:, start:end]
            a_slice = a[:, start:end]

            assert la.norm(a_gpu_slice.get()-a_slice) == 0
Example #38
import numpy
import pycuda.autoinit
import pycuda.driver as cuda


def main():
    from pytools import Table

    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    from random import shuffle

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            print(sz)

            from pycuda.curandom import rand as curand

            a_gpu = curand((sz, ))
            b_gpu = curand((sz, ))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_sum_kernel, get_dot_kernel

            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10

            for i in range(cnt):
                krnl(
                    a_gpu,
                    b_gpu,
                    kernel_wrapper=wrap_with_timer,
                )

            bytes = a_gpu.nbytes * 2 * cnt
            secs = elapsed[0] * 1e-3

            tbl.add_row((
                str(dtype_out),
                a_gpu.nbytes / (1 << 20),
                elapsed[0] / cnt,
                bytes / secs / 1e9,
            ))

    print(tbl)
Example #39
import pycuda.autoinit                        # initialize the CUDA device
import pycuda.gpuarray as gpuarray
import numpy.linalg as la
from pycuda.curandom import rand as curand    # import CUDA random number module

a_gpu = curand((50,))                         # create 1-d arrays of random numbers
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel  # import ElementwiseKernel module

# specify the details of the element-wise operation
lin_comb = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *z",
    "z[i] = a*x[i] + b*y[i]")

c_gpu = gpuarray.empty_like(a_gpu)            # create a GPU array of same size
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)           # run the ElementwiseKernel function
assert  la.norm((c_gpu -  (5*a_gpu+6*b_gpu)).get()) < 1e-5

print(a_gpu)
print(b_gpu)
print(c_gpu)
Example #40
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a_gpu = curand((50,))
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *z",
        "z[i] = my_f(a*x[i], b*y[i])",
        "linear_combination",
        preamble="""
        __device__ float my_f(float x, float y)
        { 
          return x + y;
        }
        """)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

print(c_gpu)
#print (5*a_gpu+6*b_gpu)
#import numpy.linalg as la
#assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
Example #41
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from pycuda.curandom import rand as curand
import numpy

n = 1024

matMultKernel = """
__global__ void mat_mult(float *a, float *b, float *c) {
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;

	for(int k = 0; k < %(ENE)s; k++)
		c[y + x * %(ENE)s] += a[k + x * %(ENE)s] * b[y + k * %(ENE)s];
}
"""

a_gpu = curand((n, n))
b_gpu = curand((n, n))
c_gpu = gpuarray.zeros((n, n), dtype=numpy.float32)

matMultKernel = matMultKernel % {
    "ENE": n,
}

mod = SourceModule(matMultKernel)
mat_mult = mod.get_function("mat_mult")

mat_mult(a_gpu, b_gpu, c_gpu, block=(32, 32, 1), grid=(n // 32, n // 32, 1))

print(a_gpu.get())
print("-" * 80)
print(b_gpu.get())
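# sanity check of the kernel result (a minimal sketch, assuming numpy as imported above):
# compare the GPU product against numpy's matrix product on the host
c = c_gpu.get()
reference = numpy.dot(a_gpu.get(), b_gpu.get())
print("max abs error vs numpy:", numpy.max(numpy.abs(c - reference)))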
Example #42
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

n = 1000000

# inside the expression, "n" is the element count that ElementwiseKernel
# supplies automatically; the explicit "int c" argument is unused
reverseKernel = ElementwiseKernel("float *a, float *b, int c",
                                  "b[i] = a[n-1-i]", "reverse")

a_gpu = curand((n))
b_gpu = gpuarray.empty_like(a_gpu)

reverseKernel(a_gpu, b_gpu, n)

print(a_gpu)
print("-" * 80)
print(b_gpu)
print("-" * 80)
print(n)
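# quick host-side check (a minimal sketch, assuming numpy is available):
# the kernel only copies elements, so the reversed copy should match exactly
import numpy as np
assert (b_gpu.get() == a_gpu.get()[::-1]).all()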
Example #43
def SSA(update_matrix, initial_conditions, function_rates, t_max,
        **kwargs):  # noqa

    # Fix the maximum number of steps available at each repetition. Should be function of the
    # amount of memory available on the device and the number of iterations (= threads) requested.
    _num_steps = 20
    _num_reacs = len(kwargs["variables"])
    start_time, end_time = np.float32(0), np.float32(t_max)

    function_rates_wo_param = deepcopy(function_rates)
    for fr_id, f_rate in enumerate(function_rates_wo_param):
        for par, val in kwargs["parameters"].items():
            f_rate = f_rate.replace(par, str(val))
        for sp_id, spec in enumerate(kwargs["variables"]):
            f_rate = f_rate.replace(
                spec,
                "_time_and_states[th_id * (@num__reacs@ + 1) * @num__rep@"
                " + rep * (@num__reacs@ + 1) + 1 + {}]".format(sp_id))

        function_rates_wo_param[fr_id] = f_rate

    unroll_func_rate = "\n".join(
        (f_rate.join(("_rates_arr[{}] = ".format(fr_id), ";"))
         for fr_id, f_rate in enumerate(function_rates_wo_param)))

    kernel_ready = _kernel_str \
        .replace("@unroll__func__rate@", unroll_func_rate) \
        .replace("@num__iter@", str(kwargs["iterations"])) \
        .replace("@num__rep@", str(_num_steps)) \
        .replace("@num__reacs@", str(_num_reacs))

    if kwargs.get("print_cuda"):
        print("\n".join(
            " ".join((str(line_no + 2), line))
            for line_no, line in enumerate(kernel_ready.split("\n"))))

    upd_mat_dev = gpuarray.to_gpu(update_matrix.astype(np.float32))

    # The vector of initial conditions has to be repeated for each thread, since in the future,
    # when we will split in chunks, each chunk will restart from a different initial condition.
    init_cond_dev = gpuarray.to_gpu(
        np.tile(initial_conditions.astype(np.float32),
                (kwargs["iterations"], 1)))

    # Each thread should produce its own array of random numbers or at least have access to a
    # private set of random numbers: we need two numbers for each repetition, one to select the
    # reaction and one to select the time.
    # Note that pycuda.curandom.rand is a toy random generator, and all the threads share the array;
    # see the sketch after this function for the cuRAND-backed alternative.
    # https://documen.tician.de/pycuda/array.html?highlight=random#module-pycuda.curandom
    rand_arr_dev = curand((_num_steps, 2, kwargs["iterations"]))

    # There seems to be no need to manually copy back to host gpuarrays, see example/demo.py.
    time_states_dev = gpuarray.GPUArray(
        (kwargs["iterations"], _num_steps, _num_reacs + 1), dtype=np.float32)

    mod = SourceModule(kernel_ready)
    func = mod.get_function("ssa_simple")
    func(upd_mat_dev,
         init_cond_dev,
         start_time,
         end_time,
         time_states_dev,
         rand_arr_dev,
         block=(kwargs["iterations"], 1, 1))

    return time_states_dev
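# The note above flags pycuda.curandom.rand as a quick-and-dirty generator whose
# samples are shared by all threads. A minimal sketch of the cuRAND-backed
# alternative (hypothetical helper, assuming a context from pycuda.autoinit):
def _curand_alternative_sketch(num_steps, iterations):
    import numpy as np
    from pycuda.curandom import XORWOWRandomNumberGenerator

    gen = XORWOWRandomNumberGenerator()
    # one device array of uniform samples, shaped like rand_arr_dev above
    return gen.gen_uniform((num_steps, 2, iterations), dtype=np.float32)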
Example #44
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand
from pycuda.elementwise import ElementwiseKernel
import numpy.linalg as la

input_vector_a = curand((50, ))
input_vector_b = curand((50, ))
mult_coefficient_a = 2
mult_coefficient_b = 5

linear_combination = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *c", "c[i] = a*x[i] + b*y[i]",
    "linear_combination")

linear_combination_result = gpuarray.empty_like(input_vector_a)
linear_combination(mult_coefficient_a, input_vector_a,\
                   mult_coefficient_b, input_vector_b,\
                   linear_combination_result)

print("INPUT VECTOR A =")
print(input_vector_a)

print("INPUT VECTOR B = ")
print(input_vector_b)

print("RESULTING VECTOR C = ")
print(linear_combination_result)

Example #45
def cuda_mutate(sols,prob_mut, mut_range,min_param,max_param):
    """ mutates the values of the solutions given
    @params sols, probability of mutation, mutation range, min param, max param
    @returns mutated sols
    """

    #ALL SOLUTIONS MUST BE OF SAME LENGTH
    num_sols = len(sols);
    #get length of solutions
    sol_len = len(sols[0]);
    

    
    #get number of nodes
    num_nodes = netParams.nodeConfig['I'] + netParams.nodeConfig['H'] + netParams.nodeConfig['O'];
    
    #mutate not on architecture
    mutateFrom = constants.META_INFO_COUNT + num_nodes;
        
    
    #range
    m_range = 2 * mut_range;
    
    #convert to form of numpy arrays
    old_sols = numpy.array(sols[:,mutateFrom:], numpy.float32);
    cost_genes = numpy.ones((num_sols),numpy.float32);
    contrb_genes = numpy.zeros((num_sols),numpy.float32);
    mutants = numpy.array(sols).astype(numpy.float32);
    cost_genes *= -1;
    age_genes = numpy.zeros((num_sols),numpy.float32);
    
    
    
    #copy to gpu
    sols_gpu = gpuarray.to_gpu(old_sols);
    sol_len = len(old_sols[0]);
    
    #operation
    MutSols_gpu = gpuarray.zeros_like(sols_gpu).astype(numpy.float32);
    Mvals_gpu = (curand((num_sols,sol_len),numpy.float32) * m_range) - mut_range; #mutation values
    
    #calculate probabilites of mutation and form mutation mask
    Mprob_gpu = curand((num_sols,sol_len),numpy.float32); #mutation probabilities
    MutMask_gpu = gpuarray.zeros_like(Mprob_gpu).astype(numpy.float32);
    #-form mutation    
    form_mutation_mask(Mprob_gpu,MutMask_gpu,prob_mut);
    #-mutate genes
    MutSols_gpu = sols_gpu + (MutMask_gpu * Mvals_gpu);
    
    #get mutated solutions
    mutants[:,mutateFrom:] = MutSols_gpu.get();
    mutants[:,constants.COST_GENE] = cost_genes;
    mutants[:,constants.COST2_GENE] = cost_genes;
    mutants[:,constants.MISC_GENE] = contrb_genes;
    mutants[:,constants.AGE_GENE] = age_genes;
    
    if debug:
        print("sols", sols)
        print("mut_mask", MutMask_gpu.view())
        print("mut_sols", mutants)

    #return mutated solutions
    return mutants.tolist();
Example #46
"""
@author: bhaumik
"""

import numpy as np
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
from pycuda.elementwise import ElementwiseKernel
import pycuda.autoinit
from pycuda.curandom import rand as curand

# Kernel function
add = ElementwiseKernel("float *d_a, float *d_b, float *d_c",
                        "d_c[i] = d_a[i] + d_b[i]", "add")

# create a couple of random matrices with a given shape
shape = 1000000
d_a = curand(shape)
d_b = curand(shape)
d_c = gpuarray.empty_like(d_a)
start = drv.Event()
end = drv.Event()
start.record()
# Calling kernel
add(d_a, d_b, d_c)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Addition of %d element of GPU" % shape)
print("%fs" % (secs))
# check the result
if d_c == (d_a + d_b):
    print("The sum computed on GPU is correct")
Example #47
    def test_struct_reduce(self):
        preamble = """
        struct minmax_collector
        {
            float cur_min;
            float cur_max;

            __device__
            minmax_collector()
            { }

            __device__
            minmax_collector(float cmin, float cmax)
            : cur_min(cmin), cur_max(cmax)
            { }

            __device__ minmax_collector(minmax_collector const &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector(minmax_collector const volatile &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector volatile &operator=(
                minmax_collector const &src) volatile
            {
                cur_min = src.cur_min;
                cur_max = src.cur_max;
                return *this;
            }
        };

        __device__
        minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
        {
            return minmax_collector(
                fminf(a.cur_min, b.cur_min),
                fmaxf(a.cur_max, b.cur_max));
        }
        """
        mmc_dtype = np.dtype([("cur_min", np.float32),
                              ("cur_max", np.float32)])

        from pycuda.curandom import rand as curand

        a_gpu = curand((20000, ), dtype=np.float32)
        a = a_gpu.get()

        from pycuda.tools import register_dtype

        register_dtype(mmc_dtype, "minmax_collector")

        from pycuda.reduction import ReductionKernel

        red = ReductionKernel(
            mmc_dtype,
            neutral="minmax_collector(10000, -10000)",
            # FIXME: needs infinity literal in real use, ok here
            reduce_expr="agg_mmc(a, b)",
            map_expr="minmax_collector(x[i], x[i])",
            arguments="float *x",
            preamble=preamble,
        )

        minmax = red(a_gpu).get()
        # print minmax["cur_min"], minmax["cur_max"]
        # print np.min(a), np.max(a)

        assert minmax["cur_min"] == np.min(a)
        assert minmax["cur_max"] == np.max(a)
Example #48
import numpy
import pycuda.autoinit                        # initialize the CUDA device
from pycuda.reduction import ReductionKernel  # import ReductionKernel module

# specify the details of the reduction operation
dot = ReductionKernel(
        dtype_out=numpy.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="x[i]*y[i]",
        arguments="const float *x, const float *y")

from pycuda.curandom import rand as curand

x = curand((1000*1000), dtype=numpy.float32)
y = curand((1000*1000), dtype=numpy.float32)

x_dot_y = dot(x, y).get()
x_dot_y_cpu = numpy.dot(x.get(), y.get())

print(x)
print(y)
print(x_dot_y)
print(x_dot_y_cpu)
Example #49
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from pycuda.curandom import rand as curand

# assumed matrix dimensions (not given in the snippet); must be multiples of TILE_DIM
n = 1024
m = 512

TILE_DIM = 32

transpuestaKernel = """
__global__ void transpuesta(float *a, float *b) {
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	int i = y + x * %(EME)s;
	int j = x + y * %(ENE)s;

	if(i < (%(ENE)s * %(EME)s))
		b[j] = a[i];
}
"""
transpuestaKernel = transpuestaKernel % {"ENE": n, "EME": m}

a_gpu = curand((n * m))
b_gpu = gpuarray.empty_like(a_gpu)

mod = SourceModule(transpuestaKernel)
func = mod.get_function("transpuesta")

func(a_gpu,
     b_gpu,
     block=(TILE_DIM, TILE_DIM, 1),
     grid=(m // TILE_DIM, n // TILE_DIM, 1))

a_gpu = a_gpu.reshape((n, m))
b_gpu = b_gpu.reshape((m, n))

print(a_gpu)
print("-" * 80)
Example #50
# print(torch.cuda.device_count())
#
# print(torch.cuda.get_device_name(0))

# import torch.cuda
# if torch.cuda.is_available():
#     print('PyTorch found cuda')
# else:
#     print('PyTorch could not find cuda')
#
# import pycuda
# from pycuda import compiler
# import pycuda.driver as drv
#
# drv.init()
# print("%d device(s) found." % drv.Device.count())
#
# for ordinal in range(drv.Device.count()):
#     dev = drv.Device(ordinal)
#     print(ordinal, dev.name())

import numpy as np
from pycuda import gpuarray
from pycuda.curandom import rand as curand
# -- initialize the device
import pycuda.autoinit

height = 100
width = 200
X = curand((height, width), np.float32)
print(X.flags.c_contiguous)
print(type(X))
Example #51
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel
from pycuda.curandom import rand as curand
import numpy

n = 1000000
a = curand(n, dtype=numpy.float32)
b = curand(n, dtype=numpy.float32)

dotKernel = ReductionKernel(numpy.float32,
                            neutral="0",
                            reduce_expr="a+b",
                            map_expr="x[i]*y[i]",
                            arguments="float *x, float*y")

doot = dotKernel(a, b).get()

print(doot)
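# quick host-side cross-check (a minimal sketch, assuming the imports above)
doot_cpu = numpy.dot(a.get(), b.get())
print(doot_cpu)
print("relative difference:", abs(doot - doot_cpu) / abs(doot_cpu))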
Example #52
s = time()
dC = cumath.log(dA)
e = time()

print('gpu elapsed time: %f \n' % (e-s))

###################
# 3) elementwise kernel
# performs array operations much faster than gpu_array

print('\n elementwise kernel\n')
print('---------------------\n')

from pycuda.curandom import rand as curand

a_gpu = curand((1000,))
b_gpu = curand((1000,))

from pycuda.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

c_gpu = gpuarray.empty_like(a_gpu)

s = time()
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
e = time()
print('elementwise kernel elapsed time: %f \n' % (e-s))
Example #53
# Element wise add operation

from __future__ import absolute_import
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit

import torch
import numpy

from pycuda.curandom import rand as curand

# Vector size
N = 10000

a_gpu = curand((N, ))
b_gpu = 1 - a_gpu

c_cpu = torch.cuda.FloatTensor(N)

from pycuda.elementwise import ElementwiseKernel
func_kernel = ElementwiseKernel("float *a, float *b, float *c",
                                "c[i] = a[i] + b[i]", "add")

c_gpu = gpuarray.empty_like(a_gpu)

func_kernel(a_gpu, b_gpu, c_gpu)

# Copy result to host
#cuda.memcpy_dtoh(c_cpu, c_gpu)
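# host-side check (a minimal sketch, assuming numpy as imported above):
# the same additions on the CPU should reproduce the GPU result
assert numpy.allclose(c_gpu.get(), a_gpu.get() + b_gpu.get())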
Example #54
mod = SourceModule(source)
get_energy = mod.get_function("energy")
polKroku = mod.get_function("polKroku")
fupdate = mod.get_function("fupdate")
leapfrog = mod.get_function("leapfrog")
repopulate = mod.get_function("repopulate")
#sila = mod.get_function("sila")

# Initialize data
t = 0
particles = []
velocities = []
energy = []
celllist = {}
# random velocities
px = curand((stale.particleNumber, )).get().astype(np.float32)
py = curand((stale.particleNumber, )).get().astype(np.float32)
# velocity distribution around 0, not 0.5
px = px - 0.5
py = py - 0.5

# Here we have energy, not velocity ([XXX] needs correction)
v = np.zeros((stale.particleNumber, )).astype(np.float32)
rx = np.zeros((stale.particleNumber, )).astype(np.float32)
ry = np.zeros((stale.particleNumber, )).astype(np.float32)
fx = np.zeros((stale.particleNumber, )).astype(np.float32)
fy = np.zeros((stale.particleNumber, )).astype(np.float32)

# Initializing a list of neighbors (structure)
# It reduces complexity from O(N^2) to O(N)
nl = (-1) * np.ones((stale.particleNumber, stale.rn)).astype(np.float32)
Example #55
    def test_struct_reduce(self):
        preamble = """
        struct minmax_collector
        {
            float cur_min;
            float cur_max;

            __device__
            minmax_collector()
            { }

            __device__
            minmax_collector(float cmin, float cmax)
            : cur_min(cmin), cur_max(cmax)
            { }

            __device__ minmax_collector(minmax_collector const &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector(minmax_collector const volatile &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector volatile &operator=(
                minmax_collector const &src) volatile
            {
                cur_min = src.cur_min;
                cur_max = src.cur_max;
                return *this;
            }
        };

        __device__
        minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
        {
            return minmax_collector(
                fminf(a.cur_min, b.cur_min),
                fmaxf(a.cur_max, b.cur_max));
        }
        """
        mmc_dtype = np.dtype([("cur_min", np.float32), ("cur_max", np.float32)])

        from pycuda.curandom import rand as curand
        a_gpu = curand((20000,), dtype=np.float32)
        a = a_gpu.get()

        from pycuda.tools import register_dtype
        register_dtype(mmc_dtype, "minmax_collector")

        from pycuda.reduction import ReductionKernel
        red = ReductionKernel(mmc_dtype,
                neutral="minmax_collector(10000, -10000)",
                # FIXME: needs infinity literal in real use, ok here
                reduce_expr="agg_mmc(a, b)", map_expr="minmax_collector(x[i], x[i])",
                arguments="float *x", preamble=preamble)

        minmax = red(a_gpu).get()
        #print minmax["cur_min"], minmax["cur_max"]
        #print np.min(a), np.max(a)

        assert minmax["cur_min"] == np.min(a)
        assert minmax["cur_max"] == np.max(a)
Example #56
import pycuda.autoinit
from pycuda.reduction import ReductionKernel
import numpy

dot = ReductionKernel(dtype_out=numpy.float32,
                      neutral="0",
                      reduce_expr="a+b",
                      map_expr="x[i]*y[i]",
                      arguments="const float *x, const float *y")

from pycuda.curandom import rand as curand

x = curand((1000 * 1000), dtype=numpy.float32)
y = curand((1000 * 1000), dtype=numpy.float32)
x_dot_y = dot(x, y).get()
x_dot_y_cpu = numpy.dot(x.get(), y.get())
print(x_dot_y)
print(x_dot_y_cpu)
Example #57
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a_gpu = curand((50, ))
b_gpu = curand((50, ))

from pycuda.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel("float a, float *x, float b, float *y, float *z",
                             "z[i] = my_f(a*x[i], b*y[i])",
                             "linear_combination",
                             preamble="""
        __device__ float my_f(float x, float y)
        { 
          return x + y;
        }
        """)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

print(c_gpu)
#print (5*a_gpu+6*b_gpu)
#import numpy.linalg as la
#assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
Example #58
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

add = ElementwiseKernel("float *a, float *b, float *c", "c[i] = a[i] + b[i]",
                        "add")

shape = 128, 1024
a_gpu = curand(shape)
b_gpu = curand(shape)

c_gpu = gpuarray.empty_like(a_gpu)
add(a_gpu, b_gpu, c_gpu)

print(np.max(np.abs(c_gpu.get() - a_gpu.get() - b_gpu.get())))
Example #59
import numpy
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from pycuda.curandom import rand as curand

n = 1024
m = 512
l = 128

matMultKernel = """
__global__ void mat_mult(float *a, float *b, float *c) {
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;

	for(int k = 0; k < %(EME)s; k++)
		c[y + x * %(ELE)s] += a[k + x * %(EME)s] * b[y + k * %(ELE)s];
}
"""

a_gpu = curand((n,m))
b_gpu = curand((m,l))

c_gpu = gpuarray.zeros((n,l), dtype=numpy.float32)

matMultKernel = matMultKernel % {
		"EME" : m,
		"ELE" : l
}

mod = SourceModule(matMultKernel)
mat_mult = mod.get_function("mat_mult")

mat_mult(
		a_gpu, b_gpu,
		c_gpu,