Ejemplo n.º 1
    def test_copy(self):
        """Check that strided GPU slices copy back to the host like NumPy slices.

        Covers 2-d, column-vector, 3-d, transposed 3-d and 4-d random arrays.
        For each case the sliced GPU array is fetched with ``.get()`` and
        compared against the same slice of the host copy.
        """
        from pycuda.curandom import rand as curand

        all_steps = [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]

        # Slice along the leading axis only.
        for shape in [(3, 3), (3, 1)]:
            a_gpu = curand(shape)
            # The host reference does not change inside the loop: fetch it once.
            a = a_gpu.get()
            for start, stop, step in all_steps:
                sl = slice(start, stop, step)
                assert np.allclose(a_gpu[sl].get(), a[sl])

        # Slice the two leading axes of a 3-d array.
        a_gpu = curand((3, 3, 3))
        a = a_gpu.get()
        for start, stop, step in all_steps:
            sl = slice(start, stop, step)
            assert np.allclose(a_gpu[sl, sl].get(), a[sl, sl])

        # Slice the first and last axes of a transposed 3-d array.
        a_gpu = curand((3, 3, 3)).transpose((1, 2, 0))
        a = a_gpu.get()
        for start, stop, step in all_steps:
            sl = slice(start, stop, step)
            assert np.allclose(a_gpu[sl, :, sl].get(), a[sl, :, sl])

        # 4-d should work as long as only 2 axes are discontiguous
        a_gpu = curand((3, 3, 3, 3))
        a = a_gpu.get()
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 3)]:
            sl = slice(start, stop, step)
            assert np.allclose(a_gpu[sl, :, sl].get(), a[sl, :, sl])
Ejemplo n.º 2
    def test_dot(self):
        """Compare the GPU dot product of two random vectors with numpy.dot."""
        from pycuda.curandom import rand as curand

        length = 200000
        x_gpu = curand((length,))
        x_host = x_gpu.get()
        y_gpu = curand((length,))
        y_host = y_gpu.get()

        expected = numpy.dot(x_host, y_host)
        computed = gpuarray.dot(x_gpu, y_gpu).get()

        # Relative error must stay below 1e-4.
        relative_error = abs(computed - expected) / abs(expected)
        assert relative_error < 1e-4
Ejemplo n.º 3
    def test_insert_columns(self):
        """Insert a random block Y into X at a random column offset and verify it.

        Repeated 20 times with randomly drawn dtype, matrix size, block width
        and offset.
        """
        trials = 20
        for _trial in range(trials):
            dtype = random.choice((np.float32, np.float64))
            n_rows = np.random.randint(100, 1000)
            n_cols = np.random.randint(100, 1000)
            width = np.random.randint(1, n_cols)
            offset = np.random.randint(0, n_cols - width)

            X = curand((n_rows, n_cols), dtype)
            Y = curand((n_rows, width), dtype)
            insert_columns(Y, X, offset)

            # The columns [offset, offset + width) of X must now equal Y.
            window = X.get()[:, offset:offset + width]
            self.assertTrue(np.all(window == Y.get()))
Ejemplo n.º 4
def main():
    """Benchmark the GPU dot-product reduction kernel over a range of sizes.

    For float32 and float64 output types and vector lengths 2**15 .. 2**26,
    runs the kernel ``cnt`` times, timing each launch with CUDA events, and
    prints a table with the average time per run and the resulting memory
    bandwidth figure.
    """
    from pytools import Table
    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            # Single-argument print() behaves the same on Python 2 and 3.
            print(sz)

            from pycuda.curandom import rand as curand
            a_gpu = curand((sz,))
            b_gpu = curand((sz,))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_dot_kernel
            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            # One-element list so the nested closure can accumulate into it.
            elapsed = [0]

            def wrap_with_timer(f):
                """Wrap a kernel launch so its duration is added to elapsed[0]."""
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    # Events must be recorded around the launch and the stop
                    # event synchronized before the elapsed time can be read.
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10

            for i in range(cnt):
                krnl(a_gpu, b_gpu, kernel_wrapper=wrap_with_timer)

            # Two input vectors are read on every run.
            bytes_moved = a_gpu.nbytes*2*cnt
            secs = elapsed[0]*1e-3

            tbl.add_row((str(dtype_out), a_gpu.nbytes/(1<<20), elapsed[0]/cnt, bytes_moved/secs/1e9))

    print(tbl)
Ejemplo n.º 5
    def test_dot(self):
        """Check gpuarray.dot against np.dot for a spread of vector lengths."""
        from pycuda.curandom import rand as curand

        lengths = [2, 3, 4, 5, 6, 7, 31, 32, 33, 127, 128, 129, 255, 256, 257, 16384 - 993, 20000]
        for n in lengths:
            x_gpu = curand((n,))
            x = x_gpu.get()
            y_gpu = curand((n,))
            y = y_gpu.get()

            reference = np.dot(x, y)
            computed = gpuarray.dot(x_gpu, y_gpu).get()

            # Relative error must stay below 1e-4 for every length.
            relative_error = abs(computed - reference) / abs(reference)
            assert relative_error < 1e-4
Ejemplo n.º 6
    def test_elwise_kernel(self):
        """Run a custom linear-combination ElementwiseKernel and verify its output.

        Computes ``z = 5*x + 6*y`` with the kernel and compares it with the
        same expression evaluated through GPUArray arithmetic.
        """
        from pycuda.curandom import rand as curand

        a_gpu = curand((50,))
        b_gpu = curand((50,))

        from pycuda.elementwise import ElementwiseKernel
        # Arguments: C argument list, per-element operation, kernel name.
        lin_comb = ElementwiseKernel(
                "float a, float *x, float b, float *y, float *z",
                "z[i] = a*x[i] + b*y[i]",
                "linear_combination")

        c_gpu = gpuarray.empty_like(a_gpu)
        lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

        assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
Ejemplo n.º 7
    def test_subset_minmax(self):
        """Check gpuarray.subset_min against NumPy on an index subset.

        The subset keeps index 0 and then every index that is not a multiple
        of ``gran``.
        """
        from pycuda.curandom import rand as curand

        l_a = 200000
        gran = 5
        l_m = l_a - l_a // gran + 1

        # Only include float64 when has_double_support() reports it.
        if has_double_support():
            dtypes = [np.float64, np.float32, np.int32]
        else:
            dtypes = [np.float32, np.int32]

        for dtype in dtypes:
            a_gpu = curand((l_a,), dtype)
            a = a_gpu.get()

            meaningful_indices_gpu = gpuarray.zeros(l_m, dtype=np.int32)
            meaningful_indices = meaningful_indices_gpu.get()
            j = 0
            for i in range(len(meaningful_indices)):
                meaningful_indices[i] = j
                j = j + 1
                # Skip indices that are multiples of gran.
                if j % gran == 0:
                    j = j + 1

            meaningful_indices_gpu = gpuarray.to_gpu(meaningful_indices)
            b = a[meaningful_indices]

            min_a = np.min(b)
            min_a_gpu = gpuarray.subset_min(meaningful_indices_gpu, a_gpu).get()

            assert min_a_gpu == min_a
Ejemplo n.º 8
    def test_transpose(self):
        """Check GPUArray.T against NumPy's .T on a random 3-d array."""
        import pycuda.gpuarray as gpuarray
        from pycuda.curandom import rand as curand

        device_array = curand((10, 20, 30))
        host_array = device_array.get()

        # A (1,2,0) axis permutation is not checked here: per the original
        # note its result is not contiguous.
        transposed = device_array.T.get()
        assert np.allclose(transposed, host_array.T)
Ejemplo n.º 9
    def test_sum(self):
        """Compare gpuarray.sum of a random vector with np.sum."""
        from pycuda.curandom import rand as curand

        device_vec = curand((200000,))
        host_vec = device_vec.get()

        expected = np.sum(host_vec)
        computed = gpuarray.sum(device_vec).get()

        # Relative error must stay below 1e-4.
        relative_error = abs(computed - expected) / abs(expected)
        assert relative_error < 1e-4
Ejemplo n.º 10
    def test_if_positive(self):
        """Check gpuarray.maximum / gpuarray.minimum of two arrays against NumPy."""
        from pycuda.curandom import rand as curand

        size = 20
        lhs_gpu = curand((size,))
        rhs_gpu = curand((size,))
        lhs = lhs_gpu.get()
        rhs = rhs_gpu.get()

        import pycuda.gpuarray as gpuarray

        gpu_max = gpuarray.maximum(lhs_gpu, rhs_gpu)
        gpu_min = gpuarray.minimum(lhs_gpu, rhs_gpu)

        # Print the GPU maximum followed by the NumPy reference.
        print(gpu_max)
        print(np.maximum(lhs, rhs))

        # Results must match NumPy exactly.
        max_diff = gpu_max.get() - np.maximum(lhs, rhs)
        assert la.norm(max_diff) == 0
        min_diff = gpu_min.get() - np.minimum(lhs, rhs)
        assert la.norm(min_diff) == 0
Ejemplo n.º 11
    def test_view_and_strides(self):
        """A view of a 2-d sub-block keeps its shape, strides and contents."""
        from pycuda.curandom import rand as curand

        full = curand((5, 10), dtype=np.float32)
        block = full[:3, :5]
        block_view = block.view()

        assert block_view.shape == block.shape
        assert block_view.strides == block.strides

        expected = full.get()[:3, :5]
        assert np.array_equal(block_view.get(), expected)
Ejemplo n.º 12
    def test_sum(self):
        """Compare gpuarray.sum of a random vector with numpy.sum."""
        from pycuda.curandom import rand as curand

        device_vec = curand((200000,))
        host_vec = device_vec.get()

        expected = numpy.sum(host_vec)

        # Import kept from the original; the name itself is not used below.
        from pycuda.reduction import get_sum_kernel
        computed = gpuarray.sum(device_vec).get()

        relative_error = abs(computed - expected) / abs(expected)
        assert relative_error < 1e-4
Ejemplo n.º 13
    def test_complex_bits(self):
        """Check .real, .imag and .conj() of complex GPU arrays against NumPy."""
        from pycuda.curandom import rand as curand

        # Only include complex128 when has_double_support() reports it.
        if has_double_support():
            dtypes = [np.complex64, np.complex128]
        else:
            dtypes = [np.complex64]

        n = 20
        for tp in dtypes:
            dtype = np.dtype(tp)
            from pytools import match_precision
            real_dtype = match_precision(np.dtype(np.float64), dtype)

            # Build a random complex vector from two random real vectors.
            z = (curand((n,), real_dtype).astype(dtype)
                    + 1j*curand((n,), real_dtype).astype(dtype))

            assert la.norm(z.get().real - z.real.get()) == 0
            assert la.norm(z.get().imag - z.imag.get()) == 0
            assert la.norm(z.get().conj() - z.conj().get()) == 0
Ejemplo n.º 14
    def test_astype(self):
        """Check GPUArray.astype between float32 and float64 against NumPy."""
        from pycuda.curandom import rand as curand

        # Both conversions below involve float64, so there is nothing to test
        # when has_double_support() is false.
        if not has_double_support():
            return

        a_gpu = curand((2000, ), dtype=np.float32)

        a = a_gpu.get().astype(np.float64)
        a2 = a_gpu.astype(np.float64).get()

        assert a2.dtype == np.float64
        assert la.norm(a - a2) == 0, (a, a2)

        a_gpu = curand((2000, ), dtype=np.float64)

        a = a_gpu.get().astype(np.float32)
        a2 = a_gpu.astype(np.float32).get()

        assert a2.dtype == np.float32
        # Narrowing is compared with a relative tolerance rather than exactly.
        assert la.norm(a - a2) / la.norm(a) < 1e-7
Ejemplo n.º 15
    def test_newaxis(self):
        """Indexing with np.newaxis yields the same shape and strides as NumPy."""
        import pycuda.gpuarray as gpuarray
        from pycuda.curandom import rand as curand

        device_array = curand((10, 20, 30))
        host_array = device_array.get()

        index = (slice(None), np.newaxis)
        expanded_gpu = device_array[index]
        expanded_host = host_array[index]

        assert expanded_gpu.shape == expanded_host.shape
        assert expanded_gpu.strides == expanded_host.strides
Ejemplo n.º 16
    def test_newaxis(self):
        """Inserting an axis via np.newaxis must match NumPy's shape and strides."""
        import pycuda.gpuarray as gpuarray
        from pycuda.curandom import rand as curand

        dev = curand((10, 20, 30))
        host = dev.get()

        # Insert a new length-1 axis after the first dimension on both sides.
        dev_expanded = dev[:, np.newaxis]
        host_expanded = host[:, np.newaxis]

        gpu_layout = (dev_expanded.shape, dev_expanded.strides)
        assert gpu_layout[0] == host_expanded.shape
        assert gpu_layout[1] == host_expanded.strides
Ejemplo n.º 17
    def test_view_and_strides(self):
        """A view of a sub-block keeps shape and strides; contents are expected to differ."""
        from pycuda.curandom import rand as curand

        full = curand((5, 10), dtype=np.float32)
        block = full[:3, :5]
        block_view = block.view()

        assert block_view.shape == block.shape
        assert block_view.strides == block.strides

        # The element-wise comparison is expected to fail here.
        with pytest.raises(AssertionError):
            matches = block_view.get() == full.get()[:3, :5]
            assert matches.all()
Ejemplo n.º 18
    def test_astype(self):
        """Check GPUArray.astype between float32 and float64 against NumPy."""
        from pycuda.curandom import rand as curand

        # Both conversions below involve float64, so skip the whole test when
        # has_double_support() is false.
        if not has_double_support():
            return

        a_gpu = curand((2000,), dtype=np.float32)

        a = a_gpu.get().astype(np.float64)
        a2 = a_gpu.astype(np.float64).get()

        assert a2.dtype == np.float64
        assert la.norm(a - a2) == 0, (a, a2)

        a_gpu = curand((2000,), dtype=np.float64)

        a = a_gpu.get().astype(np.float32)
        a2 = a_gpu.astype(np.float32).get()

        assert a2.dtype == np.float32
        # Narrowing is compared with a relative tolerance rather than exactly.
        assert la.norm(a - a2)/la.norm(a) < 1e-7
Ejemplo n.º 19
    def test_view_and_strides(self):
        """A view of a sub-block keeps shape and strides; contents are expected to differ."""
        from pycuda.curandom import rand as curand

        source = curand((5, 10), dtype=np.float32)
        sub_block = source[:3, :5]
        viewed = sub_block.view()

        assert viewed.shape == sub_block.shape
        assert viewed.strides == sub_block.strides

        import pytest
        # The element-wise comparison is expected to fail here.
        with pytest.raises(AssertionError):
            same = viewed.get() == source.get()[:3, :5]
            assert same.all()
Ejemplo n.º 20
    def test_random(self):
        """Random numbers from curand must lie in the half-open interval [0, 1)."""
        from pycuda.curandom import rand as curand

        # Only include float64 when has_double_support() reports it.
        if has_double_support():
            dtypes = [np.float32, np.float64]
        else:
            dtypes = [np.float32]

        for dtype in dtypes:
            a = curand((10, 100), dtype=dtype).get()

            assert (0 <= a).all()
            assert (a < 1).all()
Ejemplo n.º 21
    def test_random(self):
        """Check that curand samples fall in [0, 1) for each tested dtype."""
        from pycuda.curandom import rand as curand

        # Without the else branch the double-precision list would always be
        # overwritten by the float32-only list.
        if has_double_support():
            dtypes = [np.float32, np.float64]
        else:
            dtypes = [np.float32]

        for dtype in dtypes:
            a = curand((10, 100), dtype=dtype).get()

            assert (0 <= a).all()
            assert (a < 1).all()
Ejemplo n.º 22
    def test_complex_bits(self):
        """Check .real, .imag, .conj() and their contiguity on complex GPU arrays."""
        from pycuda.curandom import rand as curand

        # Only include complex128 when has_double_support() reports it.
        if has_double_support():
            dtypes = [np.complex64, np.complex128]
        else:
            dtypes = [np.complex64]

        n = 20
        for tp in dtypes:
            dtype = np.dtype(tp)
            from pytools import match_precision

            real_dtype = match_precision(np.dtype(np.float64), dtype)

            # Build a random complex vector from two random real vectors.
            z = curand((n, ), real_dtype).astype(dtype) + 1j * curand(
                (n, ), real_dtype).astype(dtype)

            assert la.norm(z.get().real - z.real.get()) == 0
            assert la.norm(z.get().imag - z.imag.get()) == 0
            assert la.norm(z.get().conj() - z.conj().get()) == 0

            # verify contiguity is preserved
            for order in ["C", "F"]:
                # test both zero and non-zero value code paths
                z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order)
                z2 = z.reshape(z.shape, order=order)
                for zdata in [z_real, z2]:
                    if order == "C":
                        assert zdata.flags.c_contiguous
                        assert zdata.real.flags.c_contiguous
                        assert zdata.imag.flags.c_contiguous
                        assert zdata.conj().flags.c_contiguous
                    elif order == "F":
                        assert zdata.flags.f_contiguous
                        assert zdata.real.flags.f_contiguous
                        assert zdata.imag.flags.f_contiguous
                        assert zdata.conj().flags.f_contiguous
Ejemplo n.º 23
    def test_complex_bits(self):
        """Check .real, .imag, .conj() and their contiguity on complex GPU arrays."""
        from pycuda.curandom import rand as curand

        # Only include complex128 when has_double_support() reports it.
        if has_double_support():
            dtypes = [np.complex64, np.complex128]
        else:
            dtypes = [np.complex64]

        n = 20
        for tp in dtypes:
            dtype = np.dtype(tp)
            from pytools import match_precision
            real_dtype = match_precision(np.dtype(np.float64), dtype)

            # Build a random complex vector from two random real vectors.
            z = (curand((n,), real_dtype).astype(dtype)
                    + 1j*curand((n,), real_dtype).astype(dtype))

            assert la.norm(z.get().real - z.real.get()) == 0
            assert la.norm(z.get().imag - z.imag.get()) == 0
            assert la.norm(z.get().conj() - z.conj().get()) == 0

            # verify contiguity is preserved
            for order in ["C", "F"]:
                # test both zero and non-zero value code paths
                z_real = gpuarray.zeros(z.shape, dtype=real_dtype,
                                        order=order)
                z2 = z.reshape(z.shape, order=order)
                for zdata in [z_real, z2]:
                    if order == "C":
                        assert zdata.flags.c_contiguous
                        assert zdata.real.flags.c_contiguous
                        assert zdata.imag.flags.c_contiguous
                        assert zdata.conj().flags.c_contiguous
                    elif order == "F":
                        assert zdata.flags.f_contiguous
                        assert zdata.real.flags.f_contiguous
                        assert zdata.imag.flags.f_contiguous
                        assert zdata.conj().flags.f_contiguous
Ejemplo n.º 24
    def test_reduce_out(self):
        """Reduce each row to its maximum, writing into slices of one output array."""
        from pycuda.curandom import rand as curand
        a_gpu = curand((10, 200), dtype=np.float32)
        a = a_gpu.get()

        from pycuda.reduction import ReductionKernel
        # reduce_expr is a required argument; "max(a,b)" matches the row-wise
        # maximum that is checked below.
        red = ReductionKernel(np.float32, neutral=0,
                              reduce_expr="max(a,b)",
                              arguments="float *in")
        max_gpu = gpuarray.empty(10, dtype=np.float32)
        for i in range(10):
            # Each row's result lands in the matching element of max_gpu.
            red(a_gpu[i], out=max_gpu[i])

        # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
        assert np.all(a.max(axis=1) == max_gpu.get())
Ejemplo n.º 25
    def test_minimum_maximum_scalar(self):
        """gpuarray.maximum/minimum with a scalar operand must match NumPy exactly."""
        from pycuda.curandom import rand as curand

        count = 20
        device_vec = curand((count,))
        host_vec = device_vec.get()

        import pycuda.gpuarray as gpuarray

        # The scalar is the second operand for maximum, the first for minimum.
        clipped_low = gpuarray.maximum(device_vec, 0)
        clipped_high = gpuarray.minimum(0, device_vec)

        low_diff = clipped_low.get() - np.maximum(host_vec, 0)
        assert la.norm(low_diff) == 0
        high_diff = clipped_high.get() - np.minimum(0, host_vec)
        assert la.norm(high_diff) == 0
Ejemplo n.º 26
    def test_extract_columns(self):
        """Extract a random column range [a, b) from X and compare with NumPy slicing.

        Repeated 20 times with randomly drawn dtype, matrix size and range.
        """
        for _ in range(20):
            dtype = random.choice((np.float32, np.float64))
            N = np.random.randint(100, 1000)
            M = np.random.randint(100, 1000)
            # Keep a <= M - 2: with a == M - 1 the draw for b below would be
            # randint(M, M), which raises ValueError (empty range).
            a = np.random.randint(0, M - 1)
            b = np.random.randint(a + 1, M)
            m = b - a
            assert m > 0

            X = curand((N, M), dtype)
            Y = extract_columns(X, a, b)

            self.assertTrue(np.all(X.get()[:, a:b] == Y.get()))
Ejemplo n.º 27
    def test_extract_columns(self):
        """Check extract_columns(X, a, b) against X[:, a:b] for random ranges.

        Runs 20 trials with random dtype, matrix dimensions and column range.
        """
        for _ in range(20):
            dtype = random.choice((np.float32, np.float64))
            N = np.random.randint(100, 1000)
            M = np.random.randint(100, 1000)
            # The upper bound of randint is exclusive, so a must stay below
            # M - 1; otherwise randint(a + 1, M) has an empty range and raises
            # ValueError, making the test fail at random.
            a = np.random.randint(0, M - 1)
            b = np.random.randint(a + 1, M)
            m = b - a
            assert m > 0

            X = curand((N, M), dtype)
            Y = extract_columns(X, a, b)

            self.assertTrue(np.all(X.get()[:, a:b] == Y.get()))
Ejemplo n.º 28
    def test_minimum_maximum_scalar(self):
        """Compare gpuarray.maximum/minimum against NumPy when one operand is 0."""
        from pycuda.curandom import rand as curand

        n_items = 20
        dev = curand((n_items, ))
        host = dev.get()

        import pycuda.gpuarray as gpuarray

        gpu_max = gpuarray.maximum(dev, 0)
        gpu_min = gpuarray.minimum(0, dev)

        # Both results must agree with NumPy exactly (zero difference norm).
        max_error = la.norm(gpu_max.get() - np.maximum(host, 0))
        assert max_error == 0
        min_error = la.norm(gpu_min.get() - np.minimum(0, host))
        assert min_error == 0
Ejemplo n.º 29
    def test_copy(self):
        """Check that strided GPU slices copy back to the host like NumPy slices.

        Covers 2-d, column-vector, 3-d, transposed 3-d and 4-d random arrays.
        """
        from pycuda.curandom import rand as curand

        a_gpu = curand((3, 3))

        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]
            )

        a_gpu = curand((3, 1))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]
            )

        a_gpu = curand((3, 3, 3))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step, start:stop:step].get(),
                a_gpu.get()[start:stop:step, start:stop:step],
            )

        a_gpu = curand((3, 3, 3)).transpose((1, 2, 0))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step, :, start:stop:step].get(),
                a_gpu.get()[start:stop:step, :, start:stop:step],
            )

        # 4-d should work as long as only 2 axes are discontiguous
        a_gpu = curand((3, 3, 3, 3))
        for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 3)]:
            assert np.allclose(
                a_gpu[start:stop:step, :, start:stop:step].get(),
                a_gpu.get()[start:stop:step, :, start:stop:step],
            )
Ejemplo n.º 30
    def test_slice(self):
        """Random contiguous slices of a 1-d GPU array must equal the NumPy slices."""
        from pycuda.curandom import rand as curand

        length = 20000
        device_vec = curand((length, ))
        host_vec = device_vec.get()

        from random import randrange
        for _ in range(200):
            lo = randrange(length)
            hi = randrange(lo, length)

            window = slice(lo, hi)
            difference = device_vec[window].get() - host_vec[window]

            assert la.norm(difference) == 0
Ejemplo n.º 31
    def test_slice(self):
        """Check 200 random [start:end] slices of a GPU vector against NumPy."""
        from pycuda.curandom import rand as curand

        n = 20000
        dev = curand((n,))
        host = dev.get()

        from random import randrange
        for _trial in range(200):
            first = randrange(n)
            last = randrange(first, n)

            dev_part = dev[first:last]
            host_part = host[first:last]

            # The slices must match exactly.
            mismatch = la.norm(dev_part.get() - host_part)
            assert mismatch == 0
Ejemplo n.º 32
    def test_2d_slice_c(self):
        """Random row-range slices of a 2-d GPU array must equal the NumPy slices."""
        from pycuda.curandom import rand as curand

        rows = 1000
        cols = 300
        device_mat = curand((rows, cols))
        host_mat = device_mat.get()

        from random import randrange
        for _ in range(200):
            lo = randrange(rows)
            hi = randrange(lo, rows)

            row_range = slice(lo, hi)
            difference = device_mat[row_range].get() - host_mat[row_range]

            assert la.norm(difference) == 0
Ejemplo n.º 33
    def test_minmax(self):
        """Check gpuarray.min and gpuarray.max against NumPy for several dtypes."""
        from pycuda.curandom import rand as curand

        # Only include float64 when has_double_support() reports it.
        if has_double_support():
            dtypes = [np.float64, np.float32, np.int32]
        else:
            dtypes = [np.float32, np.int32]

        for what in ["min", "max"]:
            for dtype in dtypes:
                a_gpu = curand((200000, ), dtype)
                a = a_gpu.get()

                # Look up the reduction by name in both numpy and gpuarray.
                op_a = getattr(np, what)(a)
                op_a_gpu = getattr(gpuarray, what)(a_gpu).get()

                assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
Ejemplo n.º 34
    def test_minmax(self):
        """Compare the GPU min/max reductions with their NumPy counterparts."""
        from pycuda.curandom import rand as curand

        # Without the else branch the float64 list would always be replaced
        # by the reduced list.
        if has_double_support():
            dtypes = [np.float64, np.float32, np.int32]
        else:
            dtypes = [np.float32, np.int32]

        for what in ["min", "max"]:
            for dtype in dtypes:
                a_gpu = curand((200000,), dtype)
                a = a_gpu.get()

                # The same function name is resolved on numpy and on gpuarray.
                op_a = getattr(np, what)(a)
                op_a_gpu = getattr(gpuarray, what)(a_gpu).get()

                assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
Ejemplo n.º 35
    def test_2d_slice_c(self):
        """Check 200 random row slices of a 2-d GPU array against NumPy."""
        from pycuda.curandom import rand as curand

        n_rows, n_cols = 1000, 300
        dev = curand((n_rows, n_cols))
        host = dev.get()

        from random import randrange
        for _trial in range(200):
            first = randrange(n_rows)
            last = randrange(first, n_rows)

            dev_rows = dev[first:last]
            host_rows = host[first:last]

            # The slices must match exactly.
            mismatch = la.norm(dev_rows.get() - host_rows)
            assert mismatch == 0
Ejemplo n.º 36
    def test_2d_slice_f(self):
        """Random column-range slices of an order="F" GPU array must match NumPy."""
        from pycuda.curandom import rand as curand
        import pycuda.gpuarray as gpuarray

        n = 1000
        m = 300
        backing = curand((n, m))
        # Wrap the same device buffer as an (m, n) array with order="F".
        device_f = gpuarray.GPUArray(
            (m, n), np.float32, gpudata=backing.gpudata, order="F"
        )
        host_f = device_f.get()

        from random import randrange

        for _ in range(200):
            lo = randrange(n)
            hi = randrange(lo, n)

            difference = device_f[:, lo:hi].get() - host_f[:, lo:hi]

            assert la.norm(difference) == 0
Ejemplo n.º 37
    def test_2d_slice_f(self):
        """Random column-range slices of an order="F" GPU array must match NumPy."""
        from pycuda.curandom import rand as curand
        import pycuda.gpuarray as gpuarray

        n = 1000
        m = 300
        a_gpu = curand((n, m))
        # Wrap the same device buffer as an (m, n) array with order="F".
        a_gpu_f = gpuarray.GPUArray((m, n), np.float32,
                                    gpudata=a_gpu.gpudata,
                                    order="F")
        a = a_gpu_f.get()

        from random import randrange
        for i in range(200):
            start = randrange(n)
            end = randrange(start, n)

            a_gpu_slice = a_gpu_f[:, start:end]
            a_slice = a[:, start:end]

            assert la.norm(a_gpu_slice.get()-a_slice) == 0
Ejemplo n.º 38
def main():
    """Benchmark the GPU dot-product reduction kernel over a range of sizes.

    For float32 and float64 output types and vector lengths 2**15 .. 2**26,
    runs the kernel ``cnt`` times, timing each launch with CUDA events, and
    prints a table with the average time per run and the resulting memory
    bandwidth figure.
    """
    from pytools import Table

    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex

            from pycuda.curandom import rand as curand

            a_gpu = curand((sz, ))
            b_gpu = curand((sz, ))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_dot_kernel

            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            # One-element list so the nested closure can accumulate into it.
            elapsed = [0]

            def wrap_with_timer(f):
                """Wrap a kernel launch so its duration is added to elapsed[0]."""
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    # Events must be recorded around the launch and the stop
                    # event synchronized before the elapsed time can be read.
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10

            for i in range(cnt):
                krnl(a_gpu, b_gpu, kernel_wrapper=wrap_with_timer)

            # Two input vectors are read on every run.
            bytes_moved = a_gpu.nbytes * 2 * cnt
            secs = elapsed[0] * 1e-3

            tbl.add_row((
                str(dtype_out),
                a_gpu.nbytes / (1 << 20),
                elapsed[0] / cnt,
                bytes_moved / secs / 1e9,
            ))

    print(tbl)

# Element-wise linear combination z = a*x + b*y on two random GPU vectors.
# NOTE(review): `gpuarray` and `la` are used below but not imported in this
# snippet; presumably they come from earlier in the file, so confirm.
from pycuda.curandom import rand as curand    # import CUDA random number module

a_gpu = curand((50,))                         # create a 1-d array with random number
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel  # import ElementwiseKernel module

# specify the detail of element-wise operation
lin_comb = ElementwiseKernel(
    " float a, float *x, float b, float *y, float *z",
    "z[i] = a*x[i] + b*y[i]")

c_gpu = gpuarray.empty_like(a_gpu)            # create a GPU array of same size
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)           # run the ElementwiseKernel function
assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5

# Single-argument print() behaves the same on Python 2 and Python 3.
print(a_gpu)
print(b_gpu)
print(c_gpu)
Ejemplo n.º 40
# Element-wise kernel that calls a device helper function defined in a preamble.
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a_gpu = curand((50,))
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel
# The preamble is emitted before the kernel, so my_f is visible to the
# per-element operation.
lin_comb = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *z",
        "z[i] = my_f(a*x[i], b*y[i])",
        "linear_combination",
        preamble="""
        __device__ float my_f(float x, float y)
        {
          return x + y;
        }
        """)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

# Single-argument print() behaves the same on Python 2 and Python 3.
print(c_gpu)
#print (5*a_gpu+6*b_gpu)
#import numpy.linalg as la
#assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
# Naive n x n matrix multiplication with a hand-written CUDA kernel.
# NOTE(review): `gpuarray` and `SourceModule` are used below but not imported
# in this snippet; presumably they come from earlier in the file, so confirm.
from pycuda.curandom import rand as curand
import numpy

n = 1024

# %(ENE)s is substituted with the matrix dimension before compilation.
matMultKernel = """
__global__ void mat_mult(float *a, float *b, float *c) {
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;

	for(int k = 0; k < %(ENE)s; k++)
		c[y + x * %(ENE)s] += a[k + x * %(ENE)s] * b[y + k * %(ENE)s];
}
"""

a_gpu = curand((n, n))
b_gpu = curand((n, n))
# The kernel accumulates with +=, so the output starts at zero.
c_gpu = gpuarray.zeros((n, n), dtype=numpy.float32)

matMultKernel = matMultKernel % {
    "ENE": n,
}

mod = SourceModule(matMultKernel)
mat_mult = mod.get_function("mat_mult")

# 32 x 32 threads per block; the grid covers the n x n matrix exactly.
mat_mult(a_gpu, b_gpu, c_gpu, block=(32, 32, 1), grid=(n // 32, n // 32, 1))

print("-" * 80)
Ejemplo n.º 42
# Reverse a random GPU vector with an element-wise kernel.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

# Number of elements in the vector.
n = 1000000

# NOTE(review): the operation string reads `n`, which is not one of the
# declared arguments (the int argument is named `c`); presumably it is the
# size parameter ElementwiseKernel generates, so confirm.
reverseKernel = ElementwiseKernel(
    "float *a, float *b, int c",
    "b[i] = a[n-1-i]",
    "reverse",
)

a_gpu = curand(n)
b_gpu = gpuarray.empty_like(a_gpu)

reverseKernel(a_gpu, b_gpu, n)

# Two separator lines of 80 dashes.
print("-" * 80)
print("-" * 80)
Ejemplo n.º 43
def SSA(update_matrix, initial_conditions, function_rates, t_max,
        **kwargs):  # noqa
    """Prepare and compile the ``ssa_simple`` CUDA kernel and its device arrays.

    The rate expressions in ``function_rates`` are copied and rewritten
    (parameter names replaced by their values), spliced into the
    ``_kernel_str`` template together with the iteration, step and reaction
    counts, and the resulting source is compiled with ``SourceModule``.

    Keys read from ``kwargs``: ``"variables"``, ``"parameters"``,
    ``"iterations"`` and, optionally, ``"print_cuda"``.

    Returns ``time_states_dev``, a float32 GPU array of shape
    ``(iterations, _num_steps, _num_reacs + 1)``.

    NOTE(review): several statements below look truncated (a ``replace`` call
    with one argument, an ``if`` without a statement opening its body, an
    incomplete ``to_gpu`` call, and a kernel launch missing its call
    expression). They are marked individually and need restoring from the
    original source.
    """

    # Fix the maximum number of steps available at each repetition. Should be function of the
    # amount of memory available on the device and the number of iterations (= threads) requested.
    _num_steps = 20
    _num_reacs = len(kwargs["variables"])
    start_time, end_time = np.float32(0), np.float32(t_max)

    # Work on a copy so the caller's rate strings are left untouched.
    function_rates_wo_param = deepcopy(function_rates)
    for fr_id, f_rate in enumerate(function_rates_wo_param):
        # Substitute every parameter name with its value.
        for par, val in kwargs["parameters"].items():
            f_rate = f_rate.replace(par, str(val))
        for sp_id, spec in enumerate(kwargs["variables"]):
            # NOTE(review): str.replace needs two arguments; the search string
            # (presumably `spec`, otherwise unused) appears to be missing.
            f_rate = f_rate.replace(
                "_time_and_states[th_id * (@num__reacs@ + 1) * @num__rep@"
                " + rep * (@num__reacs@ + 1) + 1 + {}]".format(sp_id))

        function_rates_wo_param[fr_id] = f_rate

    # One "_rates_arr[i] = <expression>;" line per rate function.
    unroll_func_rate = "\n".join(
        (f_rate.join(("_rates_arr[{}] = ".format(fr_id), ";"))
         for fr_id, f_rate in enumerate(function_rates_wo_param)))

    # Fill the placeholders of the kernel template.
    kernel_ready = _kernel_str \
        .replace("@unroll__func__rate@", unroll_func_rate) \
        .replace("@num__iter@", str(kwargs["iterations"])) \
        .replace("@num__rep@", str(_num_steps)) \
        .replace("@num__reacs@", str(_num_reacs))

    # NOTE(review): the statement opening this branch (presumably a print of
    # the line-numbered kernel source) appears to be missing.
    if kwargs.get("print_cuda"):
            " ".join((str(line_no + 2), line))
            for line_no, line in enumerate(kernel_ready.split("\n"))))

    upd_mat_dev = gpuarray.to_gpu(update_matrix.astype(np.float32))

    # The vector of initial conditions has to be repeated for each thread, since in the future,
    # when we will split in chunks, each chunk will restart from a different initial condition.
    # NOTE(review): the expression being uploaded appears truncated; only its
    # trailing (iterations, 1) argument is left.
    init_cond_dev = gpuarray.to_gpu(
                (kwargs["iterations"], 1)))

    # Each thread should produce its own array of random numbers or at least have access to a
    # private set of random numbers: we need two numbers for each repetition, one to select the
    # reaction and one to select the time.
    # Note that pycuda.curandom.rand is a toy-random generator, and all the threads share the array.
    # https://documen.tician.de/pycuda/array.html?highlight=random#module-pycuda.curandom
    rand_arr_dev = curand((_num_steps, 2, kwargs["iterations"]))

    # There seems to be no need to manually copy back to host gpuarrays, see example/demo.py.
    time_states_dev = gpuarray.GPUArray(
        (kwargs["iterations"], _num_steps, _num_reacs + 1), dtype=np.float32)

    mod = SourceModule(kernel_ready)
    func = mod.get_function("ssa_simple")
    # NOTE(review): the kernel launch (the call on `func` and its positional
    # arguments) appears to be missing; only the block size is left.
         block=(kwargs["iterations"], 1, 1))

    return time_states_dev
Ejemplo n.º 44
# Element-wise linear combination c = a*x + b*y on two random GPU vectors.
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand
from pycuda.elementwise import ElementwiseKernel
import numpy.linalg as la

input_vector_a = curand((50, ))
input_vector_b = curand((50, ))
mult_coefficient_a = 2
mult_coefficient_b = 5

# Arguments: C argument list, per-element operation, kernel name.
linear_combination = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *c", "c[i] = a*x[i] + b*y[i]",
    "linear_combination")

# The output array is passed as the kernel's last argument.
linear_combination_result = gpuarray.empty_like(input_vector_a)
linear_combination(mult_coefficient_a, input_vector_a,
                   mult_coefficient_b, input_vector_b,
                   linear_combination_result)

print("INPUT VECTOR A =")
print(input_vector_a)

print("INPUT VECTOR B = ")
print(input_vector_b)

# Single-argument print() behaves the same on Python 2 and Python 3.
print(linear_combination_result)

Ejemplo n.º 45
def cuda_mutate(sols,prob_mut, mut_range,min_param,max_param):
    """Mutate the values of the given solutions on the GPU.

    @params sols, probability of mutation, mutation range, min param, max param
    @returns mutated sols as a list of lists, with the cost, misc and age
        genes reset

    Only the genes from column ``constants.META_INFO_COUNT + num_nodes``
    onwards are passed through the mutation step.

    NOTE(review): ``prob_mut``, ``min_param`` and ``max_param`` are not used
    below, and the mutation mask is created as zeros and never filled in, so
    as written no gene value changes. The step that forms the mask from
    ``Mprob_gpu`` appears to be missing and should be confirmed against the
    original source.
    """
    num_sols = len(sols)

    #get number of nodes
    num_nodes = netParams.nodeConfig['I'] + netParams.nodeConfig['H'] + netParams.nodeConfig['O']
    #mutate not on architecture
    mutateFrom = constants.META_INFO_COUNT + num_nodes
    m_range = 2 * mut_range
    #convert to form of numpy arrays
    old_sols = numpy.array(sols[:,mutateFrom:], numpy.float32)
    cost_genes = -numpy.ones((num_sols), numpy.float32)
    contrb_genes = numpy.zeros((num_sols), numpy.float32)
    mutants = numpy.array(sols).astype(numpy.float32)
    age_genes = numpy.zeros((num_sols), numpy.float32)
    #copy to gpu
    sols_gpu = gpuarray.to_gpu(old_sols)
    #get length of the mutable part of a solution
    sol_len = len(old_sols[0])
    #mutation values, scaled from [0, 1) to [-mut_range, mut_range)
    Mvals_gpu = (curand((num_sols,sol_len),numpy.float32) * m_range) - mut_range
    #calculate probabilites of mutation and form mutation mask
    Mprob_gpu = curand((num_sols,sol_len),numpy.float32)  #mutation probabilities
    MutMask_gpu = gpuarray.zeros_like(Mprob_gpu).astype(numpy.float32)
    #-form mutation
    #-mutate genes
    MutSols_gpu = sols_gpu + (MutMask_gpu * Mvals_gpu)
    #get mutated solutions
    mutants[:,mutateFrom:] = MutSols_gpu.get()
    mutants[:,constants.COST_GENE] = cost_genes
    mutants[:,constants.COST2_GENE] = cost_genes
    mutants[:,constants.MISC_GENE] = contrb_genes
    mutants[:,constants.AGE_GENE] = age_genes
    if debug == True:
        # %-formatting keeps the "label value" output identical on Python 2
        # and Python 3.
        print("sols %s" % (sols,))
        print("mut_mask %s" % (MutMask_gpu.view(),))
        print("mut_sols %s" % (mutants,))

    #return mutated solutions
    return mutants.tolist()
@author: bhaumik

import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
from pycuda.elementwise import ElementwiseKernel
import pycuda.autoinit
from pycuda.curandom import rand as curand

# Element-wise kernel: d_c[i] = d_a[i] + d_b[i] for every index i.
add = ElementwiseKernel("float *d_a, float *d_b, float *d_c",
                        "d_c[i] = d_a[i] + d_b[i]", "add")

# Create two random input vectors and an uninitialised output vector
# of the same shape/dtype on the GPU.
shape = 1000000
d_a = curand(shape)
d_b = curand(shape)
d_c = gpuarray.empty_like(d_a)

# CUDA events used to time the kernel on the device.
start = drv.Event()
end = drv.Event()

# An event only carries a timestamp once it has been recorded, and the
# kernel launch is asynchronous: record around the call and wait for the
# end event before asking for the elapsed time.
start.record()
add(d_a, d_b, d_c)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3  # time_till() reports milliseconds
print("Addition of %d element of GPU" % shape)
print("%fs" % (secs))

# Check the result. Comparing two GPU arrays with `==` inside an `if` asks
# for the truth value of a whole array; reduce the difference to a single
# number on the host instead.
max_err = abs((d_c - (d_a + d_b)).get()).max()
if max_err < 1e-5:
    print("The sum computed on GPU is correct")
Ejemplo n.º 47
    def test_struct_reduce(self):
        """Reduce a random float32 vector to its (min, max) pair on the GPU.

        A custom CUDA struct, ``minmax_collector``, is registered as the
        output dtype of a ``ReductionKernel``. The reduced result is compared
        against ``np.min`` / ``np.max`` of the same data copied to the host.
        """
        # Device-side definition of the accumulator struct and of the
        # function that merges two accumulators. The field order must match
        # ``mmc_dtype`` below. The volatile copy constructor and assignment
        # operator are part of the struct because the generated reduction
        # code is expected to copy values through volatile storage.
        preamble = """
        struct minmax_collector
        {
            float cur_min;
            float cur_max;

            __device__
            minmax_collector()
            { }

            __device__
            minmax_collector(float cmin, float cmax)
            : cur_min(cmin), cur_max(cmax)
            { }

            __device__ minmax_collector(minmax_collector const &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector(minmax_collector const volatile &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector volatile &operator=(
                minmax_collector const &src) volatile
            {
                cur_min = src.cur_min;
                cur_max = src.cur_max;
                return *this;
            }
        };

        __device__
        minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
        {
            return minmax_collector(
                fminf(a.cur_min, b.cur_min),
                fmaxf(a.cur_max, b.cur_max));
        }
        """
        # Host-side mirror of the struct: two consecutive float32 fields.
        mmc_dtype = np.dtype([("cur_min", np.float32),
                              ("cur_max", np.float32)])

        from pycuda.curandom import rand as curand

        a_gpu = curand((20000, ), dtype=np.float32)
        a = a_gpu.get()

        from pycuda.tools import register_dtype

        # Tell PyCUDA which C type name corresponds to the structured dtype.
        register_dtype(mmc_dtype, "minmax_collector")

        from pycuda.reduction import ReductionKernel

        red = ReductionKernel(
            mmc_dtype,
            neutral="minmax_collector(10000, -10000)",
            # FIXME: needs infinity literal in real use, ok here
            reduce_expr="agg_mmc(a, b)",
            map_expr="minmax_collector(x[i], x[i])",
            arguments="float *x",
            preamble=preamble,
        )

        minmax = red(a_gpu).get()
        # print minmax["cur_min"], minmax["cur_max"]
        # print np.min(a), np.max(a)

        assert minmax["cur_min"] == np.min(a)
        assert minmax["cur_max"] == np.max(a)
Ejemplo n.º 48
import numpy
import pycuda.autoinit  # noqa: F401  (creates the CUDA context used below)
from pycuda.reduction import ReductionKernel  # import ReductionKernel module

# Specify the details of the reduction operation: multiply the inputs
# element-wise (map_expr), sum the products (reduce_expr) starting from the
# additive identity (neutral), and return a float32 result.
dot = ReductionKernel(
        dtype_out=numpy.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="x[i]*y[i]",
        arguments="const float *x, const float *y")

from pycuda.curandom import rand as curand

# Two random float32 vectors of one million elements each, on the GPU.
x = curand((1000*1000), dtype=numpy.float32)
y = curand((1000*1000), dtype=numpy.float32)

# Dot product on the GPU, and the same on the host for comparison.
x_dot_y = dot(x, y).get()
x_dot_y_cpu = numpy.dot(x.get(), y.get())

print(x)
print(y)
print(x_dot_y)
print(x_dot_y_cpu)
Ejemplo n.º 49

# CUDA source for the transpose kernel. %(ENE)s and %(EME)s are filled in
# with n and m below, before compilation.
# NOTE(review): the index arithmetic (i = y + x*m, j = x + y*n) is kept as
# found; it looks correct for square inputs only -- confirm for n != m.
transpuestaKernel = """
__global__ void transpuesta(float *a, float *b) {
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	int i = y + x * %(EME)s;
	int j = x + y * %(ENE)s;

	if(i < (%(ENE)s * %(EME)s))
		b[j] = a[i];
}
"""
transpuestaKernel = transpuestaKernel % {"ENE": n, "EME": m}

# Flat random input and an output buffer of the same size on the GPU.
a_gpu = curand((n * m))
b_gpu = gpuarray.empty_like(a_gpu)

mod = SourceModule(transpuestaKernel)
func = mod.get_function("transpuesta")

# Launch one thread per element, in TILE_DIM x TILE_DIM blocks.
# NOTE(review): the call line was missing in this excerpt; the argument
# order (input, output) follows the kernel signature -- confirm.
func(a_gpu, b_gpu,
     block=(TILE_DIM, TILE_DIM, 1),
     grid=(m // TILE_DIM, n // TILE_DIM, 1))

# View the flat buffers as matrices.
a_gpu = a_gpu.reshape((n, m))
b_gpu = b_gpu.reshape((m, n))

print("-" * 80)
Ejemplo n.º 50
# print(torch.cuda.device_count())
# print(torch.cuda.get_device_name(0))

# import torch.cuda
# if torch.cuda.is_available():
#     print('PyTorch found cuda')
# else:
#     print('PyTorch could not find cuda')
# import pycuda
# from pycuda import compiler
# import pycuda.driver as drv
# drv.init()
# print("%d device(s) found." % drv.Device.count())
# for ordinal in range(drv.Device.count()):
#     dev = drv.Device(ordinal)
#     print(ordinal, dev.name())

from pycuda import gpuarray
from pycuda.curandom import rand as curand
# -- initialize the device
import pycuda.autoinit

# Dimensions of the random test matrix.
height = 100
width = 200
# Fill a (height, width) float32 array with random values on the GPU.
# NOTE(review): `np` is not imported anywhere in this excerpt -- confirm that
# `import numpy as np` exists earlier in the file.
X = curand((height, width), np.float32)
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel
from pycuda.curandom import rand as curand
import numpy

# Two random float32 vectors of n elements each, generated on the GPU.
n = 1000000
a = curand(n, dtype=numpy.float32)
b = curand(n, dtype=numpy.float32)

# Dot product as a map/reduce: multiply element-wise, then sum starting
# from 0. ReductionKernel requires `neutral` and `reduce_expr`; without
# them the constructor call fails.
dotKernel = ReductionKernel(numpy.float32,
                            neutral="0",
                            reduce_expr="a+b",
                            map_expr="x[i]*y[i]",
                            arguments="float *x, float*y")

# Run the reduction and copy the scalar result back to the host.
doot = dotKernel(a, b).get()

Ejemplo n.º 52
# Time the element-wise log computed through pycuda.cumath.
s = time()
dC = cumath.log(dA)
e = time()

print('gpu elapsed time: %f \n' % (e-s))

# 3) elementwise kernel
# performs array operations much faster than gpu_array

print('\n elementwise kernel\n')
print('---------------------\n')

from pycuda.curandom import rand as curand

# Two random input vectors on the GPU.
a_gpu = curand((1000,))
b_gpu = curand((1000,))

from pycuda.elementwise import ElementwiseKernel
# z[i] = a*x[i] + b*y[i]; scalars a and b are passed by value.
lin_comb = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

c_gpu = gpuarray.empty_like(a_gpu)

# NOTE(review): kernel launches are asynchronous, so this wall-clock
# interval may cover only the launch, not the computation -- confirm
# whether a synchronize is wanted before reading `e`.
s = time()
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
e = time()
print('elementwise kernel elapsed time: %f \n' % (e-s))
Ejemplo n.º 53
# Element wise add operation

from __future__ import absolute_import
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit

import torch
import numpy

from pycuda.curandom import rand as curand

# Vector size
N = 10000

# Random input vector on the GPU, and its complement so that
# a_gpu[i] + b_gpu[i] should come out as 1 for every i.
a_gpu = curand((N, ))
b_gpu = 1 - a_gpu

# NOTE(review): despite the `_cpu` suffix this is a torch CUDA tensor type;
# it is only referenced by the commented-out copy at the bottom -- confirm
# whether it is still needed.
c_cpu = torch.cuda.FloatTensor(N)

from pycuda.elementwise import ElementwiseKernel
# Element-wise kernel computing c[i] = a[i] + b[i].
func_kernel = ElementwiseKernel("float *a, float *b, float *c",
                                "c[i] = a[i] + b[i]", "add")

# Output buffer with the same shape and dtype as the inputs.
c_gpu = gpuarray.empty_like(a_gpu)

# Run the addition on the GPU.
func_kernel(a_gpu, b_gpu, c_gpu)

# Copy result to host
#cuda.memcpy_dtoh(c_cpu, c_gpu)
Ejemplo n.º 54
# Compile the CUDA source and look up the kernels used by the simulation.
mod = SourceModule(source)
get_energy = mod.get_function("energy")
polKroku = mod.get_function("polKroku")
fupdate = mod.get_function("fupdate")
leapfrog = mod.get_function("leapfrog")
repopulate = mod.get_function("repopulate")
#sila = mod.get_function("sila")

# Simulation state
t = 0
particles, velocities, energy = [], [], []
celllist = {}

# Random velocities drawn on the GPU and copied to the host; subtracting
# 0.5 centres the distribution around 0 rather than 0.5.
px = curand((stale.particleNumber, )).get().astype(np.float32) - 0.5
py = curand((stale.particleNumber, )).get().astype(np.float32) - 0.5

# Here we have energy, not velocity ([XXX] needs correction)
v = np.zeros((stale.particleNumber, ), dtype=np.float32)
# Per-particle positions (rx, ry) and forces (fx, fy), all starting at zero.
rx = np.zeros((stale.particleNumber, ), dtype=np.float32)
ry = np.zeros((stale.particleNumber, ), dtype=np.float32)
fx = np.zeros((stale.particleNumber, ), dtype=np.float32)
fy = np.zeros((stale.particleNumber, ), dtype=np.float32)

# Neighbour list, one row per particle with stale.rn slots, filled with -1.
# It reduces complexity from O(N^2) to O(N)
nl = np.full((stale.particleNumber, stale.rn), -1, dtype=np.float32)
Ejemplo n.º 55
    def test_struct_reduce(self):
        """Check a struct-valued GPU reduction against NumPy's min and max.

        A random float32 vector is reduced on the GPU into a
        ``minmax_collector`` struct holding the running minimum and maximum;
        both fields must equal the host-side ``np.min`` / ``np.max``.
        """
        # CUDA definitions injected ahead of the generated reduction code:
        # the accumulator struct (layout mirrored by mmc_dtype below) and
        # the function combining two accumulators.
        preamble = """
        struct minmax_collector
        {
            float cur_min;
            float cur_max;

            __device__
            minmax_collector()
            { }

            __device__
            minmax_collector(float cmin, float cmax)
            : cur_min(cmin), cur_max(cmax)
            { }

            __device__ minmax_collector(minmax_collector const &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector(minmax_collector const volatile &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector volatile &operator=(
                minmax_collector const &src) volatile
            {
                cur_min = src.cur_min;
                cur_max = src.cur_max;
                return *this;
            }
        };

        __device__
        minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
        {
            return minmax_collector(
                fminf(a.cur_min, b.cur_min),
                fmaxf(a.cur_max, b.cur_max));
        }
        """
        # Structured dtype with the same two float32 fields as the struct.
        mmc_dtype = np.dtype([("cur_min", np.float32), ("cur_max", np.float32)])

        from pycuda.curandom import rand as curand
        a_gpu = curand((20000,), dtype=np.float32)
        a = a_gpu.get()

        from pycuda.tools import register_dtype
        # Map the structured dtype to its C type name for code generation.
        register_dtype(mmc_dtype, "minmax_collector")

        from pycuda.reduction import ReductionKernel
        red = ReductionKernel(mmc_dtype,
                neutral="minmax_collector(10000, -10000)",
                # FIXME: needs infinity literal in real use, ok here
                reduce_expr="agg_mmc(a, b)", map_expr="minmax_collector(x[i], x[i])",
                arguments="float *x", preamble=preamble)

        minmax = red(a_gpu).get()
        #print minmax["cur_min"], minmax["cur_max"]
        #print np.min(a), np.max(a)

        assert minmax["cur_min"] == np.min(a)
        assert minmax["cur_max"] == np.max(a)
Ejemplo n.º 56
from pycuda.reduction import ReductionKernel
import numpy

import pycuda.autoinit  # noqa: F401  (creates the CUDA context used below)

# Dot product as a map/reduce: multiply element-wise, then sum from 0.
# The pointer declarations must use the ASCII '*'; the typographic
# asterisk (U+2217) that was here is not valid C.
dot = ReductionKernel(dtype_out=numpy.float32,
                      neutral="0",
                      reduce_expr="a+b",
                      map_expr="x[i]*y[i]",
                      arguments="const float *x, const float *y")

from pycuda.curandom import rand as curand

# Two random float32 vectors of one million elements each, on the GPU.
x = curand((1000 * 1000), dtype=numpy.float32)
y = curand((1000 * 1000), dtype=numpy.float32)
# GPU result and host reference for comparison.
x_dot_y = dot(x, y).get()
x_dot_y_cpu = numpy.dot(x.get(), y.get())
print(x_dot_y)
print(x_dot_y_cpu)
Ejemplo n.º 57
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a_gpu = curand((50, ))
b_gpu = curand((50, ))

from pycuda.elementwise import ElementwiseKernel
# Element-wise kernel computing z[i] = my_f(a*x[i], b*y[i]). The helper
# my_f is a device function supplied through `preamble`, which is placed
# ahead of the generated kernel source.
lin_comb = ElementwiseKernel("float a, float *x, float b, float *y, float *z",
                             "z[i] = my_f(a*x[i], b*y[i])",
                             "linear_combination",
                             preamble="""
        __device__ float my_f(float x, float y)
        {
          return x + y;
        }
        """)

# Output buffer, then run z = 5*a_gpu + 6*b_gpu on the GPU.
c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

print(c_gpu)
#print (5*a_gpu+6*b_gpu)
#import numpy.linalg as la
#assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
Ejemplo n.º 58
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

# Element-wise kernel computing c[i] = a[i] + b[i].
add = ElementwiseKernel("float *a, float *b, float *c", "c[i] = a[i] + b[i]",
                        "add")

# Two random 128 x 1024 inputs on the GPU.
shape = 128, 1024
a_gpu = curand(shape)
b_gpu = curand(shape)

# Output buffer with matching shape/dtype, then run the kernel.
c_gpu = gpuarray.empty_like(a_gpu)
add(a_gpu, b_gpu, c_gpu)

# Largest absolute deviation from the host-side sum; expected to be ~0.
print(np.max(np.abs(c_gpu.get() - a_gpu.get() - b_gpu.get())))
Ejemplo n.º 59
n = 1024
m = 512
l = 128

matMultKernel = """
__global__ void mat_mult(float *a, float *b, float *c) {
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;

	for(int k = 0; k < %(EME)s; k++)
		c[y + x * %(ELE)s] += a[k + x * %(EME)s] * b[y + k * %(ELE)s];

a_gpu = curand((n,m))
b_gpu = curand((m,l))

c_gpu = gpuarray.zeros((n,l), dtype=numpy.float32)

matMultKernel = matMultKernel % {
		"EME" : m,
		"ELE" : l

mod = SourceModule(matMultKernel)
mat_mult = mod.get_function("mat_mult")

		a_gpu, b_gpu,