Example #1
def start(A_matrix):
    A = A_matrix.flatten()
    L = np.zeros_like(A)
    U = np.zeros_like(A)

    rows = len(A_matrix)
    columns = len(A_matrix)
    tpb = 32
    n = rows * columns

    with cuda.pinned(A, L, U):
        stream = cuda.stream()
        gpu_A = cuda.to_device(A, stream=stream)
        gpu_L = cuda.to_device(L, stream=stream)
        gpu_U = cuda.to_device(U, stream=stream)
        bpg = (n + tpb - 1) // tpb

        crout[(bpg, bpg), (tpb, tpb)](gpu_A, gpu_L, gpu_U, rows)

    gpu_L.copy_to_host(L, stream)
    gpu_U.copy_to_host(U, stream)
    stream.synchronize()  # wait for the asynchronous copies before using L and U

    L = L.reshape(rows, columns)
    U = U.reshape(rows, columns)

    print(L)
    print(U)
    print(np.matmul(L, U))
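For reference, a host-side Crout factorization can be used to sanity-check the kernel output. This is a minimal NumPy sketch, assuming the crout kernel computes the standard Crout decomposition (lower-triangular L, unit-diagonal U):

import numpy as np

def crout_reference(A):
    """Plain NumPy Crout decomposition: A = L @ U with unit diagonal in U."""
    n = A.shape[0]
    L = np.zeros_like(A, dtype=np.float64)
    U = np.eye(n, dtype=np.float64)
    for j in range(n):
        for i in range(j, n):
            L[i, j] = A[i, j] - L[i, :j] @ U[:j, j]
        for i in range(j + 1, n):
            U[j, i] = (A[j, i] - L[j, :j] @ U[:j, i]) / L[j, j]
    return L, U

np.allclose(L @ U, A) should hold whenever no diagonal entry of L vanishes.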
Example #2
    def test_generate(self):
        from pyculib.rand.binding import (Generator, CURAND_RNG_QUASI_SOBOL64,
                                          CURAND_RNG_QUASI_DEFAULT)
        N = 10
        stream = cuda.stream()

        ary32 = np.zeros(N, dtype=np.uint32)
        devary32 = cuda.to_device(ary32, stream=stream)

        rndgen = Generator(CURAND_RNG_QUASI_DEFAULT)
        rndgen.set_stream(stream)
        rndgen.set_offset(123)
        rndgen.set_quasi_random_generator_dimensions(1)
        rndgen.generate(devary32, N)

        devary32.copy_to_host(ary32, stream=stream)
        stream.synchronize()

        self.assertTrue(any(ary32 != 0))

        ary64 = np.zeros(N, dtype=np.uint64)
        devary64 = cuda.to_device(ary64, stream=stream)

        rndgen = Generator(CURAND_RNG_QUASI_SOBOL64)
        rndgen.set_stream(stream)
        rndgen.set_offset(123)
        rndgen.set_quasi_random_generator_dimensions(1)
        rndgen.generate(devary64, N)

        devary64.copy_to_host(ary64, stream=stream)
        stream.synchronize()

        self.assertTrue(any(ary64 != 0))
Example #3
    def move(self, init, stream=None):

        if len(init[0].shape) == 2:
            self.threadim = (16, 16)
            _f = self._f2
        elif len(init[0].shape) == 3:
            self.threadim = (8, 8, 4)
            _f = self._f3

        BPG = np.array(init[0].shape) // np.array(self.threadim)

        self.gridim = tuple(int(x) for x in BPG)

        if stream is None:
            self.stream = cuda.stream()
        else:
            self.stream = stream

        if not cuda.devicearray.is_cuda_ndarray(init[0]):
            self.dA = cuda.to_device(init[0], stream=self.stream)
            self.dB = cuda.to_device(init[1], stream=self.stream)
            self.dC = cuda.to_device(init[2], stream=self.stream)
            self.stream.synchronize()
        else:
            self.dA = init[0]
            self.dB = init[1]
            self.dC = init[2]

        _f[self.gridim, self.threadim, self.stream](self.dA, self.dB, self.dC)

        self.stream.synchronize()
Example #4
def test_11():

    from timeit import default_timer as time

    n = 7 * 32

    m = n // CMP_RATIO
    if n % CMP_RATIO != 0:
        m += 1

    thrd = 0.5

    A = np.array(np.random.random((n, n)), dtype=np.float32) - thrd * 2
    R = np.empty_like(A)  # for residual
    C = np.array(np.zeros((m, n)), dtype=np.uint32)

    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dR = cuda.to_device(R, stream)
    dC = cuda.to_device(C, stream)

    tm = time()
    cu_fn_matrix_quantize_e(dA, dC, dR, thrd)
    print('quantize: ', time() - tm)

    B = np.empty_like(A)  # for dequantize
    dB = cuda.to_device(B, stream)

    tm = time()
    cu_fn_matrix_dequantize_e(dB, dC, dR, thrd)
    print('dequantize: ', time() - tm)
Example #5
 def test_create_stream(self):
     stream = cuda.stream()
     states = cuda.random.create_xoroshiro128p_states(10,
                                                      seed=1,
                                                      stream=stream)
     s = states.copy_to_host()
     self.assertEqual(len(np.unique(s)), 10)
Example #6
    def test_func(self):
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        print("N = %d x %d" % (n, n))

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute (np.matrix is deprecated; use a plain dot product)
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        print('cpu:  %f' % tcpu)
        print('cuda: %f' % tcuda)
        print('cuda speedup: %.2fx' % (tcpu / tcuda))

        # Check result
        self.assertTrue(np.allclose(C, Cans))
Example #7
def compute_region_PSNR(filename, in_dir):
    n = np.array([16])  # Max length of interpolated pixels

    stream = cuda.stream()

    # Read the original image
    original = cv2.imread(join(in_dir, filename))
    original = cv2.split(cv2.cvtColor(original, cv2.COLOR_BGR2YCR_CB))[0]
    original = original.astype(np.float64)

    dim = np.array([original.shape[1], original.shape[0]])

    template = np.zeros_like(original)
    template = np.repeat(template[:, :, np.newaxis], n, axis=2)

    # Create images in GPU
    d_original = cuda.to_device(np.ascontiguousarray(original), stream=stream)
    d_n = cuda.to_device(np.ascontiguousarray(n), stream=stream)
    d_dim = cuda.to_device(np.ascontiguousarray(dim), stream=stream)

    # Interpolate image
    d_interpolated = cuda.to_device(np.ascontiguousarray(template),
                                    stream=stream)
    interpolate_image[512, 512](d_original, d_n, d_dim, d_interpolated)

    # Compute MSE of a pixel in a 3x3 region
    d_mse_region = cuda.to_device(np.ascontiguousarray(template),
                                  stream=stream)
    region_MSE[512, 512](d_original, d_interpolated, d_n, d_dim, d_mse_region)
    region_PSNR[512, 512](d_mse_region, d_n, d_dim)
    psnr_region = d_mse_region.copy_to_host()

    return psnr_region
Example #8
    def test_gufunc_stream(self):

        gufunc = _get_matmulcore_gufunc(max_blocksize=512)

        #cuda.driver.flush_pending_free()
        matrix_ct = 1001  # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4,
                      dtype=np.float32).reshape(matrix_ct, 2, 4)
        B = np.arange(matrix_ct * 4 * 5,
                      dtype=np.float32).reshape(matrix_ct, 4, 5)

        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5),
                               dtype=A.dtype,
                               stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        Gold = ut.matrix_multiply(A, B)

        self.assertTrue(np.allclose(C, Gold))
Example #9
def GPUWrapper(data_out, device_id, photons_req_per_device,
               max_photons_per_device, muA, muS, g, source_type, source_param1,
               source_param2, detector_params, max_N, max_distance_from_det,
               target_type, target_mask, target_gridsize, z_target, z_bounded,
               z_range, ret_cols, absorb_threshold, absorb_chance):

    # TODO: These numbers can be optimized based on the device / architecture / number of photons
    threads_per_block = 256
    blocks = 64
    photons_per_thread = int(
        np.ceil(float(photons_req_per_device) / (threads_per_block * blocks)))
    max_photons_per_thread = int(
        np.ceil(float(max_photons_per_device) / (threads_per_block * blocks)))

    cuda.select_device(device_id)
    stream = cuda.stream()  # use stream to trigger async memory transfer

    # Keeping this piece of code here for now -potentially we need this in the future
    #  with compiler_lock:                        # lock the compiler
    # prepare function for this thread
    # the jitted CUDA kernel is loaded into the current context
    # TODO: ideally we should call cuda.jit(signature)(propPhotonGPU), where
    # signature is the call to the function. So far I couldn't figure out what is the signature of the
    # rng_states, closest I got to was: array(Record([('s0', '<u8'), ('s1', '<u8')]), 1d, A)
    # But I couldn't get it to work yet.
    #     MC_cuda_kernel = cuda.jit(propPhotonGPU)

    data = np.ndarray(shape=(threads_per_block * blocks, photons_per_thread,
                             12),
                      dtype=np.float32)
    photon_counters = np.ndarray(shape=(threads_per_block * blocks, 5),
                                 dtype=np.int64)
    data_out_device = cuda.device_array_like(data, stream=stream)
    photon_counters_device = cuda.device_array_like(photon_counters,
                                                    stream=stream)

    # Used to initialize the threads random states.
    rng_states = create_xoroshiro128p_states(
        threads_per_block * blocks,
        seed=(np.random.randint(sys.maxsize) - 128) + device_id,
        stream=stream)

    # Actual kernel call
    propPhotonGPU[blocks, threads_per_block](
        rng_states, data_out_device, photon_counters_device,
        photons_per_thread, max_photons_per_thread, muA, muS, g, source_type,
        source_param1, source_param2, detector_params, max_N,
        max_distance_from_det, target_type, target_mask, target_gridsize,
        z_target, z_bounded, z_range, absorb_threshold, absorb_chance)
    # Copy data back
    data_out_device.copy_to_host(data, stream=stream)
    photon_counters_device.copy_to_host(photon_counters, stream=stream)
    stream.synchronize()

    data = data.reshape(data.shape[0] * data.shape[1], data.shape[2])
    data = data[:, ret_cols]
    data_out[device_id][0] = data

    photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0))
    data_out[device_id][1] = photon_counters_aggr
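Regarding the TODO above: the per-thread RNG state type can be spelled via the dtype exposed by numba.cuda.random. Below is a minimal eager-compilation sketch using a hypothetical kernel (draw_uniform) that just draws one uniform sample per thread; the real propPhotonGPU signature would list its full argument types in the same way:

from numba import cuda, from_dtype, float32
from numba.cuda.random import (xoroshiro128p_dtype,
                               xoroshiro128p_uniform_float32)

# Array-of-records type matching the array returned by create_xoroshiro128p_states().
xoro_states_type = from_dtype(xoroshiro128p_dtype)[::1]

@cuda.jit((xoro_states_type, float32[::1]))   # eager compilation with an explicit signature
def draw_uniform(rng_states, out):            # hypothetical demo kernel
    i = cuda.grid(1)
    if i < out.shape[0]:
        out[i] = xoroshiro128p_uniform_float32(rng_states, i)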
Example #10
 def setup(self):
     self.stream = cuda.stream()
     self.small_data = np.zeros(512, dtype=np.float64)
     self.large_data = np.zeros(512 * 1024, dtype=np.float64)
     self.d_small_data = cuda.to_device(self.small_data, self.stream)
     self.d_large_data = cuda.to_device(self.large_data, self.stream)
     self.stream.synchronize()
Example #11
    async def test_multiple_async_done_multiple_streams(self):
        streams = [cuda.stream() for _ in range(4)]
        done_aws = [stream.async_done() for stream in streams]
        done = await asyncio.gather(*done_aws)

        # Ensure we got the four original streams in done
        self.assertSetEqual(set(done), set(streams))
Example #12
def laplace_3d_cuda(d, n):
    L = d.shape[0]
    M = d.shape[1]
    N = d.shape[2]

    blockdim = (8, 8, 8)
    griddim = (L // blockdim[0], M // blockdim[1], N // blockdim[2])
    #print(griddim)

    stream = cuda.stream()
    dd = cuda.to_device(d, stream)
    dn = cuda.to_device(n, stream)

    #%timeit -n 32 -r 16 d0td1_cuda_kernel[griddim, blockdim](dd, dn)
    for i in range(0, 100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)

    evtstart = cuda.event(timing=True)
    evtend = cuda.event(timing=True)
    evtstart.record(stream)
    for i in range(100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)
    evtend.record(stream)
    evtend.synchronize()
    print(cuda.event_elapsed_time(evtstart, evtend) / 100.)

    dd.copy_to_host(d, stream=stream)   # DeviceNDArray.to_host() was removed; copy back explicitly
    stream.synchronize()
Example #13
    def __init__(self, configModule):
        self.imagesRGB = []
        self.imagesTH = []
        self.imagesRGBCuda = []
        self.imagesTHCuda = []
        self._imagesRGBPath = None
        self._imagesTHPath = None
        self.myHeatMapLookup = self.createHeatMapLookUp()
        self.CudaEnabled = cudaImport
        self.configurations = configModule.Configuration
        self.outArr = np.empty(shape=(int(self.configurations.rows) *
                                      int(self.configurations.cols) *
                                      int(self.configurations.dims)),
                               dtype=np.uint8)
        self.refreshConfigModule = False

        if self.CudaEnabled:
            self.streamCuda = cuda.stream()
            self.myHeatMapLookupCuda = cuda.to_device(self.myHeatMapLookup,
                                                      self.streamCuda)
        try:
            self.imagesRGBPath = self.configurations.datasetDemoNormalPath
            self.imagesTHPath = self.configurations.datasetDemoThermalPath
        except BaseException as e:
            print(
                "Error while loading the dataset. Check the configuration file. {}".format(
                    str(e)))
Example #14
    def start(self, A_matrix, b_vector):
        """Launches parallel Gauss Jordan elimination for a SLAE and returns
        its answer.

        @param A_matrix     Coefficient matrix of a SLAE.
        @param b_vector     Linearly independent vector of a SLAE.

        @return float64[:]
        """
        if 0 in A_matrix.diagonal():
            return None

        b = b_vector.reshape(len(b_vector), 1)
        A = np.hstack((A_matrix, b))
        A = A.flatten()

        n = len(b)

        with cuda.pinned(A):
            stream = cuda.stream()
            gpu_A = cuda.to_device(A, stream=stream)
            bpg = 1

            for i in range(0, n):
                self.gauss_jordan[(bpg, bpg), (tpb, tpb)](gpu_A, n, i)
                self.normalize[(bpg, bpg), (tpb, tpb)](gpu_A, n)

        gpu_A.copy_to_host(A, stream)

        x = A.reshape(n, (n + 1))[:, n]

        if True in np.isnan(x) or True in np.isinf(x):
            return None
        else:
            return x
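A quick host-side cross-check for this solver; "solver" below is a hypothetical instance of the class above, and np.linalg.solve serves as the reference:

import numpy as np

A_matrix = np.random.rand(16, 16) + 16 * np.eye(16)   # well-conditioned test system
b_vector = np.random.rand(16)

x_gpu = solver.start(A_matrix, b_vector)               # 'solver': hypothetical instance of the class above
x_ref = np.linalg.solve(A_matrix, b_vector)

assert x_gpu is not None and np.allclose(x_gpu, x_ref, atol=1e-6)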
Example #16
        def test(ty):
            print("Test %s" % ty)
            data = np.array(np.random.random(10 ** 6 + 1), dtype=ty)

            ts = time()
            stream = cuda.stream()
            device_data = cuda.to_device(data, stream)
            dresult = cuda_ufunc(device_data, device_data, stream=stream)
            result = dresult.copy_to_host()
            stream.synchronize()
            tnumba = time() - ts

            ts = time()
            gold = np_ufunc(data, data)
            tnumpy = time() - ts

            print("Numpy time: %fs" % tnumpy)
            print("Numba time: %fs" % tnumba)

            if tnumba < tnumpy:
                print("Numba is FASTER by %fx" % (tnumpy / tnumba))
            else:
                print("Numba is SLOWER by %fx" % (tnumba / tnumpy))

            self.assertTrue(np.allclose(gold, result), (gold, result))
Example #17
 async def test_cancelled_future(self):
     stream = cuda.stream()
     done1, done2 = stream.async_done(), stream.async_done()
     done1.cancel()
     await done2
     self.assertTrue(done1.cancelled())
     self.assertTrue(done2.done())
Example #18
def dist_matrix(X, p):
    """Calculate pairwise distance matrix using minkowski distance and returns in longform

    Args:
        X (np.ndarray): matrix with rows samples and columns features
        p (float): exponent for Minkowski distance

    Returns:
        [np.ndarray]: matrix of pairwise distances in longform
    """
    rows = X.shape[0]

    block_dim = (16, 16)
    grid_dim = (int(rows / block_dim[0] + 1), int(rows / block_dim[1] + 1))

    stream = cuda.stream()
    X = cuda.to_device(np.asarray(X, dtype=np_type), stream=stream)
    out2 = cuda.device_array(rows * (rows - 1) // 2, dtype=np_type)
    if p == 2:  # speed up performance by calling special function when p == 2.
        #tick = time.perf_counter()
        euclidean_pairs_gpu[grid_dim, block_dim](X, out2)
        #print('euc pairs gpu', time.perf_counter() - tick)
    else:
        distance_matrix_gpu[grid_dim, block_dim](X, p, out2)
    out = out2.copy_to_host(stream=stream)
    stream.synchronize()  # make sure the asynchronous copy has finished

    return out
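A host-side cross-check, assuming SciPy is available and that the GPU kernels emit distances in the same condensed (longform) order as scipy.spatial.distance.pdist:

import numpy as np
from scipy.spatial.distance import pdist

X = np.random.rand(200, 8)
p = 3.0
gpu_long = dist_matrix(X, p)                      # function defined above
cpu_long = pdist(X, metric='minkowski', p=p)      # condensed pairwise distances
assert np.allclose(gpu_long, cpu_long, atol=1e-4)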
Example #19
    def __init__(self, x_shape, n_samples, logger: logging.Logger = None):
        if logger is None:
            logger = logging.Logger('AutoMemoryBuilder')
            logger.addHandler(logging.NullHandler())

        self._logger = logger.getChild('AutoMemoryBuilder')
        self._logger.debug('Constructor')

        if len(x_shape) == 1:
            self._x_size = x_shape[0]
        else:
            assert len(x_shape) == 2
            self._x_size = x_shape[0] * x_shape[1]

        assert self._x_size >= n_samples

        self._n_samples = n_samples
        self._xmem_shape = (n_samples, self._x_size)

        self._i = 0

        self._logger.info('Allocating pinned array: {0} elements'.format(
            n_samples * self._x_size))

        self._pinned_mem = cuda.pinned_array(self._xmem_shape)
        self._stream = cuda.stream()
Example #20
    def recall(self, x, how):
        if not isinstance(x, np.ndarray):
            x = np.array(x)

        assert np.size(x) == self._x_size

        orig_shape = x.shape
        x = x.reshape((self._x_size, 1))
        out = np.empty_like(x)

        if how == 'm':
            mem = self._dev_mem_m
            op = mnn.mat_morph_mul_min_plus
        elif how == 'w':
            mem = self._dev_mem_w
            op = mnn.mat_morph_mul_max_plus
        else:
            raise ValueError('Expected how=\'m\' or how=\'w\'')

        stream = cuda.stream()

        dev_x = cuda.to_device(x, stream=stream)
        out_x = cuda.device_array_like(out, stream=stream)

        op(mem, dev_x, out_x, stream=stream)
        out_x.copy_to_host(out, stream=stream)

        stream.synchronize()

        return out.T.reshape(orig_shape)
Example #21
    def test_div_up(self):
        # wrapper kernel for device function that is tested
        @cuda.jit
        def _kernel(x, y):
            x_pos = cuda.grid(1)
            if x_pos < x.shape[0] and x_pos < y.shape[0]:
                x[x_pos] = rnnt_helper.div_up(x[x_pos], y[x_pos])

        x = np.full([8], fill_value=10)  # np.random.rand(8192)
        y = np.full([8], fill_value=2)  # np.random.rand(8192)

        stream = cuda.stream()
        x_c = cuda.to_device(x, stream=stream)
        y_c = cuda.to_device(y, stream=stream)

        # call kernel
        threads_per_block = global_constants.threads_per_block()
        blocks_per_grid = (x.shape[0] + threads_per_block -
                           1) // threads_per_block
        _kernel[blocks_per_grid, threads_per_block, stream](x_c, y_c)

        # sync kernel
        stream.synchronize()

        x_new = x_c.copy_to_host(stream=stream)
        del x_c, y_c

        for i in range(len(x_new)):
            assert x_new[i] == ((10 + 2 - 1) // 2)
Example #22
    def reduce(self, arg, stream=0):
        assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \
                                                         "ufunc"
        assert arg.ndim == 1, "must use 1d array"

        n = arg.shape[0]
        gpu_mems = []

        if n == 0:
            raise TypeError("Reduction on an empty array.")
        elif n == 1:  # nothing to do
            return arg[0]

        # always use a stream
        stream = stream or cuda.stream()
        with stream.auto_synchronize():
            # transfer memory to device if necessary
            if devicearray.is_cuda_ndarray(arg):
                mem = arg
            else:
                mem = cuda.to_device(arg, stream)
                # do reduction
            out = self.__reduce(mem, gpu_mems, stream)
            # use a small buffer to store the result element
            buf = np.array((1,), dtype=arg.dtype)
            out.copy_to_host(buf, stream=stream)

        return buf[0]
Example #23
    def test_log_sum_exp_neg_inf(self):
        # wrapper kernel for device function that is tested
        @cuda.jit
        def _kernel(x, y):
            x_pos = cuda.grid(1)
            if x_pos < x.shape[0] and x_pos < y.shape[0]:
                x[x_pos] = rnnt_helper.log_sum_exp(x[x_pos], y[x_pos])

        x = np.asarray([global_constants.FP32_NEG_INF] * 8)
        y = np.ones([len(x)])

        stream = cuda.stream()
        x_c = cuda.to_device(x, stream=stream)
        y_c = cuda.to_device(y, stream=stream)

        # call kernel
        threads_per_block = global_constants.threads_per_block()
        blocks_per_grid = (x.shape[0] + threads_per_block -
                           1) // threads_per_block
        _kernel[blocks_per_grid, threads_per_block, stream](x_c, y_c)

        # sync kernel
        stream.synchronize()

        x_new = x_c.copy_to_host(stream=stream)
        del x_c, y_c

        assert np.allclose(x_new, np.ones_like(x_new), atol=1e-5)
Example #24
    def test_log_sum_exp(self):
        # wrapper kernel for device function that is tested
        @cuda.jit
        def _kernel(x, y):
            x_pos = cuda.grid(1)
            if x_pos < x.shape[0] and x_pos < y.shape[0]:
                x[x_pos] = rnnt_helper.log_sum_exp(x[x_pos], y[x_pos])

        x = np.zeros([8])  # np.random.rand(8192)
        y = np.ones([8])  # np.random.rand(8192)

        stream = cuda.stream()
        x_c = cuda.to_device(x, stream=stream)
        y_c = cuda.to_device(y, stream=stream)

        # call kernel
        threads_per_block = global_constants.threads_per_block()
        blocks_per_grid = (x.shape[0] + threads_per_block -
                           1) // threads_per_block
        _kernel[blocks_per_grid, threads_per_block, stream](x_c, y_c)

        # sync kernel
        stream.synchronize()

        x_new = x_c.copy_to_host(stream=stream)
        del x_c, y_c

        assert (x_new.sum() - 10.506093500145782) <= 1e-5
Example #25
    def test_exponential(self):
        # wrapper kernel for device function that is tested
        @cuda.jit
        def _kernel(x):
            x_pos = cuda.grid(1)
            if x_pos < x.shape[0]:
                x[x_pos] = rnnt_helper.exponential(x[x_pos])

        x = np.random.rand(8)

        stream = cuda.stream()
        x_c = cuda.to_device(x, stream=stream)

        # call kernel
        threads_per_block = global_constants.threads_per_block()
        blocks_per_grid = (x.shape[0] + threads_per_block -
                           1) // threads_per_block
        _kernel[blocks_per_grid, threads_per_block, stream](x_c)

        # sync kernel
        stream.synchronize()

        x_new = x_c.copy_to_host(stream=stream)
        del x_c

        y = np.exp(x)
        for i in range(len(x_new)):
            assert (x_new[i] - y[i]) < 1e-4
Example #26
    def test_func(self):
        np.random.seed(42)
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        # Check result
        np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example #27
 def setup(self):
     self.stream = cuda.stream()
     self.f32 = np.zeros(self.n, dtype=np.float32)
     self.d_f32 = cuda.to_device(self.f32, self.stream)
     self.f64 = np.zeros(self.n, dtype=np.float64)
     self.d_f64 = cuda.to_device(self.f64, self.stream)
     self.stream.synchronize()
Example #28
    def reduce(self, arg, stream=0):
        assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \
                                                         "ufunc"
        assert arg.ndim == 1, "must use 1d array"

        n = arg.shape[0]
        gpu_mems = []

        if n == 0:
            raise TypeError("Reduction on an empty array.")
        elif n == 1:  # nothing to do
            return arg[0]

        # always use a stream
        stream = stream or cuda.stream()
        with stream.auto_synchronize():
            # transfer memory to device if necessary
            if devicearray.is_cuda_ndarray(arg):
                mem = arg
            else:
                mem = cuda.to_device(arg, stream)
                # do reduction
            out = self.__reduce(mem, gpu_mems, stream)
            # use a small buffer to store the result element
            buf = np.array((1,), dtype=arg.dtype)
            out.copy_to_host(buf, stream=stream)

        return buf[0]
Example #29
        def test(ty):
            print("Test %s" % ty)
            data = np.array(np.random.random(10 ** 6 + 1), dtype=ty)

            ts = time()
            stream = cuda.stream()
            device_data = cuda.to_device(data, stream)
            dresult = cuda_ufunc(device_data, device_data, stream=stream)
            result = dresult.copy_to_host()
            stream.synchronize()
            tnumba = time() - ts

            ts = time()
            gold = np_ufunc(data, data)
            tnumpy = time() - ts

            print("Numpy time: %fs" % tnumpy)
            print("Numba time: %fs" % tnumba)

            if tnumba < tnumpy:
                print("Numba is FASTER by %fx" % (tnumpy / tnumba))
            else:
                print("Numba is SLOWER by %fx" % (tnumba / tnumpy))

            self.assertTrue(np.allclose(gold, result), (gold, result))
Example #30
    def test_gufunc_stream(self):
        #cuda.driver.flush_pending_free()
        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        ts = time()
        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        tcuda = time() - ts

        ts = time()
        Gold = ut.matrix_multiply(A, B)
        tcpu = time() - ts

        stream_speedups.append(tcpu / tcuda)

        self.assertTrue(np.allclose(C, Gold))
Example #31
    def test_laplace_small(self):
        NN = 256
        NM = 256

        A = np.zeros((NN, NM), dtype=np.float64)
        Anew = np.zeros((NN, NM), dtype=np.float64)

        n = NN
        m = NM
        iter_max = 1000

        tol = 1.0e-6
        error = 1.0

        for j in range(n):
            A[j, 0] = 1.0
            Anew[j, 0] = 1.0

        print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

        timer = time.time()
        iter = 0

        blockdim = (tpb, tpb)
        griddim = (NN // blockdim[0], NM // blockdim[1])

        error_grid = np.zeros(griddim)

        stream = cuda.stream()

        dA = cuda.to_device(A, stream)          # to device and don't come back
        dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
        derror_grid = cuda.to_device(error_grid, stream)

        while error > tol and iter < iter_max:
            self.assertTrue(error_grid.dtype == np.float64)

            jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

            derror_grid.copy_to_host(error_grid, stream=stream)


            # error_grid is available on host
            stream.synchronize()

            error = np.abs(error_grid).max()

            # swap dA and dAnew
            tmp = dA
            dA = dAnew
            dAnew = tmp

            if iter % 100 == 0:
                print("%5d, %0.6f (elapsed: %f s)" %
                      (iter, error, time.time() - timer))

            iter += 1

        runtime = time.time() - timer
        print(" total: %f s" % runtime)
Example #32
    def test_func(self):
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        print("N = %d x %d" % (n, n))

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute (np.matrix is deprecated; use a plain dot product)
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        print('cpu:  %f' % tcpu)
        print('cuda: %f' % tcuda)
        print('cuda speedup: %.2fx' % (tcpu / tcuda))

        # Check result
        self.assertTrue(np.allclose(C, Cans))
Example #33
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
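The constants c0 and c1 come from the log-normal (geometric Brownian motion) discretisation. A host-side sketch of the update the step kernel presumably applies:

import numpy as np

def step_reference(last, dt, c0, c1, noise):
    # S_{t+dt} = S_t * exp((r - 0.5 * sigma**2) * dt + sigma * sqrt(dt) * Z),  Z ~ N(0, 1)
    return last * np.exp(c0 * dt + c1 * noise)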
Example #34
def box2d_rotate_iou(boxes2d, gt_boxes2d, device_id=0):

    # Inputs:
    #   boxes2d: (N1, 5) x,y,w,l,r
    #   gt_boxes2d: (N2, 5) x,y,w,l,r
    # Outputs:
    #   iou: (N1, N2)
    boxes2d = boxes2d.astype(np.float32)
    gt_boxes2d = gt_boxes2d.astype(np.float32)
    N1 = boxes2d.shape[0]
    N2 = gt_boxes2d.shape[0]
    iou = np.zeros((N1, N2), dtype=np.float32)
    if N1 == 0 or N2 == 0:
        return iou

    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (DIVUP(N1, threadsPerBlock), DIVUP(N2, threadsPerBlock))

    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes2d.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(gt_boxes2d.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel[blockspergrid, threadsPerBlock,
                          stream](N1, N2, boxes_dev, query_boxes_dev, iou_dev)
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)

    return iou.astype(boxes2d.dtype)
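A host-side reference for a single rotated-IoU entry, assuming boxes are (x, y, w, l, r) with r in radians and Shapely is available:

from shapely.geometry import Polygon
from shapely import affinity

def rotated_iou_reference(box_a, box_b):
    def to_polygon(box):
        x, y, w, l, r = box
        rect = Polygon([(-w / 2, -l / 2), (w / 2, -l / 2),
                        (w / 2, l / 2), (-w / 2, l / 2)])
        rect = affinity.rotate(rect, r, use_radians=True)
        return affinity.translate(rect, x, y)

    pa, pb = to_polygon(box_a), to_polygon(box_b)
    inter = pa.intersection(pb).area
    return inter / (pa.area + pb.area - inter)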
Example #35
def nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """nms in gpu. 
    
    Args:
        dets ([type]): [description]
        nms_overlap_thresh ([type]): [description]
        device_id ([type], optional): Defaults to 0. [description]
    
    Returns:
        [type]: [description]
    """

    boxes_num = dets.shape[0]
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 4]
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]

    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64)
    blockspergrid = (div_up(boxes_num, threadsPerBlock),
                     div_up(boxes_num, threadsPerBlock))
    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev)
        mask_dev.copy_to_host(mask_host, stream=stream)
    # stream.synchronize()
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    return list(order[keep])
Example #36
def main():
    NN = 512
    NM = 512

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM
    iter_max = 1000

    tol = 1.0e-6
    error = 1.0

    for j in range(n):
        A[j, 0] = 1.0
        Anew[j, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    timer = time.time()
    iter = 0

    blockdim = (32, 32)
    griddim = (NN // blockdim[0], NM // blockdim[1])

    error_grid = np.zeros_like(A)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)  # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)  # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iter < iter_max:
        assert error_grid.dtype == np.float64

        jacobi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

        derror_grid.copy_to_host(error_grid, stream=stream)

        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
        tmp = dA
        dA = dAnew
        dAnew = tmp

        if iter % 100 == 0:
            print("%5d, %0.6f (elapsed: %f s)" %
                  (iter, error, time.time() - timer))

        iter += 1

    runtime = time.time() - timer
    print(" total: %f s" % runtime)
Example #37
    def test_func(self):

        @cuda.jit(argtypes=[float32[:, ::1], float32[:, ::1], float32[:, ::1]])
        def cu_square_matrix_mul(A, B, C):
            sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y
            bx = cuda.blockIdx.x
            by = cuda.blockIdx.y
            bw = cuda.blockDim.x
            bh = cuda.blockDim.y

            x = tx + bx * bw
            y = ty + by * bh

            acc = float32(0)  # forces all the math to be f32
            for i in range(bpg):
                if x < n and y < n:
                    sA[ty, tx] = A[y, tx + i * tpb]
                    sB[ty, tx] = B[ty + i * tpb, x]

                cuda.syncthreads()

                if x < n and y < n:
                    for j in range(tpb):
                        acc += sA[ty, j] * sB[j, tx]

                cuda.syncthreads()

            if x < n and y < n:
                C[y, x] = acc

        np.random.seed(42)
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        # Check result
        np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example #38
 async def async_cuda_fn(value_in: float) -> float:
     stream = cuda.stream()
     h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8)
     h_src[:] = value_in
     d_ary = cuda.to_device(h_src, stream=stream)
     d_ary.copy_to_host(h_dst, stream=stream)
     await stream.async_done()
     return h_dst.mean()
Example #39
 def setup(self):
     self.stream = cuda.stream()
     self.d_callResult = cuda.to_device(callResultGold, self.stream)
     self.d_putResult = cuda.to_device(putResultGold, self.stream)
     self.d_stockPrice = cuda.to_device(stockPrice, self.stream)
     self.d_optionStrike = cuda.to_device(optionStrike, self.stream)
     self.d_optionYears = cuda.to_device(optionYears, self.stream)
     self.stream.synchronize()
Example #40
    def test_add_callback(self):
        def callback(stream, status, event):
            event.set()

        stream = cuda.stream()
        callback_event = threading.Event()
        stream.add_callback(callback, callback_event)
        self.assertTrue(callback_event.wait(1.0))
Example #42
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #43
def newthread():
    cuda.select_device(0)
    stream = cuda.stream()
    A = np.arange(100)
    dA = cuda.to_device(A, stream=stream)
    stream.synchronize()
    del dA
    del stream
    cuda.close()
Example #44
 def test_stream_bind(self):
     stream = cuda.stream()
     with stream.auto_synchronize():
         arr = cuda.device_array(
             (3, 3),
             dtype=np.float64,
             stream=stream)
         self.assertEqual(arr.bind(stream).stream, stream)
         self.assertEqual(arr.stream, stream)
Example #45
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges = None,
                          MAX_TPB = 512, stream = None):
    """
    All inputs (except MAX_TPB and stream) are device arrays.
    edges       : array with the IDs of the edges that will be part of the new graph
    n_edges     : array of 1 element with the number of valid edges in the edges array;
                  if n_edges < size of edges, the last elements of the edges array are
                  not considered
    """

    # check if number of valid edges was received
    if n_edges is None:
        edges_size = edges.size
        n_edges = cuda.to_device(np.array([edges_size], dtype = np.int32))
    else:
        edges_size = int(n_edges.getitem(0))

    # check if a stream was received, if not create one
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream
    
    new_n_edges = edges_size * 2

    # allocate memory for new graph
    ndest = cuda.device_array(new_n_edges, dtype = dest.dtype,
                              stream = myStream)
    nweight = cuda.device_array(new_n_edges, dtype = weight.dtype,
                                stream = myStream)
    nfe = cuda.device_array_like(fe, stream = myStream)
    nod = cuda.device_array_like(od, stream = myStream)

    # fill new outdegree with zeros
    vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB)
    memSet[vertexGrid, MAX_TPB, myStream](nod, 0)

    # count all edges of new array and who they belong to
    edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB)
    countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod)

    # get new first_edge array from new outdegree
    nfe.copy_to_device(nod, stream=myStream)
    ex_prefix_sum_gpu(nfe, MAX_TPB = MAX_TPB, stream = myStream)


    # copy new first_edge to top_edge to serve as pointer in adding edges
    top_edge = cuda.device_array_like(nfe, stream = myStream)
    top_edge.copy_to_device(nfe, stream = myStream)

    addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od,
                                          top_edge, ndest, nweight)

    del top_edge
    #del dest, weight, fe, od
    return ndest, nweight, nfe, nod
Example #46
    def test_laplace_small(self):
        if config.ENABLE_CUDASIM:
            NN, NM = 4, 4
            iter_max = 20
        else:
            NN, NM = 256, 256
            iter_max = 1000

        A = np.zeros((NN, NM), dtype=np.float64)
        Anew = np.zeros((NN, NM), dtype=np.float64)

        n = NN
        m = NM

        tol = 1.0e-6
        error = 1.0

        for j in range(n):
            A[j, 0] = 1.0
            Anew[j, 0] = 1.0

        timer = time.time()
        iter = 0

        blockdim = (tpb, tpb)
        griddim = (NN // blockdim[0], NM // blockdim[1])

        error_grid = np.zeros(griddim)

        stream = cuda.stream()

        dA = cuda.to_device(A, stream)          # to device and don't come back
        dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
        derror_grid = cuda.to_device(error_grid, stream)

        while error > tol and iter < iter_max:
            self.assertTrue(error_grid.dtype == np.float64)

            jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

            derror_grid.copy_to_host(error_grid, stream=stream)


            # error_grid is available on host
            stream.synchronize()

            error = np.abs(error_grid).max()

            # swap dA and dAnew
            tmp = dA
            dA = dAnew
            dAnew = tmp

            iter += 1

        runtime = time.time() - timer
Example #47
    def _run_copies(self, A):
        A0 = np.copy(A)

        stream = cuda.stream()
        ptr = cuda.to_device(A, copy=False, stream=stream)
        ptr.copy_to_device(A, stream=stream)
        ptr.copy_to_host(A, stream=stream)
        stream.synchronize()

        self.assertTrue(np.allclose(A, A0))
Example #48
 def __init__(self, positions, weights):
     self.calculate_forces = cuda.jit(
         argtypes=(float32[:,:], float32[:], float32[:,:])
         )(calculate_forces)
     self.accelerations = np.zeros_like(positions)
     self.n_bodies = len(weights)
     self.stream = cuda.stream()
     self.d_pos = cuda.to_device(positions, self.stream)
     self.d_wei = cuda.to_device(weights, self.stream)
     self.d_acc = cuda.to_device(self.accelerations, self.stream)
     self.stream.synchronize()
Example #49
def newthread(exception_queue):
    try:
        cuda.select_device(0)
        stream = cuda.stream()
        A = np.arange(100)
        dA = cuda.to_device(A, stream=stream)
        stream.synchronize()
        del dA
        del stream
        cuda.close()
    except Exception as e:
        exception_queue.put(e)
Example #50
        def test(ty):
            data = np.array(np.random.random(self.N), dtype=ty)

            stream = cuda.stream()
            device_data = cuda.to_device(data, stream)
            dresult = cuda_ufunc(device_data, device_data, stream=stream)
            result = dresult.copy_to_host()
            stream.synchronize()

            gold = np_ufunc(data, data)

            self.assertTrue(np.allclose(gold, result), (gold, result))
Example #51
    def reduce_test2(self, n):
        @vectorize(sig, target=target)
        def vector_add(a, b):
            return a + b

        cuda_ufunc = vector_add

        x = np.arange(n, dtype=np.int32)
        gold = np.add.reduce(x)
        stream = cuda.stream()
        dx = cuda.to_device(x, stream)
        result = cuda_ufunc.reduce(dx, stream=stream)
        self.assertEqual(result, gold)
Example #52
    def test_event_elapsed_stream(self):
        N = 32
        stream = cuda.stream()
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record(stream=stream)
        cuda.to_device(np.arange(N), to=dary, stream=stream)
        evtend.record(stream=stream)
        evtend.wait(stream=stream)
        evtend.synchronize()
        print(evtstart.elapsed_time(evtend))
Example #53
    def _prepare(self, arr, stream):
        if arr.ndim != 1:
            raise TypeError("only support 1D array")

        from numba import cuda
        # If no stream is specified, allocate one
        if stream == 0:
            stream = cuda.stream()

        # Make sure `arr` in on the device
        darr, conv = cuda.devicearray.auto_device(arr, stream=stream)

        return darr, stream, conv
Example #54
    def test_blackscholes(self):
        OPT_N = 400
        iterations = 2

        stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
        optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
        optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)

        callResultNumpy = np.zeros(OPT_N)
        putResultNumpy = -np.ones(OPT_N)

        callResultNumbapro = np.zeros(OPT_N)
        putResultNumbapro = -np.ones(OPT_N)

        # numpy
        for i in range(iterations):
            black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                          optionStrike, optionYears, RISKFREE, VOLATILITY)

        # numbapro
        time0 = time.time()
        blockdim = 512, 1
        griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
        stream = cuda.stream()
        d_callResult = cuda.to_device(callResultNumbapro, stream)
        d_putResult = cuda.to_device(putResultNumbapro, stream)
        d_stockPrice = cuda.to_device(stockPrice, stream)
        d_optionStrike = cuda.to_device(optionStrike, stream)
        d_optionYears = cuda.to_device(optionYears, stream)
        time1 = time.time()
        for i in range(iterations):
            black_scholes_cuda[griddim, blockdim, stream](
                d_callResult, d_putResult, d_stockPrice, d_optionStrike,
                d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.copy_to_host(callResultNumbapro, stream)
        d_putResult.copy_to_host(putResultNumbapro, stream)
        stream.synchronize()

        dt = (time1 - time0)
        print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))

        delta = np.abs(callResultNumpy - callResultNumbapro)
        L1norm = delta.sum() / np.abs(callResultNumpy).sum()

        max_abs_err = delta.max()
        print('L1norm', L1norm)
        print('Max absolute error', max_abs_err)
        self.assertTrue(L1norm < 1e-13)
        self.assertTrue(max_abs_err < 1e-13)
Example #55
    def _template(self, name, A):
        A0 = np.copy(A)

        s = timer()
        stream = cuda.stream()
        ptr = cuda.to_device(A, copy=False, stream=stream)

        ptr.copy_to_device(A, stream=stream)

        ptr.copy_to_host(A, stream=stream)
        stream.synchronize()

        e = timer()

        self.assertTrue(np.allclose(A, A0))

        elapsed = e - s
        return elapsed
Example #56
    def test_gufunc_stream(self):

        @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                     '(m,n),(n,p)->(m,p)',
                     target='cuda')
        def matmulcore(A, B, C):
            m, n = A.shape
            n, p = B.shape
            for i in range(m):
                for j in range(p):
                    C[i, j] = 0
                    for k in range(n):
                        C[i, j] += A[i, k] * B[k, j]

        gufunc = matmulcore
        gufunc.max_blocksize = 512

        #cuda.driver.flush_pending_free()
        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        ts = time()
        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        tcuda = time() - ts

        ts = time()
        Gold = ut.matrix_multiply(A, B)
        tcpu = time() - ts

        stream_speedups.append(tcpu / tcuda)

        self.assertTrue(np.allclose(C, Gold))
Example #57
    def test_device_auto_jit_2(self):
        @cuda.jit(device=True)
        def inner(arg):
            return arg + 1

        @cuda.jit
        def outer(argin, argout):
            argout[0] = inner(argin[0]) + inner(2)

        a = np.zeros(1)
        b = np.zeros(1)

        stream = cuda.stream()
        d_a = cuda.to_device(a, stream)
        d_b = cuda.to_device(b, stream)

        outer[1, 1, stream](d_a, d_b)

        d_b.copy_to_host(b, stream)

        self.assertEqual(b[0], (a[0] + 1) + (2 + 1))
Example #58
    def device_partial_inplace(self, darr, size=None, init=0, stream=0):
        """Partially reduce a device array inplace as much as possible in an
        efficient manner. Does not automatically transfer host array.

        :param darr: Used to input and output.
        :type darr: device array
        :param size: Number of elements in ``darr``. If None, the entire array is used.
        :type size: int or None
        :param init: Initial value for the reduction
        :type init: dtype of darr
        :param stream: All CUDA operations are performed on this stream if it is given.
          Otherwise, a new stream is created.
        :type stream: cuda stream
        :returns: int -- Number of elements in ``darr`` that contains the reduction result.

        """
        if stream == 0:
            from numba import cuda
            stream = cuda.stream()
            ret = self._partial_inplace_driver(darr, size, init, stream)
            stream.synchronize()
        else:
            ret = self._partial_inplace_driver(darr, size, init, stream)
        return ret
Example #59
def connected_comps_gpu(dest_in, weight_in, firstEdge_in, outDegree_in,
                        MAX_TPB = 512, stream = None):
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    dest = cuda.to_device(dest_in, stream = myStream)
    weight = cuda.to_device(weight_in, stream = myStream)
    firstEdge = cuda.to_device(firstEdge_in, stream = myStream)
    outDegree = cuda.to_device(outDegree_in, stream = myStream)

    n_vertices = firstEdge.size
    n_edges = dest.size

    n_components = n_vertices

    # still need edge_id for conflict resolution in find_minedge
    edge_id = cuda.to_device(np.arange(n_edges, dtype = dest.dtype),
                                       stream = myStream)
    
    #labels = np.empty(n_vertices, dtype = dest.dtype)
    first_iter = True

    # initialize with name top_edge so we can recycle an array between iterations
    top_edge = cuda.device_array(n_components, dtype = dest.dtype,
                                 stream = myStream)
    labels = cuda.device_array(n_components, dtype = dest.dtype,
                               stream = myStream)

    converged = cuda.device_array(1, dtype = np.int8, stream = myStream)
    
    gridDimLabels = compute_cuda_grid_dim(n_components, MAX_TPB)
    gridDim = compute_cuda_grid_dim(n_components, MAX_TPB)

    final_converged = False
    while(not final_converged):
        vertex_minedge = top_edge

        findMinEdge_CUDA[gridDim, MAX_TPB, myStream](weight, firstEdge,
                                                     outDegree, vertex_minedge,
                                                     dest)

        removeMirroredEdges_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge)

        colors = cuda.device_array(shape = n_components, dtype = np.int32,
                                   stream = myStream)
        initializeColors_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge,
                                                          colors)

        # propagate colors until convergence
        propagateConverged = False
        while(not propagateConverged):
            propagateColors_CUDA[gridDim, MAX_TPB, myStream](colors, converged)
            converged_num = converged.getitem(0, stream = myStream)
            propagateConverged = True if converged_num == 1 else False

        # first we build the flags in the new_vertex array
        new_vertex = vertex_minedge # reuse the vertex_minedge array as the new new_vertex
        buildFlag_CUDA[gridDim, MAX_TPB, myStream](colors, new_vertex)

        # new_n_vertices is the number of vertices of the new contracted graph
        new_n_vertices = ex_prefix_sum_gpu(new_vertex, MAX_TPB = MAX_TPB, stream = myStream).getitem(0, stream = myStream)
        new_n_vertices = int(new_n_vertices)

        if first_iter:
            # first iteration defines labels as the initial colors and updates
            labels.copy_to_device(colors, stream = myStream)
            first_iter = False
        
        # other iterations update the labels with the new colors
        update_labels_single_pass_cuda[gridDimLabels, MAX_TPB, myStream](labels, colors, new_vertex)

        if new_n_vertices == 1:
            final_converged = True
            del new_vertex
            break

        newGridDim = compute_cuda_grid_dim(n_components, MAX_TPB)
        
        # count number of edges for new supervertices and write in new outDegree
        newOutDegree = cuda.device_array(shape = new_n_vertices,
                                         dtype = np.int32,
                                         stream = myStream)
        memSet[newGridDim, MAX_TPB, myStream](newOutDegree, 0) # zero the newOutDegree array
        countNewEdges_CUDA[gridDim, MAX_TPB, myStream](colors, firstEdge,
                                                       outDegree, dest,
                                                       new_vertex, newOutDegree)

        # new first edge array for contracted graph
        newFirstEdge = cuda.device_array_like(newOutDegree, stream = myStream)
        # copy newOutDegree to newFirstEdge
        newFirstEdge.copy_to_device(newOutDegree, stream = myStream)
        new_n_edges = ex_prefix_sum_gpu(newFirstEdge, MAX_TPB = MAX_TPB,
                                        stream = myStream)

        new_n_edges = new_n_edges.getitem(0, stream = myStream)
        new_n_edges = int(new_n_edges)

        # if no edges remain, then MST has converged
        if new_n_edges == 0:
            final_converged = True
            del newOutDegree, newFirstEdge, new_vertex
            break

        # create arrays for new edges
        new_dest = cuda.device_array(new_n_edges, dtype = np.int32,
                                     stream = myStream)
        new_edge_id = cuda.device_array(new_n_edges, dtype = np.int32,
                                        stream = myStream)
        new_weight = cuda.device_array(new_n_edges, dtype = weight.dtype,
                                       stream = myStream)

        top_edge = cuda.device_array_like(newFirstEdge, stream = myStream)
        top_edge.copy_to_device(newFirstEdge, stream = myStream)

        # assign and insert new edges
        assignInsert_CUDA[gridDim, MAX_TPB, myStream](edge_id, dest, weight,
                                            firstEdge, outDegree, colors,
                                            new_vertex, new_dest, new_edge_id,
                                            new_weight, top_edge)

        # delete old graph
        del new_vertex, edge_id, dest, weight, firstEdge, outDegree, colors

        # write new graph
        n_components = newFirstEdge.size
        edge_id = new_edge_id
        dest = new_dest
        weight = new_weight
        firstEdge = newFirstEdge
        outDegree = newOutDegree
        gridDim = newGridDim

    returnLabels = labels.copy_to_host()

    del dest, weight, edge_id, firstEdge, outDegree, converged, labels

    return returnLabels
Example #60
    def test_blackscholes(self):
        OPT_N = 400
        iterations = 2

        stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
        optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
        optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)

        callResultNumpy = np.zeros(OPT_N)
        putResultNumpy = -np.ones(OPT_N)

        callResultNumbapro = np.zeros(OPT_N)
        putResultNumbapro = -np.ones(OPT_N)

        # numpy
        for i in range(iterations):
            black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                          optionStrike, optionYears, RISKFREE, VOLATILITY)



        @cuda.jit(argtypes=(double,), restype=double, device=True, inline=True)
        def cnd_cuda(d):
            K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
            ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) *
                       (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
            if d > 0:
                ret_val = 1.0 - ret_val
            return ret_val


        @cuda.jit(argtypes=(double[:], double[:], double[:], double[:], double[:],
                            double, double))
        def black_scholes_cuda(callResult, putResult, S, X, T, R, V):
            i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
            if i >= S.shape[0]:
                return
            sqrtT = math.sqrt(T[i])
            d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT)
            d2 = d1 - V * sqrtT
            cndd1 = cnd_cuda(d1)
            cndd2 = cnd_cuda(d2)

            expRT = math.exp((-1. * R) * T[i])
            callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2)
            putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1))

        # numbapro
        time0 = time.time()
        blockdim = 512, 1
        griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
        stream = cuda.stream()
        d_callResult = cuda.to_device(callResultNumbapro, stream)
        d_putResult = cuda.to_device(putResultNumbapro, stream)
        d_stockPrice = cuda.to_device(stockPrice, stream)
        d_optionStrike = cuda.to_device(optionStrike, stream)
        d_optionYears = cuda.to_device(optionYears, stream)
        time1 = time.time()
        for i in range(iterations):
            black_scholes_cuda[griddim, blockdim, stream](
                d_callResult, d_putResult, d_stockPrice, d_optionStrike,
                d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.copy_to_host(callResultNumbapro, stream)
        d_putResult.copy_to_host(putResultNumbapro, stream)
        stream.synchronize()

        dt = (time1 - time0)

        delta = np.abs(callResultNumpy - callResultNumbapro)
        L1norm = delta.sum() / np.abs(callResultNumpy).sum()

        max_abs_err = delta.max()
        self.assertTrue(L1norm < 1e-13)
        self.assertTrue(max_abs_err < 1e-13)
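For completeness, cnd_cuda above approximates the standard normal cumulative distribution function; a host-side reference using math.erf:

import math

def cnd_reference(d):
    # Standard normal cumulative distribution function.
    return 0.5 * (1.0 + math.erf(d / math.sqrt(2.0)))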