Example #1
0
    def test_laplace_small(self):
        NN = 256
        NM = 256

        A = np.zeros((NN, NM), dtype=np.float64)
        Anew = np.zeros((NN, NM), dtype=np.float64)

        n = NN
        m = NM
        iter_max = 1000

        tol = 1.0e-6
        error = 1.0

        for j in range(n):
            A[j, 0] = 1.0
            Anew[j, 0] = 1.0

        print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

        timer = time.time()
        iter = 0

        blockdim = (tpb, tpb)
        griddim = (NN // blockdim[0], NM // blockdim[1])

        error_grid = np.zeros(griddim)

        stream = cuda.stream()

        dA = cuda.to_device(A, stream)          # to device and don't come back
        dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
        derror_grid = cuda.to_device(error_grid, stream)

        while error > tol and iter < iter_max:
            self.assertTrue(error_grid.dtype == np.float64)

            jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

            derror_grid.copy_to_host(error_grid, stream=stream)


            # error_grid is available on host
            stream.synchronize()

            error = np.abs(error_grid).max()

            # swap dA and dAnew
            tmp = dA
            dA = dAnew
            dAnew = tmp

            if iter % 100 == 0:
                print("%5d, %0.6f (elapsed: %f s)" %
                      (iter, error, time.time() - timer))

            iter += 1

        runtime = time.time() - timer
        print(" total: %f s" % runtime)
Example #2
0
    def test_func(self):
        np.random.seed(42)
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        # Check result
        np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example #3
0
    def test_func(self):
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        print("N = %d x %d" % (n, n))

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        Amat = np.matrix(A)
        Bmat = np.matrix(B)

        s = time()
        Cans = Amat * Bmat
        e = time()
        tcpu = e - s

        print('cpu:  %f' % tcpu)
        print('cuda: %f' % tcuda)
        print('cuda speedup: %.2fx' % (tcpu / tcuda))

        # Check result
        self.assertTrue(np.allclose(C, Cans))
Example #4
0
    def test_gufunc_stream(self):
        #cuda.driver.flush_pending_free()
        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        ts = time()
        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        tcuda = time() - ts

        ts = time()
        Gold = ut.matrix_multiply(A, B)
        tcpu = time() - ts

        stream_speedups.append(tcpu / tcuda)

        self.assertTrue(np.allclose(C, Gold))
Example #5
0
def stupidconv_gpu(img, filt, padval):
    """
    does convolution without using FFT because FFT is pissing me off and giving me weird answers
    :param img:
    :param filt:
    :param padval:
    :return:
    """
    cuda.close()
    cuda.select_device(1)
    # get the number of nonzero entries in the filter for later averaging of result
    filt_nnz = np.count_nonzero(filt)

    # pad the images
    s_filt = filt.shape
    s_img = img.shape

    # appropriate padding depends on context
    # pad with filt size all around img
    pad_img = np.ones((s_img[0] + (2 * s_filt[0]), s_img[1] + (2 * s_filt[1])), dtype=np.float32) * padval

    pad_img[s_filt[0]: s_img[0] + s_filt[0], s_filt[1]: s_img[1] + s_filt[1]] = img

    output = np.zeros(pad_img.shape, dtype=np.float32)

    d_pad_img = cuda.to_device(pad_img)
    d_filt = cuda.to_device(filt)
    d_output = cuda.to_device(output)

    stupidconv_gpu_helper(d_pad_img, d_filt, s_img[0], s_img[1], s_filt[0], s_filt[1], d_output)

    output = d_output.copy_to_host()
    output = output[s_filt[0]:s_filt[0] + s_img[0], s_filt[1]:s_filt[1] + s_img[1]]

    return output / filt_nnz
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example #7
0
    def test_for_pre(self):
        """Test issue with loop not running due to bad sign-extension at the for loop
        precondition.
        """

        @cuda.jit(argtypes=[float32[:, :], float32[:, :], float32[:]])
        def diagproduct(c, a, b):
            startX, startY = cuda.grid(2)
            gridX = cuda.gridDim.x * cuda.blockDim.x
            gridY = cuda.gridDim.y * cuda.blockDim.y
            height = c.shape[0]
            width = c.shape[1]

            for x in range(startX, width, (gridX)):
                for y in range(startY, height, (gridY)):
                    c[y, x] = a[y, x] * b[x]

        N = 8

        A, B = generate_input(N)

        F = np.empty(A.shape, dtype=A.dtype)

        blockdim = (32, 8)
        griddim = (1, 1)

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)
        dF = cuda.to_device(F, copy=False)
        diagproduct[griddim, blockdim](dF, dA, dB)

        E = np.dot(A, np.diag(B))
        np.testing.assert_array_almost_equal(dF.copy_to_host(), E)
Example #8
0
 def setup(self):
     self.stream = cuda.stream()
     self.f32 = np.zeros(self.n, dtype=np.float32)
     self.d_f32 = cuda.to_device(self.f32, self.stream)
     self.f64 = np.zeros(self.n, dtype=np.float64)
     self.d_f64 = cuda.to_device(self.f64, self.stream)
     self.stream.synchronize()
Example #9
0
 def setup(self):
     self.stream = cuda.stream()
     self.small_data = np.zeros(512, dtype=np.float64)
     self.large_data = np.zeros(512 * 1024, dtype=np.float64)
     self.d_small_data = cuda.to_device(self.small_data, self.stream)
     self.d_large_data = cuda.to_device(self.large_data, self.stream)
     self.stream.synchronize()
Example #10
0
    def test_with_context(self):

        @cuda.jit
        def vector_add_scalar(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] += val


        hostarr = np.arange(10, dtype=np.float32)
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        with cuda.gpus[1]:
            arr2 = cuda.to_device(hostarr)

        with cuda.gpus[0]:
            vector_add_scalar[1, 10](arr1, 1)

        with cuda.gpus[1]:
            vector_add_scalar[1, 10](arr2, 2)

        with cuda.gpus[0]:
            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))

        with cuda.gpus[1]:
            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))

        with cuda.gpus[0]:
            # Transfer from GPU1 to GPU0
            arr1.copy_to_device(arr2)
            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 2))
def driver(niters, seed):
    curr = seed
    nxt = np.zeros(len(seed))
    nxt[0] = seed[0]
    nxt[-1] = seed[-1]

    start_time = time.time()

    threads_per_block = 256
    blocks_per_grid = int(math.ceil(float(len(curr) - 2) / threads_per_block))

    d_nxt = cuda.to_device(nxt)
    d_curr = cuda.to_device(curr)
    for iter in range(niters):
        kernel[blocks_per_grid, threads_per_block](d_nxt, d_curr, len(curr) - 2)

        tmp = d_nxt
        d_nxt = d_curr
        d_curr = tmp
    d_curr.copy_to_host(curr)
    elapsed_time = time.time() - start_time

    print('Elapsed time for N=' + str(len(seed) - 2) + ', # iters=' +
            str(niters) + ' is ' + str(elapsed_time) + ' s')
    print(str(float(niters) / elapsed_time) + ' iters / s')

    return curr
Example #12
0
    def test_func(self):

        @cuda.jit(argtypes=[float32[:, ::1], float32[:, ::1], float32[:, ::1]])
        def cu_square_matrix_mul(A, B, C):
            sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y
            bx = cuda.blockIdx.x
            by = cuda.blockIdx.y
            bw = cuda.blockDim.x
            bh = cuda.blockDim.y

            x = tx + bx * bw
            y = ty + by * bh

            acc = float32(0)  # forces all the math to be f32
            for i in range(bpg):
                if x < n and y < n:
                    sA[ty, tx] = A[y, tx + i * tpb]
                    sB[ty, tx] = B[ty + i * tpb, x]

                cuda.syncthreads()

                if x < n and y < n:
                    for j in range(tpb):
                        acc += sA[ty, j] * sB[j, tx]

                cuda.syncthreads()

            if x < n and y < n:
                C[y, x] = acc

        np.random.seed(42)
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        # Check result
        np.testing.assert_allclose(C, Cans, rtol=1e-5)
Example #13
0
 def test_devicearray_replace(self):
     N = 100
     array = np.arange(N, dtype=np.int32)
     original = array.copy()
     gpumem = cuda.to_device(array)
     cuda.to_device(array * 2, to=gpumem)
     gpumem.copy_to_host(array)
     np.testing.assert_array_equal(array, original * 2)
Example #14
0
def fork_test(q):
    from numba.cuda.cudadrv.error import CudaDriverError
    try:
        cuda.to_device(np.arange(1))
    except CudaDriverError as e:
        q.put(e)
    else:
        q.put(None)
Example #15
0
 def setup(self):
     self.stream = cuda.stream()
     self.d_callResult = cuda.to_device(callResultGold, self.stream)
     self.d_putResult = cuda.to_device(putResultGold, self.stream)
     self.d_stockPrice = cuda.to_device(stockPrice, self.stream)
     self.d_optionStrike = cuda.to_device(optionStrike, self.stream)
     self.d_optionYears = cuda.to_device(optionYears, self.stream)
     self.stream.synchronize()
Example #16
0
 def test_devicearray_replace(self):
     N = 100
     array = np.arange(N, dtype=np.int32)
     original = array.copy()
     gpumem = cuda.to_device(array)
     cuda.to_device(array * 2, to=gpumem)
     gpumem.copy_to_host(array)
     self.assertTrue((array == original * 2).all())
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #18
0
    def test_devicearray_contiguous_device_strided(self):
        d = cuda.to_device(np.arange(20))
        arr = np.arange(20)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(arr)[::2])
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))
Example #19
0
    def test_laplace_small(self):
        if config.ENABLE_CUDASIM:
            NN, NM = 4, 4
            iter_max = 20
        else:
            NN, NM = 256, 256
            iter_max = 1000

        A = np.zeros((NN, NM), dtype=np.float64)
        Anew = np.zeros((NN, NM), dtype=np.float64)

        n = NN
        m = NM

        tol = 1.0e-6
        error = 1.0

        for j in range(n):
            A[j, 0] = 1.0
            Anew[j, 0] = 1.0

        timer = time.time()
        iter = 0

        blockdim = (tpb, tpb)
        griddim = (NN // blockdim[0], NM // blockdim[1])

        error_grid = np.zeros(griddim)

        stream = cuda.stream()

        dA = cuda.to_device(A, stream)          # to device and don't come back
        dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
        derror_grid = cuda.to_device(error_grid, stream)

        while error > tol and iter < iter_max:
            self.assertTrue(error_grid.dtype == np.float64)

            jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

            derror_grid.copy_to_host(error_grid, stream=stream)


            # error_grid is available on host
            stream.synchronize()

            error = np.abs(error_grid).max()

            # swap dA and dAnew
            tmp = dA
            dA = dAnew
            dAnew = tmp

            iter += 1

        runtime = time.time() - timer
Example #20
0
    def test_contigous_2d(self):
        ary = np.arange(10)
        cary = ary.reshape(2, 5)
        fary = np.asfortranarray(cary)

        dcary = cuda.to_device(cary)
        dfary = cuda.to_device(fary)
        self.assertTrue(dcary.is_c_contigous())
        self.assertTrue(not dfary.is_c_contigous())
        self.assertTrue(not dcary.is_f_contigous())
        self.assertTrue(dfary.is_f_contigous())
Example #21
0
    def test_invalid_context_error_with_d2d(self):
        def d2d(dst, src):
            dst.copy_to_device(src)

        arr = np.arange(100)
        common = cuda.to_device(arr)
        darr = cuda.to_device(np.zeros(common.shape, dtype=common.dtype))
        th = threading.Thread(target=d2d, args=[darr, common])
        th.start()
        th.join()
        np.testing.assert_equal(darr.copy_to_host(), arr)
Example #22
0
def lapconv(img, filt, padval):
    """
    Performs FFT-based normalization on filter and image, without normalization

    :param numpy.core.multiarray.ndarray img: stimulus image to be convolved
    :param numpy.core.multiarray.ndarray filt: filter to convolve with
    :param float padval: value with which to pad the img before convolution
    :return: result of convolution
    :rtype: numpy.core.multiarray.ndarray
    """

    # get the number of nonzero entries in the filter for later dividing of the results
    filt_nnz = np.count_nonzero(filt)

    # pad the images
    s_filt = filt.shape
    s_img = img.shape

    # appropriate padding depends on context
    pad_img = np.ones((s_img[0] + s_filt[0], s_img[1] + s_filt[1])) * padval

    pad_img[0: s_img[0], 0: s_img[1]] = img

    pad_filt = np.zeros((s_img[0] + s_filt[0], s_img[1] + s_filt[1]))

    pad_filt[0: s_filt[0], 0: s_filt[1]] = filt

    # initialize the GPU
    FFTPlan(shape=pad_img.shape, itype=np.complex64, otype=np.complex64)

    # create temporary arrays for holding FFT values
    normtemp1 = np.zeros(pad_img.shape, dtype=np.complex64)
    normtemp2 = np.zeros(pad_img.shape, dtype=np.complex64)

    d_pad_filt = cuda.to_device(pad_filt.astype(np.complex64))
    d_pad_img = cuda.to_device(pad_img.astype(np.complex64))
    d_normtemp1 = cuda.to_device(normtemp1)
    d_normtemp2 = cuda.to_device(normtemp2)

    fft(d_pad_filt, d_normtemp1)
    fft(d_pad_img, d_normtemp2)
    vmult(d_normtemp1, d_normtemp2, out=d_normtemp1)
    ifft(d_normtemp1, d_normtemp2)
    # temp_out = (cuda.fft.ifft_inplace(cuda.fft.fft_inplace(pad_img)) * cuda.fft.fft_inplace(pad_filt)).real
    temp_out = d_normtemp2.copy_to_host().real

    # extract the appropriate portion of the filtered image
    filtered = temp_out[(s_filt[0] / 2): (s_filt[0] / 2) + s_img[0], (s_filt[1] / 2): (s_filt[1] / 2) + s_img[1]]

    # divide each value by the number of nonzero entries in the filter (and image?!?), so we get an average of all the
    # values
    filtered /= (filt_nnz * s_img[0] * s_img[1])

    return filtered
Example #23
0
 def __init__(self, positions, weights):
     self.calculate_forces = cuda.jit(
         argtypes=(float32[:,:], float32[:], float32[:,:])
         )(calculate_forces)
     self.accelerations = np.zeros_like(positions)
     self.n_bodies = len(weights)
     self.stream = cuda.stream()
     self.d_pos = cuda.to_device(positions, self.stream)
     self.d_wei = cuda.to_device(weights, self.stream)
     self.d_acc = cuda.to_device(self.accelerations, self.stream)
     self.stream.synchronize()
Example #24
0
    def test_device_array_interface(self):
        dary = cuda.device_array(shape=100)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.empty(100)
        dary = cuda.to_device(ary)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.asarray(1.234)
        dary = cuda.to_device(ary)
        self.assertEquals(dary.ndim, 1)
        devicearray.verify_cuda_ndarray_interface(dary)
Example #25
0
 def test_max_pending_count(self):
     # get deallocation manager and flush it
     deallocs = cuda.current_context().deallocations
     deallocs.clear()
     self.assertEqual(len(deallocs), 0)
     # deallocate to maximum count
     for i in range(config.CUDA_DEALLOCS_COUNT):
         cuda.to_device(np.arange(1))
         self.assertEqual(len(deallocs), i + 1)
     # one more to trigger .clear()
     cuda.to_device(np.arange(1))
     self.assertEqual(len(deallocs), 0)
Example #26
0
 def pooling(self, S, s, w, stride, th, blockdim, griddim):
     """
         Cuda Pooling Kernel call
         Returns the updated spike times
     """
     d_S = cuda.to_device(np.ascontiguousarray(S).astype(np.uint8))
     d_s = cuda.to_device(np.ascontiguousarray(s).astype(np.uint8))
     d_w = cuda.to_device(np.ascontiguousarray(w).astype(np.float32))
     S_out = np.empty(d_S.shape, dtype=d_S.dtype)
     pool[griddim, blockdim](d_S, d_s, d_w, stride, th)
     d_S.copy_to_host(S_out)
     return S_out
Example #27
0
    def test_event_elapsed(self):
        N = 32
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record()
        cuda.to_device(np.arange(N), to=dary)
        evtend.record()
        evtend.wait()
        evtend.synchronize()
        print(evtstart.elapsed_time(evtend))
Example #28
0
def is_points_inside_cuda(points, solution):
    threads_per_block = 128
    blocks_per_grid_x = math.ceil(points.shape[0] / threads_per_block)
    blocks_per_grid_y = math.ceil(solution.shape[0])

    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
    result = np.empty((points.shape[0], solution.shape[0]), dtype=bool)

    p_points = cuda.to_device(points)
    p_edges = cuda.to_device(solution)
    ray_intersect_segment_cuda[blocks_per_grid, threads_per_block](p_points, p_edges, result)
    return odd(np.sum(result, axis=1))
def extract_1dlbp_gpu(input, neighborhood, d_powers):
    maxThread = 512
    blockDim = maxThread
    d_input = cuda.to_device(input)

    hist = np.zeros(2 ** (2 * neighborhood), dtype='int32')
    gridDim = (len(input) - 2 * neighborhood + blockDim) / blockDim

    d_hist = cuda.to_device(hist)

    lbp_kernel[gridDim, blockDim](d_input, neighborhood, d_powers, d_hist)
    d_hist.to_host()
    return hist
def convolve():
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()

    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    image = get_image()

    print("Image size: %s" % (image.shape,))

    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    # Use SciPy to perform the FFT convolution
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print('CPU: %.2fs' % (te - ts))

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Initialize the cuFFT system.
    FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)

    task1(d_image_complex, d_response_complex)

    cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print('GPU: %.2fs' % (te - ts))

    return cvimage_cpu, cvimage_gpu
Example #31
0
def connected_comps_gpu(dest_in,
                        weight_in,
                        firstEdge_in,
                        outDegree_in,
                        MAX_TPB=512,
                        stream=None):
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    dest = cuda.to_device(dest_in, stream=myStream)
    weight = cuda.to_device(weight_in, stream=myStream)
    firstEdge = cuda.to_device(firstEdge_in, stream=myStream)
    outDegree = cuda.to_device(outDegree_in, stream=myStream)

    n_vertices = firstEdge.size
    n_edges = dest.size

    n_components = n_vertices

    # still need edge_id for conflict resolution in find_minedge
    edge_id = cuda.to_device(np.arange(n_edges, dtype=dest.dtype),
                             stream=myStream)

    #labels = np.empty(n_vertices, dtype = dest.dtype)
    first_iter = True

    # initialize with name top_edge so we can recycle an array between iterations
    top_edge = cuda.device_array(n_components,
                                 dtype=dest.dtype,
                                 stream=myStream)
    labels = cuda.device_array(n_components, dtype=dest.dtype, stream=myStream)

    converged = cuda.device_array(1, dtype=np.int8, stream=myStream)

    gridDimLabels = compute_cuda_grid_dim(n_components, MAX_TPB)
    gridDim = compute_cuda_grid_dim(n_components, MAX_TPB)

    final_converged = False
    while (not final_converged):
        vertex_minedge = top_edge

        findMinEdge_CUDA[gridDim, MAX_TPB,
                         myStream](weight, firstEdge, outDegree,
                                   vertex_minedge, dest)

        removeMirroredEdges_CUDA[gridDim, MAX_TPB, myStream](dest,
                                                             vertex_minedge)

        colors = cuda.device_array(shape=n_components,
                                   dtype=np.int32,
                                   stream=myStream)
        initializeColors_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge,
                                                          colors)

        # propagate colors until convergence
        propagateConverged = False
        while (not propagateConverged):
            propagateColors_CUDA[gridDim, MAX_TPB, myStream](colors, converged)
            converged_num = converged.getitem(0, stream=myStream)
            propagateConverged = True if converged_num == 1 else False

        # first we build the flags in the new_vertex array
        new_vertex = vertex_minedge  # reuse the vertex_minedge array as the new new_vertex
        buildFlag_CUDA[gridDim, MAX_TPB, myStream](colors, new_vertex)

        # new_n_vertices is the number of vertices of the new contracted graph
        new_n_vertices = ex_prefix_sum_gpu(new_vertex,
                                           MAX_TPB=MAX_TPB,
                                           stream=myStream).getitem(
                                               0, stream=myStream)
        new_n_vertices = int(new_n_vertices)

        if first_iter:
            # first iteration defines labels as the initial colors and updates
            labels.copy_to_device(colors, stream=myStream)
            first_iter = False

        # other iterations update the labels with the new colors
        update_labels_single_pass_cuda[gridDimLabels, MAX_TPB,
                                       myStream](labels, colors, new_vertex)

        if new_n_vertices == 1:
            final_converged = True
            del new_vertex
            break

        newGridDim = compute_cuda_grid_dim(n_components, MAX_TPB)

        # count number of edges for new supervertices and write in new outDegree
        newOutDegree = cuda.device_array(shape=new_n_vertices,
                                         dtype=np.int32,
                                         stream=myStream)
        memSet[newGridDim, MAX_TPB, myStream](newOutDegree,
                                              0)  # zero the newOutDegree array
        countNewEdges_CUDA[gridDim, MAX_TPB,
                           myStream](colors, firstEdge, outDegree, dest,
                                     new_vertex, newOutDegree)

        # new first edge array for contracted graph
        newFirstEdge = cuda.device_array_like(newOutDegree, stream=myStream)
        # copy newOutDegree to newFirstEdge
        newFirstEdge.copy_to_device(newOutDegree, stream=myStream)
        new_n_edges = ex_prefix_sum_gpu(newFirstEdge,
                                        MAX_TPB=MAX_TPB,
                                        stream=myStream)

        new_n_edges = new_n_edges.getitem(0, stream=myStream)
        new_n_edges = int(new_n_edges)

        # if no edges remain, then MST has converged
        if new_n_edges == 0:
            final_converged = True
            del newOutDegree, newFirstEdge, new_vertex
            break

        # create arrays for new edges
        new_dest = cuda.device_array(new_n_edges,
                                     dtype=np.int32,
                                     stream=myStream)
        new_edge_id = cuda.device_array(new_n_edges,
                                        dtype=np.int32,
                                        stream=myStream)
        new_weight = cuda.device_array(new_n_edges,
                                       dtype=weight.dtype,
                                       stream=myStream)

        top_edge = cuda.device_array_like(newFirstEdge, stream=myStream)
        top_edge.copy_to_device(newFirstEdge, stream=myStream)

        # assign and insert new edges
        assignInsert_CUDA[gridDim, MAX_TPB,
                          myStream](edge_id, dest, weight, firstEdge,
                                    outDegree, colors, new_vertex, new_dest,
                                    new_edge_id, new_weight, top_edge)

        # delete old graph
        del new_vertex, edge_id, dest, weight, firstEdge, outDegree, colors

        # write new graph
        n_components = newFirstEdge.size
        edge_id = new_edge_id
        dest = new_dest
        weight = new_weight
        firstEdge = newFirstEdge
        outDegree = newOutDegree
        gridDim = newGridDim

    returnLabels = labels.copy_to_host()

    del dest, weight, edge_id, firstEdge, outDegree, converged, labels

    return returnLabels
    VAR1 = np.full((nx + 2 * nb, ny + 2 * nb, nz), np.nan, dtype=wp)
    VAR2 = np.full((nx + 2 * nb, ny + 2 * nb, nz), np.nan, dtype=wp)

    VAR[ii, jj, :] = np.random.random((nx, ny, nz))
    VAR1[ii, jj, :] = np.random.random((nx, ny, nz))
    VAR2[ii, jj, :] = np.random.random((nx, ny, nz))

    dVARdt[ii, jj, :] = 0.

    exchange_BC(VAR)
    exchange_BC(VAR1)
    exchange_BC(VAR2)

    stream = cuda.stream()

    VARd = cuda.to_device(VAR, stream)
    dVARdtd = cuda.to_device(dVARdt, stream)
    VAR1d = cuda.to_device(VAR1, stream)
    VAR2d = cuda.to_device(VAR2, stream)

    kernel_x = cuda.jit(cuda_kernel_decorator(kernel_x))(kernel_x)
    kernel_y = cuda.jit(cuda_kernel_decorator(kernel_y))(kernel_y)
    kernel_z = cuda.jit(cuda_kernel_decorator(kernel_z))(kernel_z)
    kernel_improve = cuda.jit(cuda_kernel_decorator(kernel_improve))\
                                (kernel_improve)
    kernel_shared = cuda.jit(cuda_kernel_decorator(kernel_shared))\
                                (kernel_shared)
    kernel_shared_z = cuda.jit(cuda_kernel_decorator(kernel_shared_z))\
                                (kernel_shared_z)
    kernel_shared_x = cuda.jit(cuda_kernel_decorator(kernel_shared_x))\
                                (kernel_shared_x)
    def test_array_views(self):
        """Views created via array interface support:
            - Strided slices
            - Strided slices
        """
        h_arr = np.random.random(10)
        c_arr = cuda.to_device(h_arr)

        arr = cuda.as_cuda_array(c_arr)

        # __getitem__ interface accesses expected data

        # Direct views
        np.testing.assert_array_equal(arr.copy_to_host(), h_arr)
        np.testing.assert_array_equal(arr[:].copy_to_host(), h_arr)

        # Slicing
        np.testing.assert_array_equal(arr[:5].copy_to_host(), h_arr[:5])

        # Strided view
        np.testing.assert_array_equal(arr[::2].copy_to_host(), h_arr[::2])

        # View of strided array
        arr_strided = cuda.as_cuda_array(c_arr[::2])
        np.testing.assert_array_equal(arr_strided.copy_to_host(), h_arr[::2])

        # A strided-view-of-array and view-of-strided-array have the same
        # shape, strides, itemsize, and alloc_size
        self.assertEqual(arr[::2].shape, arr_strided.shape)
        self.assertEqual(arr[::2].strides, arr_strided.strides)
        self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize)
        self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size)
        self.assertEqual(arr[::2].nbytes,
                         arr_strided.size * arr_strided.dtype.itemsize)

        # __setitem__ interface propagates into external array

        # Writes to a slice
        arr[:5] = np.pi
        np.testing.assert_array_equal(
            c_arr.copy_to_host(),
            np.concatenate((np.full(5, np.pi), h_arr[5:]))
        )

        # Writes to a slice from a view
        arr[:5] = arr[5:]
        np.testing.assert_array_equal(
            c_arr.copy_to_host(),
            np.concatenate((h_arr[5:], h_arr[5:]))
        )

        # Writes through a view
        arr[:] = cuda.to_device(h_arr)
        np.testing.assert_array_equal(c_arr.copy_to_host(), h_arr)

        # Writes to a strided slice
        arr[::2] = np.pi
        np.testing.assert_array_equal(
            c_arr.copy_to_host()[::2],
            np.full(5, np.pi),
        )
        np.testing.assert_array_equal(
            c_arr.copy_to_host()[1::2],
            h_arr[1::2]
        )
 def test_negative_slicing_2d(self):
     arr = np.arange(12).reshape(3, 4)
     darr = cuda.to_device(arr)
     for x, y, w, s in product(range(-4, 4), repeat=4):
         np.testing.assert_array_equal(arr[x:y, w:s],
                                       darr[x:y, w:s].copy_to_host())
Example #35
0
 def apply(self, frame):
     d_frame = cuda.to_device(frame.flatten().astype(np.float32))
     ColorGrade.k_tonemap[(self.blocksPGrid),(self.threadsPBlock)](d_frame, self.tonemap_coeffs, np.int32(self.framesize))
     ColorGrade.k_colorcorrect[(self.blocksPGrid),(self.threadsPBlock)](d_frame, self.cc_res, self.cc_vals, np.int32(self.framesize))
     return d_frame.copy_to_host().reshape(self.h, self.w, 3)
Example #36
0
    C[y, x] = 0
    for i in range(n):
        C[y, x] += A[y, i] * B[i, x]


A = np.array(np.random.random((n, n)), dtype=np.float32)
B = np.array(np.random.random((n, n)), dtype=np.float32)
C = np.empty_like(A)

print("N = %d x %d" % (n, n))

s = time()
stream = cuda.stream()
with stream.auto_synchronize():
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)
    dC = cuda.to_device(C, stream)
    cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
    dC.to_host(stream)

e = time()
tcuda = e - s

# Host compute
Amat = np.matrix(A)
Bmat = np.matrix(B)

s = time()
Cans = Amat * Bmat
e = time()
Example #37
0
    def test_blackscholes(self):
        OPT_N = 400
        iterations = 2

        stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
        optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
        optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)

        callResultNumpy = np.zeros(OPT_N)
        putResultNumpy = -np.ones(OPT_N)

        callResultNumba = np.zeros(OPT_N)
        putResultNumba = -np.ones(OPT_N)

        # numpy
        for i in range(iterations):
            black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                          optionStrike, optionYears, RISKFREE, VOLATILITY)

        @cuda.jit(double(double), device=True, inline=True)
        def cnd_cuda(d):
            K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
            ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) *
                       (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
            if d > 0:
                ret_val = 1.0 - ret_val
            return ret_val

        @cuda.jit(
            void(double[:], double[:], double[:], double[:], double[:], double,
                 double))
        def black_scholes_cuda(callResult, putResult, S, X, T, R, V):
            i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
            if i >= S.shape[0]:
                return
            sqrtT = math.sqrt(T[i])
            d1 = ((math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) /
                  (V * sqrtT))
            d2 = d1 - V * sqrtT
            cndd1 = cnd_cuda(d1)
            cndd2 = cnd_cuda(d2)

            expRT = math.exp((-1. * R) * T[i])
            callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2)
            putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] *
                            (1.0 - cndd1))

        # numba
        blockdim = 512, 1
        griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
        stream = cuda.stream()
        d_callResult = cuda.to_device(callResultNumba, stream)
        d_putResult = cuda.to_device(putResultNumba, stream)
        d_stockPrice = cuda.to_device(stockPrice, stream)
        d_optionStrike = cuda.to_device(optionStrike, stream)
        d_optionYears = cuda.to_device(optionYears, stream)

        for i in range(iterations):
            black_scholes_cuda[griddim, blockdim,
                               stream](d_callResult, d_putResult, d_stockPrice,
                                       d_optionStrike, d_optionYears, RISKFREE,
                                       VOLATILITY)
        d_callResult.copy_to_host(callResultNumba, stream)
        d_putResult.copy_to_host(putResultNumba, stream)
        stream.synchronize()

        delta = np.abs(callResultNumpy - callResultNumba)
        L1norm = delta.sum() / np.abs(callResultNumpy).sum()

        max_abs_err = delta.max()
        self.assertTrue(L1norm < 1e-13)
        self.assertTrue(max_abs_err < 1e-13)
Example #38
0
            array[i] *= 2
            array[i] /= 2


data = []
data_gpu = []
gpu_out = []
streams = []

for _ in range(NUM_ARRAYS):
    streams.append(cuda.stream())
    data.append(np.random.randn(ARRAY_LEN).astype('float32'))

t_start = perf_counter()

for k in range(NUM_ARRAYS):
    data_gpu.append(cuda.to_device(data[k], stream=streams[k]))

for k in range(NUM_ARRAYS):
    kernel[1, 64, streams[k]](data_gpu[k])

for k in range(NUM_ARRAYS):
    gpu_out.append(data_gpu[k].copy_to_host(stream=streams[k]))

t_end = perf_counter()

for k in range(NUM_ARRAYS):
    assert (np.allclose(gpu_out[k], data[k]))

print(f'Total time: {t_end - t_start: .2f} s')
    weight_ar[128] = 2.3
    weight_ar[129] = 2.3
    weight_ar[130] = 2.3
    weight_ar[191] = 2.3
    weight_ar[192] = 2.3
    weight_ar[193] = 2.3
    weight_ar[254] = 2.3
    weight_ar[255] = 2.3
    weight_ar[256] = 2.3
    weight_ar[257] = 2.3

if rowlen > 149:
    weight_ar[22486] = 2.54

# Explicitly copy to device
d_weight_ar = cuda.to_device(weight_ar)
d_nonzero_ar = cuda.to_device(nonzero_ar)
d_scan_ar = cuda.to_device(scan_ar)

# scanf_ar is a data structure to hold the final, corrected scan
scanf_ar = np.zeros((arrayszplus, ), dtype=np.uint32)
d_scanf_ar = cuda.to_device(scanf_ar)

# We now have the arrays in d_carrylist, which needs to be scanned, so that it
# can be added to each block of d_scan_ar. If d_carry is of large
# size, then we need to recursively scan until we do a single scan on
# the multiprocessor

#
# Make up a list of carry vectors and allocate device memory
#
 def test_index_1d(self):
     arr = np.arange(10)
     darr = cuda.to_device(arr)
     for i in range(arr.size):
         self.assertEqual(arr[i], darr[i])
Example #41
0
 def to_device(self, hostary, stream):
     return cuda.to_device(hostary, stream=stream)
 def test_strided_index_1d(self):
     arr = np.arange(10)
     darr = cuda.to_device(arr)
     for i in range(arr.size):
         np.testing.assert_equal(arr[i::2], darr[i::2].copy_to_host())
 def test_strides(self):
     arr = np.ones(20)
     darr = cuda.to_device(arr)
     arr[::2] = 500
     darr[::2] = 500
     np.testing.assert_array_equal(darr.copy_to_host(), arr)
 def test_broadcast(self):
     arr = np.arange(5 * 7).reshape(5, 7)
     darr = cuda.to_device(arr)
     arr[:, 2] = 500
     darr[:, 2] = 500
     np.testing.assert_array_equal(darr.copy_to_host(), arr)
Example #45
0
import numpy as np
from numba import cuda


@cuda.jit
def square_device(a, out):
    idx = cuda.grid(1)
    out[idx] = a[idx]**2


n = 4096
a = np.arange(n)

d_a = cuda.to_device(a)
d_out = cuda.device_array(shape=(n, ), dtype=np.float32)

threads = 32
blocks = 128

square_device[blocks, threads](d_a, d_out)
Example #46
0
domain = [0.0, 1.0, 0.0, 1.0]

D = 1.0  # Diffusion coefficient

x_spacing = 1.0 / float(c.shape[0])
y_spacing = 1.0 / float(c.shape[1])

# Store spacing as inverse square to avoid repeated division
inv_xspsq = 1.0 / (x_spacing**2)
inv_yspsq = 1.0 / (y_spacing**2)

# Satisfy stability condition
time_step = 0.25 * min(x_spacing, y_spacing)**2

# Copy this array to the device, and create a new device array to hold updated value
d_c = cuda.to_device(c)
d_c_new = cuda.device_array(c.shape, dtype=np.float)

threads_per_block = (32, 32)
blocks_per_grid = (dim // 30, dim // 30)

t1 = timer()  # Start timer

# Evolve forward 2000 steps
for step in range(2000):

    # Launch the kernel
    diffusion_kernel_shared[blocks_per_grid,
                            threads_per_block](D, inv_xspsq, inv_yspsq,
                                               time_step, d_c, d_c_new)
Example #47
0
    """
    row, col = cuda.grid(2)
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp
        
# Host code

# Initialize the data arrays
A = numpy.full((2400, 1200), 3, numpy.float) # matrix containing all 3's
B = numpy.full((1200, 2200), 4, numpy.float) # matrix containing all 4's

# Copy the arrays to the device
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)

# Allocate memory on the device for the result
C_global_mem = cuda.device_array((2400, 2200))

# Configure the blocks
threadsperblock = (30, 30)
blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

# Start the kernel 
matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)

# Copy the result back to the host
    # Read the input image
    original = cv2.imread("data/original0.png")
    #original = cv2.resize(original, None, fx=0.25, fy=0.25)
    original = cv2.split(cv2.cvtColor(original, cv2.COLOR_BGR2YCR_CB))[0]
    original = original.astype(np.uint64) / 255
    #original = np.full_like(original, 1)
    dim = np.array([original.shape[1], original.shape[0]])
    print(original.shape)

    p = np.array([64]) # pixels to interpolate in row
    max_levels = np.array([int(math.log2(original.shape[1]))]) # summation levels in CUDA for logn performance
    template = np.zeros_like(original)


    # Create images in GPU
    d_original = cuda.to_device(np.ascontiguousarray(original), stream=stream)
    d_interpolated = cuda.to_device(np.ascontiguousarray(template), stream=stream)
    d_dim = cuda.to_device(np.ascontiguousarray(dim), stream=stream)
    d_p = cuda.to_device(np.ascontiguousarray(p), stream=stream)

    interpolate_image[512,512](d_original, d_p, d_dim, d_interpolated)


""" FOR PSNR
    # Compute image squared error original-interpolated
    d_squared_error = cuda.to_device(np.ascontiguousarray(template), stream=stream)
    image_squared_error[512,512](d_original, d_interpolated, d_dim, d_squared_error)

    # Sum squared errors
    d_max_levels = cuda.to_device(np.ascontiguousarray(max_levels), stream=stream)
    row_sum[512,512](d_squared_error, d_dim, d_max_levels)
pad_size = int(kz_degree*(kz_window-1)/2)

times_no_mem = np.zeros(300, dtype=np.float)
times_mem = np.zeros(300, dtype=np.float)
x = np.zeros(300, dtype=np.int32)

for i in range(0, 300):
    dt = 0.1
    timePoints = np.arange(0, i+dt, dt)
    signal_original = np.sin(2*np.pi*0.05*timePoints)
    result = np.zeros(signal_original.size)
    signal = np.append(np.zeros(pad_size),np.append(signal_original,np.zeros(pad_size))) # Padded left and right

    # time transfer
    t_transfer_signal = synchronous_kernel_timeit(lambda: cuda.to_device(signal), number=100)
    t_transfer_coeffs = synchronous_kernel_timeit(lambda: cuda.to_device(kz_coeffs), number=100)
    t_transfer_result = synchronous_kernel_timeit(lambda: cuda.device_array(result.size), number=100)
    t_transfer = t_transfer_signal + t_transfer_result + t_transfer_coeffs

    # time no mem
    t_no_mem = time_filter(1, 100, signal, kz_coeffs, result)

    # time mem
    signal_dev = cuda.to_device(signal)
    result_dev = cuda.device_array(result.size)
    coeffs_dev = cuda.to_device(kz_coeffs)
    t_mem = time_filter(1, 100, signal_dev, coeffs_dev, result_dev)

    times_mem[i] = t_transfer+t_mem
    times_no_mem[i] = t_no_mem
    c1[0] = int64(temp1)
    c1[1] = int64(temp2)
    c1[2] = int64(temp3)
    c1[3] = int64(temp4)
    c1[4] = int64(temp5)
    c1[5] = int64(temp6)

    a = [temp1, temp2, temp3, temp4, temp5, temp6]
    for i in range(c1.shape[0]):
        #       for j in range(0,5):
        c1[i] = a[i]
    return a


# Copy the arrays to the device
n1 = cuda.to_device(n1)
dev_inp = cuda.to_device(one)  # alloc and copy input data

test(dev_inp, dev_inp)  # invoke the gufunc

dev_inp.copy_to_host(one)
# Copy the result back to the host
C = dev_inp.copy_to_host()


#thread and parallel
def test(x1, c1):
    i = 0
    e1 = x1[i]
    e2 = x1[i + 1]
    x2 = n1[e1]
Example #51
0
def cubeOverlap(gtNums, dtNums, gts, dts, hAxis, criterion, gpuId):
    '''
    gtNums: array of #gt with shape (#frame, ), dtype uint8
    dtNums: array of #dt with shape (#frame, ), dtype uint8
    gts: array of gt with shape (#frame, maxGTnum, itemNum), dtype float32
    dts: array of dt with shape (#frame, maxDTnum, itemNum), dtype float32
    hAxis: height axis (uint8)
    criterion: overlap criterion (int8)
    gpuId: device id
    item info: x, y, z, dx, dy, dz, heading
    '''
    assert gtNums.shape == dtNums.shape, 'gtNums {} frames while dtNums {} frames'.format(
        gtNums.shape, dtNums.shape)
    assert gtNums.dtype == np.uint8, 'gtNums: uint8 expected but {} given'.format(
        gtNums.dtype)
    assert dtNums.dtype == np.uint8, 'dtNums: uint8 expected but {} given'.format(
        dtNums.dtype)
    assert gts.shape[0] == dts.shape[
        0], 'gts {} frames while dts {} frames'.format(gts.shape[0],
                                                       dts.shape[0])
    assert gts.dtype == np.float32, 'gts: float32 expected but {} given'.format(
        gts.dtype)
    assert dts.dtype == np.float32, 'dts: float32 expected but {} given'.format(
        dts.dtype)
    assert gtNums.shape[0] == gts.shape[
        0], 'gtNums {} frames while gts {} frames'.format(
            gtNums.shape[0], gts.shape[0])

    assert hAxis in set([0, 1, 2]), 'invalid hAxis {}'.format(hAxis)
    hAxis = np.uint8(hAxis)
    assert criterion in set(
        [-1, 0, 1]), 'invalid overlap criterion {}'.format(criterion)
    criterion = np.int8(criterion)

    overlaps = np.zeros((gts.shape[0], gts.shape[1], dts.shape[1]), np.float32)
    if not np.all(overlaps.shape):
        return overlaps

    assert gts.shape[-1] == 7, 'invalid gt shape {}'.format(gts.shape)
    assert dts.shape[-1] == 7, 'invalid dt shape {}'.format(dts.shape)

    device = cuda.select_device(gpuId)
    # if device.id != gpuId:
    #     cuda.close()
    #     device = cuda.select_device(gpuId)
    assert device is None or device.id == gpuId, 'wake up {}th gpu rather than requested {}th gpu'.format(
        device.id, gpuId)

    blocksPerGrid = gts.shape[0]
    threadsPerBlock = gts.shape[1]
    gtCubeNum = np.uint8(gts.shape[1])
    dtCubeNum = np.uint8(dts.shape[1])
    stream = cuda.stream()
    with stream.auto_synchronize():
        gtNumsGPU = cuda.to_device(gtNums, stream)
        dtNumsGPU = cuda.to_device(dtNums, stream)
        gtsGPU = cuda.to_device(gts.reshape(-1), stream)
        dtsGPU = cuda.to_device(dts.reshape(-1), stream)
        overlapsGPU = cuda.to_device(overlaps.reshape(-1), stream)
        cubeOverlap_kernel[blocksPerGrid, threadsPerBlock,
                           stream](gtNumsGPU, dtNumsGPU, gtCubeNum, dtCubeNum,
                                   gtsGPU, dtsGPU, overlapsGPU, hAxis,
                                   criterion)
        overlapsGPU.copy_to_host(overlaps.reshape(-1), stream)
    return overlaps
    def test_prefix_select(self):
        arr = np.arange(5 * 7).reshape(5, 7, order='F')

        darr = cuda.to_device(arr)
        self.assertTrue(np.all(darr[:1, 1].copy_to_host() == arr[:1, 1]))
Example #53
0
def _gpu_stump(
    T_A_fname,
    T_B_fname,
    m,
    range_stop,
    excl_zone,
    M_T_fname,
    Σ_T_fname,
    QT_fname,
    QT_first_fname,
    μ_Q_fname,
    σ_Q_fname,
    k,
    ignore_trivial=True,
    range_start=1,
    device_id=0,
):
    """
    A Numba CUDA version of STOMP for parallel computation of the
    matrix profile, matrix profile indices, left matrix profile indices,
    and right matrix profile indices.

    Parameters
    ----------
    T_A_fname : str
        The file name for the time series or sequence for which to compute
        the matrix profile

    T_B_fname : str
        The file name for the time series or sequence that will be used to annotate T_A.
        For every subsequence in T_A, its nearest neighbor in T_B will be recorded.

    m : int
        Window size

    range_stop : int
        The index value along T_B for which to stop the matrix profile
        calculation. This parameter is here for consistency with the
        distributed `stumped` algorithm.

    excl_zone : int
        The half width for the exclusion zone relative to the current
        sliding window

    M_T_fname : str
        The file name for the sliding mean of time series, `T`

    Σ_T_fname : str
        The file name for the sliding standard deviation of time series, `T`

    QT_fname : str
        The file name for the dot product between some query sequence,`Q`,
        and time series, `T`

    QT_first_fname : str
        The file name for the QT for the first window relative to the current
        sliding window

    μ_Q_fname : str
        The file name for the mean of the query sequence, `Q`, relative to
        the current sliding window

    σ_Q_fname : str
        The file name for the standard deviation of the query sequence, `Q`,
        relative to the current sliding window

    k : int
        The total number of sliding windows to iterate over

    ignore_trivial : bool
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this to
        `False`. Default is `True`.

    range_start : int
        The starting index value along T_B for which to start the matrix
        profile calculation. Default is 1.

    device_id : int
        The (GPU) device number to use. The default value is `0`.

    Returns
    -------
    profile_fname : str
        The file name for the matrix profile

    indices_fname : str
        The file name for the matrix profile indices. The first column of the
        array consists of the matrix profile indices, the second column consists
        of the left matrix profile indices, and the third column consists of the
        right matrix profile indices.

    Notes
    -----
    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6

    Timeseries, T_A, will be annotated with the distance location
    (or index) of all its subsequences in another times series, T_B.

    Return: For every subsequence, Q, in T_A, you will get a distance
    and index for the closest subsequence in T_A. Thus, the array
    returned will have length T_A.shape[0]-m+1. Additionally, the
    left and right matrix profiles are also returned.

    Note: Unlike in the Table II where T_A.shape is expected to be equal
    to T_B.shape, this implementation is generalized so that the shapes of
    T_A and T_B can be different. In the case where T_A.shape == T_B.shape,
    then our algorithm reduces down to the same algorithm found in Table II.

    Additionally, unlike STAMP where the exclusion zone is m/2, the default
    exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3).

    For self-joins, set `ignore_trivial = True` in order to avoid the
    trivial match.

    Note that left and right matrix profiles are only available for self-joins.
    """
    threads_per_block = config.STUMPY_THREADS_PER_BLOCK
    blocks_per_grid = math.ceil(k / threads_per_block)

    T_A = np.load(T_A_fname, allow_pickle=False)
    T_B = np.load(T_B_fname, allow_pickle=False)
    QT = np.load(QT_fname, allow_pickle=False)
    QT_first = np.load(QT_first_fname, allow_pickle=False)
    M_T = np.load(M_T_fname, allow_pickle=False)
    Σ_T = np.load(Σ_T_fname, allow_pickle=False)
    μ_Q = np.load(μ_Q_fname, allow_pickle=False)
    σ_Q = np.load(σ_Q_fname, allow_pickle=False)

    with cuda.gpus[device_id]:
        device_T_A = cuda.to_device(T_A)
        device_QT_odd = cuda.to_device(QT)
        device_QT_even = cuda.to_device(QT)
        device_QT_first = cuda.to_device(QT_first)
        device_μ_Q = cuda.to_device(μ_Q)
        device_σ_Q = cuda.to_device(σ_Q)
        if ignore_trivial:
            device_T_B = device_T_A
            device_M_T = device_μ_Q
            device_Σ_T = device_σ_Q
        else:
            device_T_B = cuda.to_device(T_B)
            device_M_T = cuda.to_device(M_T)
            device_Σ_T = cuda.to_device(Σ_T)

        profile = np.full((k, 3), np.inf)  # float64
        indices = np.full((k, 3), -1, dtype=np.int64)  # int64

        device_profile = cuda.to_device(profile)
        device_indices = cuda.to_device(indices)
        _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block](
            range_start - 1,
            device_T_A,
            device_T_B,
            m,
            device_QT_even,
            device_QT_odd,
            device_QT_first,
            device_M_T,
            device_Σ_T,
            device_μ_Q,
            device_σ_Q,
            k,
            ignore_trivial,
            excl_zone,
            device_profile,
            device_indices,
            False,
        )

        for i in range(range_start, range_stop):
            _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block](
                i,
                device_T_A,
                device_T_B,
                m,
                device_QT_even,
                device_QT_odd,
                device_QT_first,
                device_M_T,
                device_Σ_T,
                device_μ_Q,
                device_σ_Q,
                k,
                ignore_trivial,
                excl_zone,
                device_profile,
                device_indices,
                True,
            )

        profile = device_profile.copy_to_host()
        indices = device_indices.copy_to_host()
        profile = np.sqrt(profile)

        profile_fname = core.array_to_temp_file(profile)
        indices_fname = core.array_to_temp_file(indices)

    return profile_fname, indices_fname
Example #54
0
    def test_ex_matmul(self):
        """Test of matrix multiplication on various cases."""
        # magictoken.ex_import.begin
        from numba import cuda, float32
        import numpy as np
        import math
        # magictoken.ex_import.end

        # magictoken.ex_matmul.begin
        @cuda.jit
        def matmul(A, B, C):
            """Perform square matrix multiplication of C = A * B."""
            i, j = cuda.grid(2)
            if i < C.shape[0] and j < C.shape[1]:
                tmp = 0.
                for k in range(A.shape[1]):
                    tmp += A[i, k] * B[k, j]
                C[i, j] = tmp
        # magictoken.ex_matmul.end

        # magictoken.ex_run_matmul.begin
        x_h = np.arange(16).reshape([4, 4])
        y_h = np.ones([4, 4])
        z_h = np.zeros([4, 4])

        x_d = cuda.to_device(x_h)
        y_d = cuda.to_device(y_h)
        z_d = cuda.to_device(z_h)

        threadsperblock = (16, 16)
        blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
        blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)

        matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
        z_h = z_d.copy_to_host()
        print(z_h)
        print(x_h @ y_h)
        # magictoken.ex_run_matmul.end

        # magictoken.ex_fast_matmul.begin
        # Controls threads per block and shared memory usage.
        # The computation will be done on blocks of TPBxTPB elements.
        # TPB should not be larger than 32 in this example
        TPB = 16

        @cuda.jit
        def fast_matmul(A, B, C):
            """
            Perform matrix multiplication of C = A * B using CUDA shared memory.

            Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
            """
            # Define an array in the shared memory
            # The size and type of the arrays must be known at compile time
            sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
            sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

            x, y = cuda.grid(2)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y
            bpg = cuda.gridDim.x    # blocks per grid

            # Each thread computes one element in the result matrix.
            # The dot product is chunked into dot products of TPB-long vectors.
            tmp = float32(0.)
            for i in range(bpg):
                # Preload data into shared memory
                sA[ty, tx] = 0
                sB[ty, tx] = 0
                if y < A.shape[0] and (tx + i * TPB) < A.shape[1]:
                    sA[ty, tx] = A[y, tx + i * TPB]
                if x < B.shape[1] and (ty + i * TPB) < B.shape[0]:
                    sB[ty, tx] = B[ty + i * TPB, x]

                # Wait until all threads finish preloading
                cuda.syncthreads()

                # Computes partial product on the shared memory
                for j in range(TPB):
                    tmp += sA[ty, j] * sB[j, tx]

                # Wait until all threads finish computing
                cuda.syncthreads()
            if y < C.shape[0] and x < C.shape[1]:
                C[y, x] = tmp
        # magictoken.ex_fast_matmul.end

        # magictoken.ex_run_fast_matmul.begin
        x_h = np.arange(16).reshape([4, 4])
        y_h = np.ones([4, 4])
        z_h = np.zeros([4, 4])

        x_d = cuda.to_device(x_h)
        y_d = cuda.to_device(y_h)
        z_d = cuda.to_device(z_h)

        threadsperblock = (TPB, TPB)
        blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
        blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)

        fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
        z_h = z_d.copy_to_host()
        print(z_h)
        print(x_h @ y_h)
        # magictoken.ex_run_fast_matmul.end

        # fast_matmul test(s)
        msg = "fast_matmul incorrect for shared memory, square case."
        self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)

        # magictoken.ex_run_nonsquare.begin
        x_h = np.arange(115).reshape([5, 23])
        y_h = np.ones([23, 7])
        z_h = np.zeros([5, 7])

        x_d = cuda.to_device(x_h)
        y_d = cuda.to_device(y_h)
        z_d = cuda.to_device(z_h)

        threadsperblock = (TPB, TPB)
        grid_y_max = max(x_h.shape[0], y_h.shape[0])
        grid_x_max = max(x_h.shape[1], y_h.shape[1])
        blockspergrid_x = math.ceil(grid_x_max / threadsperblock[0])
        blockspergrid_y = math.ceil(grid_y_max / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)

        fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
        z_h = z_d.copy_to_host()
        print(z_h)
        print(x_h @ y_h)
        # magictoken.ex_run_nonsquare.end

        # nonsquare fast_matmul test(s)
        msg = "fast_matmul incorrect for shared memory, non-square case."
        self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
        both = np.r_[left, right]
        res[np.sum(p[both >= input[i]])] += 1
    return res


X = np.arange(3, 7)
X = 10**X
neighborhood = 4

cpu_times = np.zeros(X.shape[0])
cpu_times_simple = cpu_times.copy()
cpu_times_jit = cpu_times.copy()
gpu_times = np.zeros(X.shape[0])

p = 1 << np.array(range(0, 2 * neighborhood), dtype='int32')
d_powers = cuda.to_device(p)

for i, x in enumerate(X):
    input = np.random.randint(0, 256, size=x).astype(np.uint8)

    print "Length: {0}".format(x)
    print "--------------"

    start = timer()
    h_cpu = extract_1dlbp_cpu(input, neighborhood, p)
    cpu_times[i] = timer() - start
    print "Finished on CPU: time: {0:3.5f}s".format(cpu_times[i])

    res = np.zeros(1 << (2 * neighborhood), dtype='int32')
    start = timer()
    h_cpu_simple = extract_1dlbp_gpu_debug(input, neighborhood, p, res)
def run_glm_ptr(nFolds,
                nAlphas,
                nLambdas,
                xtrain,
                ytrain,
                xtest,
                ytest,
                wtrain,
                write,
                display,
                nGPUs=1):
    """Runs ElasticNetH2O test"""
    use_gpu = nGPUs > 0

    if use_gpu == 1:

        from numba import cuda
        #nFolds, nAlphas, nLambdas = arg
        train_data_mat = cuda.to_device(xtrain)
        train_result_mat = cuda.to_device(ytrain)
        test_data_mat = cuda.to_device(xtest)
        test_result_mat = cuda.to_device(ytest)
        train_w_mat = cuda.to_device(wtrain)

        train_data_mat_ptr = train_data_mat.device_ctypes_pointer
        train_result_mat_ptr = train_result_mat.device_ctypes_pointer
        test_data_mat_ptr = test_data_mat.device_ctypes_pointer
        test_result_mat_ptr = test_result_mat.device_ctypes_pointer
        train_w = train_w_mat.device_ctypes_pointer

        print(train_data_mat_ptr)
        print(train_result_mat_ptr)
        print(test_data_mat_ptr)
        print(test_result_mat_ptr)

        import subprocess
        maxNGPUS = int(
            subprocess.check_output("nvidia-smi -L | wc -l", shell=True))
        print("Maximum Number of GPUS:", maxNGPUS)
        #nGPUs = maxNGPUS #choose all GPUs
        #nGPUs = 1

        n = train_data_mat.shape[1]
        mTrain = train_data_mat.shape[0]
        mValid = test_data_mat.shape[0]

    else:
        #nGPUs = 0
        n = xtrain.shape[1]
        mTrain = xtrain.shape[0]
        mValid = xtest.shape[0]

    print("No. of Features=%d mTrain=%d mValid=%d" % (n, mTrain, mValid))

    #Order of data
    fortran = 1
    print("fortran=%d" % (fortran))

    sourceDev = 0
    # should be passed from above if user set fit_intercept
    fit_intercept = True
    lambda_min_ratio = 1e-9
    store_full_path = 1
    double_precision = 0
    #variables
    if use_gpu == 1:
        from ctypes import c_void_p
        a, b = c_void_p(train_data_mat_ptr.value), c_void_p(
            train_result_mat_ptr.value)
        c, d = c_void_p(test_data_mat_ptr.value), c_void_p(
            test_result_mat_ptr.value)
        e = c_void_p(train_w.value)
        print(a, b, c, d, e)

    else:
        a, b = xtrain, ytrain
        c, d = xtest, ytest
        e = wtrain

    print("Setting up Solver")
    sys.stdout.flush()

    Solver = h2o4gpu.ElasticNetH2O
    enet = Solver(n_gpus=nGPUs,
                  order='c' if fortran else 'r',
                  fit_intercept=fit_intercept,
                  lambda_min_ratio=lambda_min_ratio,
                  n_lambdas=nLambdas,
                  n_folds=nFolds,
                  n_alphas=nAlphas,
                  verbose=5,
                  store_full_path=store_full_path)

    print("Solving")
    sys.stdout.flush()
    if use_gpu == 1:
        enet.fit_ptr(mTrain,
                     n,
                     mValid,
                     double_precision,
                     None,
                     a,
                     b,
                     c,
                     d,
                     e,
                     source_dev=sourceDev)
    else:
        enet.fit(a, b, c, d, e)


#t1 = time()
    print("Done Solving\n")
    sys.stdout.flush()

    error_train = printallerrors(display, enet, "Train", store_full_path)

    print('Predicting')
    sys.stdout.flush()
    if use_gpu == 1:
        pred_val = enet.predict_ptr(c, d)
    else:
        pred_val = enet.predict(c)
    print('Done Predicting')
    sys.stdout.flush()
    print('predicted values:\n', pred_val)

    error_test = printallerrors(display, enet, "Test", store_full_path)

    if write == 0:
        os.system('rm -f error.txt; '
                  'rm -f pred*.txt; '
                  'rm -f varimp.txt; '
                  'rm -f me*.txt; '
                  'rm -f stats.txt')
    from ..solvers.utils import finish
    finish(enet)

    return pred_val, error_train, error_test
Example #57
0
def similarity_gpu(
    emx,
    clusmethod,
    corrmethod,
    preout,
    postout,
    minexpr,
    maxexpr,
    minsamp,
    minclus,
    maxclus,
    criterion,
    mincorr,
    maxcorr,
    gsize,
    lsize,
    outfile):

    # allocate device buffers
    W = gsize
    N = emx.shape[1]
    N_pow2 = next_power_2(N)
    K = maxclus

    in_emx               = cuda.to_device(emx)
    in_index_cpu         = cuda.pinned_array((W, 2), dtype=np.int32)
    in_index_gpu         = cuda.device_array_like(in_index_cpu)
    work_x               = cuda.device_array((W, N_pow2), dtype=np.float32)
    work_y               = cuda.device_array((W, N_pow2), dtype=np.float32)
    work_gmm_data        = cuda.device_array((W, N, 2), dtype=np.float32)
    work_gmm_labels      = cuda.device_array((W, N), dtype=np.int8)
    work_gmm_pi          = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_mu          = cuda.device_array((W, K, 2), dtype=np.float32)
    work_gmm_sigma       = cuda.device_array((W, K, 2, 2), dtype=np.float32)
    work_gmm_sigmaInv    = cuda.device_array((W, K, 2, 2), dtype=np.float32)
    work_gmm_normalizer  = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_MP          = cuda.device_array((W, K, 2), dtype=np.float32)
    work_gmm_counts      = cuda.device_array((W, K), dtype=np.int32)
    work_gmm_logpi       = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_xm          = cuda.device_array((W, 2), dtype=np.float32)
    work_gmm_Sxm         = cuda.device_array((W, 2), dtype=np.float32)
    work_gmm_gamma       = cuda.device_array((W, N, K), dtype=np.float32)
    work_gmm_logL        = cuda.device_array((W, 1), dtype=np.float32)
    work_gmm_entropy     = cuda.device_array((W, 1), dtype=np.float32)
    out_K_cpu            = cuda.pinned_array((W,), dtype=np.int8)
    out_K_gpu            = cuda.device_array_like(out_K_cpu)
    out_labels_cpu       = cuda.pinned_array((W, N), dtype=np.int8)
    out_labels_gpu       = cuda.device_array_like(out_labels_cpu)
    out_correlations_cpu = cuda.pinned_array((W, K), dtype=np.float32)
    out_correlations_gpu = cuda.device_array_like(out_correlations_cpu)

    # iterate through global work blocks
    n_genes = emx.shape[0]
    n_total_pairs = n_genes * (n_genes - 1) // 2

    index_x = 1
    index_y = 0

    for i in range(0, n_total_pairs, gsize):
        # print("%8d %8d" % (i, n_total_pairs))

        # determine number of pairs
        n_pairs = min(gsize, n_total_pairs - i)

        # initialize index array
        index_x_ = index_x
        index_y_ = index_y

        for j in range(n_pairs):
            in_index_cpu[j] = index_x_, index_y_
            index_x_, index_y_ = pairwise_increment(index_x_, index_y_)

        # copy index array to device
        in_index_gpu.copy_to_device(in_index_cpu)

        # execute similarity kernel
        similarity_gpu_helper[gsize // lsize, lsize](
            n_pairs,
            in_emx,
            in_index_gpu,
            clusmethod,
            corrmethod,
            preout,
            postout,
            minexpr,
            maxexpr,
            minsamp,
            minclus,
            maxclus,
            criterion,
            work_x,
            work_y,
            work_gmm_data,
            work_gmm_labels,
            work_gmm_pi,
            work_gmm_mu,
            work_gmm_sigma,
            work_gmm_sigmaInv,
            work_gmm_normalizer,
            work_gmm_MP,
            work_gmm_counts,
            work_gmm_logpi,
            work_gmm_xm,
            work_gmm_Sxm,
            work_gmm_gamma,
            work_gmm_logL,
            work_gmm_entropy,
            out_K_gpu,
            out_labels_gpu,
            out_correlations_gpu
        )
        cuda.synchronize()

        # copy results from device
        out_K_gpu.copy_to_host(out_K_cpu)
        out_labels_gpu.copy_to_host(out_labels_cpu)
        out_correlations_gpu.copy_to_host(out_correlations_cpu)

        # save correlation matrix to output file
        index_x_ = index_x
        index_y_ = index_y

        for j in range(n_pairs):
            # extract pairwise results
            K = out_K_cpu[j]
            labels = out_labels_cpu[j]
            correlations = out_correlations_cpu[j, 0:K]

            # save pairwise results
            write_pair(
                index_x_, index_y_,
                K,
                labels,
                correlations,
                mincorr,
                maxcorr,
                outfile)

            # increment pairwise index
            index_x_, index_y_ = pairwise_increment(index_x_, index_y_)

        # update local pairwise index
        index_x = index_x_
        index_y = index_y_
Example #58
0
Test Cuda sinusoidal_decompression kernel
"""

t0 = time.time()
stream = cuda.stream()

h, w, _ = pano.shape
dim = np.array([w, h], dtype=np.uint32)
img_r = np.zeros_like(pano)
tmp = np.zeros_like(pano)
out_v = np.zeros_like(pano)
dec = np.zeros_like(pano)

out = np.zeros((h, w, 4), dtype=np.int32)

d_pano = cuda.to_device(np.ascontiguousarray(pano), stream=stream)
d_img_r = cuda.to_device(np.ascontiguousarray(img_r), stream=stream)
d_tmp = cuda.to_device(np.ascontiguousarray(tmp), stream=stream)
d_dim = cuda.to_device(np.ascontiguousarray(dim), stream=stream)

d_out_tmp = cuda.to_device(np.ascontiguousarray(out), stream=stream)

sin_gpu.sinusoidal_compression_0[512, 512](d_pano, d_tmp, d_dim)
sin_gpu.sinusoidal_compression_1[512, 512](d_tmp, d_img_r, d_dim)

img_r = d_img_r.copy_to_host()
pix_count = int(h * math.acos(0.5) / math.pi)
img_r = img_r[:2 * pix_count + 2, :, :]
d_img_r = cuda.to_device(np.ascontiguousarray(img_r), stream=stream)
d_out_v = cuda.to_device(np.ascontiguousarray(out_v), stream=stream)
d_dec = cuda.to_device(np.ascontiguousarray(dec), stream=stream)
 def test_negative_slicing_1d(self):
     arr = np.arange(10)
     darr = cuda.to_device(arr)
     for i, j in product(range(-10, 10), repeat=2):
         np.testing.assert_array_equal(arr[i:j], darr[i:j].copy_to_host())
Example #60
0
def main():
    cu_discriminant = vectorize(['f4(f4, f4, f4)', 'f8(f8, f8, f8)'],
                                target='cuda')(poly.discriminant)

    N = 1e+8 // 2

    print('Data size', N)

    A, B, C = poly.generate_input(N, dtype=np.float32)
    D = np.empty(A.shape, dtype=A.dtype)

    stream = cuda.stream()

    print('== One')

    ts = time()

    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        dD = cuda.to_device(D, stream, copy=False)
        cu_discriminant(dA, dB, dC, out=dD, stream=stream)
        dD.to_host(stream)

    te = time()


    total_time = (te - ts)

    print('Execution time %.4f' % total_time)
    print('Throughput %.2f' % (N / total_time))

    print('== Chunked')

    chunksize = 1e+7
    chunkcount = N // chunksize

    print('Chunk size', chunksize)

    sA = np.split(A, chunkcount)
    sB = np.split(B, chunkcount)
    sC = np.split(C, chunkcount)
    sD = np.split(D, chunkcount)

    device_ptrs = []

    ts = time()

    with stream.auto_synchronize():
        for a, b, c, d in zip(sA, sB, sC, sD):
            dA = cuda.to_device(a, stream)
            dB = cuda.to_device(b, stream)
            dC = cuda.to_device(c, stream)
            dD = cuda.to_device(d, stream, copy=False)
            cu_discriminant(dA, dB, dC, out=dD, stream=stream)
            dD.to_host(stream)
            device_ptrs.extend([dA, dB, dC, dD])

    te = time()

    total_time = (te - ts)

    print('Execution time %.4f' % total_time)
    print('Throughput %.2f' % (N / total_time))


    if '-verify' in sys.argv[1:]:
        poly.check_answer(D, A, B, C)