Example 1
def blocked_fw(A, n):

    # `stages` is the number of three-phase iterations the algorithm performs;
    # the adjacency matrix is broken up into square tiles of side TILE_WIDTH,
    # and `block_size` below gives the shape of each thread block.

    t0 = time.time()
    stages = int(n / TILE_WIDTH)

    print('Copying memory to device...\n')

    # We first allocate the memory on device, then copy from host to
    # device row by row

    A_global_mem = cuda.device_array((n, n), dtype=np.uint8)

    for i in range(n):
        A_global_mem[i] = A[i]
        if (i + 1) % 1000 == 0:
            print(f'Copied {i+1}/{n} rows')

    t1 = time.time()

    print(f'\nCopied memory to device. Took {t1 - t0}s')

    block_size = (TILE_WIDTH, TILE_WIDTH)

    print('Starting GPU blocked Floyd-Warshall')
    print(f'Number of stages = {stages}')
    print(f'Tile width = {TILE_WIDTH}')
    print(f'Data type of A elements is {type(A[0][0])} \n')

    # Grid size is the number of thread blocks which are used for each stage

    phase_1_grid = (1, 1)
    phase_2_grid = (stages, 2)
    phase_3_grid = (stages, stages)

    for k in range(stages):
        base = TILE_WIDTH * k
        phase_1_kernel[phase_1_grid, block_size](A_global_mem, n, base)
        phase_2_kernel[phase_2_grid, block_size](A_global_mem, n, k, base)
        phase_3_kernel[phase_3_grid, block_size](A_global_mem, n, k, base)

        cuda.synchronize()

        if (k + 1) % UPDATE_INTERVAL == 0:
            print(f'Completed stage {k+1}/{stages}')

    t2 = time.time()

    print(f'\nFinished GPU blocked Floyd-Warshall. Took {t2 - t1}s\n')
    print('Copying memory to host...')

    A = A_global_mem.copy_to_host()
    t3 = time.time()

    print(f'Copied memory to host. Took {t3 - t2}s\n')

    return A
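The three phase kernels launched above are not included in this snippet. Purely as an illustration, a phase-1 kernel of blocked Floyd-Warshall (relaxing paths entirely inside the k-th diagonal tile) might look roughly like the sketch below; the uint8 distance type and the assumption that n is a multiple of TILE_WIDTH are taken from the host code, everything else (including the tile width value) is hypothetical.

from numba import cuda, int32, uint8

TILE_WIDTH = 32  # assumed value; must match the host code's TILE_WIDTH

@cuda.jit
def phase_1_kernel(A, n, base):
    # Hypothetical sketch: load the k-th diagonal tile into shared memory and
    # run Floyd-Warshall relaxations entirely within it. `n` is unused here
    # but kept to match the host call signature.
    tile = cuda.shared.array((TILE_WIDTH, TILE_WIDTH), uint8)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    i = base + ty
    j = base + tx
    tile[ty, tx] = A[i, j]
    cuda.syncthreads()
    for t in range(TILE_WIDTH):
        # Accumulate in 32 bits so the uint8 addition cannot wrap around.
        d = int32(tile[ty, t]) + int32(tile[t, tx])
        if d < tile[ty, tx]:
            tile[ty, tx] = d
        cuda.syncthreads()
    A[i, j] = tile[ty, tx]

Phases 2 and 3 would follow the same pattern over the row/column tiles and the remaining tiles, respectively.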
Example 2
def main():
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    # copy data to gpu
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)

    # Initialize an empty array to store the result
    gpu_result = cuda.device_array(n)
    cpu_result = np.empty(n)

    threads_per_block = 1024
#    blocks_per_grid = math.ceil(n / threads_per_block)
    blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result, n)
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))
    start = time()
    cpu_result = np.add(x, y)
    print("cpu vector add time " + str(time() - start))

    if (np.array_equal(cpu_result, gpu_result.copy_to_host())):
        print("result correct!")
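The `gpu_add` kernel itself is not part of this snippet. A minimal version consistent with the launch configuration above (one thread per element, guarded against the final partial block) could be:

from numba import cuda

@cuda.jit
def gpu_add(a, b, result, n):
    # Global thread index; threads past n do nothing.
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    if idx < n:
        result[idx] = a[idx] + b[idx]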
Example 3
    def test_issue_2393(self):
        """
        Test issue of warp misalign address due to nvvm not knowing the
        alignment(? but it should have taken the natural alignment of the type)
        """
        num_weights = 2
        num_blocks = 48
        examples_per_block = 4
        threads_per_block = 1

        @cuda.jit
        def costs_func(d_block_costs):
            s_features = cuda.shared.array((examples_per_block, num_weights),
                                           float64)
            s_initialcost = cuda.shared.array(7, float64)  # Bug

            threadIdx = cuda.threadIdx.x

            prediction = 0
            for j in range(num_weights):
                prediction += s_features[threadIdx, j]

            d_block_costs[0] = s_initialcost[0] + prediction

        block_costs = np.zeros(num_blocks, dtype=np.float64)
        d_block_costs = cuda.to_device(block_costs)

        costs_func[num_blocks, threads_per_block](d_block_costs)

        cuda.synchronize()
Example 4
def compute_inv_mass(objects, mask_events, mask_objects, use_cuda):
    this_worker = get_worker_wrapper()
    NUMPY_LIB, backend = this_worker.NUMPY_LIB, this_worker.backend

    inv_mass = NUMPY_LIB.zeros(len(mask_events), dtype=np.float32)
    pt_total = NUMPY_LIB.zeros(len(mask_events), dtype=np.float32)
    if use_cuda:
        this_worker.kernels.compute_inv_mass_cudakernel[32, 1024](
            objects.offsets,
            objects.pt,
            objects.eta,
            objects.phi,
            objects.mass,
            mask_events,
            mask_objects,
            inv_mass,
            pt_total,
        )
        cuda.synchronize()
    else:
        this_worker.kernels.compute_inv_mass_kernel(
            objects.offsets,
            objects.pt,
            objects.eta,
            objects.phi,
            objects.mass,
            mask_events,
            mask_objects,
            inv_mass,
            pt_total,
        )
    return inv_mass, pt_total
Example 5
def peaks(image, return_numpy=True):
    """
    Detect all peaks from a full SLI measurement. Peaks will not be filtered
    in any way. To detect only significant peaks, filter the peaks by using
    the prominence as a threshold.

    Args:

        image: Complete SLI measurement image stack as a 2D/3D Numpy array

        return_numpy: Necessary if using `use_gpu`. Specifies if a CuPy or
                      Numpy array will be returned.

    Returns:

    2D/3D boolean image masking the peaks with `True`
    """
    gpu_image = cupy.array(image, dtype='float32')
    resulting_peaks = cupy.zeros(gpu_image.shape, dtype='int8')

    blocks_per_grid, threads_per_block = prepare_kernel_execution(gpu_image)
    _peaks[blocks_per_grid, threads_per_block](gpu_image, resulting_peaks)
    cuda.synchronize()

    if return_numpy:
        peaks_cpu = cupy.asnumpy(resulting_peaks)
        del resulting_peaks

        return peaks_cpu.astype('bool')
    else:
        return resulting_peaks.astype('bool')
Example 6
def _lombscargle(x, y, freqs, pgram, y_dot):

    if pgram.dtype == "float32":
        numba_type = float32
    elif pgram.dtype == "float64":
        numba_type = float64

    if (str(numba_type)) in _kernel_cache:
        kernel = _kernel_cache[(str(numba_type))]
    else:
        sig = _numba_lombscargle_signature(numba_type)
        if pgram.dtype == "float32":
            kernel = _kernel_cache[(str(numba_type))] = cuda.jit(
                sig, fastmath=True, max_registers=32
            )(_numba_lombscargle_32)
            print("Registers", kernel._func.get().attrs.regs)
        elif pgram.dtype == "float64":
            kernel = _kernel_cache[(str(numba_type))] = cuda.jit(
                sig, fastmath=True, max_registers=64
            )(_numba_lombscargle_64)
            print("Registers", kernel._func.get().attrs.regs)

    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    threadsperblock = (128,)
    blockspergrid = (numSM * 20,)

    kernel[blockspergrid, threadsperblock](x, y, freqs, pgram, y_dot)

    cuda.synchronize()
Example 7
 def neighbour_list(self):
     with cuda.gpus[self.gpu]:
         if self.n != self.frame.n:
             self.bpg = int(self.frame.n // self.tpb + 1)
             self.d_nl = cuda.device_array((self.frame.n, self.n_guess),
                                           dtype=np.int32)
             self.d_nc = cuda.device_array((self.frame.n, ), dtype=np.int32)
         while True:
             cu_set_to_int[self.bpg, self.tpb](self.d_nc, 0)
             # reset neighbour counts before rebuilding the neighbour list
             self.cu_nlist[self.bpg, self.tpb](
                 self.frame.d_x, self.frame.d_box, self.frame.r_cut2,
                 self.clist.d_cell_map, self.clist.d_cell_list,
                 self.clist.d_cell_counts, self.clist.d_cells, self.d_nl,
                 self.d_nc, self.d_n_max, self.contain_self)
             p_n_max = self.d_n_max.copy_to_host()
             cuda.synchronize()
             # n_max = np.array([120])
             if p_n_max[0] > self.n_guess:
                 self.n_guess = p_n_max[0]
                 self.n_guess = self.n_guess + 8 - (self.n_guess & 7)
                 self.d_nl = cuda.device_array((self.frame.n, self.n_guess),
                                               dtype=np.int32)
             else:
                 break
     self.n = self.frame.n
Example 8
def select_muons_opposite_sign(muons, in_mask):
    out_mask = cupy.invert(muons.make_mask())
    select_opposite_sign_muons_cudakernel[32,
                                          1024](muons.charge, muons.offsets,
                                                in_mask, out_mask)
    cuda.synchronize()
    return out_mask
Example 9
def get_in_offsets(content, offsets, indices, mask_rows, mask_content):
    out = cupy.zeros(len(offsets) - 1, dtype=content.dtype)
    #out = -999.*cupy.ones(len(offsets) - 1, dtype=content.dtype) #to avoid histos being filled with 0 for non-existing objects, i.e. in events with no fat jets
    get_in_offsets_cudakernel[32, 1024](content, offsets, indices, mask_rows,
                                        mask_content, out)
    cuda.synchronize()
    return out
Example 10
def benchmark_argmin():
    SIZE_LG = 1000000
    SIZE_SM = 100
    DIM = 100

    a = np.random.rand(SIZE_LG, DIM).astype(np.float32)
    b = np.random.rand(SIZE_SM, DIM).astype(np.float32)
    c = np.zeros(SIZE_LG, dtype=np.int64)



    a_gpu = cuda.to_device(a)
    b_gpu = cuda.to_device(b)
    c_gpu = cuda.to_device(c)

    thread_in_batch, batches = 128, 64
    #thread_in_batch, batches = 32, 64

    cpu_argmin(a, b)

    gpu_argmin[batches, thread_in_batch](a_gpu, b_gpu, c_gpu)
    cuda.synchronize()

    #time.sleep(4)


    timestamp = time.time()
    cpu_argmin(a, b)
    print("CPU time",  time.time() - timestamp)

    timestamp = time.time()
    gpu_argmin[batches, thread_in_batch](a_gpu, b_gpu, c_gpu)
    cuda.synchronize()
    print("GPU time", time.time() - timestamp)
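`gpu_argmin` is not shown here. Since the launch uses only 64 x 128 threads for one million rows, a plausible (hypothetical) implementation would grid-stride over the rows of `a` and, for each, record the index of the nearest row of `b` by squared Euclidean distance:

from numba import cuda

@cuda.jit
def gpu_argmin(a, b, c):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, a.shape[0], stride):
        # Distance to the first candidate row initialises the running minimum.
        best_j = 0
        best_d = 0.0
        for k in range(a.shape[1]):
            diff = a[i, k] - b[0, k]
            best_d += diff * diff
        for j in range(1, b.shape[0]):
            d = 0.0
            for k in range(a.shape[1]):
                diff = a[i, k] - b[j, k]
                d += diff * diff
            if d < best_d:
                best_d = d
                best_j = j
        c[i] = best_j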
Example 11
 def __setitem__(self, key, val):
     if key in self._vars:
         # get data
         data = self.__getitem__('_data')
         _var2idx = self.__getitem__('_var2idx')
         idx = _var2idx[key]
         # gpu setattr
         if profile.run_on_gpu():
             gpu_data = self.get_cuda_data()
             if data.shape[1] <= profile._num_thread_gpu:
                 num_thread = data.shape[1]
                 num_block = 1
             else:
                 num_thread = profile._num_thread_gpu
                 num_block = math.ceil(data.shape[1] /
                                       profile._num_thread_gpu)
             if np.isscalar(val):
                 gpu_set_scalar_val[num_block, num_thread](gpu_data, val,
                                                           idx)
             else:
                 if val.shape[0] != data.shape[1]:
                     raise ValueError(
                         f'Wrong value dimension {val.shape[0]} != {data.shape[1]}'
                     )
                 gpu_set_vector_val[num_block, num_thread](gpu_data, val,
                                                           idx)
             cuda.synchronize()
         # cpu setattr
         else:
             data[idx] = val
     elif key in ['_data', '_var2idx', '_idx2var']:
         raise KeyError(f'"{key}" cannot be modified.')
     else:
         raise KeyError(f'"{key}" is not defined in {type(self).__name__}, '
                        f'only finds "{str(self._keys)}".')
Example 12
def main():
    # Initialize the matrices
    M = 6000
    N = 4800
    P = 4000
    A = np.random.random((M, N))  # randomly generated [M x N] matrix
    B = np.random.random((N, P))  # randomly generated [N x P] matrix

    A_device = cuda.to_device(A)
    B_device = cuda.to_device(B)
    C_device = cuda.device_array((M, P))  # [M x P] result matrix

    # Kernel launch configuration
    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    blocks_per_grid_x = int(math.ceil(A.shape[0] / BLOCK_SIZE))
    blocks_per_grid_y = int(math.ceil(B.shape[1] / BLOCK_SIZE))
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

    start = time()
    matmul[blocks_per_grid, threads_per_block](A_device, B_device, C_device)
    cuda.synchronize()
    print("matmul time :" + str(time() - start))

    start = time()
    matmul_shared_memory[blocks_per_grid,
                         threads_per_block](A_device, B_device, C_device)
    cuda.synchronize()
    print("matmul with shared memory time :" + str(time() - start))
    C = C_device.copy_to_host()
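Neither `matmul` nor `matmul_shared_memory` is defined in this snippet. A straightforward `matmul` kernel matching the 2D launch above (one thread per output element) might be sketched as follows; the shared-memory variant would additionally stage BLOCK_SIZE x BLOCK_SIZE tiles of A and B in `cuda.shared.array` buffers between `cuda.syncthreads()` barriers.

from numba import cuda

@cuda.jit
def matmul(A, B, C):
    # cuda.grid(2) maps the x dimension to rows and y to columns, matching the
    # grid computed from A.shape[0] and B.shape[1] in the host code.
    row, col = cuda.grid(2)
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp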
Example 13
    def process(self, neutrons):
        """
        Propagate a buffer of particles through this guide.
        Adjusts the buffer to include only the particles that exit,
        at the moment of exit.

        Parameters:
        neutrons: a buffer containing the particles
        """
        (entrance_width, entrance_height, R0, Qc, alpha, m, W) = self.nature
        neutron_array = neutrons_as_npyarr(neutrons)
        neutron_array.shape = -1, ndblsperneutron

        threads_per_block = 256
        number_of_blocks = ceil(len(neutrons) / threads_per_block)
        guide_process[number_of_blocks,
                      threads_per_block](entrance_width, entrance_height, R0,
                                         Qc, alpha, m, W, self.sides,
                                         neutron_array)
        cuda.synchronize()

        mask = array(list(map(lambda weight: weight > 0, neutron_array.T[9])),
                     dtype=bool)
        neutrons.resize(count_nonzero(mask), neutrons[0])
        neutrons.from_npyarr(neutron_array[mask])
Example 14
 def neighbour_list(self):
     with cuda.gpus[self.gpu]:
         while True:
             cu_set_to_int[self.bpg, self.tpb](self.d_nc, 0)
             # reset neighbour counts before rebuilding the neighbour list
             self.cu_nlist[self.bpg, self.tpb](self.system.d_x,
                                               self.d_last_x,
                                               self.system.d_box,
                                               self.r_cut2,
                                               self.clist.d_cell_map,
                                               self.clist.d_cell_list,
                                               self.clist.d_cell_counts,
                                               self.clist.d_cells,
                                               self.d_nl,
                                               self.d_nc,
                                               self.d_n_max,
                                               self.d_situation)
             self.d_n_max.copy_to_host(self.p_n_max)
             cuda.synchronize()
             # n_max = np.array([120])
             if self.p_n_max[0] > self.n_guess:
                 self.n_guess = self.p_n_max[0]
                 self.n_guess = self.n_guess + 8 - (self.n_guess & 7)
                 self.d_nl = cuda.device_array((self.system.N, self.n_guess), dtype=np.int32)
             else:
                 break
Example 15
def float_2_rgb(data, cmap, limits):
    """
    data is a single channel image as a NumPy array.
    """

    assert (data.ndim == 2)

    # Dimensions.
    H, W = data.shape

    # The output image.
    img = np.zeros((H, W, 3), dtype=np.uint8)

    dData = cuda.to_device(data)
    dCMap = cuda.to_device(cmap)
    dImg = cuda.to_device(img)

    # CUDA threads dimensions.
    CUDA_THREADS = 16
    gridX = int(np.floor(W / CUDA_THREADS))
    gridY = int(np.floor(H / CUDA_THREADS))

    # CUDA execution.
    cuda.synchronize()
    k_convert_float_2_rgb[[gridX, gridY, 1],
                          [CUDA_THREADS, CUDA_THREADS, 1]](dData, dCMap,
                                                           limits[0],
                                                           limits[1], dImg)
    cuda.synchronize()

    img = dImg.copy_to_host()

    return img
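The `k_convert_float_2_rgb` kernel is not shown. Assuming `cmap` is a (K, 3) colour lookup table, a hypothetical version would normalise each pixel with the given limits and index into the table:

from numba import cuda

@cuda.jit
def k_convert_float_2_rgb(data, cmap, vmin, vmax, img):
    x, y = cuda.grid(2)  # x covers the width, y the height, as launched above
    if x < data.shape[1] and y < data.shape[0]:
        # Clamp the normalised value to [0, 1] before the table lookup.
        t = (data[y, x] - vmin) / (vmax - vmin)
        t = min(max(t, 0.0), 1.0)
        idx = int(t * (cmap.shape[0] - 1))
        img[y, x, 0] = cmap[idx, 0]
        img[y, x, 1] = cmap[idx, 1]
        img[y, x, 2] = cmap[idx, 2]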
Example 16
    def calc_quality_metrics(self, archive_d_qual, observed_mean_arr_qual,
                             rows_, rows_curr_gpu):
        blocks_per_grid_quality = min([rows_, 1000])
        threads_per_block_quality = int(rows_ / blocks_per_grid_quality) + 1
        quality_sum = cuda.to_device(np.zeros(shape=rows_, dtype=np.int32))

        @cuda.jit
        def quality_sum_kernal(archive_d, qual_sum, rows_this_curr):
            x = cuda.grid(1)
            # execute for each row
            if x >= rows_this_curr[0]:
                return
            sum_ = 0
            for i in range(int(NEIGHBOURS_ARCHIVE_SIZE / 2)):
                sum_ += archive_d[x, i]
            qual_sum[x] = sum_

        quality_sum_kernal[blocks_per_grid_quality,
                           threads_per_block_quality](archive_d_qual,
                                                      quality_sum,
                                                      rows_curr_gpu)
        cuda.synchronize()
        quality_sum = quality_sum.copy_to_host()

        observed_means = observed_mean_arr_qual.copy_to_host()[0:rows_]

        return np.mean(quality_sum), np.std(quality_sum), np.mean(
            observed_means), np.std(observed_means)
Example 17
def main():
    arr = np.arange(0, 10, 1)
    threadsperblock = 32
    blockspergrid = (arr.size + (threadsperblock - 1)) // threadsperblock
    increment_by_one[blockspergrid, threadsperblock](arr)
    cuda.synchronize()
    print(arr)
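`increment_by_one` is not shown; a kernel matching this launch would simply be:

from numba import cuda

@cuda.jit
def increment_by_one(an_array):
    pos = cuda.grid(1)
    if pos < an_array.size:  # guard the final partial block
        an_array[pos] += 1

Because `arr` is a host NumPy array, Numba transfers it to the device for the launch and copies it back afterwards, which is why `print(arr)` shows the incremented values.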
Example 18
def main():
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    # Copy data to the device
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    # Allocate space on the device to hold the GPU computation result
    gpu_result = cuda.device_array(n)
    cpu_result = np.empty(n)

    threads_per_block = 1024
    blocks_per_grid = math.ceil(n / threads_per_block)
    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result,
                                                n)
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))
    start = time()
    cpu_result = np.add(x, y)
    print("cpu vector add time " + str(time() - start))

    if (np.array_equal(cpu_result, gpu_result.copy_to_host())):
        print("result correct!")
Example 19
def cdist_dtw_cuda(x1, x2= None, global_constraint=0, sakoe_chiba_radius=None, itakura_max_slope=None):
    x1 = to_time_series_dataset(x1)
    if x2 is not None:
        x2 = to_time_series_dataset(x2)
        mask = compute_mask(x1[0], x2[0], global_constraint=global_constraint, sakoe_chiba_radius=sakoe_chiba_radius,
                            itakura_max_slope=itakura_max_slope)
        matrix = numpy.zeros((len(x1), len(x2)), dtype=numpy.float64)
        x2 = cuda.to_device(x2)
    else:
        mask = compute_mask(x1[0], x1[0], global_constraint=global_constraint, sakoe_chiba_radius=sakoe_chiba_radius, itakura_max_slope=itakura_max_slope)
        matrix = numpy.zeros((len(x1), len(x1)), dtype=numpy.float64)

    x1 = cuda.to_device(x1)
    matrix = cuda.to_device(matrix)
    mask = cuda.to_device(mask)
    # A small trick to make SH a global constant; otherwise it does not
    # work inside the cuda.jit kernel's cuda.local.array() call.
    def func_sh():
        global SH
        SH = int(x1.shape[1] + 1)
        return SH
    func_sh()
    threadsperblock = (16, 16)
    blockspergrid_x = math.ceil(matrix.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(matrix.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    # increment_a_2D_array[blockspergrid, threadsperblock](an_array)
    cdist_dtw[blockspergrid, threadsperblock](x1, matrix, mask); cuda.synchronize()
    matrix_res = numpy.asarray(matrix)
    matrix = matrix_res + matrix_res.T
    return matrix
Example 20
 def update(self):
     with cuda.gpus[self.gpu]:
         while True:
             self.p_out_of_box[0] = -1
             cu_set_to_int[self.bpg_cell, self.tpb](self.d_cell_counts, 0)
             self.cu_cell_list[self.bpg,
                               self.tpb](self.system.d_x, self.system.d_box,
                                         self.d_ibox, self.d_cell_list,
                                         self.d_cell_counts, self.d_cells,
                                         self.d_cell_max, self.d_out_of_box)
             self.d_cell_max.copy_to_host(self.p_cell_max)
             self.d_out_of_box.copy_to_host(self.p_out_of_box)
             cuda.synchronize()
             if self.p_out_of_box[0] != -1:
                 err_coor = ''.join(
                     ["{0: .4f} ".format(_) for _ in self.p_out_of_box[1:]])
                 raise ValueError("Error, particle %d %s is out of box!" %
                                  (int(self.p_out_of_box[0]), err_coor))
             if self.p_cell_max[0] > self.cell_guess:
                 self.cell_guess = self.p_cell_max[0]
                 self.cell_guess = self.cell_guess + 8 - (self.cell_guess
                                                          & 7)
                 self.d_cell_list = cuda.device_array(
                     (self.n_cell, self.cell_guess), dtype=np.int32)
             else:
                 break
Example 21
    def test_issue_2393(self):
        """
        Test issue of warp misalign address due to nvvm not knowing the
        alignment(? but it should have taken the natural alignment of the type)
        """
        num_weights = 2
        num_blocks = 48
        examples_per_block = 4
        threads_per_block = 1

        @cuda.jit
        def costs_func(d_block_costs):
            s_features = cuda.shared.array((examples_per_block, num_weights),
                                           float64)
            s_initialcost = cuda.shared.array(7, float64)  # Bug

            threadIdx = cuda.threadIdx.x

            prediction = 0
            for j in range(num_weights):
                prediction += s_features[threadIdx, j]

            d_block_costs[0] = s_initialcost[0] + prediction

        block_costs = np.zeros(num_blocks, dtype=np.float64)
        d_block_costs = cuda.to_device(block_costs)

        costs_func[num_blocks, threads_per_block](d_block_costs)

        cuda.synchronize()
Example 22
def main():
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    # Copy data to the device
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)

    # Allocate space on the device to hold the GPU computation result.
    # This illustrates nicely why feature-map size matters when estimating GPU memory usage!
    gpu_result = cuda.device_array(n)
    cpu_result = np.empty(n)
    gpu_result_cpu = np.empty(n)

    threads_per_block = 1024
    blocks_per_grid = math.ceil(n / threads_per_block)

    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x, y, gpu_result_cpu, n)
    cuda.synchronize()
    print("gpu (non-optimized) vector add time " + str(time() - start))

    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result,
                                                n)  # the GPU needs warm-up time; repeated runs get faster
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))

    start = time()
    cpu_result = np.add(x, y)  # the CPU needs no warm-up time
    print("cpu vector add time " + str(time() - start))

    if (np.array_equal(cpu_result, gpu_result.copy_to_host())):
        print("result correct!")
Example 23
def histogram_from_vector(data, weights, bins, mask=None):
    assert len(data) == len(weights)

    allowed_dtypes = [cupy.float32, cupy.int32, cupy.int8]
    assert data.dtype in allowed_dtypes
    assert weights.dtype in allowed_dtypes
    assert bins.dtype in allowed_dtypes

    # Allocate output arrays
    nblocks = 64
    nthreads = 256
    out_w = cupy.zeros((nblocks, len(bins) - 1), dtype=cupy.float32)
    out_w2 = cupy.zeros((nblocks, len(bins) - 1), dtype=cupy.float32)

    # Fill output
    if len(data) > 0:
        if mask is None:
            fill_histogram[nblocks, nthreads](data, weights, bins, out_w,
                                              out_w2)
        else:
            assert len(data) == len(mask)
            fill_histogram_masked[nblocks, nthreads](data, weights, bins, mask,
                                                     out_w, out_w2)

    cuda.synchronize()

    out_w = out_w.sum(axis=0)
    out_w2 = out_w2.sum(axis=0)

    return cupy.asnumpy(out_w), cupy.asnumpy(out_w2), cupy.asnumpy(bins)
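The fill kernels are not included here. One hypothetical way to implement `fill_histogram`, consistent with the `(nblocks, len(bins) - 1)` output shape, is for every block to accumulate into its own sub-histogram row with atomics, which the host then sums along axis 0:

from numba import cuda

@cuda.jit
def fill_histogram(data, weights, bins, out_w, out_w2):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    nbins = bins.shape[0] - 1
    for i in range(start, data.shape[0], stride):
        x = data[i]
        for b in range(nbins):
            if x >= bins[b] and x < bins[b + 1]:
                # Each block owns one row of the per-block sub-histograms.
                cuda.atomic.add(out_w, (cuda.blockIdx.x, b), weights[i])
                cuda.atomic.add(out_w2, (cuda.blockIdx.x, b), weights[i] * weights[i])
                break

The masked variant would additionally skip entries whose `mask[i]` is false.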
Example 24
def guided_filter_3(mask, img):
    """
    mask: A 1-channel mask image, numpy array. dtype == np.uint8. Two-value mask.
    img: A 1-channel image, numpy array.
    """

    # Check the dimensions.
    assert (mask.shape[0] == img.shape[0])
    assert (mask.shape[1] == img.shape[1])

    # Get the dimensions
    height = mask.shape[0]
    width = mask.shape[1]

    # Transfer memory to the CUDA device.
    dMask = cuda.to_device(mask)
    dImg = cuda.to_device(img)

    # Filter.
    cuda.synchronize()
    k_filter_horizontal[[1, int(height / 16) + 1, 1], [1, 16, 1]](dMask, dImg)
    k_filter_vertical[[int(width / 16) + 1, 1, 1], [16, 1, 1]](dMask, dImg)
    cuda.synchronize()

    # Transfer memory to the host.
    mask = dMask.copy_to_host()
    img = dImg.copy_to_host()

    return mask, img
Example 25
def main():
    # https://habr.com/post/317328/

    from numba import cuda
    import numpy as np
    # import matplotlib.pyplot as plt
    from time import time


    n = 512
    blockdim = 16, 16
    griddim = int(n / blockdim[0]), int(n / blockdim[1])

    L = 1.
    h = L / n
    dt = 0.1 * h ** 2
    nstp = 5000

    @cuda.jit("void(float64[:], float64[:])")
    def nextstp_gpu(u0, u):
        i, j = cuda.grid(2)

        u00 = u0[i + n * j]
        if i > 0:
            uim1 = u0[i - 1 + n * j]
        else:
            uim1 = 0.
        if i < n - 1:
            uip1 = u0[i + 1 + n * j]
        else:
            uip1 = 0.
        if j > 0:
            ujm1 = u0[i + n * (j - 1)]
        else:
            ujm1 = 0.
        if j < n - 1:
            ujp1 = u0[i + n * (j + 1)]
        else:
            ujp1 = 1.

        d2x = (uim1 - 2. * u00 + uip1)
        d2y = (ujm1 - 2. * u00 + ujp1)
        u[i + n * j] = u00 + (dt / h / h) * (d2x + d2y)

    u0 = np.full(n * n, 0., dtype=np.float64)
    u = np.full(n * n, 0., dtype=np.float64)

    st = time()

    d_u0 = cuda.to_device(u0)
    d_u = cuda.to_device(u)
    for i in range(nstp // 2):
        nextstp_gpu[griddim, blockdim](d_u0, d_u)
        nextstp_gpu[griddim, blockdim](d_u, d_u0)

    cuda.synchronize()
    u0 = d_u0.copy_to_host()
    print('time on GPU = ', time() - st)
Example 26
 def show(self):
     cell_list = self.clist.d_cell_list.copy_to_host()
     cell_map = self.clist.d_cell_map.copy_to_host()
     cell_counts = self.clist.d_cell_counts.copy_to_host()
     nl = self.d_nl.copy_to_host()
     nc = self.d_nc.copy_to_host()
     cuda.synchronize()
     return cell_list, cell_counts, cell_map, nl, nc
Example 27
 def test_print_array(self):
     """
     Eyeballing required
     """
     jcuprintary = cuda.jit('void(float32[:])')(cuprintary)
     A = numpy.arange(10, dtype=numpy.float32)
     jcuprintary[2, 5](A)
     cuda.synchronize()
Example 28
def sum_in_offsets(struct, content, mask_rows, mask_content, dtype=None):
    if not dtype:
        dtype = content.dtype
    sum_offsets = cupy.zeros(len(struct.offsets) - 1, dtype=dtype)
    sum_in_offsets_cudakernel[32, 1024](content, struct.offsets, mask_rows,
                                        mask_content, sum_offsets)
    cuda.synchronize()
    return sum_offsets
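`sum_in_offsets_cudakernel` is not reproduced in this snippet. A hypothetical kernel with the same argument order would grid-stride over rows (events) and sum the masked content in each `[offsets[row], offsets[row+1])` range:

from numba import cuda

@cuda.jit
def sum_in_offsets_cudakernel(content, offsets, mask_rows, mask_content, out):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for row in range(start, offsets.shape[0] - 1, stride):
        if mask_rows[row]:
            acc = 0.0  # accumulate in double precision (a simplification)
            for idx in range(offsets[row], offsets[row + 1]):
                if mask_content[idx]:
                    acc += content[idx]
            out[row] = acc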
Example 29
 def test_print_array(self):
     """
     Eyeballing required
     """
     jcuprintary = cuda.jit('void(float32[:])')(cuprintary)
     A = np.arange(10, dtype=np.float32)
     jcuprintary[2, 5](A)
     cuda.synchronize()
Example 30
def searchsorted(arr, vals, side="right"):
    ret = cupy.zeros_like(vals, dtype=cupy.int32)
    if side == "right":
        searchsorted_kernel_right[32, 1024](vals, arr, ret)
    elif side == "left":
        searchsorted_kernel_left[32, 1024](vals, ret, arr)
    cuda.synchronize()
    return ret
Example 31
def min_in_offsets(offsets, content, mask_rows=None, mask_content=None):
    max_offsets = cupy.zeros(len(offsets) - 1, dtype=content.dtype)
    mask_rows, mask_content = make_masks(offsets, content, mask_rows,
                                         mask_content)
    min_in_offsets_cudakernel[32, 1024](offsets, content, mask_rows,
                                        mask_content, max_offsets)
    cuda.synchronize()
    return max_offsets
Example 32
def testin():
    N = 2000
    M = 2000
    h = np.asarray(np.float32(2) + np.random.random((N, M)), dtype=np.float32)
    n = np.asarray(np.random.random((N, M)), dtype=np.float32)
    u = np.asarray(np.random.random((N + 1, M)), dtype=np.float32)
    v = np.asarray(np.random.random((N, M + 1)), dtype=np.float32)
    f = np.asarray(np.random.random((N, M)), dtype=np.float32)
    dx = np.float32(0.1)
    dy = np.float32(0.2)
    #p.g = np.float32(1.0)
    nu = np.float32(1.0)

    out_u = np.asarray(np.random.random((M, N + 1)), dtype=np.float32)

    threadsperblock = (16, 32)  # (16,16)
    blockspergrid_x = (u.shape[0] + threadsperblock[0]) // threadsperblock[0]
    blockspergrid_y = (u.shape[1] + threadsperblock[1]) // threadsperblock[1]
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    print("here we go", u.shape)
    print("blocks per grid", blockspergrid)
    print("threads per block", threadsperblock)
    try:
        for cu_u_driver in (cu_u_driver_global, ):
            print(cu_u_driver)
            #           h1 = cuda.to_device(h)
            #           n1 = cuda.to_device(n)
            #           u1 = cuda.to_device(u)
            v1 = cuda.to_device(v)
            #           f1 = cuda.to_device(f)
            out_u1 = cuda.to_device(out_u)
            ts = []
            for i in range(10):
                t = mytime()  # time.process_time()
                for j in range(100):
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                    cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)

                cuda.synchronize()
                t2 = mytime()  # time.process_time()
                ts.append(t2 - t)
            #  time.sleep(1)
            print("cuda")
            print(np.median(ts), np.min(ts), np.max(ts), np.std(ts))
            print(ts)
    finally:
        print("cuda closer")
        cuda.close()
    print("all done")
Example 33
def time_this(kernel, gridsz, blocksz, args):
    timings = []
    cuda.synchronize()
    for i in range(10): # best of 10
        ts = timer()
        kernel[gridsz, blocksz](*args)
        cuda.synchronize()
        te = timer()
        timings.append(te - ts)

    return sum(timings) / len(timings)
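A possible usage, reusing the (hypothetical) `gpu_add` kernel and launch configuration from Example 2:

# Hypothetical call: average kernel time over 10 launches.
avg = time_this(gpu_add, blocks_per_grid, threads_per_block,
                (x_device, y_device, gpu_result, n))
print("average gpu_add launch time:", avg)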
Example 34
def time_this(kernel, gridsz, blocksz, args):
    timings = []
    cuda.synchronize()
    try:
        for i in range(10): # best of 10
            ts = timer()
            kernel[gridsz, blocksz](*args)
            cuda.synchronize()
            te = timer()
            timings.append(te - ts)
    except cuda.errors.CudaDriverError as e:
        print('exc suppressed', e)
        return -1

    return sum(timings) / len(timings)
Example 35
def captured_cuda_stdout():
    """
    Return a minimal stream-like object capturing the text output of
    either CUDA or the simulator.
    """
    if config.ENABLE_CUDASIM:
        # The simulator calls print() on Python stdout
        with captured_stdout() as stream:
            yield PythonTextCapture(stream)
    else:
        # The CUDA runtime writes onto the system stdout
        from numba import cuda
        fd = sys.__stdout__.fileno()
        with redirect_fd(fd) as stream:
            yield CUDATextCapture(stream)
            cuda.synchronize()
Example 36
def captured_cuda_stdout():
    """
    Return a minimal stream-like object capturing the text output of
    either CUDA or the simulator.
    """
    # Prevent accidentally capturing previously output text
    sys.stdout.flush()

    if config.ENABLE_CUDASIM:
        # The simulator calls print() on Python stdout
        with captured_stdout() as stream:
            yield PythonTextCapture(stream)
    else:
        # The CUDA runtime writes onto the system stdout
        from numba import cuda
        with redirect_c_stdout() as stream:
            yield CUDATextCapture(stream)
            cuda.synchronize()
Example 37
res_cuda = numpy.zeros_like(res)

d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_orders = cuda.to_device(orders)
d_C = cuda.to_device(C)
d_Ad = cuda.to_device(Ad)
d_dAd = cuda.to_device(dAd)
d_X = cuda.to_device(X)
d_res_cuda = cuda.to_device(res_cuda)

#warmup
jitted[int(N/512)+1, 512](d_a,d_b,d_orders,d_C,d_X,d_res_cuda,d_Ad,d_dAd)

cuda.synchronize()
t1 = time.time()
for t in range(T):
    jitted[int(N/512)+1, 512](d_a,d_b,d_orders,d_C,d_X,d_res_cuda,d_Ad,d_dAd)
cuda.synchronize()
t2 = time.time()
res_cuda = d_res_cuda.copy_to_host()

print("CUDA: {}".format((t2 - t1) / T))


# Compare with CPU implementations (requires interpolation.py)

from interpolation.splines.eval_cubic_numba import vec_eval_cubic_spline_3

vec_eval_cubic_spline_3(a,b,orders,C,X,res)