Esempi in Python per threadfence, esempi in Python per numba.cuda.threadfence

Esempio n. 1

0

Mostra file

File: sweeps_numba_sgpu.py Progetto: nicelhc13/Parla.py

def compute_fluxes(work_items, nx, ny, nz, num_dirs, num_groups, I_flat,
                   sigma_flat, directions, sigma_a_s, tgroup_id, num_dirs_inv,
                   I_s0, I_s1, I_s2, I_s3, I_s4):
    block_base_index = cuda.shared.array((1, ), uint_t)
    if not cuda.threadIdx.x:
        block_base_index[0] = cuda.atomic.add(tgroup_id, 0, 1) * uint_t(
            cuda.blockDim.x)
    cuda.syncthreads()
    work_item_idx = block_base_index[0] + uint_t(cuda.threadIdx.x)
    if work_item_idx >= work_items.shape[0]:
        return
    idx = work_items[work_item_idx]
    sigma_flat_idx = idx
    # TODO: The uint_t specs for nx, ny, and nz here drastically change the result. WHY?
    sigma_x, sigma_y, sigma_z, dir_idx = unravel_4d_index(
        nx, ny, nz, directions.shape[0], idx)
    I_flat_base_idx = I_s0 * (sigma_x + 1) + I_s1 * (sigma_y +
                                                     1) + I_s2 * (sigma_z + 1)
    I_flat_idx = I_s0 * (sigma_x + 1) + I_s1 * (sigma_y + 1) + I_s2 * (
        sigma_z + 1) + I_s3 * dir_idx
    # Now change abstract indices in the iteration space into indices into I.
    # This is necessary since the beginning and end of each spatial axis
    # is used for boundary conditions.
    ix = sigma_x + uint_t(1)
    iy = sigma_y + uint_t(1)
    iz = sigma_z + uint_t(1)
    dirx = directions[dir_idx, 0]
    diry = directions[dir_idx, 1]
    dirz = directions[dir_idx, 2]
    x_has_sign = dirx < float_t(0.)
    y_has_sign = diry < float_t(0.)
    z_has_sign = dirz < float_t(0.)
    x_neighbor_idx = ix + uint_t(1) if x_has_sign else ix - uint_t(1)
    y_neighbor_idx = iy + uint_t(1) if y_has_sign else iy - uint_t(1)
    z_neighbor_idx = iz + uint_t(1) if z_has_sign else iz - uint_t(1)
    x_neighbor_flat_idx = I_flat_idx + I_s0 if x_has_sign else I_flat_idx - I_s0
    y_neighbor_flat_idx = I_flat_idx + I_s1 if y_has_sign else I_flat_idx - I_s1
    z_neighbor_flat_idx = I_flat_idx + I_s2 if z_has_sign else I_flat_idx - I_s2
    x_coef = -float_t(nx) if x_has_sign else float_t(nx)
    y_coef = -float_t(ny) if y_has_sign else float_t(ny)
    z_coef = -float_t(nz) if z_has_sign else float_t(nz)
    denominator = (sigma_a_s - x_coef * dirx - y_coef * diry - z_coef * dirz)
    # In full-blown versions of this code this sum is actually an inner product
    # that uses coefficients specific to this direction. Frequencies may also be
    # considered, but some kind of lower-dimensional thing is usually used
    # to store the scattering terms. This sum just runs over the directions.
    incoming_scattering = float_t(0.)
    sigma_dir_block_idx = sigma_flat_idx - dir_idx
    for j64 in range(uint_t(num_dirs)):
        j = uint_t(j64)
        #incoming_scattering += sigma[sigma_x, sigma_y, sigma_z, j]
        incoming_scattering += sigma_flat[sigma_dir_block_idx + j]
    incoming_scattering *= num_dirs_inv
    # For simplicity we're assuming all frequencies scatter the same, so
    # sum across frequencies now.
    x_factor = x_coef * dirx
    y_factor = y_coef * diry
    z_factor = z_coef * dirz
    div = float_t(1.) / denominator
    x_neighbor_flat_idx_last = x_neighbor_flat_idx + (num_groups - 1) * I_s4
    y_neighbor_flat_idx_last = y_neighbor_flat_idx + (num_groups - 1) * I_s4
    z_neighbor_flat_idx_last = z_neighbor_flat_idx + (num_groups - 1) * I_s4
    while True:
        cuda.threadfence()
        # Stop if the upstream neighbors aren't ready.
        #if (math.isnan(I[x_neighbor_idx, iy, iz, dir_idx, -1]) or
        #    math.isnan(I[ix, y_neighbor_idx, iz, dir_idx, -1]) or
        #    math.isnan(I[ix, iy, z_neighbor_idx, dir_idx, -1])):
        #    continue
        if (math.isnan(I_flat[x_neighbor_flat_idx_last])
                or math.isnan(I_flat[y_neighbor_flat_idx_last])
                or math.isnan(I_flat[z_neighbor_flat_idx_last])):
            continue
        # For simplicity we're assuming all frequencies scatter the same, so
        # sum across frequencies now.
        for k64 in range(num_groups):
            k = uint_t(k64)
            #numerator = (incoming_scattering -
            #             x_factor * I[x_neighbor_idx,iy,iz,dir_idx,k] -
            #             y_factor * I[ix,y_neighbor_idx,iz,dir_idx,k] -
            #             z_factor * I[ix,iy,z_neighbor_idx,dir_idx,k])
            numerator = (incoming_scattering -
                         x_factor * I_flat[x_neighbor_flat_idx + k * I_s4] -
                         y_factor * I_flat[y_neighbor_flat_idx + k * I_s4] -
                         z_factor * I_flat[z_neighbor_flat_idx + k * I_s4])
            flux = numerator * div
            if k == num_groups - uint_t(1):
                cuda.threadfence()
            #I[ix,iy,iz,dir_idx,k] = flux
            I_flat[I_flat_idx + k * I_s4] = flux
        break

Esempio n. 2

0

Mostra file

File: test_sync.py Progetto: bmk10/GridMateBook_1.18

def use_threadfence(ary):
    ary[0] += 123
    cuda.threadfence()
    ary[0] += 321

Esempio n. 3

0

Mostra file

File: utils.py Progetto: xKuZz/tfg

def min_index_reduction(my_data, my_min, my_idx, aux, lock):
    block_mins = cuda.shared.array(shape=32, dtype=numba.float32)
    block_mins_idx = cuda.shared.array(shape=32, dtype=numba.int32)
    row = cuda.blockIdx.y
    col = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if col < my_data.shape[1]:
        data1 = my_data[row, col]
    else:
        data1 = np.inf
    idx1 = col

    # 1. Reducción min_index en cada warp
    data2 = cuda.shfl_up_sync(-1, data1, 1)
    idx2 = cuda.shfl_up_sync(-1, idx1, 1)
    if data2 < data1:
        data1 = data2
        idx1 = idx2

    data2 = cuda.shfl_up_sync(-1, data1, 2)
    idx2 = cuda.shfl_up_sync(-1, idx1, 2)
    if data2 < data1:
        data1 = data2
        idx1 = idx2

    data2 = cuda.shfl_up_sync(-1, data1, 4)
    idx2 = cuda.shfl_up_sync(-1, idx1, 4)
    if data2 < data1:
        data1 = data2
        idx1 = idx2

    data2 = cuda.shfl_up_sync(-1, data1, 8)
    idx2 = cuda.shfl_up_sync(-1, idx1, 8)
    if data2 < data1:
        data1 = data2
        idx1 = idx2

    data2 = cuda.shfl_up_sync(-1, data1, 16)
    idx2 = cuda.shfl_up_sync(-1, idx1, 16)
    if data2 < data1:
        data1 = data2
        idx1 = idx2
    if cuda.threadIdx.x % 32 == 31:
        block_mins[cuda.threadIdx.x // 32] = data1
        block_mins_idx[cuda.threadIdx.x // 32] = idx1

    cuda.syncthreads()

    # 2. Reducción del bloque completo en el primer warp
    if cuda.threadIdx.x < 32:
        if cuda.threadIdx.x < cuda.blockDim.x // 32:
            data2 = block_mins[cuda.threadIdx.x]
            idx2 = block_mins_idx[cuda.threadIdx.x]
        else:
            data2 = np.inf
            idx2 = col

        data3 = cuda.shfl_up_sync(-1, data2, 1)
        idx3 = cuda.shfl_up_sync(-1, idx2, 1)
        if data3 < data2:
            data2 = data3
            idx2 = idx3

        data3 = cuda.shfl_up_sync(-1, data2, 2)
        idx3 = cuda.shfl_up_sync(-1, idx2, 2)
        if data3 < data2:
            data2 = data3
            idx2 = idx3

        data3 = cuda.shfl_up_sync(-1, data2, 4)
        idx3 = cuda.shfl_up_sync(-1, idx2, 4)
        if data3 < data2:
            data2 = data3
            idx2 = idx3

        data3 = cuda.shfl_up_sync(-1, data2, 8)
        idx3 = cuda.shfl_up_sync(-1, idx2, 8)
        if data3 < data2:
            data2 = data3
            idx2 = idx3

        data3 = cuda.shfl_up_sync(-1, data2, 16)
        idx3 = cuda.shfl_up_sync(-1, idx2, 16)
        if data3 < data2:
            data2 = data3
            idx2 = idx3

    if cuda.threadIdx.x == 31:
        aux[cuda.blockIdx.y, cuda.blockIdx.x] = 0
        while cuda.atomic.compare_and_swap(lock, 0, 1) == 1:
            continue
        if data2 < my_min[0]:
            my_min[0] = data2
            my_idx[0] = cuda.blockIdx.y
            my_idx[1] = idx2
        cuda.threadfence()
        lock[0] = 0