def compute_fluxes(work_items, nx, ny, nz, num_dirs, num_groups, I_flat, sigma_flat, directions, sigma_a_s, tgroup_id, num_dirs_inv, I_s0, I_s1, I_s2, I_s3, I_s4): block_base_index = cuda.shared.array((1, ), uint_t) if not cuda.threadIdx.x: block_base_index[0] = cuda.atomic.add(tgroup_id, 0, 1) * uint_t( cuda.blockDim.x) cuda.syncthreads() work_item_idx = block_base_index[0] + uint_t(cuda.threadIdx.x) if work_item_idx >= work_items.shape[0]: return idx = work_items[work_item_idx] sigma_flat_idx = idx # TODO: The uint_t specs for nx, ny, and nz here drastically change the result. WHY? sigma_x, sigma_y, sigma_z, dir_idx = unravel_4d_index( nx, ny, nz, directions.shape[0], idx) I_flat_base_idx = I_s0 * (sigma_x + 1) + I_s1 * (sigma_y + 1) + I_s2 * (sigma_z + 1) I_flat_idx = I_s0 * (sigma_x + 1) + I_s1 * (sigma_y + 1) + I_s2 * ( sigma_z + 1) + I_s3 * dir_idx # Now change abstract indices in the iteration space into indices into I. # This is necessary since the beginning and end of each spatial axis # is used for boundary conditions. ix = sigma_x + uint_t(1) iy = sigma_y + uint_t(1) iz = sigma_z + uint_t(1) dirx = directions[dir_idx, 0] diry = directions[dir_idx, 1] dirz = directions[dir_idx, 2] x_has_sign = dirx < float_t(0.) y_has_sign = diry < float_t(0.) z_has_sign = dirz < float_t(0.) x_neighbor_idx = ix + uint_t(1) if x_has_sign else ix - uint_t(1) y_neighbor_idx = iy + uint_t(1) if y_has_sign else iy - uint_t(1) z_neighbor_idx = iz + uint_t(1) if z_has_sign else iz - uint_t(1) x_neighbor_flat_idx = I_flat_idx + I_s0 if x_has_sign else I_flat_idx - I_s0 y_neighbor_flat_idx = I_flat_idx + I_s1 if y_has_sign else I_flat_idx - I_s1 z_neighbor_flat_idx = I_flat_idx + I_s2 if z_has_sign else I_flat_idx - I_s2 x_coef = -float_t(nx) if x_has_sign else float_t(nx) y_coef = -float_t(ny) if y_has_sign else float_t(ny) z_coef = -float_t(nz) if z_has_sign else float_t(nz) denominator = (sigma_a_s - x_coef * dirx - y_coef * diry - z_coef * dirz) # In full-blown versions of this code this sum is actually an inner product # that uses coefficients specific to this direction. Frequencies may also be # considered, but some kind of lower-dimensional thing is usually used # to store the scattering terms. This sum just runs over the directions. incoming_scattering = float_t(0.) sigma_dir_block_idx = sigma_flat_idx - dir_idx for j64 in range(uint_t(num_dirs)): j = uint_t(j64) #incoming_scattering += sigma[sigma_x, sigma_y, sigma_z, j] incoming_scattering += sigma_flat[sigma_dir_block_idx + j] incoming_scattering *= num_dirs_inv # For simplicity we're assuming all frequencies scatter the same, so # sum across frequencies now. x_factor = x_coef * dirx y_factor = y_coef * diry z_factor = z_coef * dirz div = float_t(1.) / denominator x_neighbor_flat_idx_last = x_neighbor_flat_idx + (num_groups - 1) * I_s4 y_neighbor_flat_idx_last = y_neighbor_flat_idx + (num_groups - 1) * I_s4 z_neighbor_flat_idx_last = z_neighbor_flat_idx + (num_groups - 1) * I_s4 while True: cuda.threadfence() # Stop if the upstream neighbors aren't ready. #if (math.isnan(I[x_neighbor_idx, iy, iz, dir_idx, -1]) or # math.isnan(I[ix, y_neighbor_idx, iz, dir_idx, -1]) or # math.isnan(I[ix, iy, z_neighbor_idx, dir_idx, -1])): # continue if (math.isnan(I_flat[x_neighbor_flat_idx_last]) or math.isnan(I_flat[y_neighbor_flat_idx_last]) or math.isnan(I_flat[z_neighbor_flat_idx_last])): continue # For simplicity we're assuming all frequencies scatter the same, so # sum across frequencies now. for k64 in range(num_groups): k = uint_t(k64) #numerator = (incoming_scattering - # x_factor * I[x_neighbor_idx,iy,iz,dir_idx,k] - # y_factor * I[ix,y_neighbor_idx,iz,dir_idx,k] - # z_factor * I[ix,iy,z_neighbor_idx,dir_idx,k]) numerator = (incoming_scattering - x_factor * I_flat[x_neighbor_flat_idx + k * I_s4] - y_factor * I_flat[y_neighbor_flat_idx + k * I_s4] - z_factor * I_flat[z_neighbor_flat_idx + k * I_s4]) flux = numerator * div if k == num_groups - uint_t(1): cuda.threadfence() #I[ix,iy,iz,dir_idx,k] = flux I_flat[I_flat_idx + k * I_s4] = flux break
def use_threadfence(ary): ary[0] += 123 cuda.threadfence() ary[0] += 321
def min_index_reduction(my_data, my_min, my_idx, aux, lock): block_mins = cuda.shared.array(shape=32, dtype=numba.float32) block_mins_idx = cuda.shared.array(shape=32, dtype=numba.int32) row = cuda.blockIdx.y col = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x if col < my_data.shape[1]: data1 = my_data[row, col] else: data1 = np.inf idx1 = col # 1. Reducción min_index en cada warp data2 = cuda.shfl_up_sync(-1, data1, 1) idx2 = cuda.shfl_up_sync(-1, idx1, 1) if data2 < data1: data1 = data2 idx1 = idx2 data2 = cuda.shfl_up_sync(-1, data1, 2) idx2 = cuda.shfl_up_sync(-1, idx1, 2) if data2 < data1: data1 = data2 idx1 = idx2 data2 = cuda.shfl_up_sync(-1, data1, 4) idx2 = cuda.shfl_up_sync(-1, idx1, 4) if data2 < data1: data1 = data2 idx1 = idx2 data2 = cuda.shfl_up_sync(-1, data1, 8) idx2 = cuda.shfl_up_sync(-1, idx1, 8) if data2 < data1: data1 = data2 idx1 = idx2 data2 = cuda.shfl_up_sync(-1, data1, 16) idx2 = cuda.shfl_up_sync(-1, idx1, 16) if data2 < data1: data1 = data2 idx1 = idx2 if cuda.threadIdx.x % 32 == 31: block_mins[cuda.threadIdx.x // 32] = data1 block_mins_idx[cuda.threadIdx.x // 32] = idx1 cuda.syncthreads() # 2. Reducción del bloque completo en el primer warp if cuda.threadIdx.x < 32: if cuda.threadIdx.x < cuda.blockDim.x // 32: data2 = block_mins[cuda.threadIdx.x] idx2 = block_mins_idx[cuda.threadIdx.x] else: data2 = np.inf idx2 = col data3 = cuda.shfl_up_sync(-1, data2, 1) idx3 = cuda.shfl_up_sync(-1, idx2, 1) if data3 < data2: data2 = data3 idx2 = idx3 data3 = cuda.shfl_up_sync(-1, data2, 2) idx3 = cuda.shfl_up_sync(-1, idx2, 2) if data3 < data2: data2 = data3 idx2 = idx3 data3 = cuda.shfl_up_sync(-1, data2, 4) idx3 = cuda.shfl_up_sync(-1, idx2, 4) if data3 < data2: data2 = data3 idx2 = idx3 data3 = cuda.shfl_up_sync(-1, data2, 8) idx3 = cuda.shfl_up_sync(-1, idx2, 8) if data3 < data2: data2 = data3 idx2 = idx3 data3 = cuda.shfl_up_sync(-1, data2, 16) idx3 = cuda.shfl_up_sync(-1, idx2, 16) if data3 < data2: data2 = data3 idx2 = idx3 if cuda.threadIdx.x == 31: aux[cuda.blockIdx.y, cuda.blockIdx.x] = 0 while cuda.atomic.compare_and_swap(lock, 0, 1) == 1: continue if data2 < my_min[0]: my_min[0] = data2 my_idx[0] = cuda.blockIdx.y my_idx[1] = idx2 cuda.threadfence() lock[0] = 0