def sum_reduction_kernel(A, partial_sums):
    """
    Reduce each work-group's slice of ``A`` to a single partial sum.

    Every work-item stages one element of ``A`` in a local-memory scratch
    buffer, the staged values are folded pairwise, and the work-item whose
    local id is 0 stores the group's total in ``partial_sums[group_id]``.

    NOTE(review): the scratch buffer is fixed at 64 ``int32`` slots and the
    stride is halved with integer division, so this presumably expects a
    power-of-two work-group size of at most 64, and a global size that does
    not run past the end of ``A`` -- confirm against the launch code.
    """
    # Fixed-size local-memory scratch buffer for the staged values.
    scratch = dppy.local.array(64, int32)

    gid = dppy.get_global_id(0)
    lid = dppy.get_local_id(0)
    grp = dppy.get_group_id(0)
    width = dppy.get_local_size(0)

    # Stage this work-item's element: global memory -> local memory.
    scratch[lid] = A[gid]

    # Pairwise fold: on each pass the lower half of the active range absorbs
    # the matching element of the upper half, then the range is halved.
    half = width // 2
    while half > 0:
        # Wait until the writes of the previous step are visible to the group.
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)

        if lid < half:
            scratch[lid] = scratch[lid] + scratch[lid + half]

        half = half >> 1

    # Only the first work-item of the group publishes the result.
    if lid == 0:
        partial_sums[grp] = scratch[0]
def f(a):
    """
    Round-trip ``a[0]`` through a one-element local array, applying ``op``.

    ``dtype`` and ``op`` are not defined in this function; they are read
    from the surrounding scope.
    NOTE(review): ``op`` presumably updates element 0 of the array in place
    using the value 1 -- confirm at its definition site.
    """
    scratch = dppy.local.array(1, dtype)

    # Seed the local slot from the input array.
    scratch[0] = a[0]
    dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE)

    # Invoke the operation on (array, index 0, value 1).
    op(scratch, 0, 1)
    dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE)

    # Copy the local slot back into the caller's array.
    a[0] = scratch[0]
def sum_reduction_kernel(A, input_size, partial_sums):
    """
    Reduce each work-group's slice of ``A`` to a single partial sum.

    Work-items whose global id is below ``input_size`` stage their element
    of ``A`` in a local-memory scratch buffer; the others stage 0. The
    staged values are folded pairwise and the work-item whose local id is 0
    stores the group's total in ``partial_sums[group_id]``.

    NOTE(review): the scratch buffer is fixed at 64 ``int32`` slots and the
    stride is halved with integer division, so this presumably expects a
    power-of-two work-group size of at most 64 -- confirm against the
    launch code.
    """
    # Fixed-size local-memory scratch buffer for the staged values.
    scratch = dppy.local.array(64, int32)

    gid = dppy.get_global_id(0)
    lid = dppy.get_local_id(0)
    grp = dppy.get_group_id(0)
    width = dppy.get_local_size(0)

    # Work-items past the end of the input contribute zero to the sum.
    if gid < input_size:
        scratch[lid] = A[gid]
    else:
        scratch[lid] = 0

    # Pairwise fold: on each pass the lower half of the active range absorbs
    # the matching element of the upper half, then the range is halved.
    half = width // 2
    while half > 0:
        # Wait until the writes of the previous step are visible to the group.
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)

        if lid < half:
            scratch[lid] = scratch[lid] + scratch[lid + half]

        half = half >> 1

    # Only the first work-item of the group publishes the result.
    if lid == 0:
        partial_sums[grp] = scratch[0]
def reverse_array(A):
    """
    Add to each ``A[i]`` the element at the mirrored index of ``A``.

    The input is first copied into a 10-element ``float32`` local array so
    that every work-item reads the values as they were before any update.
    ``blocksize`` is not defined in this function; it is read from the
    surrounding scope.
    """
    gid = dppy.get_global_id(0)
    staged = dppy.local.array(shape=10, dtype=np.float32)

    # Preload this work-item's element into local memory.
    staged[gid] = A[gid]

    # With a single work-group a local or a global fence would both work.
    dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence

    # Write back: accumulate the element sitting at the mirrored position.
    mirrored = blocksize - 1 - gid
    A[gid] = A[gid] + staged[mirrored]
def private_memory_kernel(A):
    """
    Store twice each work-item's global id into ``A``, via a private array.

    The id is written to a one-element ``float32`` array obtained from
    ``numba_dppy.private.array``, a barrier is issued, and the slot is then
    read back, doubled and stored in ``A``.
    """
    gid = numba_dppy.get_global_id(0)
    slot = numba_dppy.private.array(shape=1, dtype=np.float32)

    # Preload the slot with this work-item's id.
    slot[0] = gid

    numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence

    # The value read back is only deterministic if the slot is private to
    # each thread; shared storage would be overwritten by other work-items.
    doubled = slot[0] * 2
    A[gid] = doubled
def private_memory_kernel(A):
    """
    Store twice each work-item's global id into ``A``, via a private array.

    The id is parked in a one-element ``float32`` array obtained from
    ``numba_dppy.private.array``, a barrier is issued, and the parked value
    is then doubled and written to ``A`` at the work-item's index.
    """
    gid = numba_dppy.get_global_id(0)

    # One-element scratch array allocated through the private-memory API.
    scratch = numba_dppy.private.array(shape=1, dtype=np.float32)
    scratch[0] = gid

    numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence

    # Read the parked id back and store its double.
    doubled = scratch[0] * 2
    A[gid] = doubled
def twice(A):
    """
    Double, in place, the element of ``A`` indexed by the global id.

    The element is read before the barrier and written back after it.
    """
    gid = dppy.get_global_id(0)
    original = A[gid]

    # Called with no argument, the barrier defaults to a global mem fence.
    dppy.barrier()

    A[gid] = original * 2
def twice(A):
    """
    Double, in place, the element of ``A`` indexed by the global id.

    The element is read before the barrier and written back after it.
    """
    gid = dppy.get_global_id(0)
    original = A[gid]

    # Explicitly request a local mem fence for this barrier.
    dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)

    A[gid] = original * 2