def cl_reduce(function, output, input, shared, group_size, initial=0.0):
    """Reduce ``input`` with ``function``, one partial result per work-group.

    Each work-item accumulates into its own slot ``shared[lid]``, visiting
    two elements per round (``input[i]`` and ``input[i + group_size]``) and
    advancing by ``2 * group_size * num_groups`` between rounds.  The slots
    of a work-group are then folded pairwise and work-item 0 stores the
    group's result in ``output[gid]``.

    Parameters:
        function: binary callable combining two values.  Operands are paired
            in an order that depends on the work-item layout, so it is
            presumably expected to be associative and commutative.
        output: array receiving one value per work-group, indexed by group id.
        input: array to reduce; must expose ``.size``.
        shared: per-group scratch array with at least ``group_size`` slots
            (presumably local memory, given the ``CLK_LOCAL_MEM_FENCE``
            barriers).
        group_size: work-group size.  The pairwise fold below only covers
            every slot when this is a power of two no greater than 512.
        initial: seed for every work-item's accumulator.  It is folded in
            once per work-item, so it should be an identity for ``function``.
    """
    lid = clrt.get_local_id(0)
    gid = clrt.get_group_id(0)
    gsize = clrt.get_num_groups(0)

    gs2 = group_size * 2
    stride = gs2 * gsize

    # Declare the index as unsigned and initialise it in one step, the same
    # way reduce_kernel does.
    i = c_uint(gid * gs2 + lid)

    shared[lid] = initial

    while i < input.size:
        shared[lid] = function(shared[lid], input[i])
        # The loop condition only bounds ``i``; the partner element lies
        # ``group_size`` further on and may be past the end of ``input`` in
        # the final round, so it needs its own check.
        if i + group_size < input.size:
            shared[lid] = function(shared[lid], input[i + group_size])
        i += stride

    # Every slot must be written before any work-item reads a neighbour's.
    clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    # The clyther compiler identifies this loop as constant and unrolls it.
    for cgs in [512, 256, 128, 64, 32, 16, 8, 4, 2]:
        # Acts as a preprocessor define: #if (group_size >= 512) etc.
        if group_size >= cgs:
            if lid < cgs / 2:
                shared[lid] = function(shared[lid], shared[lid + cgs / 2])
            # Kept outside the ``lid`` test so all work-items reach it.
            clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    if lid == 0:
        output[gid] = shared[0]
def cl_reduce(function, output, input, shared, group_size, initial=0.0):
    """Compute per-work-group partial reductions of ``input``.

    Work-items first accumulate privately into ``shared[lid]``: each round
    folds in ``input[i]`` and ``input[i + group_size]``, then ``i`` advances
    by ``2 * group_size * num_groups``.  Afterwards the ``shared`` slots of
    the group are combined pairwise and the work-item with local id 0 writes
    ``shared[0]`` to ``output[gid]``.

    Parameters:
        function: binary callable used to combine two values.  The pairing
            order follows the work-item layout, so it presumably needs to be
            associative and commutative.
        output: destination array, one entry per work-group.
        input: source array; its ``.size`` bounds the accumulation loop.
        shared: scratch array indexed by local id, at least ``group_size``
            entries (presumably local memory, given the
            ``CLK_LOCAL_MEM_FENCE`` barriers).
        group_size: number of work-items per group.  The pairwise stage
            reaches every slot only when this is a power of two <= 512.
        initial: starting value of each accumulator.  Since every work-item
            starts from it, it should be an identity element of ``function``.
    """
    lid = clrt.get_local_id(0)
    gid = clrt.get_group_id(0)
    gsize = clrt.get_num_groups(0)

    gs2 = group_size * 2
    stride = gs2 * gsize

    # Unsigned index, declared and initialised together as in reduce_kernel.
    i = c_uint(gid * gs2 + lid)

    shared[lid] = initial

    while i < input.size:
        shared[lid] = function(shared[lid], input[i])
        # ``i < input.size`` says nothing about ``i + group_size``: without
        # this check the last round would read beyond the end of ``input``.
        if i + group_size < input.size:
            shared[lid] = function(shared[lid], input[i + group_size])
        i += stride

    # Make all accumulators visible before the pairwise stage starts.
    clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    # The clyther compiler identifies this loop as constant and unrolls it.
    for cgs in [512, 256, 128, 64, 32, 16, 8, 4, 2]:
        # Acts as a preprocessor define: #if (group_size >= 512) etc.
        if group_size >= cgs:
            if lid < cgs / 2:
                shared[lid] = function(shared[lid], shared[lid + cgs / 2])
            # The barrier sits outside the ``lid`` test so that every
            # work-item of the group executes it.
            clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    if lid == 0:
        output[gid] = shared[0]
def reduce_kernel(function, output, array, shared, group_size):
    """Reduce ``array`` with ``function`` and store the result per group.

    Each work-item builds a private accumulator from ``array`` starting at
    index ``gid * group_size + lid`` and stepping by ``group_size``.  The
    accumulators are published in ``shared`` and folded pairwise; the
    work-item with local id 0 writes ``shared[0]`` to ``output[gid]``.

    Parameters:
        function: binary callable combining two values.
        output: array written at index ``gid``.
        array: array to reduce; must expose ``.size``.  The first element
            of each work-item is read unconditionally, so its start index
            must already be inside ``array``.
        shared: scratch array indexed by local id, at least ``group_size``
            entries.
        group_size: work-group size.  The pairwise fold covers every slot
            only when this is a power of two no greater than 512.

    NOTE(review): the step is ``group_size`` rather than
    ``group_size * num_groups``, so groups other than group 0 revisit
    elements that lower-numbered groups also read.  This looks designed for
    a single work-group launch -- confirm against the launch configuration.
    """
    local_id = clrt.get_local_id(0)
    group_id = clrt.get_group_id(0)

    pos = c_uint(group_id * group_size + local_id)
    partner = pos + group_size

    # Seed the accumulator from the data itself; there is no identity value.
    acc = array[pos]
    if partner < array.size:
        acc = function(acc, array[partner])

    # The first two elements are consumed; continue one group width at a time.
    pos += group_size * 2
    while pos < array.size:
        acc = function(acc, array[pos])
        pos += group_size

    shared[local_id] = acc
    clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    # The clyther compiler identifies this loop as constant and unrolls it.
    for width in [512, 256, 128, 64, 32, 16, 8, 4, 2]:
        # Acts as a preprocessor define: #if (group_size >= 512) etc.
        if group_size >= width:
            if local_id < width / 2:
                shared[local_id] = function(shared[local_id],
                                            shared[local_id + width / 2])
            clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    if local_id == 0:
        output[group_id] = shared[0]
def reduce_kernel(function, output, array, shared, group_size):
    """Fold ``array`` down to ``output[gid]`` using ``function``.

    Phase 1: every work-item combines the elements at
    ``gid * group_size + lid``, that index plus ``group_size``, and so on in
    steps of ``group_size`` while the index stays below ``array.size``.
    Phase 2: the per-work-item results are placed in ``shared`` and halved
    repeatedly until ``shared[0]`` holds the group's value, which the
    work-item with local id 0 stores in ``output[gid]``.

    Parameters:
        function: binary callable combining two values.
        output: array receiving the group's result at index ``gid``.
        array: input array exposing ``.size``.  ``array[start]`` is read
            without a bounds check, so every work-item's start index must
            be valid.
        shared: scratch array with one slot per work-item of the group.
        group_size: work-group size.  Phase 2 folds all slots only for
            powers of two up to 512.
    """
    item = clrt.get_local_id(0)
    group = clrt.get_group_id(0)
    step = group_size

    cursor = c_uint(group * group_size + item)
    second = cursor + group_size

    # Phase 1: private accumulation, seeded with the first element.
    total = array[cursor]
    if second < array.size:
        total = function(total, array[second])

    cursor += step * 2
    while cursor < array.size:
        total = function(total, array[cursor])
        cursor += step

    # Phase 2: publish, then synchronise before reading neighbours' slots.
    shared[item] = total
    clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    # The clyther compiler identifies this loop as constant and unrolls it.
    for span in [512, 256, 128, 64, 32, 16, 8, 4, 2]:
        # Acts as a preprocessor define: #if (group_size >= 512) etc.
        if group_size >= span:
            if item < span / 2:
                shared[item] = function(shared[item], shared[item + span / 2])
            # Outside the ``item`` test: all work-items must hit the barrier.
            clrt.barrier(clrt.CLK_LOCAL_MEM_FENCE)

    if item == 0:
        output[group] = shared[0]