def _iter(self, rdr, gnm, gprof, dim, tc):
    tref = rdr.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)

    nbins = dim.ah * dim.astride
    fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
    fill(self.fb.d_front, 4 * nbins)
    fill(self.fb.d_side, 2 * nbins)
    fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))

    nts = self.info_a.ntemporal_samples
    nsamps = (gprof.spp(tc) * dim.w * dim.h)
    nrounds = int(nsamps / (nts * 256. * 256)) + 1

    def launch_iter(n):
        if n == 0: return
        launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
                self.fb.d_front, self.fb.d_side, self.fb.d_rb,
                self.fb.d_seeds, self.fb.d_points, self.info_a.d_params)

    # Split the launch into multiple rounds, possibly (slightly) reducing
    # work overlap but avoiding stalls when working on a device with an
    # active X session. TODO: characterize performance impact, autodetect
    BLOCK_SIZE = 16
    for i in range(BLOCK_SIZE - 1, nrounds, BLOCK_SIZE):
        launch_iter(BLOCK_SIZE)
    launch_iter(nrounds % BLOCK_SIZE)

    nblocks = int(np.ceil(np.sqrt(dim.ah * dim.astride / 256.)))
    launch('flush_atom', self.mod, self.stream_a,
            256, (nblocks, nblocks),
            u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
def apply(self, fb, gprof, params, dim, tc, stream=None):
    gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)

    dsc = mkdsc(dim, 1)
    tref = mktref(self.mod, 'chan1_src')

    set_blur_width(self.mod, fb.pool, stream=stream)
    launch2('apply_gamma', self.mod, stream, dim,
            fb.d_left, fb.d_front, f32(0.1))
    tref.set_address_2d(fb.d_left, dsc, 4 * dim.astride)
    launch2('den_blur_1c', self.mod, stream, dim,
            fb.d_back, i32(2), i32(0), texrefs=[tref])
    tref.set_address_2d(fb.d_back, dsc, 4 * dim.astride)
    launch2('den_blur_1c', self.mod, stream, dim,
            fb.d_left, i32(3), i32(0), texrefs=[tref])
    launch2('haloclip', self.mod, stream, dim,
            fb.d_front, fb.d_left, gam)
def _iter(self, rdr, gnm, gprof, dim, tc):
    tref = rdr.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)

    nbins = dim.ah * dim.astride
    fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
    fill(self.fb.d_front, 4 * nbins)
    fill(self.fb.d_left, 4 * nbins)
    fill(self.fb.d_right, 4 * nbins)
    fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
    fill(self.fb.d_uleft, nbins / 2)
    fill(self.fb.d_uright, nbins / 2)

    nts = self.info_a.ntemporal_samples
    nsamps = (gprof.spp(tc) * dim.w * dim.h)
    nrounds = int(nsamps / (nts * 256. * 256)) + 1

    # Split the launch into multiple rounds, to prevent a system on older
    # GPUs from locking up and to give us a chance to flush some stuff.
    hidden_stream = cuda.Stream()
    iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
    block_size = 4

    while nrounds:
        n = min(nrounds, block_size)
        now = time.time()
        launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                self.fb.d_front, self.fb.d_left, self.fb.d_rb,
                self.fb.d_seeds, self.fb.d_points, self.fb.d_uleft,
                self.info_a.d_params)
        delta = time.time() - now
        if delta > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and
            # scheduling additional work will just keep spinning the CPU at
            # 100%. Do a blocking sync to free up resources. This may
            # slightly reduce parallelism but makes it a whole heck of a lot
            # easier to keep using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            iter_stream_right.synchronize()

        # Make sure the other stream is done flushing before we start
        iter_stream_left.wait_for_event(
                cuda.Event().record(iter_stream_right))

        launch('flush_atom', rdr.mod, iter_stream_left,
                (16, 16, 1), (dim.astride / 16, dim.ah / 16),
                u64(self.fb.d_front), u64(self.fb.d_left),
                u64(self.fb.d_uleft), i32(nbins))

        self.fb.flip_side()
        iter_stream_left, iter_stream_right = (
                iter_stream_right, iter_stream_left)
        nrounds -= n
        block_size += block_size / 2

    # Always wait on all events in the hidden stream before continuing on A
    self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
def __init__(self, start_address):
    super().__init__(start_address)
    self.start_address = start_address
    self.bg_block_bitmap = i32(0)
    self.bg_inode_bitmap = i32(0)
    self.bg_inode_table = i32(0)
    self.bg_free_blocks_count = i16(0)
    self.bg_free_inodes_count = i16(0)
    self.bg_used_dirs_count = i16(0)
    self.bg_pad = i16(0)
    self.bg_reserved = [byte(0)] * 12
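# Size sketch (hypothetical check added here, mirroring the fields above):
# an ext2 block group descriptor occupies 32 bytes on disk.
assert (3 * 4       # bg_block_bitmap, bg_inode_bitmap, bg_inode_table
        + 4 * 2     # bg_free_blocks_count, bg_free_inodes_count, bg_used_dirs_count, bg_pad
        + 12        # bg_reserved
        ) == 32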
def apply(self, fb, gprof, params, dim, tc, stream=None): gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1) dsc = mkdsc(dim, 1) tref = mktref(self.mod, "chan1_src") set_blur_width(self.mod, fb.pool, stream=stream) launch2("apply_gamma", self.mod, stream, dim, fb.d_side, fb.d_front, f32(0.1)) tref.set_address_2d(fb.d_side, dsc, 4 * dim.astride) launch2("den_blur_1c", self.mod, stream, dim, fb.d_back, i32(2), i32(0), texrefs=[tref]) tref.set_address_2d(fb.d_back, dsc, 4 * dim.astride) launch2("den_blur_1c", self.mod, stream, dim, fb.d_side, i32(3), i32(0), texrefs=[tref]) launch2("haloclip", self.mod, stream, dim, fb.d_front, fb.d_side, gam)
def launchC(name, mod, stream, dim, fb, *args):
    launch(
        name,
        mod,
        stream,
        (32, 8, 1),
        (int(np.ceil(dim.w / 32.0)), int(np.ceil(dim.h / 8.0))),
        fb.d_back,
        fb.d_front,
        i32(fb.gutter),
        i32(dim.w),
        i32(dim.astride),
        i32(dim.h),
        *args
    )
def apply(self, fb, gprof, params, dim, tc, stream=None):
    # Helper variables and functions to keep it clean
    sb = 16 * dim.astride
    bs = sb * dim.ah

    dsc = mkdsc(dim, 4)
    tref = mktref(self.mod, 'chan4_src')
    grad_dsc = mkdsc(dim, 1)
    grad_tref = mktref(self.mod, 'chan1_src')
    set_blur_width(self.mod, fb.pool, stream=stream)

    for pattern in range(self.directions):
        # Scale spatial parameter so that a "pixel" is equivalent to an
        # actual pixel at 1080p
        sstd = params.spatial_std(tc) * dim.w / 1920.

        tref.set_address_2d(fb.d_front, dsc, sb)

        # Blur density two octaves along sampling vector, ultimately
        # storing in the side buffer
        launch2('den_blur', self.mod, stream, dim,
                fb.d_back, i32(pattern), i32(0), texrefs=[tref])
        grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
        launch2('den_blur_1c', self.mod, stream, dim,
                fb.d_left, i32(pattern), i32(1), texrefs=[grad_tref])
        grad_tref.set_address_2d(fb.d_left, grad_dsc, sb / 4)

        launch2('bilateral', self.mod, stream, dim,
                fb.d_back, i32(pattern), i32(self.radius),
                f32(sstd), f32(params.color_std(tc)),
                f32(params.density_std(tc)), f32(params.density_pow(tc)),
                f32(params.gradient(tc)),
                texrefs=[tref, grad_tref])
        fb.flip()
def apply(self, fb, gprof, params, dim, tc, stream=None): gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc) dsc = mkdsc(dim, 4) tref = mktref(self.mod, "chan4_src") set_blur_width(self.mod, fb.pool, params.width(tc), stream) launch2("apply_gamma_full_hi", self.mod, stream, dim, fb.d_side, fb.d_front, f32(gam - 1)) tref.set_address_2d(fb.d_side, dsc, 16 * dim.astride) launch2("full_blur", self.mod, stream, dim, fb.d_back, i32(2), i32(0), texrefs=[tref]) tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride) launch2("full_blur", self.mod, stream, dim, fb.d_side, i32(3), i32(0), texrefs=[tref]) tref.set_address_2d(fb.d_side, dsc, 16 * dim.astride) launch2("full_blur", self.mod, stream, dim, fb.d_back, i32(0), i32(0), texrefs=[tref]) tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride) launch2("full_blur", self.mod, stream, dim, fb.d_side, i32(1), i32(0), texrefs=[tref]) launch2("smearclip", self.mod, stream, dim, fb.d_front, fb.d_side, f32(gam - 1), lin, lingam)
def apply(self, fb, gprof, params, dim, tc, stream=None):
    gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)

    dsc = mkdsc(dim, 4)
    tref = mktref(self.mod, 'chan4_src')

    set_blur_width(self.mod, fb.pool, params.width(tc), stream)
    launch2('apply_gamma_full_hi', self.mod, stream, dim,
            fb.d_left, fb.d_front, f32(gam - 1))
    tref.set_address_2d(fb.d_left, dsc, 16 * dim.astride)
    launch2('full_blur', self.mod, stream, dim,
            fb.d_back, i32(2), i32(0), texrefs=[tref])
    tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride)
    launch2('full_blur', self.mod, stream, dim,
            fb.d_left, i32(3), i32(0), texrefs=[tref])
    tref.set_address_2d(fb.d_left, dsc, 16 * dim.astride)
    launch2('full_blur', self.mod, stream, dim,
            fb.d_back, i32(0), i32(0), texrefs=[tref])
    tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride)
    launch2('full_blur', self.mod, stream, dim,
            fb.d_left, i32(1), i32(0), texrefs=[tref])
    launch2('smearclip', self.mod, stream, dim,
            fb.d_front, fb.d_left, f32(gam - 1), lin, lingam)
def conv(
        # input and kernel tensors
        x: OpenCLTensor,
        kernel: OpenCLTensor,
        # strides
        strides: tuple) -> OpenCLTensor:
    # get device
    device = x.device
    # get dimensions
    ndim = len(x.shape)
    kernel_dim = len(kernel.shape) - 2      # without in- and output channels
    # TODO: flatten additional dimensions in input
    assert ndim == kernel_dim + 2
    assert len(strides) == kernel_dim
    # build output tensor
    out_image_shape = tuple(
        (s - k) // st + 1
        for s, k, st in zip(x.shape[-kernel_dim:], kernel.shape[2:], strides))
    out_shape = x.shape[:-kernel_dim - 1] + (kernel.shape[0],) + out_image_shape
    out_tensor = device.Tensor.empty(out_shape, dtype=x.dtype)
    # build kernel
    knl = cache_build_conv_kernel(device.context, kernel_dim=kernel_dim, dtype=x.dtype)
    # set input arguments
    knl.set_args(
        # input and kernel data
        x.contiguous().data, kernel.contiguous().data, out_tensor.data,
        # in- and output shape
        i32(kernel.shape[1]), i32(prod(x.shape[-kernel_dim:])),
        *(i32(s) for s in x.shape[-kernel_dim:]),
        *(i32(s) for s in out_image_shape),
        # kernel shape without output channels
        i32(prod(kernel.shape[-kernel_dim:])),
        *(i32(s) for s in kernel.shape[-kernel_dim:]),
        # strides
        *(i32(st) for st in strides))
    # execute kernel
    global_shape = [prod(out_image_shape), kernel.shape[0], x.shape[0]]   # flat-out-image, out-channels, batch
    local_shape = None
    cl.enqueue_nd_range_kernel(device.queue, knl, global_shape, local_shape).wait()
    # return output tensor
    return out_tensor
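# A minimal shape sketch for conv() above. The helper and the shapes are
# hypothetical (not from the source); it only restates the (s - k) // st + 1
# output-size arithmetic used when building out_shape.
def _expected_conv_shape(x_shape, kernel_shape, strides):
    kernel_dim = len(kernel_shape) - 2
    out_image = tuple((s - k) // st + 1
                      for s, k, st in zip(x_shape[-kernel_dim:], kernel_shape[2:], strides))
    return x_shape[:-kernel_dim - 1] + (kernel_shape[0],) + out_image

# e.g. a batch of 8 RGB 32x32 images with 16 5x5 filters and unit strides
assert _expected_conv_shape((8, 3, 32, 32), (16, 3, 5, 5), (1, 1)) == (8, 16, 28, 28)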
def _interp(self, rdr, gnm, dim, ts, td):
    d_acc_size = rdr.mod.get_global('acc_size')[0]
    p_dim = self.fb.pool.allocate((len(dim),), u32)
    p_dim[:] = dim
    cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

    tref = self.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    launch('interp_palette_flat', self.mod, self.stream_a,
            256, self.info_a.palette_height,
            self.fb.d_rb, self.fb.d_seeds,
            self.src_a.d_ptimes, self.src_a.d_pals,
            f32(ts), f32(td / self.info_a.palette_height))

    nts = self.info_a.ntemporal_samples
    launch('interp_iter_params', rdr.mod, self.stream_a,
            256, np.ceil(nts / 256.),
            self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
            f32(ts), f32(td / nts), i32(nts))
def dot(
        # inputs
        X: OpenCLTensor,
        Y: OpenCLTensor,
        # kernel information
        block_size: int = 8 * 16,
        work_per_thread: int = 8) -> OpenCLTensor:
    assert 3 >= len(X.shape) == len(Y.shape) >= 2
    assert X.shape[:-2] == Y.shape[:-2]
    assert X.shape[-1] == Y.shape[-2]
    # get tensor information
    device = X.device
    n, M, N, K = len(X.shape), X.shape[-2], Y.shape[-1], X.shape[-1]
    # flatten batch dimensions
    X = X.reshape(-1, M, K)
    Y = Y.reshape(-1, K, N)
    assert X.shape[0] == Y.shape[0], \
        "Batches do not align! (%i != %i)" % (X.shape[0], Y.shape[0])
    # pad inputs to be multiple of block size in both directions
    X = _match_blocks(X, block_size)
    Y = _match_blocks(Y, block_size)
    # create output tensor
    B, pad_M, pad_N, pad_K = X.shape[0], X.shape[1], Y.shape[2], X.shape[2]
    B, pad_M, pad_N, pad_K = i32(B), i32(pad_M), i32(pad_N), i32(pad_K)
    O = device.Tensor.empty(shape=(B, pad_M, pad_N), dtype=X.dtype)   # TODO: broadcast dtype
    # kernel global and local thread layout
    global_shape = [B, pad_M // work_per_thread, pad_N // work_per_thread]
    local_shape = [1] + [block_size // work_per_thread] * 2
    # build and call kernel
    knl = cache_build_dot_kernel(device.context, X.dtype, Y.dtype, O.dtype,
                                 block_size, work_per_thread)
    e = knl(device.queue, global_shape, local_shape,
            X.contiguous().data, Y.contiguous().data, O.data,
            i32(X.offset), i32(Y.offset), pad_M, pad_N, pad_K)
    e.wait()
    # remove padding from output
    idx = (slice(0, B) if (n == 3) else 0, slice(0, M), slice(0, N))
    return O[idx]
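# Hypothetical layout check for dot() above, assuming _match_blocks pads each
# matrix dimension up to a multiple of block_size (as its use in dot() implies).
# With M = N = 300, the default block_size = 128 and work_per_thread = 8, each
# work-group covers a 128x128 output tile and each work-item an 8x8 sub-tile.
block_size, work_per_thread = 128, 8
pad = lambda s: ((s + block_size - 1) // block_size) * block_size   # round up
assert pad(300) == 384
assert [pad(300) // work_per_thread] * 2 == [48, 48]                # global shape (per batch)
assert [block_size // work_per_thread] * 2 == [16, 16]              # local shape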
def atom(
        op: str,
        # input / output tensors
        additional_read: tuple = tuple(),   # by default we only read the values of tensors mentioned in output
        output=('o',),                      # output tensors, if not mentioned in named tensors then a new tensor is created
        # kernel information
        block_size: int = 256,              # local block size
        # inputs
        **named_tensors                     # all named tensors (except output tensors) needed for execution of op
        ) -> Tuple[OpenCLTensor]:
    # separate tensors from scalars
    named_scalars = {n: v for n, v in named_tensors.items() if not isinstance(v, OpenCLTensor)}
    named_tensors = {n: v for n, v in named_tensors.items() if isinstance(v, OpenCLTensor)}
    # separate names and values
    tensor_names, tensors = zip(*named_tensors.items())
    tensor_names, tensors = tuple(tensor_names), tuple(tensors)
    if len(named_scalars) > 0:
        scalar_names, scalars = zip(*named_scalars.items())
        scalar_names, scalars = tuple(scalar_names), tuple(scalars)
    else:
        scalar_names, scalars = tuple(), tuple()
    # get device and dtype
    t0 = tensors[0]
    device, dtype = t0.device, t0.dtype
    shapes = (t.shape for t in tensors)
    strides = (t.strides for t in tensors)
    # broadcast shape
    shape = map(max, zip_longest(*map(reversed, shapes), fillvalue=1))
    shape = tuple(map(i32, shape))[::-1]
    ndim, numel = len(shape), prod(shape)
    # create output tensors if necessary
    for out in output:
        if out not in tensor_names:
            tensor_names += (out,)
            tensors += (device.Tensor.empty(shape, dtype=dtype),)
    # build strides
    strides = tuple(
        (i32(0),) * (ndim - len(t.strides)) +
        tuple(map(lambda st_sh: i32(st_sh[0] if st_sh[1] > 1 else 0), zip(t.strides, t.shape)))
        for t in tensors)
    # collapse contiguous dimensions to minimize index computations in kernel
    if ndim > 1:
        shape, strides = _collapse_contiguous_dims(shape, strides)
        ndim = len(shape)
    # by default we read only tensors that are not in output
    read = tuple(n for n in tensor_names if n not in output) + additional_read
    buffer_dtypes = tuple(map(lambda t: t.dtype, tensors))
    scalar_dtypes = tuple(map(lambda s: np.dtype(type(s)), scalars))
    # build kernel and set arguments
    knl = cache_build_atom_kernel(device.context,
                                  op=op,
                                  buffers=tensor_names, buffer_dtypes=buffer_dtypes,
                                  scalars=scalar_names, scalar_dtypes=scalar_dtypes,
                                  ndim=ndim, read=read, write=output)
    knl.set_args(
        *(t.data for t in tensors),                             # buffers
        *(t.type(s) for t, s in zip(scalar_dtypes, scalars)),   # scalars
        *shape, *chain(*strides),                               # shapes and strides
        *(i32(t.offset) for t in tensors),                      # offsets
        i32(numel)                                              # number of elements to compute
    )
    # execute kernel and return output tensors
    cl.enqueue_nd_range_kernel(device.queue, knl,
                               [ceil(numel / block_size) * block_size],
                               [block_size]).wait()
    return tuple(t for n, t in zip(tensor_names, tensors) if n in output)
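# Broadcast sketch (hypothetical shapes) mirroring the shape/stride logic in
# atom() above: shapes are right-aligned, the maximum along each dimension is
# taken, and a size-1 dimension later gets a stride of 0 so its single element
# is reused along the broadcast axis.
from itertools import zip_longest

shapes = [(3, 1, 5), (4, 5)]
bcast = tuple(map(max, zip_longest(*map(reversed, shapes), fillvalue=1)))[::-1]
assert bcast == (3, 4, 5)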
def __init__(self, start_address):
    super().__init__(start_address)
    self.start_address = 1024
    self.s_inodes_count = i32(0)
    self.s_blocks_count = i32(0)
    self.s_r_blocks_count = i32(0)
    self.s_free_blocks_count = i32(0)
    self.s_free_inodes_count = i32(0)
    self.s_first_data_block = i32(0)
    self.s_log_block_size = i32(0)
    self.s_log_frag_size = i32(0)
    self.s_blocks_per_group = i32(0)
    self.s_frags_per_group = i32(0)
    self.s_inodes_per_group = i32(0)
    self.s_mtime = i32(0)
    self.s_wtime = i32(0)
    self.s_mnt_count = i16(0)
    self.s_max_mnt_count = i16(0)
    self.s_magic = i16(0)
    self.s_state = i16(0)
    self.s_errors = i16(0)
    self.s_minor_rev_level = i16(0)
    self.s_lastcheck = i32(0)
    self.s_checkinterval = i32(0)
    self.s_creator_os = i32(0)
    self.s_rev_level = i32(0)
    self.s_def_resuid = i16(0)
    self.s_def_resgid = i16(0)

    # EXT2_DYNAMIC_REV Specific
    self.s_first_ino = i32(0)
    self.s_inode_size = i16(0)
    self.s_block_group_nr = i16(0)
    self.s_feature_compat = i32(0)
    self.s_feature_incompat = i32(0)
    self.s_feature_ro_compat = i32(0)
    # o 104 s 16
    self.s_uuid = "some string?"
    # o 120 s 16
    self.s_volume_name = "16 bytes volume name, mostly unused. A valid volume name would consist of only " \
                         "ISO-Latin-1 characters and be 0 terminated. "
    # o 136 s 64
    self.s_last_mounted = "64 bytes directory path where the file system was last mounted. While not normally " \
                          "used, it could serve for auto-finding the mountpoint when not indicated on the command " \
                          "line. Again the path should be zero terminated for compatibility reasons. Valid path " \
                          "is constructed from ISO-Latin-1 characters. "
    # o 200 s 4
    self.s_algo_bitmap = "32bit value used by compression algorithms to determine the compression method(s) used."

    # Performance Hints
    self.s_prealloc_blocks = byte(0)
    self.s_prealloc_dir_blocks = byte(0)
    self.allignment = i16(0)  # o 206

    # Journaling Support
    self.s_journal_uuid = "16-byte value containing the uuid of the journal superblock. See Ext3 Journaling for " \
                          "more information. "
    self.s_journal_inum = i32(0)
    self.s_journal_dev = i32(0)
    self.s_last_orphan = i32(0)

    # Directory Indexing Support
    self.s_hash_seed = [i32(0)] * 4
    self.s_def_hash_version = byte(0)
    self.padding = [byte(0)] * 3  # reserved for future expansion

    # Other options
    self.s_default_mount_options = i32(0)
    self.s_first_meta_bg = i32(0)
    # o 264 s 760 - reserved for future revisions
    self.unused = [byte(0)] * 760
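# A minimal read sketch (hypothetical helper, not part of this class): the ext2
# superblock starts at byte 1024 of the volume, and s_magic (0xEF53) sits at
# offset 56 within it as a little-endian 16-bit value.
import struct

def read_ext2_magic(path):
    with open(path, 'rb') as f:
        f.seek(1024 + 56)                   # superblock offset + s_magic offset
        return struct.unpack('<H', f.read(2))[0]

# e.g. read_ext2_magic('/dev/sdb1') == 0xEF53 on a valid ext2 volume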
def __init__(self, start_address):
    super().__init__(start_address)
    self.start_address = start_address  # bg_inode_table
    self.i_mode = i16(0)
    self.i_uid = i16(0)
    self.i_size = i32(0)
    self.i_atime = i32(0)
    self.i_ctime = i32(0)
    self.i_mtime = i32(0)
    self.i_dtime = i32(0)
    self.i_gid = i16(0)
    self.i_links_count = i16(0)
    self.i_blocks = i32(0)
    self.i_flags = i32(0)
    self.i_osd1 = i32(0)
    self.i_block = [i32(0)] * 15
    # 12 blocks - direct block
    # 13th entry in this array is the block number of the first indirect block
    # 14th entry in this array is the block number of the first doubly-indirect block
    # 15th entry in this array is the block number of the triply-indirect block
    self.i_generation = i32(0)
    self.i_file_acl = i32(0)
    self.i_dir_acl = i32(0)
    self.i_faddr = i32(0)
    self.i_osd2 = [byte(0)] * 12  # 96 bit OS dependent
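# Size sketch (hypothetical check added here, mirroring the fields above):
# these members add up to the classic 128-byte ext2 inode (the revision 0
# inode size).
assert (2 * 2       # i_mode, i_uid
        + 4 * 5     # i_size, i_atime, i_ctime, i_mtime, i_dtime
        + 2 * 2     # i_gid, i_links_count
        + 4 * 3     # i_blocks, i_flags, i_osd1
        + 15 * 4    # i_block
        + 4 * 4     # i_generation, i_file_acl, i_dir_acl, i_faddr
        + 12        # i_osd2
        ) == 128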
def launchC(name, mod, stream, dim, fb, *args):
    launch(name, mod, stream, (32, 8, 1),
            (int(np.ceil(dim.w / 32.)), int(np.ceil(dim.h / 8.))),
            fb.d_back, fb.d_front, i32(fb.gutter),
            i32(dim.w), i32(dim.astride), i32(dim.h),
            *args)
def reduce(
        reduction: str,          # reduction expression using variables 'a' and 'b'
        # input tensor
        T: OpenCLTensor,
        # options
        axis: Tuple[int],
        neutral: str = "0",
        # kernel information
        group_size: int = 128) -> OpenCLTensor:
    # get device
    device = T.device
    # total number of elements to reduce
    reduce_numel = prod((T.shape[i] for i in axis))
    keep_numel = prod((T.shape[i] for i in range(len(T.shape)) if i not in axis))
    n_work_groups = ceil(reduce_numel / (group_size * 2))   # number of work-groups needed
    # build output tensor
    shape = tuple(s if i not in axis else 1 for i, s in enumerate(T.shape))
    shape = (1,) if len(shape) == 0 else shape
    # output tensor also stores the partial sums of each iteration, thus n_work_groups
    O = device.Tensor.empty(shape + (n_work_groups,), dtype=T.dtype)
    # transpose to have reduction dimensions at last
    if len(axis) < len(T.shape):
        perm = list(range(len(T.shape)))
        for i, j in enumerate(axis, 1):
            perm[-i], perm[j] = perm[j], perm[-i]
        T = T.transpose(*perm)
    # build kernels
    use_strides = (len(axis) < len(T.shape) and not T.is_contiguous())
    knl = cache_build_reduction_kernel(
        device.context,
        reduction=reduction,
        dtype=T.dtype,
        neutral=neutral,
        ndim=len(T.shape) if use_strides else 0,   # set to 0 if not needed to prevent compiling a new kernel
        use_strides=use_strides,
        block_size=group_size)
    next_knl = cache_build_reduction_kernel(
        device.context,
        reduction=reduction,
        dtype=T.dtype,
        neutral=neutral,
        ndim=0,
        use_strides=False,
        block_size=group_size)
    # build additional strided input arguments
    stride_args = (*(i32(s) for s in T.shape),
                   *(i32(st) for st in T.strides)) if use_strides else tuple()
    while (reduce_numel > 1):
        knl.set_args(T.data, O.data, i32(T.offset), *stride_args, i32(reduce_numel))
        e = cl.enqueue_nd_range_kernel(device.queue, knl,
                                       [keep_numel, n_work_groups * group_size],
                                       [1, group_size])
        # update values
        T = O   # input of further iterations is output of current iteration
        reduce_numel = n_work_groups
        n_work_groups = ceil(reduce_numel / (group_size * 2))
        knl = next_knl
        stride_args = tuple()
    # wait for queue to finish
    e.wait()
    # remove partial sums stored in last dimension of output
    return device.Tensor(O.data, shape=shape, dtype=O.dtype)
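# Hypothetical pass-count sketch for reduce() above: each pass shrinks the
# reduction axis from reduce_numel elements to ceil(reduce_numel / (group_size * 2))
# partial results, so for example 100000 elements with group_size = 128 need
# three passes before a single value remains.
from math import ceil

group_size, reduce_numel, passes = 128, 100000, 0
while reduce_numel > 1:
    reduce_numel = ceil(reduce_numel / (group_size * 2))
    passes += 1
assert passes == 3   # 100000 -> 391 -> 2 -> 1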