def build(self): SOURCE = """ #define HEIGHT {{HEIGHT}} #define WIDTH {{WIDTH}} {{{LIBS}}} {{{STRUCTS}}} __kernel void {{NAME}}({{ARGS}}, __write_only image2d_t _outtex) { int _x = get_global_id(0); int _y = get_global_id(1); int __in_key = _x + _y*HEIGHT; int2 __FIELD = (int2)(_x, _y); write_imagef(_outtex, __FIELD, {{MAP_EXPR}}); } """ arguments, strcts, cl_arg_declr, includes, _ = kernel_helpers.process_arguments_declaration( self.ctx.devices[0], self.arguments ) src = pystache.render( SOURCE, { "ARGS": ", ".join(cl_arg_declr), "NAME": self.kernel_name, "WIDTH": self.size[0], "HEIGHT": self.size[1], "LIBS": "\n".join("#include <{}>".format(i) for i in includes), "STRUCTS": "\n".join(strcts), "MAP_EXPR": self.map_expr, }, ) self._kernel_args = [a[0] for a in arguments] + ["out_texture"] self._kernel = cl.Program(self.ctx, src.encode("ascii")).build()
def build(self, descending=False): arguments = [('r_{}'.format(i), 'global', x[0], '*r_{}'.format(i)) for i, x in enumerate(self._blocksizes)] arguments, strcts, cl_arg_declr, includes, _ = kernel_helpers.process_arguments_declaration(self.ctx.devices[0], arguments) related_buffer_blocks = [self._build_related_buffer_block(i, (arguments[i][2], x[1])) for i, x in enumerate(self._blocksizes)] src = pystache.render(self.SORT_KERNEL, { 'LIBS': '\n'.join('#include <{}>'.format(i) for i in includes), 'STRUCTS': '\n'.join(strcts), 'RELATED_BUFFERS': ', '.join(cl_arg_declr), 'SHIFT_RELATED_BUFFERS': '\n'.join(related_buffer_blocks), 'ORDER': '<' if descending else '>', 'DIMENSION': self._key_shape[1] }).encode('ascii') self._program = cl.Program(self.ctx, src).build()
def build(self, caardinality=1, dimension=1): if hasattr(self.kernel_code, "build"): self.kernel_code.build() if type(self.kernel_code) is str: kernel_code = self.kernel_code else: kernel_code = kernel_helpers.get_attribute_or_item(self.kernel_code, "kernel_code") if self.kernel_code is None: raise ValueError("invalid kernel_code") libraries = self.libraries or "" kernel_code_libs = kernel_helpers.get_attribute_or_item(self.kernel_code, "libraries") if kernel_code_libs is not None: libraries += "\n" + kernel_code_libs arguments = self.arguments or [] kernel_code_args = kernel_helpers.get_attribute_or_item(self.kernel_code, "arguments") if kernel_code_args is not None: arguments += kernel_code_args includes = self.includes or [] kernel_code_args = kernel_helpers.get_attribute_or_item(self.kernel_code, "includes") if kernel_code_args is not None: includes += kernel_code_args # find structures. # XXX # - helper function arguments, strcts, cl_arg_declr, arg_includes, arg_defines = kernel_helpers.process_arguments_declaration( self.ctx.devices[0], arguments ) includes += arg_includes defines = arg_defines cl_defines = ["#define {}".format(d) for d in defines] cl_includes = ["#include <{}>".format(path) for path in set(includes)] shape = self.shape or [self.in_blocksize] shape_def = ["#define DIM{} {}".format(*a) for a in enumerate(shape)] # deprecated backward compatibility shape_def += ["#define SHAPE{} {}".format(*a) for a in enumerate(shape)] cl_constants = [("IN_BLOCK_SIZE", self.in_blocksize), ("OUT_BLOCK_SIZE", self.out_blocksize)] cl_item_var = "\n".join( [ "int __id = get_{}_id(0);".format("global" if self.threads is None else "group"), "int __in_offset = __id*IN_BLOCK_SIZE;", "int __out_offset = __id*OUT_BLOCK_SIZE;", ] ) if self.threads is not None: # XXX # - check for bool (get shape) # - and so on ... if hasattr(self.threads, "__call__"): self._kernel_local, thread_constants, itemsrc = self.threads(self) nthreads = len(thread_constants) cl_item_var += itemsrc else: # default thread layout self._kernel_local = self.threads thread_constants = self._kernel_local nthreads = len(self._kernel_local) get_local_id = lambda i: "get_local_id({})".format(i) itemid = "int{n} __item_id = (int{n})({ids});".format( n="" if nthreads == 1 else nthreads, ids=",".join([get_local_id(i) for i in range(0, len(thread_constants))]), ) if nthreads == 1: cl_item_var += ( itemid + """ int __item = __item_id; int __itemT = __item_id; """ ) elif nthreads == 2: cl_item_var += ( itemid + """ int __item = THREAD_X*__item_id.x+__item_id.y; int __itemT = THREAD_X*__item_id.y+__item_id.x; """ ) elif nthreads == 3: cl_item_var += ( itemid + """ int __item = THREAD_X*__item_id.x+__item_id.y;; int __itemT = THREAD_X*__item_id.y+__item_id.x; """ ) if nthreads == 1: cl_constants.append(("THREAD_X", thread_constants[0])) elif nthreads == 2: cl_constants.append(("THREAD_X", thread_constants[0])) cl_constants.append(("THREAD_Y", thread_constants[1])) elif nthreads == 3: cl_constants.append(("THREAD_X", thread_constants[0])) cl_constants.append(("THREAD_Y", thread_constants[1])) cl_constants.append(("THREAD_Z", thread_constants[2])) else: # XXX # - does a n>3 case make sense? check opencl specs... raise NotImplemented("not implemented yet") src = pystache.render( ShapedKernel._SOURCE, { "INCLUDES": "\n".join(cl_includes), "DEFINES": "\n".join(cl_defines), "STRUCTS": "\n".join(strcts), "PROCEDURE": kernel_code, "IDS": cl_item_var, "CONSTANTS": "\n".join(["#define {} {}".format(*x) for x in cl_constants]), "PROCEDURE_ARGUMENTS": ", \n ".join(cl_arg_declr), "PROCEDURE_FUNCTIONS": libraries, "KERNEL_NAME": self.name, "IN_LAYOUT": "\n".join(shape_def), }, ) self._kernel = cl.Program(self.ctx, src.encode("ascii")).build() self._kernel_args = [a[0] for a in arguments] self.src = src return src
def build(self, caardinality=1, dimension=1): if hasattr(self.map_expr, 'build'): self.map_expr.build() if type(self.map_expr) is str: map_expr = self.map_expr else: map_expr = kernel_helpers.get_attribute_or_item(self.map_expr, 'map_expr') if self.map_expr is None: raise ValueError('invalid map_expr') libraries = self.libraries or '' map_expr_libs = kernel_helpers.get_attribute_or_item(self.map_expr, 'libraries') if map_expr_libs is not None: libraries += '\n'+map_expr_libs arguments = self.arguments or [] map_expr_args = kernel_helpers.get_attribute_or_item(self.map_expr, 'arguments') if map_expr_args is not None: arguments += map_expr_args includes = self.includes or [] map_expr_args = kernel_helpers.get_attribute_or_item(self.map_expr, 'includes') if map_expr_args is not None: includes += map_expr_args # find structures. # XXX # - helper function arguments, strcts, cl_arg_declr, arg_includes, _ = kernel_helpers.process_arguments_declaration(self.ctx.devices[0], arguments) includes += arg_includes cl_includes = ['#include <{}>'.format(path) for path in set(includes)] shape = self.block_shape or [self.in_blocksize] shape_def = ['#define DIM{} {}'.format(*a) for a in enumerate(shape)] # deprecated backward compatibility shape_def += ['#define SHAPE{} {}'.format(*a) for a in enumerate(shape)] cl_constants = [ ('IN_BLOCK_SIZE', self.in_blocksize), ('OUT_BLOCK_SIZE', self.out_blocksize) ] cl_item_var = '\n'.join([ 'int __id = get_{}_id(0);'.format('global' if self.threads is None else 'group'), 'int __in_offset = __id*IN_BLOCK_SIZE;', 'int __out_offset = __id*OUT_BLOCK_SIZE;', ]) if self.threads is not None: # XXX # - check for bool (get shape) # - and so on ... if hasattr(self.threads, '__call__'): self._kernel_local, thread_constants, itemsrc = self.threads(self) nthreads = len(thread_constants) cl_item_var += itemsrc else: # default thread layout self._kernel_local = self.threads thread_constants = self._kernel_local nthreads = len(self._kernel_local) get_local_id = lambda i: 'get_local_id({})'.format(i) itemid = 'int{n} __item_id = (int{n})({ids});'.format( n='' if nthreads == 1 else nthreads, ids=','.join([get_local_id(i) for i in range(0, len(thread_constants))])) if nthreads == 1: cl_item_var += itemid+""" int __item = __item_id; int __itemT = __item_id; """ elif nthreads == 2: cl_item_var += itemid+""" int __item = THREAD_X*__item_id.x+__item_id.y; int __itemT = THREAD_X*__item_id.y+__item_id.x; """ elif nthreads == 3: cl_item_var += itemid+""" int __item = THREAD_X*__item_id.x+__item_id.y;; int __itemT = THREAD_X*__item_id.y+__item_id.x; """ if nthreads == 1: cl_constants.append(('THREAD_X', thread_constants[0])) elif nthreads == 2: cl_constants.append(('THREAD_X', thread_constants[0])) cl_constants.append(('THREAD_Y', thread_constants[1])) elif nthreads == 3: cl_constants.append(('THREAD_X', thread_constants[0])) cl_constants.append(('THREAD_Y', thread_constants[1])) cl_constants.append(('THREAD_Z', thread_constants[2])) else: # XXX # - does a n>3 case make sense? check opencl specs... raise NotImplemented('not implemented yet') src = pystache.render(Blockwise._SOURCE, { 'INCLUDES' : '\n'.join(cl_includes), 'STRUCTS' : '\n'.join(strcts), 'PROCEDURE' : map_expr, 'IDS' : cl_item_var, 'CONSTANTS' : '\n'.join(['#define {} {}'.format(*x) for x in cl_constants]) , 'PROCEDURE_ARGUMENTS': ', \n '.join(cl_arg_declr), 'PROCEDURE_FUNCTIONS': libraries, 'KERNEL_NAME' : self.name, 'IN_LAYOUT' : '\n'.join(shape_def), }) self._kernel = cl.Program(self.ctx, src.encode('ascii')).build() self._kernel_args = [a[0] for a in arguments] return src