def build(self):
        SOURCE = """
        #define HEIGHT {{HEIGHT}}
        #define WIDTH {{WIDTH}}
        {{{LIBS}}}
        {{{STRUCTS}}}
        __kernel void {{NAME}}({{ARGS}}, __write_only image2d_t _outtex) {
            int _x = get_global_id(0);
            int _y = get_global_id(1);

            int __in_key = _x + _y*HEIGHT;

            int2 __FIELD = (int2)(_x, _y);
            write_imagef(_outtex, __FIELD, {{MAP_EXPR}});
        }
        """
        arguments, strcts, cl_arg_declr, includes, _ = kernel_helpers.process_arguments_declaration(
            self.ctx.devices[0], self.arguments
        )
        src = pystache.render(
            SOURCE,
            {
                "ARGS": ", ".join(cl_arg_declr),
                "NAME": self.kernel_name,
                "WIDTH": self.size[0],
                "HEIGHT": self.size[1],
                "LIBS": "\n".join("#include <{}>".format(i) for i in includes),
                "STRUCTS": "\n".join(strcts),
                "MAP_EXPR": self.map_expr,
            },
        )

        self._kernel_args = [a[0] for a in arguments] + ["out_texture"]
        self._kernel = cl.Program(self.ctx, src.encode("ascii")).build()
	def build(self, descending=False):
		arguments = [('r_{}'.format(i), 'global', x[0], '*r_{}'.format(i)) for i, x in enumerate(self._blocksizes)]
		arguments, strcts, cl_arg_declr, includes, _ = kernel_helpers.process_arguments_declaration(self.ctx.devices[0], arguments)

		related_buffer_blocks = [self._build_related_buffer_block(i, (arguments[i][2], x[1]))  for i, x in enumerate(self._blocksizes)]

		src = pystache.render(self.SORT_KERNEL, {
			'LIBS': '\n'.join('#include <{}>'.format(i) for i in includes),
			'STRUCTS': '\n'.join(strcts),
		    'RELATED_BUFFERS': ', '.join(cl_arg_declr),
		    'SHIFT_RELATED_BUFFERS': '\n'.join(related_buffer_blocks),
		    'ORDER': '<' if descending else '>',
		    'DIMENSION': self._key_shape[1]
		}).encode('ascii')

		self._program = cl.Program(self.ctx, src).build()
    def build(self, caardinality=1, dimension=1):
        if hasattr(self.kernel_code, "build"):
            self.kernel_code.build()

        if type(self.kernel_code) is str:
            kernel_code = self.kernel_code
        else:
            kernel_code = kernel_helpers.get_attribute_or_item(self.kernel_code, "kernel_code")
            if self.kernel_code is None:
                raise ValueError("invalid kernel_code")

        libraries = self.libraries or ""
        kernel_code_libs = kernel_helpers.get_attribute_or_item(self.kernel_code, "libraries")
        if kernel_code_libs is not None:
            libraries += "\n" + kernel_code_libs

        arguments = self.arguments or []
        kernel_code_args = kernel_helpers.get_attribute_or_item(self.kernel_code, "arguments")
        if kernel_code_args is not None:
            arguments += kernel_code_args

        includes = self.includes or []
        kernel_code_args = kernel_helpers.get_attribute_or_item(self.kernel_code, "includes")
        if kernel_code_args is not None:
            includes += kernel_code_args

        # find structures.
        # XXX
        # - helper function
        arguments, strcts, cl_arg_declr, arg_includes, arg_defines = kernel_helpers.process_arguments_declaration(
            self.ctx.devices[0], arguments
        )
        includes += arg_includes
        defines = arg_defines

        cl_defines = ["#define {}".format(d) for d in defines]
        cl_includes = ["#include <{}>".format(path) for path in set(includes)]

        shape = self.shape or [self.in_blocksize]
        shape_def = ["#define DIM{} {}".format(*a) for a in enumerate(shape)]  # deprecated backward compatibility
        shape_def += ["#define SHAPE{} {}".format(*a) for a in enumerate(shape)]

        cl_constants = [("IN_BLOCK_SIZE", self.in_blocksize), ("OUT_BLOCK_SIZE", self.out_blocksize)]

        cl_item_var = "\n".join(
            [
                "int __id = get_{}_id(0);".format("global" if self.threads is None else "group"),
                "int __in_offset = __id*IN_BLOCK_SIZE;",
                "int __out_offset = __id*OUT_BLOCK_SIZE;",
            ]
        )

        if self.threads is not None:
            # XXX
            # - check for bool (get shape)
            # - and so on ...
            if hasattr(self.threads, "__call__"):
                self._kernel_local, thread_constants, itemsrc = self.threads(self)
                nthreads = len(thread_constants)
                cl_item_var += itemsrc

            else:
                # default thread layout
                self._kernel_local = self.threads
                thread_constants = self._kernel_local
                nthreads = len(self._kernel_local)

                get_local_id = lambda i: "get_local_id({})".format(i)
                itemid = "int{n} __item_id = (int{n})({ids});".format(
                    n="" if nthreads == 1 else nthreads,
                    ids=",".join([get_local_id(i) for i in range(0, len(thread_constants))]),
                )
                if nthreads == 1:
                    cl_item_var += (
                        itemid
                        + """
                        int __item = __item_id;
                        int __itemT = __item_id;
                    """
                    )
                elif nthreads == 2:
                    cl_item_var += (
                        itemid
                        + """
                        int __item = THREAD_X*__item_id.x+__item_id.y;
                        int __itemT = THREAD_X*__item_id.y+__item_id.x;
                    """
                    )
                elif nthreads == 3:
                    cl_item_var += (
                        itemid
                        + """
                        int __item = THREAD_X*__item_id.x+__item_id.y;;
                        int __itemT = THREAD_X*__item_id.y+__item_id.x;
                    """
                    )

            if nthreads == 1:
                cl_constants.append(("THREAD_X", thread_constants[0]))
            elif nthreads == 2:
                cl_constants.append(("THREAD_X", thread_constants[0]))
                cl_constants.append(("THREAD_Y", thread_constants[1]))
            elif nthreads == 3:
                cl_constants.append(("THREAD_X", thread_constants[0]))
                cl_constants.append(("THREAD_Y", thread_constants[1]))
                cl_constants.append(("THREAD_Z", thread_constants[2]))
            else:
                # XXX
                # - does a n>3 case make sense? check opencl specs...
                raise NotImplemented("not implemented yet")

        src = pystache.render(
            ShapedKernel._SOURCE,
            {
                "INCLUDES": "\n".join(cl_includes),
                "DEFINES": "\n".join(cl_defines),
                "STRUCTS": "\n".join(strcts),
                "PROCEDURE": kernel_code,
                "IDS": cl_item_var,
                "CONSTANTS": "\n".join(["#define {} {}".format(*x) for x in cl_constants]),
                "PROCEDURE_ARGUMENTS": ", \n    ".join(cl_arg_declr),
                "PROCEDURE_FUNCTIONS": libraries,
                "KERNEL_NAME": self.name,
                "IN_LAYOUT": "\n".join(shape_def),
            },
        )
        self._kernel = cl.Program(self.ctx, src.encode("ascii")).build()
        self._kernel_args = [a[0] for a in arguments]
        self.src = src
        return src
    def build(self, caardinality=1, dimension=1):
        if hasattr(self.map_expr, 'build'):
            self.map_expr.build()

        if type(self.map_expr) is str:
            map_expr = self.map_expr
        else:
            map_expr = kernel_helpers.get_attribute_or_item(self.map_expr, 'map_expr')
            if self.map_expr is None:
                raise ValueError('invalid map_expr')

        libraries = self.libraries or ''
        map_expr_libs = kernel_helpers.get_attribute_or_item(self.map_expr, 'libraries')
        if map_expr_libs is not None:
            libraries += '\n'+map_expr_libs

        arguments = self.arguments or []
        map_expr_args = kernel_helpers.get_attribute_or_item(self.map_expr, 'arguments')
        if map_expr_args is not None:
            arguments += map_expr_args

        includes = self.includes or []
        map_expr_args = kernel_helpers.get_attribute_or_item(self.map_expr, 'includes')
        if map_expr_args is not None:
            includes += map_expr_args

        # find structures.
        # XXX
        # - helper function
        arguments, strcts, cl_arg_declr, arg_includes, _ = kernel_helpers.process_arguments_declaration(self.ctx.devices[0], arguments)
        includes += arg_includes

        cl_includes = ['#include <{}>'.format(path) for path in set(includes)]

        shape = self.block_shape or [self.in_blocksize]
        shape_def = ['#define DIM{} {}'.format(*a) for a in enumerate(shape)] # deprecated backward compatibility
        shape_def += ['#define SHAPE{} {}'.format(*a) for a in enumerate(shape)]

        cl_constants = [
            ('IN_BLOCK_SIZE', self.in_blocksize),
            ('OUT_BLOCK_SIZE', self.out_blocksize)
        ]

        cl_item_var = '\n'.join([
            'int __id = get_{}_id(0);'.format('global' if self.threads is None else 'group'),
            'int __in_offset = __id*IN_BLOCK_SIZE;',
            'int __out_offset = __id*OUT_BLOCK_SIZE;',
        ])


        if self.threads is not None:
            # XXX
            # - check for bool (get shape)
            # - and so on ...
            if hasattr(self.threads, '__call__'):
                self._kernel_local, thread_constants, itemsrc = self.threads(self)
                nthreads = len(thread_constants)
                cl_item_var += itemsrc

            else:
                # default thread layout
                self._kernel_local = self.threads
                thread_constants = self._kernel_local
                nthreads = len(self._kernel_local)

                get_local_id = lambda i: 'get_local_id({})'.format(i)
                itemid = 'int{n} __item_id = (int{n})({ids});'.format(
                    n='' if nthreads == 1 else nthreads,
                    ids=','.join([get_local_id(i) for i in range(0, len(thread_constants))]))
                if nthreads == 1:
                    cl_item_var += itemid+"""
                        int __item = __item_id;
                        int __itemT = __item_id;
                    """
                elif nthreads == 2:
                    cl_item_var += itemid+"""
                        int __item = THREAD_X*__item_id.x+__item_id.y;
                        int __itemT = THREAD_X*__item_id.y+__item_id.x;
                    """
                elif nthreads == 3:
                    cl_item_var += itemid+"""
                        int __item = THREAD_X*__item_id.x+__item_id.y;;
                        int __itemT = THREAD_X*__item_id.y+__item_id.x;
                    """

            if nthreads == 1:
                cl_constants.append(('THREAD_X', thread_constants[0]))
            elif nthreads == 2:
                cl_constants.append(('THREAD_X', thread_constants[0]))
                cl_constants.append(('THREAD_Y', thread_constants[1]))
            elif nthreads == 3:
                cl_constants.append(('THREAD_X', thread_constants[0]))
                cl_constants.append(('THREAD_Y', thread_constants[1]))
                cl_constants.append(('THREAD_Z', thread_constants[2]))
            else:
                # XXX
                # - does a n>3 case make sense? check opencl specs...
                raise NotImplemented('not implemented yet')

        src = pystache.render(Blockwise._SOURCE, {
            'INCLUDES'           : '\n'.join(cl_includes),
            'STRUCTS'            : '\n'.join(strcts),
            'PROCEDURE'          :  map_expr,
            'IDS'                : cl_item_var,
            'CONSTANTS'          : '\n'.join(['#define {} {}'.format(*x) for x in cl_constants])     ,
            'PROCEDURE_ARGUMENTS': ', \n    '.join(cl_arg_declr),
            'PROCEDURE_FUNCTIONS': libraries,
            'KERNEL_NAME'        : self.name,
            'IN_LAYOUT'          : '\n'.join(shape_def),
        })

        self._kernel = cl.Program(self.ctx, src.encode('ascii')).build()
        self._kernel_args = [a[0] for a in arguments]
        return src