Beispiel #1
0
    def visit_KernelCall(self, node):
        args = []
        for i, arg in enumerate(node.args):
            size = SizeOf(SymbolRef(arg.name))
            setter = clSetKernelArg(node.name, i, size, Ref(SymbolRef(arg.name)))
            setter.lift(params=arg._lift_params)
            args.append(setter)

        kernel_decl = SymbolRef(node.name, cl.cl_kernel())
        kernel_symbol = kernel_decl.copy()
        call = clEnqueueNDRangeKernel(node.location.symbol.copy(), kernel_symbol, work_dim=Constant(1), global_size=node.global_size, local_size=node.local_size)

        kernel = RefConverter().visit(node.kernel)
        for param in kernel.params:
            param.type = param.type.ptr_type
        kernel.defn.insert(0, Assign(SymbolRef("i", c_int()), get_global_id(0)))
        kernel_src = kernel.codegen()
        call.body.append(CppComment(kernel_src))

        context = node.location.queue.context
        kernel_ptr = cl.clCreateProgramWithSource(context, kernel_src).build()[node.name]
        call.lift(params=[(kernel_decl, kernel_ptr)])

        return args + [call]
Beispiel #2
0
    def visit_FunctionDecl(self, node):
        # This function grabs the input and output grid names which are used to
        self.local_block = SymbolRef.unique()
        # generate the proper array macros.
        arg_cfg = self.arg_cfg

        global_size = arg_cfg[0].shape

        if self.testing:
            local_size = (1, 1, 1)
        else:
            desired_device_number = -1
            device = cl.clGetDeviceIDs()[desired_device_number]
            lcs = LocalSizeComputer(global_size, device)
            local_size = lcs.compute_local_size_bulky()
            virtual_global_size = lcs.compute_virtual_global_size(local_size)
            self.global_size = global_size
            self.local_size = local_size
            self.virtual_global_size = virtual_global_size

        super(StencilOclTransformer, self).visit_FunctionDecl(node)
        for index, param in enumerate(node.params[:-1]):
            # TODO: Transform numpy type to ctype
            param.type = ct.POINTER(ct.c_float)()
            param.set_global()
            param.set_const()
        node.set_kernel()
        node.params[-1].set_global()
        node.params[-1].type = ct.POINTER(ct.c_float)()
        node.params.append(SymbolRef(self.local_block.name,
                                     ct.POINTER(ct.c_float)()))
        node.params[-1].set_local()
        node.defn = node.defn[0]

        # if boundary handling is copy we have to generate a collection of
        # boundary kernels to handle the on-gpu boundary copy
        if self.is_copied:
            device = cl.clGetDeviceIDs()[-1]
            self.boundary_handlers = boundary_kernel_factory(
                self.ghost_depth, self.output_grid,
                node.params[0].name,
                node.params[-2].name,  # second last parameter is output
                device
            )
            boundary_kernels = [
                FunctionDecl(
                    name=boundary_handler.kernel_name,
                    params=node.params,
                    defn=boundary_handler.generate_ocl_kernel_body(),
                )
                for boundary_handler in self.boundary_handlers
            ]

            self.project.files.append(OclFile('kernel', [node]))

            for dim, boundary_kernel in enumerate(boundary_kernels):
                boundary_kernel.set_kernel()
                self.project.files.append(OclFile(kernel_dim_name(dim),
                                                  [boundary_kernel]))

            self.boundary_kernels = boundary_kernels

            # ctree.browser_show_ast(node)
            # import ctree
            # ctree.browser_show_ast(boundary_kernels[0])
        else:
            self.project.files.append(OclFile('kernel', [node]))

        # print(self.project.files[0])
        # print(self.project.files[-1])

        defn = [
            ArrayDef(
                SymbolRef('global', ct.c_ulong()), arg_cfg[0].ndim,
                [Constant(d) for d in self.virtual_global_size]
            ),
            ArrayDef(
                SymbolRef('local', ct.c_ulong()), arg_cfg[0].ndim,
                [Constant(s) for s in local_size]
                # [Constant(s) for s in [512, 512]]  # use this line to force a
                # opencl local size error
            ),
            Assign(SymbolRef("error_code", ct.c_int()), Constant(0)),
        ]
        setargs = [clSetKernelArg(
            SymbolRef('kernel'), Constant(d),
            FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]),
            Ref(SymbolRef('buf%d' % d))
        ) for d in range(len(arg_cfg) + 1)]
        from functools import reduce
        import operator
        local_mem_size = reduce(
            operator.mul,
            (size + 2 * self.kernel.ghost_depth[index]
             for index, size in enumerate(local_size)),
            ct.sizeof(cl.cl_float())
        )
        setargs.append(
            clSetKernelArg(
                'kernel', len(arg_cfg) + 1,
                local_mem_size,
                NULL()
            )
        )

        defn.extend(setargs)
        enqueue_call = FunctionCall(SymbolRef('clEnqueueNDRangeKernel'), [
            SymbolRef('queue'), SymbolRef('kernel'),
            Constant(self.kernel.dim), NULL(),
            SymbolRef('global'), SymbolRef('local'),
            Constant(0), NULL(), NULL()
        ])

        defn.extend(check_ocl_error(enqueue_call, "clEnqueueNDRangeKernel"))

        params = [
            SymbolRef('queue', cl.cl_command_queue()),
            SymbolRef('kernel', cl.cl_kernel())
        ]
        if self.is_copied:
            for dim, boundary_kernel in enumerate(self.boundary_kernels):
                defn.extend([
                    ArrayDef(
                        SymbolRef(global_for_dim_name(dim), ct.c_ulong()),
                        arg_cfg[0].ndim,
                        [Constant(d)
                         for d in self.boundary_handlers[dim].global_size]
                    ),
                    ArrayDef(
                        SymbolRef(local_for_dim_name(dim), ct.c_ulong()),
                        arg_cfg[0].ndim,
                        [Constant(s) for s in
                         self.boundary_handlers[dim].local_size]
                    )
                ])
                setargs = [clSetKernelArg(
                    SymbolRef(kernel_dim_name(dim)), Constant(d),
                    FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]),
                    Ref(SymbolRef('buf%d' % d))
                ) for d in range(len(arg_cfg) + 1)]
                setargs.append(
                    clSetKernelArg(
                        SymbolRef(kernel_dim_name(dim)), len(arg_cfg) + 1,
                        local_mem_size,
                        NULL()
                    )
                )
                defn.extend(setargs)

                enqueue_call = FunctionCall(
                    SymbolRef('clEnqueueNDRangeKernel'), [
                        SymbolRef('queue'), SymbolRef(kernel_dim_name(dim)),
                        Constant(self.kernel.dim), NULL(),
                        SymbolRef(global_for_dim_name(dim)),
                        SymbolRef(local_for_dim_name(dim)),
                        Constant(0), NULL(), NULL()
                    ]
                )
                defn.append(enqueue_call)

                params.extend([
                    SymbolRef(kernel_dim_name(dim), cl.cl_kernel())
                ])

        # finish_call = FunctionCall(SymbolRef('clFinish'),
        # [SymbolRef('queue')])
        # defn.append(finish_call)
        # finish_call = [
        #     Assign(
        #         SymbolRef("error_code", ct.c_int()),
        #         FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')])
        #     ),
        #     If(
        #         NotEq(SymbolRef("error_code"), Constant(0)),
        #         FunctionCall(
        #             SymbolRef("printf"),
        #             [
        #                 String("OPENCL KERNEL RETURNED ERROR CODE %d"),
        #                 SymbolRef("error_code")
        #             ]
        #         )
        #     )
        # ]

        finish_call = check_ocl_error(
            FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]),
            "clFinish"
        )
        defn.extend(finish_call)
        defn.append(Return(SymbolRef("error_code")))

        params.extend(SymbolRef('buf%d' % d, cl.cl_mem())
                      for d in range(len(arg_cfg) + 1))

        control = FunctionDecl(ct.c_int32(), "stencil_control",
                               params=params,
                               defn=defn)

        return control
Beispiel #3
0
    def visit_FunctionDecl(self, node):
        """
        :param node:
        :type node: FunctionDef
        """
        if node.kernel is True:
            return node
        for index, arg in enumerate(node.params):
            self.arg_cfg_dict[arg.name] = self.arg_cfg[index]
            if hasattr(self.arg_cfg[index], 'ndpointer'):
                arg.type = self.arg_cfg[index].ndpointer()
            else:
                arg.type = self.arg_cfg[index].ctype()

        self.params = node.params
        node.defn = list(filter(None, map(self.visit, node.defn)))
        params = [
            SymbolRef('queue', cl.cl_command_queue()),
            SymbolRef('kernel', cl.cl_kernel())
        ]
        params.extend(SymbolRef('buf%d' % d, cl.cl_mem())
                      for d in range(len(self.arg_cfg)))
        local_size = 4
        defn = [
            ArrayDef(
                SymbolRef('global', ct.c_ulong()), Constant(self.ndim),
                [Constant(d) for d in self.shape]
            ),
            ArrayDef(
                SymbolRef('local', ct.c_ulong()), Constant(self.ndim),
                [Constant(local_size) for _ in self.shape]
            )
        ]
        setargs = [clSetKernelArg(
                SymbolRef('kernel'), Constant(index),
                Constant(ct.sizeof(arg.ctype)),
                Ref(SymbolRef('buf%d' % index))
            ) for index, arg in enumerate(self.arg_cfg)]
        defn.extend(setargs)
        enqueue_call = FunctionCall(SymbolRef('clEnqueueNDRangeKernel'), [
            SymbolRef('queue'), SymbolRef('kernel'),
            Constant(self.ndim), NULL(),
            SymbolRef('global'), SymbolRef('local'),
            Constant(0), NULL(), NULL()
        ])
        finish_call = FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')])
        defn.extend([enqueue_call, finish_call])
        header = StringTemplate("""
            #ifdef __APPLE__
            #include <OpenCL/opencl.h>
            #else
            #include <CL/cl.h>
            #endif
            """)
        node.params = params
        node.defn = defn
        self.fusable_nodes.append(
            KernelCall(node, self.project.files[-1].body[0], self.shape, defn[0],
                       tuple(local_size for _ in self.shape), defn[1], enqueue_call,
                       finish_call, setargs)
        )
        return [header, node]