Ejemplo n.º 1
0
    def __init__(self, halo, grid, dimension, in_grid_name="in_grid", out_grid_name="out_grid", device=None):
        """

        :param halo: the halo shape
        :param grid: the shape of the grid that stencil is to be applied to
        :param dimension: the dimension this kernel applies to
        :return:
        """
        self.halo = tuple(halo)
        self.grid = grid
        self.shape = grid.shape
        self.in_grid_name = in_grid_name
        self.out_grid_name = out_grid_name

        # check for some pathologies and raise exception if any are present
        if len(halo) != len(self.shape):
            raise StencilException("halo {} can't apply to grid shape {}".format(self.halo, self.shape))

        if dimension < 0 or dimension >= len(self.shape):
            raise StencilException("dimension {} too big for grid shape {}".format(dimension, self.shape))

        # halo or grid to small
        if any([x < 1 or y < 1 for x, y in zip(self.halo, self.shape)]):
            raise StencilException(
                "halo {} can't be bigger than grid {} in any dimension".format(self.halo, self.shape)
            )
        # no interior points in a dimension
        if any([s <= 2*h for h, s in zip(self.halo, self.shape)]):
            raise StencilException(
                "halo {} can't span grid shape {} in any dimension".format(self.halo, self.shape)
            )

        self.dimension = dimension
        self.dimensions = len(grid.shape)

        self.device = device

        self.global_size, self.global_offset = self.compute_global_size()
        lcs = LocalSizeComputer(self.global_size, device=device)
        self.local_size = lcs.compute_local_size_thin()
        self.virtual_global_size = lcs.compute_virtual_global_size(self.local_size)

        self.kernel_name = OclBoundaryCopier.kernel_name(self.dimension)
    def test_local_size_computer_bulky(self):
        test_grid_shape = [4, 128]
        lsc = LocalSizeComputer(test_grid_shape, MockIrisPro)
        local_size = lsc.compute_local_size_bulky()
        print("{} ls {}".format(test_grid_shape, local_size))

        sizes = [
            [[4], (4,), (4,)],
            [[5], (5,), (5,)],
            [[255], (255,), (255,)],
            [[1023], (1023,), (341,)],
            [[1024], (1024,), (512,)],
            [[1025], (205,), (205,)],

            [[1, 4], (1, 1), (1, 4)],
            [[4, 1], (4, 1), (4, 1)],
            [[4, 4], (4, 1), (4, 4)],
            [[4, 128], (4, 1), (4, 128)],
            [[128, 4], (128, 1), (32, 4)],
            [[128, 7], (128, 1), (32, 7)],
            [[128, 128], (128, 1), (16, 32)],

            [[4, 4, 4], (4, 1, 1), (4, 4, 4)],
            [[4, 4, 512], (4, 1, 1), (4, 4, 32)],
            [[512, 512, 4], (512, 1, 1), (8, 8, 4)],
            [[512, 512, 512], (512, 1, 1), (8, 8, 8)],

            [[3, 3, 666], (3, 1, 1), (3, 3, 37)],
            [[99, 99, 99], (99, 1, 1), (3, 11, 11)],
        ]
        for grid_shape, expected_cpu_local_size, expected_gpu_local_size in sizes:
            print("size {!s:16s}".format(grid_shape), end="")
            cpu_local_size = LocalSizeComputer(grid_shape, MockCPU).compute_local_size_bulky()
            gpu_local_size = LocalSizeComputer(grid_shape, MockIrisPro).compute_local_size_bulky()

            print(" d0 cpu local_size {!s:15s} gpu local_size {!s:15s}".format(cpu_local_size, gpu_local_size))

            self.assertListEqual(list(cpu_local_size), list(expected_cpu_local_size))
            self.assertListEqual(list(gpu_local_size), list(expected_gpu_local_size))
Ejemplo n.º 3
0
    def visit_FunctionDecl(self, node):
        # This function grabs the input and output grid names which are used to
        self.local_block = SymbolRef.unique()
        # generate the proper array macros.
        arg_cfg = self.arg_cfg

        global_size = arg_cfg[0].shape

        if self.testing:
            local_size = (1, 1, 1)
        else:
            desired_device_number = -1
            device = cl.clGetDeviceIDs()[desired_device_number]
            lcs = LocalSizeComputer(global_size, device)
            local_size = lcs.compute_local_size_bulky()
            virtual_global_size = lcs.compute_virtual_global_size(local_size)
            self.global_size = global_size
            self.local_size = local_size
            self.virtual_global_size = virtual_global_size

        super(StencilOclTransformer, self).visit_FunctionDecl(node)
        for index, param in enumerate(node.params[:-1]):
            # TODO: Transform numpy type to ctype
            param.type = ct.POINTER(ct.c_float)()
            param.set_global()
            param.set_const()
        node.set_kernel()
        node.params[-1].set_global()
        node.params[-1].type = ct.POINTER(ct.c_float)()
        node.params.append(SymbolRef(self.local_block.name,
                                     ct.POINTER(ct.c_float)()))
        node.params[-1].set_local()
        node.defn = node.defn[0]

        # if boundary handling is copy we have to generate a collection of
        # boundary kernels to handle the on-gpu boundary copy
        if self.is_copied:
            device = cl.clGetDeviceIDs()[-1]
            self.boundary_handlers = boundary_kernel_factory(
                self.ghost_depth, self.output_grid,
                node.params[0].name,
                node.params[-2].name,  # second last parameter is output
                device
            )
            boundary_kernels = [
                FunctionDecl(
                    name=boundary_handler.kernel_name,
                    params=node.params,
                    defn=boundary_handler.generate_ocl_kernel_body(),
                )
                for boundary_handler in self.boundary_handlers
            ]

            self.project.files.append(OclFile('kernel', [node]))

            for dim, boundary_kernel in enumerate(boundary_kernels):
                boundary_kernel.set_kernel()
                self.project.files.append(OclFile(kernel_dim_name(dim),
                                                  [boundary_kernel]))

            self.boundary_kernels = boundary_kernels

            # ctree.browser_show_ast(node)
            # import ctree
            # ctree.browser_show_ast(boundary_kernels[0])
        else:
            self.project.files.append(OclFile('kernel', [node]))

        # print(self.project.files[0])
        # print(self.project.files[-1])

        defn = [
            ArrayDef(
                SymbolRef('global', ct.c_ulong()), arg_cfg[0].ndim,
                [Constant(d) for d in self.virtual_global_size]
            ),
            ArrayDef(
                SymbolRef('local', ct.c_ulong()), arg_cfg[0].ndim,
                [Constant(s) for s in local_size]
                # [Constant(s) for s in [512, 512]]  # use this line to force a
                # opencl local size error
            ),
            Assign(SymbolRef("error_code", ct.c_int()), Constant(0)),
        ]
        setargs = [clSetKernelArg(
            SymbolRef('kernel'), Constant(d),
            FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]),
            Ref(SymbolRef('buf%d' % d))
        ) for d in range(len(arg_cfg) + 1)]
        from functools import reduce
        import operator
        local_mem_size = reduce(
            operator.mul,
            (size + 2 * self.kernel.ghost_depth[index]
             for index, size in enumerate(local_size)),
            ct.sizeof(cl.cl_float())
        )
        setargs.append(
            clSetKernelArg(
                'kernel', len(arg_cfg) + 1,
                local_mem_size,
                NULL()
            )
        )

        defn.extend(setargs)
        enqueue_call = FunctionCall(SymbolRef('clEnqueueNDRangeKernel'), [
            SymbolRef('queue'), SymbolRef('kernel'),
            Constant(self.kernel.dim), NULL(),
            SymbolRef('global'), SymbolRef('local'),
            Constant(0), NULL(), NULL()
        ])

        defn.extend(check_ocl_error(enqueue_call, "clEnqueueNDRangeKernel"))

        params = [
            SymbolRef('queue', cl.cl_command_queue()),
            SymbolRef('kernel', cl.cl_kernel())
        ]
        if self.is_copied:
            for dim, boundary_kernel in enumerate(self.boundary_kernels):
                defn.extend([
                    ArrayDef(
                        SymbolRef(global_for_dim_name(dim), ct.c_ulong()),
                        arg_cfg[0].ndim,
                        [Constant(d)
                         for d in self.boundary_handlers[dim].global_size]
                    ),
                    ArrayDef(
                        SymbolRef(local_for_dim_name(dim), ct.c_ulong()),
                        arg_cfg[0].ndim,
                        [Constant(s) for s in
                         self.boundary_handlers[dim].local_size]
                    )
                ])
                setargs = [clSetKernelArg(
                    SymbolRef(kernel_dim_name(dim)), Constant(d),
                    FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]),
                    Ref(SymbolRef('buf%d' % d))
                ) for d in range(len(arg_cfg) + 1)]
                setargs.append(
                    clSetKernelArg(
                        SymbolRef(kernel_dim_name(dim)), len(arg_cfg) + 1,
                        local_mem_size,
                        NULL()
                    )
                )
                defn.extend(setargs)

                enqueue_call = FunctionCall(
                    SymbolRef('clEnqueueNDRangeKernel'), [
                        SymbolRef('queue'), SymbolRef(kernel_dim_name(dim)),
                        Constant(self.kernel.dim), NULL(),
                        SymbolRef(global_for_dim_name(dim)),
                        SymbolRef(local_for_dim_name(dim)),
                        Constant(0), NULL(), NULL()
                    ]
                )
                defn.append(enqueue_call)

                params.extend([
                    SymbolRef(kernel_dim_name(dim), cl.cl_kernel())
                ])

        # finish_call = FunctionCall(SymbolRef('clFinish'),
        # [SymbolRef('queue')])
        # defn.append(finish_call)
        # finish_call = [
        #     Assign(
        #         SymbolRef("error_code", ct.c_int()),
        #         FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')])
        #     ),
        #     If(
        #         NotEq(SymbolRef("error_code"), Constant(0)),
        #         FunctionCall(
        #             SymbolRef("printf"),
        #             [
        #                 String("OPENCL KERNEL RETURNED ERROR CODE %d"),
        #                 SymbolRef("error_code")
        #             ]
        #         )
        #     )
        # ]

        finish_call = check_ocl_error(
            FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]),
            "clFinish"
        )
        defn.extend(finish_call)
        defn.append(Return(SymbolRef("error_code")))

        params.extend(SymbolRef('buf%d' % d, cl.cl_mem())
                      for d in range(len(arg_cfg) + 1))

        control = FunctionDecl(ct.c_int32(), "stencil_control",
                               params=params,
                               defn=defn)

        return control