コード例 #1
0
ファイル: ocl.py プロジェクト: sinayoko/stencil_code
def gen_decls(dim, ghost_depth):
    thread_id = get_local_id(dim - 1)
    num_threads = get_local_size(dim - 1)
    block_size = Add(
        get_local_size(dim - 1),
        Constant(ghost_depth[dim - 1] * 2)
    )
    for d in reversed(range(0, dim - 1)):
        base = get_local_size(dim - 1)
        for s in range(d, dim - 2):
            base = Mul(get_local_size(s + 1), base)

        thread_id = Add(
            Mul(get_local_id(d), base),
            thread_id
        )
        num_threads = Mul(get_local_size(d), num_threads)
        block_size = Mul(
            Add(get_local_size(d), Constant(ghost_depth[d] * 2)),
            block_size
        )
    return thread_id, num_threads, block_size
コード例 #2
0
ファイル: ocl.py プロジェクト: sinayoko/stencil_code
 def local_array_macro(self, point):
     dim = len(self.output_grid.shape)
     index = get_local_id(dim)
     for d in reversed(range(dim)):
         index = Add(
             Mul(
                 index,
                 Add(
                     get_local_size(d),
                     Constant(2 * self.ghost_depth[d])
                 ),
             ),
             point[d]
         )
     return FunctionCall(SymbolRef("local_array_macro"), point)
コード例 #3
0
ファイル: main.py プロジェクト: pyprogrammer/XorReduction
    def transform(self, tree, program_config):
        A = program_config[0]
        len_A = np.prod(A.shape)
        inner_type = A.dtype.type()
        pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
        apply_one = PyBasicConversions().visit(tree.body[0])
        apply_one.return_type = inner_type
        apply_one.params[0].type = inner_type
        apply_one.params[1].type = inner_type


        apply_kernel = FunctionDecl(None, "apply_kernel",
                                    params=[SymbolRef("A", pointer()).set_global(),
                                            SymbolRef("output_buf", pointer()).set_global(),
                                            SymbolRef("len", ct.c_int())
                                    ],
                                    defn=[
                                        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),                          # getting the group id for this work group
                                        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),                        # getting the global id for this work item
                                        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),                          # getting the local id for this work item
                                        For(Assign(SymbolRef('i', ct.c_int()), Constant(1)),                                # for(int i=1; i<WORK_GROUP_SIZE; i *= 2)
                                            Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)),                                  
                                            MulAssign(SymbolRef('i'), Constant(2)),
                                            [
                                                If(And(Eq(Mod(SymbolRef('globalId'), Mul(SymbolRef('i'), Constant(2))),     # if statement checks 
                                                          Constant(0)),
                                                       Lt(Add(SymbolRef('globalId'), SymbolRef('i')),
                                                          SymbolRef("len"))),
                                                   [
                                                       Assign(ArrayRef(SymbolRef('A'), SymbolRef('globalId')),
                                                              FunctionCall(SymbolRef('apply'),
                                                                           [
                                                                               ArrayRef(SymbolRef('A'),
                                                                                        SymbolRef('globalId')),
                                                                               ArrayRef(SymbolRef('A'),
                                                                                        Add(SymbolRef('globalId'),
                                                                                            SymbolRef('i')))
                                                                           ])),
                                                   ]
                                                ),
                                                FunctionCall(SymbolRef('barrier'), [SymbolRef('CLK_LOCAL_MEM_FENCE')])
                                            ]
                                        ),
                                        If(Eq(SymbolRef('localId'), Constant(0)),
                                           [
                                               Assign(ArrayRef(SymbolRef('output_buf'), SymbolRef('groupId')),
                                                      ArrayRef(SymbolRef('A'), SymbolRef('globalId')))
                                           ]
                                        )
                                    ]
        ).set_kernel()

        kernel = OclFile("kernel", [apply_one, apply_kernel])

        control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $n;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            for (int runs = 0; runs < $run_limit ; runs++){
                clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
                clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
                clSetKernelArg(kernel, 2, sizeof(intptr_t), &len);
                clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
                swap = buf;
                buf = out_buf;
                out_buf = swap;
                len  = len/local + (len % local != 0);
            }
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)),
              'length': Constant(len_A),
              'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE)))
        })

        proj = Project([kernel, CFile("generated", [control])])
        fn = ConcreteXorReduction()

        program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']

        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
コード例 #4
0
ファイル: ocl.py プロジェクト: sinayoko/stencil_code
    def visit_InteriorPointsLoop(self, node):
        dim = len(self.output_grid.shape)
        self.kernel_target = node.target
        condition = And(
            Lt(get_global_id(0),
               Constant(self.arg_cfg[0].shape[0] - self.ghost_depth[0])),
            GtE(get_global_id(0),
                Constant(self.ghost_depth[0]))
        )
        for d in range(1, len(self.arg_cfg[0].shape)):
            condition = And(
                condition,
                And(
                    Lt(get_global_id(d),
                       Constant(self.arg_cfg[0].shape[d] -
                                self.ghost_depth[d])),
                    GtE(get_global_id(d),
                        Constant(self.ghost_depth[d]))
                )
            )
        body = []

        self.macro_defns = [
            CppDefine("local_array_macro", ["d%d" % i for i in range(dim)],
                      self.gen_local_macro()),
            CppDefine("global_array_macro", ["d%d" % i for i in range(dim)],
                      self.gen_global_macro())
        ]
        body.extend(self.macro_defns)

        global_idx = 'global_index'
        self.output_index = global_idx
        body.append(Assign(SymbolRef('global_index', ct.c_int()),
                    self.gen_global_index()))

        self.load_mem_block = self.load_shared_memory_block(
            self.local_block, self.ghost_depth)
        body.extend(self.load_mem_block)
        body.append(FunctionCall(SymbolRef("barrier"),
                                 [SymbolRef("CLK_LOCAL_MEM_FENCE")]))
        for d in range(0, dim):
            body.append(Assign(SymbolRef('local_id%d' % d, ct.c_int()),
                               Add(get_local_id(d),
                                   Constant(self.ghost_depth[d]))))
            self.var_list.append("local_id%d" % d)

        for child in map(self.visit, node.body):
            if isinstance(child, list):
                self.stencil_op.extend(child)
            else:
                self.stencil_op.append(child)

        conditional = None
        for dim in range(len(self.output_grid.shape)):
            if self.virtual_global_size[dim] != self.global_size[dim]:
                if conditional is None:
                    conditional = Lt(get_global_id(dim),
                                     Constant(self.global_size[dim]))
                else:
                    conditional = And(conditional,
                                      Lt(get_global_id(dim),
                                         Constant(self.global_size[dim])))

        if conditional is not None:
            body.append(If(conditional, self.stencil_op))
        else:
            body.extend(self.stencil_op)

        # this does not help fix the failure
        # body.append(FunctionCall(SymbolRef("barrier"),
        #                          [SymbolRef("CLK_GLOBAL_MEM_FENCE")]))
        # body.extend(self.stencil_op)
        #
        # this line does seem to fix the problem, seems to suggest some timing
        # issue
        #
        # body.append(If(conditional,
        #                [StringTemplate("out_grid[global_index]+=0;")]))
        #
        # the following fixes the problem too, suggests timing issues
        #
        # body.append(FunctionCall(SymbolRef("printf"), [String("gid %d\\n"),
        #                                                SymbolRef("global_index")]))
        # from ctree.ocl.macros import get_group_id
        # body.append(
        #     FunctionCall(
        #         SymbolRef("printf"),
        #         [
        #             String("group_id %2d %2d gid %2d %2d %2d\\n"),
        #             get_global_id(0),
        #             get_group_id(1),
        #             get_global_id(0),
        #             get_global_id(1),
        #             SymbolRef('global_index'),
        #         ]
        #     )
        # )
        return body
コード例 #5
0
    def transform(self, tree, program_config):
        dirname = self.config_to_dirname(program_config)
        A = program_config[0]
        len_A = np.prod(A.shape)
        data_type = get_c_type_from_numpy_dtype(A.dtype)        # Get the ctype class for the data type for the parameters
        pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
        apply_one = PyBasicConversions().visit(tree).find(FunctionDecl)
          
        apply_one.name = 'apply'                                # Naming our kernel method

        # Assigning a data_type instance for the  #
        # return type, and the parameter types... #
        apply_one.return_type = data_type()                     
        apply_one.params[0].type = data_type()
        apply_one.params[1].type = data_type()

        responsible_size = int(len_A / WORK_GROUP_SIZE)         # Get the appropriate number of threads for parallelizing
        
        # Creating our controller function (called "apply_kernel") to control #
        # the parallelizing of our computation, using ctree syntax...         #
        apply_kernel = FunctionDecl(None, "apply_kernel",
                                    params=[SymbolRef("A", pointer()).set_global(),
                                            SymbolRef("output_buf", pointer()).set_global(),
                                            SymbolRef("localData", pointer()).set_local()
                                    ],
                                    defn=[
                                        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
                                        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
                                        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
                                        Assign(SymbolRef('localResult', (ct.c_int() if A.dtype is np.int32 else ct.c_float())),
                                               ArrayRef(SymbolRef('A'), SymbolRef('globalId'))
                                               ),
                                        For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)), Lt(SymbolRef('offset'), Constant(responsible_size)),
                                            PostInc(SymbolRef('offset')),
                                            [
                                                Assign(SymbolRef('localResult'),
                                                       FunctionCall(apply_one.name, [SymbolRef('localResult'),
                                                                              ArrayRef(SymbolRef('A'),
                                                                                       Add(SymbolRef('globalId'),
                                                                                           Mul(SymbolRef('offset'),
                                                                                               Constant(WORK_GROUP_SIZE))))])
                                                       ),
                                            ]
                                        ),
                                            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                                                SymbolRef('localResult')
                                               ),
                                            barrier(CLK_LOCAL_MEM_FENCE()),
                                        If(Eq(SymbolRef('globalId'), Constant(0)),
                                           [
                                                Assign(SymbolRef('localResult'), FunctionCall(SymbolRef(apply_one.name), [SymbolRef('localResult'),
                                                                                                                   ArrayRef(SymbolRef('localData'),Constant(x))]))
                                                for x in range(1, WORK_GROUP_SIZE)
                                           ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))]
                                        )
                                    ]
        ).set_kernel()

        # Hardcoded OpenCL code to compensate to begin execution of parallelized computation 
        control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2),
              'length': Constant(len_A),
        })

        ocl_kernel = OclFile("kernel", [apply_one, apply_kernel])
        c_controller = CFile("generated", [control])
        return [ocl_kernel, c_controller]
コード例 #6
0
    def transform(self, tree, program_config):
        A = program_config[0]
        len_A = np.prod(A.shape)
        inner_type = get_c_type_from_numpy_dtype(A.dtype)()
        pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
        apply_one = PyBasicConversions().visit(tree.body[0])
        apply_one.return_type = inner_type
        apply_one.params[0].type = inner_type
        apply_one.params[1].type = inner_type
        responsible_size = int(len_A / WORK_GROUP_SIZE)
        apply_kernel = FunctionDecl(None, "apply_kernel",
                                    params=[SymbolRef("A", pointer()).set_global(),
                                            SymbolRef("output_buf", pointer()).set_global(),
                                            SymbolRef("localData", pointer()).set_local()
                                    ],
                                    defn=[
                                        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
                                        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
                                        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
                                        Assign(SymbolRef('localResult', ct.c_int()),
                                               ArrayRef(SymbolRef('A'), SymbolRef('globalId'))
                                               )
                                        ] +
                                        [Assign(SymbolRef('localResult'),
                                                FunctionCall(SymbolRef('apply'),
                                                             [SymbolRef('localResult'), ArrayRef(SymbolRef('A'),Add(SymbolRef('globalId'), Constant(i * WORK_GROUP_SIZE)))]))
                                            for i in range(1, responsible_size)] +
                                        [
                                            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                                                SymbolRef('localResult')
                                               ),
                                            barrier(CLK_LOCAL_MEM_FENCE()),
                                        If(Eq(SymbolRef('globalId'), Constant(0)),
                                           [
                                                Assign(SymbolRef('localResult'), FunctionCall(SymbolRef('apply'), [SymbolRef('localResult'),
                                                                                                                   ArrayRef(SymbolRef('localData'),Constant(x))]))
                                                for x in range(1, WORK_GROUP_SIZE)
                                           ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))]
                                        )
                                    ]
        ).set_kernel()

        kernel = OclFile("kernel", [apply_one, apply_kernel])

        control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2),
              'length': Constant(len_A)
        })

        c_controller = CFile("generated", [control])
        return [kernel, c_controller]