Ejemplo n.º 1
0
    def generate_body(self, kernel, codegen_state):
        """Assemble the kernel body: temporary declarations, then generated code.

        The declarators of every entry in ``kernel.temporary_variables`` are
        emitted first, followed by an empty line and the AST produced by
        ``set_up_hw_parallel_loops(kernel, 0, codegen_state)``.

        Returns a tuple ``(body, implemented_domains)``: *body* is the
        :class:`cgen.Block` built here and *implemented_domains* is taken
        unchanged from the result of ``set_up_hw_parallel_loops``.
        """
        from cgen import Block, Line
        from loopy.codegen.loop import set_up_hw_parallel_loops

        body = Block()

        # {{{ declare temporaries

        declarators = []
        for temp_var in six.itervalues(kernel.temporary_variables):
            infos = temp_var.decl_info(
                    kernel.target, is_written=True,
                    index_dtype=kernel.index_dtype)
            for info in infos:
                declarators.append(info.cgen_declarator)

        body.extend(declarators)

        # }}}

        gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

        # Blank line between the declarations and the generated code.
        body.append(Line())

        # Splice the contents of a generated Block directly into the body
        # rather than nesting one Block inside another.
        inner_ast = gen_code.ast
        if not isinstance(inner_ast, Block):
            body.append(inner_ast)
        else:
            body.extend(inner_ast.contents)

        return body, gen_code.implemented_domains
Ejemplo n.º 2
0
def generate_host_or_device_program(codegen_state, schedule_index):
    """Build the codegen result for the schedule item at *schedule_index*.

    When generating device code, the item at *schedule_index* must be a
    ``CallKernel``; hardware-parallel loops are set up around a loop nest
    that is built starting at ``schedule_index + 1``. Otherwise the loop
    nest is built directly from *schedule_index*.

    If device code is being generated or ``codegen_state.is_entrypoint`` is
    set, the result is additionally merged with the top-of-body statements
    and temporary declarations and wrapped in a function definition; both
    the function definition and the body pass through
    ``ast_builder.process_ast``. Otherwise the bare loop-nest result is
    returned.
    """
    from functools import partial

    from loopy.codegen.control import build_loop_nest

    builder = codegen_state.ast_builder
    temporaries = builder.get_temporary_decls(codegen_state, schedule_index)

    if not codegen_state.is_generating_device_code:
        result = build_loop_nest(codegen_state, schedule_index)
    else:
        from loopy.schedule import CallKernel
        from loopy.codegen.loop import set_up_hw_parallel_loops

        assert isinstance(
                codegen_state.kernel.linearization[schedule_index], CallKernel)

        # Generation resumes with the item right after the CallKernel once
        # the hardware-parallel loops are in place.
        continue_after_call = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        result = set_up_hw_parallel_loops(
                codegen_state, schedule_index,
                next_func=continue_after_call)

    wrap_in_function = (
            codegen_state.is_generating_device_code
            or codegen_state.is_entrypoint)
    if not wrap_in_function:
        return result

    result = merge_codegen_results(
            codegen_state,
            builder.generate_top_of_body(codegen_state)
            + temporaries
            + [result],
            collapse=False)

    program = result.current_program(codegen_state)
    body_ast = program.ast

    fdecl_ast = builder.get_function_declaration(
            codegen_state, result, schedule_index)
    fdef_ast = builder.get_function_definition(
            codegen_state, result, schedule_index, fdecl_ast, body_ast)

    processed_program = program.copy(
            ast=builder.process_ast(fdef_ast),
            body_ast=builder.process_ast(body_ast))

    return result.with_new_program(codegen_state, processed_program)
Ejemplo n.º 3
0
def generate_host_or_device_program(codegen_state, schedule_index):
    """Build the codegen result for the schedule item at *schedule_index*.

    When generating device code, the item at *schedule_index* must be a
    ``CallKernel``; hardware-parallel loops are set up around a loop nest
    that is built starting at ``schedule_index + 1``. Otherwise the loop
    nest is built directly from *schedule_index*.

    In both cases the result is merged with the top-of-body statements and
    the temporary declarations, and the current program is replaced by one
    whose ``ast`` is the enclosing function definition and whose
    ``body_ast`` is the merged body.
    """
    from functools import partial

    from loopy.codegen.control import build_loop_nest

    builder = codegen_state.ast_builder
    temporaries = builder.get_temporary_decls(codegen_state, schedule_index)

    if not codegen_state.is_generating_device_code:
        result = build_loop_nest(codegen_state, schedule_index)
    else:
        from loopy.schedule import CallKernel
        from loopy.codegen.loop import set_up_hw_parallel_loops

        assert isinstance(
                codegen_state.kernel.schedule[schedule_index], CallKernel)

        # Generation resumes with the item right after the CallKernel once
        # the hardware-parallel loops are in place.
        continue_after_call = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        result = set_up_hw_parallel_loops(
                codegen_state, schedule_index,
                next_func=continue_after_call)

    result = merge_codegen_results(
            codegen_state,
            builder.generate_top_of_body(codegen_state)
            + temporaries
            + [result],
            collapse=False)

    program = result.current_program(codegen_state)
    body_ast = program.ast

    fdecl_ast = builder.get_function_declaration(
            codegen_state, result, schedule_index)
    fdef_ast = builder.get_function_definition(
            codegen_state, result, schedule_index, fdecl_ast, body_ast)

    wrapped_program = program.copy(ast=fdef_ast, body_ast=body_ast)

    return result.with_new_program(codegen_state, wrapped_program)
Ejemplo n.º 4
0
    def generate_body(self, kernel, codegen_state):
        """Assemble the kernel body: temporary declarations, then generated code.

        Temporaries are visited in order of their name. A temporary without
        ``base_storage`` gets its own declaration (an array declaration when
        its shape is non-empty). A temporary with ``base_storage`` is declared
        as a ``*const restrict`` pointer initialized to an address inside a
        shared ``np.int8`` array named after the base storage; those arrays
        are declared first, sized to the largest ``nbytes`` and aligned to
        the largest alignment among the temporaries that use them.

        After the declarations come an empty line and the AST produced by
        ``set_up_hw_parallel_loops(kernel, 0, codegen_state)``.

        Returns a tuple ``(body, implemented_domains)``: *body* is the
        :class:`cgen.Block` built here and *implemented_domains* is taken
        unchanged from the result of ``set_up_hw_parallel_loops``.

        NOTE(review): ``six`` and ``np`` are not imported inside this method;
        presumably they are module-level imports -- confirm.
        """
        from cgen import Block
        body = Block()

        # Declarations of the individual temporaries. They are added to
        # *body* only after the base storage arrays they may point into.
        temp_decls = []

        # {{{ declare temporaries

        # Per base storage name: one list entry per temporary using it.
        base_storage_sizes = {}
        base_storage_to_is_local = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute
        from loopy.codegen import POD  # uses the correct complex type

        # Pointer declarator that renders as "*const restrict <name>".
        class ConstRestrictPointer(Pointer):
            def get_decl_pair(self):
                sub_tp, sub_decl = self.subdecl.get_decl_pair()
                return sub_tp, ("*const restrict %s" % sub_decl)

        # Sorting by name makes the order of the emitted declarations
        # independent of the iteration order of the dictionary.
        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                # Stand-alone temporary: plain declaration per decl_info entry.
                for idi in decl_info:
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    if idi.shape:
                        # Array size is the product of the shape entries,
                        # spelled out as an expression string.
                        temp_var_decl = ArrayOf(temp_var_decl,
                                " * ".join(str(s) for s in idi.shape))

                    temp_decls.append(
                            self.wrap_temporary_decl(temp_var_decl, tv.is_local))

            else:
                # Temporary living inside a shared base storage array.
                # NOTE(review): the offset restarts at 0 for every temporary,
                # so it only separates the decl_info entries of one
                # temporary from each other -- confirm this is intended.
                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_is_local.setdefault(tv.base_storage, []).append(
                        tv.is_local)

                # Alignment starts at the item size and is multiplied by the
                # length of every axis tagged as a vector axis.
                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    # cast_decl is nameless: it is only used to render the
                    # type of the cast in the initializer below.
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.is_local)

                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to the
                    # other. That counts, right?

                    cast_decl = ConstRestrictPointer(cast_decl)
                    temp_var_decl = ConstRestrictPointer(temp_var_decl)

                    # Initialize the pointer with
                    # "(<cast type>) (<base storage name> + <offset>)".
                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    # Advance past this entry: item size times entry count.
                    from pytools import product
                    offset += (
                            idi.dtype.itemsize
                            * product(si for si in idi.shape))

        # Declare one np.int8 array per base storage name, in sorted order.
        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = POD(self, np.int8, bs_name)
            # NOTE(review): base_storage_to_is_local[bs_name] is a list of
            # flags, whereas the other wrap_temporary_decl calls in this
            # method pass a single tv.is_local value -- confirm that
            # wrap_temporary_decl handles a list as intended.
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl, base_storage_to_is_local[bs_name])
            # Large enough for the biggest temporary sharing this storage.
            bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))

            # Use the strictest alignment requested by any of its users.
            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            body.append(bs_var_decl)

        # Base storage arrays first, then the temporaries that refer to them.
        body.extend(temp_decls)

        # }}}

        from loopy.codegen.loop import set_up_hw_parallel_loops
        gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

        # Blank line between the declarations and the generated code.
        from cgen import Line
        body.append(Line())

        # Splice the contents of a generated Block directly into the body
        # rather than nesting one Block inside another.
        if isinstance(gen_code.ast, Block):
            body.extend(gen_code.ast.contents)
        else:
            body.append(gen_code.ast)

        return body, gen_code.implemented_domains