Esempio n. 1
0
    def get_temporary_decls(self, codegen_state, schedule_state):
        """Return host statements that allocate the kernel's global temporaries.

        For every temporary reported by ``self._get_global_temporaries`` an
        ``allocator(<nbytes>)`` assignment is emitted, followed by an
        assignment of ``_global_temporaries`` to a list of all their names.
        An empty list is returned when there are no global temporaries.

        :arg codegen_state: passed on to the expression-to-code mapper and to
            ``self._get_global_temporaries``.
        :arg schedule_state: not used by this implementation.
        :returns: a list of :mod:`genpy` nodes.
        """
        from genpy import Assign, Comment, Line
        from pymbolic.mapper.stringifier import PREC_NONE

        def size_in_bytes(temp):
            # item size times the product of all shape entries
            from functools import reduce
            from operator import mul
            itemsize = temp.dtype.numpy_dtype.itemsize
            return itemsize * reduce(mul, temp.shape, 1)

        ecm = self.get_expression_to_code_mapper(codegen_state)

        global_temporaries = self._get_global_temporaries(codegen_state)
        if not global_temporaries:
            return []

        stmts = [Comment("{{{ allocate global temporaries"), Line()]

        # one allocation per temporary
        for temp in global_temporaries:
            nbytes_code = ecm(size_in_bytes(temp), PREC_NONE, "i")
            stmts.append(Assign(temp.name, "allocator(%s)" % nbytes_code))

        # record every allocated name in a single list
        joined_names = ", ".join(temp.name for temp in global_temporaries)
        stmts.append(
            Assign("_global_temporaries", "[{tvs}]".format(tvs=joined_names)))

        stmts.extend([Line(), Comment("}}}"), Line()])
        return stmts
Esempio n. 2
0
    def get_temporary_decls(self, codegen_state, schedule_state):
        """Return host statements that allocate the kernel's global temporaries.

        Temporaries of ``codegen_state.kernel`` whose scope is
        ``temp_var_scope.GLOBAL`` are processed in order of their names. Each
        gets an ``allocator(<nbytes>)`` assignment, and ``_global_temporaries``
        is assigned the list of their names. When there are none,
        ``_global_temporaries`` is still assigned an empty list.

        :arg schedule_state: not used by this implementation.
        :returns: a list of :mod:`genpy` nodes.
        """
        from genpy import Assign, Comment, Line

        def size_in_bytes(temp):
            # item size times the product of all shape entries
            from six.moves import reduce
            from operator import mul
            itemsize = temp.dtype.numpy_dtype.itemsize
            return itemsize * reduce(mul, temp.shape, 1)

        from loopy.kernel.data import temp_var_scope

        # collect the global-scope temporaries, ordered by name
        global_temporaries = [
            tv
            for tv in six.itervalues(codegen_state.kernel.temporary_variables)
            if tv.scope == temp_var_scope.GLOBAL]
        global_temporaries.sort(key=lambda tv: tv.name)

        from pymbolic.mapper.stringifier import PREC_NONE
        ecm = self.get_expression_to_code_mapper(codegen_state)

        if not global_temporaries:
            return [Assign("_global_temporaries", "[]"), Line()]

        stmts = [Comment("{{{ allocate global temporaries"), Line()]

        for temp in global_temporaries:
            nbytes_code = ecm(size_in_bytes(temp), PREC_NONE, "i")
            stmts.append(Assign(temp.name, "allocator(%s)" % nbytes_code))

        joined_names = ", ".join(temp.name for temp in global_temporaries)
        stmts.append(
            Assign("_global_temporaries", "[{tvs}]".format(tvs=joined_names)))

        stmts.extend([Line(), Comment("}}}"), Line()])
        return stmts
Esempio n. 3
0
    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
        """Return the host code that sets arguments for and enqueues *name*.

        The generated suite looks the kernel up as ``_lpy_cl_kernels.<name>``,
        asserts its argument count, emits the value- and array-argument setup
        code, enqueues the kernel with ``enqueue_nd_range_kernel`` and rebinds
        ``wait_for`` to a list holding the resulting event.

        :arg gsize: global size; replaced by ``(1,)`` when empty.
        :arg lsize: local size; replaced by ``(1,)`` when empty.
        :arg extra_args: appended to ``codegen_state.implemented_data_info``
            to form the full argument list.
        :returns: a :class:`genpy.Suite`.
        """
        ecm = self.get_expression_to_code_mapper(codegen_state)

        # fall back to a one-entry size when none was supplied
        gsize = gsize or (1, )
        lsize = lsize or (1, )

        all_args = codegen_state.implemented_data_info + extra_args

        value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
            generate_value_arg_setup(
                codegen_state.kernel, [self.target.device], all_args)
        arry_arg_code = generate_array_arg_setup(
            codegen_state.kernel, all_args, arg_idx_to_cl_arg_idx)

        from genpy import Suite, Assign, Assert, Line, Comment
        from pymbolic.mapper.stringifier import PREC_NONE

        import pyopencl.version as cl_ver
        if cl_ver.VERSION < (2020, 2):
            from warnings import warn
            warn("Your kernel invocation will likely fail because your "
                 "version of PyOpenCL does not support allow_empty_ndrange. "
                 "Please upgrade to version 2020.2 or newer.")

        enqueue_template = (
            "%(pyopencl_module_name)s.enqueue_nd_range_kernel("
            "queue, _lpy_knl, "
            "%(gsize)s, %(lsize)s, "
            # using positional args because pybind is slow with kwargs
            "None, "  # offset
            "wait_for, "
            "True, "  # g_times_l
            "True, "  # allow_empty_ndrange
            ")")
        enqueue_fields = dict(
            pyopencl_module_name=self.target.pyopencl_module_name,
            gsize=ecm(gsize, prec=PREC_NONE, type_context="i"),
            lsize=ecm(lsize, prec=PREC_NONE, type_context="i"))

        # TODO: Generate finer-grained dependency structure
        statements = [
            Comment("{{{ enqueue %s" % name),
            Line(),
            Assign("_lpy_knl", "_lpy_cl_kernels." + name),
            Assert("_lpy_knl.num_args == %d" % cl_arg_count),
            Line(),
            value_arg_code,
            arry_arg_code,
            Assign("_lpy_evt", enqueue_template % enqueue_fields),
            # later enqueues wait on the event produced here
            Assign("wait_for", "[_lpy_evt]"),
            Line(),
            Comment("}}}"),
            Line(),
        ]
        return Suite(statements)
Esempio n. 4
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        """Return assignments that create numpy arrays for shaped temporaries.

        Every temporary variable of ``codegen_state.kernel`` that has a
        non-empty shape gets a statement of the form
        ``<name> = _lpy_np.empty(<shape>, dtype=_lpy_np.<dtype>)``.
        Temporaries are processed in order of their names; temporaries with a
        falsy shape produce no statement.

        :arg schedule_index: not used by this implementation.
        :returns: a list of :class:`genpy.Assign` nodes (possibly empty).
        """
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        from pymbolic.mapper.stringifier import PREC_NONE
        from genpy import Assign

        result = []

        for tv in sorted(
                kernel.temporary_variables.values(),
                key=lambda tv: tv.name):
            if not tv.shape:
                continue

            dtype_name = tv.dtype.numpy_dtype.name
            if dtype_name == "bool":
                # The dtype name "bool" cannot be used as an attribute of
                # the numpy module across versions. Emit "bool_", which is
                # available in every NumPy release; the previously used
                # alias "bool8" was deprecated in NumPy 1.24 and removed
                # in NumPy 2.0.
                dtype_name = "bool_"

            result.append(
                Assign(
                    tv.name,
                    "_lpy_np.empty(%s, dtype=%s)"
                    % (
                        ecm(tv.shape, PREC_NONE, "i"),
                        "_lpy_np." + dtype_name)))

        return result
Esempio n. 5
0
    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
        """Return the host code that sets arguments for and enqueues *name*.

        The generated suite looks the kernel up as ``_lpy_cl_kernels.<name>``,
        asserts its argument count, emits the value- and array-argument setup
        code, enqueues the kernel with ``enqueue_nd_range_kernel`` and rebinds
        ``wait_for`` to a list holding the resulting event.

        :arg gsize: global size; replaced by ``(1,)`` when empty.
        :arg lsize: local size; replaced by ``(1,)`` when empty.
        :arg extra_args: appended to ``codegen_state.implemented_data_info``
            to form the full argument list.
        :returns: a :class:`genpy.Suite`.
        """
        ecm = self.get_expression_to_code_mapper(codegen_state)

        # fall back to a one-entry size when none was supplied
        gsize = gsize or (1,)
        lsize = lsize or (1,)

        all_args = codegen_state.implemented_data_info + extra_args

        value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
            generate_value_arg_setup(
                codegen_state.kernel, [self.target.device], all_args)
        arry_arg_code = generate_array_arg_setup(
            codegen_state.kernel, all_args, arg_idx_to_cl_arg_idx)

        from genpy import Suite, Assign, Assert, Line, Comment
        from pymbolic.mapper.stringifier import PREC_NONE

        enqueue_template = (
            "%(pyopencl_module_name)s.enqueue_nd_range_kernel("
            "queue, _lpy_knl, "
            "%(gsize)s, %(lsize)s,  wait_for=wait_for, g_times_l=True)")
        enqueue_fields = dict(
            pyopencl_module_name=self.target.pyopencl_module_name,
            gsize=ecm(gsize, prec=PREC_NONE, type_context="i"),
            lsize=ecm(lsize, prec=PREC_NONE, type_context="i"))

        # TODO: Generate finer-grained dependency structure
        statements = [
            Comment("{{{ enqueue %s" % name),
            Line(),
            Assign("_lpy_knl", "_lpy_cl_kernels."+name),
            Assert("_lpy_knl.num_args == %d" % cl_arg_count),
            Line(),
            value_arg_code,
            arry_arg_code,
            Assign("_lpy_evt", enqueue_template % enqueue_fields),
            # later enqueues wait on the event produced here
            Assign("wait_for", "[_lpy_evt]"),
            Line(),
            Comment("}}}"),
            Line(),
        ]
        return Suite(statements)
Esempio n. 6
0
    def emit_assignment(self, codegen_state, insn):
        """Return a :class:`genpy.Assign` node for the instruction *insn*.

        Both ``insn.assignee`` and ``insn.expression`` are turned into code
        with ``codegen_state.expression_to_code_mapper``, using no type
        context.

        :raises NotImplementedError: if ``insn.atomicity`` is truthy.
        """
        to_code = codegen_state.expression_to_code_mapper

        # atomic updates are not supported by this target
        if insn.atomicity:
            raise NotImplementedError("atomic ops in Python")

        from pymbolic.mapper.stringifier import PREC_NONE
        from genpy import Assign

        lhs_code = to_code(insn.assignee, prec=PREC_NONE, type_context=None)
        rhs_code = to_code(insn.expression, prec=PREC_NONE, type_context=None)
        return Assign(lhs_code, rhs_code)
Esempio n. 7
0
    def get_function_definition(self, codegen_state, codegen_result,
                                schedule_index, function_decl, function_body):
        """Wrap *function_body* in the generated host-side Python function.

        The function is named after the current program of *codegen_result*.
        Its parameters are ``_lpy_cl_kernels``, ``queue``, the names of all
        implemented data entries that are not temporary variables, and the
        defaulted ``wait_for=None`` and ``allocator=None``.

        The emitted body imports its helpers, installs a default allocator
        when none was passed, runs *function_body*, calls ``release()`` on
        each entry of ``_global_temporaries`` and returns ``_lpy_evt``.

        :arg schedule_index: not used by this implementation.
        :arg function_decl: not used by this implementation.
        :returns: a :class:`genpy.Function`.
        """
        from loopy.kernel.data import TemporaryVariable

        # temporaries are created inside the function, not passed in
        arg_names = ["_lpy_cl_kernels", "queue"]
        for idi in codegen_state.implemented_data_info:
            if not issubclass(idi.arg_class, TemporaryVariable):
                arg_names.append(idi.name)
        arg_names.extend(["wait_for=None", "allocator=None"])

        from genpy import (For, Function, Suite, Import, ImportAs, Return,
                           FromImport, If, Assign, Line, Statement as S)

        function_name = codegen_result.current_program(codegen_state).name

        body = [
            FromImport("struct", ["pack as _lpy_pack"]),
            ImportAs("pyopencl", "_lpy_cl"),
            Import("pyopencl.tools"),
            Line(),
            If(
                "allocator is None",
                Assign("allocator",
                       "_lpy_cl_tools.DeferredAllocator(queue.context)")),
            Line(),
        ]
        body.extend([
            Line(),
            function_body,
            Line(),
        ])
        body.append(
            For(
                "_tv",
                "_global_temporaries",
                # free global temporaries
                S("_tv.release()")))
        body.extend([
            Line(),
            Return("_lpy_evt"),
        ])

        return Function(function_name, arg_names, Suite(body))
Esempio n. 8
0
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Generate host code that passes by-value arguments to ``_lpy_knl``.

    Walks *implemented_data_info* in order. Array arguments are only counted
    (each is assumed to occupy exactly one OpenCL argument slot); for every
    ``ValueArg`` the generated code optionally checks that the argument was
    supplied, casts integers to a Python integer, packs the value with
    ``_lpy_pack`` where needed and calls ``_lpy_knl.set_arg``.

    :arg kernel: the kernel; ``kernel.options.skip_arg_checks`` and
        ``kernel.name`` are read.
    :arg devices: a list of devices, entries may be *None*. Used to decide
        whether the struct-argument-count workaround is applied: it is
        enabled only if *all* devices are reported as affected; if only some
        are, a warning is issued for complex arguments instead.
    :arg implemented_data_info: sequence of entries providing ``arg_class``,
        ``name`` and ``dtype``.
    :returns: a tuple ``(code, arg_idx_to_cl_arg_idx, cl_arg_count)`` where
        *code* is a :class:`genpy.Suite`, *arg_idx_to_cl_arg_idx* maps the
        index of each entry of *implemented_data_info* to the index of its
        first OpenCL argument slot, and *cl_arg_count* is the total number of
        OpenCL argument slots.
    :raises TypeError: for a complex dtype other than complex64/complex128.
    :raises LoopyError: for a value argument whose dtype cannot be passed.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        # pyopencl without the characterization helper: assume no device
        # is affected
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                if dev is not None else False
                for dev in devices]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
                "may not be enabled when needed"
                .format(knl_name=kernel.name))

    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            # mixed set of devices: cannot pick one calling convention
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # number of floating-point argument slots consumed so far; a complex
    # argument counts as two
    fp_arg_count = 0

    from genpy import (
            Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(If("%s is None" % idi.name,
                Raise('RuntimeError("input argument \'{name}\' '
                        'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(Comment("cast to Python int to avoid trouble "
                "with struct packing or Boost.Python"))
            if sys.version_info < (3,):
                py_type = "long"
            else:
                py_type = "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            # composite values are handed to set_arg without packing
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled)".format(
                            knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # workaround: pass real and imaginary parts as two
                # separate arguments, consuming two OpenCL argument slots
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.real)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                        "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                        .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                # regular case: real and imaginary parts packed together
                # into a single argument
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}{arg_char}', "
                    "{arg_var}.real, {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            # pack using the dtype's own type character
            gen(S(
                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
                % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                    % idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
Esempio n. 9
0
 def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
     """Return a :class:`genpy.Assign` node binding *name* to *val_str*.

     *codegen_state*, *dtype* and *is_const* are accepted but not used.
     """
     from genpy import Assign

     initializer = Assign(name, val_str)
     return initializer
Esempio n. 10
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        """Return host statements that allocate the kernel's global temporaries.

        Temporaries that name a ``base_storage`` share one allocation per
        storage name, sized to the maximum ``nbytes`` of all temporaries
        using it; each such temporary is then bound to that storage name.
        Every other temporary gets its own ``allocator(<nbytes>)`` call.
        Finally ``_global_temporaries`` is assigned a list containing only
        the names that were actually obtained from ``allocator``, so that a
        shared storage appears in it once.

        :arg schedule_index: not used by this implementation.
        :returns: a list of :mod:`genpy` nodes; empty when
            ``self._get_global_temporaries`` reports no temporaries.
        """
        from genpy import Assign, Comment, Line
        from collections import defaultdict
        from numbers import Number
        import pymbolic.primitives as prim

        from pymbolic.mapper.stringifier import PREC_NONE
        ecm = self.get_expression_to_code_mapper(codegen_state)

        global_temporaries = self._get_global_temporaries(codegen_state)
        if not global_temporaries:
            return []

        # {{{ allocate space for the base_storage

        # storage name -> set of sizes requested by the temporaries using it
        base_storage_sizes = defaultdict(set)

        for tv in global_temporaries:
            if tv.base_storage:
                base_storage_sizes[tv.base_storage].add(tv.nbytes)

        # }}}

        allocated_var_names = []
        code_lines = []
        code_lines.append(Line())
        code_lines.append(Comment("{{{ allocate global temporaries"))
        code_lines.append(Line())

        for name, sizes in base_storage_sizes.items():
            if all(isinstance(s, Number) for s in sizes):
                # all sizes are plain numbers: take the maximum right away
                size = max(sizes)
            else:
                # at least one size is not a plain number: emit a Max
                # expression instead
                size = prim.Max(tuple(sizes))

            allocated_var_names.append(name)
            code_lines.append(
                Assign(name, f"allocator({ecm(size, PREC_NONE, 'i')})"))

        for tv in global_temporaries:
            if tv.base_storage:
                assert tv.base_storage in base_storage_sizes
                # bind to the shared storage allocated above
                code_lines.append(Assign(tv.name, tv.base_storage))
            else:
                nbytes_str = ecm(tv.nbytes, PREC_NONE, "i")
                allocated_var_names.append(tv.name)
                code_lines.append(Assign(tv.name, f"allocator({nbytes_str})"))

        code_lines.append(
            Assign(
                "_global_temporaries", "[{tvs}]".format(
                    tvs=", ".join(allocated_var_names))))

        code_lines.append(Line())
        code_lines.append(Comment("}}}"))
        code_lines.append(Line())

        return code_lines