Ejemplo n.º 1
0
    def __call__(self, kernel, codegen_result):
        """
        Build the Python wrapper function that invokes *kernel* on this
        execution target.

        :arg kernel: the loopy :class:`LoopKernel` to be executed
        :arg codegen_result: the loopy :class:`CodeGenerationResult` created
            by code generation

        :returns: the picklable Python callable produced by the function
            generator once all wrapper code has been emitted

        If ``kernel.options.write_wrapper`` is set, the generated source is
        also printed (when the option is exactly ``True``) or written to the
        file the option names, optionally syntax-highlighted.
        """

        options = kernel.options
        idi_list = codegen_result.implemented_data_info

        from loopy.kernel.data import KernelArgument

        # Signature: the target's system arguments followed by one optional
        # keyword per kernel argument.
        kernel_arg_decls = [
            "%s=None" % idi.name
            for idi in idi_list
            if issubclass(idi.arg_class, KernelArgument)]
        gen = PythonFunctionGenerator(
            "invoke_%s_loopy_kernel" % kernel.name,
            self.system_args + kernel_arg_decls)

        # Preamble: future import, target-specific lines, then host code,
        # each section followed by a blank line.
        gen.add_to_preamble("from __future__ import division")
        gen.add_to_preamble("")
        self.target_specific_preamble(gen)
        gen.add_to_preamble("")
        self.generate_host_code(gen, codegen_result)
        gen.add_to_preamble("")

        self.initialize_system_args(gen)

        # Emit the code that finds integer arguments, in the fixed order
        # shapes -> offsets -> strides, then the value-argument check.
        for find_integer_args in (
                self.generate_integer_arg_finding_from_shapes,
                self.generate_integer_arg_finding_from_offsets,
                self.generate_integer_arg_finding_from_strides):
            find_integer_args(gen, kernel, idi_list)
        self.generate_value_arg_check(gen, kernel, idi_list)

        call_args = self.generate_arg_setup(gen, kernel, idi_list, options)

        self.generate_invocation(
            gen, codegen_result.host_program.name, call_args, kernel, idi_list)
        self.generate_output_handler(gen, options, kernel, idi_list)

        wrapper_dest = options.write_wrapper
        if wrapper_dest:
            wrapper_source = gen.get()
            if options.highlight_wrapper:
                wrapper_source = get_highlighted_python_code(wrapper_source)

            # ``True`` means "dump to stdout"; any other truthy value is
            # used as the path of the file to write.
            if wrapper_dest is True:
                print(wrapper_source)
            else:
                with open(wrapper_dest, "w") as outf:
                    outf.write(wrapper_source)

        return gen.get_picklable_function()
Ejemplo n.º 2
0
    def _cache_kernel_stats(self, t_unit: lp.TranslationUnit, kwargs: dict) \
      -> tuple:
        """Compute and cache the kernel statistics of *t_unit* for *kwargs*.

        :arg t_unit: the translation unit whose default entrypoint is
            profiled
        :arg kwargs: the arguments the entrypoint is called with, by name

        :returns: the key under which the statistics are stored in
            ``self.kernel_stats[t_unit]``: one ``(name, value.shape)`` pair
            per argument that has a ``shape`` attribute and ``(name, value)``
            otherwise

        On a cache miss, a :class:`SingleCallKernelProfile` with ``time=0``
        is stored, and, if a log manager is present and has no
        ``"<entrypoint>_time"`` quantity yet, a :class:`KernelProfile`
        quantity is registered with it.
        """
        args_tuple = tuple(
            (key, value.shape) if hasattr(value, "shape") else (key, value)
            for key, value in kwargs.items())

        # Cache hit: nothing to do.  The check is an explicit lookup rather
        # than an ``except KeyError`` around the computation, so that errors
        # raised below are not reported as chained to a spurious KeyError.
        if args_tuple in self.kernel_stats.get(t_unit, {}):
            return args_tuple

        ep_name = t_unit.default_entrypoint.name
        executor = t_unit.target.get_kernel_executor(t_unit,
                                                     self.queue,
                                                     entrypoint=ep_name)
        arg_dtypes = executor.arg_to_dtype_set(kwargs)
        info = executor.translation_unit_info(ep_name, arg_dtypes)

        typed_t_unit = executor.get_typed_and_scheduled_translation_unit(
            ep_name, arg_dtypes)
        kernel = typed_t_unit[ep_name]

        idi = info.implemented_data_info

        # The generated helper takes every kernel argument and every
        # implemented-data entry as a parameter; those the caller did not
        # supply are passed as None.
        param_dict = kwargs.copy()
        for arg_name in kernel.arg_dict:
            param_dict.setdefault(arg_name, None)
        for data_info in idi:
            param_dict.setdefault(data_info.name, None)

        # Generate the wrapper code
        wrapper = executor.get_wrapper_generator()

        gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                      list(param_dict))

        wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

        param_names = kernel.all_params()
        gen("return {%s}" % ", ".join(f"{name!r}: {name}"
                                      for name in param_names))

        # Run the wrapper code, save argument values in domain_params
        domain_params = gen.get_picklable_function()(**param_dict)

        # Get flops/memory statistics
        op_map = lp.get_op_map(typed_t_unit, subgroup_size="guess")
        bytes_accessed = lp.get_mem_access_map(
            typed_t_unit, subgroup_size="guess") \
                        .to_bytes().eval_and_sum(domain_params)

        flops = op_map.filter_by(
            dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

        # Footprint gathering is not yet available in loopy with
        # kernel callables:
        # https://github.com/inducer/loopy/issues/399
        # Once it is, the footprint is the sum over the entries of
        # lp.gather_access_footprint_bytes(typed_t_unit), each evaluated
        # with domain_params, falling back to None on
        # lp.symbolic.UnableToDetermineAccessRange.
        footprint_bytes = None

        res = SingleCallKernelProfile(time=0,
                                      flops=flops,
                                      bytes_accessed=bytes_accessed,
                                      footprint_bytes=footprint_bytes)

        self.kernel_stats.setdefault(t_unit, {})[args_tuple] = res

        if self.logmgr and f"{ep_name}_time" not in self.logmgr.quantity_data:
            self.logmgr.add_quantity(KernelProfile(self, ep_name))

        return args_tuple
Ejemplo n.º 3
0
def generate_invoker(kernel, codegen_result):
    """Generate the Python wrapper function that invokes *kernel*.

    The wrapper is named ``invoke_<kernel name>_loopy_kernel``. It takes the
    system arguments ``_lpy_cl_kernels``, ``queue``, ``allocator``,
    ``wait_for`` and ``out_host``, followed by one optional keyword per
    kernel argument. It returns the event of the host-program call together
    with the written kernel arguments, as a dict if
    ``kernel.options.return_dict`` is set and as a tuple otherwise.

    :arg kernel: the loopy kernel to generate the invoker for
    :arg codegen_result: the code generation result supplying the host code,
        the host program name and the implemented data info

    :returns: the function obtained from the generator's ``get_function()``

    If ``kernel.options.write_wrapper`` is set, the generated source is also
    printed (when the option is exactly ``True``) or written to the file the
    option names, optionally syntax-highlighted.
    """
    options = kernel.options
    implemented_data_info = codegen_result.implemented_data_info
    host_code = codegen_result.host_code()

    system_args = [
        "_lpy_cl_kernels",
        "queue",
        "allocator=None",
        "wait_for=None",
        # ignored if options.no_numpy
        "out_host=None"
    ]

    from loopy.kernel.data import KernelArgument
    gen = PythonFunctionGenerator(
        "invoke_%s_loopy_kernel" % kernel.name, system_args + [
            "%s=None" % idi.name for idi in implemented_data_info
            if issubclass(idi.arg_class, KernelArgument)
        ])

    gen.add_to_preamble("from __future__ import division")
    gen.add_to_preamble("")
    gen.add_to_preamble("import pyopencl as _lpy_cl")
    gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
    gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
    gen.add_to_preamble("import numpy as _lpy_np")
    gen.add_to_preamble("")
    gen.add_to_preamble(host_code)
    gen.add_to_preamble("")

    gen("if allocator is None:")
    with Indentation(gen):
        gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
    gen("")

    generate_integer_arg_finding_from_shapes(gen, kernel,
                                             implemented_data_info)
    generate_integer_arg_finding_from_offsets(gen, kernel,
                                              implemented_data_info)
    generate_integer_arg_finding_from_strides(gen, kernel,
                                              implemented_data_info)
    generate_value_arg_check(gen, kernel, implemented_data_info)

    args = generate_arg_setup(gen, kernel, implemented_data_info, options)

    # {{{ generate invocation

    gen("_lpy_evt = {kernel_name}({args})".format(
        kernel_name=codegen_result.host_program.name,
        args=", ".join(["_lpy_cl_kernels", "queue"] + args +
                       ["wait_for=wait_for"])))

    # }}}

    # {{{ output

    # The kernel arguments that the kernel writes are the wrapper's outputs.
    # Determine them once instead of re-querying the written variables for
    # every argument in each of the output sections below.
    written_vars = kernel.get_written_variables()
    out_args = [
        arg for arg in implemented_data_info
        if issubclass(arg.arg_class, KernelArgument)
        and arg.base_name in written_vars
    ]

    if not options.no_numpy:
        # NOTE(review): _lpy_encountered_numpy / _lpy_encountered_dev are
        # names in the generated code, presumably set by the argument setup
        # emitted above -- confirm against generate_arg_setup.
        gen("if out_host is None and (_lpy_encountered_numpy "
            "and not _lpy_encountered_dev):")
        with Indentation(gen):
            gen("out_host = True")

        gen("if out_host:")
        with Indentation(gen):
            gen("pass")  # keeps the block non-empty if there are no outputs
            for arg in out_args:
                gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))

        gen("")

    if options.return_dict:
        gen("return _lpy_evt, {%s}" %
            ", ".join("\"%s\": %s" % (arg.name, arg.name)
                      for arg in out_args))
    elif out_args:
        gen("return _lpy_evt, (%s,)" % ", ".join(arg.name
                                                 for arg in out_args))
    else:
        gen("return _lpy_evt, ()")

    # }}}

    if options.write_wrapper:
        output = gen.get()
        if options.highlight_wrapper:
            output = get_highlighted_python_code(output)

        # ``True`` means "dump to stdout"; any other truthy value is used
        # as the path of the file to write.
        if options.write_wrapper is True:
            print(output)
        else:
            with open(options.write_wrapper, "w") as outf:
                outf.write(output)

    return gen.get_function()
Ejemplo n.º 4
0
    def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \
      -> tuple:
        """Compute and cache the kernel statistics of *program* for *kwargs*.

        :arg program: the kernel to profile
        :arg kwargs: the arguments the kernel is called with, by name

        :returns: the key under which the statistics are stored in
            ``self.kernel_stats[program]``: one ``(name, value.shape)`` pair
            per argument that has a ``shape`` attribute and ``(name, value)``
            otherwise

        On a cache miss, a :class:`ProfileResult` with ``time=0`` is stored;
        its ``footprint_bytes`` is ``None`` when loopy cannot determine the
        access range.
        """
        args_tuple = tuple(
            (key, value.shape) if hasattr(value, "shape") else (key, value)
            for key, value in kwargs.items())

        # Cache hit: nothing to do.  The check is an explicit lookup rather
        # than an ``except KeyError`` around the computation, so that errors
        # raised below are not reported as chained to a spurious KeyError.
        if args_tuple in self.kernel_stats.get(program, {}):
            return args_tuple

        executor = program.target.get_kernel_executor(program, self.queue)
        arg_dtypes = executor.arg_to_dtype_set(kwargs)
        info = executor.kernel_info(arg_dtypes)

        kernel = executor.get_typed_and_scheduled_kernel(arg_dtypes)

        idi = info.implemented_data_info

        # Arguments carrying a non-object dtype; handed to
        # lp.add_and_infer_dtypes below.
        typed_args = {
            k: v
            for k, v in kwargs.items()
            if hasattr(v, "dtype") and v.dtype != object
        }

        # The generated helper takes every kernel argument and every
        # implemented-data entry as a parameter; those the caller did not
        # supply are passed as None.
        param_dict = kwargs.copy()
        for arg_name in kernel.arg_dict:
            param_dict.setdefault(arg_name, None)
        for data_info in idi:
            param_dict.setdefault(data_info.name, None)

        # Generate the wrapper code
        wrapper = executor.get_wrapper_generator()

        gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                      list(param_dict))

        wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

        param_names = program.all_params()
        gen("return {%s}" % ", ".join(f"{name!r}: {name}"
                                      for name in param_names))

        # Run the wrapper code, save argument values in domain_params
        domain_params = gen.get_picklable_function()(**param_dict)

        # Get flops/memory statistics
        kernel = lp.add_and_infer_dtypes(kernel, typed_args)
        op_map = lp.get_op_map(kernel, subgroup_size="guess")
        bytes_accessed = lp.get_mem_access_map(kernel, subgroup_size="guess") \
          .to_bytes().eval_and_sum(domain_params)

        flops = op_map.filter_by(
            dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

        try:
            footprint = lp.gather_access_footprint_bytes(kernel)
            footprint_bytes = sum(
                footprint[k].eval_with_dict(domain_params)
                for k in footprint)

        except lp.symbolic.UnableToDetermineAccessRange:
            footprint_bytes = None

        res = ProfileResult(time=0,
                            flops=flops,
                            bytes_accessed=bytes_accessed,
                            footprint_bytes=footprint_bytes)

        self.kernel_stats.setdefault(program, {})[args_tuple] = res
        return args_tuple