def opencl_preamble_generator(preamble_info):
    """Yield ``(tag, code)`` preamble pairs for OpenCL code.

    Entries are produced in this order:

    * ``"00_enable_double"``, only if ``preamble_info.seen_dtypes`` contains
      a :class:`NumpyType` whose numpy dtype is ``float64`` or
      ``complex128``,
    * ``"00_enable_64bit_atomics"``, only if
      ``preamble_info.seen_atomic_dtypes`` contains an 8-byte
      ``AtomicNumpyType``,
    * ``"00_declare_gid_lid"``, always; it defines the ``lid``/``gid``
      macros, cast to the C type of the kernel's index dtype.

    :arg preamble_info: an object exposing ``seen_dtypes``,
        ``seen_atomic_dtypes`` and ``kernel``.
    """
    needs_fp64 = any(
            isinstance(seen, NumpyType)
            and seen.numpy_dtype in [np.float64, np.complex128]
            for seen in preamble_info.seen_dtypes)

    if needs_fp64:
        yield ("00_enable_double", """
            #if __OPENCL_C_VERSION__ < 120
            #pragma OPENCL EXTENSION cl_khr_fp64: enable
            #endif
            """)

    from loopy.types import AtomicNumpyType
    needs_int64_atomics = any(
            isinstance(seen, AtomicNumpyType)
            and seen.numpy_dtype.itemsize == 8
            for seen in preamble_info.seen_atomic_dtypes)

    if needs_int64_atomics:
        # FIXME: Should gate on "CL1" atomics style
        yield ("00_enable_64bit_atomics", """
            #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
            """)

    from loopy.tools import remove_common_indentation
    kernel = preamble_info.kernel
    idx_ctype = kernel.target.dtype_to_typename(kernel.index_dtype)

    yield ("00_declare_gid_lid",
            remove_common_indentation("""
                #define lid(N) ((%(idx_ctype)s) get_local_id(N))
                #define gid(N) ((%(idx_ctype)s) get_group_id(N))
                """ % dict(idx_ctype=idx_ctype)))
def opencl_preamble_generator(preamble_info):
    """Generate the tagged preamble snippets that OpenCL code depends on.

    This is a generator of ``(tag, code)`` tuples. The fp64 pragma is
    yielded only when a ``float64``/``complex128`` :class:`NumpyType` was
    seen. The 64-bit atomics pragma is yielded only when an
    ``AtomicNumpyType`` with an item size of 8 was seen. The ``lid``/``gid``
    macro definitions are always yielded last.

    :arg preamble_info: provides ``seen_dtypes``, ``seen_atomic_dtypes``
        and ``kernel``.
    """
    double_precision_dtypes = [np.float64, np.complex128]

    saw_double = False
    for candidate in preamble_info.seen_dtypes:
        if not isinstance(candidate, NumpyType):
            continue
        if candidate.numpy_dtype in double_precision_dtypes:
            saw_double = True

    if saw_double:
        yield ("00_enable_double", """
            #if __OPENCL_C_VERSION__ < 120
            #pragma OPENCL EXTENSION cl_khr_fp64: enable
            #endif
            """)

    from loopy.types import AtomicNumpyType

    saw_wide_atomic = any(
            isinstance(candidate, AtomicNumpyType)
            and candidate.numpy_dtype.itemsize == 8
            for candidate in preamble_info.seen_atomic_dtypes)

    if saw_wide_atomic:
        # FIXME: Should gate on "CL1" atomics style
        yield ("00_enable_64bit_atomics", """
            #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
            """)

    from loopy.tools import remove_common_indentation

    knl = preamble_info.kernel
    macro_values = {
            "idx_ctype": knl.target.dtype_to_typename(knl.index_dtype)}
    macro_source = remove_common_indentation("""
        #define lid(N) ((%(idx_ctype)s) get_local_id(N))
        #define gid(N) ((%(idx_ctype)s) get_group_id(N))
        """ % macro_values)

    yield ("00_declare_gid_lid", macro_source)
def generate_code(self, kernel, codegen_state, impl_arg_info):
    """Generate code via the parent class and prepend the index macros.

    The superclass result is returned with ``lid(N)``/``gid(N)`` macro
    definitions (cast to the C type of ``kernel.index_dtype``) placed in
    front of the code, separated from it by a blank line.

    :returns: a tuple ``(code, implemented_domains)``.
    """
    parent_result = super(OpenCLTarget, self).generate_code(
            kernel, codegen_state, impl_arg_info)
    body, implemented_domains = parent_result

    from loopy.tools import remove_common_indentation

    idx_ctype = self.dtype_to_typename(kernel.index_dtype)
    index_macros = remove_common_indentation("""
        #define lid(N) ((%(idx_ctype)s) get_local_id(N))
        #define gid(N) ((%(idx_ctype)s) get_group_id(N))
        """ % dict(idx_ctype=idx_ctype))

    return index_macros + "\n\n" + body, implemented_domains
def process_preambles(preambles):
    """Sort, de-duplicate and dedent a collection of tagged preambles.

    :arg preambles: an iterable of ``(tag, code)`` pairs.
    :returns: a list of code strings ordered by tag. Only the first
        preamble encountered for each tag (in sorted order) is kept. Each
        string has its common indentation removed and a newline appended.
    """
    emitted_tags = set()
    unique_codes = []

    # Sorting looks at the tag only, so equal tags keep their input order.
    for tag, code in sorted(preambles, key=lambda pair: pair[0]):
        if tag not in emitted_tags:
            emitted_tags.add(tag)
            unique_codes.append(code)

    from loopy.tools import remove_common_indentation

    return [remove_common_indentation(code) + "\n" for code in unique_codes]
def __init__(self,
        iname_exprs, code,
        read_variables=frozenset(), assignees=tuple(),
        id=None, depends_on=None, depends_on_is_final=None,
        groups=None, conflicts_with_groups=None,
        no_sync_with=None,
        within_inames_is_final=None, within_inames=None,
        priority=0, boostable=None, boostable_into=None,
        predicates=frozenset(), tags=None,
        insn_deps=None, insn_deps_is_final=None):
    """
    :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples,
        simple strings representing inames are also allowed. A single
        string is also allowed, which should consist of comma-separated
        inames.
    :arg assignees: Like :attr:`assignees`, but may also be a
        semicolon-separated string of such expressions or a
        sequence of strings parseable into the desired format.

    The remaining instruction options are forwarded unchanged to
    :meth:`InstructionBase.__init__`. *code* is stored with its common
    indentation removed.
    """

    InstructionBase.__init__(self,
            id=id,
            depends_on=depends_on,
            depends_on_is_final=depends_on_is_final,
            groups=groups, conflicts_with_groups=conflicts_with_groups,
            no_sync_with=no_sync_with,
            within_inames_is_final=within_inames_is_final,
            within_inames=within_inames,
            boostable=boostable,
            boostable_into=boostable_into,
            priority=priority, predicates=predicates, tags=tags,
            insn_deps=insn_deps, insn_deps_is_final=insn_deps_is_final)

    # {{{ normalize iname_exprs

    if isinstance(iname_exprs, str):
        # "i, j" -> ["i", "j"]; empty pieces are dropped
        stripped_names = (piece.strip() for piece in iname_exprs.split(","))
        iname_exprs = [name for name in stripped_names if name]

    from pymbolic import var
    # A bare iname string stands for the pair (name, variable-of-that-name).
    normalized_iname_exprs = [
            (entry, var(entry)) if isinstance(entry, str) else entry
            for entry in iname_exprs]

    # }}}

    # {{{ normalize assignees

    if isinstance(assignees, str):
        # "a[i]; b" -> ["a[i]", "b"]; empty pieces are dropped
        stripped_exprs = (piece.strip() for piece in assignees.split(";"))
        assignees = [expr for expr in stripped_exprs if expr]

    from loopy.symbolic import parse
    normalized_assignees = [
            parse(entry) if isinstance(entry, str) else entry
            for entry in assignees]

    # }}}

    self.iname_exprs = normalized_iname_exprs

    from loopy.tools import remove_common_indentation
    self.code = remove_common_indentation(code)

    self.read_variables = read_variables
    self.assignees = normalized_assignees
def parse_transformed_fortran(source, free_form=True, strict=True,
        pre_transform_code=None, transform_code_context=None,
        filename="<floopy code>"):
    """
    :arg source: a string of Fortran source code which must include
        a snippet of transform code as described below.
    :arg pre_transform_code: code that is run in the same context
        as the transform, before the transform code itself
    :arg transform_code_context: an optional mapping whose copy is used
        as the namespace in which the code is executed
    :arg filename: the name under which the transform code is compiled;
        its directory (or the current working directory if it has none)
        is appended to ``sys.path`` while the code runs

    *source* may contain snippets of loopy transform code between markers::

        !$loopy begin
        ! ...
        !$loopy end

    Within the transform code, the following symbols are predefined:

    * ``lp``: a reference to the :mod:`loopy` package
    * ``np``: a reference to the :mod:`numpy` package
    * ``SOURCE``: the source code surrounding the transform block.
      This may be processed using :func:`c_preprocess` and
      :func:`parse_fortran`.
    * ``FILENAME``: the file name of the code being processed

    The transform code must define ``RESULT``, conventionally a list of
    kernels, which is returned from this function unmodified.

    An example of *source* may look as follows::

        subroutine fill(out, a, n)
          implicit none

          real*8 a, out(n)
          integer n, i

          do i = 1, n
            out(i) = a
          end do
        end

        !$loopy begin
        !
        ! fill, = lp.parse_fortran(SOURCE, FILENAME)
        ! fill = lp.split_iname(fill, "i", split_amount,
        !     outer_tag="g.0", inner_tag="l.0")
        ! RESULT = [fill]
        !
        !$loopy end

    :raises LoopyError: if no transform code is found in *source* or if the
        transform code does not set ``RESULT``.
    """

    source, transform_code = _extract_loopy_lines(source)
    if not transform_code:
        raise LoopyError("no transform code found")

    from loopy.tools import remove_common_indentation
    transform_code = remove_common_indentation(
            transform_code,
            require_leading_newline=False,
            ignore_lines_starting_with="#")

    # Work on a copy so that the caller's context is left untouched.
    namespace = (
            {} if transform_code_context is None
            else transform_code_context.copy())

    import loopy as lp
    import numpy as np

    namespace["lp"] = lp
    namespace["np"] = np
    namespace["SOURCE"] = source
    namespace["FILENAME"] = filename

    from os.path import dirname, abspath
    from os import getcwd

    infile_dirname = dirname(filename)
    infile_dirname = abspath(infile_dirname) if infile_dirname else getcwd()

    # Code snippets to execute, in order, as (code, name-for-compile) pairs.
    stages = []
    if pre_transform_code is not None:
        stages.append((pre_transform_code, "<loopy pre-transform code>"))
    stages.append((transform_code, filename))

    import sys
    saved_sys_path = sys.path
    try:
        if infile_dirname:
            sys.path = saved_sys_path + [infile_dirname]

        for snippet, origin in stages:
            namespace["_MODULE_SOURCE_CODE"] = snippet
            exec(compile(snippet, origin, "exec"), namespace)

    finally:
        # Restore the original path object even if the executed code raised.
        sys.path = saved_sys_path

    if "RESULT" not in namespace:
        raise LoopyError("transform code did not set RESULT")

    return namespace["RESULT"]
def _preamble_generator(preamble_info, func_qualifier="inline"): integer_type_names = ["int8", "int16", "int32", "int64"] def_integer_types_macro = ("03_def_integer_types", r""" #define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \ MACRO_NAME(int8, char) \ MACRO_NAME(int16, short) \ MACRO_NAME(int32, int) \ MACRO_NAME(int64, long) """) undef_integer_types_macro = ("05_undef_integer_types", """ #undef LOOPY_CALL_WITH_INTEGER_TYPES """) function_defs = { "loopy_floor_div": r""" #define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \ {} TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \ {{ \ if ((a<0) != (b<0)) \ a = a - (b + (b<0) - (b>=0)); \ return a/b; \ }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV) #undef LOOPY_DEFINE_FLOOR_DIV """.format(func_qualifier), "loopy_floor_div_pos_b": r""" #define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \ {} TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \ {{ \ if (a<0) \ a = a - (b-1); \ return a/b; \ }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B) #undef LOOPY_DEFINE_FLOOR_DIV_POS_B """.format(func_qualifier), "loopy_mod": r""" #define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \ {} TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \ {{ \ TYPE result = a%b; \ if (result < 0 && b > 0) \ result += b; \ if (result > 0 && b < 0) \ result = result + b; \ return result; \ }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD) #undef LOOPY_DEFINE_MOD """.format(func_qualifier), "loopy_mod_pos_b": r""" #define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \ {} TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \ {{ \ TYPE result = a%b; \ if (result < 0) \ result += b; \ return result; \ }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B) #undef LOOPY_DEFINE_MOD_POS_B """.format(func_qualifier), } c_funcs = {func.c_name for func in preamble_info.seen_functions} for func_name, func_body in function_defs.items(): if any((func_name + "_" + tpname) in c_funcs for tpname in integer_type_names): yield def_integer_types_macro yield ("04_%s" % func_name, 
func_body) yield undef_integer_types_macro for func in preamble_info.seen_functions: if func.name == "int_pow": base_ctype = preamble_info.kernel.target.dtype_to_typename( func.arg_dtypes[0]) exp_ctype = preamble_info.kernel.target.dtype_to_typename( func.arg_dtypes[1]) res_ctype = preamble_info.kernel.target.dtype_to_typename( func.result_dtypes[0]) if func.arg_dtypes[1].numpy_dtype.kind == "u": signed_exponent_preamble = "" else: signed_exponent_preamble = "\n" + remove_common_indentation(""" if (n < 0) { x = 1.0/x; n = -n; }""") yield (f"07_{func.c_name}", f""" inline {res_ctype} {func.c_name}({base_ctype} x, {exp_ctype} n) {{ if (n == 0) return 1; {re.sub("^", 14*" ", signed_exponent_preamble, flags=re.M)} {res_ctype} y = 1; while (n > 1) {{ if (n % 2) {{ y = x * y; x = x * x; }} else x = x * x; n = n / 2; }} return x*y; }}""")
def generate_code(kernel, device=None):
    """Generate code for *kernel*, scheduling it first if necessary.

    :arg kernel: the kernel to generate code for. If it has no schedule,
        one is obtained via ``get_one_scheduled_kernel``.
    :arg device: deprecated; passing anything but *None* only triggers a
        :class:`DeprecationWarning`.
    :returns: a tuple ``(code_str, impl_arg_info)``, where *code_str* is the
        generated code with the de-duplicated preambles prepended and
        *impl_arg_info* is the list of implemented argument descriptions.
        With caching enabled, a cached result for the kernel is returned
        if present, and a new result is stored in the cache.
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state.
    :raises ValueError: if a kernel argument is neither an ``ArrayBase``
        nor a ``ValueArg``.
    """
    if device is not None:
        from warnings import warn
        warn("passing 'device' to generate_code() is deprecated",
                DeprecationWarning, stacklevel=2)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # Remembered as the cache key: 'kernel' is rebound further down.
        input_kernel = kernel
        try:
            cached_result = code_gen_cache[input_kernel]
        except KeyError:
            pass
        else:
            logger.info("%s: code generation cache hit" % kernel.name)
            return cached_result

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    impl_arg_info = []

    for arg in kernel.args:
        if isinstance(arg, ArrayBase):
            array_decls = arg.decl_info(
                    kernel.target,
                    is_written=arg.name in kernel.get_written_variables(),
                    index_dtype=kernel.index_dtype)
            impl_arg_info.extend(array_decls)
            continue

        if isinstance(arg, ValueArg):
            value_decl = ImplementedDataInfo(
                    target=kernel.target,
                    name=arg.name,
                    dtype=arg.dtype,
                    cgen_declarator=arg.get_arg_decl(kernel.target),
                    arg_class=ValueArg)
            impl_arg_info.append(value_decl)
            continue

        raise ValueError("argument type not understood: '%s'" % type(arg))

    # Complex support is switched on if any argument or temporary is complex.
    all_variables = (
            kernel.args + list(six.itervalues(kernel.temporary_variables)))
    allow_complex = any(var.dtype.kind == "c" for var in all_variables)

    # }}}

    seen_dtypes = set()
    seen_functions = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            var_subst_map={},
            allow_complex=allow_complex)

    code_str, implemented_domains = kernel.target.generate_code(
            kernel, codegen_state, impl_arg_info)

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, implemented_domains,
            code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for temporary in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(temporary.dtype)

    preambles = kernel.preambles[:]

    all_generators = (
            kernel.preamble_generators
            + kernel.target.preamble_generators())
    for generator in all_generators:
        preambles.extend(generator(kernel, seen_dtypes, seen_functions))

    # Order by tag and keep only the first preamble seen for each tag.
    emitted_tags = set()
    unique_preambles = []

    for tag, preamble in sorted(preambles, key=lambda pair: pair[0]):
        if tag not in emitted_tags:
            emitted_tags.add(tag)
            unique_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_codes = [
            remove_common_indentation(preamble) + "\n"
            for preamble in unique_preambles]

    code_str = "".join(preamble_codes) + code_str

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    result = code_str, impl_arg_info

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = result

    return result
def generate_code_v2(kernel):
    """
    Preprocess and schedule *kernel* as needed, then generate code for it.

    A kernel in the ``INITIAL`` state is preprocessed, and a kernel without
    a schedule is scheduled via ``get_one_scheduled_kernel``. With caching
    enabled, a cached result for the kernel is returned if present, and a
    new result is stored in the cache.

    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state
        after the steps above.
    :raises ValueError: if a kernel argument is neither an ``ArrayBase``
        nor a ``ValueArg``.
    """

    from loopy.kernel import kernel_state

    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # Remembered as the cache key: 'kernel' is rebound further down.
        input_kernel = kernel
        try:
            cached_result = code_gen_cache[input_kernel]
        except KeyError:
            pass
        else:
            logger.info("%s: code generation cache hit" % kernel.name)
            return cached_result

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()

        if isinstance(arg, ArrayBase):
            array_decls = arg.decl_info(
                    kernel.target,
                    is_written=is_written,
                    index_dtype=kernel.index_dtype)
            implemented_data_info.extend(array_decls)
            continue

        if isinstance(arg, ValueArg):
            value_decl = ImplementedDataInfo(
                    target=kernel.target,
                    name=arg.name,
                    dtype=arg.dtype,
                    arg_class=ValueArg,
                    is_written=is_written)
            implemented_data_info.append(value_decl)
            continue

        raise ValueError("argument type not understood: '%s'" % type(arg))

    # Complex support is switched on if any argument or temporary is complex.
    all_variables = (
            kernel.args + list(six.itervalues(kernel.temporary_variables)))
    allow_complex = any(var.dtype.involves_complex() for var in all_variables)

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    host_program_name = (
            kernel.target.host_program_name_prefix
            + kernel.name
            + kernel.target.host_program_name_suffix)

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=host_program_name,
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for temporary in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(temporary.dtype)

    preambles = kernel.preambles[:]

    from pytools import Record

    class PreambleInfo(Record):
        pass

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes)

    all_generators = (
            kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for generator in all_generators:
        preambles.extend(generator(preamble_info))

    # Order by tag and keep only the first preamble seen for each tag.
    emitted_tags = set()
    unique_preambles = []

    for tag, preamble in sorted(preambles, key=lambda pair: pair[0]):
        if tag not in emitted_tags:
            emitted_tags.add(tag)
            unique_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_codes = [
            remove_common_indentation(preamble) + "\n"
            for preamble in unique_preambles]

    codegen_result = codegen_result.copy(
            device_preambles=preamble_codes)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result