def test_slice(ctx_factory, shape):
    """Slicing a pytato array must match slicing the backing numpy array."""
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    from numpy.random import default_rng
    rng = default_rng()
    x_in = rng.random(size=shape)

    namespace = pt.Namespace()
    x = pt.make_data_wrapper(namespace, x_in)

    # Build one named output (and its numpy reference) per generated slice.
    outputs = {}
    ref_outputs = {}
    for i, slice_ in enumerate(generate_test_slices(shape)):
        outputs[f"out_{i}"] = x[slice_]
        ref_outputs[f"out_{i}"] = x_in[slice_]

    prog = pt.generate_loopy(
            pt.make_dict_of_named_arrays(outputs),
            target=pt.PyOpenCLTarget(queue),
            options=lp.Options(return_dict=True))
    _, outputs = prog()

    for name in outputs:
        assert (outputs[name] == ref_outputs[name]).all()
def test_codegen_with_DictOfNamedArrays(ctx_factory):  # noqa
    """Check code generation for a DictOfNamedArrays, both with and
    without the loopy ``return_dict`` option."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    namespace = pt.Namespace()
    # np.int was a deprecated alias for the builtin int (removed in
    # NumPy 1.24); use the builtin directly.
    x = Placeholder(namespace, "x", (5,), int)
    y = Placeholder(namespace, "y", (5,), int)

    x_in = np.array([1, 2, 3, 4, 5])
    y_in = np.array([6, 7, 8, 9, 10])

    result = pt.DictOfNamedArrays(dict(x_out=x, y_out=y))

    # Without return_dict: outputs come back as a positional tuple.
    prog = pt.generate_loopy(result, target=pt.PyOpenCLTarget(queue))
    _, (x_out, y_out) = prog(x=x_in, y=y_in)
    assert (x_out == x_in).all()
    assert (y_out == y_in).all()

    # With return_dict: outputs come back keyed by name.
    prog = pt.generate_loopy(result,
            target=pt.PyOpenCLTarget(queue),
            options=lp.Options(return_dict=True))
    _, outputs = prog(x=x_in, y=y_in)
    assert (outputs["x_out"] == x_in).all()
    assert (outputs["y_out"] == y_in).all()
def test_unary_arith(ctx_factory, which):
    """Apply a unary operator to arrays of every ARITH_DTYPE and compare
    the generated-code results against numpy."""
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    op = getattr(operator, which)

    x_orig = np.array([1, 2, 3, 4, 5])

    namespace = pt.Namespace()
    exprs = {
        dtype: op(pt.make_data_wrapper(namespace, x_orig.astype(dtype)))
        for dtype in ARITH_DTYPES}

    prog = pt.generate_loopy(
            pt.make_dict_of_named_arrays(exprs),
            target=pt.PyOpenCLTarget(queue),
            options=lp.Options(return_dict=True))
    _, outputs = prog()

    for dtype in ARITH_DTYPES:
        out_ref = op(x_orig.astype(dtype))
        assert outputs[dtype].dtype == out_ref.dtype
        assert np.array_equal(outputs[dtype], out_ref)
def test_scalar_array_binary_arith(ctx_factory, which, reverse):
    """Combine a python scalar with arrays of every ARITH_DTYPE via a
    binary operator and compare generated-code results against numpy."""
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    op = getattr(operator, which)
    if reverse:
        op = reverse_args(op)

    x_orig = 7
    y_orig = np.array([1, 2, 3, 4, 5])

    for first_dtype in (int, float, complex):
        namespace = pt.Namespace()
        x_in = first_dtype(x_orig)

        exprs = {}
        for dtype in ARITH_DTYPES:
            y = pt.make_data_wrapper(
                    namespace, y_orig.astype(dtype), name=f"y{dtype}")
            exprs[dtype] = op(x_in, y)

        prog = pt.generate_loopy(
                pt.make_dict_of_named_arrays(exprs),
                target=pt.PyOpenCLTarget(queue),
                options=lp.Options(return_dict=True))
        _, outputs = prog()

        for dtype in exprs:
            out = outputs[dtype]
            out_ref = op(x_in, y_orig.astype(dtype))

            assert out.dtype == out_ref.dtype, (out.dtype, out_ref.dtype)
            # In some cases ops are done in float32 in loopy but float64
            # in numpy.
            assert np.allclose(out, out_ref), (out, out_ref)
def make_loopy_program(domains, statements, kernel_data=None,
        name="mm_actx_kernel"):
    """Return a :class:`loopy.LoopKernel` suitable for use with
    :meth:`ArrayContext.call_loopy`.

    :arg kernel_data: passed on to :func:`loopy.make_kernel`; defaults to
        ``["..."]``. A ``None`` sentinel replaces the previous mutable
        list default so a single list object is not shared across calls.
    """
    if kernel_data is None:
        kernel_data = ["..."]

    return lp.make_kernel(
            domains,
            statements,
            kernel_data=kernel_data,
            options=lp.Options(
                no_numpy=True,
                return_dict=True),
            default_offset=lp.auto,
            name=name,
            lang_version=MOST_RECENT_LANGUAGE_VERSION)
def init_global_mat_prg():
    """Build a loopy kernel that zero-initializes an ``n x m`` global
    matrix ``result``."""
    return lp.make_kernel(
        [
            "{[idof]: 0 <= idof < n}",
            "{[jdof]: 0 <= jdof < m}",
        ],
        """
            result[idof, jdof] = 0 {id=init}
        """,
        [
            lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
            lp.ValueArg("n, m", np.int32),
            "...",
        ],
        options=lp.Options(return_dict=True),
        default_offset=lp.auto,
        name="init_a_global_matrix",
    )
def __init__(self, fft, dk, dx, effective_k):
    """Precompute effective momenta on-device and build the spectral
    solve kernel.

    :arg fft: FFT wrapper providing ``grid_shape``, dtypes, and the
        ``sub_k`` momentum arrays.
    :arg dk: momentum-space grid spacing per axis.
    :arg dx: position-space grid spacing per axis.
    :arg effective_k: callable mapping ``(k, dx)`` to the effective
        momentum of the difference stencil.
    """
    self.fft = fft
    grid_size = fft.grid_shape[0] * fft.grid_shape[1] * fft.grid_shape[2]

    queue = self.fft.sub_k["momenta_x"].queue
    sub_k = list(x.get().astype("int") for x in self.fft.sub_k.values())
    k_names = ("k_x", "k_y", "k_z")

    # FIX: self.momenta was initialized twice in a row; once suffices.
    self.momenta = {}
    for mu, (name, kk) in enumerate(zip(k_names, sub_k)):
        kk_mu = effective_k(dk[mu] * kk.astype(fft.rdtype), dx[mu])
        self.momenta[name] = cla.to_device(queue, kk_mu)

    args = [
        lp.GlobalArg("fk", fft.cdtype, shape="(Nx, Ny, Nz)"),
        lp.GlobalArg("k_x", fft.rdtype, shape=("Nx",)),
        lp.GlobalArg("k_y", fft.rdtype, shape=("Ny",)),
        lp.GlobalArg("k_z", fft.rdtype, shape=("Nz",)),
        lp.ValueArg("m_squared", fft.rdtype),
    ]

    from pystella.field import Field
    from pymbolic.primitives import Variable, If, Comparison

    fk = Field("fk")
    indices = fk.indices

    # Normalize the source by the grid size in a temporary.
    rho_tmp = Variable("rho_tmp")
    tmp_insns = [(rho_tmp, Field("rhok") * (1 / grid_size))]

    mom_vars = tuple(Variable(name) for name in k_names)
    minus_k_squared = sum(kk_i[x_i]
                          for kk_i, x_i in zip(mom_vars, indices))
    sol = rho_tmp / (minus_k_squared - Variable("m_squared"))

    # Only write the solution where minus_k_squared < 0; otherwise
    # store zero (avoids the singular k == 0 mode).
    solution = {
        Field("fk"): If(Comparison(minus_k_squared, "<", 0), sol, 0)
    }

    from pystella.elementwise import ElementWiseMap
    options = lp.Options(return_dict=True)
    self.knl = ElementWiseMap(solution, args=args, halo_shape=0,
                              options=options,
                              tmp_instructions=tmp_insns,
                              lsize=(16, 2, 1))
def make_steps(self, MapKernel=ElementWiseMap, **kwargs):
    """Build one kernel per Runge-Kutta stage.

    :arg MapKernel: kernel-wrapper class used for each step.
    :returns: a list of ``self.num_stages`` step kernels.
    :raises ValueError: if an ``rhs_dict`` key is neither a ``Field``
        nor a ``Subscript`` of a ``Field``.
    """
    rhs = var("rhs")
    dt = var("dt")
    q = var("q")

    # FIX: copy the caller-supplied dict so that the q update below
    # does not mutate the caller's object.
    fixed_parameters = dict(kwargs.pop("fixed_parameters", {}))

    rhs_statements = {
        rhs[i]: index_fields(value, prepend_with=(q,))
        for i, value in enumerate(self.rhs_dict.values())
    }

    steps = []
    for stage in range(self.num_stages):
        RK_dict = {}
        for i, f in enumerate(self.rhs_dict.keys()):
            # ensure that key is either a Field or a Subscript of a Field
            # so that index_fields can prepend the q index
            key_has_field = (
                isinstance(f, Field)
                or (isinstance(f, Subscript)
                    and isinstance(f.aggregate, Field)))
            if not key_has_field:
                raise ValueError("rhs_dict keys must be Field instances")

            statements = self.step_statements(stage, f, dt, rhs[i])
            RK_dict.update(statements)

        # stage 0 reads q == 0, later stages read q == 1
        fixed_parameters.update(q=0 if stage == 0 else 1)

        options = lp.Options(enforce_variable_access_ordered="no_check")
        step = MapKernel(RK_dict, tmp_instructions=rhs_statements,
                         args=self.args, **kwargs, options=options,
                         fixed_parameters=fixed_parameters)
        steps.append(step)

    return steps
def write_into_mat_prg():
    """Build a loopy kernel that copies a local ``ndofs x mdofs`` block
    ``mat`` into the global matrix ``result`` at ``(offset_i, offset_j)``."""
    return lp.make_kernel(
        [
            "{[idof]: 0 <= idof < ndofs}",
            "{[jdof]: 0 <= jdof < mdofs}",
        ],
        """
            result[offset_i + idof, offset_j + jdof] = mat[idof, jdof]
        """,
        [
            lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
            lp.ValueArg("n, m", np.int32),
            lp.GlobalArg("mat", None, shape="ndofs, mdofs", offset=lp.auto),
            lp.ValueArg("offset_i", np.int32),
            lp.ValueArg("offset_j", np.int32),
            "...",
        ],
        options=lp.Options(return_dict=True),
        default_offset=lp.auto,
        name="write_into_global_matrix",
    )
def test_array_array_binary_arith(ctx_factory, which, reverse):
    """Combine arrays over every pair of ARITH_DTYPES via a binary
    operator and compare generated-code results against numpy."""
    if which == "sub":
        pytest.skip("https://github.com/inducer/loopy/issues/131")

    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    op = getattr(operator, which)
    if reverse:
        op = reverse_args(op)

    x_orig = np.array([1, 2, 3, 4, 5])
    y_orig = np.array([10, 9, 8, 7, 6])

    for first_dtype in ARITH_DTYPES:
        namespace = pt.Namespace()
        x_in = x_orig.astype(first_dtype)
        x = pt.make_data_wrapper(namespace, x_in, name="x")

        exprs = {}
        for dtype in ARITH_DTYPES:
            y = pt.make_data_wrapper(
                    namespace, y_orig.astype(dtype), name=f"y{dtype}")
            exprs[dtype] = op(x, y)

        prog = pt.generate_loopy(
                pt.make_dict_of_named_arrays(exprs),
                target=pt.PyOpenCLTarget(queue),
                options=lp.Options(return_dict=True))
        _, outputs = prog()

        for dtype in ARITH_DTYPES:
            out = outputs[dtype]
            out_ref = op(x_in, y_orig.astype(dtype))

            assert out.dtype == out_ref.dtype, (out.dtype, out_ref.dtype)
            # In some cases ops are done in float32 in loopy but float64
            # in numpy.
            assert np.allclose(out, out_ref), (out, out_ref)
def __init__(self, fft, dk):
    """Precompute first/second-derivative momenta on-device and build
    the spectral derivative kernels (componentwise gradients, divergence
    increments, Laplacian, and fused gradient/Laplacian maps)."""
    self.fft = fft
    grid_size = fft.grid_shape[0] * fft.grid_shape[1] * fft.grid_shape[2]

    queue = self.fft.sub_k["momenta_x"].queue
    sub_k = list(x.get().astype("int") for x in self.fft.sub_k.values())
    k_names = ("k_x", "k_y", "k_z")

    self.momenta = {}
    for mu, (name, kk) in enumerate(zip(k_names, sub_k)):
        kk_mu = dk[mu] * kk.astype(fft.rdtype)
        self.momenta[name + "_2"] = cla.to_device(queue, kk_mu)
        # zero Nyquist mode for first derivatives
        kk_mu[abs(sub_k[mu]) == fft.grid_shape[mu] // 2] = 0.
        kk_mu[sub_k[mu] == 0] = 0.
        self.momenta[name + "_1"] = cla.to_device(queue, kk_mu)

    args = [
        lp.GlobalArg("fk", shape="(Nx, Ny, Nz)"),
        lp.GlobalArg("k_x_1, k_x_2", fft.rdtype, shape=("Nx",)),
        lp.GlobalArg("k_y_1, k_y_2", fft.rdtype, shape=("Ny",)),
        lp.GlobalArg("k_z_1, k_z_2", fft.rdtype, shape=("Nz",)),
    ]

    from pystella.field import Field
    fk = Field("fk")
    pd = tuple(Field(pdi) for pdi in ("pdx_k", "pdy_k", "pdz_k"))
    indices = fk.indices

    from pymbolic import var
    mom_vars = tuple(var(name + "_1") for name in k_names)

    # Normalize by the grid size in a temporary.
    fk_tmp = var("fk_tmp")
    tmp_insns = [(fk_tmp, fk * (1 / grid_size))]

    pdx, pdy, pdz = (
        {pdi: kk_i[indices[i]] * 1j * fk_tmp}
        for i, (pdi, kk_i) in enumerate(zip(pd, mom_vars)))

    pdx_incr, pdy_incr, pdz_incr = (
        {Field("div"): Field("div") + kk_i[indices[i]] * 1j * fk_tmp}
        for i, kk_i in enumerate(mom_vars))

    # Second-derivative momenta for the Laplacian.
    mom_vars = tuple(var(name + "_2") for name in k_names)
    kmag_sq = sum(kk_i[x_i]**2 for kk_i, x_i in zip(mom_vars, indices))
    lap = {Field("lap_k"): -kmag_sq * fk_tmp}

    from pystella.elementwise import ElementWiseMap
    common_args = dict(halo_shape=0, args=args, lsize=(16, 2, 1),
                       tmp_instructions=tmp_insns,
                       options=lp.Options(return_dict=True))

    self.pdx_knl = ElementWiseMap(pdx, **common_args)
    self.pdy_knl = ElementWiseMap(pdy, **common_args)
    self.pdz_knl = ElementWiseMap(pdz, **common_args)
    self.pdx_incr_knl = ElementWiseMap(pdx_incr, **common_args)
    self.pdy_incr_knl = ElementWiseMap(pdy_incr, **common_args)
    self.pdz_incr_knl = ElementWiseMap(pdz_incr, **common_args)
    self.lap_knl = ElementWiseMap(lap, **common_args)

    # The fused kernels use a different local size.
    common_args["lsize"] = (16, 1, 1)
    self.grad_knl = ElementWiseMap({**pdx, **pdy, **pdz}, **common_args)
    self.grad_lap_knl = ElementWiseMap({**pdx, **pdy, **pdz, **lap},
                                       **common_args)
def generate(builder, wrapper_name=None):
    """Generate a loopy wrapper kernel from a PyOP2 builder.

    Collects the builder's instructions, merges/renames indices,
    schedules them, and emits a :class:`loopy.LoopKernel` with the user
    kernel either inlined (when it is itself a loopy kernel) or
    registered as an opaque callable.
    """
    if builder.layer_index is not None:
        outer_inames = frozenset([builder._loop_index.name,
                                  builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)

    initialiser = list(itertools.chain(*mapper.initialisers))
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [tuple(merger(i) for i in inits)
                           for inits in mapper.initialisers]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node, (Index, RuntimeIndex, Variable, Argument,
                             NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            replacements[node] = "%s%d%s" % (
                prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [rename_nodes(inits, replacements)
                           for inits in mapper.initialisers]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments,
                                                replacements)
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents],
                        replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames,
                                 parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements
                      if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space, {
                        "n": 1,
                        "start": -1
                    })))
            else:
                new_domains.append(d)
        domains = new_domains

    if builder.extruded:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder.layer_index.name:
                # layer = t1 - 1
                t1 = parameters.layer_end
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space, {
                        "layer": 1,
                        t1: -1,
                        1: 1
                    })))
            else:
                new_domains.append(d)
        domains = new_domains

    assumptions, = reduce(
        operator.and_,
        parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True,
                            ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the
    # function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(
            pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(["#include <math.h>",
                             "#include <complex.h>",
                             "#include <petsc.h>"])
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import (
            _match_caller_callee_argument_dimension_)
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code,
                              tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper,
                                                 [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
        wrapper, petsc_function_lookup)

    return wrapper
def __init__(self, decomp, input, **kwargs):
    """Build a grid-wide reduction kernel from a Sector, a list of
    Sectors, or a dict of reducers."""
    self.decomp = decomp

    from pystella import Sector
    if isinstance(input, Sector):
        self.reducers = input.reducers
    elif isinstance(input, list):
        self.reducers = dict(i for s in input for i in s.reducers.items())
    elif isinstance(input, dict):
        self.reducers = input
    else:
        raise NotImplementedError

    reducers = self.reducers
    self.grid_size = kwargs.pop("grid_size", None)
    self.callback = kwargs.pop("callback", lambda x: x)
    self.num_reductions = sum(len(i) for i in reducers.values())

    from pymbolic import var
    tmp = var("tmp")

    # Map each reducer key to the range of rows it occupies in tmp.
    self.tmp_dict = {}
    i = 0
    for key, val in reducers.items():
        inext = i + len(val)
        self.tmp_dict[key] = range(i, inext)
        i = inext

    # flatten and process inputs into expression and operation
    flat_reducers = []
    reduction_ops = []
    for val in reducers.values():
        for v in val:
            if isinstance(v, tuple):
                flat_reducers.append(v[0])
                reduction_ops.append(v[1])
            else:
                flat_reducers.append(v)
                reduction_ops.append("avg")
    self.reduction_ops = reduction_ops

    def reduction(expr, op):
        return lp.symbolic.Reduction(operation=op, inames=("i",),
                                     expr=expr, allow_simultaneous=True)

    # "avg" is implemented as a sum; normalization happens elsewhere.
    statements = [
        (tmp[i, var("j"), var("k")],
         reduction(expr, "sum" if op == "avg" else op))
        for i, (expr, op) in enumerate(zip(flat_reducers, reduction_ops))
    ]
    statements += [
        lp.Assignment(var("Nx_"), var("Nx"), id="Nx_assign",
                      predicates={"i == 0", "j == 0", "k == 0"})
    ]

    args = [lp.GlobalArg("Nx_", shape=(), dtype="int")]
    args += kwargs.pop("args", [...])

    lsize = kwargs.pop("lsize", (32, 2, 1))
    silenced_warnings = kwargs.pop("silenced_warnings", [])
    silenced_warnings += ["write_race(Nx_assign)"]

    super().__init__(statements, **kwargs, args=args,
                     seq_dependencies=False, lsize=lsize,
                     options=lp.Options(return_dict=True),
                     silenced_warnings=silenced_warnings)
def generate_loopy(
        result: Union[Array, DictOfNamedArrays, Dict[str, Array]],
        target: Optional[LoopyTarget] = None,
        options: Optional[lp.Options] = None,
        *,
        cl_device: Optional["pyopencl.Device"] = None,
        array_tag_t_to_not_propagate: FrozenSet[Type[Tag]] = frozenset(
            [ImplStored, Named, PrefixNamed]),
        axis_tag_t_to_not_propagate: FrozenSet[Type[Tag]] = frozenset(),
        ) -> BoundProgram:
    r"""Code generation entry point.

    :param result: Outputs of the computation.
    :param target: Code generation target.
    :param options: Code generation options for the kernel.
    :returns: A :class:`pytato.target.BoundProgram` wrapping the generated
        :mod:`loopy` program.

    If *result* is a :class:`dict` or a :class:`pytato.DictOfNamedArrays` and
    *options* is not supplied, then the Loopy option
    :attr:`~loopy.Options.return_dict` will be set to *True*. If it is
    supplied, :attr:`~loopy.Options.return_dict` must already be set to
    *True*.

    .. note::

        :mod:`pytato` metadata :math:`\mapsto` :mod:`loopy` metadata
        semantics:

        - Inames that index over an :class:`~pytato.array.Array`'s axis in
          the allocation instruction are tagged with the corresponding
          :class:`~pytato.array.Axis`'s tags. The caller may choose to not
          propagate axis tags of type *axis_tag_t_to_not_propagate*.
        - :attr:`pytato.Array.tags` of inputs/outputs in *outputs* would be
          copied over to the tags of the corresponding
          :class:`loopy.ArrayArg`. The caller may choose to not propagate
          array tags of type *array_tag_t_to_not_propagate*.
        - Arrays tagged with :class:`pytato.tags.ImplStored` would have
          their tags copied over to the tags of corresponding
          :class:`loopy.TemporaryVariable`. The caller may choose to not
          propagate array tags of type *array_tag_t_to_not_propagate*.
    """
    result_is_dict = isinstance(result, (dict, DictOfNamedArrays))
    orig_outputs: DictOfNamedArrays = normalize_outputs(result)
    del result

    if target is None:
        target = LoopyPyOpenCLTarget(device=cl_device)
    else:
        if cl_device is not None:
            raise TypeError("may not pass both 'target' and 'cl_device'")

    preproc_result = preprocess(orig_outputs, target)
    outputs = preproc_result.outputs
    compute_order = preproc_result.compute_order

    if options is None:
        options = lp.Options(return_dict=result_is_dict)
    elif isinstance(options, dict):
        from warnings import warn
        warn(
            "Passing a dict for options is deprecated and will stop working "
            "in 2022. Pass an actual loopy.Options object instead.",
            DeprecationWarning, stacklevel=2)

        options = lp.Options(**options)

    if options.return_dict != result_is_dict:
        # FIX: message previously referred to a nonexistent
        # "options.result_is_dict" attribute; the check is on return_dict.
        raise ValueError("options.return_dict is expected to match "
                "whether the returned value is a dictionary")

    state = get_initial_codegen_state(target, options)

    from pytato.transform import InputGatherer
    ing = InputGatherer()

    state.var_name_gen.add_names({
        input_expr.name
        for name in compute_order
        for input_expr in ing(outputs[name].expr)
        if isinstance(input_expr, (Placeholder, SizeParam, DataWrapper))
        if input_expr.name is not None})

    state.var_name_gen.add_names(outputs)

    cg_mapper = CodeGenMapper(array_tag_t_to_not_propagate,
                              axis_tag_t_to_not_propagate)

    # Generate code for outputs.
    for name in compute_order:
        expr = outputs[name].expr
        insn_id = add_store(name, expr, cg_mapper(expr, state), state,
                            cg_mapper)
        # replace "expr" with the created stored variable
        state.results[expr] = StoredResult(name, expr.ndim,
                                           frozenset([insn_id]))

    # Why call make_reduction_inames_unique?
    # Consider pt.generate_loopy(pt.sum(x) + pt.sum(x)), the generated
    # program would be a single instruction with rhs:
    # `_pt_subst() + _pt_subst()`. The result of pt.sum(x) is cached =>
    # same instance of InlinedResult is emitted for both invocations and we
    # would be required to avoid such reduction iname collisions.
    program = lp.make_reduction_inames_unique(state.program)

    return target.bind_program(
            program=program,
            bound_arguments=preproc_result.bound_arguments)
def map_insn_assign(self, insn):
    """Turn a grudge Assign instruction into a loopy kernel instruction
    over (element, dof) axes.

    Operator bindings and external calls pass through unchanged, as do
    purely scalar assignments (no non-scalar variables involved).
    """
    from grudge.symbolic.primitives import OperatorBinding

    if (
            len(insn.exprs) == 1
            and (
                isinstance(insn.exprs[0], OperatorBinding)
                or is_external_call(
                    insn.exprs[0], self.function_registry))):
        return insn

    # FIXME: These names and the size names could clash with user-given
    # names. Need better metadata tracking in loopy.
    iel = "iel"
    idof = "idof"

    temp_names = [
        name
        for name, dnr in zip(insn.names, insn.do_not_return)
        if dnr]

    from pymbolic import var
    expr_mapper = ToLoopyExpressionMapper(
        self.dd_inference_mapper, temp_names, (var(iel), var(idof)))

    insns = []

    import loopy as lp
    # FIX: a duplicate "from pymbolic import var" import was removed here;
    # the import above already provides it.
    for name, expr, dnr in zip(insn.names, insn.exprs, insn.do_not_return):
        insns.append(
            lp.Assignment(
                expr_mapper(var(name)),
                expr_mapper(expr),
                temp_var_type=lp.Optional(None) if dnr else lp.Optional(),
                no_sync_with=frozenset([
                    ("*", "any"),
                    ]),
                ))

    if not expr_mapper.non_scalar_vars:
        return insn

    knl = lp.make_kernel(
        "{[%(iel)s, %(idof)s]: "
        "0 <= %(iel)s < nelements and 0 <= %(idof)s < nunit_dofs}"
        % {"iel": iel, "idof": idof},
        insns,
        name="grudge_assign_%d" % self.insn_count,
        # Single-insn kernels may have their no_sync_with resolve to an
        # empty set, that's OK.
        options=lp.Options(
            check_dep_resolution=False,
            return_dict=True,
            no_numpy=True,
            )
        )

    self.insn_count += 1

    from pytools import single_valued
    governing_dd = single_valued(
        self.dd_inference_mapper(expr) for expr in insn.exprs)

    knl = lp.register_preamble_generators(knl, [bessel_preamble_generator])
    knl = lp.register_function_manglers(knl, [bessel_function_mangler])

    input_mappings = {}
    output_mappings = {}

    from grudge.symbolic.mappers import DependencyMapper
    dep_mapper = DependencyMapper(composite_leaves=False)

    for expr, name in expr_mapper.expr_to_name.items():
        deps = dep_mapper(expr)
        assert len(deps) <= 1
        if not deps:
            is_output = False
        else:
            dep, = deps
            is_output = dep.name in insn.names

        if is_output:
            tgt_dict = output_mappings
        else:
            tgt_dict = input_mappings

        tgt_dict[name] = expr

    return LoopyKernelInstruction(
        LoopyKernelDescriptor(
            loopy_kernel=knl,
            input_mappings=input_mappings,
            output_mappings=output_mappings,
            fixed_arguments={},
            governing_dd=governing_dd)
        )
def make_kernel(self, map_instructions, tmp_instructions, args, domains,
                **kwargs):
    """Assemble a loopy translation unit from temporary and output
    instructions.

    :arg map_instructions: the output (mapped) statements, either
        ``(assignee, expression)`` tuples or loopy instructions.
    :arg tmp_instructions: statements computing temporaries, same forms.
    :arg args: explicit kernel arguments, merged with inferred field args.
    :arg domains: loop domains passed to :func:`loopy.make_kernel`.
    """
    temp_statements = []
    temp_vars = []

    from pystella.field import index_fields
    indexed_tmp_insns = index_fields(tmp_instructions)
    indexed_map_insns = index_fields(map_instructions)

    for statement in indexed_tmp_insns:
        if isinstance(statement, lp.InstructionBase):
            temp_statements += [statement]
        else:
            assignee, expression = statement
            # only declare temporary variables once
            if isinstance(assignee, pp.Variable):
                current_tmp = assignee
            elif isinstance(assignee, pp.Subscript):
                current_tmp = assignee.aggregate
            else:
                current_tmp = None
            if current_tmp is not None and current_tmp not in temp_vars:
                temp_vars += [current_tmp]
                tvt = lp.Optional(None)
            else:
                tvt = lp.Optional()

            temp_statements += [
                self._assignment(assignee, expression, temp_var_type=tvt)
            ]

    output_statements = []
    for statement in indexed_map_insns:
        if isinstance(statement, lp.InstructionBase):
            output_statements += [statement]
        else:
            assignee, expression = statement
            # FIX: these output assignments were previously appended to
            # temp_statements, leaving output_statements empty for
            # tuple-form inputs and misordering the kernel statements.
            output_statements += [self._assignment(assignee, expression)]

    options = kwargs.pop("options", lp.Options())
    # ignore lack of supposed dependency for single-instruction kernels
    if len(map_instructions) + len(tmp_instructions) == 1:
        options.check_dep_resolution = False

    from pystella import get_field_args
    inferred_args = get_field_args([map_instructions, tmp_instructions])
    all_args = append_new_args(args, inferred_args)

    t_unit = lp.make_kernel(
        domains,
        temp_statements + output_statements,
        all_args + [lp.ValueArg("Nx, Ny, Nz", dtype="int"), ...],
        options=options,
        **kwargs,
    )

    # give any untyped kernel arguments the map's default dtype
    new_args = []
    knl = t_unit.default_entrypoint
    for arg in knl.args:
        if isinstance(arg, lp.KernelArgument) and arg.dtype is None:
            new_args.append(arg.copy(dtype=self.dtype))
        else:
            new_args.append(arg)
    t_unit = t_unit.with_kernel(knl.copy(args=new_args))

    t_unit = lp.remove_unused_arguments(t_unit)
    t_unit = lp.register_callable(t_unit, "round",
                                  UnaryOpenCLCallable("round"))
    return t_unit
def map_insn_assign(self, insn):
    """Turn a grudge Assign instruction into a loopy kernel instruction
    over a single flattened axis.

    Operator bindings and external calls pass through unchanged, as do
    purely scalar assignments (no non-scalar variables involved).
    """
    from grudge.symbolic.primitives import OperatorBinding

    if (len(insn.exprs) == 1
            and (isinstance(insn.exprs[0], OperatorBinding)
                or is_external_call(insn.exprs[0], self.function_registry))):
        return insn

    iname = "grdg_i"
    size_name = "grdg_n"

    temp_names = [
        name
        for name, dnr in zip(insn.names, insn.do_not_return)
        if dnr
    ]

    expr_mapper = ToLoopyExpressionMapper(self.dd_inference_mapper,
                                          temp_names, iname)
    insns = []

    import loopy as lp
    from pymbolic import var
    for name, expr, dnr in zip(insn.names, insn.exprs, insn.do_not_return):
        insns.append(
            lp.Assignment(
                expr_mapper(var(name)),
                expr_mapper(expr),
                temp_var_type=lp.Optional(None) if dnr else lp.Optional(),
                no_sync_with=frozenset([
                    ("*", "any"),
                    ]),
                ))

    if not expr_mapper.non_scalar_vars:
        return insn

    knl = lp.make_kernel(
        "{[%s]: 0 <= %s < %s}" % (iname, iname, size_name),
        insns,
        default_offset=lp.auto,
        name="grudge_assign_%d" % self.insn_count,
        # Single-insn kernels may have their no_sync_with resolve to an
        # empty set, that's OK.
        options=lp.Options(check_dep_resolution=False))

    knl = lp.set_options(knl, return_dict=True)
    knl = lp.split_iname(knl, iname, 128, outer_tag="g.0", inner_tag="l.0")

    self.insn_count += 1

    from pytools import single_valued
    governing_dd = single_valued(
        self.dd_inference_mapper(expr) for expr in insn.exprs)

    knl = lp.register_preamble_generators(knl, [bessel_preamble_generator])
    knl = lp.register_function_manglers(knl, [bessel_function_mangler])

    input_mappings = {}
    output_mappings = {}

    from grudge.symbolic.mappers import DependencyMapper
    dep_mapper = DependencyMapper(composite_leaves=False)

    # FIX: six.iteritems is a Python-2 relic; dict.items() is equivalent
    # on Python 3.
    for expr, name in expr_mapper.expr_to_name.items():
        deps = dep_mapper(expr)
        assert len(deps) <= 1
        if not deps:
            is_output = False
        else:
            dep, = deps
            is_output = dep.name in insn.names

        if is_output:
            tgt_dict = output_mappings
        else:
            tgt_dict = input_mappings

        tgt_dict[name] = expr

    return LoopyKernelInstruction(
        LoopyKernelDescriptor(loopy_kernel=knl,
                              input_mappings=input_mappings,
                              output_mappings=output_mappings,
                              fixed_arguments={},
                              governing_dd=governing_dd))