def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
    # Check that sub-array-ref indexing on the caller side is translated to
    # the callee's differently-shaped argument: a (3, 2) array addressed as a
    # flat 6-vector, a 6-vector addressed as (3, 2), and a diagonal view of a
    # (6, 6) array.
    # ``ctx_factory``/``inline`` are presumably pytest fixtures/params
    # supplied by the surrounding test module -- not visible here.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
    x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64)

    # abs() is harmless here: inputs are drawn from clrandom.rand and
    # hence non-negative, so b == 2*a.
    callee1 = lp.make_function(
            "{[i]: 0<=i<6}",
            """
            b[i] = 2*abs(a[i])
            """, name="callee_fn1")

    callee2 = lp.make_function(
            "{[i, j]: 0<=i<3 and 0 <= j < 2}",
            """
            b[i, j] = 3*a[i, j]
            """, name="callee_fn2")

    callee3 = lp.make_function(
            "{[i]: 0<=i<6}",
            """
            b[i] = 5*a[i]
            """, name="callee_fn3")

    # Caller: each call site reindexes its operand into the callee's shape.
    knl = lp.make_kernel(
            "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}",
            """
            [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2])
            [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k])
            [l]: y3[l, l] = callee_fn3([l]: x3[l, l])
            """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])
    knl = lp.merge([knl, callee3])

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")
        knl = lp.inline_callable_kernel(knl, "callee_fn3")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()
    y3 = out_dict["y3"].get()

    # y3 is only written on its diagonal, so only the diagonal is compared.
    assert (np.linalg.norm(y1 - 2 * x1.get())) < 1e-15
    assert (np.linalg.norm(y2 - 3 * x2.get())) < 1e-15
    assert (np.linalg.norm(np.diag(y3 - 5 * x3.get()))) < 1e-15
def test_inlining_with_indirections(ctx_factory):
    # Inline a callee kernel whose write uses an indirection (map) array
    # supplied by the caller.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # Callee: zero the whole array, then scatter ones through ``map``.
    ones_and_zeros = lp.make_function(
            "{[i, j]: 0<=i<6 and 0<=j<3}",
            """
            x[i] = 0.0f
            ...gbarrier
            x[map[j]] = 1.0f
            """,
            seq_dependencies=True, name="ones_and_zeros")

    t_unit = lp.make_kernel(
            "{ : }",
            """
            y[:] = ones_and_zeros(mymap[:])
            """,
            [lp.GlobalArg("y", shape=6, dtype=lp.auto),
             lp.GlobalArg("mymap", dtype=np.int32, shape=3)])

    t_unit = lp.merge([t_unit, ones_and_zeros])
    t_unit = lp.inline_callable_kernel(t_unit, "ones_and_zeros")

    indirection = np.arange(3).astype(np.int32)
    _evt, (result,) = t_unit(queue, mymap=indirection)

    reference = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)
    assert (reference == result).all()
def test_array_inputs_to_callee_kernels(ctx_factory, inline):
    # Pass whole arrays (no sub-array refs) to a callee kernel.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    n = 2**3

    x = np.random.rand(n, n)
    y = np.random.rand(n, n)

    child_knl = lp.make_function(
            "{[i, j]:0<=i, j < 8}",
            """
            g[i, j] = 2*e[i, j] + 3*f[i, j]
            """, name="linear_combo")

    parent_knl = lp.make_kernel(
            "{:}",
            """
            z[:, :] = linear_combo(x, y)
            """,
            kernel_data=[
                lp.GlobalArg(name="x, y, z", dtype=np.float64, shape=(n, n)),
                ...])

    prog = lp.merge([parent_knl, child_knl])
    if inline:
        prog = lp.inline_callable_kernel(prog, "linear_combo")

    _evt, (out,) = prog(queue, x=x, y=y)

    expected = 2 * x + 3 * y
    assert np.linalg.norm(expected - out) / np.linalg.norm(expected) < 1e-15
def test_empty_sub_array_refs(ctx_factory, inline):
    # See: https://github.com/OP2/PyOP2/pull/559#discussion_r272208618
    # Sub-array refs over a single-point sweep (0<=k<1) must still work.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    a = np.random.randn(10)
    b = np.random.randn(10)

    callee = lp.make_function(
            "{[d]:0<=d<1}",
            """
            c[d] = a[d] - b[d]
            """, name="wence_function")

    caller = lp.make_kernel(
            "{[i,k]: 0<=i<10 and 0<=k<1}",
            """
            [k]:z[i+k] = wence_function([k]:x[i+k], [k]:y[i+k])
            """,
            [lp.GlobalArg("x, y", dtype=np.float64, shape=(10, )), ...])

    caller = lp.merge([caller, callee])
    if inline:
        caller = lp.inline_callable_kernel(caller, "wence_function")

    _evt, (result,) = caller(queue, x=a, y=b)
    assert np.allclose(result, a - b)
def test_slices_with_negative_step(ctx_factory, inline):
    # Caller-side slices with a negative step must map correctly onto the
    # callee's (positively-indexed) argument.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    n = 4

    x = np.random.rand(n, n, n, n, n)
    y = np.random.rand(n, n, n, n, n)

    child_knl = lp.make_function(
            "{[i, j]:0<=i, j < 4}",
            """
            g[i, j] = 2*e[i, j] + 3*f[i, j]
            """, name="linear_combo")

    # z's second axis is written in reverse (step -1).
    parent_knl = lp.make_kernel(
            "{[i, k, m]: 0<=i, k, m<4}",
            """
            z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m],
                                                  y[i, :, k, :, m])
            """,
            kernel_data=[
                lp.GlobalArg(name="x, y, z", dtype=np.float64,
                             shape=(n, n, n, n, n)),
                ...])

    prog = lp.merge([parent_knl, child_knl])
    if inline:
        prog = lp.inline_callable_kernel(prog, "linear_combo")

    _evt, (out,) = prog(queue, x=x, y=y)

    expected = 2 * x + 3 * y
    # Undo the reversed write before comparing.
    assert (np.linalg.norm(expected - out[:, ::-1, :, :, :])
            / np.linalg.norm(expected)) < 1e-15
def test_double_hw_axes_used_in_knl_call(inline):
    # When caller and callee both claim the same hardware axis (l.0),
    # code generation must fail with a LoopyError.
    from loopy.diagnostic import LoopyError

    callee = lp.make_function(
            "{[i]: 0<=i<10}",
            """
            y[i] = 2*x[i]
            """, name="twice")

    caller = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """
            y[:, i] = twice(x[:, i])
            """,
            [lp.GlobalArg("x", shape=(10, 10), dtype=float),
             lp.GlobalArg("y", shape=(10, 10))],
            name="outer")

    # Both inames land on the *same* local axis -- the illegal case.
    callee = lp.tag_inames(callee, {"i": "l.0"})
    caller = lp.tag_inames(caller, {"i": "l.0"})

    prog = lp.merge([caller, callee])
    if inline:
        prog = lp.inline_callable_kernel(prog, "twice")

    with pytest.raises(LoopyError):
        lp.generate_code_v2(prog)
def test_unused_hw_axes_in_callee(ctx_factory, inline):
    # Caller uses l.0 while the callee uses l.1; the callee must still be
    # callable even though it does not touch the caller's axis.
    ctx = ctx_factory()

    callee = lp.make_function(
            "{[i]: 0<=i<10}",
            """
            y[i] = 2*x[i]
            """, name="twice")

    caller = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """
            y[:, i] = twice(x[:, i])
            """,
            [lp.GlobalArg("x", shape=(10, 10), dtype=float),
             lp.GlobalArg("y", shape=(10, 10))],
            name="outer")

    # Distinct local axes: callee on l.1, caller on l.0.
    callee = lp.tag_inames(callee, {"i": "l.1"})
    caller = lp.tag_inames(caller, {"i": "l.0"})

    prog = lp.merge([caller, callee])
    if inline:
        prog = lp.inline_callable_kernel(prog, "twice")

    lp.auto_test_vs_ref(prog, ctx, prog)
def test_packing_unpacking(ctx_factory, inline):
    # Check pack_and_unpack_args_for_call: caller-side arrays are repacked
    # into the callee's layout before the call and unpacked afterwards.
    # Note the deliberate shape mismatch: callee_fn1 (flat, 6) is applied to
    # the (3, 2) array x1, and callee_fn2 ((2, 3)) to the flat x2.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)

    callee1 = lp.make_function(
            "{[i]: 0<=i<6}",
            """
            b[i] = 2*a[i]
            """, name="callee_fn1")

    callee2 = lp.make_function(
            "{[i, j]: 0<=i<2 and 0 <= j < 3}",
            """
            b[i, j] = 3*a[i, j]
            """, name="callee_fn2")

    knl = lp.make_kernel(
            "{[i, j, k]: 0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}",
            """
            [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
            [k]: y2[k] = callee_fn2([k]: x2[k])
            """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])

    # Insert the pack/unpack instructions around each call site.
    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn1")
    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn2")

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()

    assert np.linalg.norm(2 * x1.get() - y1) / np.linalg.norm(
            2 * x1.get()) < 1e-15
    assert np.linalg.norm(3 * x2.get() - y2) / np.linalg.norm(
            3 * x2.get()) < 1e-15
def test_register_knl(ctx_factory, inline):
    # Two-level callee nesting: the caller invokes linear_combo2, which in
    # turn invokes linear_combo1, each through sub-array refs.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    n = 4

    x = np.random.rand(n, n, n, n, n)
    y = np.random.rand(n, n, n, n, n)

    grandchild_knl = lp.make_function(
            "{[i, j]:0<= i, j< 4}",
            """
            c[i, j] = 2*a[i, j] + 3*b[i, j]
            """, name="linear_combo1")

    # Middle level: forwards its (4, 4) slices to the grandchild.
    child_knl = lp.make_function(
            "{[i, j]:0<=i, j < 4}",
            """
            [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j])
            """, name="linear_combo2")

    parent_knl = lp.make_kernel(
            "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
            """
            [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m],
                                                     [j, l]: y[i, j, k, l, m])
            """,
            kernel_data=[
                lp.GlobalArg(name="x, y", dtype=np.float64,
                             shape=(n, n, n, n, n)),
                ...])

    knl = lp.merge([grandchild_knl, child_knl, parent_knl])

    if inline:
        # Inline outside-in: the caller's call first, then the nested one.
        knl = lp.inline_callable_kernel(knl, "linear_combo2")
        knl = lp.inline_callable_kernel(knl, "linear_combo1")

    evt, (out, ) = knl(queue, x=x, y=y)

    assert (np.linalg.norm(2 * x + 3 * y - out) /
            (np.linalg.norm(2 * x + 3 * y))) < 1e-15
def test_register_knl_with_hw_axes(ctx_factory, inline):
    # Caller and callee each carry their own hardware-axis tags
    # (callee: l.0/g.0 on its "i", caller: l.1/g.1 on its "i"); calling and
    # inlining must compose these without clashes.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 4

    x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
    y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)

    callee_knl = lp.make_function(
            "{[i, j]:0<=i, j < 4}",
            """
            g[i, j] = 2*e[i, j] + 3*f[i, j]
            """, name="linear_combo")

    callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0",
            outer_tag="g.0")

    caller_knl = lp.make_kernel(
            "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
            """
            [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m],
                                                    [j, l]: y[i, j, k, l, m])
            """, name="caller")

    caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1",
            outer_tag="g.1")

    knl = lp.merge([caller_knl, callee_knl])

    knl = lp.set_options(knl, "return_dict")

    if inline:
        knl = lp.inline_callable_kernel(knl, "linear_combo")

    evt, out = knl(queue, x=x_dev, y=y_dev)

    x_host = x_dev.get()
    y_host = y_dev.get()

    assert np.linalg.norm(2 * x_host + 3 * y_host - out["z"].get()
            ) / np.linalg.norm(2 * x_host + 3 * y_host) < 1e-15
def test_inlining_with_callee_domain_param(ctx_factory):
    # Inline a callee whose domain bound "n" is supplied at the call site.
    queue = cl.CommandQueue(ctx_factory())

    fill_callee = lp.make_function(
            "{[i]: 0<=i<n}",
            """
            y[i] = 2.0
            """, name="fill2")

    prog = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """
            [i]: res[i] = fill2(10)
            """)

    prog = lp.merge([prog, fill_callee])
    prog = lp.inline_callable_kernel(prog, "fill2")

    _evt, (res,) = prog(queue)
    assert (res == 2).all()
def test_valueargs_being_mapped_in_inling(ctx_factory):
    # A ValueArg ("n") passed at the call site must be substituted into the
    # callee's body and domain during inlining.
    callee = lp.make_function(
            "{[i]: 0<=i<n}",
            """
            y[i] = n*x[i]
            """,
            [lp.ValueArg("n", dtype=np.int32), ...],
            name="doublify")

    prog = lp.make_kernel(
            "{[i, j]: 0<=i, j<10}",
            """
            [i]: bar[i] = doublify(10, [j]: foo[j])
            """,
            [lp.GlobalArg("foo", dtype=float, shape=lp.auto), ...])

    prog = lp.merge([prog, callee])
    prog = lp.inline_callable_kernel(prog, "doublify")

    lp.auto_test_vs_ref(prog, ctx_factory(), prog)
def test_passing_and_getting_scalar_in_clbl_knl(ctx_factory, inline):
    # Check that a scalar can be passed to, and returned from, a callee
    # kernel.
    # Fix: use the supplied ctx_factory fixture instead of
    # cl.create_some_context(); the latter ignored the parametrized context
    # and may prompt for a device interactively during a test run.
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    call_sin = lp.make_function(
            "{:}",
            """
            y = sin(x)
            """, name="call_sin")

    knl = lp.make_kernel(
            "{:}",
            """
            []: real_y[()] = call_sin(real_x)
            """)

    knl = lp.merge([knl, call_sin])
    knl = lp.set_options(knl, "write_cl")

    if inline:
        knl = lp.inline_callable_kernel(knl, "call_sin")

    evt, (out, ) = knl(cq, real_x=np.asarray(3.0, dtype=float))
def test_simplify_indices(ctx_factory):
    # After inlining, index expressions contain floor-divisions introduced by
    # the slice translation; lp.simplify_indices must remove them all.
    ctx = ctx_factory()

    # NOTE(review): the local name "twice" does not match the callee name
    # "zerozerozeroonezeroify" -- presumably copied from another test.
    twice = lp.make_function(
            "{[i, j]: 0<=i<10 and 0<=j<4}",
            """
            y[i,j] = 2*x[i,j]
            """, name="zerozerozeroonezeroify")

    knl = lp.make_kernel(
            "{:}",
            """
            Y[:,:] = zerozerozeroonezeroify(X[:,:])
            """,
            [lp.GlobalArg("X,Y", shape=(10, 4), dtype=np.float64)])

    # Expression visitor that reports whether any floor-division occurs.
    class ContainsFloorDiv(lp.symbolic.CombineMapper):
        def combine(self, values):
            # True if any sub-expression contained a floor div.
            return any(values)

        def map_floor_div(self, expr):
            return True

        def map_variable(self, expr):
            return False

        def map_constant(self, expr):
            return False

    knl = lp.merge([knl, twice])
    knl = lp.inline_callable_kernel(knl, "zerozerozeroonezeroify")
    simplified_knl = lp.simplify_indices(knl)

    contains_floordiv = ContainsFloorDiv()

    # Pre-simplification: floor divs must be present (otherwise the test
    # would be vacuous); post-simplification: none may remain.
    assert any(
            contains_floordiv(insn.expression)
            for insn in knl.default_entrypoint.instructions
            if isinstance(insn, lp.MultiAssignmentBase))
    assert all(not contains_floordiv(insn.expression)
            for insn in simplified_knl.default_entrypoint.instructions
            if isinstance(insn, lp.MultiAssignmentBase))

    lp.auto_test_vs_ref(knl, ctx, simplified_knl)
def test_non1_step_slices(ctx_factory, start, inline):
    # See https://github.com/inducer/loopy/pull/222#discussion_r645905188
    # Slices with step 3 and step -3 (and a parametrized start offset) must
    # map onto the callee's contiguous iteration domain.
    # ``start`` is presumably a pytest parameter -- an int in [0, 40).
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    callee = lp.make_function(
            "{[i]: 0<=i<n}",
            """
            y[i] = i**2
            """,
            [lp.ValueArg("n"), ...],
            name="squared_arange")

    # The slice lengths are computed host-side and baked into the kernel
    # source via the f-string.
    t_unit = lp.make_kernel(
            "{[i_init, j_init]: 0<=i_init, j_init<40}",
            f"""
            X[i_init] = 42
            X[{start}:40:3] = squared_arange({len(range(start, 40, 3))})
            Y[j_init] = 1729
            Y[39:{start}:-3] = squared_arange({len(range(39, start, -3))})
            """,
            [lp.GlobalArg("X,Y", shape=40)],
            seq_dependencies=True)

    # NumPy references built with the identical slices.
    expected_out1 = 42 * np.ones(40, dtype=np.int64)
    expected_out1[start:40:3] = np.arange(len(range(start, 40, 3)))**2

    expected_out2 = 1729 * np.ones(40, dtype=np.int64)
    expected_out2[39:start:-3] = np.arange(len(range(39, start, -3)))**2

    t_unit = lp.merge([t_unit, callee])
    t_unit = lp.set_options(t_unit, "return_dict")

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "squared_arange")

    evt, out_dict = t_unit(cq)

    np.testing.assert_allclose(out_dict["X"].get(), expected_out1)
    np.testing.assert_allclose(out_dict["Y"].get(), expected_out2)
def test_kc_with_floor_div_in_expr(ctx_factory, inline):
    # See https://github.com/inducer/loopy/issues/366
    # A floor-division in the assignee's index expression
    # (2*(i//2) + (i%2) == i) must survive the callable-kernel machinery.
    # Fix: dropped the redundant function-local ``import loopy as lp``; the
    # module-level ``lp`` is already in scope (it is used unconditionally by
    # every other test in this file).
    ctx = ctx_factory()

    callee = lp.make_function(
            "{[i]: 0<=i<10}",
            """
            x[i] = 2*x[i]
            """, name="callee_with_update")

    knl = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """
            [i]: x[2*(i//2) + (i%2)] = callee_with_update([i]: x[i])
            """)

    knl = lp.merge([knl, callee])

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_with_update")

    lp.auto_test_vs_ref(knl, ctx, knl)
def generate(builder, wrapper_name=None):
    """Build a loopy wrapper kernel from *builder*'s abstract instructions.

    Lowers the builder's instructions to loopy statements, renames indices so
    per-prefix counters start from zero, attaches ISL domains and assumptions,
    then registers (and, for loopy kernels, inlines) the user kernel plus the
    PETSc callables.

    :arg builder: wrapper-kernel builder (presumably PyOP2's WrapperBuilder
        -- TODO confirm) supplying instructions, arguments and layer extents.
    :arg wrapper_name: optional kernel name; defaults to
        ``"wrap_<kernel name>"``.
    :returns: the generated loopy kernel.
    """
    # Hardware-outer inames: the loop index, plus the layer index when
    # iterating over extruded layers.
    if builder.layer_index is not None:
        outer_inames = frozenset([builder._loop_index.name, builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    # Accumulator for everything make_kernel will need later.
    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [tuple(merger(i) for i in inits) for inits in mapper.initialisers]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node, (Index, RuntimeIndex, Variable, Argument, NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            # The numeric middle group is discarded; a fresh counter per
            # (prefix, postfix) pair supplies the new number.
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            replacements[node] = "%s%d%s" % (prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [rename_nodes(inits, replacements) for inits in mapper.initialisers]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments, replacements)
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents], replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames, parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        # Pin the loop index to a single iteration: n = start.
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(d.add_constraint(isl.Constraint.eq_from_names(d.space, {"n": 1, "start": -1})))
            else:
                new_domains.append(d)
        domains = new_domains

    if builder.extruded:
        # Pin the layer index to the top layer: layer = t1 - 1.
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder.layer_index.name:
                # layer = t1 - 1
                t1 = parameters.layer_end
                new_domains.append(d.add_constraint(isl.Constraint.eq_from_names(d.space, {"layer": 1, t1: -1, 1: 1})))
            else:
                new_domains.append(d)
        domains = new_domains

    # Intersect all collected assumptions into a single basic set.
    assumptions, = reduce(operator.and_, parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True, ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    # Bound the iteration parameters: 0 <= start (<|<=) end, and the layer
    # extents ordered when extruded.
    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(["#include <math.h>"])
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        # Loopy user kernel: register, match argument dimensions, inline.
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import _match_caller_callee_argument_dimension_
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code, tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(wrapper, petsc_function_lookup)
    return wrapper
def generate(builder, wrapper_name=None):
    """Build a loopy wrapper kernel from *builder*'s abstract instructions.

    Lowers the builder's instructions to loopy statements, renames indices so
    per-prefix counters start from zero, attaches ISL domains and assumptions,
    then registers (and, for loopy kernels, inlines) the user kernel plus the
    PETSc callables.

    NOTE(review): this is a second definition of ``generate``; if both appear
    in one module, this later one shadows the earlier. It differs only in the
    preamble headers (adds <complex.h> and <petsc.h>).

    :arg builder: wrapper-kernel builder (presumably PyOP2's WrapperBuilder
        -- TODO confirm) supplying instructions, arguments and layer extents.
    :arg wrapper_name: optional kernel name; defaults to
        ``"wrap_<kernel name>"``.
    :returns: the generated loopy kernel.
    """
    # Hardware-outer inames: the loop index, plus the layer index when
    # iterating over extruded layers.
    if builder.layer_index is not None:
        outer_inames = frozenset(
            [builder._loop_index.name, builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    # Accumulator for everything make_kernel will need later.
    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [
        tuple(merger(i) for i in inits) for inits in mapper.initialisers
    ]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node,
                      (Index, RuntimeIndex, Variable, Argument, NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            # The numeric middle group is discarded; a fresh counter per
            # (prefix, postfix) pair supplies the new number.
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            replacements[node] = "%s%d%s" % (
                prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [
        rename_nodes(inits, replacements) for inits in mapper.initialisers
    ]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments,
                                                replacements)
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents],
                        replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames,
                                 parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements
                      if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        # Pin the loop index to a single iteration: n = start.
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(
                    d.add_constraint(
                        isl.Constraint.eq_from_names(d.space, {
                            "n": 1,
                            "start": -1
                        })))
            else:
                new_domains.append(d)
        domains = new_domains

    if builder.extruded:
        # Pin the layer index to the top layer: layer = t1 - 1.
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder.layer_index.name:
                # layer = t1 - 1
                t1 = parameters.layer_end
                new_domains.append(
                    d.add_constraint(
                        isl.Constraint.eq_from_names(
                            d.space, {
                                "layer": 1,
                                t1: -1,
                                1: 1
                            })))
            else:
                new_domains.append(d)
        domains = new_domains

    # Intersect all collected assumptions into a single basic set.
    assumptions, = reduce(
        operator.and_,
        parameters.assumptions.values()).params().get_basic_sets()

    options = loopy.Options(check_dep_resolution=True,
                            ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    # Bound the iteration parameters: 0 <= start (<|<=) end, and the layer
    # extents ordered when extruded.
    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(
            pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(
        ["#include <math.h>", "#include <complex.h>", "#include <petsc.h>"])
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        # Loopy user kernel: register, match argument dimensions, inline.
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import _match_caller_callee_argument_dimension_
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code,
                              tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper,
                                                 [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
        wrapper, petsc_function_lookup)

    return wrapper