def test_forced_iname_deps_and_reduction():
    """Regression test: a reduction over 'j' must not be forced inside 'i'.

    See https://github.com/inducer/loopy/issues/24
    """
    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would not
    # be needed.
    i1 = lp.CInstruction("i", "doSomethingToGetPhi();", assignees="phi")
    from pymbolic.primitives import Subscript, Variable
    # Empty + final forced_iname_deps: the reduction update must not inherit 'i'.
    i2 = lp.Assignment("a",
                       lp.Reduction("sum", "j",
                                    Subscript(Variable("phi"), Variable("j"))),
                       forced_iname_deps=frozenset(),
                       forced_iname_deps_is_final=True)
    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
                       [i1, i2], [
                           lp.GlobalArg("a", dtype=np.float32, shape=()),
                           lp.ValueArg("n", dtype=np.int32),
                           lp.TemporaryVariable("phi", dtype=np.float32, shape=("n", )),
                       ],
                       target=lp.CTarget(),
                       )
    k = lp.preprocess_kernel(k)
    # The generated reduction-update instruction must run outside the 'i' loop.
    assert 'i' not in k.insn_inames("insn_0_j_update")
    print(k.stringify(with_dependencies=True))
def test_solve_callable(self, zero_vec, solve_mat, solve_vec):
    """Check a loopy kernel calling a LAPACK-backed ``solve()`` against numpy.

    :arg zero_vec: output Dat fixture (result written here, compared at the end).
    :arg solve_mat: system-matrix Dat fixture.
    :arg solve_vec: right-hand-side Dat fixture.
    """
    loopy.set_caching_enabled(False)
    k = loopy.make_kernel(
        ["{[i,j] : 0 <= i,j < 2}"],
        """
        x[:] = solve(A[:,:], b[:])
        """,
        [
            loopy.GlobalArg('x', dtype=np.float64, shape=(2, )),
            loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2)),
            loopy.GlobalArg('b', dtype=np.float64, shape=(2, ), )
        ],
        target=loopy.CTarget(),
        name="callable_kernel2",
        lang_version=(2018, 2))

    k = loopy.register_function_id_to_in_knl_callable_mapper(k, solve_fn_lookup)
    code = loopy.generate_code_v2(k).device_code()
    # BUG FIX: str.replace returns a *new* string (strings are immutable);
    # the original call discarded the result, so the kernel was never
    # actually made static. Reassign the result.
    code = code.replace('void callable_kernel2', 'static void callable_kernel2')
    loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])
    args = [zero_vec(op2.READ), solve_mat(op2.READ), solve_vec(op2.WRITE)]

    op2.par_loop(loopykernel, solve_mat.dataset.set, *args)
    expected = np.linalg.solve(solve_mat.data, solve_vec.data)
    assert np.allclose(expected, zero_vec.data)
def build_loopy_kernel_A_text():
    """Build C source and header text for the element-matrix kernel A.

    Returns a tuple ``(kernel name, example call string, C source, C header)``.
    """
    knl_name = "kernel_tensor_A"

    # Symbolic kernel: A[i,j] = c * sum_k B[k,i] * B[k,j]
    knl = lp.make_kernel("{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
                         """
                         A[i,j] = c*sum(k, B[k,i]*B[k,j])
                         """,
                         name=knl_name,
                         assumptions="n >= 1 and m >= 1",
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
                         target=lp.CTarget())

    # All inputs and the output are double precision.
    dtypes = {symbol: np.dtype(np.double) for symbol in ("A", "B", "c")}
    knl = lp.add_and_infer_dtypes(knl, dtypes)
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    #print(knl)

    # Emit device code and header, then normalise the restrict qualifier
    # for plain-C consumers.
    knl_c = lp.generate_code_v2(knl).device_code()
    knl_h = str(lp.generate_header(knl)[0])
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"
    return knl_name, knl_call, knl_c, knl_h
def merge_loopy(slate_loopy, output_arg, builder, var2terminal):
    """ Merges tsfc loopy kernels and slate loopy kernel into a wrapper kernel.

    :arg slate_loopy: the slate loopy kernel to wrap.
    :arg output_arg: the kernel argument holding the final output.
    :arg builder: the slate kernel builder supplying coefficients,
        initialisations, TSFC calls and wrapper arguments.
    :arg var2terminal: mapping from slate variables to terminal tensors.
    :returns: a loopy program with all TSFC kernels and the slate kernel
        registered as callables.
    """
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # In the initialisation the loopy tensors for the terminals are generated
    # Those are the needed again for generating the TSFC calls
    inits, tensor2temp = builder.initialise_terminals(var2terminal, builder.bag.coefficients)
    terminal_tensors = list(filter(lambda x: isinstance(x, sl.Tensor), var2terminal.values()))
    # One (call, kernel) pair per TSFC call; unzip into parallel tuples.
    tsfc_calls, tsfc_kernels = zip(*itertools.chain.from_iterable(
        (builder.generate_tsfc_calls(terminal, tensor2temp[terminal])
         for terminal in terminal_tensors)))

    # Construct args
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp, tsfc_kernels)

    # Munge instructions
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    # Generates the loopy wrapper kernel
    slate_wrapper = lp.make_function(domains, insns, args, name="slate_wrapper",
                                     seq_dependencies=True, target=lp.CTarget())

    # Generate program from kernel, so that one can register kernels
    prg = make_program(slate_wrapper)
    for tsfc_loopy in tsfc_kernels:
        prg = register_callable_kernel(prg, tsfc_loopy)
    prg = register_callable_kernel(prg, slate_loopy)
    return prg
def build_loopy_kernel_b_text():
    """Build C source and header text for the element-vector kernel b.

    Returns a tuple ``(kernel name, example call string, C source, C header)``.
    """
    knl_name = "kernel_tensor_b"
    # CONSISTENCY FIX: use the knl_name variable instead of repeating the
    # string literal (matches build_loopy_kernel_A_text; the two can no
    # longer drift apart).
    knl = lp.make_kernel("{ [i]: 0<=i<n }",
                         """
                         b[i] = c
                         """,
                         name=knl_name,
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
                         target=lp.CTarget())
    knl = lp.add_and_infer_dtypes(knl, {
        "b": np.dtype(np.double),
        "c": np.dtype(np.double)
    })
    knl = lp.fix_parameters(knl, n=3)
    #print(knl)

    # Emit device code and header, then normalise the restrict qualifier.
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_b(b, Ae / 6.0);"
    return knl_name, knl_call, knl_c, knl_h
def __init__(self, cd_proto, target=lp.CTarget(), test_instructions=False,
             print_domains=False, print_instructions=False,
             print_variables=False, print_assumptions=False,
             print_codegen=True):
    """Load a program proto and immediately drive code generation.

    :arg cd_proto: path to the input proto file to load.
    :arg target: loopy code-generation target (defaults to C).
    :arg test_instructions: verbose-debug flag.
    :arg print_*: verbose-debug flags controlling which stages are printed.

    NOTE(review): ``lp.CTarget()`` as a default is evaluated once at class
    definition time and shared by all instances — presumably harmless for a
    stateless target, but confirm.
    """
    # Verbose debugging options
    self.test_instructions = test_instructions
    self.print_domains = print_domains
    self.print_instructions = print_instructions
    self.print_variables = print_variables
    self.print_assumptions = print_assumptions
    self.print_codegen = print_codegen

    # Target output
    self.target = target

    # Reading from proto
    self.input_proto = cd_proto
    self.output_dir, self.output_file = os.path.split(self.input_proto)
    # Proto name is the filename up to the first '.'
    self.proto_name = self.output_file.split('.')[0]
    program = load_store.load_program(self.input_proto)
    self.graph = program.graph
    self.templates = program.templates

    self.kernels = {}
    self.program = None

    # Kick off generation as a constructor side effect.
    self.function_gen()
def test_inverse_callable(self, zero_mat, inv_mat):
    """Check a loopy kernel calling a LAPACK-backed ``inv()`` against numpy.

    :arg zero_mat: output Mat fixture (result written here, compared at the end).
    :arg inv_mat: input Mat fixture to invert.
    """
    loopy.set_caching_enabled(False)
    k = loopy.make_kernel(
        ["{[i,j] : 0 <= i,j < 2}"],
        """
        B[:,:] = inv(A[:,:])
        """,
        [
            loopy.GlobalArg('B', dtype=np.float64, shape=(2, 2)),
            loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2))
        ],
        target=loopy.CTarget(),
        name="callable_kernel",
        lang_version=(2018, 2))

    k = loopy.register_function_id_to_in_knl_callable_mapper(k, inv_fn_lookup)
    code = loopy.generate_code_v2(k).device_code()
    # BUG FIX: str.replace returns a *new* string (strings are immutable);
    # the original call discarded the result, so the kernel was never
    # actually made static. Reassign the result.
    code = code.replace('void callable_kernel', 'static void callable_kernel')
    loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])
    op2.par_loop(loopykernel, zero_mat.dataset.set,
                 zero_mat(op2.WRITE), inv_mat(op2.READ))
    expected = np.linalg.inv(inv_mat.data)
    assert np.allclose(expected, zero_mat.data)
def merge_loopy(slate_loopy, output_arg, builder, var2terminal, name):
    """ Merges tsfc loopy kernels and slate loopy kernel into a wrapper kernel.

    :arg slate_loopy: the slate loopy kernel to wrap.
    :arg output_arg: the kernel argument holding the final output.
    :arg builder: the slate kernel builder supplying coefficients,
        initialisations, TSFC calls and wrapper arguments.
    :arg var2terminal: mapping from slate variables to terminal tensors.
    :arg name: name for the generated wrapper kernel.
    :returns: the merged wrapper kernel with argument dimensions matched
        between callers and callees.
    """
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # In the initialisation the loopy tensors for the terminals are generated
    # Those are the needed again for generating the TSFC calls
    inits, tensor2temp = builder.initialise_terminals(var2terminal, builder.bag.coefficients)
    # Only unassembled terminal tensors need TSFC calls.
    terminal_tensors = list(filter(lambda x: (x.terminal and not x.assembled),
                                   var2terminal.values()))
    calls_and_kernels = tuple((c, k) for terminal in terminal_tensors
                              for c, k in builder.generate_tsfc_calls(terminal,
                                                                      tensor2temp[terminal]))
    if calls_and_kernels:  # tsfc may not give a kernel back
        tsfc_calls, tsfc_kernels = zip(*calls_and_kernels)
    else:
        tsfc_calls = ()
        tsfc_kernels = ()

    # Construct args
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp)

    # Munge instructions
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    # Generates the loopy wrapper kernel
    slate_wrapper = lp.make_function(domains, insns, args, name=name,
                                     seq_dependencies=True, target=lp.CTarget())

    # Generate program from kernel, so that one can register kernels
    from pyop2.codegen.loopycompat import _match_caller_callee_argument_dimension_
    from loopy.kernel.function_interface import CallableKernel

    # Merge every TSFC kernel in, then reconcile argument dimensions for
    # each callable it contributed.
    # NOTE(review): the loop variable `name` below shadows the `name`
    # parameter; harmless here (the parameter was already consumed by
    # make_function above), but worth renaming.
    for tsfc_loopy in tsfc_kernels:
        slate_wrapper = merge([slate_wrapper, tsfc_loopy])
        names = tsfc_loopy.callables_table
        for name in names:
            if isinstance(slate_wrapper.callables_table[name], CallableKernel):
                slate_wrapper = _match_caller_callee_argument_dimension_(slate_wrapper, name)
    # Finally merge the slate kernel itself and reconcile its callables too.
    slate_wrapper = merge([slate_wrapper, slate_loopy])
    names = slate_loopy.callables_table
    for name in names:
        if isinstance(slate_wrapper.callables_table[name], CallableKernel):
            slate_wrapper = _match_caller_callee_argument_dimension_(slate_wrapper, name)
    return slate_wrapper
def loopy_example():
    """Minimal loopy demo: generate and print C code for out[i] = 2*a[i]."""
    kernel = lp.make_kernel("{ [i]: 0<=i<n }",
                            "out[i] = 2*a[i]",
                            lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
                            target=lp.CTarget())
    # 'out' and 'n' are inferred from 'a'.
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.dtype(np.float32)})
    generated = lp.generate_code_v2(kernel)
    print(generated.device_code())
def generate(impero_c, args, precision, scalar_type, kernel_name="loopy_kernel", index_names=()):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg precision: floating-point precision for printing
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    # BUG FIX: the default for index_names was a mutable list ([]), which is
    # shared across calls. An empty tuple is behaviourally identical here
    # (defaultdict accepts any iterable of pairs) and immutable.
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.precision = precision
    ctx.scalar_type = scalar_type
    ctx.epsilon = 10.0 ** (-precision)

    # Create arguments
    data = list(args)
    for i, temp in enumerate(impero_c.temporaries):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            # Constant temporaries become read-only, pre-initialised locals.
            data.append(lp.TemporaryVariable(name, shape=temp.shape,
                                             dtype=temp.array.dtype,
                                             initializer=temp.array,
                                             address_space=lp.AddressSpace.LOCAL,
                                             read_only=True))
        else:
            # Non-constant temporaries are indexed by their free indices
            # in addition to their own shape.
            shape = tuple([i.extent for i in ctx.indices[temp]]) + temp.shape
            data.append(lp.TemporaryVariable(name, shape=shape,
                                             dtype=numpy.float64,
                                             initializer=None,
                                             address_space=lp.AddressSpace.LOCAL,
                                             read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains: one 0 <= idx < extent box per index.
    domains = []
    for idx, extent in ctx.index_extent.items():
        inames = isl.make_zero_and_vars([idx])
        domains.append(((inames[0].le_set(inames[idx]))
                        & (inames[idx].lt_set(inames[0] + extent))))

    if not domains:
        domains = [isl.BasicSet("[] -> {[]}")]

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name,
                           target=lp.CTarget(), seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"])

    # Prevent loopy interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    # Help loopy in scheduling by assigning priority to instructions
    insn_new = []
    for i, insn in enumerate(knl.instructions):
        insn_new.append(insn.copy(priority=len(knl.instructions) - i))
    knl = knl.copy(instructions=insn_new)

    return knl
def generate_kernel():
    '''Generates and returns source and header for a kernel using loopy'''
    kernel = lp.make_kernel("{ [i]: 0<=i<n }",
                            "out[i] = 2*a[i]",
                            lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
                            target=lp.CTarget())
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.dtype(np.double)})
    #knl = lp.split_iname(knl, "i", 4)
    #knl = lp.tag_inames(knl, dict(i_inner="unr"))
    source = lp.generate_code_v2(kernel).all_code()
    header = str(lp.generate_header(kernel)[0])
    return source, header
def test_prefetch_through_indirect_access():
    """add_prefetch through a doubly-indirect access must raise LoopyError."""
    kernel = lp.make_kernel(
        "{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}",
        """
        for i, j, k
            a[map1[indirect[i], j], k] = 2
        end
        """,
        [
            lp.GlobalArg("a", strides=(2, 1), dtype=int),
            lp.GlobalArg("map1", shape=(10, 10), dtype=int),
            "..."
        ],
        target=lp.CTarget())
    kernel = lp.prioritize_loops(kernel, "i,j,k")

    # map1 is indexed through indirect[i], which prefetching cannot analyze.
    with pytest.raises(LoopyError):
        kernel = lp.add_prefetch(kernel, "map1[:, j]")
def create_loop_kernel(component_name, domains, instructions, edges, signature):
    """Assemble a loopy function for one component.

    :arg component_name: name for the generated loopy function.
    :arg domains: raw domain description, resolved via create_domain_string.
    :arg instructions: loopy instruction list/body for the kernel.
    :arg edges: graph edges used to derive domains and global arguments.
    :arg signature: component signature used to derive global arguments.
    :returns: the loopy kernel with instruction dependencies added.
    """
    domains, assumption_string = create_domain_string(domains, edges)
    # IDIOM FIX: renamed from `globals`, which shadowed the builtin of the
    # same name. Purely local rename; behavior unchanged.
    global_args = create_globals(signature, edges)
    knl = lp.make_function(
        domains,
        instructions,
        global_args + ["..."],
        name=component_name,
        assumptions=assumption_string,
        target=lp.CTarget()
    )
    knl = add_instruction_deps(knl)
    # Only the entry-point kernel is printed for inspection.
    if component_name == 'main':
        print(lp.generate_code_v2(knl).device_code())
    return knl
def test_reduction_with_conditional():
    # Test whether realization of a reduction inherits predicates
    # of the original instruction. Tested with the CTarget, because
    # the PyOpenCL target will hoist the conditional into the host
    # code in this minimal example.
    kernel = lp.make_kernel(
        "{ [i] : 0<=i<42 }",
        """
        if n > 0
            <>b = sum(i, a[i])
        end
        """,
        [lp.GlobalArg("a", dtype=np.float32, shape=(42,)),
         lp.GlobalArg("n", dtype=np.float32, shape=())],
        target=lp.CTarget())
    body = lp.generate_body(kernel)

    # Check that the if appears before the loop that realizes the reduction.
    assert body.index("if") < body.index("for")
def make_extruded_coords(extruded_topology, base_coords, ext_coords,
                         layer_height, extrusion_type='uniform', kernel=None):
    """
    Given either a kernel or a (fixed) layer_height, compute an
    extruded coordinate field for an extruded mesh.

    :arg extruded_topology: an :class:`~.ExtrudedMeshTopology` to extrude
         a coordinate field for.
    :arg base_coords: a :class:`~.Function` to read the base
         coordinates from.
    :arg ext_coords: a :class:`~.Function` to write the extruded
         coordinates into.
    :arg layer_height: the height for each layer.  Either a scalar,
         where layers will be equi-spaced at the specified height, or a
         1D array of variable layer heights to use through the extrusion.
    :arg extrusion_type: the type of extrusion to use.  Predefined
         options are either "uniform" (creating equi-spaced layers by
         extruding in the (n+1)dth direction), "radial" (creating
         equi-spaced layers by extruding in the outward direction from
         the origin) or "radial_hedgehog" (creating equi-spaced layers
         by extruding coordinates in the outward cell-normal direction,
         needs a P1dgxP1 coordinate field).
    :arg kernel: an optional kernel to carry out coordinate extrusion.

    The kernel signature (if provided) is::

        void kernel(double **base_coords, double **ext_coords,
                    double *layer_height, int layer)

    The kernel iterates over the cells of the mesh and receives as
    arguments the coordinates of the base cell (to read), the
    coordinates on the extruded cell (to write to), the fixed layer
    height, and the current cell layer.
    """
    _, vert_space = ext_coords.function_space().ufl_element().sub_elements()[0].sub_elements()
    if kernel is None and not (vert_space.degree() == 1
                               and vert_space.family() in ['Lagrange',
                                                           'Discontinuous Lagrange']):
        raise RuntimeError(
            'Extrusion of coordinates is only possible for a P1 or P1dg interval unless a custom kernel is provided'
        )
    layer_height = numpy.atleast_1d(numpy.array(layer_height, dtype=RealType))

    if layer_height.ndim > 1:
        raise RuntimeError('Extrusion layer height should be 1d or scalar')

    if layer_height.size > 1:
        # Variable heights: store cumulative heights (with a leading 0)
        # so layer_height[k] is the absolute height of layer k.
        layer_height = numpy.cumsum(numpy.concatenate(([0], layer_height)))

    layer_heights = layer_height.size
    layer_height = op2.Global(layer_heights, layer_height, dtype=RealType)

    if kernel is not None:
        # User-supplied kernel: run it directly and return.
        op2.ParLoop(kernel,
                    ext_coords.cell_set,
                    ext_coords.dat(op2.WRITE, ext_coords.cell_node_map()),
                    base_coords.dat(op2.READ, base_coords.cell_node_map()),
                    layer_height(op2.READ),
                    pass_layer_arg=True,
                    is_loopy_kernel=True).compute()
        return
    ext_fe = create_element(ext_coords.ufl_element())
    ext_shape = ext_fe.index_shape
    base_fe = create_element(base_coords.ufl_element())
    base_shape = base_fe.index_shape
    data = []
    data.append(lp.GlobalArg("ext_coords", dtype=ScalarType, shape=ext_shape))
    data.append(lp.GlobalArg("base_coords", dtype=ScalarType, shape=base_shape))
    data.append(lp.GlobalArg("layer_height", dtype=RealType, shape=(layer_heights, )))
    data.append(lp.ValueArg('layer'))
    base_coord_dim = base_coords.function_space().value_size
    # Deal with tensor product cells
    adim = len(ext_shape) - 2

    # handle single or variable layer heights
    if layer_heights == 1:
        # Single height: uniform spacing, scaled by the layer number.
        height_var = "layer_height[0] * (layer + l)"
    else:
        # Variable heights: look up the cumulative height directly.
        height_var = "layer_height[layer + l]"

    def _get_arity_axis_inames(_base):
        # One iname per arity axis, e.g. ('d0', 'd1') for adim == 2.
        return tuple(_base + str(i) for i in range(adim))

    def _get_lp_domains(_inames, _extents):
        # Build ISL box domains 0 <= iname < extent for each pair.
        domains = []
        for idx, extent in zip(_inames, _extents):
            inames = isl.make_zero_and_vars([idx])
            domains.append(((inames[0].le_set(inames[idx]))
                            & (inames[idx].lt_set(inames[0] + extent))))
        return domains

    if extrusion_type == 'uniform':
        domains = []
        dd = _get_arity_axis_inames('d')
        domains.extend(_get_lp_domains(dd, ext_shape[:adim]))
        domains.extend(_get_lp_domains(('c', ), (base_coord_dim, )))
        if layer_heights == 1:
            domains.extend(_get_lp_domains(('l', ), (2, )))
        else:
            # Constrain l by the runtime 'layer' parameter.
            domains.append(
                "[layer] -> { [l] : 0 <= l <= 1 & 0 <= l + layer < %d}" % layer_heights)
        instructions = """
        ext_coords[{dd}, l, c] = base_coords[{dd}, c]
        ext_coords[{dd}, l, {base_coord_dim}] = ({hv})
        """.format(dd=', '.join(dd),
                   base_coord_dim=base_coord_dim,
                   hv=height_var)
        name = "pyop2_kernel_uniform_extrusion"
    elif extrusion_type == 'radial':
        domains = []
        dd = _get_arity_axis_inames('d')
        domains.extend(_get_lp_domains(dd, ext_shape[:adim]))
        domains.extend(_get_lp_domains(('c', 'k'), (base_coord_dim, ) * 2))
        if layer_heights == 1:
            domains.extend(_get_lp_domains(('l', ), (2, )))
        else:
            domains.append(
                "[layer] -> { [l] : 0 <= l <= 1 & 0 <= l + layer < %d}" % layer_heights)
        # tt = |base_coords|; extrude along the unit radial direction.
        instructions = """
        <{RealType}> tt[{dd}] = 0
        <{RealType}> bc[{dd}] = 0
        for k
            bc[{dd}] = real(base_coords[{dd}, k])
            tt[{dd}] = tt[{dd}] + bc[{dd}] * bc[{dd}]
        end
        tt[{dd}] = sqrt(tt[{dd}])
        ext_coords[{dd}, l, c] = base_coords[{dd}, c] + base_coords[{dd}, c] * ({hv}) / tt[{dd}]
        """.format(RealType=RealType,
                   dd=', '.join(dd),
                   hv=height_var)
        name = "pyop2_kernel_radial_extrusion"
    elif extrusion_type == 'radial_hedgehog':
        # Only implemented for interval in 2D and triangle in 3D.
        # gdim != tdim already checked in ExtrudedMesh constructor.
        tdim = base_coords.ufl_domain().ufl_cell().topological_dimension()
        if tdim not in [1, 2]:
            raise NotImplementedError(
                "Hedgehog extrusion not implemented for %s"
                % base_coords.ufl_domain().ufl_cell())
        # tdim == 1:
        #
        # normal is:
        # (0 -1) (x2 - x1)
        # (1  0) (y2 - y1)
        #
        # tdim == 2:
        # normal is
        # v0 x v1
        #
        #    /\
        # v0/  \
        #  /    \
        # /------\
        #    v1
        domains = []
        dd = _get_arity_axis_inames('d')
        _dd = _get_arity_axis_inames('_d')
        domains.extend(_get_lp_domains(dd, ext_shape[:adim]))
        domains.extend(_get_lp_domains(_dd, ext_shape[:adim]))
        domains.extend(
            _get_lp_domains(('c0', 'c1', 'c2', 'c3', 'k', 'l'),
                            (base_coord_dim, ) * 5 + (2, )))
        # Formula for normal, n
        n_1_1 = """
        n[0] = -bc[1, 1] + bc[0, 1]
        n[1] = bc[1, 0] - bc[0, 0]
        """
        n_2_1 = """
        v0[c3] = bc[1, c3] - bc[0, c3]
        v1[c3] = bc[2, c3] - bc[0, c3]
        n[0] = v0[1] * v1[2] - v0[2] * v1[1]
        n[1] = v0[2] * v1[0] - v0[0] * v1[2]
        n[2] = v0[0] * v1[1] - v0[1] * v1[0]
        """
        n_2_2 = """
        v0[c3] = bc[0, 1, c3] - bc[0, 0, c3]
        v1[c3] = bc[1, 0, c3] - bc[0, 0, c3]
        n[0] = v0[1] * v1[2] - v0[2] * v1[1]
        n[1] = v0[2] * v1[0] - v0[0] * v1[2]
        n[2] = v0[0] * v1[1] - v0[1] * v1[0]
        """
        # Select the normal formula by (tdim, arity dimension).
        n_dict = {1: {1: n_1_1},
                  2: {1: n_2_1,
                      2: n_2_2}}
        # Flip the normal to point outward (dot with cell centroid x),
        # then extrude along n / |n|.
        instructions = """
        <{RealType}> dot = 0
        <{RealType}> norm = 0
        <{RealType}> v0[c2] = 0
        <{RealType}> v1[c2] = 0
        <{RealType}> n[c2] = 0
        <{RealType}> x[c2] = 0
        <{RealType}> bc[{_dd}, c1] = real(base_coords[{_dd}, c1])
        for {_dd}
            x[c1] = x[c1] + bc[{_dd}, c1]
        end
        {ninst}
        for k
            dot = dot + x[k] * n[k]
            norm = norm + n[k] * n[k]
        end
        norm = sqrt(norm)
        norm = -norm if dot < 0 else norm
        ext_coords[{dd}, l, c0] = base_coords[{dd}, c0] + n[c0] * ({hv}) / norm
        """.format(RealType=RealType,
                   dd=', '.join(dd),
                   _dd=', '.join(_dd),
                   ninst=n_dict[tdim][adim],
                   hv=height_var)
        name = "pyop2_kernel_radial_hedgehog_extrusion"
    else:
        raise NotImplementedError('Unsupported extrusion type "%s"' % extrusion_type)

    ast = lp.make_function(domains, instructions, data, name=name,
                           target=lp.CTarget(), seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"])
    kernel = op2.Kernel(ast, name)
    op2.ParLoop(kernel,
                ext_coords.cell_set,
                ext_coords.dat(op2.WRITE, ext_coords.cell_node_map()),
                base_coords.dat(op2.READ, base_coords.cell_node_map()),
                layer_height(op2.READ),
                pass_layer_arg=True,
                is_loopy_kernel=True).compute()
def generate(builder, wrapper_name=None):
    """Turn a wrapper builder's instruction stream into a loopy wrapper kernel.

    :arg builder: the wrapper kernel builder (supplies instructions,
        arguments, loop indices and the user kernel).
    :arg wrapper_name: optional name for the wrapper; defaults to
        ``wrap_<kernel name>``.
    :returns: a loopy kernel with the user kernel registered (inlined when
        it is itself a loopy kernel, otherwise added to the preamble).
    """
    if builder.layer_index is not None:
        outer_inames = frozenset([builder._loop_index.name,
                                  builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [tuple(merger(i) for i in inits)
                           for inits in mapper.initialisers]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node, (Index, RuntimeIndex, Variable, Argument, NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            # Re-number per (prefix, postfix) pair so counters restart at 0.
            replacements[node] = "%s%d%s" % (prefix,
                                             next(counter[(prefix, postfix)]),
                                             postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [rename_nodes(inits, replacements)
                           for inits in mapper.initialisers]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments,
                                                replacements)
    # Layer extents must be renamed consistently with everything else.
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents],
                        replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames,
                                 parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remote the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements
                      if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space, {"n": 1, "start": -1})))
            else:
                new_domains.append(d)
        domains = new_domains

    if builder.extruded:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder.layer_index.name:
                # layer = t1 - 1
                t1 = parameters.layer_end
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space, {"layer": 1, t1: -1, 1: 1})))
            else:
                new_domains.append(d)
        domains = new_domains

    assumptions, = reduce(operator.and_,
                          parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True,
                            ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    # Add iteration-range sanity assumptions (start <= end, etc.).
    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(
            pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(["#include <math.h>",
                             "#include <complex.h>",
                             "#include <petsc.h>"])
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        # Loopy user kernel: register, match dimensions, then inline.
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import _match_caller_callee_argument_dimension_
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code,
                              tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
        wrapper, petsc_function_lookup)
    return wrapper
def build_loopy_kernel_A_auto():
    """Build the element-matrix kernel A by assembling loopy objects manually.

    Demonstrates constructing domains, arguments and instructions
    programmatically (instead of from strings).  Returns a tuple
    ``(kernel name, example call string, C source, C header)``.
    """
    knl_name = "kernel_tensor_A"

    # Inputs to the kernel
    arg_names = ["A", "B", "c"]
    # Kernel parameters that will be fixed later
    param_names = ["n", "m"]
    # Tuples of inames and extents of their loops
    loops = [("i", "n"), ("j", "n"), ("k", "m")]

    # Generate the domains for the loops
    isl_domains = []
    for idx, extent in loops:
        # Create dict of loop variables (inames) and parameters
        vs = isl.make_zero_and_vars([idx], [extent])
        # Create the loop domain using '<=' and '>' restrictions
        isl_domains.append(((vs[0].le_set(vs[idx]))
                            & (vs[idx].lt_set(vs[0] + vs[extent]))))

    print("ISL loop domains:")
    print(isl_domains)
    print("")

    # Generate pymbolic variables for all used symbols
    args = {arg: pb.Variable(arg) for arg in arg_names}
    params = {param: pb.Variable(param) for param in param_names}
    inames = {iname: pb.Variable(iname) for iname, extent in loops}

    # Input arguments for the loopy kernel
    lp_args = {
        "A": lp.GlobalArg("A", dtype=np.double,
                          shape=(params["n"], params["n"])),
        "B": lp.GlobalArg("B", dtype=np.double,
                          shape=(params["m"], params["n"])),
        "c": lp.ValueArg("c", dtype=np.double)
    }

    # Generate the list of arguments & parameters that will be passed to loopy
    data = []
    data += [arg for arg in lp_args.values()]
    data += [lp.ValueArg(param) for param in ["n", "m"]]

    # Build the kernel instruction: computation and assignment of the element matrix
    def build_ass():
        """ A[i,j] = c*sum(k, B[k,i]*B[k,j]) """
        # The target of the assignment
        target = pb.Subscript(args["A"], (inames["i"], inames["j"]))

        # The rhs expression: A reduce operation of the matrix columns
        # Maybe replace with manual increment?
        reduce_op = lp.library.reduction.SumReductionOperation()
        reduce_expr = pb.Subscript(args["B"], (inames["k"], inames["i"])) * pb.Subscript(
            args["B"], (inames["k"], inames["j"]))
        expr = args["c"] * lp.Reduction(reduce_op, inames["k"], reduce_expr)

        return lp.Assignment(target, expr)

    ass = build_ass()
    print("Assignment expression:")
    print(ass)
    print("")
    instructions = [ass]

    # Construct the kernel
    knl = lp.make_kernel(isl_domains, instructions, data,
                         name=knl_name,
                         target=lp.CTarget(),
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    print(knl)
    print("")

    # Generate kernel code
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])
    print(knl_c)
    print("")

    # Postprocess kernel code
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
def generate(impero_c, args, scalar_type, kernel_name="loopy_kernel", index_names=(),
             return_increments=True):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names
    :arg return_increments: Does codegen for Return nodes increment the lvalue, or assign?
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    # BUG FIX: the default for index_names was a mutable list ([]), which is
    # shared across calls. An empty tuple is behaviourally identical here
    # (defaultdict accepts any iterable of pairs) and immutable.
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.epsilon = numpy.finfo(scalar_type).resolution
    ctx.scalar_type = scalar_type
    ctx.return_increments = return_increments

    # Create arguments
    data = list(args)
    for i, (temp, dtype) in enumerate(
            assign_dtypes(impero_c.temporaries, scalar_type)):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            # Constant temporaries become read-only, pre-initialised locals.
            data.append(
                lp.TemporaryVariable(name,
                                     shape=temp.shape,
                                     dtype=dtype,
                                     initializer=temp.array,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=True))
        else:
            # Non-constant temporaries are indexed by their free indices
            # in addition to their own shape.
            shape = tuple([i.extent for i in ctx.indices[temp]]) + temp.shape
            data.append(
                lp.TemporaryVariable(name,
                                     shape=shape,
                                     dtype=dtype,
                                     initializer=None,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains
    domains = create_domains(ctx.index_extent.items())

    # Create loopy kernel
    knl = lp.make_function(domains,
                           instructions,
                           data,
                           name=kernel_name,
                           target=lp.CTarget(),
                           seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"],
                           lang_version=(2018, 2))

    # Prevent loopy interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    return knl
def __generate_loopy(self, knl_name: str, verbose: bool = False, **kwargs):
    """Generate cell kernel for the Laplace operator using Loopy

    :arg knl_name: name of the generated kernel function.
    :arg verbose: when True, print intermediate loopy objects and code.
    :returns: tuple of (C source text, C header text).

    Reads ``self.n_dof`` and ``self.n_dim`` to fix the kernel's parameters.
    """
    n_dof, n_dim = self.n_dof, self.n_dim

    # Inputs to the kernel
    arg_names = ["A_T", "A0", "G_T"]
    # Kernel parameters that will be fixed later
    param_names = ["n", "m"]
    # Tuples of inames and extents of their loops
    loops = [("i", "n"), ("j", "n"), ("k", "m")]

    # Generate the domains for the loops
    isl_domains = []
    for idx, extent in loops:
        # Create dict of loop variables (inames) and parameters
        vs = isl.make_zero_and_vars([idx], [extent])
        # Create the loop domain using '<=' and '>' restrictions
        isl_domains.append(((vs[0].le_set(vs[idx]))
                            & (vs[idx].lt_set(vs[0] + vs[extent]))))

    if verbose:
        print("ISL loop domains:")
        print(isl_domains)
        print("")

    # Generate pymbolic variables for all used symbols
    args = {arg: pb.Variable(arg) for arg in arg_names}
    params = {param: pb.Variable(param) for param in param_names}
    inames = {iname: pb.Variable(iname) for iname, extent in loops}

    # Input arguments for the loopy kernel
    n, m = params["n"], params["m"]
    lp_args = {
        "A_T": lp.GlobalArg("A_T", dtype=np.double, shape=(n, n)),
        "A0": lp.GlobalArg("A0", dtype=np.double, shape=(n, n, m)),
        # NOTE(review): shape=(m) is not a tuple — it is just `m`; loopy
        # seems to accept a bare extent here, but (m,) would be clearer.
        "G_T": lp.GlobalArg("G_T", dtype=np.double, shape=(m))
    }

    # Generate the list of arguments & parameters that will be passed to loopy
    data = []
    data += [arg for arg in lp_args.values()]
    data += [lp.ValueArg(param) for param in param_names]

    # Build the kernel instruction: computation and assignment of the element matrix
    def build_ass():
        # A_T[i,j] = sum(k, A0[i,j,k] * G_T[k]);

        # Get variable symbols for all required variables
        i, j, k = inames["i"], inames["j"], inames["k"]
        A_T, A0, G_T = args["A_T"], args["A0"], args["G_T"]

        # The target of the assignment
        target = pb.Subscript(A_T, (i, j))

        # The rhs expression: Frobenius inner product <A0[i,j],G_T>
        reduce_op = lp.library.reduction.SumReductionOperation()
        reduce_expr = pb.Subscript(A0, (i, j, k)) * pb.Subscript(G_T, (k))
        expr = lp.Reduction(reduce_op, k, reduce_expr)

        return lp.Assignment(target, expr)

    ass = build_ass()

    if verbose:
        print("Assignment expression:")
        print(ass)
        print("")

    instructions = [ass]

    # Construct the kernel
    knl = lp.make_kernel(isl_domains, instructions, data,
                         name=knl_name,
                         target=lp.CTarget(),
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)
    # Fix n to the dof count and m to the flattened geometry-tensor size.
    knl = lp.fix_parameters(knl, n=n_dof, m=n_dim**2)
    knl = lp.prioritize_loops(knl, "i,j")

    if verbose:
        print("")
        print(knl)
        print("")

    # Generate kernel code
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    if verbose:
        print(knl_c)
        print("")

    # Postprocess kernel code
    knl_c = knl_c.replace("__restrict__", "restrict")
    knl_h = knl_h.replace("__restrict__", "restrict")

    return knl_c, knl_h
def _form_loopy_kernel(kernel_domains, instructions, measure, args, **kwargs):
    """Build (or fetch from cache) a pyop2 Kernel for a par_loop.

    :arg kernel_domains: ISL-style domain string; ``var.dofs`` placeholders
        are substituted with the concrete dof counts derived from args.
    :arg instructions: loopy instruction string/list for the kernel body.
    :arg measure: the UFL measure; interior-facet integrals double the
        per-argument dof count.
    :arg args: mapping of kernel variable name -> (function, access intent).
    :returns: a :class:`pyop2.Kernel`, cached by (domains, instructions,
        kwargs) when a kernel cache is available.
    """
    kargs = []

    for var, (func, intent) in args.items():
        if isinstance(func, constant.Constant):
            if intent is not READ:
                raise RuntimeError("Only READ access is allowed to Constant")
            # Constants modelled as Globals, so no need for double
            # indirection
            ndof = func.dat.cdim
            kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
        else:
            # Do we have a component of a mixed function?
            if isinstance(func, Indexed):
                c, i = func.ufl_operands
                idx = i._indices[0]._value
                ndof = c.function_space()[idx].finat_element.space_dimension()
                cdim = c.dat[idx].cdim
                dtype = c.dat[idx].dtype
            else:
                if func.function_space().ufl_element().family() == "Real":
                    ndof = func.function_space().dim()  # == 1
                    kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
                    continue
                else:
                    if len(func.function_space()) > 1:
                        raise NotImplementedError("Must index mixed function in par_loop.")
                    ndof = func.function_space().finat_element.space_dimension()
                    cdim = func.dat.cdim
                    dtype = func.dat.dtype
            if measure.integral_type() == 'interior_facet':
                # Interior facets see dofs from both adjacent cells.
                ndof *= 2
            # FIXME: shape for facets [2][ndof]?
            kargs.append(loopy.GlobalArg(var, dtype=dtype, shape=(ndof, cdim)))
        # Substitute the symbolic dof count into the domain string.
        kernel_domains = kernel_domains.replace(var+".dofs", str(ndof))

    if kernel_domains == "":
        kernel_domains = "[] -> {[]}"
    try:
        key = (kernel_domains, tuple(instructions), tuple(map(tuple, kwargs.items())))
        if kernel_cache is not None:
            return kernel_cache[key]
        else:
            raise KeyError("No cache")
    except KeyError:
        # Cache miss (or no cache): build the kernel.
        # The trailing Ellipsis lets loopy infer any remaining arguments.
        kargs.append(...)
        knl = loopy.make_function(kernel_domains, instructions, kargs,
                                  seq_dependencies=True,
                                  name="par_loop_kernel",
                                  silenced_warnings=["summing_if_branches_ops"],
                                  target=loopy.CTarget())
        knl = pyop2.Kernel(knl, "par_loop_kernel", **kwargs)
        if kernel_cache is not None:
            return kernel_cache.setdefault(key, knl)
        else:
            return knl