def get_injection_kernel(fiat_element, unique_indices, dim=1):
    weights = get_injection_weights(fiat_element)[unique_indices].T
    ncdof = weights.shape[0]
    nfdof = weights.shape[1]
    # What if we have multiple nodes in same location (DG)?  Divide by
    # rowsum.
    weights = weights / np.sum(weights, axis=1).reshape(-1, 1)
    all_same = np.allclose(weights, weights[0, 0])
    arglist = [ast.Decl("double", ast.Symbol("coarse", (ncdof * dim, ))),
               ast.Decl("double", ast.Symbol("*restrict *restrict fine", ()),
                        qualifiers=["const"])]
    if all_same:
        w_sym = ast.Symbol("weights", ())
        w = [ast.Decl("double", w_sym, weights[0, 0], qualifiers=["const"])]
    else:
        init = ast.ArrayInit(format_array_literal(weights))
        w_sym = ast.Symbol("weights", (ncdof, nfdof))
        w = [ast.Decl("double", w_sym, init, qualifiers=["const"])]
    i = ast.Symbol("i", ())
    j = ast.Symbol("j", ())
    k = ast.Symbol("k", ())
    if all_same:
        assign = ast.Prod(ast.Symbol("fine", (j, k)), w_sym)
    else:
        assign = ast.Prod(ast.Symbol("fine", (j, k)),
                          ast.Symbol("weights", (i, j)))
    assignment = ast.Incr(ast.Symbol("coarse",
                                     (ast.Sum(k, ast.Prod(i, ast.c_sym(dim))), )),
                          assign)
    k_loop = ast.For(ast.Decl("int", k, ast.c_sym(0)),
                     ast.Less(k, ast.c_sym(dim)),
                     ast.Incr(k, ast.c_sym(1)),
                     ast.Block([assignment], open_scope=True))
    j_loop = ast.For(ast.Decl("int", j, ast.c_sym(0)),
                     ast.Less(j, ast.c_sym(nfdof)),
                     ast.Incr(j, ast.c_sym(1)),
                     ast.Block([k_loop], open_scope=True))
    i_loop = ast.For(ast.Decl("int", i, ast.c_sym(0)),
                     ast.Less(i, ast.c_sym(ncdof)),
                     ast.Incr(i, ast.c_sym(1)),
                     ast.Block([j_loop], open_scope=True))
    k = ast.FunDecl("void", "injection", arglist,
                    ast.Block(w + [i_loop]),
                    pred=["static", "inline"])
    return op2.Kernel(k, "injection", opts=parameters["coffee"])
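# A hedged, pure-NumPy reference (not part of the original module) for the
# loop nest the generated "injection" C kernel performs; `weights` is the
# (ncdof, nfdof) row-normalised array built above, `fine` a hypothetical
# (nfdof, dim) array of fine-level dof values.
def _injection_reference(weights, fine, dim=1):
    """Sketch: coarse[k + i*dim] += fine[j][k] * weights[i, j]."""
    import numpy as np
    ncdof, nfdof = weights.shape
    coarse = np.zeros(ncdof * dim)
    for i in range(ncdof):
        for j in range(nfdof):
            for k in range(dim):
                coarse[k + i * dim] += fine[j][k] * weights[i, j]
    return coarse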
def expression_kernel(expr, args):
    """Produce a :class:`pyop2.Kernel` from the processed UFL expression
    expr and the corresponding args."""

    # Empty slot indicating assignment to indexed LHS, so don't do anything
    if type(expr) is Zero:
        return

    fs = args[0].function.function_space()

    d = ast.Symbol("dim")
    ast_expr = _ast(expr)
    body = ast.Block((ast.Decl("int", d),
                      ast.For(ast.Assign(d, ast.Symbol(0)),
                              ast.Less(d, ast.Symbol(fs.dof_dset.cdim)),
                              ast.Incr(d, ast.Symbol(1)),
                              ast_expr)))

    return op2.Kernel(ast.FunDecl("void", "expression",
                                  [arg.arg for arg in args], body),
                      "expression")
def statement_block(tree, parameters):
    statements = [statement(child, parameters) for child in tree.children]
    declares = []
    for expr in parameters.declare[tree]:
        declares.append(coffee.Decl(SCALAR_TYPE,
                                    _decl_symbol(expr, parameters)))
    return coffee.Block(declares + statements, open_scope=True)
def ker_ind_reduce():
    incr = ast.Incr(ast.Symbol('A', ('i',)), ast.Symbol('B', (0, 0)))
    body = ItSpace().to_for([(0, 2)], ('i',), [incr])
    return ast.FunDecl('void', 'ker_ind_reduce',
                       [ast.Decl('int', 'A', qualifiers=['unsigned'], pointers=['']),
                        ast.Decl('int', 'B', qualifiers=['unsigned'], pointers=['', ''])],
                       ast.Block(body))
def ker_ind_inc():
    return ast.FunDecl(
        'void', 'ker_ind_inc',
        [ast.Decl('int', 'B', qualifiers=['unsigned'], pointers=['', '']),
         ast.Decl('int', 'A', qualifiers=['unsigned'], pointers=[''])],
        ast.Block([ast.Incr(ast.Symbol('B', (0, 0)), ast.Symbol('A', (0, )))]))
def statement_evaluate(leaf, parameters):
    expr = leaf.expression
    if isinstance(expr, gem.ListTensor):
        if parameters.declare[leaf]:
            array_expression = numpy.vectorize(lambda v: expression(v, parameters))
            return coffee.Decl(parameters.scalar_type,
                               _decl_symbol(expr, parameters),
                               coffee.ArrayInit(array_expression(expr.array),
                                                precision=parameters.precision))
        else:
            ops = []
            for multiindex, value in numpy.ndenumerate(expr.array):
                coffee_sym = _coffee_symbol(_ref_symbol(expr, parameters),
                                            rank=multiindex)
                ops.append(coffee.Assign(coffee_sym, expression(value, parameters)))
            return coffee.Block(ops, open_scope=False)
    elif isinstance(expr, gem.Constant):
        assert parameters.declare[leaf]
        return coffee.Decl(parameters.scalar_type,
                           _decl_symbol(expr, parameters),
                           coffee.ArrayInit(expr.array, parameters.precision),
                           qualifiers=["static", "const"])
    else:
        code = expression(expr, parameters, top=True)
        if parameters.declare[leaf]:
            return coffee.Decl(parameters.scalar_type,
                               _decl_symbol(expr, parameters), code)
        else:
            return coffee.Assign(_ref_symbol(expr, parameters), code)
def get_prolongation_kernel(fiat_element, unique_indices, dim=1):
    weights = restriction_weights(fiat_element)[unique_indices]
    nfdof = weights.shape[0]
    ncdof = weights.shape[1]
    arglist = [ast.Decl("double", ast.Symbol("fine", (nfdof * dim, ))),
               ast.Decl("double *restrict *restrict ", ast.Symbol("coarse", ()),
                        qualifiers=["const"])]
    all_same = np.allclose(weights, weights[0, 0])
    if all_same:
        w_sym = ast.Symbol("weights", ())
        w = [ast.Decl("double", w_sym, weights[0, 0], qualifiers=["const"])]
    else:
        w_sym = ast.Symbol("weights", (nfdof, ncdof))
        init = ast.ArrayInit(format_array_literal(weights))
        w = [ast.Decl("double", w_sym, init, qualifiers=["const"])]
    i = ast.Symbol("i", ())
    j = ast.Symbol("j", ())
    k = ast.Symbol("k", ())
    if all_same:
        assign = ast.Prod(ast.Symbol("coarse", (j, k)), w_sym)
    else:
        assign = ast.Prod(ast.Symbol("coarse", (j, k)),
                          ast.Symbol("weights", (i, j)))
    assignment = ast.Incr(ast.Symbol("fine",
                                     (ast.Sum(k, ast.Prod(i, ast.c_sym(dim))), )),
                          assign)
    k_loop = ast.For(ast.Decl("int", k, ast.c_sym(0)),
                     ast.Less(k, ast.c_sym(dim)),
                     ast.Incr(k, ast.c_sym(1)),
                     ast.Block([assignment], open_scope=True))
    j_loop = ast.For(ast.Decl("int", j, ast.c_sym(0)),
                     ast.Less(j, ast.c_sym(ncdof)),
                     ast.Incr(j, ast.c_sym(1)),
                     ast.Block([k_loop], open_scope=True))
    i_loop = ast.For(ast.Decl("int", i, ast.c_sym(0)),
                     ast.Less(i, ast.c_sym(nfdof)),
                     ast.Incr(i, ast.c_sym(1)),
                     ast.Block([j_loop], open_scope=True))
    k = ast.FunDecl("void", "prolongation", arglist,
                    ast.Block(w + [i_loop]),
                    pred=["static", "inline"])
    return op2.Kernel(k, "prolongation", opts=parameters["coffee"])
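# As with injection above, a hedged NumPy sketch (not in the original
# source) of the generated "prolongation" kernel: the transposed stencil,
# accumulating coarse dof values into the fine vector. `coarse` is a
# hypothetical (ncdof, dim) array of coarse-level dof values.
def _prolongation_reference(weights, coarse, dim=1):
    """Sketch: fine[k + i*dim] += coarse[j][k] * weights[i, j]."""
    import numpy as np
    nfdof, ncdof = weights.shape
    fine = np.zeros(nfdof * dim)
    for i in range(nfdof):
        for j in range(ncdof):
            for k in range(dim):
                fine[k + i * dim] += coarse[j][k] * weights[i, j]
    return fine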
def ker_write2d():
    return ast.FunDecl(
        'void', 'ker_write2d',
        [ast.Decl('int', 'V', qualifiers=['unsigned'], pointers=[''])],
        ast.Block([ast.Assign(ast.Symbol('V', (0, )), 1),
                   ast.Assign(ast.Symbol('V', (1, )), 2)]))
def construct_empty_kernel(self, name):
    """Construct an empty kernel function.

    Kernel will just zero the return buffer and do nothing else.

    :arg name: function name
    :returns: a COFFEE function definition object
    """
    body = coffee.Block([])  # empty block
    return self._construct_kernel_from_body(name, body)
def ker_reduce_ind_read():
    body = ast.Incr('a', ast.Prod(ast.Symbol('V', (0, 'i')),
                                  ast.Symbol('B', (0,))))
    body = ([ast.Decl('int', 'a', '0')]
            + ItSpace().to_for([(0, 2)], ('i',), [body])
            + [ast.Incr(ast.Symbol('A', (0,)), 'a')])
    return ast.FunDecl('void', 'ker_reduce_ind_read',
                       [ast.Decl('int', 'A', qualifiers=['unsigned'], pointers=['']),
                        ast.Decl('int', 'V', qualifiers=['unsigned'], pointers=['', '']),
                        ast.Decl('int', 'B', qualifiers=['unsigned'], pointers=[''])],
                       ast.Block(body))
def construct_kernel(self, name, args, body):
    """Construct a COFFEE function declaration with the
    accumulated glue code.

    :arg name: function name
    :arg args: function argument list
    :arg body: function body (:class:`coffee.Block` node)
    :returns: :class:`coffee.FunDecl` object
    """
    assert isinstance(body, coffee.Block)
    body_ = coffee.Block(self.prepare + body.children + self.finalise)
    return coffee.FunDecl("void", name, args, body_,
                          pred=["static", "inline"])
def extruded_top_bottom_facet(cxt_kernel, builder, coordsym,
                              mesh_layer_sym, cell_orientations):
    """Generates a code statement for evaluating exterior top/bottom
    facet integrals.

    :arg cxt_kernel: A :namedtuple:`ContextKernel` containing all
        relevant integral types and TSFC kernels associated with the
        form nested in the expression.
    :arg builder: A :class:`KernelBuilder` containing the expression context.
    :arg coordsym: An `ast.Symbol` object representing coordinate arguments
        for the kernel.
    :arg mesh_layer_sym: An `ast.Symbol` representing the mesh layer.
    :arg cell_orientations: An `ast.Symbol` representing cell orientation
        information.

    Returns: A COFFEE code statement and updated include_dirs
    """
    exp = cxt_kernel.tensor
    t = builder.temps[exp]
    nlayers = exp.ufl_domain().topological.layers - 1

    incl = []
    body = []
    for splitkernel in cxt_kernel.tsfc_kernels:
        index = splitkernel.indices
        kinfo = splitkernel.kinfo

        # Generate an iterable of coefficients to pass to the subkernel
        # if any are required
        clist = [c for ci in kinfo.coefficient_map
                 for c in builder.coefficient(exp.coefficients()[ci])]

        if kinfo.oriented:
            clist.insert(0, cell_orientations)

        incl.extend(kinfo.kernel._include_dirs)
        tensor = eigen_tensor(exp, t, index)
        body.append(ast.FunCall(kinfo.kernel.name,
                                tensor, coordsym, *clist))

    if cxt_kernel.original_integral_type == "exterior_facet_bottom":
        layer = 0
    else:
        layer = nlayers - 1

    stmt = ast.If(ast.Eq(mesh_layer_sym, layer),
                  [ast.Block(body, open_scope=True)])

    return stmt, incl
def _generate_element_tensor(integrals, sets, optimise_parameters, parameters):
    "Construct quadrature code for element tensors."

    # Prefetch formats to speed up code generation.
    f_comment = format["comment"]
    f_ip = format["integration points"]
    f_I = format["ip constant"]
    f_loop = format["generate loop"]
    f_ip_coords = format["generate ip coordinates"]
    f_coords = format["coordinate_dofs"]
    f_double = format["float declaration"]
    f_decl = format["declaration"]
    f_X = format["ip coordinates"]
    f_C = format["conditional"]

    # Initialise return values.
    tensor_ops_count = 0

    ffc_assert(1 == len(integrals),
               "This function is not capable of handling multiple integrals.")

    # We receive a dictionary {num_points: form,}.
    # Loop points and forms.
    for points, terms, functions, ip_consts, coordinate, conditionals in integrals:

        nest_ir = []
        ip_ir = []
        num_ops = 0

        # Generate code to compute coordinates if used.
        if coordinate:
            raise RuntimeError("Don't know how to compute coordinates")
            # Left in place for posterity
            name, gdim, ip, r = coordinate
            element_code += ["", f_comment("Declare array to hold physical coordinate of quadrature point.")]
            element_code += [f_decl(f_double, f_X(points, gdim))]
            ops, coord_code = f_ip_coords(gdim, points, name, ip, r)
            ip_code += ["", f_comment("Compute physical coordinate of quadrature point, operations: %d." % ops)]
            ip_code += [coord_code]
            num_ops += ops
            # Update used psi tables and transformation set.
            sets[1].add(name)
            sets[3].add(f_coords(r))

        # Generate code to compute function values.
        if functions:
            const_func_code, func_code, ops = _generate_functions(functions, sets)
            nest_ir += const_func_code
            ip_ir += func_code
            num_ops += ops

        # Generate code to compute conditionals (might depend on coordinates
        # and function values so put here).
        # TODO: Some conditionals might only depend on geometry so they
        # should be moved outside if possible.
        if conditionals:
            ip_ir.append(pyop2.Decl(f_double, c_sym(f_C(len(conditionals)))))
            # Sort conditionals (need to in case of nested conditionals).
            reversed_conds = dict([(n, (o, e)) for e, (t, o, n) in conditionals.items()])
            for num in range(len(conditionals)):
                name = format["conditional"](num)
                ops, expr = reversed_conds[num]
                ip_ir.append(pyop2.Assign(c_sym(name), visit_rhs(expr)))
                num_ops += ops

        # Generate code for ip constant declarations.
        # TODO: this code should be removable as only executed when ffc's
        # optimisations are on
        ip_const_ops, ip_const_code = generate_aux_constants(ip_consts, f_I,
                                                             format["assign"], True)
        if len(ip_const_code) > 0:
            raise RuntimeError("IP Const code not supported")
        num_ops += ip_const_ops

        # Generate code to evaluate the element tensor.
        code, ops = _generate_integral_ir(points, terms, sets,
                                          optimise_parameters, parameters)
        num_ops += ops
        tensor_ops_count += num_ops * points
        ip_ir += code

        # Loop code over all IPs.
        # @@@: for (ip ...) { A[0][0] += ... }
        if points > 1:
            it_var = pyop2.Symbol(f_ip, ())
            nest_ir += [pyop2.For(pyop2.Decl("int", it_var, c_sym(0)),
                                  pyop2.Less(it_var, c_sym(points)),
                                  pyop2.Incr(it_var, c_sym(1)),
                                  pyop2.Block(ip_ir, open_scope=True))]
        else:
            nest_ir += ip_ir

    return (nest_ir, tensor_ops_count)
def compile_c_kernel(expression, to_pts, to_element, fs, coords):
    """Produce a :class:`PyOP2.Kernel` from the c expression provided."""
    coords_space = coords.function_space()
    coords_element = coords_space.fiat_element
    names = {v[0] for v in expression._user_args}
    X = coords_element.tabulate(0, to_pts).values()[0]

    # Produce C array notation of X.
    X_str = "{{" + "},\n{".join([",".join(map(str, x)) for x in X.T]) + "}}"

    A = utils.unique_name("A", names)
    X = utils.unique_name("X", names)
    x_ = utils.unique_name("x_", names)
    k = utils.unique_name("k", names)
    d = utils.unique_name("d", names)
    i_ = utils.unique_name("i", names)
    # x is a reserved name.
    x = "x"
    if "x" in names:
        raise ValueError("cannot use 'x' as a user-defined Expression variable")
    ass_exp = [ast.Assign(ast.Symbol(A, (k,), ((len(expression.code), i),)),
                          ast.FlatBlock("%s" % code))
               for i, code in enumerate(expression.code)]
    vals = {
        "X": X,
        "x": x,
        "x_": x_,
        "k": k,
        "d": d,
        "i": i_,
        "x_array": X_str,
        "dim": coords_space.dim,
        "xndof": coords_element.space_dimension(),
        # FS will always either be a functionspace or
        # vectorfunctionspace, so just accessing dim here is safe
        # (we don't need to go through ufl_element.value_shape())
        "nfdof": to_element.space_dimension() * numpy.prod(fs.dim, dtype=int),
        "ndof": to_element.space_dimension(),
        "assign_dim": numpy.prod(expression.value_shape(), dtype=int)
    }
    init = ast.FlatBlock("""
const double %(X)s[%(ndof)d][%(xndof)d] = %(x_array)s;
double %(x)s[%(dim)d];
const double pi = 3.141592653589793;
""" % vals)
    block = ast.FlatBlock("""
for (unsigned int %(d)s=0; %(d)s < %(dim)d; %(d)s++) {
  %(x)s[%(d)s] = 0;
  for (unsigned int %(i)s=0; %(i)s < %(xndof)d; %(i)s++) {
    %(x)s[%(d)s] += %(X)s[%(k)s][%(i)s] * %(x_)s[%(i)s][%(d)s];
  };
};
""" % vals)
    loop = ast.c_for(k, "%(ndof)d" % vals,
                     ast.Block([block] + ass_exp, open_scope=True))
    user_args = []
    user_init = []
    for _, arg in expression._user_args:
        if arg.shape == (1, ):
            user_args.append(ast.Decl("double *", "%s_" % arg.name))
            user_init.append(ast.FlatBlock("const double %s = *%s_;" %
                                           (arg.name, arg.name)))
        else:
            user_args.append(ast.Decl("double *", arg.name))
    kernel_code = ast.FunDecl("void", "expression_kernel",
                              [ast.Decl("double", ast.Symbol(A, (int("%(nfdof)d" % vals),))),
                               ast.Decl("double**", x_)] + user_args,
                              ast.Block(user_init + [init, loop],
                                        open_scope=False))
    coefficients = [coords]
    for _, arg in expression._user_args:
        coefficients.append(GlobalWrapper(arg))
    return op2.Kernel(kernel_code, kernel_code.name), False, tuple(coefficients)
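# Hedged NumPy analogue (not part of the original module) of the inner
# FlatBlock loop above: the physical coordinate of interpolation point k
# is the tabulated basis row X[k] contracted with the coordinate dofs x_.
def _physical_coordinate(X, x_, k):
    """Sketch: x[d] = sum_i X[k][i] * x_[i][d]; shapes (ndof, xndof)
    for X and (xndof, dim) for x_ are assumed."""
    import numpy as np
    return np.asarray(X)[k] @ np.asarray(x_)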
def ker_init():
    return ast.FunDecl('void', 'ker_init',
                       [ast.Decl('int', 'B', qualifiers=['unsigned'], pointers=[''])],
                       ast.Block([ast.Assign(ast.Symbol('B', (0,)), 0)]))
def tensor_assembly_calls(builder):
    """Generates a block of statements for assembling the local
    finite element tensors.

    :arg builder: The :class:`LocalKernelBuilder` containing
        all relevant expression information and assembly calls.
    """
    statements = [ast.FlatBlock("/* Assemble local tensors */\n")]

    # Cell integrals are straightforward. Just splat them out.
    statements.extend(builder.assembly_calls["cell"])

    if builder.needs_cell_facets:
        # The for-loop will have the general structure:
        #
        #    FOR (facet=0; facet<num_facets; facet++):
        #        IF (facet is interior):
        #            *interior calls
        #        ELSE IF (facet is exterior):
        #            *exterior calls
        #
        # If only interior (exterior) facets are present,
        # then only a single IF-statement checking for interior
        # (exterior) facets will be present within the loop. The
        # cell facets are labelled `1` for interior, and `0` for
        # exterior.
        statements.append(ast.FlatBlock("/* Loop over cell facets */\n"))
        int_calls = list(chain(*[builder.assembly_calls[it_type]
                                 for it_type in ("interior_facet",
                                                 "interior_facet_vert")]))
        ext_calls = list(chain(*[builder.assembly_calls[it_type]
                                 for it_type in ("exterior_facet",
                                                 "exterior_facet_vert")]))

        # Compute the number of facets to loop over
        domain = builder.expression.ufl_domain()
        if domain.cell_set._extruded:
            num_facets = domain.ufl_cell()._cells[0].num_facets()
        else:
            num_facets = domain.ufl_cell().num_facets()

        if_ext = ast.Eq(ast.Symbol(builder.cell_facet_sym,
                                   rank=(builder.it_sym, )), 0)
        if_int = ast.Eq(ast.Symbol(builder.cell_facet_sym,
                                   rank=(builder.it_sym, )), 1)
        body = []
        if ext_calls:
            body.append(ast.If(if_ext, (ast.Block(ext_calls,
                                                  open_scope=True), )))
        if int_calls:
            body.append(ast.If(if_int, (ast.Block(int_calls,
                                                  open_scope=True), )))
        statements.append(ast.For(ast.Decl("unsigned int",
                                           builder.it_sym, init=0),
                                  ast.Less(builder.it_sym, num_facets),
                                  ast.Incr(builder.it_sym, 1),
                                  body))

    if builder.needs_mesh_layers:
        # In the presence of interior horizontal facet calls, an
        # IF-ELIF-ELSE block is generated using the mesh levels
        # as conditions for which calls are needed:
        #
        #    IF (layer == bottom_layer):
        #        *bottom calls
        #    ELSE IF (layer == top_layer):
        #        *top calls
        #    ELSE:
        #        *top calls
        #        *bottom calls
        #
        # Any extruded top or bottom calls for extruded facets are
        # included within the appropriate mesh-level IF-blocks. If
        # no interior horizontal facet calls are present, then
        # standard IF-blocks are generated for exterior top/bottom
        # facet calls when appropriate:
        #
        #    IF (layer == bottom_layer):
        #        *bottom calls
        #
        #    IF (layer == top_layer):
        #        *top calls
        #
        # The mesh level is an integer provided as a macro kernel
        # argument.

        # FIXME: No variable layers assumption
        statements.append(ast.FlatBlock("/* Mesh levels: */\n"))
        num_layers = builder.expression.ufl_domain().topological.layers - 1
        int_top = builder.assembly_calls["interior_facet_horiz_top"]
        int_btm = builder.assembly_calls["interior_facet_horiz_bottom"]
        ext_top = builder.assembly_calls["exterior_facet_top"]
        ext_btm = builder.assembly_calls["exterior_facet_bottom"]

        bottom = ast.Block(int_top + ext_btm, open_scope=True)
        top = ast.Block(int_btm + ext_top, open_scope=True)
        rest = ast.Block(int_btm + int_top, open_scope=True)
        statements.append(ast.If(
            ast.Eq(builder.mesh_layer_sym, 0),
            (bottom,
             ast.If(ast.Eq(builder.mesh_layer_sym, num_layers - 1),
                    (top, rest)))))

    return statements
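# Hedged pure-Python sketch (not in the original source) of the control
# flow the facet loop above generates; cell_facets, ext_calls and
# int_calls are hypothetical stand-ins for the kernel-side data.
def _facet_dispatch(cell_facets, ext_calls, int_calls):
    """Exterior facets are flagged 0, interior facets 1."""
    for facet, flag in enumerate(cell_facets):
        if flag == 0:
            for call in ext_calls:
                call(facet)
        elif flag == 1:
            for call in int_calls:
                call(facet)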
def build_hard_fusion_kernel(base_loop, fuse_loop, fusion_map, loop_chain_index):
    """
    Build AST and :class:`Kernel` for two loops suitable to hard fusion.

    The AST consists of three functions: fusion, base, fuse. base and fuse
    are respectively the ``base_loop`` and the ``fuse_loop`` kernels, whereas
    fusion is the orchestrator that invokes, for each ``base_loop`` iteration,
    base and, if still to be executed, fuse.

    The orchestrator has the following structure: ::

        fusion (buffer, ..., executed):
            base (buffer, ...)
            for i = 0 to arity:
                if not executed[i]:
                    additional pointer staging required by kernel2
                    fuse (sub_buffer, ...)
                    insertion into buffer

    The executed array tracks whether the i-th iteration (out of /arity/)
    adjacent to the main kernel1 iteration has been executed.
    """
    finder = Find((ast.FunDecl, ast.PreprocessNode))

    base = base_loop.kernel
    base_ast = dcopy(base._ast)
    base_info = finder.visit(base_ast)
    base_headers = base_info[ast.PreprocessNode]
    base_fundecl = base_info[ast.FunDecl]
    assert len(base_fundecl) == 1
    base_fundecl = base_fundecl[0]

    fuse = fuse_loop.kernel
    fuse_ast = dcopy(fuse._ast)
    fuse_info = finder.visit(fuse_ast)
    fuse_headers = fuse_info[ast.PreprocessNode]
    fuse_fundecl = fuse_info[ast.FunDecl]
    assert len(fuse_fundecl) == 1
    fuse_fundecl = fuse_fundecl[0]

    # Create /fusion/ arguments and signature
    body = ast.Block([])
    fusion_name = '%s_%s' % (base_fundecl.name, fuse_fundecl.name)
    fusion_args = dcopy(base_fundecl.args + fuse_fundecl.args)
    fusion_fundecl = ast.FunDecl(base_fundecl.ret, fusion_name,
                                 fusion_args, body)

    # Make sure kernel and variable names are unique
    base_fundecl.name = "%s_base" % base_fundecl.name
    fuse_fundecl.name = "%s_fuse" % fuse_fundecl.name
    for i, decl in enumerate(fusion_args):
        decl.sym.symbol += '_%d' % i

    # Filter out duplicate arguments, and append extra arguments to the fundecl
    binding = WeakFilter().kernel_args([base_loop, fuse_loop], fusion_fundecl)
    fusion_args += [ast.Decl('int*', 'executed'),
                    ast.Decl('int*', 'fused_iters'),
                    ast.Decl('int', 'i')]

    # Which args are actually used in /fuse/, but not in /base/? The gather
    # for such arguments is moved to /fusion/, to avoid useless memory LOADs
    base_dats = set(a.data for a in base_loop.args)
    fuse_dats = set(a.data for a in fuse_loop.args)
    unshared = OrderedDict()
    for arg, decl in binding.items():
        if arg.data in fuse_dats - base_dats:
            unshared.setdefault(decl, arg)

    # Track position of Args that need a postponed gather
    # Can't track Args themselves as they change across different parloops
    fargs = {fusion_args.index(i): ('postponed', False) for i in unshared.keys()}
    fargs.update({len(set(binding.values())): ('onlymap', True)})

    # Add maps for arguments that need a postponed gather
    for decl, arg in unshared.items():
        decl_pos = fusion_args.index(decl)
        fusion_args[decl_pos].sym.symbol = arg.c_arg_name()
        if arg._is_indirect:
            fusion_args[decl_pos].sym.rank = ()
            fusion_args.insert(decl_pos + 1, ast.Decl('int*', arg.c_map_name(0, 0)))

    # Append the invocation of /base/; then, proceed with the invocation
    # of the /fuse/ kernels
    base_funcall_syms = [binding[a].sym.symbol for a in base_loop.args]
    body.children.append(ast.FunCall(base_fundecl.name, *base_funcall_syms))

    for idx in range(fusion_map.arity):
        fused_iter = ast.Assign('i', ast.Symbol('fused_iters', (idx,)))
        fuse_funcall = ast.FunCall(fuse_fundecl.name)
        if_cond = ast.Not(ast.Symbol('executed', ('i',)))
        if_update = ast.Assign(ast.Symbol('executed', ('i',)), 1)
        if_body = ast.Block([fuse_funcall, if_update], open_scope=True)
        if_exec = ast.If(if_cond, [if_body])
        body.children.extend([ast.FlatBlock('\n'), fused_iter, if_exec])

    # Modify the /fuse/ kernel
    # This is to take into account that many arguments are shared with
    # /base/, so they will only be staged once for /base/. This requires
    # tweaking the way the arguments are declared and accessed in /fuse/.
    # For example, the shared incremented array (called /buffer/ in
    # the pseudocode in the comment above) now needs to take offsets
    # to be sure the locations that /base/ is supposed to increment are
    # actually accessed. The same concept applies to indirect arguments.
    init = lambda v: '{%s}' % ', '.join([str(j) for j in v])
    for i, fuse_loop_arg in enumerate(fuse_loop.args):
        fuse_kernel_arg = binding[fuse_loop_arg]

        buffer_name = '%s_vec' % fuse_kernel_arg.sym.symbol
        fuse_funcall_sym = ast.Symbol(buffer_name)

        # What kind of temporaries do we need?
        if fuse_loop_arg.access == INC:
            op, lvalue, rvalue = ast.Incr, fuse_kernel_arg.sym.symbol, buffer_name
            stager = lambda b, l: b.children.extend(l)
            indexer = lambda indices: [(k, j) for j, k in enumerate(indices)]
            pointers = []
        elif fuse_loop_arg.access == READ:
            op, lvalue, rvalue = ast.Assign, buffer_name, fuse_kernel_arg.sym.symbol
            stager = lambda b, l: [b.children.insert(0, j) for j in reversed(l)]
            indexer = lambda indices: [(j, k) for j, k in enumerate(indices)]
            pointers = list(fuse_kernel_arg.pointers)

        # Now handle arguments depending on their type and rank ...

        if fuse_loop_arg._is_global:
            # ... Handle global arguments. These can be dropped in the
            # kernel without any particular fiddling
            fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol)

        elif fuse_kernel_arg in unshared:
            # ... Handle arguments that appear only in /fuse/
            staging = unshared[fuse_kernel_arg].c_vec_init(False).split('\n')
            rvalues = [ast.FlatBlock(j.split('=')[1]) for j in staging]
            lvalues = [ast.Symbol(buffer_name, (j,)) for j in range(len(staging))]
            staging = [ast.Assign(j, k) for j, k in zip(lvalues, rvalues)]

            # Set up the temporary
            buffer_symbol = ast.Symbol(buffer_name, (len(staging),))
            buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol,
                                   qualifiers=fuse_kernel_arg.qual,
                                   pointers=list(pointers))

            # Update the if-then AST body
            stager(if_exec.children[0], staging)
            if_exec.children[0].children.insert(0, buffer_decl)

        elif fuse_loop_arg._is_mat:
            # ... Handle Mats
            staging = []
            for b in fused_inc_arg._block_shape:
                for rc in b:
                    lvalue = ast.Symbol(lvalue, (idx, idx),
                                        ((rc[0], 'j'), (rc[1], 'k')))
                    rvalue = ast.Symbol(rvalue, ('j', 'k'))
                    staging = ItSpace(mode=0).to_for([(0, rc[0]), (0, rc[1])],
                                                     ('j', 'k'),
                                                     [op(lvalue, rvalue)])[:1]

            # Set up the temporary
            buffer_symbol = ast.Symbol(buffer_name, (fuse_kernel_arg.sym.rank,))
            buffer_init = ast.ArrayInit(init([init([0.0])]))
            buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, buffer_init,
                                   qualifiers=fuse_kernel_arg.qual,
                                   pointers=pointers)

            # Update the if-then AST body
            stager(if_exec.children[0], staging)
            if_exec.children[0].children.insert(0, buffer_decl)

        elif fuse_loop_arg._is_indirect:
            cdim = fuse_loop_arg.data.cdim

            if cdim == 1 and fuse_kernel_arg.sym.rank:
                # [Special case]
                # ... Handle rank 1 indirect arguments that appear in both
                # /base/ and /fuse/: just point into the right location
                rank = (idx,) if fusion_map.arity > 1 else ()
                fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol, rank)

            else:
                # ... Handle indirect arguments. At the C level, these arguments
                # are of pointer type, so simple pointer arithmetic is used
                # to ensure the kernel accesses are to the correct locations
                fuse_arity = fuse_loop_arg.map.arity
                base_arity = fuse_arity * fusion_map.arity
                size = fuse_arity * cdim

                # Set the proper storage layout before invoking /fuse/
                ofs_vals = [[base_arity * j + k for k in range(fuse_arity)]
                            for j in range(cdim)]
                ofs_vals = [[fuse_arity * j + k for k in flatten(ofs_vals)]
                            for j in range(fusion_map.arity)]
                ofs_vals = list(flatten(ofs_vals))
                indices = [ofs_vals[idx * size + j] for j in range(size)]

                staging = [op(ast.Symbol(lvalue, (j,)), ast.Symbol(rvalue, (k,)))
                           for j, k in indexer(indices)]

                # Set up the temporary
                buffer_symbol = ast.Symbol(buffer_name, (size,))
                if fuse_loop_arg.access == INC:
                    buffer_init = ast.ArrayInit(init([0.0]))
                else:
                    buffer_init = ast.EmptyStatement()
                    pointers.pop()
                buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol,
                                       buffer_init,
                                       qualifiers=fuse_kernel_arg.qual,
                                       pointers=pointers)

                # Update the if-then AST body
                stager(if_exec.children[0], staging)
                if_exec.children[0].children.insert(0, buffer_decl)

        else:
            # Nothing special to do for direct arguments
            pass

        # Finally update the /fuse/ funcall
        fuse_funcall.children.append(fuse_funcall_sym)

    fused_headers = set([str(h) for h in base_headers + fuse_headers])
    fused_ast = ast.Root([ast.PreprocessNode(h) for h in fused_headers] +
                         [base_fundecl, fuse_fundecl, fusion_fundecl])

    return Kernel([base, fuse], fused_ast, loop_chain_index), fargs
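# Hedged pure-Python sketch (not part of the original module) of the
# orchestrator structure described in the docstring above: run base once,
# then run fuse for each adjacent iteration not yet executed; all argument
# staging and buffer handling is elided.
def _fusion_orchestrator(base, fuse, fused_iters, executed):
    base()
    for i in fused_iters:  # one entry per entry of the fusion map
        if not executed[i]:
            fuse(i)
            executed[i] = 1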
def get_restriction_kernel(fiat_element, unique_indices, dim=1, no_weights=False):
    weights = get_restriction_weights(fiat_element)[unique_indices].T
    ncdof = weights.shape[0]
    nfdof = weights.shape[1]
    arglist = [ast.Decl("double", ast.Symbol("coarse", (ncdof * dim, ))),
               ast.Decl("double", ast.Symbol("*restrict *restrict fine", ()),
                        qualifiers=["const"])]
    if not no_weights:
        arglist.append(ast.Decl("double",
                                ast.Symbol("*restrict *restrict count_weights", ()),
                                qualifiers=["const"]))
    all_ones = np.allclose(weights, 1.0)
    if all_ones:
        w = []
    else:
        w_sym = ast.Symbol("weights", (ncdof, nfdof))
        init = ast.ArrayInit(format_array_literal(weights))
        w = [ast.Decl("double", w_sym, init, qualifiers=["const"])]

    i = ast.Symbol("i", ())
    j = ast.Symbol("j", ())
    k = ast.Symbol("k", ())
    fine = ast.Symbol("fine", (j, k))
    if no_weights:
        if all_ones:
            assign = fine
        else:
            assign = ast.Prod(fine, ast.Symbol("weights", (i, j)))
    else:
        if all_ones:
            assign = ast.Prod(fine, ast.Symbol("count_weights", (j, 0)))
        else:
            assign = ast.Prod(fine,
                              ast.Prod(ast.Symbol("weights", (i, j)),
                                       ast.Symbol("count_weights", (j, 0))))
    assignment = ast.Incr(ast.Symbol("coarse",
                                     (ast.Sum(k, ast.Prod(i, ast.c_sym(dim))), )),
                          assign)
    k_loop = ast.For(ast.Decl("int", k, ast.c_sym(0)),
                     ast.Less(k, ast.c_sym(dim)),
                     ast.Incr(k, ast.c_sym(1)),
                     ast.Block([assignment], open_scope=True))
    j_loop = ast.For(ast.Decl("int", j, ast.c_sym(0)),
                     ast.Less(j, ast.c_sym(nfdof)),
                     ast.Incr(j, ast.c_sym(1)),
                     ast.Block([k_loop], open_scope=True))
    i_loop = ast.For(ast.Decl("int", i, ast.c_sym(0)),
                     ast.Less(i, ast.c_sym(ncdof)),
                     ast.Incr(i, ast.c_sym(1)),
                     ast.Block([j_loop], open_scope=True))
    k = ast.FunDecl("void", "restriction", arglist,
                    ast.Block(w + [i_loop]),
                    pred=["static", "inline"])
    return op2.Kernel(k, "restriction", opts=parameters["coffee"])
def generate_kernel_ast(builder, statements, declared_temps):
    """Glues together the complete AST for the Slate expression
    contained in the :class:`LocalKernelBuilder`.

    :arg builder: The :class:`LocalKernelBuilder` containing
        all relevant expression information.
    :arg statements: A list of COFFEE objects containing all
        assembly calls and temporary declarations.
    :arg declared_temps: A `dict` containing all previously
        declared temporaries.

    Return: A `KernelInfo` object describing the complete AST.
    """
    slate_expr = builder.expression
    if slate_expr.rank == 0:
        # Scalars are treated as 1x1 MatrixBase objects
        shape = (1, )
    else:
        shape = slate_expr.shape

    # Now we create the result statement by declaring its eigen type and
    # using Eigen::Map to move between Eigen and C data structs.
    statements.append(ast.FlatBlock("/* Map eigen tensor into C struct */\n"))
    result_sym = ast.Symbol("T%d" % len(declared_temps))
    result_data_sym = ast.Symbol("A%d" % len(declared_temps))
    result_type = "Eigen::Map<%s >" % eigen_matrixbase_type(shape)
    result = ast.Decl(SCALAR_TYPE, ast.Symbol(result_data_sym, shape))
    result_statement = ast.FlatBlock("%s %s((%s *)%s);\n" %
                                     (result_type, result_sym,
                                      SCALAR_TYPE, result_data_sym))
    statements.append(result_statement)

    # Generate the complete c++ string performing the linear algebra
    # operations on Eigen matrices/vectors
    statements.append(ast.FlatBlock("/* Linear algebra expression */\n"))
    cpp_string = ast.FlatBlock(metaphrase_slate_to_cpp(slate_expr,
                                                       declared_temps))
    statements.append(ast.Incr(result_sym, cpp_string))

    # Generate arguments for the macro kernel
    args = [result, ast.Decl("%s **" % SCALAR_TYPE, builder.coord_sym)]

    # Orientation information
    if builder.oriented:
        args.append(ast.Decl("int **", builder.cell_orientations_sym))

    # Coefficient information
    expr_coeffs = slate_expr.coefficients()
    for c in expr_coeffs:
        if isinstance(c, Constant):
            ctype = "%s *" % SCALAR_TYPE
        else:
            ctype = "%s **" % SCALAR_TYPE
        args.extend([ast.Decl(ctype, csym) for csym in builder.coefficient(c)])

    # Facet information
    if builder.needs_cell_facets:
        args.append(ast.Decl("%s *" % as_cstr(cell_to_facets_dtype),
                             builder.cell_facet_sym))

    # NOTE: We need to be careful about the ordering here. Mesh layers are
    # added as the final argument to the kernel.
    if builder.needs_mesh_layers:
        args.append(ast.Decl("int", builder.mesh_layer_sym))

    # Macro kernel
    macro_kernel_name = "compile_slate"
    stmts = ast.Block(statements)
    macro_kernel = ast.FunDecl("void", macro_kernel_name, args, stmts,
                               pred=["static", "inline"])

    # Construct the final ast
    kernel_ast = ast.Node(builder.templated_subkernels + [macro_kernel])

    # Now we wrap up the kernel ast as a PyOP2 kernel and include the
    # Eigen header files
    include_dirs = builder.include_dirs
    include_dirs.extend(["%s/include/eigen3/" % d for d in PETSC_DIR])
    op2kernel = op2.Kernel(kernel_ast,
                           macro_kernel_name,
                           cpp=True,
                           include_dirs=include_dirs,
                           headers=['#include <Eigen/Dense>',
                                    '#define restrict __restrict'])

    # Send back a "TSFC-like" SplitKernel object with an
    # index and KernelInfo
    kinfo = KernelInfo(kernel=op2kernel,
                       integral_type=builder.integral_type,
                       oriented=builder.oriented,
                       subdomain_id="otherwise",
                       domain_number=0,
                       coefficient_map=tuple(range(len(expr_coeffs))),
                       needs_cell_facets=builder.needs_cell_facets,
                       pass_layer_arg=builder.needs_mesh_layers)

    return kinfo
def compile_expression(slate_expr, tsfc_parameters=None):
    """Takes a SLATE expression `slate_expr` and returns the appropriate
    :class:`firedrake.op2.Kernel` object representing the SLATE expression.

    :arg slate_expr: a :class:'TensorBase' expression.
    :arg tsfc_parameters: an optional `dict` of form compiler parameters to
        be passed onto TSFC during the compilation of ufl forms.
    """
    if not isinstance(slate_expr, TensorBase):
        raise ValueError(
            "Expecting a `slate.TensorBase` expression, not a %r" % slate_expr)

    # TODO: Get PyOP2 to write into mixed dats
    if any(len(a.function_space()) > 1 for a in slate_expr.arguments()):
        raise NotImplementedError("Compiling mixed slate expressions")

    # Initialize shape and statements list
    shape = slate_expr.shape
    statements = []

    # Create a builder for the SLATE expression
    builder = KernelBuilder(expression=slate_expr,
                            tsfc_parameters=tsfc_parameters)

    # Initialize coordinate and facet symbols
    coordsym = ast.Symbol("coords")
    coords = None
    cellfacetsym = ast.Symbol("cell_facets")
    inc = []

    # Now we construct the list of statements to provide to the builder
    context_temps = builder.temps.copy()
    for exp, t in context_temps.items():
        statements.append(ast.Decl(eigen_matrixbase_type(exp.shape), t))
        statements.append(ast.FlatBlock("%s.setZero();\n" % t))

        for splitkernel in builder.kernel_exprs[exp]:
            clist = []
            index = splitkernel.indices
            kinfo = splitkernel.kinfo
            integral_type = kinfo.integral_type

            if integral_type not in ["cell", "interior_facet", "exterior_facet"]:
                raise NotImplementedError(
                    "Integral type %s not currently supported." % integral_type)

            coordinates = exp.ufl_domain().coordinates
            if coords is not None:
                assert coordinates == coords
            else:
                coords = coordinates

            for cindex in kinfo.coefficient_map:
                c = exp.coefficients()[cindex]
                # Handles both mixed and non-mixed coefficient cases
                clist.extend(builder.extract_coefficient(c))

            inc.extend(kinfo.kernel._include_dirs)

            tensor = eigen_tensor(exp, t, index)

            if integral_type in ["interior_facet", "exterior_facet"]:
                builder.require_cell_facets()
                itsym = ast.Symbol("i0")
                clist.append(ast.FlatBlock("&%s" % itsym))
                loop_body = []
                nfacet = exp.ufl_domain().ufl_cell().num_facets()

                if integral_type == "exterior_facet":
                    checker = 1
                else:
                    checker = 0
                loop_body.append(ast.If(
                    ast.Eq(ast.Symbol(cellfacetsym, rank=(itsym, )), checker),
                    [ast.Block([ast.FunCall(kinfo.kernel.name,
                                            tensor, coordsym, *clist)],
                               open_scope=True)]))
                loop = ast.For(ast.Decl("unsigned int", itsym, init=0),
                               ast.Less(itsym, nfacet),
                               ast.Incr(itsym, 1),
                               loop_body)
                statements.append(loop)
            else:
                statements.append(ast.FunCall(kinfo.kernel.name,
                                              tensor, coordsym, *clist))

    # Now we handle any terms that require auxiliary data (if any)
    if bool(builder.aux_exprs):
        aux_temps, aux_statements = auxiliary_information(builder)
        context_temps.update(aux_temps)
        statements.extend(aux_statements)

    result_sym = ast.Symbol("T%d" % len(builder.temps))
    result_data_sym = ast.Symbol("A%d" % len(builder.temps))
    result_type = "Eigen::Map<%s >" % eigen_matrixbase_type(shape)
    result = ast.Decl(SCALAR_TYPE, ast.Symbol(result_data_sym, shape))
    result_statement = ast.FlatBlock("%s %s((%s *)%s);\n" %
                                     (result_type, result_sym,
                                      SCALAR_TYPE, result_data_sym))
    statements.append(result_statement)

    cpp_string = ast.FlatBlock(metaphrase_slate_to_cpp(slate_expr,
                                                       context_temps))
    statements.append(ast.Assign(result_sym, cpp_string))

    # Generate arguments for the macro kernel
    args = [result, ast.Decl("%s **" % SCALAR_TYPE, coordsym)]
    for c in slate_expr.coefficients():
        if isinstance(c, Constant):
            ctype = "%s *" % SCALAR_TYPE
        else:
            ctype = "%s **" % SCALAR_TYPE
        args.extend([ast.Decl(ctype, sym_c)
                     for sym_c in builder.extract_coefficient(c)])

    if builder.needs_cell_facets:
        args.append(ast.Decl("char *", cellfacetsym))

    macro_kernel_name = "compile_slate"
    kernel_ast, oriented = builder.construct_ast(name=macro_kernel_name,
                                                 args=args,
                                                 statements=ast.Block(statements))

    inc.extend(["%s/include/eigen3/" % d for d in PETSC_DIR])
    op2kernel = op2.Kernel(kernel_ast,
                           macro_kernel_name,
                           cpp=True,
                           include_dirs=inc,
                           headers=['#include <Eigen/Dense>',
                                    '#define restrict __restrict'])

    assert len(slate_expr.ufl_domains()) == 1
    kinfo = KernelInfo(kernel=op2kernel,
                       integral_type="cell",
                       oriented=oriented,
                       subdomain_id="otherwise",
                       domain_number=0,
                       coefficient_map=range(len(slate_expr.coefficients())),
                       needs_cell_facets=builder.needs_cell_facets)
    idx = tuple([0] * slate_expr.rank)

    return (SplitKernel(idx, kinfo), )
def compile_c_kernel(expression, to_pts, to_element, fs, coords):
    """Produce a :class:`PyOP2.Kernel` from the c expression provided."""
    coords_space = coords.function_space()
    coords_element = create_element(coords_space.ufl_element(),
                                    vector_is_mixed=False)
    names = {v[0] for v in expression._user_args}
    X = list(coords_element.tabulate(0, to_pts).values())[0]

    # Produce C array notation of X.
    X_str = "{{" + "},\n{".join([",".join(map(str, x)) for x in X.T]) + "}}"

    A = utils.unique_name("A", names)
    X = utils.unique_name("X", names)
    x_ = utils.unique_name("x_", names)
    k = utils.unique_name("k", names)
    d = utils.unique_name("d", names)
    i_ = utils.unique_name("i", names)
    # x is a reserved name.
    x = "x"
    if "x" in names:
        raise ValueError("cannot use 'x' as a user-defined Expression variable")
    ass_exp = [ast.Assign(ast.Symbol(A, (k, ), ((len(expression.code), i), )),
                          ast.FlatBlock("%s" % code))
               for i, code in enumerate(expression.code)]

    dim = coords_space.value_size
    ndof = to_element.space_dimension()
    xndof = coords_element.space_dimension()
    nfdof = to_element.space_dimension() * numpy.prod(fs.value_size, dtype=int)

    init_X = ast.Decl(typ="double", sym=ast.Symbol(X, rank=(ndof, xndof)),
                      qualifiers=["const"], init=X_str)
    init_x = ast.Decl(typ="double",
                      sym=ast.Symbol(x, rank=(coords_space.value_size, )))
    init_pi = ast.Decl(typ="double", sym="pi", qualifiers=["const"],
                       init="3.141592653589793")
    init = ast.Block([init_X, init_x, init_pi])
    incr_x = ast.Incr(ast.Symbol(x, rank=(d, )),
                      ast.Prod(ast.Symbol(X, rank=(k, i_)),
                               ast.Symbol(x_, rank=(ast.Sum(ast.Prod(i_, dim), d), ))))
    assign_x = ast.Assign(ast.Symbol(x, rank=(d, )), 0)
    loop_x = ast.For(init=ast.Decl("unsigned int", i_, 0),
                     cond=ast.Less(i_, xndof),
                     incr=ast.Incr(i_, 1),
                     body=[incr_x])
    block = ast.For(init=ast.Decl("unsigned int", d, 0),
                    cond=ast.Less(d, dim),
                    incr=ast.Incr(d, 1),
                    body=[assign_x, loop_x])
    loop = ast.c_for(k, ndof,
                     ast.Block([block] + ass_exp, open_scope=True))
    user_args = []
    user_init = []
    for _, arg in expression._user_args:
        if arg.shape == (1, ):
            user_args.append(ast.Decl("double *", "%s_" % arg.name))
            user_init.append(ast.FlatBlock("const double %s = *%s_;" %
                                           (arg.name, arg.name)))
        else:
            user_args.append(ast.Decl("double *", arg.name))
    kernel_code = ast.FunDecl("void", "expression_kernel",
                              [ast.Decl("double", ast.Symbol(A, (nfdof, ))),
                               ast.Decl("double*", x_)] + user_args,
                              ast.Block(user_init + [init, loop],
                                        open_scope=False))
    coefficients = [coords]
    for _, arg in expression._user_args:
        coefficients.append(GlobalWrapper(arg))
    return op2.Kernel(kernel_code, kernel_code.name), False, tuple(coefficients)
def tensor_assembly_calls(builder):
    """Generates a block of statements for assembling the local
    finite element tensors.

    :arg builder: The :class:`LocalKernelBuilder` containing
        all relevant expression information and assembly calls.
    """
    assembly_calls = builder.assembly_calls
    statements = [ast.FlatBlock("/* Assemble local tensors */\n")]

    # Cell integrals are straightforward. Just splat them out.
    statements.extend(assembly_calls["cell"])

    if builder.needs_cell_facets:
        # The for-loop will have the general structure:
        #
        #    FOR (facet=0; facet<num_facets; facet++):
        #        IF (facet is interior):
        #            *interior calls
        #        ELSE IF (facet is exterior):
        #            *exterior calls
        #
        # If only interior (exterior) facets are present,
        # then only a single IF-statement checking for interior
        # (exterior) facets will be present within the loop. The
        # cell facets are labelled `1` for interior, and `0` for
        # exterior.
        statements.append(ast.FlatBlock("/* Loop over cell facets */\n"))
        int_calls = list(chain(*[assembly_calls[it_type]
                                 for it_type in ("interior_facet",
                                                 "interior_facet_vert")]))
        ext_calls = list(chain(*[assembly_calls[it_type]
                                 for it_type in ("exterior_facet",
                                                 "exterior_facet_vert")]))

        # Generate logical statements for handling exterior/interior facet
        # integrals on subdomains.
        # Currently only facet integrals are supported.
        for sd_type in ("subdomains_exterior_facet", "subdomains_interior_facet"):
            stmts = []
            for sd, sd_calls in groupby(assembly_calls[sd_type], lambda x: x[0]):
                _, calls = zip(*sd_calls)
                if_sd = ast.Eq(ast.Symbol(builder.cell_facet_sym,
                                          rank=(builder.it_sym, 1)), sd)
                stmts.append(ast.If(if_sd, (ast.Block(calls,
                                                      open_scope=True), )))

            if sd_type == "subdomains_exterior_facet":
                ext_calls.extend(stmts)
            if sd_type == "subdomains_interior_facet":
                int_calls.extend(stmts)

        # Compute the number of facets to loop over
        domain = builder.expression.ufl_domain()
        if domain.cell_set._extruded:
            num_facets = domain.ufl_cell()._cells[0].num_facets()
        else:
            num_facets = domain.ufl_cell().num_facets()

        if_ext = ast.Eq(ast.Symbol(builder.cell_facet_sym,
                                   rank=(builder.it_sym, 0)), 0)
        if_int = ast.Eq(ast.Symbol(builder.cell_facet_sym,
                                   rank=(builder.it_sym, 0)), 1)
        body = []
        if ext_calls:
            body.append(ast.If(if_ext, (ast.Block(ext_calls,
                                                  open_scope=True), )))
        if int_calls:
            body.append(ast.If(if_int, (ast.Block(int_calls,
                                                  open_scope=True), )))
        statements.append(ast.For(ast.Decl("unsigned int",
                                           builder.it_sym, init=0),
                                  ast.Less(builder.it_sym, num_facets),
                                  ast.Incr(builder.it_sym, 1),
                                  body))

    if builder.needs_mesh_layers:
        # In the presence of interior horizontal facet calls, an
        # IF-ELIF-ELSE block is generated using the mesh levels
        # as conditions for which calls are needed:
        #
        #    IF (layer == bottom_layer):
        #        *bottom calls
        #    ELSE IF (layer == top_layer):
        #        *top calls
        #    ELSE:
        #        *top calls
        #        *bottom calls
        #
        # Any extruded top or bottom calls for extruded facets are
        # included within the appropriate mesh-level IF-blocks. If
        # no interior horizontal facet calls are present, then
        # standard IF-blocks are generated for exterior top/bottom
        # facet calls when appropriate:
        #
        #    IF (layer == bottom_layer):
        #        *bottom calls
        #
        #    IF (layer == top_layer):
        #        *top calls
        #
        # The mesh level is an integer provided as a macro kernel
        # argument.

        # FIXME: No variable layers assumption
        statements.append(ast.FlatBlock("/* Mesh levels: */\n"))
        num_layers = ast.Symbol(builder.mesh_layer_count_sym, rank=(0, ))
        layer = builder.mesh_layer_sym
        types = ["interior_facet_horiz_top",
                 "interior_facet_horiz_bottom",
                 "exterior_facet_top",
                 "exterior_facet_bottom"]
        decide = [ast.Less(layer, num_layers),
                  ast.Greater(layer, 0),
                  ast.Eq(layer, num_layers),
                  ast.Eq(layer, 0)]
        for (integral_type, which) in zip(types, decide):
            statements.append(ast.If(which,
                                     (ast.Block(assembly_calls[integral_type],
                                                open_scope=True), )))

    return statements
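# Hedged pure-Python rendering (not in the original source) of the layer
# dispatch built above: given the current mesh layer and the layer count,
# report which assembly call groups fire.
def _layer_dispatch(layer, num_layers):
    groups = []
    if layer < num_layers:
        groups.append("interior_facet_horiz_top")
    if layer > 0:
        groups.append("interior_facet_horiz_bottom")
    if layer == num_layers:
        groups.append("exterior_facet_top")
    if layer == 0:
        groups.append("exterior_facet_bottom")
    return groups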
def init_cell_orientations(self, expr):
    """Compute and initialise :attr:`cell_orientations` relative to a
    specified orientation.

    :arg expr: an :class:`.Expression` evaluated to produce a
        reference normal direction.
    """
    import firedrake.function as function
    import firedrake.functionspace as functionspace

    if expr.value_shape()[0] != 3:
        raise NotImplementedError('Only implemented for 3-vectors')
    if self.ufl_cell() not in (ufl.Cell('triangle', 3),
                               ufl.Cell("quadrilateral", 3),
                               ufl.OuterProductCell(ufl.Cell('interval'),
                                                    ufl.Cell('interval'),
                                                    gdim=3)):
        raise NotImplementedError('Only implemented for triangles and quadrilaterals embedded in 3d')

    if hasattr(self.topology, '_cell_orientations'):
        raise RuntimeError("init_cell_orientations already called, did you mean to do so again?")

    v0 = lambda x: ast.Symbol("v0", (x,))
    v1 = lambda x: ast.Symbol("v1", (x,))
    n = lambda x: ast.Symbol("n", (x,))
    x = lambda x: ast.Symbol("x", (x,))
    coords = lambda x, y: ast.Symbol("coords", (x, y))

    body = []
    body += [ast.Decl("double", v(3)) for v in [v0, v1, n, x]]
    body.append(ast.Decl("double", "dot"))
    body.append(ast.Assign("dot", 0.0))
    body.append(ast.Decl("int", "i"))

    # if triangle, use v0 = x1 - x0, v1 = x2 - x0
    # otherwise, for the various quads, use v0 = x2 - x0, v1 = x1 - x0
    # recall reference element ordering:
    # triangle: 2        quad: 1 3
    #           0 1            0 2
    if self.ufl_cell() == ufl.Cell('triangle', 3):
        body.append(ast.For(ast.Assign("i", 0), ast.Less("i", 3),
                            ast.Incr("i", 1),
                            [ast.Assign(v0("i"),
                                        ast.Sub(coords(1, "i"), coords(0, "i"))),
                             ast.Assign(v1("i"),
                                        ast.Sub(coords(2, "i"), coords(0, "i"))),
                             ast.Assign(x("i"), 0.0)]))
    else:
        body.append(ast.For(ast.Assign("i", 0), ast.Less("i", 3),
                            ast.Incr("i", 1),
                            [ast.Assign(v0("i"),
                                        ast.Sub(coords(2, "i"), coords(0, "i"))),
                             ast.Assign(v1("i"),
                                        ast.Sub(coords(1, "i"), coords(0, "i"))),
                             ast.Assign(x("i"), 0.0)]))
    # n = v0 x v1
    body.append(ast.Assign(n(0), ast.Sub(ast.Prod(v0(1), v1(2)),
                                         ast.Prod(v0(2), v1(1)))))
    body.append(ast.Assign(n(1), ast.Sub(ast.Prod(v0(2), v1(0)),
                                         ast.Prod(v0(0), v1(2)))))
    body.append(ast.Assign(n(2), ast.Sub(ast.Prod(v0(0), v1(1)),
                                         ast.Prod(v0(1), v1(0)))))

    body.append(ast.For(ast.Assign("i", 0), ast.Less("i", 3),
                        ast.Incr("i", 1),
                        [ast.Incr(x(j), coords("i", j)) for j in range(3)]))

    body.extend([ast.FlatBlock("dot += (%(x)s) * n[%(i)d];\n" %
                               {"x": x_, "i": i})
                 for i, x_ in enumerate(expr.code)])
    body.append(ast.Assign("orientation[0][0]",
                           ast.Ternary(ast.Less("dot", 0), 1, 0)))

    kernel = op2.Kernel(ast.FunDecl("void", "cell_orientations",
                                    [ast.Decl("int**", "orientation"),
                                     ast.Decl("double**", "coords")],
                                    ast.Block(body)),
                        "cell_orientations")

    # Build the cell orientations as a DG0 field (so that we can
    # pass it in for facet integrals and the like)
    fs = functionspace.FunctionSpace(self, 'DG', 0)
    cell_orientations = function.Function(fs, name="cell_orientations",
                                          dtype=np.int32)
    op2.par_loop(kernel, self.cell_set,
                 cell_orientations.dat(op2.WRITE,
                                       cell_orientations.cell_node_map()),
                 self.coordinates.dat(op2.READ,
                                      self.coordinates.cell_node_map()))
    self.topology._cell_orientations = cell_orientations
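# Hedged NumPy sketch (not part of the original module) of the geometric
# test the generated kernel performs for the triangle branch: build a cell
# normal from two edge vectors and compare it against a reference normal.
def _cell_orientation(x0, x1, x2, reference_normal):
    """Return 1 if the cell normal opposes reference_normal, else 0."""
    import numpy as np
    v0 = np.asarray(x1) - np.asarray(x0)
    v1 = np.asarray(x2) - np.asarray(x0)
    n = np.cross(v0, v1)
    return 1 if np.dot(reference_normal, n) < 0 else 0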
def _generate_functions(functions, sets):
    "Generate declarations for functions and code to compute values."

    f_comment = format["comment"]
    f_double = format["float declaration"]
    f_F = format["function value"]
    f_float = format["floating point"]
    f_decl = format["declaration"]
    f_r = format["free indices"]
    f_iadd = format["iadd"]
    f_loop = format["generate loop"]

    # Create the function declarations -- only the (unique) variables we need
    const_vardecls = set(fd.id for fd in functions.values()
                         if fd.cellwise_constant)
    const_ast_items = [pyop2.Decl(f_double, c_sym(f_F(n)), c_sym(f_float(0)))
                       for n in const_vardecls]

    vardecls = set(fd.id for fd in functions.values()
                   if not fd.cellwise_constant)
    ast_items = [pyop2.Decl(f_double, c_sym(f_F(n)), c_sym(f_float(0)))
                 for n in vardecls]

    # Get sets.
    used_psi_tables = sets[1]
    used_nzcs = sets[2]

    # Sort functions after being cellwise constant and loop ranges.
    function_groups = collections.defaultdict(list)
    for f, fd in functions.items():
        function_groups[(fd.cellwise_constant, fd.loop_range)].append(f)

    total_ops = 0
    # Loop ranges and get list of functions.
    for (cellwise_constant, loop_range), function_list in function_groups.items():
        function_expr = []

        # Loop functions.
        func_ops = 0
        for function in function_list:
            data = functions[function]
            range_i = data.loop_range
            if not isinstance(range_i, tuple):
                range_i = tuple([range_i])

            # Add name to used psi names and non zeros name to used_nzcs.
            used_psi_tables.add(data.psi_name)
            used_nzcs.update(data.used_nzcs)

            # # TODO: This check can be removed for speed later.
            # REMOVED this, since we might need to increment into the same
            # number more than once for mixed element + interior facets
            # ffc_assert(data.id not in function_expr,
            #            "This is definitely not supposed to happen!")

            # Convert function to COFFEE ast node, save string
            # representation for sorting (such that we're reproducible
            # in parallel).
            function = visit_rhs(function)
            key = str(function)
            function_expr.append((data.id, function, key))

            # Get number of operations to compute entry and add to function
            # operations count.
            func_ops += (data.ops + 1) * sum(range_i)

        # Gather, sorted by string rep of function.
        lines = [pyop2.Incr(c_sym(f_F(n)), fn)
                 for n, fn, _ in sorted(function_expr, key=lambda x: x[2])]
        if isinstance(loop_range, tuple):
            if not all(map(lambda x: x == loop_range[0], loop_range)):
                raise RuntimeError("General mixed elements not yet supported in PyOP2")
            loop_vars = [(f_r[0], 0, loop_range[0]), (f_r[1], 0, len(loop_range))]
        else:
            loop_vars = [(f_r[0], 0, loop_range)]
        # TODO: If loop_range == 1, this loop may be unnecessary. Not sure
        # if it's safe to just skip it.
        it_var = c_sym(loop_vars[0][0])
        loop_size = c_sym(loop_vars[0][2])
        ast_item = pyop2.For(pyop2.Decl("int", it_var, c_sym(0)),
                             pyop2.Less(it_var, loop_size),
                             pyop2.Incr(it_var, c_sym(1)),
                             pyop2.Block(lines, open_scope=True))
        if cellwise_constant:
            const_ast_items.append(ast_item)
        else:
            ast_items.append(ast_item)

    return const_ast_items, ast_items, total_ops
def _generate_integral_ir(points, terms, sets, optimise_parameters, parameters):
    "Generate code to evaluate the element tensor."

    # For checking if the integral code is for a matrix
    def is_matrix(loop):
        loop_indices = [l[0] for l in loop]
        return (format["first free index"] in loop_indices and
                format["second free index"] in loop_indices)

    # Prefetch formats to speed up code generation.
    p_format = parameters["format"]
    f_comment = format["comment"]
    f_mul = format["mul"]
    f_scale_factor = format["scale factor"]
    f_iadd = format["iadd"]
    f_add = format["add"]
    f_A = format["element tensor"][p_format]
    f_j = format["first free index"]
    f_k = format["second free index"]
    f_loop = format["generate loop"]
    f_B = format["basis constant"]

    # Initialise return values.
    code = []
    num_ops = 0
    loops = {}

    # Extract sets.
    used_weights, used_psi_tables, used_nzcs, trans_set = sets

    nests = []
    # Loop terms and create code.
    for loop, (data, entry_vals) in terms.items():
        # If we don't have any entry values, there's no need to generate
        # the loop.
        if not entry_vals:
            continue

        # Get data.
        t_set, u_weights, u_psi_tables, u_nzcs, basis_consts = data

        # If we have a value, then we also need to update the sets of
        # used variables.
        trans_set.update(t_set)
        used_weights.update(u_weights)
        used_psi_tables.update(u_psi_tables)
        used_nzcs.update(u_nzcs)

        # @@@: A[0][0] += FE0[ip][j]*FE0[ip][k]*W24[ip]*det;

        entry_ir = []
        for entry, value, ops in entry_vals:
            # Left hand side
            it_vars = entry if len(loop) > 0 else (0,)
            rank = tuple(i.loop_index if hasattr(i, 'loop_index') else i
                         for i in it_vars)
            offset = tuple((1, int(i.offset)) if hasattr(i, 'offset') else (1, 0)
                           for i in it_vars)
            local_tensor = pyop2.Symbol(f_A(''), rank, offset)
            # Right hand side
            pyop2_rhs = visit_rhs(value)
            entry_ir.append(pyop2.Incr(local_tensor, pyop2_rhs,
                                       "#pragma coffee expression"))

        # Disgusting hack, ensure resulting IR comes out in same order
        # on all processes.
        entry_ir = sorted(entry_ir, key=lambda x: x.gencode())

        if len(loop) == 0:
            nest = pyop2.Block(entry_ir, open_scope=True)
        elif len(loop) in [1, 2]:
            it_var = c_sym(loop[0][0])
            end = c_sym(loop[0][2])
            nest = pyop2.For(pyop2.Decl("int", it_var, c_sym(0)),
                             pyop2.Less(it_var, end),
                             pyop2.Incr(it_var, c_sym(1)),
                             pyop2.Block(entry_ir, open_scope=True),
                             "#pragma coffee itspace")
            if len(loop) == 2:
                it_var = c_sym(loop[1][0])
                end = c_sym(loop[1][2])
                nest_k = pyop2.For(pyop2.Decl("int", it_var, c_sym(0)),
                                   pyop2.Less(it_var, end),
                                   pyop2.Incr(it_var, c_sym(1)),
                                   pyop2.Block(entry_ir, open_scope=True),
                                   "#pragma coffee itspace")
                nest.children[0] = pyop2.Block([nest_k], open_scope=True)
        nests.append(nest)

    return nests, num_ops
def facet_integral_loop(cxt_kernel, builder, coordsym, cellfacetsym,
                        cell_orientations):
    """Generates a code statement for evaluating exterior/interior facet
    integrals.

    :arg cxt_kernel: A :namedtuple:`ContextKernel` containing all
        relevant integral types and TSFC kernels associated with the
        form nested in the expression.
    :arg builder: A :class:`KernelBuilder` containing the expression context.
    :arg coordsym: An `ast.Symbol` object representing coordinate arguments
        for the kernel.
    :arg cellfacetsym: An `ast.Symbol` representing the cell facets.
    :arg cell_orientations: An `ast.Symbol` representing cell orientation
        information.

    Returns: A COFFEE code statement and updated include_dirs
    """
    exp = cxt_kernel.tensor
    t = builder.temps[exp]
    it_type = cxt_kernel.original_integral_type
    itsym = ast.Symbol("i0")

    chker = {"interior_facet": 1,
             "interior_facet_vert": 1,
             "exterior_facet": 0,
             "exterior_facet_vert": 0}

    # Compute the correct number of facets for a particular facet measure
    if it_type in ["interior_facet", "exterior_facet"]:
        # Non-extruded case
        nfacet = exp.ufl_domain().ufl_cell().num_facets()

    elif it_type in ["interior_facet_vert", "exterior_facet_vert"]:
        # Extrusion case
        base_cell = exp.ufl_domain().ufl_cell()._cells[0]
        nfacet = base_cell.num_facets()

    else:
        raise ValueError("Integral type %s not supported." % it_type)

    incl = []
    funcalls = []
    checker = chker[it_type]
    for splitkernel in cxt_kernel.tsfc_kernels:
        index = splitkernel.indices
        kinfo = splitkernel.kinfo

        # Generate an iterable of coefficients to pass to the subkernel
        # if any are required
        clist = [c for ci in kinfo.coefficient_map
                 for c in builder.coefficient(exp.coefficients()[ci])]

        incl.extend(kinfo.kernel._include_dirs)
        tensor = eigen_tensor(exp, t, index)

        if kinfo.oriented:
            clist.insert(0, cell_orientations)

        clist.append(ast.FlatBlock("&%s" % itsym))
        funcalls.append(ast.FunCall(kinfo.kernel.name,
                                    tensor, coordsym, *clist))

    loop_body = ast.If(ast.Eq(ast.Symbol(cellfacetsym, rank=(itsym, )),
                              checker),
                       [ast.Block(funcalls, open_scope=True)])

    loop_stmt = ast.For(ast.Decl("unsigned int", itsym, init=0),
                        ast.Less(itsym, nfacet),
                        ast.Incr(itsym, 1),
                        loop_body)

    return loop_stmt, incl
def generate_kernel_ast(builder, statements, declared_temps):
    """Glues together the complete AST for the Slate expression
    contained in the :class:`LocalKernelBuilder`.

    :arg builder: The :class:`LocalKernelBuilder` containing
        all relevant expression information.
    :arg statements: A list of COFFEE objects containing all
        assembly calls and temporary declarations.
    :arg declared_temps: A `dict` containing all previously
        declared temporaries.

    Return: A `KernelInfo` object describing the complete AST.
    """
    slate_expr = builder.expression
    if slate_expr.rank == 0:
        # Scalars are treated as 1x1 MatrixBase objects
        shape = (1, )
    else:
        shape = slate_expr.shape

    # Now we create the result statement by declaring its eigen type and
    # using Eigen::Map to move between Eigen and C data structs.
    statements.append(ast.FlatBlock("/* Map eigen tensor into C struct */\n"))
    result_sym = ast.Symbol("T%d" % len(declared_temps))
    result_data_sym = ast.Symbol("A%d" % len(declared_temps))
    result_type = "Eigen::Map<%s >" % eigen_matrixbase_type(shape)
    result = ast.Decl(ScalarType_c, ast.Symbol(result_data_sym),
                      pointers=[("restrict", )])
    result_statement = ast.FlatBlock("%s %s((%s *)%s);\n" %
                                     (result_type, result_sym,
                                      ScalarType_c, result_data_sym))
    statements.append(result_statement)

    # Generate the complete c++ string performing the linear algebra
    # operations on Eigen matrices/vectors
    statements.append(ast.FlatBlock("/* Linear algebra expression */\n"))
    cpp_string = ast.FlatBlock(slate_to_cpp(slate_expr, declared_temps))
    statements.append(ast.Incr(result_sym, cpp_string))

    # Generate arguments for the macro kernel
    args = [result,
            ast.Decl(ScalarType_c, builder.coord_sym,
                     pointers=[("restrict", )],
                     qualifiers=["const"])]

    # Orientation information
    if builder.oriented:
        args.append(ast.Decl("int", builder.cell_orientations_sym,
                             pointers=[("restrict", )],
                             qualifiers=["const"]))

    # Coefficient information
    expr_coeffs = slate_expr.coefficients()
    for c in expr_coeffs:
        args.extend([ast.Decl(ScalarType_c, csym,
                              pointers=[("restrict", )],
                              qualifiers=["const"])
                     for csym in builder.coefficient(c)])

    # Facet information
    if builder.needs_cell_facets:
        f_sym = builder.cell_facet_sym
        f_arg = ast.Symbol("arg_cell_facets")
        f_dtype = as_cstr(cell_to_facets_dtype)

        # cell_facets is locally a flattened 2-D array. We typecast here so we
        # can access its entries using standard array notation.
        cast = "%s (*%s)[2] = (%s (*)[2])%s;\n" % (f_dtype, f_sym,
                                                   f_dtype, f_arg)
        statements.insert(0, ast.FlatBlock(cast))
        args.append(ast.Decl(f_dtype, f_arg,
                             pointers=[("restrict", )],
                             qualifiers=["const"]))

    # NOTE: We need to be careful about the ordering here. Mesh layers are
    # added as the final argument to the kernel and the amount of layers
    # before that.
    if builder.needs_mesh_layers:
        args.append(ast.Decl("int", builder.mesh_layer_count_sym,
                             pointers=[("restrict", )],
                             qualifiers=["const"]))
        args.append(ast.Decl("int", builder.mesh_layer_sym))

    # Cell size information
    if builder.needs_cell_sizes:
        args.append(ast.Decl(ScalarType_c, builder.cell_size_sym,
                             pointers=[("restrict", )],
                             qualifiers=["const"]))

    # Macro kernel
    macro_kernel_name = "pyop2_kernel_compile_slate"
    stmts = ast.Block(statements)
    macro_kernel = ast.FunDecl("void", macro_kernel_name, args, stmts,
                               pred=["static", "inline"])

    # Construct the final ast
    kernel_ast = ast.Node(builder.templated_subkernels + [macro_kernel])

    # Now we wrap up the kernel ast as a PyOP2 kernel and include the
    # Eigen header files
    include_dirs = list(builder.include_dirs)
    include_dirs.append(EIGEN_INCLUDE_DIR)
    op2kernel = op2.Kernel(kernel_ast,
                           macro_kernel_name,
                           cpp=True,
                           include_dirs=include_dirs,
                           headers=['#include <Eigen/Dense>',
                                    '#define restrict __restrict'])
    op2kernel.num_flops = builder.expression_flops + builder.terminal_flops

    # Send back a "TSFC-like" SplitKernel object with an
    # index and KernelInfo
    kinfo = KernelInfo(kernel=op2kernel,
                       integral_type=builder.integral_type,
                       oriented=builder.oriented,
                       subdomain_id="otherwise",
                       domain_number=0,
                       coefficient_map=slate_expr.coeff_map,
                       needs_cell_facets=builder.needs_cell_facets,
                       pass_layer_arg=builder.needs_mesh_layers,
                       needs_cell_sizes=builder.needs_cell_sizes)

    return kinfo
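# Hedged NumPy analogue (not in the original source) of the typecast
# inserted above: the flat cell-facet argument is viewed as an (nfacet, 2)
# table, column 0 holding the exterior/interior flag and column 1 the
# subdomain marker (layout assumed from the lookups elsewhere in this file).
def _cell_facets_view(flat_cell_facets, nfacet):
    import numpy as np
    return np.asarray(flat_cell_facets).reshape(nfacet, 2)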
def compile_expression(slate_expr, tsfc_parameters=None):
    """Takes a Slate expression `slate_expr` and returns the appropriate
    :class:`firedrake.op2.Kernel` object representing the Slate expression.

    :arg slate_expr: a :class:`TensorBase` expression.
    :arg tsfc_parameters: an optional `dict` of form compiler parameters to
        be passed onto TSFC during the compilation of ufl forms.

    Returns: A `tuple` containing a `SplitKernel(idx, kinfo)`.
    """
    if not isinstance(slate_expr, TensorBase):
        raise ValueError("Expecting a `TensorBase` expression, not %s"
                         % type(slate_expr))

    # TODO: Get PyOP2 to write into mixed dats
    if any(len(a.function_space()) > 1 for a in slate_expr.arguments()):
        raise NotImplementedError("Compiling mixed slate expressions")

    # If the expression has already been symbolically compiled, then
    # simply reuse the produced kernel.
    if slate_expr._metakernel_cache is not None:
        return slate_expr._metakernel_cache

    # Initialize coefficients, shape and statements list
    expr_coeffs = slate_expr.coefficients()

    # We treat scalars as 1x1 MatrixBase objects, so we give the result
    # the right shape here and everything else falls out correctly.
    if slate_expr.rank == 0:
        shape = (1,)
    else:
        shape = slate_expr.shape

    statements = []

    # Create a builder for the Slate expression
    builder = KernelBuilder(expression=slate_expr,
                            tsfc_parameters=tsfc_parameters)

    # Initialize coordinate, cell orientations and facet/layer symbols
    coordsym = ast.Symbol("coords")
    coords = None
    cell_orientations = ast.Symbol("cell_orientations")
    cellfacetsym = ast.Symbol("cell_facets")
    mesh_layer_sym = ast.Symbol("layer")
    inc = []

    # We keep track of temporaries that have been declared
    declared_temps = {}
    for cxt_kernel in builder.context_kernels:
        exp = cxt_kernel.tensor
        t = builder.temps[exp]

        if exp not in declared_temps:
            # Declare and initialize the temporary
            statements.append(ast.Decl(eigen_matrixbase_type(exp.shape), t))
            statements.append(ast.FlatBlock("%s.setZero();\n" % t))
            declared_temps[exp] = t

        it_type = cxt_kernel.original_integral_type
        if it_type not in supported_integral_types:
            raise NotImplementedError("Type %s not supported." % it_type)

        # Explicit checking of coordinates
        coordinates = exp.ufl_domain().coordinates
        if coords is not None:
            assert coordinates == coords
        else:
            coords = coordinates

        if it_type == "cell":
            # Nothing difficult about cellwise integrals. Just need to get
            # coefficient info, include_dirs and append function calls to
            # the appropriate subkernels.

            # If the tensor is mixed, there will be more than one SplitKernel
            incl = []
            for splitkernel in cxt_kernel.tsfc_kernels:
                index = splitkernel.indices
                kinfo = splitkernel.kinfo

                # Generate an iterable of coefficients to pass to the
                # subkernel if any are required
                clist = [c for ci in kinfo.coefficient_map
                         for c in builder.coefficient(exp.coefficients()[ci])]

                if kinfo.oriented:
                    clist.insert(0, cell_orientations)

                incl.extend(kinfo.kernel._include_dirs)
                tensor = eigen_tensor(exp, t, index)
                statements.append(ast.FunCall(kinfo.kernel.name,
                                              tensor, coordsym, *clist))

        elif it_type in ["interior_facet", "exterior_facet",
                         "interior_facet_vert", "exterior_facet_vert"]:
            # These integral types will require accessing local facet
            # information and looping over facet indices.
            builder.require_cell_facets()
            loop_stmt, incl = facet_integral_loop(cxt_kernel, builder,
                                                  coordsym, cellfacetsym,
                                                  cell_orientations)
            statements.append(loop_stmt)

        elif it_type == "interior_facet_horiz":
            # The infamous interior horizontal facet will have two
            # SplitKernels: one top, one bottom. The mesh layer will
            # determine which kernels we call.
            builder.require_mesh_layers()
            top_sks = [k for k in cxt_kernel.tsfc_kernels
                       if k.kinfo.integral_type == "exterior_facet_top"]
            bottom_sks = [k for k in cxt_kernel.tsfc_kernels
                          if k.kinfo.integral_type == "exterior_facet_bottom"]
            assert len(top_sks) == len(bottom_sks), (
                "Number of top and bottom kernels should be equal")
            # Top and bottom kernels need to be sorted by kinfo.indices
            # if the space is mixed to ensure indices match.
            top_sks = sorted(top_sks, key=lambda x: x.indices)
            bottom_sks = sorted(bottom_sks, key=lambda x: x.indices)
            stmt, incl = extruded_int_horiz_facet(exp, builder,
                                                  top_sks, bottom_sks,
                                                  coordsym, mesh_layer_sym,
                                                  cell_orientations)
            statements.append(stmt)

        elif it_type in ["exterior_facet_bottom", "exterior_facet_top"]:
            # These kernels will only be called if we are on the top or
            # bottom layers of the extruded mesh.
            builder.require_mesh_layers()
            stmt, incl = extruded_top_bottom_facet(cxt_kernel, builder,
                                                   coordsym, mesh_layer_sym,
                                                   cell_orientations)
            statements.append(stmt)

        else:
            raise ValueError("Kernel type not recognized: %s" % it_type)

        # Don't duplicate include lines
        inc_dir = list(set(incl) - set(inc))
        inc.extend(inc_dir)

    # Now we handle any terms that require auxiliary temporaries, such as
    # inverses, transposes and actions of a tensor on a coefficient
    if builder.aux_exprs:
        # The declared temps will be updated within this method
        aux_statements = auxiliary_temporaries(builder, declared_temps)
        statements.extend(aux_statements)

    # Now we create the result statement by declaring its eigen type and
    # using Eigen::Map to move between Eigen and C data structs.
    result_sym = ast.Symbol("T%d" % len(builder.temps))
    result_data_sym = ast.Symbol("A%d" % len(builder.temps))
    result_type = "Eigen::Map<%s >" % eigen_matrixbase_type(shape)
    result = ast.Decl(SCALAR_TYPE, ast.Symbol(result_data_sym, shape))
    result_statement = ast.FlatBlock("%s %s((%s *)%s);\n" % (result_type,
                                                             result_sym,
                                                             SCALAR_TYPE,
                                                             result_data_sym))
    statements.append(result_statement)

    # Generate the complete c++ string performing the linear algebra
    # operations on Eigen matrices/vectors
    cpp_string = ast.FlatBlock(metaphrase_slate_to_cpp(slate_expr,
                                                       declared_temps))
    statements.append(ast.Incr(result_sym, cpp_string))

    # Finalize AST for macro kernel construction
    builder._finalize_kernels_and_update()

    # Generate arguments for the macro kernel
    args = [result, ast.Decl("%s **" % SCALAR_TYPE, coordsym)]

    # Orientation information
    if builder.oriented:
        args.append(ast.Decl("int **", cell_orientations))

    # Coefficient information
    for c in expr_coeffs:
        if isinstance(c, Constant):
            ctype = "%s *" % SCALAR_TYPE
        else:
            ctype = "%s **" % SCALAR_TYPE
        args.extend([ast.Decl(ctype, csym)
                     for csym in builder.coefficient(c)])

    # Facet information
    if builder.needs_cell_facets:
        args.append(ast.Decl("%s *" % as_cstr(cell_to_facets_dtype),
                             cellfacetsym))

    # NOTE: We need to be careful about the ordering here. Mesh layers are
    # added as the final argument to the kernel.
    if builder.needs_mesh_layers:
        args.append(ast.Decl("int", mesh_layer_sym))

    # NOTE: In the future we may want to have more than one "macro_kernel"
    macro_kernel_name = "compile_slate"
    stmt = ast.Block(statements)
    macro_kernel = builder.construct_macro_kernel(name=macro_kernel_name,
                                                  args=args,
                                                  statements=stmt)

    # Tell the builder to construct the final ast
    kernel_ast = builder.construct_ast([macro_kernel])

    # Now we wrap up the kernel ast as a PyOP2 kernel and include the
    # Eigen header files
    inc.extend(["%s/include/eigen3/" % d for d in PETSC_DIR])
    op2kernel = op2.Kernel(kernel_ast,
                           macro_kernel_name,
                           cpp=True,
                           include_dirs=inc,
                           headers=['#include <Eigen/Dense>',
                                    '#define restrict __restrict'])

    assert len(slate_expr.ufl_domains()) == 1, (
        "No support for multiple domains yet!")

    # Send back a "TSFC-like" SplitKernel object with an index and KernelInfo
    kinfo = KernelInfo(kernel=op2kernel,
                       integral_type=builder.integral_type,
                       oriented=builder.oriented,
                       subdomain_id="otherwise",
                       domain_number=0,
                       coefficient_map=tuple(range(len(expr_coeffs))),
                       needs_cell_facets=builder.needs_cell_facets,
                       pass_layer_arg=builder.needs_mesh_layers)

    idx = tuple([0] * slate_expr.rank)
    kernels = (SplitKernel(idx, kinfo),)

    # Store the resulting kernel for reuse
    slate_expr._metakernel_cache = kernels

    return kernels
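# Hedged usage sketch for compile_expression, assuming a Firedrake
# installation and that the function is importable from
# firedrake.slate.slac.compiler; the mesh, function space and form below
# are illustrative only, not part of this module.

from firedrake import *
from firedrake.slate.slac.compiler import compile_expression

mesh = UnitSquareMesh(2, 2)
V = FunctionSpace(mesh, "CG", 1)
u, v = TrialFunction(V), TestFunction(V)

expr = Tensor(u * v * dx).inv        # Slate expression: inverse mass matrix
kernels = compile_expression(expr)   # one SplitKernel for a non-mixed space
(idx, kinfo), = kernels
# Recompiling the same expression object reuses the kernel cached in
# _metakernel_cache, as implemented above:
assert compile_expression(expr) is kernels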
def exterior_facet_boundary_node_map(self, V, method):
    """Return the :class:`pyop2.Map` from exterior facets to nodes on
    the boundary.

    :arg V: The function space.
    :arg method: The method for determining boundary nodes. See
        :class:`~.DirichletBC` for details.
    """
    try:
        return self.map_caches["boundary_node"][method]
    except KeyError:
        pass

    el = V.finat_element
    dim = self.mesh.facet_dimension()

    if method == "topological":
        boundary_dofs = el.entity_closure_dofs()[dim]
    elif method == "geometric":
        # This function is only called on extruded meshes when asking
        # for the nodes that live on the "vertical" exterior facets.
        boundary_dofs = entity_support_dofs(el, dim)

    nodes_per_facet = len(boundary_dofs[0])

    # HACK ALERT
    # The facet set does not have a halo associated with it, since we
    # only construct halos for DoF sets. Fortunately, this loop is
    # direct and we already have all the correct information available
    # locally. So we fake a set of the correct size and carry out a
    # direct loop.
    facet_set = op2.Set(self.mesh.exterior_facets.set.total_size,
                        comm=self.mesh.comm)

    fs_dat = op2.Dat(facet_set**el.space_dimension(),
                     data=V.exterior_facet_node_map().values_with_halo.view())

    facet_dat = op2.Dat(facet_set**nodes_per_facet, dtype=IntType)

    # Ensure these come out in sorted order.
    local_facet_nodes = numpy.array(
        [boundary_dofs[e] for e in sorted(boundary_dofs.keys())])

    # Helper function to turn the inner index of an array into c
    # array literals.
    c_array = lambda xs: "{" + ", ".join(map(str, xs)) + "}"

    # AST for: l_nodes[facet[0]][n]
    rank_ast = ast.Symbol("l_nodes",
                          rank=(ast.Symbol("facet", rank=(0,)), "n"))

    body = ast.Block([
        ast.Decl("int",
                 ast.Symbol("l_nodes", (len(el.cell.topology[dim]),
                                        nodes_per_facet)),
                 init=ast.ArrayInit(c_array(map(c_array, local_facet_nodes))),
                 qualifiers=["const"]),
        ast.For(ast.Decl("int", "n", 0),
                ast.Less("n", nodes_per_facet),
                ast.Incr("n", 1),
                ast.Assign(ast.Symbol("facet_nodes", ("n",)),
                           ast.Symbol("cell_nodes", (rank_ast,))))
    ])

    kernel = op2.Kernel(
        ast.FunDecl("void", "create_bc_node_map",
                    [ast.Decl("%s*" % as_cstr(fs_dat.dtype), "cell_nodes"),
                     ast.Decl("%s*" % as_cstr(facet_dat.dtype), "facet_nodes"),
                     ast.Decl("unsigned int*", "facet")],
                    body),
        "create_bc_node_map")

    local_facet_dat = op2.Dat(
        facet_set**self.mesh.exterior_facets._rank,
        self.mesh.exterior_facets.local_facet_dat.data_ro_with_halos,
        dtype=numpy.uintc)

    op2.par_loop(kernel, facet_set,
                 fs_dat(op2.READ),
                 facet_dat(op2.WRITE),
                 local_facet_dat(op2.READ))

    if self.extruded:
        offset = self.offset[boundary_dofs[0]]
    else:
        offset = None

    val = op2.Map(facet_set, self.node_set,
                  nodes_per_facet,
                  facet_dat.data_ro_with_halos,
                  name="exterior_facet_boundary_node",
                  offset=offset)
    self.map_caches["boundary_node"][method] = val
    return val
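# The generated create_bc_node_map kernel above is pure indirection:
# facet_nodes[n] = cell_nodes[l_nodes[facet[0]][n]]. A numpy re-statement
# with made-up sizes (P1 nodes on the facets of a triangle) to make the
# lookup concrete; all values below are illustrative only.

import numpy

l_nodes = numpy.array([[1, 2], [0, 2], [0, 1]])  # local facet -> cell-local nodes
cell_nodes = numpy.array([10, 11, 12])           # cell-local -> global node numbers
local_facet = 1                                  # which facet lies on the boundary
facet_nodes = cell_nodes[l_nodes[local_facet]]
print(facet_nodes)                               # -> [10 12]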
def ast_matmul(self, F_a, implementation='optimized'):
    """Generate an AST for a PyOP2 kernel performing a matrix-vector
    multiplication."""
    # The number of dofs on each element is /ndofs*cdim/
    F_a_fs = F_a.function_space()
    ndofs = F_a_fs.fiat_element.entity_dofs()
    ndofs = sum(self.mesh.make_dofs_per_plex_entity(ndofs))
    cdim = F_a_fs.dim
    name = 'mat_vec_mul_kernel_%s' % F_a_fs.name
    identifier = (ndofs, cdim, name, implementation)
    if identifier in self.asts:
        return self.asts[identifier]

    from coffee import isa, options
    if cdim and cdim % isa['dp_reg'] == 0:
        simd_pragma = '#pragma simd reduction(+:sum)'
    else:
        simd_pragma = ''

    # Craft the AST
    if implementation == 'optimized' and cdim >= 4:
        body = ast.Incr(
            ast.Symbol('sum'),
            ast.Prod(ast.Symbol('A', ('i',),
                                ((ndofs * cdim, 'j*%d + k' % cdim),)),
                     ast.Symbol('B', ('j', 'k'))))
        body = ast.c_for('k', cdim, body, simd_pragma).children[0]
        body = [ast.Decl('const int', ast.Symbol('index'),
                         init=ast.Symbol('i%%%d' % cdim)),
                ast.Decl('double', ast.Symbol('sum'),
                         init=ast.Symbol('0.0')),
                ast.c_for('j', ndofs, body).children[0],
                ast.Assign(ast.Symbol('C', ('i/%d' % cdim, 'index')), 'sum')]
        body = ast.Block([ast.c_for('i', ndofs * cdim, body).children[0]])
    else:
        body = ast.Incr(
            ast.Symbol('C', ('i/%d' % cdim, 'index')),
            ast.Prod(ast.Symbol('A', ('i',),
                                ((ndofs * cdim, 'j*%d + k' % cdim),)),
                     ast.Symbol('B', ('j', 'k'))))
        body = ast.c_for('k', cdim, body).children[0]
        body = [ast.Decl('const int', ast.Symbol('index'),
                         init=ast.Symbol('i%%%d' % cdim)),
                ast.Assign(ast.Symbol('C', ('i/%d' % cdim, 'index')), '0.0'),
                ast.c_for('j', ndofs, body).children[0]]
        body = ast.Block([ast.c_for('i', ndofs * cdim, body).children[0]])

    funargs = [ast.Decl('double* restrict', 'A'),
               ast.Decl('double *restrict *restrict', 'B'),
               ast.Decl('double *restrict *', 'C')]
    fundecl = ast.FunDecl('void', name, funargs, body, ['static', 'inline'])

    # Track the AST for later fast retrieval
    self.asts[identifier] = fundecl
    return fundecl
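# Indexing check for ast_matmul: since C[i/cdim][i%cdim] linearises back to
# row i, the generated loop nest is just a dense mat-vec on the flattened
# input. A numpy sketch with arbitrary small sizes verifying that
# equivalence; the sizes and data are illustrative only.

import numpy

ndofs, cdim = 3, 4
rng = numpy.random.default_rng(0)
A = rng.random((ndofs * cdim, ndofs * cdim))  # element matrix, A[i][j*cdim + k]
B = rng.random((ndofs, cdim))                 # input dofs, blocked by node
C = numpy.zeros((ndofs, cdim))

for i in range(ndofs * cdim):                 # mirrors the generated loop nest
    s = 0.0
    for j in range(ndofs):
        for k in range(cdim):
            s += A[i, j * cdim + k] * B[j, k]
    C[i // cdim, i % cdim] = s

assert numpy.allclose(C.reshape(-1), A @ B.reshape(-1))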