def statement_evaluate(leaf, parameters):
    expr = leaf.expression
    if isinstance(expr, gem.ListTensor):
        if parameters.declare[leaf]:
            array_expression = numpy.vectorize(lambda v: expression(v, parameters))
            return coffee.Decl(parameters.scalar_type,
                               _decl_symbol(expr, parameters),
                               coffee.ArrayInit(array_expression(expr.array),
                                                precision=parameters.precision))
        else:
            ops = []
            for multiindex, value in numpy.ndenumerate(expr.array):
                coffee_sym = _coffee_symbol(_ref_symbol(expr, parameters), rank=multiindex)
                ops.append(coffee.Assign(coffee_sym, expression(value, parameters)))
            return coffee.Block(ops, open_scope=False)
    elif isinstance(expr, gem.Constant):
        assert parameters.declare[leaf]
        return coffee.Decl(parameters.scalar_type,
                           _decl_symbol(expr, parameters),
                           coffee.ArrayInit(expr.array, parameters.precision),
                           qualifiers=["static", "const"])
    else:
        code = expression(expr, parameters, top=True)
        if parameters.declare[leaf]:
            return coffee.Decl(parameters.scalar_type,
                               _decl_symbol(expr, parameters), code)
        else:
            return coffee.Assign(_ref_symbol(expr, parameters), code)
def ker_write2d():
    return ast.FunDecl('void', 'ker_write2d',
                       [ast.Decl('int', 'V', qualifiers=['unsigned'], pointers=[''])],
                       ast.Block([
                           ast.Assign(ast.Symbol('V', (0,)), 1),
                           ast.Assign(ast.Symbol('V', (1,)), 2)
                       ]))
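# Hedged usage sketch for the kernel above: wrap the AST in a PyOP2 kernel and
# run it directly over a set, writing into a rank-2 Dat.  The set size, dtype
# and the assumption that op2 has already been initialised are mine, not part
# of the original test code.
import numpy
from pyop2 import op2

def run_ker_write2d(nelems=4):
    s = op2.Set(nelems)
    d = op2.Dat(s ** 2, dtype=numpy.uint32)
    op2.par_loop(op2.Kernel(ker_write2d(), 'ker_write2d'), s, d(op2.WRITE))
    return d.data  # every row should come back as [1, 2]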
def auxiliary_temporaries(builder, declared_temps):
    """Generates statements for assigning auxiliary temporaries
    for nodes in an expression with "high" reference count.
    Expressions which require additional temporaries are provided
    by the :class:`LocalKernelBuilder`.

    :arg builder: The :class:`LocalKernelBuilder` containing
        all relevant expression information.
    :arg declared_temps: A `dict` containing all previously
        declared temporaries. This dictionary is updated as
        auxiliary expressions are assigned temporaries.
    """
    statements = [ast.FlatBlock("/* Auxiliary temporaries */\n")]
    results = [ast.FlatBlock("/* Assign auxiliary temps */\n")]
    for exp in builder.aux_exprs:
        if exp not in declared_temps:
            t = ast.Symbol("auxT%d" % len(declared_temps))
            result = metaphrase_slate_to_cpp(exp, declared_temps)
            tensor_type = eigen_matrixbase_type(shape=exp.shape)
            statements.append(ast.Decl(tensor_type, t))
            statements.append(ast.FlatBlock("%s.setZero();\n" % t))
            results.append(ast.Assign(t, result))
            declared_temps[exp] = t
    statements.extend(results)
    return statements
def ast_matmul(self, F_a):
    """Generate an AST for a PyOP2 kernel performing a matrix-vector
    multiplication.

    :param F_a: Assembled firedrake.Function object for the RHS"""
    # The number of dofs on each element is /ndofs*cdim/
    F_a_fs = F_a.function_space()
    ndofs = sum(F_a_fs.topological.dofs_per_entity)
    cdim = F_a_fs.dim
    name = 'mat_vec_mul_kernel_%s' % F_a_fs.name

    identifier = (ndofs, cdim, name)
    if identifier in self.asts:
        return self.asts[identifier]

    # Craft the AST
    body = ast.Incr(ast.Symbol('C', ('i/%d' % cdim, 'i%%%d' % cdim)),
                    ast.Prod(ast.Symbol('A', ('i',), ((ndofs*cdim, 'j*%d + k' % cdim),)),
                             ast.Symbol('B', ('j', 'k'))))
    body = ast.c_for('k', cdim, body).children[0]
    body = [ast.Assign(ast.Symbol('C', ('i/%d' % cdim, 'i%%%d' % cdim)), '0.0'),
            ast.c_for('j', ndofs, body).children[0]]
    body = ast.Root([ast.c_for('i', ndofs*cdim, body).children[0]])
    funargs = [ast.Decl('double*', 'A'), ast.Decl('double**', 'B'), ast.Decl('double**', 'C')]
    fundecl = ast.FunDecl('void', name, funargs, body, ['static', 'inline'])

    # Track the AST for later fast retrieval
    self.asts[identifier] = fundecl
    return fundecl
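# Hedged sketch, in plain numpy, of what the generated kernel computes, reading
# the COFFEE Symbol offsets above as A[i*(ndofs*cdim) + j*cdim + k]: A is the
# flattened (ndofs*cdim) x (ndofs*cdim) element matrix, B the (ndofs, cdim)
# coefficient, and C the result written out row by component.
import numpy

def mat_vec_mul_reference(A, B, ndofs, cdim):
    C = numpy.zeros((ndofs, cdim))
    for i in range(ndofs * cdim):
        C[i // cdim, i % cdim] = 0.0
        for j in range(ndofs):
            for k in range(cdim):
                C[i // cdim, i % cdim] += A[i * (ndofs * cdim) + j * cdim + k] * B[j, k]
    return C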
def expression_kernel(expr, args):
    """Produce a :class:`pyop2.Kernel` from the processed UFL expression
    expr and the corresponding args."""
    # Empty slot indicating assignment to indexed LHS, so don't do anything
    if type(expr) is Zero:
        return

    fs = args[0].function.function_space()

    d = ast.Symbol("dim")
    ast_expr = _ast(expr)
    body = ast.Block(
        (
            ast.Decl("int", d),
            ast.For(ast.Assign(d, ast.Symbol(0)),
                    ast.Less(d, ast.Symbol(fs.dof_dset.cdim)),
                    ast.Incr(d, ast.Symbol(1)),
                    ast_expr)
        )
    )

    return op2.Kernel(ast.FunDecl("void", "expression",
                                  [arg.arg for arg in args],
                                  body),
                      "expression")
def auxiliary_information(builder): """This function generates any auxiliary information regarding special handling of expressions that do not have any integral forms or subkernels associated with it. :arg builder: a :class:`SlateKernelBuilder` object that contains all the necessary temporary and expression information. Returns: a mapping of the form ``{aux_node: aux_temp}``, where `aux_node` is an already assembled data-object provided as a `ufl.Coefficient` and `aux_temp` is the corresponding temporary. a list of auxiliary statements are returned that contain temporary declarations and any code-blocks needed to evaluate the expression. """ aux_temps = {} aux_statements = [] for i, exp in enumerate(builder.aux_exprs): if isinstance(exp, Action): acting_coefficient = exp._acting_coefficient assert isinstance(acting_coefficient, Coefficient) temp = ast.Symbol("C%d" % i) V = acting_coefficient.function_space() node_extent = V.fiat_element.space_dimension() dof_extent = np.prod(V.ufl_element().value_shape()) aux_statements.append( ast.Decl( eigen_matrixbase_type(shape=(dof_extent * node_extent, )), temp)) aux_statements.append(ast.FlatBlock("%s.setZero();\n" % temp)) # Now we unpack the coefficient and insert its entries into a 1D vector temporary isym = ast.Symbol("i1") jsym = ast.Symbol("j1") tensor_index = ast.Sum(ast.Prod(dof_extent, isym), jsym) # Inner-loop running over dof_extent inner_loop = ast.For( ast.Decl("unsigned int", jsym, init=0), ast.Less(jsym, dof_extent), ast.Incr(jsym, 1), ast.Assign( ast.Symbol(temp, rank=(tensor_index, )), ast.Symbol(builder.coefficient_map[acting_coefficient], rank=(isym, jsym)))) # Outer-loop running over node_extent loop = ast.For(ast.Decl("unsigned int", isym, init=0), ast.Less(isym, node_extent), ast.Incr(isym, 1), inner_loop) aux_statements.append(loop) aux_temps[acting_coefficient] = temp else: raise NotImplementedError( "Auxiliary expression type %s not currently implemented." % type(exp)) return aux_temps, aux_statements
def ker_loc_reduce():
    body = ast.Incr('a', ast.Prod(ast.Symbol('V', ('i',)), ast.Symbol('B', (0,))))
    body = \
        [ast.Decl('int', 'a', '0')] +\
        ItSpace().to_for([(0, 2)], ('i',), [body]) +\
        [ast.Assign(ast.Symbol('A', (0,)), 'a')]
    return ast.FunDecl('void', 'ker_loc_reduce',
                       [ast.Decl('int', 'A', qualifiers=['unsigned'], pointers=['']),
                        ast.Decl('int', 'V', qualifiers=['unsigned'], pointers=['']),
                        ast.Decl('int', 'B', qualifiers=['unsigned'], pointers=[''])],
                       ast.Block(body))
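# Hedged sketch of what the generated reduction kernel computes, in plain
# Python: the (0, 2) iteration space handed to ItSpace above becomes range(2).
def loc_reduce_reference(V, B):
    a = 0
    for i in range(2):
        a += V[i] * B[0]
    return a  # the kernel stores this into A[0]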
def auxiliary_expressions(builder, declared_temps): """Generates statements for assigning auxiliary temporaries and declaring factorizations for local matrix inverses (if the matrix is larger than 4 x 4). :arg builder: The :class:`LocalKernelBuilder` containing all relevant expression information. :arg declared_temps: A `dict` containing all previously declared temporaries. This dictionary is updated as auxiliary expressions are assigned temporaries. """ # These are either already declared terminals or expressions # which do not require an extra temporary/expression terminals = (slate.Tensor, slate.AssembledVector, slate.Negative, slate.Transpose) statements = [] sorted_exprs = [ exp for exp in topological_sort(builder.expression_dag) if ((builder.ref_counter[exp] > 1 and not isinstance(exp, terminals)) or isinstance(exp, slate.Factorization)) ] for exp in sorted_exprs: if exp not in declared_temps: if isinstance(exp, slate.Factorization): t = ast.Symbol("dec%d" % len(declared_temps)) operand, = exp.operands expr = slate_to_cpp(operand, declared_temps) tensor_type = eigen_matrixbase_type(shape=exp.shape) stmt = "Eigen::%s<%s > %s(%s);\n" % (exp.decomposition, tensor_type, t, expr) statements.append(stmt) else: t = ast.Symbol("auxT%d" % len(declared_temps)) result = slate_to_cpp(exp, declared_temps) tensor_type = eigen_matrixbase_type(shape=exp.shape) stmt = ast.Decl(tensor_type, t) assignment = ast.Assign(t, result) statements.extend([stmt, assignment]) declared_temps[exp] = t return statements
def auxiliary_temporaries(builder, declared_temps): """This function generates auxiliary information regarding special handling of expressions that require creating additional temporaries. :arg builder: a :class:`KernelBuilder` object that contains all the necessary temporary and expression information. :arg declared_temps: a `dict` of temporaries that have already been declared and assigned values. This will be updated in this method and referenced later in the compiler. Returns: a list of auxiliary statements are returned that contain temporary declarations and any code-blocks needed to evaluate the expression. """ aux_statements = [] for exp in builder.aux_exprs: if isinstance(exp, Inverse): if builder._ref_counts[exp] > 1: # Get the temporary for the particular expression result = metaphrase_slate_to_cpp(exp, declared_temps) # Now we use the generated result and assign the value to the # corresponding temporary. temp = ast.Symbol("auxT%d" % len(declared_temps)) shape = exp.shape aux_statements.append( ast.Decl(eigen_matrixbase_type(shape), temp)) aux_statements.append(ast.FlatBlock("%s.setZero();\n" % temp)) aux_statements.append(ast.Assign(temp, result)) # Update declared temps declared_temps[exp] = temp elif isinstance(exp, Action): # Action computations are relatively inexpensive, so # we don't waste memory space on creating temps for # these expressions. However, we must create a temporary # for the actee coefficient (if we haven't already). actee, = exp.actee if actee not in declared_temps: # Declare a temporary for the coefficient V = actee.function_space() shape_array = [(Vi.finat_element.space_dimension(), np.prod(Vi.shape)) for Vi in V.split()] ctemp = ast.Symbol("auxT%d" % len(declared_temps)) shape = sum(n * d for (n, d) in shape_array) typ = eigen_matrixbase_type(shape=(shape, )) aux_statements.append(ast.Decl(typ, ctemp)) aux_statements.append(ast.FlatBlock("%s.setZero();\n" % ctemp)) # Now we populate the temporary with the coefficient # information and insert in the right place. offset = 0 for i, shp in enumerate(shape_array): node_extent, dof_extent = shp # Now we unpack the function and insert its entries into a # 1D vector temporary isym = ast.Symbol("i1") jsym = ast.Symbol("j1") tensor_index = ast.Sum( offset, ast.Sum(ast.Prod(dof_extent, isym), jsym)) # Inner-loop running over dof_extent coeff_sym = ast.Symbol(builder.coefficient(actee)[i], rank=(isym, jsym)) coeff_temp = ast.Symbol(ctemp, rank=(tensor_index, )) inner_loop = ast.For( ast.Decl("unsigned int", jsym, init=0), ast.Less(jsym, dof_extent), ast.Incr(jsym, 1), ast.Assign(coeff_temp, coeff_sym)) # Outer-loop running over node_extent loop = ast.For(ast.Decl("unsigned int", isym, init=0), ast.Less(isym, node_extent), ast.Incr(isym, 1), inner_loop) aux_statements.append(loop) offset += node_extent * dof_extent # Update declared temporaries with the coefficient declared_temps[actee] = ctemp else: raise NotImplementedError( "Auxiliary expr type %s not currently implemented." % type(exp)) return aux_statements
def compile_c_kernel(expression, to_pts, to_element, fs, coords): """Produce a :class:`PyOP2.Kernel` from the c expression provided.""" coords_space = coords.function_space() coords_element = coords_space.fiat_element names = {v[0] for v in expression._user_args} X = coords_element.tabulate(0, to_pts).values()[0] # Produce C array notation of X. X_str = "{{"+"},\n{".join([",".join(map(str, x)) for x in X.T])+"}}" A = utils.unique_name("A", names) X = utils.unique_name("X", names) x_ = utils.unique_name("x_", names) k = utils.unique_name("k", names) d = utils.unique_name("d", names) i_ = utils.unique_name("i", names) # x is a reserved name. x = "x" if "x" in names: raise ValueError("cannot use 'x' as a user-defined Expression variable") ass_exp = [ast.Assign(ast.Symbol(A, (k,), ((len(expression.code), i),)), ast.FlatBlock("%s" % code)) for i, code in enumerate(expression.code)] vals = { "X": X, "x": x, "x_": x_, "k": k, "d": d, "i": i_, "x_array": X_str, "dim": coords_space.dim, "xndof": coords_element.space_dimension(), # FS will always either be a functionspace or # vectorfunctionspace, so just accessing dim here is safe # (we don't need to go through ufl_element.value_shape()) "nfdof": to_element.space_dimension() * numpy.prod(fs.dim, dtype=int), "ndof": to_element.space_dimension(), "assign_dim": numpy.prod(expression.value_shape(), dtype=int) } init = ast.FlatBlock(""" const double %(X)s[%(ndof)d][%(xndof)d] = %(x_array)s; double %(x)s[%(dim)d]; const double pi = 3.141592653589793; """ % vals) block = ast.FlatBlock(""" for (unsigned int %(d)s=0; %(d)s < %(dim)d; %(d)s++) { %(x)s[%(d)s] = 0; for (unsigned int %(i)s=0; %(i)s < %(xndof)d; %(i)s++) { %(x)s[%(d)s] += %(X)s[%(k)s][%(i)s] * %(x_)s[%(i)s][%(d)s]; }; }; """ % vals) loop = ast.c_for(k, "%(ndof)d" % vals, ast.Block([block] + ass_exp, open_scope=True)) user_args = [] user_init = [] for _, arg in expression._user_args: if arg.shape == (1, ): user_args.append(ast.Decl("double *", "%s_" % arg.name)) user_init.append(ast.FlatBlock("const double %s = *%s_;" % (arg.name, arg.name))) else: user_args.append(ast.Decl("double *", arg.name)) kernel_code = ast.FunDecl("void", "expression_kernel", [ast.Decl("double", ast.Symbol(A, (int("%(nfdof)d" % vals),))), ast.Decl("double**", x_)] + user_args, ast.Block(user_init + [init, loop], open_scope=False)) coefficients = [coords] for _, arg in expression._user_args: coefficients.append(GlobalWrapper(arg)) return op2.Kernel(kernel_code, kernel_code.name), False, tuple(coefficients)
def _generate_element_tensor(integrals, sets, optimise_parameters, parameters): "Construct quadrature code for element tensors." # Prefetch formats to speed up code generation. f_comment = format["comment"] f_ip = format["integration points"] f_I = format["ip constant"] f_loop = format["generate loop"] f_ip_coords = format["generate ip coordinates"] f_coords = format["coordinate_dofs"] f_double = format["float declaration"] f_decl = format["declaration"] f_X = format["ip coordinates"] f_C = format["conditional"] # Initialise return values. tensor_ops_count = 0 ffc_assert(1 == len(integrals), "This function is not capable of handling multiple integrals.") # We receive a dictionary {num_points: form,}. # Loop points and forms. for points, terms, functions, ip_consts, coordinate, conditionals in integrals: nest_ir = [] ip_ir = [] num_ops = 0 # Generate code to compute coordinates if used. if coordinate: raise RuntimeError("Don't know how to compute coordinates") # Left in place for posterity name, gdim, ip, r = coordinate element_code += ["", f_comment("Declare array to hold physical coordinate of quadrature point.")] element_code += [f_decl(f_double, f_X(points, gdim))] ops, coord_code = f_ip_coords(gdim, points, name, ip, r) ip_code += ["", f_comment("Compute physical coordinate of quadrature point, operations: %d." % ops)] ip_code += [coord_code] num_ops += ops # Update used psi tables and transformation set. sets[1].add(name) sets[3].add(f_coords(r)) # Generate code to compute function values. if functions: const_func_code, func_code, ops = _generate_functions(functions, sets) nest_ir += const_func_code ip_ir += func_code num_ops += ops # Generate code to compute conditionals (might depend on coordinates # and function values so put here). # TODO: Some conditionals might only depend on geometry so they # should be moved outside if possible. if conditionals: ip_ir.append(pyop2.Decl(f_double, c_sym(f_C(len(conditionals))))) # Sort conditionals (need to in case of nested conditionals). reversed_conds = dict([(n, (o, e)) for e, (t, o, n) in conditionals.items()]) for num in range(len(conditionals)): name = format["conditional"](num) ops, expr = reversed_conds[num] ip_ir.append(pyop2.Assign(c_sym(name), visit_rhs(expr))) num_ops += ops # Generate code for ip constant declarations. # TODO: this code should be removable as only executed when ffc's optimisations are on ip_const_ops, ip_const_code = generate_aux_constants(ip_consts, f_I,\ format["assign"], True) if len(ip_const_code) > 0: raise RuntimeError("IP Const code not supported") num_ops += ip_const_ops # Generate code to evaluate the element tensor. code, ops = _generate_integral_ir(points, terms, sets, optimise_parameters, parameters) num_ops += ops tensor_ops_count += num_ops*points ip_ir += code # Loop code over all IPs. # @@@: for (ip ...) { A[0][0] += ... } if points > 1: it_var = pyop2.Symbol(f_ip, ()) nest_ir += [pyop2.For(pyop2.Decl("int", it_var, c_sym(0)), pyop2.Less(it_var, c_sym(points)), pyop2.Incr(it_var, c_sym(1)), pyop2.Block(ip_ir, open_scope=True))] else: nest_ir += ip_ir return (nest_ir, tensor_ops_count)
def build_hard_fusion_kernel(base_loop, fuse_loop, fusion_map, loop_chain_index): """ Build AST and :class:`Kernel` for two loops suitable to hard fusion. The AST consists of three functions: fusion, base, fuse. base and fuse are respectively the ``base_loop`` and the ``fuse_loop`` kernels, whereas fusion is the orchestrator that invokes, for each ``base_loop`` iteration, base and, if still to be executed, fuse. The orchestrator has the following structure: :: fusion (buffer, ..., executed): base (buffer, ...) for i = 0 to arity: if not executed[i]: additional pointer staging required by kernel2 fuse (sub_buffer, ...) insertion into buffer The executed array tracks whether the i-th iteration (out of /arity/) adjacent to the main kernel1 iteration has been executed. """ finder = Find((ast.FunDecl, ast.PreprocessNode)) base = base_loop.kernel base_ast = dcopy(base._ast) base_info = finder.visit(base_ast) base_headers = base_info[ast.PreprocessNode] base_fundecl = base_info[ast.FunDecl] assert len(base_fundecl) == 1 base_fundecl = base_fundecl[0] fuse = fuse_loop.kernel fuse_ast = dcopy(fuse._ast) fuse_info = finder.visit(fuse_ast) fuse_headers = fuse_info[ast.PreprocessNode] fuse_fundecl = fuse_info[ast.FunDecl] assert len(fuse_fundecl) == 1 fuse_fundecl = fuse_fundecl[0] # Create /fusion/ arguments and signature body = ast.Block([]) fusion_name = '%s_%s' % (base_fundecl.name, fuse_fundecl.name) fusion_args = dcopy(base_fundecl.args + fuse_fundecl.args) fusion_fundecl = ast.FunDecl(base_fundecl.ret, fusion_name, fusion_args, body) # Make sure kernel and variable names are unique base_fundecl.name = "%s_base" % base_fundecl.name fuse_fundecl.name = "%s_fuse" % fuse_fundecl.name for i, decl in enumerate(fusion_args): decl.sym.symbol += '_%d' % i # Filter out duplicate arguments, and append extra arguments to the fundecl binding = WeakFilter().kernel_args([base_loop, fuse_loop], fusion_fundecl) fusion_args += [ast.Decl('int*', 'executed'), ast.Decl('int*', 'fused_iters'), ast.Decl('int', 'i')] # Which args are actually used in /fuse/, but not in /base/ ? 
The gather for # such arguments is moved to /fusion/, to avoid usless memory LOADs base_dats = set(a.data for a in base_loop.args) fuse_dats = set(a.data for a in fuse_loop.args) unshared = OrderedDict() for arg, decl in binding.items(): if arg.data in fuse_dats - base_dats: unshared.setdefault(decl, arg) # Track position of Args that need a postponed gather # Can't track Args themselves as they change across different parloops fargs = {fusion_args.index(i): ('postponed', False) for i in unshared.keys()} fargs.update({len(set(binding.values())): ('onlymap', True)}) # Add maps for arguments that need a postponed gather for decl, arg in unshared.items(): decl_pos = fusion_args.index(decl) fusion_args[decl_pos].sym.symbol = arg.c_arg_name() if arg._is_indirect: fusion_args[decl_pos].sym.rank = () fusion_args.insert(decl_pos + 1, ast.Decl('int*', arg.c_map_name(0, 0))) # Append the invocation of /base/; then, proceed with the invocation # of the /fuse/ kernels base_funcall_syms = [binding[a].sym.symbol for a in base_loop.args] body.children.append(ast.FunCall(base_fundecl.name, *base_funcall_syms)) for idx in range(fusion_map.arity): fused_iter = ast.Assign('i', ast.Symbol('fused_iters', (idx,))) fuse_funcall = ast.FunCall(fuse_fundecl.name) if_cond = ast.Not(ast.Symbol('executed', ('i',))) if_update = ast.Assign(ast.Symbol('executed', ('i',)), 1) if_body = ast.Block([fuse_funcall, if_update], open_scope=True) if_exec = ast.If(if_cond, [if_body]) body.children.extend([ast.FlatBlock('\n'), fused_iter, if_exec]) # Modify the /fuse/ kernel # This is to take into account that many arguments are shared with # /base/, so they will only staged once for /base/. This requires # tweaking the way the arguments are declared and accessed in /fuse/. # For example, the shared incremented array (called /buffer/ in # the pseudocode in the comment above) now needs to take offsets # to be sure the locations that /base/ is supposed to increment are # actually accessed. The same concept apply to indirect arguments. init = lambda v: '{%s}' % ', '.join([str(j) for j in v]) for i, fuse_loop_arg in enumerate(fuse_loop.args): fuse_kernel_arg = binding[fuse_loop_arg] buffer_name = '%s_vec' % fuse_kernel_arg.sym.symbol fuse_funcall_sym = ast.Symbol(buffer_name) # What kind of temporaries do we need ? if fuse_loop_arg.access == INC: op, lvalue, rvalue = ast.Incr, fuse_kernel_arg.sym.symbol, buffer_name stager = lambda b, l: b.children.extend(l) indexer = lambda indices: [(k, j) for j, k in enumerate(indices)] pointers = [] elif fuse_loop_arg.access == READ: op, lvalue, rvalue = ast.Assign, buffer_name, fuse_kernel_arg.sym.symbol stager = lambda b, l: [b.children.insert(0, j) for j in reversed(l)] indexer = lambda indices: [(j, k) for j, k in enumerate(indices)] pointers = list(fuse_kernel_arg.pointers) # Now gonna handle arguments depending on their type and rank ... if fuse_loop_arg._is_global: # ... Handle global arguments. These can be dropped in the # kernel without any particular fiddling fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol) elif fuse_kernel_arg in unshared: # ... 
Handle arguments that appear only in /fuse/ staging = unshared[fuse_kernel_arg].c_vec_init(False).split('\n') rvalues = [ast.FlatBlock(j.split('=')[1]) for j in staging] lvalues = [ast.Symbol(buffer_name, (j,)) for j in range(len(staging))] staging = [ast.Assign(j, k) for j, k in zip(lvalues, rvalues)] # Set up the temporary buffer_symbol = ast.Symbol(buffer_name, (len(staging),)) buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, qualifiers=fuse_kernel_arg.qual, pointers=list(pointers)) # Update the if-then AST body stager(if_exec.children[0], staging) if_exec.children[0].children.insert(0, buffer_decl) elif fuse_loop_arg._is_mat: # ... Handle Mats staging = [] for b in fused_inc_arg._block_shape: for rc in b: lvalue = ast.Symbol(lvalue, (idx, idx), ((rc[0], 'j'), (rc[1], 'k'))) rvalue = ast.Symbol(rvalue, ('j', 'k')) staging = ItSpace(mode=0).to_for([(0, rc[0]), (0, rc[1])], ('j', 'k'), [op(lvalue, rvalue)])[:1] # Set up the temporary buffer_symbol = ast.Symbol(buffer_name, (fuse_kernel_arg.sym.rank,)) buffer_init = ast.ArrayInit(init([init([0.0])])) buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, buffer_init, qualifiers=fuse_kernel_arg.qual, pointers=pointers) # Update the if-then AST body stager(if_exec.children[0], staging) if_exec.children[0].children.insert(0, buffer_decl) elif fuse_loop_arg._is_indirect: cdim = fuse_loop_arg.data.cdim if cdim == 1 and fuse_kernel_arg.sym.rank: # [Special case] # ... Handle rank 1 indirect arguments that appear in both # /base/ and /fuse/: just point into the right location rank = (idx,) if fusion_map.arity > 1 else () fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol, rank) else: # ... Handle indirect arguments. At the C level, these arguments # are of pointer type, so simple pointer arithmetic is used # to ensure the kernel accesses are to the correct locations fuse_arity = fuse_loop_arg.map.arity base_arity = fuse_arity*fusion_map.arity size = fuse_arity*cdim # Set the proper storage layout before invoking /fuse/ ofs_vals = [[base_arity*j + k for k in range(fuse_arity)] for j in range(cdim)] ofs_vals = [[fuse_arity*j + k for k in flatten(ofs_vals)] for j in range(fusion_map.arity)] ofs_vals = list(flatten(ofs_vals)) indices = [ofs_vals[idx*size + j] for j in range(size)] staging = [op(ast.Symbol(lvalue, (j,)), ast.Symbol(rvalue, (k,))) for j, k in indexer(indices)] # Set up the temporary buffer_symbol = ast.Symbol(buffer_name, (size,)) if fuse_loop_arg.access == INC: buffer_init = ast.ArrayInit(init([0.0])) else: buffer_init = ast.EmptyStatement() pointers.pop() buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, buffer_init, qualifiers=fuse_kernel_arg.qual, pointers=pointers) # Update the if-then AST body stager(if_exec.children[0], staging) if_exec.children[0].children.insert(0, buffer_decl) else: # Nothing special to do for direct arguments pass # Finally update the /fuse/ funcall fuse_funcall.children.append(fuse_funcall_sym) fused_headers = set([str(h) for h in base_headers + fuse_headers]) fused_ast = ast.Root([ast.PreprocessNode(h) for h in fused_headers] + [base_fundecl, fuse_fundecl, fusion_fundecl]) return Kernel([base, fuse], fused_ast, loop_chain_index), fargs
def ast_matmul(self, F_a, implementation='optimized'):
    """Generate an AST for a PyOP2 kernel performing a matrix-vector
    multiplication."""
    # The number of dofs on each element is /ndofs*cdim/
    F_a_fs = F_a.function_space()
    ndofs = F_a_fs.fiat_element.entity_dofs()
    ndofs = sum(self.mesh.make_dofs_per_plex_entity(ndofs))
    cdim = F_a_fs.dim
    name = 'mat_vec_mul_kernel_%s' % F_a_fs.name

    identifier = (ndofs, cdim, name, implementation)
    if identifier in self.asts:
        return self.asts[identifier]

    from coffee import isa, options
    if cdim and cdim % isa['dp_reg'] == 0:
        simd_pragma = '#pragma simd reduction(+:sum)'
    else:
        simd_pragma = ''

    # Craft the AST
    if implementation == 'optimized' and cdim >= 4:
        body = ast.Incr(
            ast.Symbol('sum'),
            ast.Prod(
                ast.Symbol('A', ('i',), ((ndofs * cdim, 'j*%d + k' % cdim),)),
                ast.Symbol('B', ('j', 'k'))))
        body = ast.c_for('k', cdim, body, simd_pragma).children[0]
        body = [
            ast.Decl('const int', ast.Symbol('index'), init=ast.Symbol('i%%%d' % cdim)),
            ast.Decl('double', ast.Symbol('sum'), init=ast.Symbol('0.0')),
            ast.c_for('j', ndofs, body).children[0],
            ast.Assign(ast.Symbol('C', ('i/%d' % cdim, 'index')), 'sum')
        ]
        body = ast.Block([ast.c_for('i', ndofs * cdim, body).children[0]])
        funargs = [
            ast.Decl('double* restrict', 'A'),
            ast.Decl('double *restrict *restrict', 'B'),
            ast.Decl('double *restrict *', 'C')
        ]
        fundecl = ast.FunDecl('void', name, funargs, body, ['static', 'inline'])
    else:
        body = ast.Incr(
            ast.Symbol('C', ('i/%d' % cdim, 'index')),
            ast.Prod(
                ast.Symbol('A', ('i',), ((ndofs * cdim, 'j*%d + k' % cdim),)),
                ast.Symbol('B', ('j', 'k'))))
        body = ast.c_for('k', cdim, body).children[0]
        body = [
            ast.Decl('const int', ast.Symbol('index'), init=ast.Symbol('i%%%d' % cdim)),
            ast.Assign(ast.Symbol('C', ('i/%d' % cdim, 'index')), '0.0'),
            ast.c_for('j', ndofs, body).children[0]
        ]
        body = ast.Block([ast.c_for('i', ndofs * cdim, body).children[0]])
        funargs = [
            ast.Decl('double* restrict', 'A'),
            ast.Decl('double *restrict *restrict', 'B'),
            ast.Decl('double *restrict *', 'C')
        ]
        fundecl = ast.FunDecl('void', name, funargs, body, ['static', 'inline'])

    # Track the AST for later fast retrieval
    self.asts[identifier] = fundecl
    return fundecl
def compile_expression(slate_expr, tsfc_parameters=None): """Takes a SLATE expression `slate_expr` and returns the appropriate :class:`firedrake.op2.Kernel` object representing the SLATE expression. :arg slate_expr: a :class:'TensorBase' expression. :arg tsfc_parameters: an optional `dict` of form compiler parameters to be passed onto TSFC during the compilation of ufl forms. """ if not isinstance(slate_expr, TensorBase): raise ValueError( "Expecting a `slate.TensorBase` expression, not a %r" % slate_expr) # TODO: Get PyOP2 to write into mixed dats if any(len(a.function_space()) > 1 for a in slate_expr.arguments()): raise NotImplementedError("Compiling mixed slate expressions") # Initialize shape and statements list shape = slate_expr.shape statements = [] # Create a builder for the SLATE expression builder = KernelBuilder(expression=slate_expr, tsfc_parameters=tsfc_parameters) # Initialize coordinate and facet symbols coordsym = ast.Symbol("coords") coords = None cellfacetsym = ast.Symbol("cell_facets") inc = [] # Now we construct the list of statements to provide to the builder context_temps = builder.temps.copy() for exp, t in context_temps.items(): statements.append(ast.Decl(eigen_matrixbase_type(exp.shape), t)) statements.append(ast.FlatBlock("%s.setZero();\n" % t)) for splitkernel in builder.kernel_exprs[exp]: clist = [] index = splitkernel.indices kinfo = splitkernel.kinfo integral_type = kinfo.integral_type if integral_type not in [ "cell", "interior_facet", "exterior_facet" ]: raise NotImplementedError( "Integral type %s not currently supported." % integral_type) coordinates = exp.ufl_domain().coordinates if coords is not None: assert coordinates == coords else: coords = coordinates for cindex in kinfo.coefficient_map: c = exp.coefficients()[cindex] # Handles both mixed and non-mixed coefficient cases clist.extend(builder.extract_coefficient(c)) inc.extend(kinfo.kernel._include_dirs) tensor = eigen_tensor(exp, t, index) if integral_type in ["interior_facet", "exterior_facet"]: builder.require_cell_facets() itsym = ast.Symbol("i0") clist.append(ast.FlatBlock("&%s" % itsym)) loop_body = [] nfacet = exp.ufl_domain().ufl_cell().num_facets() if integral_type == "exterior_facet": checker = 1 else: checker = 0 loop_body.append( ast.If( ast.Eq(ast.Symbol(cellfacetsym, rank=(itsym, )), checker), [ ast.Block([ ast.FunCall(kinfo.kernel.name, tensor, coordsym, *clist) ], open_scope=True) ])) loop = ast.For(ast.Decl("unsigned int", itsym, init=0), ast.Less(itsym, nfacet), ast.Incr(itsym, 1), loop_body) statements.append(loop) else: statements.append( ast.FunCall(kinfo.kernel.name, tensor, coordsym, *clist)) # Now we handle any terms that require auxiliary data (if any) if bool(builder.aux_exprs): aux_temps, aux_statements = auxiliary_information(builder) context_temps.update(aux_temps) statements.extend(aux_statements) result_sym = ast.Symbol("T%d" % len(builder.temps)) result_data_sym = ast.Symbol("A%d" % len(builder.temps)) result_type = "Eigen::Map<%s >" % eigen_matrixbase_type(shape) result = ast.Decl(SCALAR_TYPE, ast.Symbol(result_data_sym, shape)) result_statement = ast.FlatBlock( "%s %s((%s *)%s);\n" % (result_type, result_sym, SCALAR_TYPE, result_data_sym)) statements.append(result_statement) cpp_string = ast.FlatBlock( metaphrase_slate_to_cpp(slate_expr, context_temps)) statements.append(ast.Assign(result_sym, cpp_string)) # Generate arguments for the macro kernel args = [result, ast.Decl("%s **" % SCALAR_TYPE, coordsym)] for c in slate_expr.coefficients(): if isinstance(c, Constant): 
ctype = "%s *" % SCALAR_TYPE else: ctype = "%s **" % SCALAR_TYPE args.extend([ ast.Decl(ctype, sym_c) for sym_c in builder.extract_coefficient(c) ]) if builder.needs_cell_facets: args.append(ast.Decl("char *", cellfacetsym)) macro_kernel_name = "compile_slate" kernel_ast, oriented = builder.construct_ast( name=macro_kernel_name, args=args, statements=ast.Block(statements)) inc.extend(["%s/include/eigen3/" % d for d in PETSC_DIR]) op2kernel = op2.Kernel( kernel_ast, macro_kernel_name, cpp=True, include_dirs=inc, headers=['#include <Eigen/Dense>', '#define restrict __restrict']) assert len(slate_expr.ufl_domains()) == 1 kinfo = KernelInfo(kernel=op2kernel, integral_type="cell", oriented=oriented, subdomain_id="otherwise", domain_number=0, coefficient_map=range(len(slate_expr.coefficients())), needs_cell_facets=builder.needs_cell_facets) idx = tuple([0] * slate_expr.rank) return (SplitKernel(idx, kinfo), )
def ker_init():
    return ast.FunDecl('void', 'ker_init',
                       [ast.Decl('int', 'B', qualifiers=['unsigned'], pointers=[''])],
                       ast.Block([ast.Assign(ast.Symbol('B', (0,)), 0)]))
def compile_c_kernel(expression, to_pts, to_element, fs, coords): """Produce a :class:`PyOP2.Kernel` from the c expression provided.""" coords_space = coords.function_space() coords_element = create_element(coords_space.ufl_element(), vector_is_mixed=False) names = {v[0] for v in expression._user_args} X = list(coords_element.tabulate(0, to_pts).values())[0] # Produce C array notation of X. X_str = "{{" + "},\n{".join([",".join(map(str, x)) for x in X.T]) + "}}" A = utils.unique_name("A", names) X = utils.unique_name("X", names) x_ = utils.unique_name("x_", names) k = utils.unique_name("k", names) d = utils.unique_name("d", names) i_ = utils.unique_name("i", names) # x is a reserved name. x = "x" if "x" in names: raise ValueError( "cannot use 'x' as a user-defined Expression variable") ass_exp = [ ast.Assign(ast.Symbol(A, (k, ), ((len(expression.code), i), )), ast.FlatBlock("%s" % code)) for i, code in enumerate(expression.code) ] dim = coords_space.value_size ndof = to_element.space_dimension() xndof = coords_element.space_dimension() nfdof = to_element.space_dimension() * numpy.prod(fs.value_size, dtype=int) init_X = ast.Decl(typ="double", sym=ast.Symbol(X, rank=(ndof, xndof)), qualifiers=["const"], init=X_str) init_x = ast.Decl(typ="double", sym=ast.Symbol(x, rank=(coords_space.value_size, ))) init_pi = ast.Decl(typ="double", sym="pi", qualifiers=["const"], init="3.141592653589793") init = ast.Block([init_X, init_x, init_pi]) incr_x = ast.Incr( ast.Symbol(x, rank=(d, )), ast.Prod(ast.Symbol(X, rank=(k, i_)), ast.Symbol(x_, rank=(ast.Sum(ast.Prod(i_, dim), d), )))) assign_x = ast.Assign(ast.Symbol(x, rank=(d, )), 0) loop_x = ast.For(init=ast.Decl("unsigned int", i_, 0), cond=ast.Less(i_, xndof), incr=ast.Incr(i_, 1), body=[incr_x]) block = ast.For(init=ast.Decl("unsigned int", d, 0), cond=ast.Less(d, dim), incr=ast.Incr(d, 1), body=[assign_x, loop_x]) loop = ast.c_for(k, ndof, ast.Block([block] + ass_exp, open_scope=True)) user_args = [] user_init = [] for _, arg in expression._user_args: if arg.shape == (1, ): user_args.append(ast.Decl("double *", "%s_" % arg.name)) user_init.append( ast.FlatBlock("const double %s = *%s_;" % (arg.name, arg.name))) else: user_args.append(ast.Decl("double *", arg.name)) kernel_code = ast.FunDecl( "void", "expression_kernel", [ ast.Decl("double", ast.Symbol(A, (nfdof, ))), ast.Decl("double*", x_) ] + user_args, ast.Block(user_init + [init, loop], open_scope=False)) coefficients = [coords] for _, arg in expression._user_args: coefficients.append(GlobalWrapper(arg)) return op2.Kernel(kernel_code, kernel_code.name), False, tuple(coefficients)
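# Hedged sketch of the coordinate-evaluation loop generated above, in plain
# Python.  X holds the coordinate element tabulated at the target points, and
# x_ packs the cell's coordinate dofs flat as [node0_x, node0_y, node1_x, ...],
# matching the x_[i*dim + d] indexing used in the AST.
def physical_coordinate(X, x_, k, dim):
    xndof = len(x_) // dim
    return [sum(X[k][i] * x_[i * dim + d] for i in range(xndof)) for d in range(dim)]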
def exterior_facet_boundary_node_map(self, method): '''The :class:`pyop2.Map` from exterior facets to the nodes on those facets. Note that this differs from :meth:`exterior_facet_node_map` in that only surface nodes are referenced, not all nodes in cells touching the surface. :arg method: The method for determining boundary nodes. See :class:`~.bcs.DirichletBC`. ''' el = self.fiat_element dim = self._mesh.facet_dimension() if method == "topological": boundary_dofs = el.entity_closure_dofs()[dim] elif method == "geometric": boundary_dofs = el.facet_support_dofs() nodes_per_facet = \ len(boundary_dofs[0]) # HACK ALERT # The facet set does not have a halo associated with it, since # we only construct halos for DoF sets. Fortunately, this # loop is direct and we already have all the correct # information available locally. So We fake a set of the # correct size and carry out a direct loop facet_set = op2.Set(self._mesh.exterior_facets.set.total_size) fs_dat = op2.Dat(facet_set**el.space_dimension(), data=self.exterior_facet_node_map().values_with_halo) facet_dat = op2.Dat(facet_set**nodes_per_facet, dtype=np.int32) local_facet_nodes = np.array( [dofs for e, dofs in boundary_dofs.iteritems()]) # Helper function to turn the inner index of an array into c # array literals. c_array = lambda xs: "{" + ", ".join(map(str, xs)) + "}" body = ast.Block([ ast.Decl("int", ast.Symbol("l_nodes", (len(el.get_reference_element().topology[dim]), nodes_per_facet)), init=ast.ArrayInit( c_array(map(c_array, local_facet_nodes))), qualifiers=["const"]), ast.For( ast.Decl("int", "n", 0), ast.Less("n", nodes_per_facet), ast.Incr("n", 1), ast.Assign( ast.Symbol("facet_nodes", ("n", )), ast.Symbol("cell_nodes", ("l_nodes[facet[0]][n]", )))) ]) kernel = op2.Kernel( ast.FunDecl("void", "create_bc_node_map", [ ast.Decl("int*", "cell_nodes"), ast.Decl("int*", "facet_nodes"), ast.Decl("unsigned int*", "facet") ], body), "create_bc_node_map") local_facet_dat = op2.Dat( facet_set**self._mesh.exterior_facets._rank, self._mesh.exterior_facets.local_facet_dat.data_ro_with_halos, dtype=np.uintc) op2.par_loop(kernel, facet_set, fs_dat(op2.READ), facet_dat(op2.WRITE), local_facet_dat(op2.READ)) if isinstance(self._mesh, mesh_t.ExtrudedMesh): offset = self.offset[boundary_dofs[0]] else: offset = None return op2.Map(facet_set, self.node_set, nodes_per_facet, facet_dat.data_ro_with_halos, name="exterior_facet_boundary_node", offset=offset)
def coefficient_temporaries(builder, declared_temps): """Generates coefficient temporary statements for assigning coefficients to vector temporaries. :arg builder: The :class:`LocalKernelBuilder` containing all relevant expression information. :arg declared_temps: A `dict` keeping track of all declared temporaries. This dictionary is updated as coefficients are assigned temporaries. 'AssembledVector's require creating coefficient temporaries to store data. The temporaries are created by inspecting the function space of the coefficient to compute node and dof extents. The coefficient is then assigned values by looping over both the node extent and dof extent (double FOR-loop). A double FOR-loop is needed for each function space (if the function space is mixed, then a loop will be constructed for each component space). The general structure of each coefficient loop will be: FOR (i1=0; i1<node_extent; i1++): FOR (j1=0; j1<dof_extent; j1++): VT0[offset + (dof_extent * i1) + j1] = w_0_0[i1][j1] VT1[offset + (dof_extent * i1) + j1] = w_1_0[i1][j1] . . . where wT0, wT1, ... are temporaries for coefficients sharing the same node and dof extents. The offset is computed based on whether the function space is mixed. The offset is always 0 for non-mixed coefficients. If the coefficient is mixed, then the offset is incremented by the total number of nodal unknowns associated with the component spaces of the mixed space. """ statements = [ast.FlatBlock("/* Coefficient temporaries */\n")] j = ast.Symbol("j1") loops = [ast.FlatBlock("/* Loops for coefficient temps */\n")] for dofs, cinfo_list in builder.coefficient_vecs.items(): # Collect all coefficients which share the same node/dof extent assignments = [] for cinfo in cinfo_list: fs_i = cinfo.space_index offset = cinfo.offset_index c_shape = cinfo.shape vector = cinfo.vector function = vector._function t = cinfo.local_temp if vector not in declared_temps: # Declare and initialize coefficient temporary c_type = eigen_matrixbase_type(shape=c_shape) statements.append(ast.Decl(c_type, t)) declared_temps[vector] = t # Assigning coefficient values into temporary coeff_sym = ast.Symbol(builder.coefficient(function)[fs_i], rank=(j, )) index = ast.Sum(offset, j) coeff_temp = ast.Symbol(t, rank=(index, )) assignments.append(ast.Assign(coeff_temp, coeff_sym)) # loop over dofs loop = ast.For(ast.Decl("unsigned int", j, init=0), ast.Less(j, dofs), ast.Incr(j, 1), assignments) loops.append(loop) statements.extend(loops) return statements
def statement_initialise(leaf, parameters):
    if parameters.declare[leaf]:
        return coffee.Decl(parameters.scalar_type,
                           _decl_symbol(leaf.indexsum, parameters), 0.0)
    else:
        return coffee.Assign(_ref_symbol(leaf.indexsum, parameters), 0.0)
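# Hedged sketch of the two code shapes the branch above chooses between, built
# with COFFEE directly; the symbol name "A0" and the scalar type are made up,
# and the exact rendering of gencode() may differ slightly between versions.
import coffee.base as coffee

print(coffee.Decl("double", coffee.Symbol("A0", (3,)), 0.0).gencode())
# roughly: double A0[3] = 0.0;
print(coffee.Assign(coffee.Symbol("A0", (0,)), 0.0).gencode())
# roughly: A0[0] = 0.0;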
def init_cell_orientations(self, expr): """Compute and initialise :attr:`cell_orientations` relative to a specified orientation. :arg expr: an :class:`.Expression` evaluated to produce a reference normal direction. """ import firedrake.function as function import firedrake.functionspace as functionspace if expr.value_shape()[0] != 3: raise NotImplementedError('Only implemented for 3-vectors') if self.ufl_cell() not in (ufl.Cell('triangle', 3), ufl.Cell("quadrilateral", 3), ufl.OuterProductCell(ufl.Cell('interval'), ufl.Cell('interval'), gdim=3)): raise NotImplementedError('Only implemented for triangles and quadrilaterals embedded in 3d') if hasattr(self.topology, '_cell_orientations'): raise RuntimeError("init_cell_orientations already called, did you mean to do so again?") v0 = lambda x: ast.Symbol("v0", (x,)) v1 = lambda x: ast.Symbol("v1", (x,)) n = lambda x: ast.Symbol("n", (x,)) x = lambda x: ast.Symbol("x", (x,)) coords = lambda x, y: ast.Symbol("coords", (x, y)) body = [] body += [ast.Decl("double", v(3)) for v in [v0, v1, n, x]] body.append(ast.Decl("double", "dot")) body.append(ast.Assign("dot", 0.0)) body.append(ast.Decl("int", "i")) # if triangle, use v0 = x1 - x0, v1 = x2 - x0 # otherwise, for the various quads, use v0 = x2 - x0, v1 = x1 - x0 # recall reference element ordering: # triangle: 2 quad: 1 3 # 0 1 0 2 if self.ufl_cell() == ufl.Cell('triangle', 3): body.append(ast.For(ast.Assign("i", 0), ast.Less("i", 3), ast.Incr("i", 1), [ast.Assign(v0("i"), ast.Sub(coords(1, "i"), coords(0, "i"))), ast.Assign(v1("i"), ast.Sub(coords(2, "i"), coords(0, "i"))), ast.Assign(x("i"), 0.0)])) else: body.append(ast.For(ast.Assign("i", 0), ast.Less("i", 3), ast.Incr("i", 1), [ast.Assign(v0("i"), ast.Sub(coords(2, "i"), coords(0, "i"))), ast.Assign(v1("i"), ast.Sub(coords(1, "i"), coords(0, "i"))), ast.Assign(x("i"), 0.0)])) # n = v0 x v1 body.append(ast.Assign(n(0), ast.Sub(ast.Prod(v0(1), v1(2)), ast.Prod(v0(2), v1(1))))) body.append(ast.Assign(n(1), ast.Sub(ast.Prod(v0(2), v1(0)), ast.Prod(v0(0), v1(2))))) body.append(ast.Assign(n(2), ast.Sub(ast.Prod(v0(0), v1(1)), ast.Prod(v0(1), v1(0))))) body.append(ast.For(ast.Assign("i", 0), ast.Less("i", 3), ast.Incr("i", 1), [ast.Incr(x(j), coords("i", j)) for j in range(3)])) body.extend([ast.FlatBlock("dot += (%(x)s) * n[%(i)d];\n" % {"x": x_, "i": i}) for i, x_ in enumerate(expr.code)]) body.append(ast.Assign("orientation[0][0]", ast.Ternary(ast.Less("dot", 0), 1, 0))) kernel = op2.Kernel(ast.FunDecl("void", "cell_orientations", [ast.Decl("int**", "orientation"), ast.Decl("double**", "coords")], ast.Block(body)), "cell_orientations") # Build the cell orientations as a DG0 field (so that we can # pass it in for facet integrals and the like) fs = functionspace.FunctionSpace(self, 'DG', 0) cell_orientations = function.Function(fs, name="cell_orientations", dtype=np.int32) op2.par_loop(kernel, self.cell_set, cell_orientations.dat(op2.WRITE, cell_orientations.cell_node_map()), self.coordinates.dat(op2.READ, self.coordinates.cell_node_map())) self.topology._cell_orientations = cell_orientations
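# Hedged numpy sketch of the per-cell computation the kernel above performs in
# its triangle branch: build the cell normal from two edge vectors and compare
# it against the user-supplied reference direction evaluated at the (unscaled)
# sum of the first three vertex coordinates.
import numpy as np

def cell_orientation_reference(coords, reference_normal):
    v0 = coords[1] - coords[0]
    v1 = coords[2] - coords[0]
    n = np.cross(v0, v1)
    x = coords[:3].sum(axis=0)
    return 1 if np.dot(reference_normal(x), n) < 0 else 0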
def exterior_facet_boundary_node_map(self, V, method): """Return the :class:`pyop2.Map` from exterior facets to nodes on the boundary. :arg V: The function space. :arg method: The method for determining boundary nodes. See :class:`~.DirichletBC` for details. """ try: return self.map_caches["boundary_node"][method] except KeyError: pass el = V.finat_element dim = self.mesh.facet_dimension() if method == "topological": boundary_dofs = el.entity_closure_dofs()[dim] elif method == "geometric": # This function is only called on extruded meshes when # asking for the nodes that live on the "vertical" # exterior facets. boundary_dofs = entity_support_dofs(el, dim) nodes_per_facet = \ len(boundary_dofs[0]) # HACK ALERT # The facet set does not have a halo associated with it, since # we only construct halos for DoF sets. Fortunately, this # loop is direct and we already have all the correct # information available locally. So We fake a set of the # correct size and carry out a direct loop facet_set = op2.Set(self.mesh.exterior_facets.set.total_size, comm=self.mesh.comm) fs_dat = op2.Dat( facet_set**el.space_dimension(), data=V.exterior_facet_node_map().values_with_halo.view()) facet_dat = op2.Dat(facet_set**nodes_per_facet, dtype=IntType) # Ensure these come out in sorted order. local_facet_nodes = numpy.array( [boundary_dofs[e] for e in sorted(boundary_dofs.keys())]) # Helper function to turn the inner index of an array into c # array literals. c_array = lambda xs: "{" + ", ".join(map(str, xs)) + "}" # AST for: l_nodes[facet[0]][n] rank_ast = ast.Symbol("l_nodes", rank=(ast.Symbol("facet", rank=(0, )), "n")) body = ast.Block([ ast.Decl("int", ast.Symbol("l_nodes", (len(el.cell.topology[dim]), nodes_per_facet)), init=ast.ArrayInit( c_array(map(c_array, local_facet_nodes))), qualifiers=["const"]), ast.For( ast.Decl("int", "n", 0), ast.Less("n", nodes_per_facet), ast.Incr("n", 1), ast.Assign(ast.Symbol("facet_nodes", ("n", )), ast.Symbol("cell_nodes", (rank_ast, )))) ]) kernel = op2.Kernel( ast.FunDecl("void", "create_bc_node_map", [ ast.Decl("%s*" % as_cstr(fs_dat.dtype), "cell_nodes"), ast.Decl("%s*" % as_cstr(facet_dat.dtype), "facet_nodes"), ast.Decl("unsigned int*", "facet") ], body), "create_bc_node_map") local_facet_dat = op2.Dat( facet_set**self.mesh.exterior_facets._rank, self.mesh.exterior_facets.local_facet_dat.data_ro_with_halos, dtype=numpy.uintc) op2.par_loop(kernel, facet_set, fs_dat(op2.READ), facet_dat(op2.WRITE), local_facet_dat(op2.READ)) if self.extruded: offset = self.offset[boundary_dofs[0]] else: offset = None val = op2.Map(facet_set, self.node_set, nodes_per_facet, facet_dat.data_ro_with_halos, name="exterior_facet_boundary_node", offset=offset) self.map_caches["boundary_node"][method] = val return val
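# Hedged sketch of what the generated create_bc_node_map kernel does, in plain
# Python: for one exterior facet, look up its boundary dofs in the local table
# and copy the corresponding entries of the cell's node list into the facet map.
def create_bc_node_map_reference(cell_nodes, local_facet_nodes, facet):
    # local_facet_nodes[f][n] is the n-th boundary dof of local facet f
    return [cell_nodes[ln] for ln in local_facet_nodes[facet]]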
def coefficient_temporaries(builder, declared_temps): """Generates coefficient temporary statements for assigning coefficients to vector temporaries. :arg builder: The :class:`LocalKernelBuilder` containing all relevant expression information. :arg declared_temps: A `dict` keeping track of all declared temporaries. This dictionary is updated as coefficients are assigned temporaries. Action computations require creating coefficient temporaries to compute the matrix-vector product. The temporaries are created by inspecting the function space of the coefficient to compute node and dof extents. The coefficient is then assigned values by looping over both the node extent and dof extent (double FOR-loop). A double FOR-loop is needed for each function space (if the function space is mixed, then a loop will be constructed for each component space). The general structure of each coefficient loop will be: FOR (i1=0; i1<node_extent; i1++): FOR (j1=0; j1<dof_extent; j1++): wT0[offset + (dof_extent * i1) + j1] = w_0_0[i1][j1] wT1[offset + (dof_extent * i1) + j1] = w_1_0[i1][j1] . . . where wT0, wT1, ... are temporaries for coefficients sharing the same node and dof extents. The offset is computed based on whether the function space is mixed. The offset is always 0 for non-mixed coefficients. If the coefficient is mixed, then the offset is incremented by the total number of nodal unknowns associated with the component spaces of the mixed space. """ statements = [ast.FlatBlock("/* Coefficient temporaries */\n")] i_sym = ast.Symbol("i1") j_sym = ast.Symbol("j1") loops = [ast.FlatBlock("/* Loops for coefficient temps */\n")] for (nodes, dofs), cinfo_list in builder.action_coefficients.items(): # Collect all coefficients which share the same node/dof extent assignments = [] for cinfo in cinfo_list: fs_i = cinfo.space_index offset = cinfo.offset_index c_shape = cinfo.shape actee = cinfo.coefficient if actee not in declared_temps: # Declare and initialize coefficient temporary c_type = eigen_matrixbase_type(shape=c_shape) t = ast.Symbol("wT%d" % len(declared_temps)) statements.append(ast.Decl(c_type, t)) statements.append(ast.FlatBlock("%s.setZero();\n" % t)) declared_temps[actee] = t # Assigning coefficient values into temporary coeff_sym = ast.Symbol(builder.coefficient(actee)[fs_i], rank=(i_sym, j_sym)) index = ast.Sum(offset, ast.Sum(ast.Prod(dofs, i_sym), j_sym)) coeff_temp = ast.Symbol(t, rank=(index, )) assignments.append(ast.Assign(coeff_temp, coeff_sym)) # Inner-loop running over dof extent inner_loop = ast.For(ast.Decl("unsigned int", j_sym, init=0), ast.Less(j_sym, dofs), ast.Incr(j_sym, 1), assignments) # Outer-loop running over node extent loop = ast.For(ast.Decl("unsigned int", i_sym, init=0), ast.Less(i_sym, nodes), ast.Incr(i_sym, 1), inner_loop) loops.append(loop) statements.extend(loops) return statements
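# Hedged sketch, with hypothetical extents, of how the offsets described in the
# docstring above accumulate for a mixed space whose component spaces have
# (node_extent, dof_extent) of (3, 2) and (4, 1): the coefficient temporary is
# one flat vector of length 3*2 + 4*1 = 10, with the first component starting
# at offset 0 and the second at offset 6.
def mixed_offsets(extents):
    offsets, offset = [], 0
    for nodes, dofs in extents:
        offsets.append(offset)
        offset += nodes * dofs
    return offsets

assert mixed_offsets([(3, 2), (4, 1)]) == [0, 6]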