Exemple #1
0
    def ast_matmul(self, F_a):
        """Return the AST of a PyOP2 kernel performing a matrix-vector multiplication.

        The AST is built once per ``(ndofs, cdim, kernel name)`` triple and
        memoised in ``self.asts``; later requests with the same triple get the
        stored object back.

        :param F_a: Assembled firedrake.Function object for the RHS; only its
            ``function_space()`` is consulted here.
        :returns: the ``ast.FunDecl`` node describing the kernel.
        """
        fs = F_a.function_space()
        # Each element carries /dofs_per_elem*block/ scalar dofs
        dofs_per_elem = sum(fs.topological.dofs_per_entity)
        block = fs.dim
        kernel_name = 'mat_vec_mul_kernel_%s' % fs.name

        key = (dofs_per_elem, block, kernel_name)
        if key in self.asts:
            return self.asts[key]

        nrows = dofs_per_elem*block

        def new_c_entry():
            # Built anew for every use so that the zeroing statement and the
            # increment statement do not share a single node
            return ast.Symbol('C', ('i/%d' % block, 'i%%%d' % block))

        # Innermost statement: C[..] += A[..] * B[j][k]
        incr_target = new_c_entry()
        a_entry = ast.Symbol('A', ('i',), ((nrows, 'j*%d + k' % block),))
        b_entry = ast.Symbol('B', ('j', 'k'))
        accumulate = ast.Incr(incr_target, ast.Prod(a_entry, b_entry))

        # Wrap it in the k, j and i loops; C[..] is zeroed once per i iteration
        k_loop = ast.c_for('k', block, accumulate).children[0]
        zero_out = ast.Assign(new_c_entry(), '0.0')
        j_loop = ast.c_for('j', dofs_per_elem, k_loop).children[0]
        i_loop = ast.c_for('i', nrows, [zero_out, j_loop]).children[0]

        kernel_args = [ast.Decl('double*', 'A'),
                       ast.Decl('double**', 'B'),
                       ast.Decl('double**', 'C')]
        kernel = ast.FunDecl('void', kernel_name, kernel_args,
                             ast.Root([i_loop]), ['static', 'inline'])

        # Remember the AST so the next identical request is a plain lookup
        self.asts[key] = kernel

        return kernel
Exemple #2
0
def _tabulate_tensor(ir, parameters):
    """Generate code for a single integral (tabulate_tensor()).

    The function assembles, in this order: a flat block of C source with the
    geometry ("jacobi") code for the integral type, static declarations of
    the quadrature weights and of the tabulated basis functions that were
    actually used, and the loop nest produced by _generate_element_tensor.

    :param ir: intermediate representation of the integral. Keys read here:
        "optimise_parameters", "integral_type", "cell", "num_facets",
        "num_vertices", "trans_integrals", "geo_consts", "needs_oriented",
        "quadrature_weights", "name_map", "unique_tables" and, for the
        "pyop2" format only, "coefficient_names" / "coefficient_elements".
    :param parameters: code generation parameters; "format" and "precision"
        are read here and the whole mapping is forwarded to the helpers.
    :returns: a pyop2.Root whose children are the geometry FlatBlock, the
        weight and basis table declarations, then the nodes returned by
        _generate_element_tensor.
    """

    p_format        = parameters["format"]
    precision       = parameters["precision"]

    # NOTE(review): within this function only f_weight is referenced below;
    # the other lookups look unused, but they still raise KeyError when a key
    # (or p_format in "element tensor") is missing -- confirm before removing.
    f_comment       = format["comment"]
    f_G             = format["geometry constant"]
    f_const_double  = format["assign"]
    f_float         = format["float"]
    f_assign        = format["assign"]
    f_A             = format["element tensor"][p_format]
    f_r             = format["free indices"][0]
    f_j             = format["first free index"]
    f_k             = format["second free index"]
    f_loop          = format["generate loop"]
    f_int           = format["int"]
    f_weight        = format["weight"]

    # Get data.
    # NOTE(review): num_facets and geo_consts are not used further down.
    opt_par     = ir["optimise_parameters"]
    integral_type = ir["integral_type"]
    cell        = ir["cell"]
    gdim        = cell.geometric_dimension()
    tdim        = cell.topological_dimension()
    num_facets  = ir["num_facets"]
    num_vertices= ir["num_vertices"]
    integrals   = ir["trans_integrals"]
    geo_consts  = ir["geo_consts"]
    oriented    = ir["needs_oriented"]

    # Create sets of used variables. They are handed to
    # _generate_element_tensor as /sets/ and read back afterwards
    # (presumably filled in by that helper -- TODO confirm).
    used_weights    = set()
    used_psi_tables = set()
    used_nzcs       = set()
    trans_set       = set()
    sets = [used_weights, used_psi_tables, used_nzcs, trans_set]

    affine_tables = {} # TODO: This is not populated anywhere, remove?
    quadrature_weights = ir["quadrature_weights"]

    # The pyop2 format requires dereferencing constant coefficients since
    # these are passed in as double *
    # /common/ collects lines of C source that are emitted ahead of the
    # geometry code.
    common = []
    if p_format == "pyop2":
        for n, c in zip(ir["coefficient_names"], ir["coefficient_elements"]):
            if c.family() == 'Real':
                # Second index is always? 0, so we cast to (double (*)[1]).
                # The first character of the coefficient name is dropped when
                # building the w<...> / c<...> identifiers.
                common += ['double (*w%(n)s)[1] = (double (*)[1])c%(n)s;\n' %
                           {'n': n[1:]}]

    # Every branch below defines /nest_ir/ and /jacobi_code/ and appends the
    # operation count to /operations/.
    operations = []
    if integral_type == "cell":
        # Update transformer with facets and generate code + set of used geometry terms.
        nest_ir, num_ops = _generate_element_tensor(integrals, sets, \
                                                    opt_par, parameters)

        # Set operations equal to num_ops (for printing info on operations).
        operations.append([num_ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code  = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        if oriented and tdim != gdim:
            # NEED TO THINK ABOUT THIS FOR EXTRUSION
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"
        jacobi_code += format["scale factor snippet"][p_format]

        # Generate code for cell volume and circumradius -- note that the
        # former will be incorrect on extruded meshes by a constant factor.
        jacobi_code += "\n\n" + format["generate cell volume"][p_format](tdim, gdim, integral_type)
        jacobi_code += "\n\n" + format["generate circumradius"][p_format](tdim, gdim, integral_type)

    elif integral_type in ("exterior_facet", "exterior_facet_vert"):
        if p_format == 'pyop2':
            # The emitted C dereferences the facet_p pointer into a local.
            common += ["unsigned int facet = *facet_p;\n"]

        # Generate tensor code for facets + set of used geometry terms.
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code  = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        if oriented and tdim != gdim:
            # NEED TO THINK ABOUT THIS FOR EXTRUSION
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"
        if integral_type == "exterior_facet":
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type)
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
            jacobi_code += "\n\n" + format["generate facet area"](tdim, gdim)
            if tdim == 3:
                jacobi_code += "\n\n" + format["generate min facet edge length"](tdim, gdim)
                jacobi_code += "\n\n" + format["generate max facet edge length"](tdim, gdim)

            # Generate code for cell volume and circumradius
            jacobi_code += "\n\n" + format["generate cell volume"][p_format](tdim, gdim, integral_type)
            jacobi_code += "\n\n" + format["generate circumradius"][p_format](tdim, gdim, integral_type)

        elif integral_type == "exterior_facet_vert":
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type)
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
            # OTHER THINGS NOT IMPLEMENTED YET
        else:
            # Cannot be reached given the membership test on the enclosing
            # elif; kept as a guard.
            raise RuntimeError("Invalid integral_type")

    elif integral_type in ("exterior_facet_top", "exterior_facet_bottom"):
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code  = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        # Unlike the branches above, orientation code is emitted here whenever
        # /oriented/ is set, without the tdim != gdim test.
        if oriented:
            # NEED TO THINK ABOUT THIS FOR EXTRUSION
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"
        jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type)
        jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
        # THE REST IS NOT IMPLEMENTED YET

    elif integral_type in ("interior_facet", "interior_facet_vert"):
        if p_format == 'pyop2':
            # Split the facet numbers and the coordinate data into the parts
            # belonging to each of the two cells sharing the facet.
            common += ["unsigned int facet_0 = facet_p[0];"]
            common += ["unsigned int facet_1 = facet_p[1];"]
            common += ["double **coordinate_dofs_0 = coordinate_dofs;"]
            # Note that the following line is unsafe for isoparametric elements.
            common += ["double **coordinate_dofs_1 = coordinate_dofs + %d;" % num_vertices]

        # Generate tensor code for facets + set of used geometry terms.
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        # The Jacobian code is generated twice, once per restriction.
        jacobi_code  = ""
        for _r in ["+", "-"]:
            if p_format == "pyop2":
                jacobi_code += format["compute_jacobian_interior"](cell, r=_r)
            else:
                jacobi_code += format["compute_jacobian"](cell, r=_r)

            jacobi_code += "\n"
            jacobi_code += format["compute_jacobian_inverse"](cell, r=_r)
            if oriented and tdim != gdim:
                # NEED TO THINK ABOUT THIS FOR EXTRUSION
                jacobi_code += format["orientation"][p_format](tdim, gdim, r=_r)
            jacobi_code += "\n"

        if integral_type == "interior_facet":
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type, r="+")
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)

            jacobi_code += "\n\n" + format["generate facet area"](tdim, gdim)
            if tdim == 3:
                jacobi_code += "\n\n" + format["generate min facet edge length"](tdim, gdim, r="+")
                jacobi_code += "\n\n" + format["generate max facet edge length"](tdim, gdim, r="+")

            # Generate code for cell volume and circumradius
            jacobi_code += "\n\n" + format["generate cell volume"][p_format](tdim, gdim, integral_type)
            jacobi_code += "\n\n" + format["generate circumradius interior"](tdim, gdim, integral_type)

        elif integral_type == "interior_facet_vert":
            # THE REST IS NOT IMPLEMENTED YET
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type, r="+")
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
        else:
            # Cannot be reached given the membership test on the enclosing
            # elif; kept as a guard.
            raise RuntimeError("Invalid integral_type")

    elif integral_type == "interior_facet_horiz":
        # NOTE(review): these lines are added regardless of p_format, unlike
        # the interior_facet branch above -- confirm this is intended.
        common += ["double **coordinate_dofs_0 = coordinate_dofs;"]
        # Note that the following line is unsafe for isoparametric elements.
        common += ["double **coordinate_dofs_1 = coordinate_dofs + %d;" % num_vertices]

        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code  = ""
        for _r in ["+", "-"]:
            jacobi_code += format["compute_jacobian_interior"](cell, r=_r)
            jacobi_code += "\n"
            jacobi_code += format["compute_jacobian_inverse"](cell, r=_r)
            if oriented:
                # NEED TO THINK ABOUT THIS FOR EXTRUSION
                jacobi_code += format["orientation"][p_format](tdim, gdim, r=_r)
            jacobi_code += "\n"

        # TODO: verify that this is correct (we think it is)
        jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type, r="+")
        jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
        # THE REST IS NOT IMPLEMENTED YET

    elif integral_type == "point":
        # Update transformer with vertices and generate code + set of used geometry terms.
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code  = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        if oriented and tdim != gdim:
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"

    else:
        # NOTE(review): if error() returns instead of raising, jacobi_code and
        # nest_ir are unbound in the code below -- confirm error() always raises.
        error("Unhandled integral type: " + str(integral_type))

    # Embedded manifold, need to pass in cell orientations
    if oriented and tdim != gdim and p_format == 'pyop2':
        if integral_type in ("interior_facet", "interior_facet_vert", "interior_facet_horiz"):
            # One orientation variable per restriction; the name suffix comes
            # from _choose_map.
            common += ["const int cell_orientation%s = cell_orientation_[0][0];" % _choose_map('+'),
                       "const int cell_orientation%s = cell_orientation_[1][0];" % _choose_map('-')]
        else:
            common += ["const int cell_orientation = cell_orientation_[0][0];"]
    # After we have generated the element code for all facets we can remove
    # the unused transformations and tabulate the used psi tables and weights.
    common += [remove_unused(jacobi_code, trans_set)]
    jacobi_ir = pyop2.FlatBlock("\n".join(common))

    # @@@: const double W3[3] = {{...}}
    # One static const declaration per used quadrature rule; each entry of
    # quadrature_weights unpacks into a (weights, points) pair.
    pyop2_weights = []
    for weights, points in [quadrature_weights[p] for p in used_weights]:
        n_points = len(points)
        # A single point gives a symbol with empty rank, otherwise rank (n_points,)
        w_sym = pyop2.Symbol(f_weight(n_points), () if n_points == 1 else (n_points,))
        pyop2_weights.append(pyop2.Decl("double", w_sym,
                                        pyop2.ArrayInit(weights, precision),
                                        qualifiers=["static", "const"]))

    name_map = ir["name_map"]
    tables = ir["unique_tables"]
    # Note that this updates ir["unique_tables"] in place (a no-op while
    # affine_tables stays empty).
    tables.update(affine_tables) # TODO: This is not populated anywhere, remove?

    # @@@: const double FE0[] = {{...}}
    # NOTE(review): /code/ is not used below; only /decl/ is consumed.
    code, decl = _tabulate_psis(tables, used_psi_tables, name_map, used_nzcs, opt_par, parameters)
    pyop2_basis = []
    for name, data in decl.items():
        rank, _, values = data
        zeroflags = values.get_zeros()
        feo_sym = pyop2.Symbol(name, rank)
        init = pyop2.ArrayInit(values, precision)
        # Switch to a sparse initializer when zero flags are available and at
        # least one of them is unset.
        if zeroflags is not None and not zeroflags.all():
            nz_indices = numpy.logical_not(zeroflags).nonzero()
            # Note: in the following, we take the last entry of /nz_indices/ since we /know/
            # we have been tracking only zero-valued columns
            nz_indices = nz_indices[-1]
            # Each leading dimension contributes [(size, 0)]; the last one
            # contributes [(span of the flagged-nonzero indices, first such index)].
            nz_bounds = tuple([(i, 0)] for i in rank[:-1])
            nz_bounds += ([(max(nz_indices) - min(nz_indices) + 1, min(nz_indices))],)
            init = pyop2.SparseArrayInit(values, precision, nz_bounds)
        pyop2_basis.append(pyop2.Decl("double", feo_sym, init, ["static", "const"]))

    # Build the root of the PyOP2' ast
    pyop2_tables = pyop2_weights + [tab for tab in pyop2_basis]
    root = pyop2.Root([jacobi_ir] + pyop2_tables + nest_ir)

    return root
Exemple #3
0
def build_hard_fusion_kernel(base_loop, fuse_loop, fusion_map, loop_chain_index):
    """
    Build AST and :class:`Kernel` for two loops suitable to hard fusion.

    The AST consists of three functions: fusion, base, fuse. base and fuse
    are respectively the ``base_loop`` and the ``fuse_loop`` kernels, whereas
    fusion is the orchestrator that invokes, for each ``base_loop`` iteration,
    base and, if still to be executed, fuse.

    The orchestrator has the following structure: ::

        fusion (buffer, ..., executed):
            base (buffer, ...)
            for i = 0 to arity:
                if not executed[i]:
                    additional pointer staging required by kernel2
                    fuse (sub_buffer, ...)
                    insertion into buffer

    The executed array tracks whether the i-th iteration (out of /arity/)
    adjacent to the main kernel1 iteration has been executed.

    :param base_loop: the loop whose ``kernel`` provides the /base/ function.
    :param fuse_loop: the loop whose ``kernel`` provides the /fuse/ function.
    :param fusion_map: map whose ``arity`` gives the number of /fuse/
        invocations emitted per /base/ invocation.
    :param loop_chain_index: forwarded unchanged to the :class:`Kernel`
        constructor.
    :returns: a 2-tuple ``(kernel, fargs)``; ``fargs`` maps positions in the
        /fusion/ argument list to ``('postponed', False)`` for arguments
        gathered inside /fusion/, plus one ``('onlymap', True)`` entry.
    """

    def init(values):
        # Render an iterable as a C brace initializer, e.g. [0.0] -> '{0.0}'
        return '{%s}' % ', '.join([str(j) for j in values])

    finder = Find((ast.FunDecl, ast.PreprocessNode))

    # Work on copies of the kernels' ASTs: both are modified below
    base = base_loop.kernel
    base_ast = dcopy(base._ast)
    base_info = finder.visit(base_ast)
    base_headers = base_info[ast.PreprocessNode]
    base_fundecl = base_info[ast.FunDecl]
    assert len(base_fundecl) == 1
    base_fundecl = base_fundecl[0]

    fuse = fuse_loop.kernel
    fuse_ast = dcopy(fuse._ast)
    fuse_info = finder.visit(fuse_ast)
    fuse_headers = fuse_info[ast.PreprocessNode]
    fuse_fundecl = fuse_info[ast.FunDecl]
    assert len(fuse_fundecl) == 1
    fuse_fundecl = fuse_fundecl[0]

    # Create /fusion/ arguments and signature
    body = ast.Block([])
    fusion_name = '%s_%s' % (base_fundecl.name, fuse_fundecl.name)
    fusion_args = dcopy(base_fundecl.args + fuse_fundecl.args)
    fusion_fundecl = ast.FunDecl(base_fundecl.ret, fusion_name, fusion_args, body)

    # Make sure kernel and variable names are unique
    base_fundecl.name = "%s_base" % base_fundecl.name
    fuse_fundecl.name = "%s_fuse" % fuse_fundecl.name
    for i, decl in enumerate(fusion_args):
        decl.sym.symbol += '_%d' % i

    # Filter out duplicate arguments, and append extra arguments to the fundecl
    binding = WeakFilter().kernel_args([base_loop, fuse_loop], fusion_fundecl)
    fusion_args += [ast.Decl('int*', 'executed'),
                    ast.Decl('int*', 'fused_iters'),
                    ast.Decl('int', 'i')]

    # Which args are actually used in /fuse/, but not in /base/ ? The gather for
    # such arguments is moved to /fusion/, to avoid useless memory LOADs
    base_dats = set(a.data for a in base_loop.args)
    fuse_dats = set(a.data for a in fuse_loop.args)
    fuse_only_dats = fuse_dats - base_dats
    unshared = OrderedDict()
    for arg, decl in binding.items():
        if arg.data in fuse_only_dats:
            unshared.setdefault(decl, arg)

    # Track position of Args that need a postponed gather
    # Can't track Args themselves as they change across different parloops
    fargs = {fusion_args.index(decl): ('postponed', False) for decl in unshared}
    fargs[len(set(binding.values()))] = ('onlymap', True)

    # Add maps for arguments that need a postponed gather
    for decl, arg in unshared.items():
        decl_pos = fusion_args.index(decl)
        fusion_args[decl_pos].sym.symbol = arg.c_arg_name()
        if arg._is_indirect:
            fusion_args[decl_pos].sym.rank = ()
            fusion_args.insert(decl_pos + 1, ast.Decl('int*', arg.c_map_name(0, 0)))

    # Append the invocation of /base/; then, proceed with the invocation
    # of the /fuse/ kernels
    base_funcall_syms = [binding[a].sym.symbol for a in base_loop.args]
    body.children.append(ast.FunCall(base_fundecl.name, *base_funcall_syms))

    for idx in range(fusion_map.arity):

        # i = fused_iters[idx]; if (!executed[i]) { fuse(...); executed[i] = 1; }
        fused_iter = ast.Assign('i', ast.Symbol('fused_iters', (idx,)))
        fuse_funcall = ast.FunCall(fuse_fundecl.name)
        if_cond = ast.Not(ast.Symbol('executed', ('i',)))
        if_update = ast.Assign(ast.Symbol('executed', ('i',)), 1)
        if_body = ast.Block([fuse_funcall, if_update], open_scope=True)
        if_exec = ast.If(if_cond, [if_body])
        body.children.extend([ast.FlatBlock('\n'), fused_iter, if_exec])

        # Modify the /fuse/ kernel
        # This is to take into account that many arguments are shared with
        # /base/, so they will only staged once for /base/. This requires
        # tweaking the way the arguments are declared and accessed in /fuse/.
        # For example, the shared incremented array (called /buffer/ in
        # the pseudocode in the comment above) now needs to take offsets
        # to be sure the locations that /base/ is supposed to increment are
        # actually accessed. The same concept apply to indirect arguments.
        for fuse_loop_arg in fuse_loop.args:
            fuse_kernel_arg = binding[fuse_loop_arg]

            buffer_name = '%s_vec' % fuse_kernel_arg.sym.symbol
            fuse_funcall_sym = ast.Symbol(buffer_name)

            # What kind of temporaries do we need ?
            # NOTE(review): for access modes other than INC and READ the
            # names below keep whatever the previous argument set (or are
            # unbound for the first one) -- confirm no other mode reaches
            # the staging branches.
            if fuse_loop_arg.access == INC:
                # Staged *after* the call: buffer contents are added to the arg
                op, lvalue, rvalue = ast.Incr, fuse_kernel_arg.sym.symbol, buffer_name
                stager = lambda b, l: b.children.extend(l)
                indexer = lambda indices: [(k, j) for j, k in enumerate(indices)]
                pointers = []
            elif fuse_loop_arg.access == READ:
                # Staged *before* the call: the arg is copied into the buffer
                op, lvalue, rvalue = ast.Assign, buffer_name, fuse_kernel_arg.sym.symbol
                stager = lambda b, l: [b.children.insert(0, j) for j in reversed(l)]
                indexer = lambda indices: [(j, k) for j, k in enumerate(indices)]
                pointers = list(fuse_kernel_arg.pointers)

            # Now gonna handle arguments depending on their type and rank ...

            if fuse_loop_arg._is_global:
                # ... Handle global arguments. These can be dropped in the
                # kernel without any particular fiddling
                fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol)

            elif fuse_kernel_arg in unshared:
                # ... Handle arguments that appear only in /fuse/
                # Each line of the vec-init code has the form "lhs = rhs";
                # only the right-hand sides are reused here
                staging = unshared[fuse_kernel_arg].c_vec_init(False).split('\n')
                rvalues = [ast.FlatBlock(j.split('=')[1]) for j in staging]
                lvalues = [ast.Symbol(buffer_name, (j,)) for j in range(len(staging))]
                staging = [ast.Assign(j, k) for j, k in zip(lvalues, rvalues)]

                # Set up the temporary
                buffer_symbol = ast.Symbol(buffer_name, (len(staging),))
                buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol,
                                       qualifiers=fuse_kernel_arg.qual,
                                       pointers=list(pointers))

                # Update the if-then AST body
                stager(if_exec.children[0], staging)
                if_exec.children[0].children.insert(0, buffer_decl)

            elif fuse_loop_arg._is_mat:
                # ... Handle Mats
                staging = []
                # The block shape is read from the /fuse/ loop argument (the
                # name previously used here was not defined in this function)
                for b in fuse_loop_arg._block_shape:
                    for rc in b:
                        # Keep /lvalue/ and /rvalue/ bound to the plain names:
                        # rebinding them to Symbols would nest a Symbol inside
                        # a Symbol from the second block onwards
                        mat_lvalue = ast.Symbol(lvalue, (idx, idx),
                                                ((rc[0], 'j'), (rc[1], 'k')))
                        mat_rvalue = ast.Symbol(rvalue, ('j', 'k'))
                        # NOTE(review): /staging/ is overwritten on every
                        # block, so only the last block is staged -- confirm
                        # whether accumulation was intended.
                        staging = ItSpace(mode=0).to_for([(0, rc[0]), (0, rc[1])],
                                                         ('j', 'k'),
                                                         [op(mat_lvalue, mat_rvalue)])[:1]

                # Set up the temporary
                buffer_symbol = ast.Symbol(buffer_name, (fuse_kernel_arg.sym.rank,))
                buffer_init = ast.ArrayInit(init([init([0.0])]))
                buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, buffer_init,
                                       qualifiers=fuse_kernel_arg.qual, pointers=pointers)

                # Update the if-then AST body
                stager(if_exec.children[0], staging)
                if_exec.children[0].children.insert(0, buffer_decl)

            elif fuse_loop_arg._is_indirect:
                cdim = fuse_loop_arg.data.cdim

                if cdim == 1 and fuse_kernel_arg.sym.rank:
                    # [Special case]
                    # ... Handle rank 1 indirect arguments that appear in both
                    # /base/ and /fuse/: just point into the right location
                    rank = (idx,) if fusion_map.arity > 1 else ()
                    fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol, rank)

                else:
                    # ... Handle indirect arguments. At the C level, these arguments
                    # are of pointer type, so simple pointer arithmetic is used
                    # to ensure the kernel accesses are to the correct locations
                    fuse_arity = fuse_loop_arg.map.arity
                    base_arity = fuse_arity*fusion_map.arity
                    size = fuse_arity*cdim

                    # Set the proper storage layout before invoking /fuse/
                    ofs_vals = [[base_arity*j + k for k in range(fuse_arity)]
                                for j in range(cdim)]
                    ofs_vals = [[fuse_arity*j + k for k in flatten(ofs_vals)]
                                for j in range(fusion_map.arity)]
                    ofs_vals = list(flatten(ofs_vals))
                    # Offsets relevant to the idx-th /fuse/ invocation
                    indices = [ofs_vals[idx*size + j] for j in range(size)]

                    staging = [op(ast.Symbol(lvalue, (j,)), ast.Symbol(rvalue, (k,)))
                               for j, k in indexer(indices)]

                    # Set up the temporary
                    buffer_symbol = ast.Symbol(buffer_name, (size,))
                    if fuse_loop_arg.access == INC:
                        buffer_init = ast.ArrayInit(init([0.0]))
                    else:
                        buffer_init = ast.EmptyStatement()
                        pointers.pop()
                    buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, buffer_init,
                                           qualifiers=fuse_kernel_arg.qual,
                                           pointers=pointers)

                    # Update the if-then AST body
                    stager(if_exec.children[0], staging)
                    if_exec.children[0].children.insert(0, buffer_decl)

            else:
                # Nothing special to do for direct arguments
                pass

            # Finally update the /fuse/ funcall
            fuse_funcall.children.append(fuse_funcall_sym)

    # Drop duplicate headers while keeping the order in which they first
    # appear, so that the generated code does not depend on set ordering
    fused_headers = list(OrderedDict.fromkeys(str(h) for h in base_headers + fuse_headers))
    fused_ast = ast.Root([ast.PreprocessNode(h) for h in fused_headers] +
                         [base_fundecl, fuse_fundecl, fusion_fundecl])

    return Kernel([base, fuse], fused_ast, loop_chain_index), fargs