def count_flops(n):
    """Return the estimated flop count of a spectral-mode kernel of size ``n``.

    A test function on an ``n`` x ``n`` tensor-valued DG space and two
    coefficients on an ``n``-vector DG space are combined into a linear form
    made of three integrals, each a double index sum over ``i`` and ``j``.
    Algebraically the integrand is the expansion of
    ``nc * phi[i, j] * (f[i] - g[j])**2``.  The form is compiled with
    ``compile_form`` in ``'spectral'`` mode and the AST of the single
    resulting kernel is passed to ``EstimateFlops``.

    :param n: extent of the tensor (``n`` x ``n``) and vector (``n``) spaces.
    :returns: whatever ``EstimateFlops().visit`` yields for the kernel AST.
    """
    mesh = Mesh(VectorElement('CG', interval, 1))
    tensor_space = FunctionSpace(
        mesh, TensorElement('DG', interval, 1, shape=(n, n)))
    vector_space = FunctionSpace(
        mesh, VectorElement('DG', interval, 1, dim=n))
    ensemble_f = Coefficient(vector_space)
    ensemble2_f = Coefficient(vector_space)
    phi = TestFunction(tensor_space)
    i, j = indices(2)
    nc = 42  # magic number

    def integrate_double_sum(integrand):
        # Sum over i first, then over j, and integrate over the domain.
        summed_over_i = IndexSum(integrand, MultiIndex((i, )))
        return IndexSum(summed_over_i, MultiIndex((j, ))) * dx

    # The three integrals are built in the same order they are combined.
    first_square = integrate_double_sum(
        Product(nc * phi[i, j], Product(ensemble_f[i], ensemble_f[i])))
    second_square = integrate_double_sum(
        Product(nc * phi[i, j], Product(ensemble2_f[j], ensemble2_f[j])))
    cross_term = integrate_double_sum(
        2 * nc * Product(phi[i, j], Product(ensemble_f[i], ensemble2_f[j])))
    L = first_square + second_square - cross_term

    # Exactly one kernel is expected back from the form compiler.
    [kernel] = compile_form(L, parameters={'mode': 'spectral'})
    return EstimateFlops().visit(kernel.ast)
def __init__(self, node, main_loop, nest, linear_reads_costs=None):
    """Record the given node and loop information and estimate its flops.

    :param node: object stored as ``self.node``; it is also handed to
        ``EstimateFlops().visit`` to populate ``self.flops``.
    :param main_loop: stored unchanged as ``self.main_loop``.
    :param nest: stored unchanged as ``self.nest``.
    :param linear_reads_costs: stored as ``self.linear_reads_costs``; any
        falsy value (``None``, an empty mapping, ...) is replaced by a new
        ``OrderedDict``.
    """
    # A falsy argument means "start from a fresh, empty ordered mapping".
    if not linear_reads_costs:
        linear_reads_costs = OrderedDict()

    # Values supplied by the caller.
    self.node = node
    self.main_loop = main_loop
    self.nest = nest
    self.linear_reads_costs = linear_reads_costs

    # Bookkeeping state, always starting from these defaults.
    self.level = -1
    self.pushed = False
    self.readby = []

    # Flop estimate for the node, computed once at construction time.
    flop_estimator = EstimateFlops()
    self.flops = flop_estimator.visit(node)
def count_flops(form):
    """Compile ``form`` in spectral mode and return its estimated flop count.

    ``compile_form`` must return exactly one kernel; any other number makes
    the unpacking raise ``ValueError``.

    :param form: the form passed straight to ``compile_form``.
    :returns: the result of ``EstimateFlops().visit`` on the kernel's AST.
    """
    compile_options = {'mode': 'spectral'}
    [kernel] = compile_form(form, parameters=compile_options)
    flop_estimator = EstimateFlops()
    return flop_estimator.visit(kernel.ast)
def v():
    """Return a newly constructed ``EstimateFlops`` instance."""
    flop_estimator = EstimateFlops()
    return flop_estimator
def plan_cpu(self, opts):
    """Optimize this :class:`ASTKernel` for CPU execution.

    Every ``FunDecl`` found in ``self.ast`` is processed in turn: its
    expressions are grouped by enclosing loop nest, the selected
    transformations are applied to each group, the function qualifiers are
    normalized to start with ``static inline``, and the kernel is
    post-processed.  The flop estimates taken before and after the whole
    pass, together with the elapsed wall-clock time, are logged at the end.

    :param opts: a dictionary of optimizations to be applied. For a
                 description of the recognized optimizations, please refer to
                 the ``coffee.set_opt_level`` documentation. If equal to
                 ``None``, the default optimizations in
                 ``coffee.options['optimizations']`` are applied; these are
                 either the optimizations set when COFFEE was initialized or
                 those changed through a call to ``set_opt_level``. In this
                 way, a default set of optimizations is applied to all
                 kernels, but users are also allowed to select specific
                 transformations for individual kernels. When a dictionary is
                 given, only its ``'optlevel'`` entry (defaulting to ``{}``)
                 is handed to ``coffee.OptimizationLevel.retrieve``.
    :returns: ``None``. The AST is modified in place.

    .. note::
       If dead-ops elimination is requested together with ``split`` or with
       a vectorization strategy, a warning is emitted and the method returns
       immediately: the current and any remaining kernels are left without
       qualifier normalization or post-processing, and the final
       flops/time message is not logged.
    """
    start_time = time.time()

    # All function declarations in the AST are treated as kernels to optimize.
    kernels = Find(FunDecl, stop_when_found=True).visit(self.ast)[FunDecl]

    # Resolve the optimization set: global defaults when no options are
    # given, otherwise whatever the caller put under the 'optlevel' key.
    if opts is None:
        opts = coffee.OptimizationLevel.retrieve(
            coffee.options['optimizations'])
    else:
        opts = coffee.OptimizationLevel.retrieve(opts.get('optlevel', {}))

    # Baseline flop estimate, compared against the post-optimization value.
    flops_pre = EstimateFlops().visit(self.ast)

    for kernel in kernels:
        # Individual transformation switches read from the resolved options.
        rewrite = opts.get('rewrite')
        vectorize = opts.get('vectorize', (None, None))
        align_pad = opts.get('align_pad')
        split = opts.get('split')
        dead_ops_elimination = opts.get('dead_ops_elimination')

        info = visit(kernel, info_items=['decls', 'exprs'])
        # Collect expressions and related metadata, grouped by the first
        # entry of their loop nest (later unpacked as a (loop, header) pair)
        nests = defaultdict(OrderedDict)
        for stmt, expr_info in info['exprs'].items():
            parent, nest = expr_info
            if not nest:
                # Expressions with an empty nest are not optimized
                continue
            if kernel.template:
                # Templated kernels are always given the type "double"
                typ = "double"
            else:
                typ = check_type(stmt, info['decls'])
            metaexpr = MetaExpr(typ, parent, nest)
            nests[nest[0]].update({stmt: metaexpr})
        # One optimizer per group of expressions
        loop_opts = [
            CPULoopOptimizer(loop, header, exprs)
            for (loop, header), exprs in nests.items()
        ]

        # Combining certain optimizations is forbidden.
        # NOTE(review): the two early returns below leave plan_cpu entirely,
        # so later kernels are skipped and nothing is logged; confirm this is
        # intended rather than a per-kernel skip.
        if dead_ops_elimination and split:
            warn("Split forbidden with dead-ops elimination")
            return
        if dead_ops_elimination and vectorize[0]:
            warn("Vect forbidden with dead-ops elimination")
            return
        if rewrite == 'auto' and len(info['exprs']) > 1:
            # 'auto' is not applied with several expressions: warn and fall
            # back to the fixed value 4 instead of aborting
            warn("Rewrite auto forbidden with multiple exprs")
            rewrite = 4

        # Main optimization pipeline
        for loop_opt in loop_opts:
            # 0) Expression Rewriting
            if rewrite:
                loop_opt.rewrite(rewrite)

            # 1) Dead-operations elimination
            if dead_ops_elimination:
                loop_opt.eliminate_zeros()

            # 2) Code specialization
            if split:
                loop_opt.split(split)
            # The vectorizer is only set up when COFFEE has been initialized
            # and the optimizer reports at least one linear loop
            if coffee.initialized and flatten(loop_opt.expr_linear_loops):
                vect = LoopVectorizer(loop_opt, kernel)
                if align_pad:
                    # Padding and data alignment
                    vect.autovectorize()
                if vectorize[0] and vectorize[0] != VectStrategy.AUTO:
                    # Specialize vectorization for the memory access pattern
                    # of the expression
                    vect.specialize(*vectorize)

        # Ensure kernel is always marked static inline
        # Remove either or both of static and inline (so that we get the
        # order right), then re-insert them so the qualifier list starts with
        # 'static', 'inline'
        kernel.pred = [
            q for q in kernel.pred if q not in ['static', 'inline']
        ]
        kernel.pred.insert(0, 'inline')
        kernel.pred.insert(0, 'static')

        # Post processing of the AST ensures higher-quality code
        postprocess(kernel)

    # Re-estimate the flops on the transformed AST and report the outcome;
    # the message is logged as PERF_WARN when the estimate went up.
    flops_post = EstimateFlops().visit(self.ast)
    tot_time = time.time() - start_time
    output = "COFFEE finished in %g seconds (flops: %d -> %d)" % \
        (tot_time, flops_pre, flops_post)
    log(output, PERF_OK if flops_post <= flops_pre else PERF_WARN)