def generate_loopy_kernel(slate_expr, tsfc_parameters=None):
    """Compile a Slate expression into a single loopy-backed PyOP2 kernel.

    The expression is translated with ``slate_to_gem`` and ``gem_to_loopy``,
    merged with the kernels held by a ``LocalLoopyKernelBuilder``, turned
    into C source text and wrapped in an ``op2.Kernel``.

    :arg slate_expr: the Slate expression to compile. It must provide
        ``ufl_domains()``, ``coefficients()`` and ``rank``.
    :arg tsfc_parameters: mapping of compiler parameters. It is indexed
        with ``"scalar_type"`` and forwarded to the kernel builder.
        NOTE: despite the ``None`` default, a mapping has to be supplied,
        because the value is subscripted unconditionally.
    :returns: a one-element tuple holding a ``SplitKernel``.
    :raises NotImplementedError: if the expression has more than one
        UFL domain.
    """
    started = time.time()

    if len(slate_expr.ufl_domains()) > 1:
        raise NotImplementedError("Multiple domains not implemented.")

    Citations().register("Gibson2018")

    # Translate Slate -> GEM -> loopy. The builder carries the loopy
    # kernels coming from TSFC.
    gem_expr, var2terminal = slate_to_gem(slate_expr)
    slate_loopy, output_arg = gem_to_loopy(gem_expr,
                                           var2terminal,
                                           tsfc_parameters["scalar_type"])
    builder = LocalLoopyKernelBuilder(expression=slate_expr,
                                      tsfc_parameters=tsfc_parameters)

    merged = merge_loopy(slate_loopy, output_arg, builder, var2terminal)
    # Register the inverse lookup first, then the solve lookup.
    for fn_lookup in (inv_fn_lookup, solve_fn_lookup):
        merged = loopy.register_function_id_to_in_knl_callable_mapper(
            merged, fn_lookup)

    # WORKAROUND: generate code directly from the loopy kernel here,
    # then attach the code as a C string to the op2 kernel.
    kernel_name = merged.name
    c_source = loopy.generate_code_v2(merged).device_code()
    c_source = c_source.replace(f'void {kernel_name}',
                                f'static void {kernel_name}')
    wrapped = op2.Kernel(c_source,
                         kernel_name,
                         include_dirs=BLASLAPACK_INCLUDE.split(),
                         ldargs=BLASLAPACK_LIB.split())

    kernel_info_fields = {
        "kernel": wrapped,
        # Slate can only do things as contributions to the cell integrals.
        "integral_type": "cell",
        "oriented": builder.bag.needs_cell_orientations,
        "subdomain_id": "otherwise",
        "domain_number": 0,
        "coefficient_map": tuple(range(len(slate_expr.coefficients()))),
        "needs_cell_facets": builder.bag.needs_cell_facets,
        "pass_layer_arg": builder.bag.needs_mesh_layers,
        "needs_cell_sizes": builder.bag.needs_cell_sizes,
    }
    kinfo = KernelInfo(**kernel_info_fields)

    # Slate kernels are never split, so every index slot is None.
    idx = (None,) * slate_expr.rank

    message = GREEN % "compile_slate_expression finished in %g seconds."
    logger.info(message, time.time() - started)
    return (SplitKernel(idx, kinfo),)
def generate_loopy_kernel(slate_expr, tsfc_parameters=None):
    """Compile a Slate expression into a single loopy-backed PyOP2 kernel.

    The expression is translated with ``slate_to_gem`` and ``gem_to_loopy``,
    merged with the kernels held by a ``LocalLoopyKernelBuilder`` under the
    name ``"slate_wrapper"``, and the resulting loopy object is handed
    directly to ``op2.Kernel``.

    :arg slate_expr: the Slate expression to compile. It must provide
        ``ufl_domains()``, ``coefficients()`` and ``rank``.
    :arg tsfc_parameters: mapping of compiler parameters. It is indexed
        with ``"scalar_type"`` and forwarded to the kernel builder.
        NOTE: despite the ``None`` default, a mapping has to be supplied,
        because the value is subscripted unconditionally.
    :returns: a one-element tuple holding a ``SplitKernel``.
    :raises NotImplementedError: if the expression has more than one
        UFL domain.
    """
    started = time.time()

    if len(slate_expr.ufl_domains()) > 1:
        raise NotImplementedError("Multiple domains not implemented.")

    Citations().register("Gibson2018")

    # Translate Slate -> GEM -> loopy. The builder carries the loopy
    # kernels coming from TSFC.
    gem_expr, var2terminal = slate_to_gem(slate_expr)
    slate_loopy, output_arg = gem_to_loopy(gem_expr,
                                           var2terminal,
                                           tsfc_parameters["scalar_type"])
    builder = LocalLoopyKernelBuilder(expression=slate_expr,
                                      tsfc_parameters=tsfc_parameters)

    name = "slate_wrapper"
    merged = merge_loopy(slate_loopy, output_arg, builder, var2terminal, name)
    # Register the inverse callable first, then the solve callable.
    for callable_cls in (INVCallable, SolveCallable):
        merged = loopy.register_callable(merged,
                                         callable_cls.name,
                                         callable_cls())

    wrapped = op2.Kernel(merged,
                         name,
                         include_dirs=BLASLAPACK_INCLUDE.split(),
                         ldargs=BLASLAPACK_LIB.split())

    kernel_info_fields = {
        "kernel": wrapped,
        # Slate can only do things as contributions to the cell integrals.
        "integral_type": "cell",
        "oriented": builder.bag.needs_cell_orientations,
        "subdomain_id": "otherwise",
        "domain_number": 0,
        "coefficient_map": tuple(range(len(slate_expr.coefficients()))),
        "needs_cell_facets": builder.bag.needs_cell_facets,
        "pass_layer_arg": builder.bag.needs_mesh_layers,
        "needs_cell_sizes": builder.bag.needs_cell_sizes,
    }
    kinfo = KernelInfo(**kernel_info_fields)

    # Slate kernels are never split, so every index slot is None.
    idx = (None,) * slate_expr.rank

    message = GREEN % "compile_slate_expression finished in %g seconds."
    logger.info(message, time.time() - started)
    return (SplitKernel(idx, kinfo),)
def generate_kernel(slate_expr, tsfc_parameters=None):
    """Compile a Slate expression into a kernel via ``LocalKernelBuilder``.

    A list of statements is assembled in a fixed order (terminal tensor
    temporaries, tensor assembly calls, optional coefficient temporaries,
    auxiliary expressions) and passed to ``generate_kernel_ast`` together
    with the dictionary of declared temporaries.

    :arg slate_expr: the Slate expression to compile. It must provide
        ``is_mixed``, ``ufl_domains()`` and ``rank``.
    :arg tsfc_parameters: compiler parameters forwarded unchanged to
        ``LocalKernelBuilder``.
    :returns: a one-element tuple holding a ``SplitKernel``.
    :raises NotImplementedError: if the expression is mixed, or if it has
        more than one UFL domain.
    """
    started = time.time()

    # TODO: Get PyOP2 to write into mixed dats
    if slate_expr.is_mixed:
        raise NotImplementedError("Compiling mixed slate expressions")

    if len(slate_expr.ufl_domains()) > 1:
        raise NotImplementedError("Multiple domains not implemented.")

    Citations().register("Gibson2018")

    kernel_builder = LocalKernelBuilder(expression=slate_expr,
                                        tsfc_parameters=tsfc_parameters)

    # Temporaries declared so far; shared with each helper that needs it.
    temps = {}

    # Start with the declarations of the terminal tensor temporaries ...
    body = list(terminal_temporaries(kernel_builder, temps))

    # ... followed by the assembly calls for the tensors.
    body += tensor_assembly_calls(kernel_builder)

    # Coefficient temporaries are only emitted when the builder has any.
    if kernel_builder.coefficient_vecs:
        body += coefficient_temporaries(kernel_builder, temps)

    # Auxiliary temporaries/expressions come last.
    body += auxiliary_expressions(kernel_builder, temps)

    # Generate the kernel information with the complete AST.
    kinfo = generate_kernel_ast(kernel_builder, body, temps)

    # One zero per rank of the expression fills the index slot.
    idx = (0,) * slate_expr.rank

    message = GREEN % "compile_slate_expression finished in %g seconds."
    logger.info(message, time.time() - started)
    return (SplitKernel(idx, kinfo),)