def generate_definition(kernel_info, target='cpu'):
    """Generates the definition (i.e. implementation) of the kernel function.

    Args:
        kernel_info: object whose ``ast`` attribute holds the kernel AST
        target: ``'gpu'`` selects the ``'cuda'`` dialect, any other value the ``'c'`` dialect

    Returns:
        the generated code, prefixed with ``static`` and wrapped in a
        ``namespace internal_<function_name>`` block
    """
    kernel_ast = kernel_info.ast
    if target == 'gpu':
        dialect = 'cuda'
    else:
        dialect = 'c'
    kernel_code = generate_c(kernel_ast, dialect=dialect)
    template = "namespace internal_%s {\nstatic %s\n}"
    return template % (kernel_ast.function_name, kernel_code)
def write_to_file(self, restrict_qualifier, function_prefix, file):
    """Write the C source of the extension module to ``file``.

    Output order: ``#include`` lines, the ``RESTRICT`` / ``FUNC_PREFIX`` macro
    definitions, then for every registered AST the generated kernel code
    followed by the result of ``create_function_boilerplate_code``, and
    finally the result of ``create_module_boilerplate_code``.

    Args:
        restrict_qualifier: text the ``RESTRICT`` macro is defined as
        function_prefix: text the ``FUNC_PREFIX`` macro is defined as
        file: text stream handed to ``print(..., file=file)``
    """
    headers = {'<math.h>', '<stdint.h>'}
    for ast in self._ast_nodes:
        headers.update(get_headers(ast))
    header_list = sorted(headers)
    # "Python.h" is always emitted first, ahead of the sorted headers.
    header_list.insert(0, '"Python.h"')

    includes = "\n".join(["#include %s" % (include_file, ) for include_file in header_list])
    print(includes, file=file)
    print("\n", file=file)
    print("#define RESTRICT %s" % (restrict_qualifier, ), file=file)
    print("#define FUNC_PREFIX %s" % (function_prefix, ), file=file)
    print("\n", file=file)

    for ast, name in zip(self._ast_nodes, self._function_names):
        # The AST is renamed only while its code is printed. The finally
        # clause guarantees the caller's AST gets its name back even when
        # code generation or writing fails.
        old_name = ast.function_name
        ast.function_name = "kernel_" + name
        try:
            print(generate_c(ast, custom_backend=self._custom_backend), file=file)
            print(create_function_boilerplate_code(ast.get_parameters(), name), file=file)
        finally:
            ast.function_name = old_name
    print(create_module_boilerplate_code(self.module_name, self._function_names), file=file)
def expr_to_dict(expr_or_node: "NodeOrExpr", with_c_code=True, full_class_names=False):
    """Converts a SymPy expression to a serializable dict (mainly for debugging purposes)

    The dict recursively contains all args of the expression as ``dict``s

    See :func:`.write_json`

    Args:
        expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
        with_c_code (bool, optional): include C representation of the nodes
        full_class_names (bool, optional): use full class names (``type(object)``
            instead of ``type(object).__name__``)

    Returns:
        dict with the key ``'str'``, optionally ``'c'``, and one entry per
        argument keyed by the argument's class name. Arguments of the same
        class share a key, so only the last one of them is kept.
    """
    result = {'str': str(expr_or_node)}
    if with_c_code:
        # Best effort: the 'c' entry is simply omitted when neither printer
        # can handle the node.
        try:
            result['c'] = generate_c(expr_or_node)
        except Exception:
            try:
                result['c'] = CustomSympyPrinter().doprint(expr_or_node)
            except Exception:
                pass
    for arg in expr_or_node.args:
        arg_class = arg.__class__
        key = str(arg_class if full_class_names else arg_class.__name__)
        # Forward both options so nested nodes are rendered like the root.
        result[key] = expr_to_dict(arg, with_c_code=with_c_code, full_class_names=full_class_names)
    return result
def generate_declaration(kernel_info, target='cpu'):
    """Generates the declaration of the kernel function.

    Args:
        kernel_info: object whose ``ast`` attribute holds the kernel AST
        target: ``'gpu'`` selects the ``'cuda'`` dialect, any other value the ``'c'`` dialect

    Returns:
        the kernel signature terminated by ``;`` and wrapped in a
        ``namespace internal_<function_name>`` block
    """
    kernel_ast = kernel_info.ast
    if target == 'gpu':
        dialect = 'cuda'
    else:
        dialect = 'c'
    signature = generate_c(kernel_ast, signature_only=True, dialect=dialect)
    declaration = signature + ";"
    template = "namespace internal_%s {\n%s\n}"
    return template % (kernel_ast.function_name, declaration)
def generate_opencl(astnode: Node, signature_only: bool = False) -> str:
    """Prints an abstract syntax tree node (made for target 'gpu') as OpenCL code.

    Thin wrapper that forwards to ``generate_c`` with ``dialect='opencl'``.

    Args:
        astnode: KernelFunction node to generate code for
        signature_only: if True only the signature is printed

    Returns:
        C-like code for the ast node and its descendants
    """
    opencl_code = generate_c(astnode, signature_only, dialect='opencl')
    return opencl_code
def generate_cuda(astnode: Node, signature_only: bool = False) -> str:
    """Prints an abstract syntax tree node as CUDA code.

    Thin wrapper that forwards to ``generate_c`` with ``dialect='cuda'``.

    Args:
        astnode: KernelFunction node to generate code for
        signature_only: if True only the signature is printed

    Returns:
        C-like code for the ast node and its descendants
    """
    cuda_code = generate_c(astnode, signature_only, dialect='cuda')
    return cuda_code
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
    """Return C code of a benchmark program for the given kernel.

    Args:
        ast: the pystencils AST object as returned by create_kernel
        likwid: if True likwid markers are added to the code
        openmp: relevant only if likwid=True, to generated correct likwid initialization code
        timing: add timing output to the code, prints time per iteration to stdout

    Returns:
        C code as string

    Note:
        With ``likwid`` and ``openmp`` both set, the pragma line of a leading
        ``PragmaBlock`` is blanked only while the kernel code is generated and
        is put back afterwards, so the AST passed in is left unchanged.
    """
    accessed_fields = {f.name: f for f in ast.fields_accessed}
    constants = []
    fields = []
    call_parameters = []
    for p in ast.get_parameters():
        if not p.is_field_parameter:
            constants.append((p.symbol.name, str(p.symbol.dtype)))
            call_parameters.append(p.symbol.name)
        else:
            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
            field = accessed_fields[p.field_name]
            dtype = str(get_base_type(p.symbol.dtype))
            fields.append((p.field_name, dtype, prod(field.shape)))
            call_parameters.append(p.field_name)

    header_list = get_headers(ast)
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])

    # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
    # when likwid and openmp are enabled
    pragma_block = None
    if likwid and openmp:
        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
            pragma_block = ast.body.args[0]

    if pragma_block is None:
        kernel_code = generate_c(ast, dialect='c')
    else:
        # Blank the pragma only for this one code generation; restoring it
        # keeps later uses of the same AST from silently losing the pragma.
        saved_pragma_line = pragma_block.pragma_line
        pragma_block.pragma_line = ''
        try:
            kernel_code = generate_c(ast, dialect='c')
        finally:
            pragma_block.pragma_line = saved_pragma_line

    args = {
        'likwid': likwid,
        'openmp': openmp,
        'kernel_code': kernel_code,
        'kernelName': ast.function_name,
        'fields': fields,
        'constants': constants,
        'call_argument_list': ",".join(call_parameters),
        'includes': includes,
        'timing': timing,
    }
    return benchmark_template.render(**args)
def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
    """Prints an abstract syntax tree node as CUDA code.

    Thin wrapper that forwards to ``generate_c`` with ``dialect=Backend.CUDA``.

    Args:
        ast_node: ast representation of kernel
        signature_only: generate signature without function body
        custom_backend: use own custom printer for code generation
        with_globals: enable usage of global variables

    Returns:
        CUDA code for the ast node and its descendants
    """
    cuda_code = generate_c(
        ast_node,
        signature_only,
        dialect=Backend.CUDA,
        custom_backend=custom_backend,
        with_globals=with_globals,
    )
    return cuda_code
def create_code_string(self, restrict_qualifier, function_prefix):
    """Build the C source of the extension module and its content hash.

    Stores the source in ``self._code_string`` and the module name
    (``"mod_"`` plus a SHA-256 hex digest) in ``self._code_hash``. The digest
    covers the generated code and the contents of every included header that
    exists in the ``include`` directory next to this package.

    Args:
        restrict_qualifier: text the ``RESTRICT`` macro is defined as
        function_prefix: text the ``FUNC_PREFIX`` macro is defined as
    """
    self._code_string = str()

    headers = {'<math.h>', '<stdint.h>'}
    for ast in self._ast_nodes:
        headers.update(get_headers(ast))
    header_list = sorted(headers)
    # "Python.h" is always emitted first, ahead of the sorted headers.
    header_list.insert(0, '"Python.h"')

    # Hash the contents of the headers found in the local include directory.
    # h[1:-1] strips the surrounding <...> or "..." from the include name.
    include_dir = os.path.join(os.path.dirname(__file__), '..', 'include')
    header_digests = []
    for h in header_list:
        header_path = os.path.join(include_dir, h[1:-1])
        if os.path.exists(header_path):
            # The with-block closes the file handle deterministically.
            with open(header_path, 'rb') as header_file:
                header_digests.append(hashlib.sha256(header_file.read()).digest())
    header_hash = b''.join(header_digests)

    includes = "\n".join([f"#include {include_file}" for include_file in header_list])
    parts = [
        includes,
        "\n",
        f"#define RESTRICT {restrict_qualifier} \n",
        f"#define FUNC_PREFIX {function_prefix}",
        "\n",
    ]

    for ast, name in zip(self._ast_nodes, self._function_names):
        # The AST is renamed only while its code is generated. The finally
        # clause restores the original name even if generation fails.
        old_name = ast.function_name
        ast.function_name = f"kernel_{name}"
        try:
            parts.append(generate_c(ast, custom_backend=self._custom_backend))
            parts.append(create_function_boilerplate_code(ast.get_parameters(), name, ast))
        finally:
            ast.function_name = old_name

    self._code_string = "".join(parts)
    # The hash is taken before the module boilerplate is appended, because
    # that boilerplate itself embeds the hash as the module name.
    self._code_hash = "mod_" + hashlib.sha256(self._code_string.encode() + header_hash).hexdigest()
    self._code_string += create_module_boilerplate_code(self._code_hash, self._function_names)
def compile_and_load(ast, custom_backend=None):
    """Compile the kernel AST into an extension module and load the kernel from it.

    The module name is ``"mod_"`` plus the SHA-256 hex digest of the generated
    C code. If the cache configuration's ``'object_cache'`` entry is ``False``
    the module is built and loaded inside a temporary directory; otherwise
    that entry is used as the build directory.

    Args:
        ast: kernel AST to compile
        custom_backend: forwarded to ``generate_c`` and ``ExtensionModuleCode``

    Returns:
        a ``KernelWrapper`` around the loaded kernel
    """
    cache_config = get_cache_config()

    kernel_source = generate_c(ast, dialect='c', custom_backend=custom_backend)
    code_hash_str = "mod_" + hashlib.sha256(kernel_source.encode()).hexdigest()

    code = ExtensionModuleCode(module_name=code_hash_str, custom_backend=custom_backend)
    code.add_function(ast, ast.function_name)

    def build_and_load(directory):
        # Compile into ``directory`` and load the kernel from the built library.
        lib_file = compile_module(code, code_hash_str, base_dir=directory)
        return load_kernel_from_file(code_hash_str, ast.function_name, lib_file)

    object_cache = cache_config['object_cache']
    if object_cache is False:
        # Loading happens inside the with-block, i.e. before the temporary
        # directory is removed again.
        with TemporaryDirectory() as base_dir:
            result = build_and_load(base_dir)
    else:
        result = build_and_load(object_cache)

    return KernelWrapper(result, ast.get_parameters(), ast)
def __repr__(self):
    """Return the code generated for ``self.ast``.

    ``dialect`` and ``custom_backend`` are not parameters of this method; they
    are resolved from a surrounding scope that is not visible here.
    """
    source_code = generate_c(self.ast, dialect=dialect, custom_backend=custom_backend)
    return source_code
def _repr_html_(self):
    """Return the code generated for ``self.ast`` as highlighted HTML.

    ``dialect`` and ``custom_backend`` are not parameters of this method; they
    are resolved from a surrounding scope that is not visible here.
    """
    source_code = generate_c(self.ast, dialect=dialect, custom_backend=custom_backend)
    highlighted = highlight_cpp(source_code)
    return highlighted.__html__()
def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
    """
    Creates a kernel function from an abstract syntax tree which
    was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`

    Args:
        kernel_function_node: the abstract syntax tree
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.
        custom_backend: forwarded to ``generate_c`` when the CUDA source is generated

    Returns:
        compiled kernel as Python function
    """
    import pycuda.autoinit  # NOQA
    from pycuda.compiler import SourceModule

    if argument_dict is None:
        argument_dict = {}

    header_list = ['<stdint.h>'] + list(get_headers(kernel_function_node))
    includes = "\n".join(["#include %s" % (include_file, ) for include_file in header_list])

    code = includes + "\n"
    code += "#define FUNC_PREFIX __global__\n"
    code += "#define RESTRICT __restrict__\n\n"
    code += str(generate_c(kernel_function_node, dialect='cuda', custom_backend=custom_backend))

    options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
    if USE_FAST_MATH:
        options.append("-use_fast_math")
    mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()])
    func = mod.get_function(kernel_function_node.function_name)

    parameters = kernel_function_node.get_parameters()

    # Launch configurations that were already computed, keyed by call arguments.
    cache = {}
    cache_values = []

    def wrapper(**kwargs):
        # The tuple itself is the key (not its hash), so two different
        # argument sets can never be confused by a hash collision.
        key = tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
                    for k, v in kwargs.items())
        cached = cache.get(key)
        if cached is None:
            full_arguments = argument_dict.copy()
            full_arguments.update(kwargs)
            shape = _check_arguments(parameters, full_arguments)

            indexing = kernel_function_node.indexing
            block_and_thread_numbers = indexing.call_parameters(shape)
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

            args = _build_numpy_argument_list(parameters, full_arguments)
            cached = (args, block_and_thread_numbers)
            cache[key] = cached
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique

        # The launch is outside any KeyError handler: a KeyError raised while
        # launching must propagate instead of being treated as a cache miss.
        args, block_and_thread_numbers = cached
        func(*args, **block_and_thread_numbers)
        # Debugging hint: calling pycuda.driver.Context.synchronize() here
        # reports errors right after the kernel was called.

    wrapper.ast = kernel_function_node
    wrapper.parameters = kernel_function_node.get_parameters()
    wrapper.num_regs = func.num_regs
    return wrapper
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
    """
    Creates a **OpenCL** kernel function from an abstract syntax tree which
    was created for the ``target='gpu'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`

    Args:
        opencl_queue: a valid :class:`pyopencl.CommandQueue`
        opencl_ctx: a valid :class:`pyopencl.Context`
        kernel_function_node: the abstract syntax tree
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.
        custom_backend: forwarded to ``generate_c`` when the OpenCL source is generated

    Returns:
        compiled kernel as Python function

    Note:
        ``kernel_function_node.function_name`` is modified in place: it gets
        the prefix ``"opencl_"`` unless it already starts with it.
    """
    import pyopencl as cl
    assert opencl_ctx, "No valid OpenCL context"
    assert opencl_queue, "No valid OpenCL queue"

    if argument_dict is None:
        argument_dict = {}

    # Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
    # The prefix is added only once, so compiling the same AST again does not
    # keep growing its name.
    if not kernel_function_node.function_name.startswith("opencl_"):
        kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
    header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])

    code = includes + "\n"
    code += "#define FUNC_PREFIX __kernel\n"
    code += "#define RESTRICT restrict\n\n"
    code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))
    options = []
    if USE_FAST_MATH:
        options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only")
    options.append("-I \"" + get_pystencils_include_path() + "\"")
    mod = cl.Program(opencl_ctx, code).build(options=options)
    func = getattr(mod, kernel_function_node.function_name)

    parameters = kernel_function_node.get_parameters()

    # Launch configurations that were already computed, keyed by call arguments.
    cache = {}
    cache_values = []

    def wrapper(**kwargs):
        # The tuple itself is the key (not its hash), so two different
        # argument sets can never be confused by a hash collision.
        key = tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
                    for k, v in kwargs.items())
        cached = cache.get(key)
        if cached is None:
            full_arguments = argument_dict.copy()
            full_arguments.update(kwargs)
            shape = _check_arguments(parameters, full_arguments)

            indexing = kernel_function_node.indexing
            block_and_thread_numbers = indexing.call_parameters(shape)
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            # The value stored under 'grid' is block * grid per dimension.
            block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
                                                                                  block_and_thread_numbers['grid']))

            args = _build_numpy_argument_list(parameters, full_arguments)
            args = [a.data if hasattr(a, 'data') else a for a in args]
            cached = (args, block_and_thread_numbers)
            cache[key] = cached
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique

        # The launch is outside any KeyError handler: a KeyError raised while
        # launching must propagate instead of being treated as a cache miss.
        args, block_and_thread_numbers = cached
        func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)

    wrapper.ast = kernel_function_node
    wrapper.parameters = kernel_function_node.get_parameters()
    return wrapper