def visit_Operator(self, o):
    """
    Assemble the ``c.Module`` emitted for the Operator ``o``.

    The module is laid out as: header defines, includes, type declarations,
    signatures of the local elemental functions, the kernel itself, and
    finally the bodies of the local elemental functions.
    """
    separator = c.Line("")

    # The kernel: visited children followed by an unconditional `return 0`
    visited = flatten(self._visit(child) for child in o.children)
    kernel_args = self._args_decl(o.parameters)
    kernel_decl = c.FunctionDeclaration(c.Value(o.retval, o.name), kernel_args)
    epilogue = [c.Line(), c.Statement("return 0")]
    kernel = c.FunctionBody(kernel_decl, c.Block(visited + epilogue))

    # Only the elemental functions flagged as `local` are emitted
    local_roots = [entry.root for entry in o._func_table.values() if entry.local]
    elemental_decls = []
    elemental_bodies = [separator]
    for root in local_roots:
        prototype = c.FunctionDeclaration(c.Value(root.retval, root.name),
                                          self._args_decl(root.parameters))
        elemental_decls.append(prototype)
        elemental_bodies += [root.ccode, separator]

    # Preprocessor defines and includes; names ending in '.h' are emitted
    # as non-system includes, everything else as system includes
    defines = [c.Define(*entry) for entry in o._headers]
    defines.append(separator)
    includes = [c.Include(name, system=not name.endswith('.h'))
                for name in o._includes]
    includes.append(separator)

    # Type declarations required by the kernel and by the elemental functions
    typedecls = [p._C_typedecl for p in o.parameters
                 if p._C_typedecl is not None]
    for root in local_roots:
        typedecls.extend([p._C_typedecl for p in root.parameters
                          if p._C_typedecl is not None])
    typedecls = filter_sorted(typedecls, key=lambda td: td.tpname)
    if o._compiler.src_ext == 'cpp':
        typedecls += [c.Extern('C', kernel_decl)]

    # Each type declaration is followed by a blank line
    spaced_typedecls = []
    for typedecl in typedecls:
        spaced_typedecls += [typedecl, separator]

    return c.Module(defines + includes + spaced_typedecls + elemental_decls +
                    [separator, kernel] + elemental_bodies)
def _alloc_array_on_high_bw_mem(self, obj, storage):
    """
    Allocate an Array in the high bandwidth memory.

    Nothing happens if ``obj`` is already registered in
    ``storage._high_bw_mem``. Otherwise an ``acc_malloc``-based initializer
    and the matching ``acc_free`` statement are recorded for ``obj``.
    """
    if obj in storage._high_bw_mem:
        return

    # All dimensions but the outermost one appear in the pointer type
    inner_dims = "".join("[%s]" % extent for extent in obj.symbolic_shape[1:])
    pointer = c.Value(obj._C_typedata, "(*%s)%s" % (obj.name, inner_dims))
    pointer_cast = "(%s (*)%s)" % (obj._C_typedata, inner_dims)

    # The allocation size is expressed over the full shape
    nitems = prod(obj.symbolic_shape)
    malloc_call = "%s acc_malloc(sizeof(%s[%s]))" % (pointer_cast,
                                                     obj._C_typedata, nitems)

    release = c.Statement('acc_free(%s)' % obj.name)
    storage._high_bw_mem[obj] = (None, c.Initializer(pointer, malloc_call), release)
def _generate_lib_func(self):
    """
    Build the library function and store it as ``self._components['LIB_FUNC']``.

    The function returns ``void``, is named after ``LIB_NAME``, takes
    ``LIB_ARG_DECLS`` followed by ``KERNEL_LIB_ARG_DECLS`` as arguments, and
    its body is the outer loop wrapped by the loop timer's pre/post code.
    """
    parts = self._components
    timer = self.loop_timer

    body = cgen.Block([
        timer.get_cpp_pre_loop_code_ast(),
        parts['LIB_OUTER_LOOP'],
        timer.get_cpp_post_loop_code_ast(),
    ])

    arg_decls = parts['LIB_ARG_DECLS'] + parts['KERNEL_LIB_ARG_DECLS']
    signature = cgen.FunctionDeclaration(
        cgen.Value("void", parts['LIB_NAME']), arg_decls)

    parts['LIB_FUNC'] = cgen.FunctionBody(signature, body)
def generate(self, funcname, field_args, kernel_ast, adaptive=False):
    """
    Return the C source (as one string) wrapping a kernel in a particle loop.

    The output consists, in order, of: the ``parcels.h`` and ``math.h``
    includes, the particle struct typedef, the kernel code, and a
    ``particle_loop`` function that repeatedly invokes ``funcname`` on each
    particle. The sections are joined by blank lines.

    ``adaptive`` is accepted but not used by this implementation.
    """
    sections = [
        str(c.Include("parcels.h", system=False)),
        str(c.Include("math.h", system=False)),
    ]

    # Typedef of the particle struct, one member per particle variable
    members = [c.POD(dtype, var) for var, dtype in self.ptype.var_types.items()]
    particle_struct = c.GenerableStruct("", members, declname=self.ptype.name)
    sections.append(str(c.Typedef(particle_struct)))

    # The kernel itself
    sections.append(str(kernel_ast))

    # Arguments of the outer loop: fixed ones first, then one CField* per field
    field_names = list(field_args.keys())
    params = [c.Value("int", "num_particles"),
              c.Pointer(c.Value(self.ptype.name, "particles")),
              c.Value("double", "endtime"),
              c.Value("float", "dt")]
    params.extend(c.Pointer(c.Value("CField", "%s" % name)) for name in field_names)
    call_args = ", ".join(['particles[p].time', 'particles[p].dt'] + field_names)

    def sweep(dt_update, keep_going):
        # Loop over all particles; for each one, call the kernel while
        # `keep_going` holds, advancing time only when the kernel succeeds
        # and recomputing `__dt` after every call.
        update = c.Statement(dt_update)
        invoke = c.Statement("res = %s(&(particles[p]), %s)" % (funcname, call_args))
        advance = c.If("res == SUCCESS", c.Statement("particles[p].time += __dt"))
        stepping = c.While(keep_going, c.Block([invoke, advance, update]))
        return c.For("p = 0", "p < num_particles", "++p",
                     c.Block([update, stepping]))

    # Forward runs clamp the step with fmin, backward runs with fmax
    forward = sweep("__dt = fmin(particles[p].dt, endtime - particles[p].time)",
                    "__dt > __tol")
    backward = sweep("__dt = fmax(particles[p].dt, endtime - particles[p].time)",
                     "__dt < -1. * __tol")
    direction = c.If("dt > 0.0", c.Block([forward]), c.Block([backward]))

    prologue = [c.Value("int", "p"),
                c.Value("KernelOp", "res"),
                c.Value("double", "__dt, __tol"),
                c.Assign("__tol", "1.e-6")]
    loop_body = c.Block(prologue + [direction])
    loop_decl = c.FunctionDeclaration(c.Value("void", "particle_loop"), params)
    sections.append(str(c.FunctionBody(loop_decl, loop_body)))

    return "\n\n".join(sections)
def _generate_kernel_call(self):
    """
    Build the kernel call and store it as ``self._components['LIB_KERNEL_CALL']``.

    For every entry of ``self._dat_dict`` one argument name is collected:
    ``<symbol>_c`` for ``data.GlobalArrayClassic``, ``<symbol>`` for
    ``host._Array`` and ``<symbol>_c`` for ``host.Matrix``. Matrices
    additionally get a ``_<symbol>_t`` struct initialised with two pointers.
    Any other type raises ``RuntimeError``.
    """
    parts = self._components
    call_block = cgen.Module([cgen.Comment('#### Kernel call arguments ####')])
    arg_names = []

    for symbol, entry in self._dat_dict.items():
        obj, mode = entry[0], entry[1]
        kind = type(obj)

        if issubclass(kind, data.GlobalArrayClassic):
            arg_names.append(symbol + '_c')
            continue
        if issubclass(kind, host._Array):
            arg_names.append(symbol)
            continue
        if not issubclass(kind, host.Matrix):
            raise RuntimeError("ERROR: Type not known")

        struct_name = symbol + '_c'
        arg_names.append(struct_name)

        ncomp = str(obj.ncomp)
        first_shift = '+' + parts['LIB_PAIR_INDEX_0'] + '*' + ncomp
        second_shift = '+' + parts['LIB_PAIR_INDEX_1'] + '*' + ncomp

        # Written matrices within the gather size limit use the `<symbol>i`
        # array as first pointer; all others point into the matrix directly
        if mode.write and obj.ncomp <= self._gather_size_limit:
            first_ptr = '&' + symbol + 'i[0]'
        else:
            first_ptr = symbol + first_shift
        second_ptr = symbol + second_shift

        struct_decl = cgen.Value('_' + symbol + '_t', struct_name)
        call_block.append(
            cgen.Initializer(struct_decl, '{ ' + first_ptr + ', ' + second_ptr + '}'))

    call_block.append(cgen.Comment('#### Kernel call ####'))
    call_block.append(
        cgen.Line('k_' + self._kernel.name + '(' + ','.join(arg_names) + ');'))

    parts['LIB_KERNEL_CALL'] = call_block
def _alloc_array_on_high_bw_mem(self, site, obj, storage, *args):
    """
    Allocate an Array in the high bandwidth memory.

    The declaration, the allocation statement (built via
    ``self.lang['alloc-host']``) and the free statement (built via
    ``self.lang['free-host']``) are registered through ``storage.update``.
    Extra positional arguments are accepted and ignored.
    """
    ctype = obj._C_typedata
    dims = ["[%s]" % extent for extent in obj.symbolic_shape]

    # Pointer declaration: the outermost dimension is dropped
    pointer = c.Value(ctype, "(*%s)%s" % (obj.name, "".join(dims[1:])))

    # Size in bytes is taken over the full shape
    nbytes = "sizeof(%s%s)" % (ctype, "".join(dims))
    allocate = c.Statement(
        self.lang['alloc-host'](obj.name, obj._data_alignment, nbytes))
    release = c.Statement(self.lang['free-host'](obj.name))

    storage.update(obj, site, allocs=(pointer, allocate), frees=release)
def push_array_on_heap(self, obj):
    """
    Define an Array on the heap.

    Records in ``self.heap[obj]`` the triple (declaration, allocation
    statement, free statement). Objects already present in ``self.heap``
    are left untouched.
    """
    if obj in self.heap:
        return

    dims = ["[%s]" % extent for extent in obj.symbolic_shape]

    # Pointer to an array spanning every dimension but the outermost
    pointer = c.Value(obj._C_typedata, "(*%s)%s" % (obj.name, "".join(dims[1:])))

    memalign = "posix_memalign((void**)&%s, %d, sizeof(%s%s))" % (
        obj.name, obj._data_alignment, obj._C_typedata, "".join(dims))

    self.heap[obj] = (pointer,
                      c.Statement(memalign),
                      c.Statement('free(%s)' % obj.name))
def push_heap(self, obj):
    """
    Generate cgen objects to declare an Array and allocate/free its memory.

    The resulting (declaration, allocation, free) triple is stored in
    ``self.heap[obj]``; nothing is done if ``obj`` is already there.
    """
    if obj in self.heap:
        return

    name = obj.name
    full_shape = "".join("[%s]" % extent for extent in obj.symbolic_shape)
    inner_shape = "".join("[%s]" % extent for extent in obj.symbolic_shape[1:])

    declaration = c.Value(obj._C_typedata, "(*%s)%s" % (name, inner_shape))

    template = "posix_memalign((void**)&%s, %d, sizeof(%s%s))"
    allocation = c.Statement(
        template % (name, obj._data_alignment, obj._C_typedata, full_shape))

    deallocation = c.Statement('free(%s)' % name)

    self.heap[obj] = (declaration, allocation, deallocation)
def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate an Array in the high bandwidth memory.

    A pointer declaration, a ``posix_memalign`` allocation and a ``free``
    statement are built for ``obj`` and registered via ``storage.update``.
    """
    shape = obj.symbolic_shape

    # Declaration: pointer to an array over all but the outermost dimension
    inner = "".join("[%s]" % extent for extent in shape[1:])
    declaration = c.Value(obj._C_typedata, "(*%s)%s" % (obj.name, inner))

    # Allocation: aligned, sized over the full shape
    full = "".join("[%s]" % extent for extent in shape)
    memalign = "posix_memalign((void**)&%s, %d, sizeof(%s%s))" % (
        obj.name, obj._data_alignment, obj._C_typedata, full)
    allocation = c.Statement(memalign)

    deallocation = c.Statement('free(%s)' % obj.name)

    storage.update(obj, site, allocs=(declaration, allocation), frees=deallocation)
def ccode(self):
    """Generate C code for the represented stencil loop

    The loop runs forward when ``limits[1] >= limits[0]`` and backward
    otherwise; a step (``limits[2]``) equal to 1 is emitted as ``++``/``--``,
    any other step as ``+=``/``-=``.

    :returns: :class:`cgen.For` object representing the loop
    """
    start, stop, step = self.limits[0], self.limits[1], self.limits[2]
    ascending = stop >= start

    body = cgen.Block([expr.ccode for expr in self.expressions])
    init = cgen.InlineInitializer(cgen.Value("int", self.index), start)

    # Pick the comparison and update operators matching the loop direction
    if ascending:
        compare, unit_step, compound_step = '<', '++', '+='
    else:
        compare, unit_step, compound_step = '>', '--', '-='

    condition = '%s %s %s' % (self.index, compare, stop)
    if step == 1:
        increment = '%s%s' % (self.index, unit_step)
    else:
        increment = '%s %s %s' % (self.index, compound_step, step)

    return cgen.For(init, condition, increment, body)
def __init__(self, dtype, name, args, body, template: tuple = None):
    """
    Set up the cgen objects describing a function.

    When ``template`` is given, ``<template[1]>`` is appended to the
    function name. ``self.fnc`` is ``None`` when no ``body`` is provided,
    otherwise it is the ``c.FunctionBody`` built from the declaration.
    """
    templated = template is not None
    self.name = f'{name}<{template[1]}>' if templated else name
    self.is_template = templated

    self.value = c.Value(dtype, self.name)
    self.decl = c.FunctionDeclaration(self.value, args)
    self.fnc = None if body is None else c.FunctionBody(self.decl, body)

    self.modifiers = []
    self.constraints = []
def _alloc_array_on_low_lat_mem(self, site, obj, storage):
    """
    Allocate an Array in the low latency memory.

    The Array is declared with its full shape and the alignment attribute
    produced by ``self.lang['aligned']``. If ``obj.initvalue`` is set, the
    declaration is turned into an initializer carrying that value.
    """
    dims = "".join("[%s]" % ccode(extent) for extent in obj.symbolic_shape)
    aligned = self.lang['aligned'](obj._data_alignment)
    declaration = c.Value(obj._C_typedata,
                          "%s%s %s" % (obj._C_name, dims, aligned))

    if obj.initvalue is None:
        allocs = declaration
    else:
        allocs = c.Initializer(declaration, ListInitializer(obj.initvalue))

    storage.update(obj, site, allocs=allocs)
def visit_Operator(self, o):
    """
    Return the pair ``(ccode, hcode)`` generated for the Operator ``o``.

    ``ccode`` is what the parent class produces in 'private' mode. ``hcode``
    is a module holding the 'public' type declarations, each wrapped in a
    ``DEVITO_<TPNAME>`` include guard, followed by the Operator signature.
    """
    # The cfile
    cfile = super().visit_Operator(o, mode='private')

    # The hfile: guarded type declarations ...
    guarded = []
    for typedecl in self._operator_typedecls(o, mode='public'):
        macro = "DEVITO_%s" % typedecl.tpname.upper()
        guarded_lines = [c.Define(macro, ""), blankline, typedecl, blankline]
        guarded += [c.IfNDef(macro, guarded_lines, []), blankline]

    # ... and the Operator prototype
    prototype = c.FunctionDeclaration(c.Value(o.retval, o.name),
                                      self._args_decl(o.parameters))
    hfile = c.Module(guarded + [blankline, prototype, blankline])

    return cfile, hfile
def __init__(self, dtype, name, args, body, template: tuple = None, modifiers=''):
    """
    Set up the cgen objects describing a function definition.

    When ``template`` is given, the declaration is wrapped in a
    ``c.Template`` built from ``template[0]`` before the function body is
    attached to it.
    """
    self.dtype = dtype
    self.name = name
    self.value = c.Value(dtype, name)
    self.modifiers = modifiers
    self.is_template = template is not None

    signature = c.FunctionDeclaration(self.value, args)
    if self.is_template:
        signature = c.Template(template[0], signature)

    self.fnc = c.FunctionBody(signature, body)
def push_heap(self, obj):
    """
    Generate cgen objects to declare, allocate memory, and free memory for
    ``obj``, of type :class:`Array`.

    The (declaration, allocation, free) triple is stored in
    ``self.heap[obj]``. The allocation uses ``posix_memalign`` with an
    alignment of 64. Objects already in ``self.heap`` are skipped.
    """
    if obj in self.heap:
        return

    dims = ["[%s]" % extent for extent in obj.symbolic_shape]

    declaration = c.Value(c.dtype_to_ctype(obj.dtype),
                          "(*%s)%s" % (obj.name, "".join(dims[1:])))

    memalign = "posix_memalign((void**)&%s, 64, sizeof(%s%s))" % (
        obj.name, c.dtype_to_ctype(obj.dtype), "".join(dims))

    self.heap[obj] = (declaration,
                      c.Statement(memalign),
                      c.Statement('free(%s)' % obj.name))
def _alloc_pointed_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate the following objects in the high bandwidth memory:

        * The pointer array `obj`;
        * The pointee Array `obj.array`

    If the pointer array is defined over `sregistry.threadid`, that is a
    thread Dimension, the pointee allocation and free are registered as
    `pallocs`/`pfrees` keyed by that Dimension; otherwise they are appended
    to the regular `allocs`/`frees`.
    """
    name = obj.name
    ctype = obj._C_typedata
    dim = obj.dim

    # The pointer array
    declaration = c.Value(ctype, "**%s" % name)
    outer_alloc = c.Statement(
        "posix_memalign((void**)&%s, %d, sizeof(%s*)*%s)"
        % (name, obj._data_alignment, ctype, dim.symbolic_size))
    outer_free = c.Statement('free(%s)' % name)

    # The pointee Array
    pointee_shape = "".join("[%s]" % extent for extent in obj.array.symbolic_shape)
    inner_alloc = c.Statement(
        "posix_memalign((void**)&%s[%s], %d, sizeof(%s%s))"
        % (name, dim.name, obj._data_alignment, ctype, pointee_shape))
    inner_free = c.Statement('free(%s[%s])' % (name, dim.name))

    if dim is self.sregistry.threadid:
        registration = {
            'allocs': (declaration, outer_alloc),
            'frees': outer_free,
            'pallocs': (dim, inner_alloc),
            'pfrees': (dim, inner_free),
        }
    else:
        registration = {
            'allocs': (declaration, outer_alloc, inner_alloc),
            'frees': (outer_free, inner_free),
        }
    storage.update(obj, site, **registration)
def _args_cast(self, args):
    """
    Build cgen type casts for an iterable of :class:`Argument`.

    Tensor arguments are cast from ``<name>_vec`` to a restrict-qualified,
    64-byte-aligned pointer-to-array; pointer arguments are cast from
    ``_<name>`` to a typed pointer. Arguments of any other kind produce no
    cast.
    """
    casts = []
    for arg in args:
        if arg.is_TensorArgument:
            alignment = "__attribute__((aligned(64)))"
            inner_dims = ''.join("[%s]" % ccode(extent)
                                 for extent in arg.provider.symbolic_shape[1:])
            target = c.POD(arg.dtype,
                           '(*restrict %s)%s %s' % (arg.name, inner_dims, alignment))
            source = '(%s (*)%s) %s' % (c.dtype_to_ctype(arg.dtype),
                                        inner_dims, '%s_vec' % arg.name)
        elif arg.is_PtrArgument:
            ctype = ctypes_to_C(arg.dtype)
            target = c.Pointer(c.Value(ctype, arg.name))
            source = '(%s*) %s' % (ctype, '_%s' % arg.name)
        else:
            continue
        casts.append(c.Initializer(target, source))
    return casts
def visit_FunctionDef(self, node):
    """
    Translate a Python function definition into a C function.

    Every statement of ``node.body`` is visited, except statements whose
    ``value`` is a string literal (docstrings). The result, stored in
    ``node.ccode``, is a ``static inline ErrorCode <name>(...)`` function
    whose body is the statements' ``ccode`` followed by ``return SUCCESS``.

    Arguments are, in order: the particle pointer, ``time``, ``dt``, one
    ``CField*`` per entry of ``self.field_args`` other than 'UV', the
    component fields required by 'UV' (if present), and one ``float`` per
    entry of ``self.const_args``.

    Raises ``RuntimeError`` if one of the 'UV' component fields is missing
    from the fieldset and the U grid is curvilinear.
    """
    # `ast.Constant` is what string literals parse to on Python >= 3.8,
    # where `type(x) is ast.Str` is never true; the class-name fallback
    # covers older interpreters without touching the deprecated `ast.Str`.
    constant_cls = getattr(ast, 'Constant', ())

    def is_string_literal_stmt(stmt):
        value = getattr(stmt, 'value', None)
        if isinstance(value, constant_cls):
            return isinstance(value.value, str)
        return type(value).__name__ == 'Str'

    # Generate "ccode" attribute by traversing the Python AST
    statements = [stmt for stmt in node.body if not is_string_literal_stmt(stmt)]
    for stmt in statements:
        self.visit(stmt)

    # Create function declaration and argument list
    decl = c.Static(c.DeclSpecifier(c.Value("ErrorCode", node.name), spec='inline'))
    args = [c.Pointer(c.Value(self.ptype.name, "particle")),
            c.Value("double", "time"),
            c.Value("float", "dt")]
    for field_name in self.field_args:
        if field_name != 'UV':
            args.append(c.Pointer(c.Value("CField", "%s" % field_name)))

    # The 'UV' entry contributes its component fields after all plain fields
    if 'UV' in self.field_args:
        fieldset = self.field_args['UV'].fieldset
        for f in ['U', 'V', 'cosU', 'sinU', 'cosV', 'sinV']:
            # hasattr only absorbs AttributeError, unlike a bare `except:`
            if hasattr(fieldset, f):
                if f not in self.field_args:
                    args.append(c.Pointer(c.Value("CField", "%s" % f)))
            elif fieldset.U.grid.gtype in [GridCode.CurvilinearZGrid,
                                           GridCode.CurvilinearSGrid]:
                raise RuntimeError(
                    "cosU, sinU, cosV and sinV fields must be defined for a "
                    "proper rotation of U, V fields in curvilinear grids")

    for const in self.const_args:
        args.append(c.Value("float", const))

    # Create function body as C-code object
    body = [stmt.ccode for stmt in statements]
    body.append(c.Statement("return SUCCESS"))
    node.ccode = c.FunctionBody(c.FunctionDeclaration(decl, args), c.Block(body))
def eval(self, generator):
    """
    Convert the wrapped node into its generated representation.

    * ``MessageBox``: a ``CSharpClass`` named after the message (plus the
      inheritance suffix returned by ``base_methods``), whose fifth
      argument is the evaluated message block;
    * ``BlockBox``: a list with one evaluated entry per field;
    * ``DeclarationBox``: a ``c.Value``, with the type wrapped in
      ``Optional<...>`` when the declaration is optional.

    Any other node type yields ``None``.
    """
    node = self.base

    if isinstance(node, MessageBox):
        methods, inherits = base_methods(node.base, node.name.eval())
        class_name = f'{node.name.eval()}{inherits}'
        members = WrapStruct(node.block).eval(generator)
        return CSharpClass(class_name, methods, [], [], members, [], [])

    if isinstance(node, BlockBox):
        return [WrapStruct(field).eval(generator) for field in node.fields.eval()]

    if isinstance(node, DeclarationBox):
        dtype = node.dtype.eval(generator)
        if node.optional:
            dtype = f'Optional<{dtype}>'
        return c.Value(dtype, node.name.eval())

    return None
def visit_FunctionDef(self, node):
    """
    Translate a Python function definition into a C function.

    Every statement of ``node.body`` is visited, except statements whose
    ``value`` is a string literal (docstrings). The result, stored in
    ``node.ccode``, is a ``static inline ErrorCode <name>(...)`` function
    whose body is the statements' ``ccode`` followed by ``return SUCCESS``.

    Arguments are, in order: the particle pointer, ``time``, ``dt``, one
    ``CField*`` per entry of ``self.field_args``, the U/V/W component fields
    of every entry of ``self.vector_field_args`` that are not already in
    ``self.field_args``, and one ``float`` per entry of ``self.const_args``.

    Raises ``RuntimeError`` if the U or V component of a vector field is
    not an attribute of its fieldset. A missing W component is tolerated.
    """
    # `ast.Constant` is what string literals parse to on Python >= 3.8,
    # where `type(x) is ast.Str` is never true; the class-name fallback
    # covers older interpreters without touching the deprecated `ast.Str`.
    constant_cls = getattr(ast, 'Constant', ())

    def is_string_literal_stmt(stmt):
        value = getattr(stmt, 'value', None)
        if isinstance(value, constant_cls):
            return isinstance(value.value, str)
        return type(value).__name__ == 'Str'

    # Generate "ccode" attribute by traversing the Python AST
    statements = [stmt for stmt in node.body if not is_string_literal_stmt(stmt)]
    for stmt in statements:
        self.visit(stmt)

    # Create function declaration and argument list
    decl = c.Static(c.DeclSpecifier(c.Value("ErrorCode", node.name), spec='inline'))
    args = [c.Pointer(c.Value(self.ptype.name, "particle")),
            c.Value("double", "time"),
            c.Value("float", "dt")]
    for field_name in self.field_args:
        args.append(c.Pointer(c.Value("CField", "%s" % field_name)))

    for field in self.vector_field_args.values():
        fieldset = field.fieldset
        Wname = field.W.name if field.W else 'not_defined'
        for f in [field.U.name, field.V.name, Wname]:
            # A component may have been created without being added to the
            # fieldset; hasattr only absorbs AttributeError, unlike the
            # bare `except:` it replaces.
            if hasattr(fieldset, f):
                if f not in self.field_args:
                    args.append(c.Pointer(c.Value("CField", "%s" % f)))
            elif f != Wname:
                raise RuntimeError(
                    "Field %s needed by a VectorField but it does not exist" % f)

    for const in self.const_args:
        args.append(c.Value("float", const))

    # Create function body as C-code object
    body = [stmt.ccode for stmt in statements]
    body.append(c.Statement("return SUCCESS"))
    node.ccode = c.FunctionBody(c.FunctionDeclaration(decl, args), c.Block(body))
def generate(self, py_ast, funcvars):
    """
    Generate and return the C code object for the Python AST ``py_ast``.

    The AST is first rewritten by ``TupleSplitter`` and
    ``IntrinsicTransformer``, then visited to attach ``ccode`` to its nodes.
    Names listed in ``self.kernel_vars`` or ``self.array_vars`` are removed
    from ``funcvars`` (in place); whatever remains is declared as ``float``
    at the top of the generated body.
    """
    # Untangle Pythonic tuple-assignment statements
    split_ast = TupleSplitter().visit(py_ast)

    # Replace occurrences of intrinsic objects in the Python AST
    final_ast = IntrinsicTransformer(self.grid, self.ptype).visit(split_ast)

    # Generate C-code for all nodes in the Python AST
    self.visit(final_ast)
    self.ccode = final_ast.ccode

    # Variables already handled elsewhere need no extra declaration
    for known in self.kernel_vars + self.array_vars:
        if known in funcvars:
            funcvars.remove(known)

    # Declare the remaining (non-intrinsic) variables in a single statement
    if funcvars:
        declaration = c.Value("float", ", ".join(funcvars))
        self.ccode.body.insert(0, declaration)

    return self.ccode
def visit_Assign(self, node):
    """
    Attach ``ccode`` to an assignment node.

    A list literal on the right-hand side becomes a ``float`` array
    declaration with an initializer, and the target name is recorded in
    ``self.array_vars``. Anything else becomes a plain ``c.Assign``.

    Raises ``TypeError`` if a nested list literal mixes list and non-list
    elements, or if its sub-lists differ in length.
    """
    target = node.targets[0]
    self.visit(target)
    self.visit(node.value)

    if not isinstance(node.value, ast.List):
        node.ccode = c.Assign(target.ccode, node.value.ccode)
        return

    # In-place initialisation of a (possibly multi-dimensional) array:
    # descend through the first element of each nesting level, adding one
    # array dimension per level
    declaration = c.Value('float', target.id)
    level = node.value
    while isinstance(level, ast.List):
        declaration = c.ArrayOf(declaration, len(level.elts))
        first = level.elts[0]
        if isinstance(first, ast.List):
            # All siblings must be lists of the same length as the first one
            if any(not isinstance(elt, ast.List) for elt in level.elts):
                raise TypeError("Non-list element discovered in array declaration")
            if any(len(elt.elts) != len(first.elts) for elt in level.elts):
                raise TypeError("Irregular array length not allowed in array declaration")
        level = first

    node.ccode = c.Initializer(declaration, node.value.ccode)
    self.array_vars += [target.id]
def map_Subroutine(self, node):
    """
    Translate a subroutine node into a ``void`` cgen function.

    A new ``Scope`` is pushed for the duration of the translation. The
    function body is the in-function declarations, a blank line, and the
    translated statements, with a trailing bare ``return`` dropped.

    Returns the ``cgen.FunctionBody`` alone, or, when there are declarations
    that must precede the function, a list made of those declarations, a
    blank line, and the function.
    """
    assert not node.prefix
    assert not hasattr(node, "suffix")

    self.scope_stack.append(Scope(node.name, list(node.args)))

    statements = self.map_statement_list(node.content)
    pre_func_decl, in_func_decl = self.get_declarations()
    statements = in_func_decl + [cgen.Line()] + statements

    # A trailing `return` is redundant in a void function
    last = statements[-1]
    if isinstance(last, cgen.Statement) and last.text == "return":
        del statements[-1]

    # Arguments that need a pointer get, in order of preference, the
    # address-space hint, a restrict pointer, or a plain pointer
    arg_decls = []
    for position, arg_name in enumerate(node.args):
        declarator = self.get_declarator(arg_name)
        if self.arg_needs_pointer(node.name, position):
            hint = self.addr_space_hints.get((node.name, arg_name))
            if hint:
                declarator = hint(cgen.Pointer(declarator))
            elif self.use_restrict_pointers:
                declarator = cgen.RestrictPointer(declarator)
            else:
                declarator = cgen.Pointer(declarator)
        arg_decls.append(declarator)

    signature = cgen.FunctionDeclaration(cgen.Value("void", node.name), arg_decls)
    function = cgen.FunctionBody(signature, cgen.Block(statements))

    self.scope_stack.pop()

    if not pre_func_decl:
        return function
    return pre_func_decl + [cgen.Line(), function]
def _alloc_pointed_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate the following objects in the high bandwidth memory:

        * The pointer array `obj`;
        * The pointee Array `obj.array`

    Allocation and free statements are produced by the
    ``self.lang['alloc-host']`` and ``self.lang['free-host']`` callables.
    If the pointer array is defined over `sregistry.threadid`, that is a
    thread Dimension, the pointee allocation and free are registered as
    `pallocs`/`pfrees` keyed by that Dimension; otherwise they are appended
    to the regular `allocs`/`frees`.
    """
    alloc_host = self.lang['alloc-host']
    free_host = self.lang['free-host']
    ctype = obj._C_typedata
    dim = obj.dim

    # The pointer array
    declaration = c.Value(ctype, "**%s" % obj.name)
    outer_size = 'sizeof(%s*)*%s' % (ctype, dim.symbolic_size)
    outer_alloc = c.Statement(alloc_host(obj.name, obj._data_alignment, outer_size))
    outer_free = c.Statement(free_host(obj.name))

    # The pointee Array
    slot = '%s[%s]' % (obj.name, dim.name)
    pointee_shape = "".join("[%s]" % extent for extent in obj.array.symbolic_shape)
    inner_size = "sizeof(%s%s)" % (ctype, pointee_shape)
    inner_alloc = c.Statement(alloc_host(slot, obj._data_alignment, inner_size))
    inner_free = c.Statement(free_host(slot))

    if dim is self.sregistry.threadid:
        registration = {
            'allocs': (declaration, outer_alloc),
            'frees': outer_free,
            'pallocs': (dim, inner_alloc),
            'pfrees': (dim, inner_free),
        }
    else:
        registration = {
            'allocs': (declaration, outer_alloc, inner_alloc),
            'frees': (outer_free, inner_free),
        }
    storage.update(obj, site, **registration)
def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate an Array in the high bandwidth memory.

    Memory-mapped Arrays are delegated to the parent class. All other
    Arrays must satisfy ``obj._mem_default`` and are allocated and freed
    through ``self.lang['device-alloc']`` and ``self.lang['device-free']``.
    """
    if obj._mem_mapped:
        # Host allocation plus copy-to-device, handled by the parent class
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
        return

    # Device-only Array: it never needs to be accessed on the host
    assert obj._mem_default

    ctype = obj._C_typedata
    inner_dims = "".join("[%s]" % extent for extent in obj.symbolic_shape[1:])
    pointer = c.Value(ctype, "(*%s)%s" % (obj.name, inner_dims))
    pointer_cast = "(%s (*)%s)" % (ctype, inner_dims)

    nbytes = "sizeof(%s[%s])" % (ctype, prod(obj.symbolic_shape))
    allocation = "%s %s" % (pointer_cast, self.lang['device-alloc'](nbytes))

    storage.update(obj, site,
                   allocs=c.Initializer(pointer, allocation),
                   frees=c.Statement(self.lang['device-free'](obj.name)))
def eval(self, generator):
    """
    Convert the wrapped node into its cgen representation.

    * ``MessageBox``: a ``c.Struct`` named after the message, with a
      ``: public <base>`` suffix (plus ``<template>`` if any) when the
      message has a named base;
    * ``BlockBox``: a list with one evaluated entry per field;
    * ``DeclarationBox``: a ``c.Value``, with the type wrapped in
      ``std::optional<...>`` when the declaration is optional.

    Any other node type yields ``None``.
    """
    node = self.base

    if isinstance(node, MessageBox):
        parent = node.base
        suffix = ''
        if parent.name:
            suffix = f' : public {parent.name.eval()}'
            if parent.template:
                suffix = f'{suffix}<{parent.template.eval()}>'
        members = WrapStruct(node.block).eval(generator)
        return c.Struct(f'{node.name.eval()}{suffix}', members)

    if isinstance(node, BlockBox):
        return [WrapStruct(field).eval(generator) for field in node.fields.eval()]

    if isinstance(node, DeclarationBox):
        dtype = node.dtype.eval(generator)
        if node.optional:
            dtype = f'std::optional<{dtype}>'
        return c.Value(dtype, node.name.eval())

    return None
def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate an Array in the high bandwidth memory.

    Memory-mapped Arrays are delegated to the parent class. All other
    Arrays must satisfy ``obj._mem_default`` and are allocated with
    ``acc_malloc`` and released with ``acc_free``.
    """
    if obj._mem_mapped:
        # Host allocation plus copy-to-device, handled by the parent class
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
        return

    # Device-only Array: it never needs to be accessed on the host
    assert obj._mem_default

    inner_dims = "".join("[%s]" % extent for extent in obj.symbolic_shape[1:])
    pointer = c.Value(obj._C_typedata, "(*%s)%s" % (obj.name, inner_dims))
    pointer_cast = "(%s (*)%s)" % (obj._C_typedata, inner_dims)

    nitems = prod(obj.symbolic_shape)
    malloc_call = "%s acc_malloc(sizeof(%s[%s]))" % (pointer_cast,
                                                     obj._C_typedata, nitems)

    storage.update(obj, site,
                   allocs=c.Initializer(pointer, malloc_call),
                   frees=c.Statement('acc_free(%s)' % obj.name))
def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate an Array in the high bandwidth memory.

    Memory-mapped Arrays are delegated to the parent class. All other
    Arrays must satisfy ``obj._mem_default``; they are declared as a flat
    pointer and allocated/freed through ``self.lang['device-alloc']`` and
    ``self.lang['device-free']``, both of which receive
    ``self.lang['device-get']`` as their second argument.
    """
    if obj._mem_mapped:
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
        return

    # E.g., `acc_malloc` or `omp_target_alloc` -- the Array only resides on
    # the device as it never needs to be accessed on the host
    assert obj._mem_default

    ctype = obj._C_typedata
    pointer = c.Value(ctype, "*%s" % obj._C_name)
    nbytes = "sizeof(%s[%s])" % (ctype, prod(obj.symbolic_shape))

    device = self.lang['device-get']
    device_alloc = self.lang['device-alloc']
    device_free = self.lang['device-free']

    allocation = "(%s*) %s" % (ctype, device_alloc(nbytes, device))
    release = c.Statement(device_free(obj._C_name, device))

    storage.update(obj, site, allocs=c.Initializer(pointer, allocation), frees=release)
def _generate_kernel_gather(self):
    """
    Build the pre-kernel gather code and store it as
    ``self._components['LIB_KERNEL_GATHER']``.

    For every ``host.Matrix`` in ``self._dat_dict`` that is written to and
    whose ``ncomp`` does not exceed ``self._gather_size_limit``, a local
    array ``<symbol>i[ncomp]`` is declared and initialised with the
    ``ncomp`` values found at ``<symbol> + LIB_PAIR_INDEX_0 * ncomp``.
    ``host._Array`` entries and all other entries produce no code.
    """
    gather = cgen.Module([cgen.Comment('#### Pre kernel gather ####')])

    for symbol, entry in self._dat_dict.items():
        obj, mode = entry[0], entry[1]
        kind = type(obj)

        # The _Array check comes first so it wins over the Matrix check
        if issubclass(kind, host._Array):
            continue
        if not (issubclass(kind, host.Matrix)
                and mode.write
                and obj.ncomp <= self._gather_size_limit):
            continue

        ncomp = obj.ncomp
        base = symbol + '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + str(ncomp)

        # One dereference per component; joining keeps the braces balanced
        # even when there are no components
        values = ','.join('*(' + base + '+' + str(tx) + ')' for tx in range(ncomp))

        local = cgen.Value(host.ctypes_map[obj.dtype],
                           symbol + 'i' + '[' + str(ncomp) + ']')
        gather.append(cgen.Initializer(local, '{' + values + '}'))

    self._components['LIB_KERNEL_GATHER'] = gather
def _generate_kernel_call(self):
    """
    Build the kernel call and store it as ``self._components['LIB_KERNEL_CALL']``.

    The call arguments are, in order, the keys of the kernel's static
    arguments (if any) and one name per entry of ``self._dat_dict``:
    ``<symbol>`` for ``host._Array`` and ``<symbol>_c`` for ``host.Matrix``.
    Matrices additionally get a ``_<symbol>_t`` struct initialised with a
    pointer offset by ``LIB_PAIR_INDEX_0 * ncomp``. Any other type raises
    ``RuntimeError``.
    """
    call_block = cgen.Module([cgen.Comment('#### Kernel call arguments ####')])
    arg_names = []

    static_args = self._kernel.static_args
    if static_args is not None:
        arg_names.extend(name for name, _ in static_args.items())

    for symbol, entry in self._dat_dict.items():
        kind = type(entry[0])

        if issubclass(kind, host._Array):
            arg_names.append(symbol)
            continue
        if not issubclass(kind, host.Matrix):
            raise RuntimeError("ERROR: Type not known")

        struct_name = symbol + '_c'
        arg_names.append(struct_name)

        ncomp = str(entry[0].ncomp)
        pointer = symbol + '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + ncomp

        struct_decl = cgen.Value('_' + symbol + '_t', struct_name)
        call_block.append(cgen.Initializer(struct_decl, '{ ' + pointer + '}'))

    call_block.append(cgen.Comment('#### Kernel call ####'))
    call_block.append(
        cgen.Line('k_' + self._kernel.name + '(' + ','.join(arg_names) + ');'))

    self._components['LIB_KERNEL_CALL'] = call_block