def _generate_kernel_arg_decls(self): _kernel_arg_decls = [] _kernel_lib_arg_decls = [] _kernel_structs = cgen.Module( [cgen.Comment('#### Structs generated per ParticleDat ####')]) if self._kernel.static_args is not None: for i, dat in enumerate(self._kernel.static_args.items()): _kernel_arg_decls.append( cgen.Const(cgen.Value(host.ctypes_map[dat[1]], dat[0]))) for i, dat in enumerate(self._dat_dict.items()): assert type(dat[1]) is tuple, "Access descriptors not found" kernel_lib_arg = cgen.Pointer( cgen.Value(host.ctypes_map[dat[1][0].dtype], Restrict(self._cc.restrict_keyword, dat[0]))) # print host.ctypes_map[dat[1][0].dtype], dat[1][0].dtype if issubclass(type(dat[1][0]), host._Array): kernel_arg = cgen.Pointer( cgen.Value(host.ctypes_map[dat[1][0].dtype], Restrict(self._cc.restrict_keyword, dat[0]))) if not dat[1][1].write: kernel_arg = cgen.Const(kernel_arg) _kernel_arg_decls.append(kernel_arg) elif issubclass(type(dat[1][0]), host.Matrix): # MAKE STRUCT TYPE dtype = dat[1][0].dtype ti = cgen.Pointer( cgen.Value(ctypes_map(dtype), Restrict(self._cc.restrict_keyword, 'i'))) tj = cgen.Pointer( cgen.Value(ctypes_map(dtype), Restrict(self._cc.restrict_keyword, 'j'))) if not dat[1][1].write: ti = cgen.Const(ti) tj = cgen.Const(tj) typename = '_' + dat[0] + '_t' _kernel_structs.append( cgen.Typedef(cgen.Struct('', [ti, tj], typename))) # MAKE STRUCT ARG _kernel_arg_decls.append(cgen.Value(typename, dat[0])) if not dat[1][1].write: kernel_lib_arg = cgen.Const(kernel_lib_arg) _kernel_lib_arg_decls.append(kernel_lib_arg) self._components['KERNEL_ARG_DECLS'] = _kernel_arg_decls self._components['KERNEL_LIB_ARG_DECLS'] = _kernel_lib_arg_decls self._components['KERNEL_STRUCT_TYPEDEFS'] = _kernel_structs
def _generate_kernel_gather(self): kernel_gather = cgen.Module([ cgen.Comment('#### Pre kernel gather ####'), cgen.Initializer( cgen.Const( cgen.Value('int', self._components['OMP_THREAD_INDEX_SYM'])), 'omp_get_thread_num()') ]) shared_syms = self._components['OMP_SHARED_SYMS'] for i, dat in enumerate(self._dat_dict.items()): obj = dat[1][0] mode = dat[1][1] symbol = dat[0] shared_syms.append(symbol) if issubclass(type(obj), data.GlobalArrayClassic): isym = symbol + '_c' val = symbol + '[' + self._components[ 'OMP_THREAD_INDEX_SYM'] + ']' g = cgen.Pointer(cgen.Value(host.ctypes_map[obj.dtype], isym)) if not mode.write: g = cgen.Const(g) g = cgen.Initializer(g, val) kernel_gather.append(g) elif issubclass(type(obj), host.Matrix) \ and mode.write \ and obj.ncomp <= self._gather_size_limit: isym = symbol + 'i' nc = obj.ncomp ncb = '[' + str(nc) + ']' dtype = host.ctypes_map[obj.dtype] t = '{' for tx in range(nc): t += '*(' + symbol + '+' + self._components[ 'LIB_PAIR_INDEX_0'] t += '*' + str(nc) + '+' + str(tx) + '),' t = t[:-1] + '}' g = cgen.Value(dtype, isym + ncb) g = cgen.Initializer(g, t) kernel_gather.append(g) self._components['LIB_KERNEL_GATHER'] = kernel_gather
def _generate_kernel_arg_decls(self): _kernel_arg_decls = [] _kernel_lib_arg_decls = [] _kernel_structs = cgen.Module( [cgen.Comment('#### Structs generated per ParticleDat ####')]) if self._kernel.static_args is not None: for i, dat in enumerate(self._kernel.static_args.items()): arg = cgen.Const(cgen.Value(host.ctypes_map[dat[1]], dat[0])) _kernel_arg_decls.append(arg) _kernel_lib_arg_decls.append(arg) for i, dat in enumerate(self._dat_dict.items()): assert type(dat[1]) is tuple, "Access descriptors not found" obj = dat[1][0] mode = dat[1][1] symbol = dat[0] kernel_lib_arg = cgen.Pointer( cgen.Value(host.ctypes_map[obj.dtype], Restrict(self._cc.restrict_keyword, symbol))) if issubclass(type(obj), data.GlobalArrayClassic): kernel_lib_arg = cgen.Pointer(kernel_lib_arg) gen = self._components['PARTICLE_DAT_C'][symbol] _kernel_arg_decls.append(gen.kernel_arg_decl) elif issubclass(type(obj), host._Array): gen = self._components['PARTICLE_DAT_C'][symbol] _kernel_arg_decls.append(gen.kernel_arg_decl) if mode.write is True: assert issubclass(type(obj), data.GlobalArrayClassic), \ "global array must be a thread safe type for \ write access. Type is:" + str(type(obj)) elif issubclass(type(dat[1][0]), host.Matrix): gen = self._components['PARTICLE_DAT_C'][symbol] _kernel_structs.append(gen.header) _kernel_arg_decls.append(gen.kernel_arg_decl[0]) _kernel_arg_decls.append(gen.kernel_arg_decl[1]) if not dat[1][1].write: kernel_lib_arg = cgen.Const(kernel_lib_arg) _kernel_lib_arg_decls.append(kernel_lib_arg) self._components['KERNEL_ARG_DECLS'] = _kernel_arg_decls self._components['KERNEL_LIB_ARG_DECLS'] = _kernel_lib_arg_decls self._components['KERNEL_STRUCT_TYPEDEFS'] = _kernel_structs
def _generate_kernel_func(self): if_block = cgen.If( self._components['LIB_PAIR_INDEX_0']+'<_D_N_LOCAL', cgen.Block([ self._components['KERNEL_GATHER'], cgen.For('int _k=1', '_k<=_D_NMATRIX['+self._components['LIB_PAIR_INDEX_0']+']', '_k++', cgen.Block([ cgen.Initializer( cgen.Const(cgen.Value( host.int32_str, self._components['LIB_PAIR_INDEX_1'])), '_D_NMATRIX['+self._components['LIB_PAIR_INDEX_0']+\ ' + _D_N_LOCAL * _k ]' ), self._components['KERNEL_MAPPING'], cgen.Line(self._kernel.code) ]) ), self._components['KERNEL_SCATTER'] ]) ) func = cgen.Block([ cgen.Initializer( cgen.Const( cgen.Value( host.int32_str, self._components['LIB_PAIR_INDEX_0'] )), 'threadIdx.x + blockIdx.x*blockDim.x' ), self._components['IF_GATHER'], if_block, self._components['IF_SCATTER'] ]) self._components['KERNEL_FUNC'] = cgen.FunctionBody( cgen.FunctionDeclaration( cgen.DeclSpecifier( cgen.Value("void", 'k_' + self._kernel.name), '__global__' ), self._components['KERNEL_ARG_DECLS'] ), func )
def _generate_lib_specific_args(self): self._components['LIB_ARG_DECLS'] = [ cgen.Const(cgen.Value(host.int32_str, '_NUM_THREADS')), cgen.Const(cgen.Value(host.int32_str, '_N_START')), cgen.Const(cgen.Value(host.int32_str, '_N_LOCAL')), cgen.Const(cgen.Value(host.int32_str, '_LIST_OFFSET')), cgen.Const( cgen.Pointer( cgen.Value( host.int32_str, Restrict(self._cc.restrict_keyword, '_CELL_LIST')))), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_CRL')), )), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_CCC')), )), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_OFFSET')), )), cgen.Pointer( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_JSTORE')), )), self.loop_timer.get_cpp_arguments_ast() ]
def _generate_lib_specific_args(self): self._components['LIB_ARG_DECLS'] = [ cgen.Const( cgen.Pointer( cgen.Value( host.int32_str, Restrict(self._cc.restrict_keyword, '_H_BLOCKSIZE')))), cgen.Const( cgen.Pointer( cgen.Value( host.int32_str, Restrict(self._cc.restrict_keyword, '_H_THREADSIZE')))), cgen.Const(cgen.Value(host.int32_str, '_H_N_LOCAL')), self.loop_timer.get_cpp_arguments_ast() ]
def _generate_lib_specific_args(self): self._components['LIB_ARG_DECLS'] = [ cgen.Const(cgen.Value(host.int64_str, '_NUM_THREADS')), cgen.Const(cgen.Value(host.int64_str, '_N_LOCAL')), cgen.Const(cgen.Value(host.int64_str, '_STRIDE')), cgen.Const( cgen.Pointer( cgen.Value(host.int64_str, Restrict(self._cc.restrict_keyword, '_NN')))), cgen.Const( cgen.Pointer( cgen.Value(host.int64_str, Restrict(self._cc.restrict_keyword, '_NLIST')), )), self.loop_timer.get_cpp_arguments_ast() ]
def generate_optimmat_code(self, pos, name=None): """Generates the code for computing the local optimization matrix for the optimization over site nr. `pos` The function has the following signature: DTYPE const *const A, DTYPE const *const X_0, ..., DTYPE const *const X_N, DTYPE *const result :param pos: The local tensor to copy (should be `< len(X)`) :param name: Name of the C function (default: get_optimmat_%(pos)) :returns: cgen.FunctionBody with given name """ name = 'get_optimmat_%i' % pos if name is None else name finalization_src = ''' if (mid < {nr_meas:d}) {{ for (uint i = 0; i < {pdim:d}; ++i) {{ for (uint k_l = 0; k_l < {rank_l:d}; ++k_l) {{ for (uint k_r = 0; k_r < {rank_r:d}; ++k_r) {{ result[mid * {rank_l:d} * {pdim:d} * {rank_r:d} + k_l * {pdim:d} * {rank_r:d} + i * {rank_r:d} + k_r] = left_c[k_l] * current_row[{offset:d} + i] * right_c[k_r]; }} }} }} }} '''.format(nr_meas=self._meas, pdim=self._dims[pos], rank_l=1 if pos == 0 else self._ranks[pos - 1], rank_r=1 if pos == self._sites - 1 else self._ranks[pos], offset=sum(self._dims[:pos])) finalization = c.LiteralLines(finalization_src) arg_decls = [ConstPointerToConstDecl(self._dtype, 'A')] arg_decls += [ ConstPointerToConstDecl(self._dtype, 'X%i' % i) for i in range(self._sites) ] arg_decls += [c.Pointer(c.Const(c.POD(self._dtype, 'result')))] return c.FunctionBody( ccu.CudaGlobal( c.FunctionDeclaration(c.Value('void', 'get_optimmat_%i' % pos), arg_decls=arg_decls)), c.Block( self.declaration(pos) + self.left_contractions(pos) + self.right_contractions(pos) + [finalization]))
def _generate_kernel_call(self): kernel_call = cgen.Module([ cgen.Comment('#### Kernel call arguments ####'), cgen.Initializer( cgen.Const( cgen.Value('int', self._components['OMP_THREAD_INDEX_SYM'])), 'omp_get_thread_num()') ]) kernel_call_symbols = [] shared_syms = self._components['OMP_SHARED_SYMS'] if self._kernel.static_args is not None: for i, dat in enumerate(self._kernel.static_args.items()): kernel_call_symbols.append(dat[0]) for i, dat in enumerate(self._dat_dict.items()): if issubclass(type(dat[1][0]), host._Array): sym = dat[0] if issubclass(type(dat[1][0]), data.GlobalArrayClassic): sym += '[' + self._components['OMP_THREAD_INDEX_SYM'] + ']' kernel_call_symbols.append(sym) shared_syms.append(dat[0]) elif issubclass(type(dat[1][0]), host.Matrix): call_symbol = dat[0] + '_c' kernel_call_symbols.append(call_symbol) nc = str(dat[1][0].ncomp) _ishift = '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + nc isym = dat[0] + _ishift g = cgen.Value('_' + dat[0] + '_t', call_symbol) g = cgen.Initializer(g, '{ ' + isym + '}') kernel_call.append(g) shared_syms.append(dat[0]) else: print("ERROR: Type not known") kernel_call.append(cgen.Comment('#### Kernel call ####')) kernel_call_symbols_s = '' for sx in kernel_call_symbols: kernel_call_symbols_s += sx + ',' kernel_call_symbols_s = kernel_call_symbols_s[:-1] kernel_call.append( cgen.Line('k_' + self._kernel.name + '(' + kernel_call_symbols_s + ');')) self._components['LIB_KERNEL_CALL'] = kernel_call
def _generate_lib_specific_args(self): self._components['LIB_ARG_DECLS'] = [ cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict( self._cc.restrict_keyword,'_H_BLOCKSIZE' ) ) ) ), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict( self._cc.restrict_keyword,'_H_THREADSIZE' ) ) ) ), cgen.Const(cgen.Value(host.int32_str, '_H_N_LOCAL')), cgen.Const(cgen.Value(host.int32_str, '_H_NMATRIX_STRIDE')), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict( self._cc.restrict_keyword,'_D_NMATRIX' ) ) ) ), self.loop_timer.get_cpp_arguments_ast() ] self._components['LIB_SPECIFIC_KERNEL_ARGS'] = ['_H_N_LOCAL','_H_NMATRIX_STRIDE','_D_NMATRIX'] self._components['KERNEL_ARG_DECLS'] = [ cgen.Const(cgen.Value(host.int32_str, '_D_N_LOCAL')), cgen.Const(cgen.Value(host.int32_str, '_D_NMATRIX_STRIDE')), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict( self._cc.restrict_keyword,'_D_NMATRIX' ) ) ) ) ] self._components['KERNEL_LIB_ARG_DECLS'] = []
def declaration(self, pos): """Generates the declarative instructions for the optimizations over sites nr. `pos` :param pos: The local tensor to copy (should be `< len(X)`) :returns: List containing cgen Statements """ max_ltens_size = max(self._ltens_sizes) max_left_size = 1 if pos == 0 else max(self._ranks[:pos]) max_right_size = 1 if pos == self._sites - 1 else max( self._ranks[pos:]) max_tmat_size = max(self._ranks[i] * self._ranks[i + 1] for i in range(self._sites - 2)) init_statements = [ c.LineComment( "Define the row number the current thread is operating on"), c.Initializer(c.Const(c.POD(np.int32, 'mid')), 'threadIdx.x + blockIdx.x * blockDim.x'), c.LineComment("Allocate shared memory for the local tensors"), ccu.CudaShared( c.ArrayOf(c.POD(self._dtype, 'x_shared'), max_ltens_size)), c.LineComment( "Allocate the left-, right-, and transfer contractions"), c.ArrayOf(c.POD(self._dtype, 'left_c'), max_left_size), c.ArrayOf(c.POD(self._dtype, 'right_c'), max_right_size), c.ArrayOf(c.POD(self._dtype, 'tmat_c'), max_tmat_size), c.ArrayOf(c.POD(self._dtype, 'buf_c'), max(max_right_size, max_left_size)), c.LineComment("Shortcut for current row of design matrix"), c.LineComment("Carefull, current_row might be out of bounds!"), ConstPointerToConst(self._dtype, 'current_row', 'A + (mid * %i)' % sum(self._dims)) ] return init_statements
def ConstPointerToConst(dtype, name, value): """Returns a cgen variable declaration & assignment of a constant pointer to a constant of type `dtype` """ return c.Constant(c.Pointer(c.Const(c.POD(dtype, name))), value)
def _generate_kernel_func(self): IX = self._components['LIB_PAIR_INDEX_0'] IY = self._components['LIB_PAIR_INDEX_1'] CX = '_CX' CY = '_CY' if_block = cgen.If( IX + '<_D_N_LOCAL', cgen.Block([ self._components['KERNEL_GATHER'], cgen.Initializer(cgen.Const(cgen.Value(host.int32_str, CX)), '_D_CRL[' + IX +']'), cgen.For('int _jk=0','_jk<_D_CCC['+CX+']', '_jk++', cgen.Block([ cgen.Initializer( cgen.Const(cgen.Value(host.int32_str, IY)), '_D_L_MATRIX[' + CX+'*_D_N_LAYERS' + '+_jk]' ), cgen.If( IX+'!='+IY, cgen.Block([ self._components['KERNEL_MAPPING'], cgen.Line(self._kernel.code) ]) ), ]) ), cgen.For('int _k=0','_k<_D_N_OFFSETS', '_k++', cgen.Block([ cgen.Initializer(cgen.Const(cgen.Value(host.int32_str, CY)), CX + '+ _D_OFFSETS[_k]'), cgen.For('int _jk=0','_jk<_D_CCC['+CY+']', '_jk++', cgen.Block([ cgen.Initializer( cgen.Const(cgen.Value(host.int32_str, IY)), '_D_L_MATRIX[' + CY+'*_D_N_LAYERS' + '+_jk]' ), #cgen.If(IX+'!='+IY, #cgen.Block([ self._components['KERNEL_MAPPING'], cgen.Line(self._kernel.code) #])) ])) ]) ), self._components['KERNEL_SCATTER'] ]) ) func = cgen.Block([ cgen.Initializer( cgen.Const( cgen.Value( host.int32_str, self._components['LIB_PAIR_INDEX_0'] )), 'threadIdx.x + blockIdx.x*blockDim.x' ), self._components['IF_GATHER'], if_block, self._components['IF_SCATTER'] ]) self._components['KERNEL_FUNC'] = cgen.FunctionBody( cgen.FunctionDeclaration( cgen.DeclSpecifier( cgen.Value("void", 'k_' + self._kernel.name), '__global__' ), self._components['KERNEL_ARG_DECLS'] ), func )
def _data_cref(self, dtype, name): return c.Const(c.Reference(c.Value(dtype, name)))
import cgen as c func = c.FunctionBody( c.FunctionDeclaration(c.Const(c.Pointer(c.Value("char", "greet"))), []), c.Block([c.Statement('return "hello world"')])) code = c.Module([]) code.append(c.Value('int', 'cont')) code.append(c.Assign('cont', '0')) code.append(c.Increment('cont', '5')) print(code)
def _generate_kernel_gather(self): cp = self._components cx = cp['LIB_CELL_CX'] cy = cp['LIB_CELL_CY'] cz = cp['LIB_CELL_CZ'] ncx = cp['N_CELL_X'] ncy = cp['N_CELL_Y'] ncz = cp['N_CELL_Z'] ci = cp['LIB_CELL_INDEX_0'] kernel_gather = cgen.Module([ cgen.Comment('#### Pre kernel gather ####'), # compute the linear cell index cgen.Initializer( cgen.Const(cgen.Value('INT64', ci)), cx + '+' + ncx + '*(' + cy + '+' + ncy + '*' + cz + ')'), # get the thread index cgen.Initializer( cgen.Const( cgen.Value('int', self._components['OMP_THREAD_INDEX_SYM'])), 'omp_get_thread_num()') ]) # partition this threads space for temporary vars self._components['PARTICLE_DAT_PARTITION'] = \ DSLPartitionTempSpace(self._dat_dict, self._components['CCC_MAX'], '_GATHER_SPACE[_threadid]', extras=((cp['TMP_INDEX'], 1, INT64),)) kernel_gather.append( self._components['PARTICLE_DAT_PARTITION'].ptr_init) src_sym = '__tmp_gpx' dst_sym = cp['CCC_0'] record_local = DSLRecordLocal( ind_sym=src_sym, nlocal_sym=cp['N_LOCAL'], store_sym=cp['PARTICLE_DAT_PARTITION'].idict[cp['TMP_INDEX']], store_ind_sym=dst_sym, count_sym=cp['I_LOCAL_SYM']) kernel_gather.append(record_local[0]) inner_l = [record_local[1]] # add dats to omp shared and init global array reduction shared_syms = self._components['OMP_SHARED_SYMS'] for i, dat in enumerate(self._dat_dict.items()): obj = dat[1][0] mode = dat[1][1] symbol = dat[0] shared_syms.append(symbol) if issubclass(type(obj), data.GlobalArrayClassic): isym = symbol + '_c' val = symbol + '[' + self._components[ 'OMP_THREAD_INDEX_SYM'] + ']' g = cgen.Pointer(cgen.Value(host.ctypes_map[obj.dtype], isym)) if not mode.write: g = cgen.Const(g) g = cgen.Initializer(g, val) kernel_gather.append(g) if issubclass(type(obj), data.ParticleDat): tsym = cp['PARTICLE_DAT_PARTITION'].idict[symbol] inner_l.append( DSLStrideGather(symbol, tsym, obj.ncomp, src_sym, dst_sym, self._components['CCC_MAX'])) inner_l.append(cgen.Line(dst_sym + '++;')) inner = cgen.Module(inner_l) g = self._components['CELL_LIST_ITER'](src_sym, ci, inner) kernel_gather.append( cgen.Initializer(cgen.Value('INT64', dst_sym), '0')) kernel_gather.append(g) # skip cell if there are not local particles kernel_gather.append( cgen.If(cp['I_LOCAL_SYM'] + '==0', cgen.Block((cgen.Line('continue;'), )))) self._components['LIB_KERNEL_GATHER'] = kernel_gather
def _generate_per_dat(self): # =================== DICT INIT =============================== self._components['KERNEL_ARG_DECLS'] = [ cgen.Const(cgen.Value(host.int32_str, '_D_N_LOCAL')) ] self._components['KERNEL_LIB_ARG_DECLS'] = [] self._components['KERNEL_STRUCT_TYPEDEFS'] = cgen.Module( [cgen.Comment('#### Structs generated per ParticleDat ####')]) self._components['LIB_KERNEL_CALL'] = cgen.Module( [cgen.Comment('#### Kernel call ####')]) kernel_call_symbols = ['_H_N_LOCAL'] self._components['KERNEL_SCATTER'] = cgen.Module( [cgen.Comment('#### kernel scatter ####')]) self._components['KERNEL_GATHER'] = cgen.Module( [cgen.Comment('#### kernel gather ####')]) self._components['IF_SCATTER'] = cgen.Module( [cgen.Comment('#### if scatter ####')]) self._components['IF_GATHER'] = cgen.Module( [cgen.Comment('#### if gather ####')]) self._components['KERNEL_MAPPING'] = cgen.Module( [cgen.Comment('#### kernel symbol mapping ####')]) # =================== Static Args =============================== if self._kernel.static_args is not None: for i, datt in enumerate(self._kernel.static_args.items()): ksym = datt[0] ktype = datt[1] # Add to kernel args g = cgen.Const(cgen.Value(host.ctypes_map[ktype], ksym)) self._components['KERNEL_ARG_DECLS'].append(g) self._components['KERNEL_LIB_ARG_DECLS'].append(g) kernel_call_symbols.append(ksym) # =================== Dynamic Args =============================== for i, datt in enumerate(self._dat_dict.items()): assert type(datt[1]) is tuple, "Access descriptors not found" dati = datt[1][0] ksym = datt[0] dsym = 'd_' + ksym kacc = datt[1][1] # add to lib args kernel_lib_arg = cgen.Pointer( cgen.Value(host.ctypes_map[dati.dtype], Restrict(self._cc.restrict_keyword, ksym))) if type(dati) is cuda_data.GlobalArray or \ issubclass(type(dati), cuda_base.Array): # KERNEL ARGS DECLS ----------------------------- kernel_arg = cgen.Pointer( cgen.Value(host.ctypes_map[dati.dtype], Restrict(self._cc.restrict_keyword, dsym))) if not kacc.write: kernel_arg = cgen.Const(kernel_arg) self._components['KERNEL_ARG_DECLS'].append(kernel_arg) # KERNEL CALL SYMS ----------------------------- kernel_call_symbols.append(ksym) # KERNEL GATHER/SCATTER START ------------------ if not kacc.incremented: a = cgen.Pointer( cgen.Value(host.ctypes_map[dati.dtype], ksym)) a = cgen.Const(a) a = cgen.Initializer(a, dsym) self._components['IF_GATHER'].append(a) else: a = cgen.Initializer( cgen.Value(host.ctypes_map[dati.dtype], ksym + '[' + str(dati.ncomp) + ']'), '{0}') self._components['IF_GATHER'].append(a) # add the scatter code self._components['IF_SCATTER'].append( cgen.Line( generate_reduction_final_stage(dsym, ksym, dati))) # KERNEL GATHER/SCATTER END ------------------ elif issubclass(type(dati), cuda_base.Matrix): # KERNEL ARGS DECLS, STRUCT DECLS ---------------- dtype = dati.dtype ti = cgen.Pointer( cgen.Value(ctypes_map(dtype), Restrict(self._cc.restrict_keyword, 'i'))) if not kacc.write: ti = cgen.Const(ti) typename = '_' + ksym + '_t' self._components['KERNEL_STRUCT_TYPEDEFS'].append( cgen.Typedef(cgen.Struct('', [ti], typename))) # add to kernel args kernel_arg = cgen.Pointer( cgen.Value(host.ctypes_map[dati.dtype], Restrict(self._cc.restrict_keyword, dsym))) if not kacc.write: kernel_arg = cgen.Const(kernel_arg) self._components['KERNEL_ARG_DECLS'].append(kernel_arg) # KERNEL CALL SYMS ----------------------------- kernel_call_symbols.append(ksym) # KERNEL GATHER/SCATTER START ------------------ nc = str(dati.ncomp) _ishift = '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + nc isym = dsym + _ishift g = cgen.Value(typename, ksym) g = cgen.Initializer(g, '{ ' + isym + '}') self._components['KERNEL_MAPPING'].append(g) # KERNEL GATHER/SCATTER END ------------------ # END OF IF ------------------------ # add to lib args if not kacc.write: kernel_lib_arg = cgen.Const(kernel_lib_arg) self._components['KERNEL_LIB_ARG_DECLS'].append(kernel_lib_arg) # KERNEL CALL SYMS ----------------------------- kernel_call_symbols_s = '' for sx in kernel_call_symbols: kernel_call_symbols_s += sx + ',' kernel_call_symbols_s = kernel_call_symbols_s[:-1] self._components['LIB_KERNEL_CALL'].append( cgen.Module([ cgen.Value('dim3', '_B'), cgen.Value('dim3', '_T'), cgen.Assign('_B.x', '_H_BLOCKSIZE[0]'), cgen.Assign('_B.y', '_H_BLOCKSIZE[1]'), cgen.Assign('_B.z', '_H_BLOCKSIZE[2]'), cgen.Assign('_T.x', '_H_THREADSIZE[0]'), cgen.Assign('_T.y', '_H_THREADSIZE[1]'), cgen.Assign('_T.z', '_H_THREADSIZE[2]') ])) self._components['LIB_KERNEL_CALL'].append( cgen.Line('k_' + self._kernel.name + '<<<_B,_T>>>(' + kernel_call_symbols_s + ');')) self._components['LIB_KERNEL_CALL'].append( cgen.Line('checkCudaErrors(cudaDeviceSynchronize());'))
def _generate_lib_specific_args(self): cp = self._components ncx = cp['N_CELL_X'] ncy = cp['N_CELL_Y'] ncz = cp['N_CELL_Z'] npad = cp['N_CELL_PAD'] nloc = cp['N_LOCAL'] exec_count = cp['EXEC_COUNT'] self._components['LIB_ARG_DECLS'] = [ cgen.Const(cgen.Value(host.int32_str, '_NUM_THREADS')), cgen.Const(cgen.Value(host.int64_str, ncx)), cgen.Const(cgen.Value(host.int64_str, ncy)), cgen.Const(cgen.Value(host.int64_str, ncz)), cgen.Const(cgen.Value(host.int64_str, npad)), cgen.Const(cgen.Value(host.int32_str, nloc)), cgen.Const(cgen.Value(host.int32_str, '_LIST_OFFSET')), cgen.Const( cgen.Pointer( cgen.Value( host.int32_str, Restrict(self._cc.restrict_keyword, '_CELL_LIST')))), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_CRL')), )), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_CCC')), )), cgen.Const( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_OFFSET')), )), cgen.Pointer( cgen.Pointer( cgen.Value(host.int32_str, Restrict(self._cc.restrict_keyword, '_JSTORE')), )), cgen.Pointer( cgen.Pointer( cgen.Value( host.uint8_str, Restrict(self._cc.restrict_keyword, '_GATHER_SPACE')), )), cgen.Const(cgen.Value(host.int64_str, self._components['CCC_MAX'])), cgen.Pointer(cgen.Value(host.int64_str, exec_count)), self.loop_timer.get_cpp_arguments_ast() ]
c.Define('OPS_2D', ''), c.Include('ops_seq.h'), c.Include('ops_lib_cpp.h'), c.Include('sources.cpp'), c.Include('velocity-model.cpp'), c.Include('wave-propagation-ops.h'), c.Line(), c.Statement('using namespace std'), c.Line() ]) code.append( c.FunctionBody( c.FunctionDeclaration(c.Value('void', 'wavePropagation'), [ c.Pointer(c.Value('double', 'u_new')), c.Const(c.Pointer(c.Value('double', 'u_current'))), c.Const(c.Pointer(c.Value('double', 'u_previous'))), c.Const(c.Pointer(c.Value('double', 'velocity'))), c.Const(c.Pointer(c.Value('double', 'wx'))), c.Const(c.Pointer(c.Value('double', 'wy'))), c.Const(c.Pointer(c.Value('double', 'zetax'))), c.Const(c.Pointer(c.Value('double', 'zetay'))), c.Const(c.Pointer(c.Value('int', 'idx'))) ]), c.Block([ c.LineComment('Propagates the wave.'), c.Assign( 'u_new[OPS_ACC0(0, 0)]', '(velocity[OPS_ACC3(0, 0)] * velocity[OPS_ACC3(0, 0)]) *\n\ ((-205.0 / 72) * u_current[OPS_ACC1(0, 0)] +\n\ (8.0 / 5) * (u_current[OPS_ACC1(1, 0)] + u_current[OPS_ACC1(-1, 0)] + u_current[OPS_ACC1(0, 1)] + u_current[OPS_ACC1(0, -1)]) +\n\
def _generate_lib_specific_args(self): self._components['LIB_ARG_DECLS'] = [ cgen.Const(cgen.Value(host.int32_str, '_NUM_THREADS')), cgen.Const(cgen.Value(host.int32_str, '_N_LOCAL')), self.loop_timer.get_cpp_arguments_ast() ]
def _generate_kernel_arg_decls(self): _kernel_arg_decls = [] _kernel_lib_arg_decls = [] _kernel_structs = cgen.Module([ cgen.Comment('#### Structs generated per ParticleDat ####') ]) if self._kernel.static_args is not None: for i, dat in enumerate(self._kernel.static_args.items()): arg = cgen.Const(cgen.Value(host.ctypes_map[dat[1]], dat[0])) _kernel_arg_decls.append(arg) _kernel_lib_arg_decls.append(arg) for i, dat in enumerate(self._dat_dict.items()): assert type(dat[1]) is tuple, "Access descriptors not found" obj = dat[1][0] mode = dat[1][1] symbol = dat[0] kernel_lib_arg = cgen.Pointer(cgen.Value(host.ctypes_map[obj.dtype], Restrict(self._cc.restrict_keyword, symbol)) ) if issubclass(type(obj), data.GlobalArrayClassic): kernel_lib_arg = cgen.Pointer(kernel_lib_arg) if issubclass(type(obj), host._Array): kernel_arg = cgen.Pointer(cgen.Value(host.ctypes_map[obj.dtype], Restrict(self._cc.restrict_keyword, symbol)) ) if not mode.write: kernel_arg = cgen.Const(kernel_arg) _kernel_arg_decls.append(kernel_arg) if mode.write is True: assert issubclass(type(obj), data.GlobalArrayClassic),\ "global array must be a thread safe type for \ write access. Type is:" + str(type(obj)) elif issubclass(type(dat[1][0]), host.Matrix): # MAKE STRUCT TYPE dtype = dat[1][0].dtype ti = cgen.Pointer(cgen.Value(ctypes_map(dtype), Restrict(self._cc.restrict_keyword,'i'))) tj = cgen.Pointer(cgen.Value(ctypes_map(dtype), Restrict(self._cc.restrict_keyword,'j'))) if not dat[1][1].write: ti = cgen.Const(ti) tj = cgen.Const(tj) typename = '_'+dat[0]+'_t' _kernel_structs.append(cgen.Typedef(cgen.Struct('', [ti,tj], typename))) # MAKE STRUCT ARG _kernel_arg_decls.append(cgen.Value(typename, dat[0])) if not dat[1][1].write: kernel_lib_arg = cgen.Const(kernel_lib_arg) _kernel_lib_arg_decls.append(kernel_lib_arg) self._components['KERNEL_ARG_DECLS'] = _kernel_arg_decls self._components['KERNEL_LIB_ARG_DECLS'] = _kernel_lib_arg_decls self._components['KERNEL_STRUCT_TYPEDEFS'] = _kernel_structs
def ConstPointerToConstDecl(dtype, name): """Returns a cgen variable declaration of a constant pointer to a constant of type `dtype` """ return c.Const(c.Pointer(c.Const(c.POD(dtype, name))))