def op2_gen_openacc(master, date, consts, kernels): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1; OP_GBL = 2; OP_MAP = 3; OP_READ = 1; OP_WRITE = 2; OP_RW = 3; OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] any_soa = 0 for nk in range (0,len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) ########################################################################## # create new kernel file ########################################################################## for nk in range (0,len(kernels)): name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) optidxs = [0]*nargs indopts = [-1]*nargs nopts = 0 for i in range(0,nargs): if optflags[i] == 1 and maps[i] == OP_ID: optidxs[i] = nopts nopts = nopts+1 elif optflags[i] == 1 and maps[i] == OP_MAP: if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination optidxs[i] = nopts indopts[inds[i]-1] = i nopts = nopts+1 else: optidxs[i] = optidxs[invinds[inds[i]-1]] # # set two logicals # j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE: j = i reduct = j >= 0 ########################################################################## # start with the user kernel function ########################################################################## FORTRAN = 0; CPP = 1; g_m = 0; file_text = '' depth = 0 comm('user function') #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0,nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('int direct_'+name+'_stride_OP2CONSTANT;') code('int direct_'+name+'_stride_OP2HOST=-1;') dir_soa = g_m break comm('user function') file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b'+name+'\\b') i = p.search(kernel_text).start() if(i < 0): print "\n********" print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') signature_text = kernel_text[i:i+j] l = signature_text[0:].find('(') head_text = signature_text[0:l].strip() #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l+1:m] body_text = kernel_text[i+j+1:k] # check for number of arguments if len(signature_text.split(',')) != nargs_novec: print 'Error parsing user kernel('+name+'): must have '+str(nargs_novec)+' arguments' return for i in range(0,nargs_novec): var = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i]: var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) signature_text = '//#pragma acc routine\ninline ' + head_text + '( '+signature_text + ') {' file_text += signature_text + body_text + '}\n' ########################################################################## # then C++ stub function ########################################################################## code('') comm(' host stub function') code('void op_par_loop_'+name+'(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args)-1]: code('op_arg ARG){'); code('') else: code('op_arg ARG,') for g_m in range (0,nargs): if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ: code('TYP*ARGh = (TYP *)ARG.data;') code('int nargs = '+str(nargs)+';') code('op_arg args['+str(nargs)+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: code('ARG.idx = 0;') code('args['+str(g_m)+'] = ARG;') v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] if (optflags[g_m] == 1): argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' else: argtyp = 'op_arg_dat(' FOR('v','1',str(sum(v))) code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m]>0: pass else: code('args['+str(g_m)+'] = ARG;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc('+str(nk)+');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' +str(nk)+ '].name = name;') code('OP_kernels[' +str(nk)+ '].count += 1;') code('') # # indirect bits # if ninds>0: code('int ninds = '+str(ninds)+';') line = 'int inds['+str(nargs)+'] = {' for m in range(0,nargs): line += str(inds[m]-1)+',' code(line[:-1]+'};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: '+name+'\\n");') ENDIF() code('') comm(' get plan') code('#ifdef OP_PART_SIZE_'+ str(nk)) code(' int part_size = OP_PART_SIZE_'+str(nk)+';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('') code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') code('') for g_m in range(0,nargs): if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ: if not dims[g_m].isdigit() or int(dims[g_m]) > 1: print 'ERROR: OpenACC does not support multi-dimensional op_arg_gbl variables' exit(-1) code('TYP ARG_l = ARGh[0];') if ninds > 0: code('') code('int ncolors = 0;') code('') IF('set->size >0') code('') #managing constants if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;') ENDIF() if dir_soa<>-1: IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;') ENDIF() code('') comm('Set up typed device pointers for OpenACC') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;') code('') for g_m in range(0,nargs): if maps[g_m] == OP_ID: code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;') for m in range(1,ninds+1): g_m = invinds[m-1] code('TYP *data'+str(g_m)+' = (TYP *)ARG.data_d;') # # kernel call for indirect version # if ninds>0: code('') code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') code('ncolors = Plan->ncolors;') code('int *col_reord = Plan->col_reord;') code('int set_size1 = set->size + set->exec_size;') code('') comm(' execute plan') FOR('col','0','Plan->ncolors') IF('col==1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('int start = Plan->col_offsets[0][col];') code('int end = Plan->col_offsets[0][col+1];') code('') # code('#pragma omp parallel for') line = '#pragma acc parallel loop independent deviceptr(col_reord,' if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] line = line + 'map'+str(mapinds[g_m])+',' for g_m in range(0,nargs): if maps[g_m] == OP_ID: line = line+'data'+str(g_m)+',' for m in range(1,ninds+1): g_m = invinds[m-1] line = line + 'data'+str(g_m)+',' line = line[:-1]+')' if reduct: for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE: if accs[g_m] == OP_INC: line = line + ' reduction(+:arg'+str(g_m)+'_l)' if accs[g_m] == OP_MIN: line = line + ' reduction(min:arg'+str(g_m)+'_l)' if accs[g_m] == OP_MAX: line = line + ' reduction(max:arg'+str(g_m)+'_l)' code(line) FOR('e','start','end') code('int n = col_reord[e];') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ '[n + set_size1 * '+str(idxs[g_m])+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const TYP* ARG_vec[] = {\n' else: line = 'TYP* ARG_vec[] = {\n' v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] indent = ' '*(depth+2) for k in range(0,sum(v)): if soaflags[g_m]: line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n' else: line = line + indent + ' &data'+str(first)+'[DIM * map'+str(mapinds[g_m+k])+'idx],\n' line = line[:-2]+'};' code(line) code('') line = name+'(' indent = '\n'+' '*(depth+2) for g_m in range(0,nargs): if maps[g_m] == OP_ID: if soaflags[g_m]: line = line + indent + '&data'+str(g_m)+'[n]' else: line = line + indent + '&data'+str(g_m)+'['+str(dims[g_m])+' * n]' if maps[g_m] == OP_MAP: if vectorised[g_m]: if g_m+1 in unique_args: line = line + indent + 'arg'+str(g_m)+'_vec' else: if soaflags[g_m]: line = line + indent + '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]' else: line = line + indent + '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' if maps[g_m] == OP_GBL: line = line + indent +'&arg'+str(g_m)+'_l' if g_m < nargs-1: if g_m+1 in unique_args and not g_m+1 == unique_args[-1]: line = line +',' else: line = line +');' code(line) ENDFOR() code('') if reduct: comm(' combine reduction data') IF('col == Plan->ncolors_owned-1') for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m]==OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') ENDFOR() elif accs[g_m]==OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: error('internal error: invalid reduction option') ENDFOR() ENDIF() ENDFOR() # # kernel call for direct version # else: line = '#pragma acc parallel loop independent deviceptr(' for g_m in range(0,nargs): if maps[g_m] == OP_ID: line = line+'data'+str(g_m)+',' line = line[:-1]+')' if reduct: for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE: if accs[g_m] == OP_INC: line = line + ' reduction(+:arg'+str(g_m)+'_l)' if accs[g_m] == OP_MIN: line = line + ' reduction(min:arg'+str(g_m)+'_l)' if accs[g_m] == OP_MAX: line = line + ' reduction(max:arg'+str(g_m)+'_l)' code(line) FOR('n','0','set->size') line = name+'(' indent = '\n'+' '*(depth+2) for g_m in range(0,nargs): if maps[g_m] == OP_ID: if soaflags[g_m]: line = line + indent + '&data'+str(g_m)+'[n]' else: line = line + indent + '&data'+str(g_m)+'['+str(dims[g_m])+'*n]' if maps[g_m] == OP_GBL: line = line + indent +'&arg'+str(g_m)+'_l' if g_m < nargs-1: line = line +',' else: line = line +');' code(line) ENDFOR() if ninds>0: code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') ENDIF() code('') #zero set size issues if ninds>0: IF('set_size == 0 || set_size == set->core_size || ncolors == 1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() # # combine reduction data from multiple OpenMP threads # comm(' combine reduction data') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: if ninds==0: #direct version only if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m]==OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') elif accs[g_m]==OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: print 'internal error: invalid reduction option' if typs[g_m] == 'double': #need for both direct and indirect code('op_mpi_reduce_double(&ARG,ARGh);') elif typs[g_m] == 'float': code('op_mpi_reduce_float(&ARG,ARGh);') elif typs[g_m] == 'int': code('op_mpi_reduce_int(&ARG,ARGh);') else: print 'Type '+typs[g_m]+' not supported in OpenACC code generator, please add it' exit(-1) code('op_mpi_set_dirtybit_cuda(nargs, args);') code('') # # update kernel record # comm(' update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' for g_m in range (0,nargs): if optflags[g_m]==1: IF('ARG.opt') if maps[g_m]<>OP_GBL: if accs[g_m]==OP_READ: code(line+' ARG.size;') else: code(line+' ARG.size * 2.0f;') if optflags[g_m]==1: ENDIF() depth -= 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('openacc'): os.makedirs('openacc') fid = open('openacc/'+name+'_acckernel.c','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text ='' comm(' global constants ') for nc in range (0,len(consts)): if consts[nc]['dim']==1: code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') code('') comm(' header ') if os.path.exists('./user_types.h'): code('#include "../user_types.h"') code('#include "op_lib_c.h" ') code('') code('void op_decl_const_char(int dim, char const *type,') code('int size, char *dat, char const *name){}') comm(' user kernel files') for nk in range(0,len(kernels)): code('#include "'+kernels[nk]['name']+'_acckernel.c"') master = master.split('.')[0] fid = open('openacc/'+master.split('.')[0]+'_acckernels.c','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_cuda_simple(master, date, consts, kernels,sets): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1; OP_GBL = 2; OP_MAP = 3; OP_READ = 1; OP_WRITE = 2; OP_RW = 3; OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] inc_stage=1 ########################################################################## # create new kernel file ########################################################################## for nk in range (0,len(kernels)): name = kernels[nk]['name'] nargs = kernels[nk]['nargs'] dims = kernels[nk]['dims'] maps = kernels[nk]['maps'] var = kernels[nk]['var'] typs = kernels[nk]['typs'] accs = kernels[nk]['accs'] idxs = kernels[nk]['idxs'] inds = kernels[nk]['inds'] soaflags = kernels[nk]['soaflags'] ninds = kernels[nk]['ninds'] inddims = kernels[nk]['inddims'] indaccs = kernels[nk]['indaccs'] indtyps = kernels[nk]['indtyps'] invinds = kernels[nk]['invinds'] mapnames = kernels[nk]['mapnames'] invmapinds = kernels[nk]['invmapinds'] mapinds = kernels[nk]['mapinds'] nmaps = 0 if ninds > 0: nmaps = max(mapinds)+1 vec = [m for m in range(0,nargs) if int(idxs[m])<0 and maps[m] == OP_MAP] if len(vec) > 0: unique_args = [1]; vec_counter = 1; vectorised = [] new_dims = [] new_maps = [] new_vars = [] new_typs = [] new_accs = [] new_idxs = [] new_inds = [] new_soaflags = [] for m in range(0,nargs): if int(idxs[m])<0 and maps[m] == OP_MAP: if m > 0: unique_args = unique_args + [len(new_dims)+1] temp = [0]*(-1*int(idxs[m])) for i in range(0,-1*int(idxs[m])): temp[i] = var[m] new_vars = new_vars+temp for i in range(0,-1*int(idxs[m])): temp[i] = typs[m] new_typs = new_typs+temp for i in range(0,-1*int(idxs[m])): temp[i] = dims[m] new_dims = new_dims+temp new_maps = new_maps+[maps[m]]*int(-1*int(idxs[m])) new_soaflags = new_soaflags+[0]*int(-1*int(idxs[m])) new_accs = new_accs+[accs[m]]*int(-1*int(idxs[m])) for i in range(0,-1*int(idxs[m])): new_idxs = new_idxs+[i] new_inds = new_inds+[inds[m]]*int(-1*int(idxs[m])) vectorised = vectorised + [vec_counter]*int(-1*int(idxs[m])) vec_counter = vec_counter + 1; else: if m > 0: unique_args = unique_args + [len(new_dims)+1] new_dims = new_dims+[dims[m]] new_maps = new_maps+[maps[m]] new_accs = new_accs+[int(accs[m])] new_soaflags = new_soaflags+[soaflags[m]] new_idxs = new_idxs+[int(idxs[m])] new_inds = new_inds+[inds[m]] new_vars = new_vars+[var[m]] new_typs = new_typs+[typs[m]] vectorised = vectorised+[0] dims = new_dims maps = new_maps accs = new_accs idxs = new_idxs inds = new_inds var = new_vars typs = new_typs soaflags = new_soaflags; nargs = len(vectorised); for i in range(1,ninds+1): for index in range(0,len(inds)+1): if inds[index] == i: invinds[i-1] = index break else: vectorised = [0]*nargs unique_args = range(1,nargs+1) cumulative_indirect_index = [-1]*nargs; j = 0; for i in range (0,nargs): if maps[i] == OP_MAP and ((not inc_stage) or accs[i] == OP_INC): cumulative_indirect_index[i] = j j = j + 1 any_soa = 0 any_soa = any_soa or sum(soaflags) # # set two logicals # j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ: j = i reduct = j >= 0 if inc_stage: ninds_staged = 0 inds_staged = [-1]*nargs for i in range(0,nargs): if maps[i]==OP_MAP and accs[i]==OP_INC: if inds_staged[invinds[inds[i]-1]] == -1: inds_staged[i] = ninds_staged ninds_staged = ninds_staged + 1 else: inds_staged[i] = inds_staged[invinds[inds[i]-1]] invinds_staged = [-1]*ninds_staged inddims_staged = [-1]*ninds_staged indopts_staged = [-1]*ninds_staged for i in range(0,nargs): if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: invinds_staged[inds_staged[i]] = i inddims_staged[inds_staged[i]] = dims[i] # if optflags[i] == 1: # indopts_staged[inds_staged[i]] = i for i in range(0,nargs): inds_staged[i] = inds_staged[i] + 1 ########################################################################## # start with CUDA kernel function ########################################################################## FORTRAN = 0; CPP = 1; g_m = 0; file_text = '' depth = 0 #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('__constant__ int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0,nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('__constant__ int direct_'+name+'_stride_OP2CONSTANT;') code('int direct_'+name+'_stride_OP2HOST=-1;') dir_soa = g_m break comm('user function') found = 0 for files in glob.glob( "*.h" ): f = open( files, 'r' ) for line in f: match = re.search(r''+'\\b'+name+'\\b', line) if match : file_name = f.name found = 1; break if found == 1: break; if found == 0: print "COUND NOT FIND KERNEL", name f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b'+name+'\\b') i = p.search(kernel_text).start() if(i < 0): print "\n********" print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') signature_text = kernel_text[i:i+j] l = signature_text[0:].find('(') head_text = signature_text[0:l] #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l+1:m] body_text = kernel_text[i+j+1:k] # check for number of arguments if len(signature_text.split(',')) != nargs: print 'Error parsing user kernel(%s): must have %d arguments' \ % name, nargs return for i in range(0,nargs): var = signature_text.split(',')[i].strip() if soaflags[i] and not (maps[i] == OP_MAP and accs[i] == OP_INC): var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) body_text = re.sub(r''+var2+'\[([A-Za-z0-9]*)\]'+'', var2+r'[\1*'+op2_gen_common.get_stride_string(i,maps,mapnames,name)+']', body_text) signature_text = '__device__ '+head_text + '_gpu( '+signature_text + ') {' file_text += signature_text + body_text + '}\n' comm('') comm(' CUDA kernel function') if FORTRAN: code('subroutine op_cuda_'+name+'(') elif CPP: code('__global__ void op_cuda_'+name+'(') depth = 2 for g_m in range(0,ninds): if (indaccs[g_m]==OP_READ): code('const INDTYP *__restrict INDARG,') else: code('INDTYP *__restrict INDARG,') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('const int *__restrict opDat'+str(invinds[inds[g_m]-1])+'Map, ') for g_m in range(0,nargs): if maps[g_m] == OP_ID: if accs[g_m] == OP_READ: code('const TYP *__restrict ARG,') else: code('TYP *ARG,') elif maps[g_m] == OP_GBL: if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: code('TYP *ARG,') elif accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: code('const TYP *ARG,') if ind_inc and inc_stage==1: code('int *ind_map,') code('short *arg_map,') code('int *ind_arg_sizes,') code('int *ind_arg_offs, ') if ninds>0: code('int block_offset, ') code('int *blkmap, ') code('int *offset, ') code('int *nelems, ') code('int *ncolors, ') code('int *colors, ') code('int nblocks, ') code('int set_size) { ') else: code('int set_size ) {') code('') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: code('TYP ARG_l[DIM];') if accs[g_m] == OP_INC: FOR('d','0','DIM') code('ARG_l[d]=ZERO_TYP;') ENDFOR() else: FOR('d','0','DIM') code('ARG_l[d]=ARG[d+blockIdx.x*DIM];') ENDFOR() elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC: code('TYP ARG_l[DIM];') for m in range (1,ninds+1): g_m = m -1 v = [int(inds[i]==m) for i in range(len(inds))] v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m] if sum(v)>1 and sum(v_i)>0: #check this sum(v_i) if indaccs[m-1] == OP_INC: ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 code('INDTYP *ARG_vec['+str(ind)+'] = {'); depth += 2; for n in range(0,nargs): if inds[n] == m: g_m = n code('ARG_l,') depth -= 2 code('};') else: ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 code('INDTYP *ARG_vec['+str(ind)+'];') # # lengthy code for general case with indirection # if ninds>0: code('') if inc_stage==1: for g_m in range (0,ninds): if indaccs[g_m] == OP_INC: code('__shared__ int *INDARG_map, INDARG_size;') code('__shared__ INDTYP *INDARG_s;') code('') if ind_inc: code('__shared__ int nelems2, ncolor;') code('__shared__ int nelem, offset_b;') code('') code('extern __shared__ char shared[];') code('') IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks') code('return;') ENDIF() IF('threadIdx.x==0') code('') comm('get sizes and shift pointers and direct-mapped data') code('') code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset];') code('') code('nelem = nelems[blockId];') code('offset_b = offset[blockId];') code('') if ind_inc: code('nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x);') code('ncolor = ncolors[blockId];') code('') if inc_stage==1 and ind_inc: for g_m in range (0,ninds_staged): code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds_staged)+'];') code('') for m in range (1,ninds_staged+1): g_m = m - 1 c = [i for i in range(nargs) if inds_staged[i]==m] code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\ '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];') code('') comm('set shared memory pointers') code('int nbytes = 0;') for g_m in range(0,ninds_staged): code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_s = ('+typs[invinds_staged[g_m]]+' *) &shared[nbytes];') if g_m < ninds_staged-1: code('nbytes += ROUND_UP(ind_ARG_size*sizeof('+typs[invinds_staged[g_m]]+')*'+dims[invinds_staged[g_m]]+');') ENDIF() code('__syncthreads(); // make sure all of above completed') code('') if inc_stage==1: for g_m in range(0,ninds): if indaccs[g_m] == OP_INC: FOR_INC('n','threadIdx.x','ind_ARG_size*INDDIM','blockDim.x') code('ind_ARG_s[n] = ZERO_INDTYP;') ENDFOR() if ind_inc: code('') code('__syncthreads();') code('') if ind_inc: FOR_INC('n','threadIdx.x','nelems2','blockDim.x') code('int col2 = -1;') k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') IF('n<nelem') comm('initialise local variables') for g_m in range(0,nargs): if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: FOR('d','0','DIM') code('ARG_l[d] = ZERO_TYP;') ENDFOR() else: FOR_INC('n','threadIdx.x','nelem','blockDim.x') k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') # # simple alternative when no indirection # else: code('') comm('process set elements') FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x') # # kernel call # code('') comm('user-supplied kernel call') line = name+'_gpu(' prefix = ' '*len(name) a = 0 #only apply indentation if its not the 0th argument indent ='' for m in range (0, nargs): if a > 0: indent = ' '+' '*len(name) if maps[m] == OP_GBL: if accs[m] == OP_READ: line += rep(indent+'ARG,\n',m) else: line += rep(indent+'ARG_l,\n',m); a =a+1 elif maps[m]==OP_MAP and accs[m]==OP_INC: line += rep(indent+'ARG_l,\n',m) a =a+1 elif maps[m]==OP_MAP: if soaflags[m]: line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx,'+'\n',m) else: line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx*DIM,'+'\n',m) a =a+1 elif maps[m]==OP_ID: if ninds>0: if soaflags[m]: line += rep(indent+'ARG+(n+offset_b),\n',m) else: line += rep(indent+'ARG+(n+offset_b)*DIM,\n',m) a =a+1 else: if soaflags[m]: line += rep(indent+'ARG+n,\n',m) else: line += rep(indent+'ARG+n*DIM,\n',m) a =a+1 else: print 'internal error 1 ' code(line[0:-2]+');') #remove final ',' and \n # # updating for indirect kernels ... # if ninds>0: if ind_inc: code('col2 = colors[n+offset_b];') ENDIF() code('') comm('store local variables') code('') if inc_stage==1: for g_m in range(0,nargs): if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: code('int ARG_map;') IF('col2>=0') for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: code('ARG_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];') ENDIF() code('') FOR('col','0','ncolor') IF('col2==col') if inc_stage==1: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[invinds[g_m]]: code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s[ARG_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size];') else: code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+ARG_map*DIM];') for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[invinds[g_m]]: code('ind_arg'+str(inds[g_m]-1)+'_s[ARG_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size] = ARG_l['+str(d)+'];') else: code('ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+ARG_map*DIM] = ARG_l['+str(d)+'];') else: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx];') else: code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*DIM];') for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx] = ARG_l['+str(d)+'];') else: code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*DIM] = ARG_l['+str(d)+'];') ENDFOR() code('__syncthreads();') ENDFOR() ENDFOR() if inc_stage: for g_m in range(0,ninds): if indaccs[g_m]==OP_INC: if soaflags[invinds[g_m]]: FOR_INC('n','threadIdx.x','INDARG_size','blockDim.x') for d in range(0,int(dims[invinds[g_m]])): code('arg'+str(invinds[g_m])+'_l['+str(d)+'] = INDARG_s[n+'+str(d)+'*INDARG_size] + INDARG[INDARG_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'];') for d in range(0,int(dims[invinds[g_m]])): code('INDARG[INDARG_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] = arg'+str(invinds[g_m])+'_l['+str(d)+'];') ENDFOR() else: FOR_INC('n','threadIdx.x','INDARG_size*INDDIM','blockDim.x') code('INDARG[n%INDDIM+INDARG_map[n/INDDIM]*INDDIM] += INDARG_s[n];') ENDFOR() # # global reduction # if reduct: code('') comm('global reductions') code('') for m in range (0,nargs): g_m = m if maps[m]==OP_GBL and accs[m]<>OP_READ: FOR('d','0','DIM') if accs[m]==OP_INC: code('op_reduction<OP_INC>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);') elif accs[m]==OP_MIN: code('op_reduction<OP_MIN>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);') elif accs[m]==OP_MAX: code('op_reduction<OP_MAX>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);') else: print 'internal error: invalid reduction option' sys.exit(2); ENDFOR() depth -= 2 code('}') code('') ########################################################################## # then C++ stub function ########################################################################## code('') comm('host stub function') code('void op_par_loop_'+name+'(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args)-1]: code('op_arg ARG){') code('') else: code('op_arg ARG,') for g_m in range (0,nargs): if maps[g_m]==OP_GBL: code('TYP*ARGh = (TYP *)ARG.data;') code('int nargs = '+str(nargs)+';') code('op_arg args['+str(nargs)+'];') code('') #print vectorised for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: code('ARG.idx = 0;') code('args['+str(g_m)+'] = ARG;') v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] FOR('v','1',str(sum(v))) code('args['+str(g_m)+' + v] = op_arg_dat(arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m]>0: pass else: code('args['+str(g_m)+'] = ARG;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc('+str(nk)+');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' +str(nk)+ '].name = name;') code('OP_kernels[' +str(nk)+ '].count += 1;') code('') # # indirect bits # if ninds>0: code('') code('int ninds = '+str(ninds)+';') line = 'int inds['+str(nargs)+'] = {' for m in range(0,nargs): line += str(inds[m]-1)+',' code(line[:-1]+'};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: '+name+'\\n");') ENDIF() code('') comm('get plan') code('#ifdef OP_PART_SIZE_'+ str(nk)) code(' int part_size = OP_PART_SIZE_'+str(nk)+';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('') code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') IF('set->size > 0') code('') # # kernel call for indirect version # if ninds>0: if inc_stage==1 and ind_inc: code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);') else: code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);') code('') # # transfer constants # g = [i for i in range(0,nargs) if maps[i] == OP_GBL and accs[i] == OP_READ] if len(g)>0: comm('transfer constants to GPU') code('int consts_bytes = 0;') for m in range(0,nargs): g_m = m if maps[m]==OP_GBL and accs[m]==OP_READ: code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));') code('reallocConstArrays(consts_bytes);') code('consts_bytes = 0;') for m in range(0,nargs): if maps[m]==OP_GBL and accs[m]==OP_READ: g_m = m code('ARG.data = OP_consts_h + consts_bytes;') code('ARG.data_d = OP_consts_d + consts_bytes;') FOR('d','0','DIM') code('((TYP *)ARG.data)[d] = ARGh[d];') ENDFOR() code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));') code('mvConstArraysToDevice(consts_bytes);') code('') #managing constants if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() if dir_soa<>-1: IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() # # transfer global reduction initial data # if ninds == 0: comm('set CUDA execution parameters') code('#ifdef OP_BLOCK_SIZE_'+str(nk)) code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code(' int nthread = OP_block_size;') comm(' int nthread = 128;') code('#endif') code('') code('int nblocks = 200;') code('') if reduct: comm('transfer global reduction data to GPU') if ninds>0: code('int maxblocks = 0;') FOR('col','0','Plan->ncolors') code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);') ENDFOR() else: code('int maxblocks = nblocks;') code('int reduct_bytes = 0;') code('int reduct_size = 0;') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: code('reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));') code('reduct_size = MAX(reduct_size,sizeof(TYP));') code('reallocReductArrays(reduct_bytes);') code('reduct_bytes = 0;') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: code('ARG.data = OP_reduct_h + reduct_bytes;') code('ARG.data_d = OP_reduct_d + reduct_bytes;') FOR('b','0','maxblocks') FOR('d','0','DIM') if accs[g_m]==OP_INC: code('((TYP *)ARG.data)[d+b*DIM] = ZERO_TYP;') else: code('((TYP *)ARG.data)[d+b*DIM] = ARGh[d];') ENDFOR() ENDFOR() code('reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));') code('mvReductArraysToDevice(reduct_bytes);') code('') # # kernel call for indirect version # if ninds>0: comm('execute plan') code('') code('int block_offset = 0;') FOR('col','0','Plan->ncolors') IF('col==Plan->ncolors_core') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('#ifdef OP_BLOCK_SIZE_'+str(nk)) code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code('int nthread = OP_block_size;') code('#endif') code('') code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],') code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);') IF('Plan->ncolblk[col] > 0') if reduct or (inc_stage==1 and ind_inc): if reduct and inc_stage==1: code('int nshared = MAX(Plan->nshared,reduct_size*nthread);') elif reduct: code('int nshared = reduct_size*nthread;') else: code('int nshared = Plan->nsharedCol[col];') code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_'+name+'<<<nblocks,nthread>>>(') for m in range(1,ninds+1): g_m = invinds[m-1] code('(TYP *)ARG.data_d,') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('arg'+str(g_m)+'.map_data_d, ') for g_m in range(0,nargs): if inds[g_m]==0: code('(TYP*)ARG.data_d,') if inc_stage==1 and ind_inc: code('Plan->ind_map,') code('Plan->loc_map,') code('Plan->ind_sizes,') code('Plan->ind_offs,') code('block_offset,') code('Plan->blkmap,') code('Plan->offset,') code('Plan->nelems,') code('Plan->nthrcol,') code('Plan->thrcol,') code('Plan->ncolblk[col],') code('set->size+set->exec_size);') code('') if reduct: comm('transfer global reduction data back to CPU') IF('col == Plan->ncolors_owned-1') code('mvReductArraysToHost(reduct_bytes);') ENDIF() ENDFOR() code('block_offset += Plan->ncolblk[col];') ENDIF() # # kernel call for direct version # else: if reduct: code('int nshared = reduct_size*nthread;') code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_'+name+'<<<nblocks,nthread>>>(') indent = ' '#*(len(name)+42) for g_m in range(0,nargs): if g_m > 0: code(indent+'(TYP *) ARG.data_d,') else: code(indent+'(TYP *) ARG.data_d,') code(indent+'set->size );') if ninds>0: code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') # # transfer global reduction initial data # if reduct: if ninds == 0: comm('transfer global reduction data back to CPU') code('mvReductArraysToHost(reduct_bytes);') for m in range(0,nargs): g_m = m if maps[m]==OP_GBL and accs[m]<>OP_READ: FOR('b','0','maxblocks') FOR('d','0','DIM') if accs[m]==OP_INC: code('ARGh[d] = ARGh[d] + ((TYP *)ARG.data)[d+b*DIM];') elif accs[m]==OP_MIN: code('ARGh[d] = MIN(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);') elif accs[m]==OP_MAX: code('ARGh[d] = MAX(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);') ENDFOR() ENDFOR() code('ARG.data = (char *)ARGh;') code('op_mpi_reduce(&ARG,ARGh);') ENDIF() code('op_mpi_set_dirtybit_cuda(nargs, args);') # # update kernel record # code('cutilSafeCall(cudaDeviceSynchronize());') comm('update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' for g_m in range (0,nargs): if maps[g_m]<>OP_GBL: if accs[g_m]==OP_READ or accs[g_m]==OP_WRITE: code(line+' ARG.size;') else: code(line+' ARG.size * 2.0f;') depth = depth - 2 code('}') ########################################################################## # output individual kernel file ########################################################################## fid = open(name+'_kernel.cu','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text = '' comm('header') code('#include "op_lib_cpp.h"') code('') code('#include "op_cuda_rt_support.h"') code('#include "op_cuda_reduction.h"') code('') comm('global constants') code('#ifndef MAX_CONST_SIZE') code('#define MAX_CONST_SIZE 128') code('#endif') code('') for nc in range (0,len(consts)): if consts[nc]['dim']==1: code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') code('') code('void op_decl_const_char(int dim, char const *type,') code('int size, char *dat, char const *name){') depth = depth + 2 code('if (!OP_hybrid_gpu) return;') for nc in range(0,len(consts)): IF('!strcmp(name,"'+consts[nc]['name']+'")') if consts[nc]['dim'] < 0: IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {') code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);') ENDIF() code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+', dat, dim*size));') ENDIF() code('else ') code('{') depth = depth + 2 code('printf("error: unknown const name\\n"); exit(1);') ENDIF() depth = depth - 2 code('}') code('') comm('user kernel files') for nk in range(0,len(kernels)): file_text = file_text +\ '#include "'+kernels[nk]['name']+'_kernel.cu"\n' master = master.split('.')[0] fid = open(master.split('.')[0]+'_kernels.cu','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_openmp4(master, date, consts, kernels): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1; OP_GBL = 2; OP_MAP = 3; OP_READ = 1; OP_WRITE = 2; OP_RW = 3; OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] op2_compiler = os.getenv('OP2_COMPILER','0'); any_soa = 0 maptype = 'map' for nk in range (0,len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) ########################################################################## # create new kernel file ########################################################################## for nk in range (0,len(kernels)): name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) optidxs = [0]*nargs indopts = [-1]*nargs nopts = 0 for i in range(0,nargs): if optflags[i] == 1 and maps[i] == OP_ID: optidxs[i] = nopts nopts = nopts+1 elif optflags[i] == 1 and maps[i] == OP_MAP: if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination optidxs[i] = nopts indopts[inds[i]-1] = i nopts = nopts+1 else: optidxs[i] = optidxs[invinds[inds[i]-1]] # # set two logicals # j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ: j = i reduct = j >= 0 ########################################################################## # start with the user kernel function ########################################################################## FORTRAN = 0; CPP = 1; g_m = 0; file_text = '' depth = 0 comm('user function') #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0,nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('int direct_'+name+'_stride_OP2CONSTANT;') code('int direct_'+name+'_stride_OP2HOST=-1;') dir_soa = g_m break comm('user function') file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b'+name+'\\b') i = p.search(kernel_text).start() if(i < 0): print "\n********" print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') signature_text = kernel_text[i:i+j] l = signature_text[0:].find('(') head_text = signature_text[0:l] #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l+1:m] body_text = kernel_text[i+j+1:k] # check for number of arguments if len(signature_text.split(',')) != nargs_novec: print 'Error parsing user kernel(%s): must have %d arguments' \ % name, nargs return for i in range(0,nargs_novec): var = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i]: var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) for nc in range(0,len(consts)): varname = consts[nc]['name'] body_text = re.sub('\\b'+varname+'\\b', varname+'_ompkernel',body_text) # if consts[nc]['dim'] == 1: # body_text = re.sub(varname+'(?!\w)', varname+'_ompkernel', body_text) # else: # body_text = re.sub('\*'+varname+'(?!\[)', varname+'[0]', body_text) # body_text = re.sub(r''+varname+'\[([A-Za-z0-9]*)\]'+'', varname+r'_ompkernel[\1]', body_text) vec = 0 for n in range(0,nargs): if (vectorised[n] == 1): vec = 1 kernel_params = [ var.strip() for var in signature_text.split(',')] if vec: new_kernel_params = [] for m in range(0,nargs_novec): if int(kernels[nk]['idxs'][m])<0 and int(kernels[nk]['maps'][m]) == OP_MAP: new_kernel_params = new_kernel_params + [kernel_params[m]]*int(-1*int(kernels[nk]['idxs'][m])) else: new_kernel_params = new_kernel_params + [kernel_params[m]] kernel_params = new_kernel_params # collect constants used by kernel kernel_consts = [] for nc in range(0,len(consts)): if body_text.find(consts[nc]['name']+'_ompkernel') != -1: kernel_consts.append(nc) ############################################################ # omp4 function call definition ############################################################ code('') func_call_signaure_text = 'void ' + name + '_omp4_kernel(' params = '' indent = '\n' + ' ' k = [] for g_m in range(0, nargs): if maps[g_m] == OP_GBL: params += indent + rep('TYP *ARG,',g_m) if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] params += indent + 'int *map'+str(mapinds[g_m])+',' if maptype == 'map': params += indent + 'int map'+str(mapinds[g_m])+'size,' if maps[g_m] == OP_ID: params += indent + rep('TYP *data'+str(g_m)+',', g_m) if maptype == 'map': params += indent + 'int dat'+str(g_m)+'size,' for m in range(1,ninds+1): g_m = invinds[m-1] params += indent + rep('TYP *data'+str(g_m)+',', g_m) if maptype == 'map': params += indent + 'int dat'+str(g_m)+'size,' if ninds>0: # add indirect kernel specific params to kernel func call params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' else: # add direct kernel specific params to kernel func call params += indent + 'int count,' params += indent + 'int num_teams,' + indent + 'int nthread' #add strides for SoA to params if any_soa: indent = ','+indent if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] params += indent + 'int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT' if dir_soa<>-1: params += indent + 'int direct_'+name+'_stride_OP2CONSTANT' code(func_call_signaure_text+params+');') ########################################################################## # then C++ stub function ########################################################################## code('') comm(' host stub function') code('void op_par_loop_'+name+'(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args)-1]: code('op_arg ARG){'); code('') else: code('op_arg ARG,') for g_m in range (0,nargs): if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ: code('TYP*ARGh = (TYP *)ARG.data;') code('int nargs = '+str(nargs)+';') code('op_arg args['+str(nargs)+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: code('ARG.idx = 0;') code('args['+str(g_m)+'] = ARG;') v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] if (optflags[g_m] == 1): argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' else: argtyp = 'op_arg_dat(' FOR('v','1',str(sum(v))) code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m]>0: pass else: code('args['+str(g_m)+'] = ARG;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc('+str(nk)+');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' +str(nk)+ '].name = name;') code('OP_kernels[' +str(nk)+ '].count += 1;') code('') # # indirect bits # if ninds>0: code('int ninds = '+str(ninds)+';') line = 'int inds['+str(nargs)+'] = {' for m in range(0,nargs): line += str(inds[m]-1)+',' code(line[:-1]+'};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: '+name+'\\n");') ENDIF() code('') comm(' get plan') code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') # # get part and block size # code('') code('#ifdef OP_PART_SIZE_'+ str(nk)) code(' int part_size = OP_PART_SIZE_'+str(nk)+';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('#ifdef OP_BLOCK_SIZE_'+ str(nk)) code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code(' int nthread = OP_block_size;') code('#endif') code('') for g_m in range(0,nargs): if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ: if not dims[g_m].isdigit() or int(dims[g_m]) > 1: print 'ERROR: OpenMP 4 does not support multi-dimensional variables' exit(-1) code('TYP ARG_l = ARGh[0];') if ninds > 0: code('') code('int ncolors = 0;') code('int set_size1 = set->size + set->exec_size;') code('') IF('set->size >0') #managing constants if any_soa: code('') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;') ENDIF() if dir_soa<>-1: IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;') ENDIF() code('') comm('Set up typed device pointers for OpenMP') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;') if maptype == 'map': code(' int map'+str(mapinds[g_m])+'size = arg'+str(invmapinds[inds[g_m]-1])+'.map->dim * set_size1;') code('') for g_m in range(0,nargs): if maps[g_m] == OP_ID: code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;') if maptype == 'map': if optflags[g_m]: code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') else: code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') for m in range(1,ninds+1): g_m = invinds[m-1] code('TYP *data'+str(g_m)+' = (TYP *)ARG.data_d;') if maptype == 'map': if optflags[g_m]: code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') else: code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') # # prepare kernel params for indirect version # if ninds>0: code('') code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') code('ncolors = Plan->ncolors;') code('int *col_reord = Plan->col_reord;') code('') comm(' execute plan') FOR('col','0','Plan->ncolors') IF('col==1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('int start = Plan->col_offsets[0][col];') code('int end = Plan->col_offsets[0][col+1];') code('') # # kernel function call # indent = '\n' + ' ' * (depth+2) call_params = ','.join([ indent + re.sub(r'\*arg(\d+)',r'&arg\1_l',param.strip().split(' ')[-1]) for param in params.split(',')]) call_params = call_params.replace('*','') # set params for indirect version if ninds>0: call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread') # set params for direct version else: call_params = re.sub('count','set->size',call_params); call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') code(func_call_signaure_text.split(' ')[-1]+call_params+');') code('') if ninds>0: if reduct: comm(' combine reduction data') IF('col == Plan->ncolors_owned-1') for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m]==OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') elif accs[g_m]==OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: error('internal error: invalid reduction option') ENDIF() ENDFOR() code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') ENDIF() code('') #zero set size issues if ninds>0: IF('set_size == 0 || set_size == set->core_size || ncolors == 1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() # # combine reduction data from multiple OpenMP threads # comm(' combine reduction data') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: if ninds==0: #direct version only if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m]==OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') elif accs[g_m]==OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: print 'internal error: invalid reduction option' if typs[g_m] == 'double': #need for both direct and indirect code('op_mpi_reduce_double(&ARG,ARGh);') elif typs[g_m] == 'float': code('op_mpi_reduce_float(&ARG,ARGh);') elif typs[g_m] == 'int': code('op_mpi_reduce_int(&ARG,ARGh);') else: print 'Type '+typs[g_m]+' not supported in OpenMP4 code generator, please add it' exit(-1) code('op_mpi_set_dirtybit_cuda(nargs, args);') code('') # # update kernel record # code('if (OP_diags>1) deviceSync();') comm(' update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' for g_m in range (0,nargs): if optflags[g_m]==1: IF('ARG.opt') if maps[g_m]<>OP_GBL: if accs[g_m]==OP_READ: code(line+' ARG.size;') else: code(line+' ARG.size * 2.0f;') if optflags[g_m]==1: ENDIF() depth -= 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('openmp4'): os.makedirs('openmp4') fid = open('openmp4/'+name+'_omp4kernel.cpp','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() ############################################################## # generate ****_omp4kernel_func.cpp ############################################################## file_text = '' code(func_call_signaure_text+params+'){') code('') depth += 2 for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('TYP ARG_l = *ARG;') line = '#pragma omp target teams' if op2_compiler == 'clang': line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' ' line +=' num_teams(num_teams) thread_limit(nthread) ' map_clause = '' if maptype == 'map': map_clause = 'map(to:' elif maptype == 'is_device_ptr': map_clause = 'is_device_ptr(' for g_m in range(0,nargs): if maps[g_m] == OP_ID: if maptype == 'map': map_clause += 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],' else: map_clause += 'data'+str(g_m)+',' if map_clause != 'is_device_ptr(' and map_clause != 'map(to:': map_clause = map_clause[:-1]+')' line += map_clause # mapping global consts if len(kernel_consts) != 0: line += ' \\\n' + (depth+2)*' ' + 'map(to:' for nc in kernel_consts: line += ' ' + consts[nc]['name']+'_ompkernel,' if consts[nc]['dim'] != 1: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' line = line[:-1] + '[:'+ num +'],' line = line[:-1]+')' # prepare reduction reduction_string = '' reduction_mapping = '' if reduct: reduction_mapping ='\\\n'+(depth+2)*' '+ 'map(tofrom:' for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: if accs[g_m] == OP_INC: reduction_string += ' reduction(+:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_MIN: reduction_string += ' reduction(min:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_MAX: reduction_string += ' reduction(max:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_WRITE: reduction_mapping += ' arg%d_l,' % g_m reduction_mapping = reduction_mapping[0:-1]+')' # # map extra pointers for indirect version # if ninds>0: if maptype == 'map': line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],' else: line += '\\\n'+(depth+2)*' '+'map(to:col_reord,' if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] if maptype == 'map': line = line + 'map'+str(mapinds[g_m])+'[0:map'+str(mapinds[g_m])+'size],' else: line = line + 'map'+str(mapinds[g_m])+',' for m in range(1,ninds+1): g_m = invinds[m-1] if maptype == 'map': line = line + 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],' else: line = line + 'data'+str(g_m)+',' line = line[:-1]+')' # # write omp pragma # code(line + reduction_mapping + reduction_string) if op2_compiler != 'clang': line = '#pragma omp distribute parallel for schedule(static,1)' code(line + reduction_string) # # start for loop indirect version # if ninds>0: FOR('e','start','end') code('int n_op = col_reord[e];') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ '[n_op + set_size1 * '+str(idxs[g_m])+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const TYP* ARG_vec[] = {\n' else: line = 'TYP* ARG_vec[] = {\n' v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] indent = ' '*(depth+2) for k in range(0,sum(v)): if soaflags[g_m]: line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n' else: line = line + indent + ' &data'+str(first)+'[DIM * map'+str(mapinds[g_m+k])+'idx],\n' line = line[:-2]+'};' code(line) # # direct version # else: FOR('n_op','0','count') # # write inlined kernel function # comm('variable mapping') for g_m in range(0,nargs): line = kernel_params[g_m] + ' = ' if maps[g_m] == OP_ID: if soaflags[g_m]: line += '&data%d[n_op]' % g_m else: line += '&data'+str(g_m)+'['+str(dims[g_m])+'*n_op]' if maps[g_m] == OP_MAP: if vectorised[g_m]: if g_m+1 in unique_args: line += 'arg'+str(g_m)+'_vec' else: line = '' else: if soaflags[g_m]: line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]' else: line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' if maps[g_m] == OP_GBL: line += '&arg%d_l' % g_m if len(line): line += ';' code(line) code('') comm('inline function') indent = ' ' * (depth-2) inline_body_text = '' for line in body_text.split('\n'): if len(line): inline_body_text += indent+line+'\n' else: inline_body_text += '\n' code(inline_body_text) comm('end inline func') ENDFOR() code('') # end kernel function for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('*ARG = ARG_l;') depth -= 2; code('}') ########################################################################## # output individual omp4kernel file ########################################################################## fid = open('openmp4/'+name+'_omp4kernel_func.cpp','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text ='' comm(' header ') code('#include "op_lib_cpp.h" ') code('') comm(' user kernel files') for nk in range(0,len(kernels)): code('#include "'+kernels[nk]['name']+'_omp4kernel.cpp"') master = master.split('.')[0] fid = open('openmp4/'+master.split('.')[0]+'_omp4kernels.cpp','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() ########################################################################## # output omp4 master kernel file ########################################################################## file_text ='' comm(' global constants ') for nc in range (0,len(consts)): if consts[nc]['dim']==1: code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel;') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel['+num+'];') code('') comm(' header ') if os.path.exists('./user_types.h'): code('#include "../user_types.h"') code('#include "op_lib_cpp.h" ') code('') code('void op_decl_const_char(int dim, char const *type,') code(' int size, char *dat, char const *name){') indent = ' ' * ( 2+ depth) line = ' ' for nc in range (0,len(consts)): varname = consts[nc]['name'] if nc > 0: line += ' else ' line += 'if(!strcmp(name, "%s")) {\n' %varname + indent + 2*' ' + 'memcpy(' if consts[nc]['dim']==1: line += '&' line += varname+ '_ompkernel, dat, dim*size);\n' + indent + '#pragma omp target enter data map(to:'+varname+'_ompkernel' if consts[nc]['dim'] !=1: line += '[:%s]' % str(consts[nc]['dim']) if consts[nc]['dim'] > 0 else 'MAX_CONST_SIZE' line += ')\n'+indent + '}' code(line) code('}') comm(' user kernel files') for nk in range(0,len(kernels)): code('#include "'+kernels[nk]['name']+'_omp4kernel_func.cpp"') master = master.split('.')[0] fid = open('openmp4/'+master.split('.')[0]+'_omp4kernel_funcs.cpp','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_openacc(master, date, consts, kernels): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1 OP_GBL = 2 OP_MAP = 3 OP_READ = 1 OP_WRITE = 2 OP_RW = 3 OP_INC = 4 OP_MAX = 5 OP_MIN = 6 accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN'] any_soa = 0 for nk in range(0, len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) ########################################################################## # create new kernel file ########################################################################## for nk in range(0, len(kernels)): name = kernels[nk]['name'] nargs = kernels[nk]['nargs'] dims = kernels[nk]['dims'] maps = kernels[nk]['maps'] var = kernels[nk]['var'] typs = kernels[nk]['typs'] accs = kernels[nk]['accs'] idxs = kernels[nk]['idxs'] inds = kernels[nk]['inds'] soaflags = kernels[nk]['soaflags'] decl_filepath = kernels[nk]['decl_filepath'] ninds = kernels[nk]['ninds'] inddims = kernels[nk]['inddims'] indaccs = kernels[nk]['indaccs'] indtyps = kernels[nk]['indtyps'] invinds = kernels[nk]['invinds'] mapnames = kernels[nk]['mapnames'] invmapinds = kernels[nk]['invmapinds'] mapinds = kernels[nk]['mapinds'] nmaps = 0 if ninds > 0: nmaps = max(mapinds) + 1 nargs_novec = nargs vec = [ m for m in range(0, nargs) if int(idxs[m]) < 0 and maps[m] == OP_MAP ] if len(vec) > 0: unique_args = [1] vec_counter = 1 vectorised = [] new_dims = [] new_maps = [] new_vars = [] new_typs = [] new_accs = [] new_idxs = [] new_inds = [] new_soaflags = [] new_mapnames = [] for m in range(0, nargs): if int(idxs[m]) < 0 and maps[m] == OP_MAP: if m > 0: unique_args = unique_args + [len(new_dims) + 1] temp = [0] * (-1 * int(idxs[m])) for i in range(0, -1 * int(idxs[m])): temp[i] = var[m] new_vars = new_vars + temp for i in range(0, -1 * int(idxs[m])): temp[i] = typs[m] new_typs = new_typs + temp for i in range(0, -1 * int(idxs[m])): temp[i] = dims[m] new_dims = new_dims + temp new_maps = new_maps + [maps[m]] * int(-1 * int(idxs[m])) new_mapnames = new_mapnames + [mapnames[m]] * int( -1 * int(idxs[m])) new_soaflags = new_soaflags + [soaflags[m]] * int( -1 * int(idxs[m])) new_accs = new_accs + [accs[m]] * int(-1 * int(idxs[m])) for i in range(0, -1 * int(idxs[m])): new_idxs = new_idxs + [i] new_inds = new_inds + [inds[m]] * int(-1 * int(idxs[m])) vectorised = vectorised + [vec_counter] * int( -1 * int(idxs[m])) vec_counter = vec_counter + 1 else: if m > 0: unique_args = unique_args + [len(new_dims) + 1] new_dims = new_dims + [dims[m]] new_maps = new_maps + [maps[m]] new_mapnames = new_mapnames + [mapnames[m]] new_accs = new_accs + [int(accs[m])] new_soaflags = new_soaflags + [soaflags[m]] new_idxs = new_idxs + [int(idxs[m])] new_inds = new_inds + [inds[m]] new_vars = new_vars + [var[m]] new_typs = new_typs + [typs[m]] vectorised = vectorised + [0] dims = new_dims maps = new_maps mapnames = new_mapnames accs = new_accs idxs = new_idxs inds = new_inds var = new_vars typs = new_typs soaflags = new_soaflags nargs = len(vectorised) mapinds = [0] * nargs for i in range(0, nargs): mapinds[i] = i for j in range(0, i): if (maps[i] == OP_MAP) and (mapnames[i] == mapnames[j]) and (idxs[i] == idxs[j]): mapinds[i] = mapinds[j] for i in range(1, ninds + 1): for index in range(0, len(inds) + 1): if inds[index] == i: invinds[i - 1] = index break invmapinds = invinds[:] for i in range(0, ninds): for j in range(0, i): if (mapnames[invinds[i]] == mapnames[invinds[j]]): invmapinds[i] = invmapinds[j] else: vectorised = [0] * nargs unique_args = range(1, nargs + 1) cumulative_indirect_index = [-1] * nargs j = 0 for i in range(0, nargs): if maps[i] == OP_MAP: cumulative_indirect_index[i] = j j = j + 1 # # set two logicals # j = -1 for i in range(0, nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0, nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE: j = i reduct = j >= 0 ########################################################################## # start with the user kernel function ########################################################################## FORTRAN = 0 CPP = 1 g_m = 0 file_text = '' depth = 0 comm('user function') #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('int opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2CONSTANT;') code('int opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0, nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('int direct_' + name + '_stride_OP2CONSTANT;') code('int direct_' + name + '_stride_OP2HOST=-1;') dir_soa = g_m break comm('user function') file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b' + name + '\\b') i = p.search(kernel_text).start() if (i < 0): print "\n********" print "Error: cannot locate user kernel function name: " + name + " - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i + j, '{', '}') signature_text = kernel_text[i:i + j] l = signature_text[0:].find('(') head_text = signature_text[0:l].strip() #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l + 1:m] body_text = kernel_text[i + j + 1:k] # check for number of arguments if len(signature_text.split(',')) != nargs_novec: print 'Error parsing user kernel(%s): must have %d arguments' \ % name, nargs return for i in range(0, nargs_novec): var = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i]: var = var.replace('*', '') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length - 1].strip() if int(kernels[nk]['idxs'] [i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b' + var2 + '\\b\\s*(?!\[)', var2 + '[0]', body_text) body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) signature_text = '//#pragma acc routine\ninline ' + head_text + '( ' + signature_text + ') {' file_text += signature_text + body_text + '}\n' ########################################################################## # then C++ stub function ########################################################################## code('') comm(' host stub function') code('void op_par_loop_' + name + '(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args) - 1]: code('op_arg ARG){') code('') else: code('op_arg ARG,') for g_m in range(0, nargs): if maps[g_m] == OP_GBL: #and accs[g_m] <> OP_READ: code('TYP*ARGh = (TYP *)ARG.data;') code('int nargs = ' + str(nargs) + ';') code('op_arg args[' + str(nargs) + '];') code('') for g_m in range(0, nargs): u = [ i for i in range(0, len(unique_args)) if unique_args[i] - 1 == g_m ] if len(u) > 0 and vectorised[g_m] > 0: code('ARG.idx = 0;') code('args[' + str(g_m) + '] = ARG;') v = [ int(vectorised[i] == vectorised[g_m]) for i in range(0, len(vectorised)) ] first = [i for i in range(0, len(v)) if v[i] == 1] first = first[0] FOR('v', '1', str(sum(v))) code('args['+str(g_m)+' + v] = op_arg_dat(arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m] > 0: pass else: code('args[' + str(g_m) + '] = ARG;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc(' + str(nk) + ');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' + str(nk) + '].name = name;') code('OP_kernels[' + str(nk) + '].count += 1;') code('') # # indirect bits # if ninds > 0: code('int ninds = ' + str(ninds) + ';') line = 'int inds[' + str(nargs) + '] = {' for m in range(0, nargs): line += str(inds[m] - 1) + ',' code(line[:-1] + '};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: ' + name + '\\n");') ENDIF() code('') comm(' get plan') code('#ifdef OP_PART_SIZE_' + str(nk)) code(' int part_size = OP_PART_SIZE_' + str(nk) + ';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('') code( 'int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: ' + name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') code('') for g_m in range(0, nargs): if maps[g_m] == OP_GBL: #and accs[g_m]<>OP_READ: if not dims[g_m].isdigit() or int(dims[g_m]) > 1: print 'ERROR: OpenACC does not support multi-dimensional op_arg_gbl variables' exit(-1) code('TYP ARG_l = ARGh[0];') if ninds > 0: code('') code('int ncolors = 0;') code('') IF('set->size >0') code('') #managing constants if any_soa: if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' + str(nk) + '].count==1) || (opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST != getSetSizeFromOpArg(&arg' + str(g_m) + '))') code('opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST = getSetSizeFromOpArg(&arg' + str(g_m) + ');') code('opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2CONSTANT = opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST;') ENDIF() if dir_soa <> -1: IF('(OP_kernels[' + str(nk) + '].count==1) || (direct_' + name + '_stride_OP2HOST != getSetSizeFromOpArg(&arg' + str(dir_soa) + '))') code('direct_' + name + '_stride_OP2HOST = getSetSizeFromOpArg(&arg' + str(dir_soa) + ');') code('direct_' + name + '_stride_OP2CONSTANT = direct_' + name + '_stride_OP2HOST;') ENDIF() code('') comm('Set up typed device pointers for OpenACC') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m] - 1] in k): k = k + [invmapinds[inds[g_m] - 1]] code('int *map' + str(mapinds[g_m]) + ' = arg' + str(invmapinds[inds[g_m] - 1]) + '.map_data_d;') code('') for g_m in range(0, nargs): if maps[g_m] == OP_ID: code(typs[g_m] + '* data' + str(g_m) + ' = (' + typs[g_m] + '*)arg' + str(g_m) + '.data_d;') for m in range(1, ninds + 1): g_m = invinds[m - 1] code('TYP *data' + str(g_m) + ' = (TYP *)ARG.data_d;') # # kernel call for indirect version # if ninds > 0: code('') code( 'op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);' ) code('ncolors = Plan->ncolors;') code('int *col_reord = Plan->col_reord;') code('int set_size1 = set->size + set->exec_size;') code('') comm(' execute plan') FOR('col', '0', 'Plan->ncolors') IF('col==1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('int start = Plan->col_offsets[0][col];') code('int end = Plan->col_offsets[0][col+1];') code('') # code('#pragma omp parallel for') line = '#pragma acc parallel loop independent deviceptr(col_reord,' if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m] - 1] in k): k = k + [invmapinds[inds[g_m] - 1]] line = line + 'map' + str(mapinds[g_m]) + ',' for g_m in range(0, nargs): if maps[g_m] == OP_ID: line = line + 'data' + str(g_m) + ',' for m in range(1, ninds + 1): g_m = invinds[m - 1] line = line + 'data' + str(g_m) + ',' line = line[:-1] + ')' if reduct: for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ and accs[ g_m] <> OP_WRITE: if accs[g_m] == OP_INC: line = line + ' reduction(+:arg' + str(g_m) + '_l)' if accs[g_m] == OP_MIN: line = line + ' reduction(min:arg' + str( g_m) + '_l)' if accs[g_m] == OP_MAX: line = line + ' reduction(max:arg' + str( g_m) + '_l)' code(line) FOR('e', 'start', 'end') code('int n = col_reord[e];') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ '[n + set_size1 * '+str(idxs[g_m])+'];') code('') for g_m in range(0, nargs): u = [ i for i in range(0, len(unique_args)) if unique_args[i] - 1 == g_m ] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const TYP* ARG_vec[] = {\n' else: line = 'TYP* ARG_vec[] = {\n' v = [ int(vectorised[i] == vectorised[g_m]) for i in range(0, len(vectorised)) ] first = [i for i in range(0, len(v)) if v[i] == 1] first = first[0] indent = ' ' * (depth + 2) for k in range(0, sum(v)): if soaflags[g_m]: line = line + indent + ' &data' + str( first) + '[map' + str( mapinds[g_m + k]) + 'idx],\n' else: line = line + indent + ' &data' + str( first) + '[DIM * map' + str( mapinds[g_m + k]) + 'idx],\n' line = line[:-2] + '};' code(line) code('') line = name + '(' indent = '\n' + ' ' * (depth + 2) for g_m in range(0, nargs): if maps[g_m] == OP_ID: if soaflags[g_m]: line = line + indent + '&data' + str(g_m) + '[n]' else: line = line + indent + '&data' + str(g_m) + '[' + str( dims[g_m]) + ' * n]' if maps[g_m] == OP_MAP: if vectorised[g_m]: if g_m + 1 in unique_args: line = line + indent + 'arg' + str(g_m) + '_vec' else: if soaflags[g_m]: line = line + indent + '&data' + str( invinds[inds[g_m] - 1]) + '[map' + str( mapinds[g_m]) + 'idx]' else: line = line + indent + '&data' + str( invinds[inds[g_m] - 1]) + '[' + str( dims[g_m]) + ' * map' + str( mapinds[g_m]) + 'idx]' if maps[g_m] == OP_GBL: line = line + indent + '&arg' + str(g_m) + '_l' if g_m < nargs - 1: if g_m + 1 in unique_args and not g_m + 1 == unique_args[ -1]: line = line + ',' else: line = line + ');' code(line) ENDFOR() code('') if reduct: comm(' combine reduction data') IF('col == Plan->ncolors_owned-1') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if accs[g_m] == OP_INC or accs[g_m] == OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m] == OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') ENDFOR() elif accs[g_m] == OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: error('internal error: invalid reduction option') ENDFOR() ENDIF() ENDFOR() # # kernel call for direct version # else: line = '#pragma acc parallel loop independent deviceptr(' for g_m in range(0, nargs): if maps[g_m] == OP_ID: line = line + 'data' + str(g_m) + ',' line = line[:-1] + ')' if reduct: for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ and accs[ g_m] <> OP_WRITE: if accs[g_m] == OP_INC: line = line + ' reduction(+:arg' + str(g_m) + '_l)' if accs[g_m] == OP_MIN: line = line + ' reduction(min:arg' + str( g_m) + '_l)' if accs[g_m] == OP_MAX: line = line + ' reduction(max:arg' + str( g_m) + '_l)' code(line) FOR('n', '0', 'set->size') line = name + '(' indent = '\n' + ' ' * (depth + 2) for g_m in range(0, nargs): if maps[g_m] == OP_ID: if soaflags[g_m]: line = line + indent + '&data' + str(g_m) + '[n]' else: line = line + indent + '&data' + str(g_m) + '[' + str( dims[g_m]) + '*n]' if maps[g_m] == OP_GBL: line = line + indent + '&arg' + str(g_m) + '_l' if g_m < nargs - 1: line = line + ',' else: line = line + ');' code(line) ENDFOR() if ninds > 0: code('OP_kernels[' + str(nk) + '].transfer += Plan->transfer;') code('OP_kernels[' + str(nk) + '].transfer2 += Plan->transfer2;') ENDIF() code('') #zero set size issues if ninds > 0: IF('set_size == 0 || set_size == set->core_size || ncolors == 1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() # # combine reduction data from multiple OpenMP threads # comm(' combine reduction data') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if ninds == 0: #direct version only if accs[g_m] == OP_INC or accs[g_m] == OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m] == OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') elif accs[g_m] == OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: print 'internal error: invalid reduction option' if typs[g_m] == 'double': #need for both direct and indirect code('op_mpi_reduce_double(&ARG,ARGh);') elif typs[g_m] == 'float': code('op_mpi_reduce_float(&ARG,ARGh);') elif typs[g_m] == 'int': code('op_mpi_reduce_int(&ARG,ARGh);') else: print 'Type ' + typs[ g_m] + ' not supported in OpenACC code generator, please add it' exit(-1) code('op_mpi_set_dirtybit_cuda(nargs, args);') code('') # # update kernel record # comm(' update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' + str(nk) + '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *' for g_m in range(0, nargs): if maps[g_m] <> OP_GBL: if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: code(line + ' ARG.size;') else: code(line + ' ARG.size * 2.0f;') depth -= 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('openacc'): os.makedirs('openacc') fid = open('openacc/' + name + '_acckernel.c', 'w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text = '' comm(' header ') if os.path.exists('./user_types.h'): code('#include "../user_types.h"') code('#include "op_lib_c.h" ') code('') comm(' global constants ') for nc in range(0, len(consts)): if consts[nc]['dim'] == 1: code('extern ' + consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + ';') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('extern ' + consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + '[' + num + '];') code('void op_decl_const_char(int dim, char const *type,') code('int size, char *dat, char const *name){}') comm(' user kernel files') for nk in range(0, len(kernels)): code('#include "' + kernels[nk]['name'] + '_acckernel.c"') master = master.split('.')[0] fid = open('openacc/' + master.split('.')[0] + '_acckernels.c', 'w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_openmp4(master, date, consts, kernels): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1 OP_GBL = 2 OP_MAP = 3 OP_READ = 1 OP_WRITE = 2 OP_RW = 3 OP_INC = 4 OP_MAX = 5 OP_MIN = 6 accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN'] op2_compiler = os.getenv('OP2_COMPILER', '0') any_soa = 0 maptype = 'map' for nk in range(0, len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) ########################################################################## # create new kernel file ########################################################################## for nk in range(0, len(kernels)): name = kernels[nk]['name'] nargs = kernels[nk]['nargs'] dims = kernels[nk]['dims'] maps = kernels[nk]['maps'] var = kernels[nk]['var'] typs = kernels[nk]['typs'] accs = kernels[nk]['accs'] idxs = kernels[nk]['idxs'] inds = kernels[nk]['inds'] soaflags = kernels[nk]['soaflags'] decl_filepath = kernels[nk]['decl_filepath'] ninds = kernels[nk]['ninds'] inddims = kernels[nk]['inddims'] indaccs = kernels[nk]['indaccs'] indtyps = kernels[nk]['indtyps'] invinds = kernels[nk]['invinds'] mapnames = kernels[nk]['mapnames'] invmapinds = kernels[nk]['invmapinds'] mapinds = kernels[nk]['mapinds'] nmaps = 0 if ninds > 0: nmaps = max(mapinds) + 1 nargs_novec = nargs vec = [ m for m in range(0, nargs) if int(idxs[m]) < 0 and maps[m] == OP_MAP ] if len(vec) > 0: unique_args = [1] vec_counter = 1 vectorised = [] new_dims = [] new_maps = [] new_vars = [] new_typs = [] new_accs = [] new_idxs = [] new_inds = [] new_soaflags = [] new_mapnames = [] for m in range(0, nargs): if int(idxs[m]) < 0 and maps[m] == OP_MAP: if m > 0: unique_args = unique_args + [len(new_dims) + 1] temp = [0] * (-1 * int(idxs[m])) for i in range(0, -1 * int(idxs[m])): temp[i] = var[m] new_vars = new_vars + temp for i in range(0, -1 * int(idxs[m])): temp[i] = typs[m] new_typs = new_typs + temp for i in range(0, -1 * int(idxs[m])): temp[i] = dims[m] new_dims = new_dims + temp new_maps = new_maps + [maps[m]] * int(-1 * int(idxs[m])) new_mapnames = new_mapnames + [mapnames[m]] * int( -1 * int(idxs[m])) new_soaflags = new_soaflags + [soaflags[m]] * int( -1 * int(idxs[m])) new_accs = new_accs + [accs[m]] * int(-1 * int(idxs[m])) for i in range(0, -1 * int(idxs[m])): new_idxs = new_idxs + [i] new_inds = new_inds + [inds[m]] * int(-1 * int(idxs[m])) vectorised = vectorised + [vec_counter] * int( -1 * int(idxs[m])) vec_counter = vec_counter + 1 else: if m > 0: unique_args = unique_args + [len(new_dims) + 1] new_dims = new_dims + [dims[m]] new_maps = new_maps + [maps[m]] new_mapnames = new_mapnames + [mapnames[m]] new_accs = new_accs + [int(accs[m])] new_soaflags = new_soaflags + [soaflags[m]] new_idxs = new_idxs + [int(idxs[m])] new_inds = new_inds + [inds[m]] new_vars = new_vars + [var[m]] new_typs = new_typs + [typs[m]] vectorised = vectorised + [0] dims = new_dims maps = new_maps mapnames = new_mapnames accs = new_accs idxs = new_idxs inds = new_inds var = new_vars typs = new_typs soaflags = new_soaflags nargs = len(vectorised) mapinds = [0] * nargs for i in range(0, nargs): mapinds[i] = i for j in range(0, i): if (maps[i] == OP_MAP) and (mapnames[i] == mapnames[j]) and (idxs[i] == idxs[j]): mapinds[i] = mapinds[j] for i in range(1, ninds + 1): for index in range(0, len(inds) + 1): if inds[index] == i: invinds[i - 1] = index break invmapinds = invinds[:] for i in range(0, ninds): for j in range(0, i): if (mapnames[invinds[i]] == mapnames[invinds[j]]): invmapinds[i] = invmapinds[j] else: vectorised = [0] * nargs unique_args = range(1, nargs + 1) cumulative_indirect_index = [-1] * nargs j = 0 for i in range(0, nargs): if maps[i] == OP_MAP: cumulative_indirect_index[i] = j j = j + 1 # # set two logicals # j = -1 for i in range(0, nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0, nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ: j = i reduct = j >= 0 ########################################################################## # start with the user kernel function ########################################################################## FORTRAN = 0 CPP = 1 g_m = 0 file_text = '' depth = 0 comm('user function') #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('int opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2CONSTANT;') code('int opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0, nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('int direct_' + name + '_stride_OP2CONSTANT;') code('int direct_' + name + '_stride_OP2HOST=-1;') dir_soa = g_m break comm('user function') file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b' + name + '\\b') i = p.search(kernel_text).start() if (i < 0): print "\n********" print "Error: cannot locate user kernel function name: " + name + " - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i + j, '{', '}') signature_text = kernel_text[i:i + j] l = signature_text[0:].find('(') head_text = signature_text[0:l] #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l + 1:m] body_text = kernel_text[i + j + 1:k] # check for number of arguments if len(signature_text.split(',')) != nargs_novec: print 'Error parsing user kernel(%s): must have %d arguments' \ % name, nargs return for i in range(0, nargs_novec): var = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i]: var = var.replace('*', '') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length - 1].strip() if int(kernels[nk]['idxs'] [i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b' + var2 + '\\b\\s*(?!\[)', var2 + '[0]', body_text) body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) for nc in range(0, len(consts)): varname = consts[nc]['name'] body_text = re.sub('\\b' + varname + '\\b', varname + '_ompkernel', body_text) # if consts[nc]['dim'] == 1: # body_text = re.sub(varname+'(?!\w)', varname+'_ompkernel', body_text) # else: # body_text = re.sub('\*'+varname+'(?!\[)', varname+'[0]', body_text) # body_text = re.sub(r''+varname+'\[([A-Za-z0-9]*)\]'+'', varname+r'_ompkernel[\1]', body_text) kernel_params = [var.strip() for var in signature_text.split(',')] if len(vec) > 0: new_kernel_params = [] for m in range(0, nargs_novec): if int(kernels[nk]['idxs'][m]) < 0 and int( kernels[nk]['maps'][m]) == OP_MAP: new_kernel_params = new_kernel_params + [ kernel_params[m] ] * int(-1 * int(kernels[nk]['idxs'][m])) else: new_kernel_params = new_kernel_params + [kernel_params[m]] kernel_params = new_kernel_params # collect constants used by kernel kernel_consts = [] for nc in range(0, len(consts)): if body_text.find(consts[nc]['name'] + '_ompkernel') != -1: kernel_consts.append(nc) ############################################################ # omp4 function call definition ############################################################ code('') func_call_signaure_text = 'void ' + name + '_omp4_kernel(' params = '' indent = '\n' + ' ' k = [] for g_m in range(0, nargs): if maps[g_m] == OP_GBL: params += indent + rep('TYP *ARG,', g_m) if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m] - 1] in k): k = k + [invmapinds[inds[g_m] - 1]] params += indent + 'int *map' + str(mapinds[g_m]) + ',' if maptype == 'map': params += indent + 'int map' + str(mapinds[g_m]) + 'size,' if maps[g_m] == OP_ID: params += indent + rep('TYP *data' + str(g_m) + ',', g_m) if maptype == 'map': params += indent + 'int dat' + str(g_m) + 'size,' for m in range(1, ninds + 1): g_m = invinds[m - 1] params += indent + rep('TYP *data' + str(g_m) + ',', g_m) if maptype == 'map': params += indent + 'int dat' + str(g_m) + 'size,' if ninds > 0: # add indirect kernel specific params to kernel func call params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' else: # add direct kernel specific params to kernel func call params += indent + 'int count,' params += indent + 'int num_teams,' + indent + 'int nthread' #add strides for SoA to params if any_soa: indent = ',' + indent if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] params += indent + 'int opDat' + str( invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2CONSTANT' if dir_soa <> -1: params += indent + 'int direct_' + name + '_stride_OP2CONSTANT' code(func_call_signaure_text + params + ');') ########################################################################## # then C++ stub function ########################################################################## code('') comm(' host stub function') code('void op_par_loop_' + name + '(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args) - 1]: code('op_arg ARG){') code('') else: code('op_arg ARG,') for g_m in range(0, nargs): if maps[g_m] == OP_GBL: #and accs[g_m] <> OP_READ: code('TYP*ARGh = (TYP *)ARG.data;') code('int nargs = ' + str(nargs) + ';') code('op_arg args[' + str(nargs) + '];') code('') for g_m in range(0, nargs): u = [ i for i in range(0, len(unique_args)) if unique_args[i] - 1 == g_m ] if len(u) > 0 and vectorised[g_m] > 0: code('ARG.idx = 0;') code('args[' + str(g_m) + '] = ARG;') v = [ int(vectorised[i] == vectorised[g_m]) for i in range(0, len(vectorised)) ] first = [i for i in range(0, len(v)) if v[i] == 1] first = first[0] FOR('v', '1', str(sum(v))) code('args['+str(g_m)+' + v] = op_arg_dat(arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m] > 0: pass else: code('args[' + str(g_m) + '] = ARG;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc(' + str(nk) + ');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' + str(nk) + '].name = name;') code('OP_kernels[' + str(nk) + '].count += 1;') code('') # # indirect bits # if ninds > 0: code('int ninds = ' + str(ninds) + ';') line = 'int inds[' + str(nargs) + '] = {' for m in range(0, nargs): line += str(inds[m] - 1) + ',' code(line[:-1] + '};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: ' + name + '\\n");') ENDIF() code('') comm(' get plan') code( 'int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: ' + name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') # # get part and block size # code('') code('#ifdef OP_PART_SIZE_' + str(nk)) code(' int part_size = OP_PART_SIZE_' + str(nk) + ';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('#ifdef OP_BLOCK_SIZE_' + str(nk)) code(' int nthread = OP_BLOCK_SIZE_' + str(nk) + ';') code('#else') code(' int nthread = OP_block_size;') code('#endif') code('') for g_m in range(0, nargs): if maps[g_m] == OP_GBL: #and accs[g_m]<>OP_READ: if not dims[g_m].isdigit() or int(dims[g_m]) > 1: print 'ERROR: OpenMP 4 does not support multi-dimensional variables' exit(-1) code('TYP ARG_l = ARGh[0];') if ninds > 0: code('') code('int ncolors = 0;') code('int set_size1 = set->size + set->exec_size;') code('') IF('set->size >0') #managing constants if any_soa: code('') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' + str(nk) + '].count==1) || (opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST != getSetSizeFromOpArg(&arg' + str(g_m) + '))') code('opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST = getSetSizeFromOpArg(&arg' + str(g_m) + ');') code('opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2CONSTANT = opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST;') ENDIF() if dir_soa <> -1: IF('(OP_kernels[' + str(nk) + '].count==1) || (direct_' + name + '_stride_OP2HOST != getSetSizeFromOpArg(&arg' + str(dir_soa) + '))') code('direct_' + name + '_stride_OP2HOST = getSetSizeFromOpArg(&arg' + str(dir_soa) + ');') code('direct_' + name + '_stride_OP2CONSTANT = direct_' + name + '_stride_OP2HOST;') ENDIF() code('') comm('Set up typed device pointers for OpenMP') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m] - 1] in k): k = k + [invmapinds[inds[g_m] - 1]] code('int *map' + str(mapinds[g_m]) + ' = arg' + str(invmapinds[inds[g_m] - 1]) + '.map_data_d;') if maptype == 'map': code(' int map' + str(mapinds[g_m]) + 'size = arg' + str(invmapinds[inds[g_m] - 1]) + '.map->dim * set_size1;') code('') for g_m in range(0, nargs): if maps[g_m] == OP_ID: code(typs[g_m] + '* data' + str(g_m) + ' = (' + typs[g_m] + '*)arg' + str(g_m) + '.data_d;') if maptype == 'map': code('int dat' + str(g_m) + 'size = getSetSizeFromOpArg(&arg' + str(g_m) + ') * arg' + str(g_m) + '.dat->dim;') for m in range(1, ninds + 1): g_m = invinds[m - 1] code('TYP *data' + str(g_m) + ' = (TYP *)ARG.data_d;') if maptype == 'map': code('int dat' + str(g_m) + 'size = getSetSizeFromOpArg(&arg' + str(g_m) + ') * arg' + str(g_m) + '.dat->dim;') # # prepare kernel params for indirect version # if ninds > 0: code('') code( 'op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);' ) code('ncolors = Plan->ncolors;') code('int *col_reord = Plan->col_reord;') code('') comm(' execute plan') FOR('col', '0', 'Plan->ncolors') IF('col==1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('int start = Plan->col_offsets[0][col];') code('int end = Plan->col_offsets[0][col+1];') code('') # # kernel function call # indent = '\n' + ' ' * (depth + 2) call_params = ','.join([ indent + re.sub(r'\*arg(\d+)', r'&arg\1_l', param.strip().split(' ')[-1]) for param in params.split(',') ]) call_params = call_params.replace('*', '') # set params for indirect version if ninds > 0: call_params = call_params.replace( 'num_teams', 'part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread') # set params for direct version else: call_params = re.sub('count', 'set->size', call_params) call_params = call_params.replace( 'num_teams', 'part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') code(func_call_signaure_text.split(' ')[-1] + call_params + ');') code('') if ninds > 0: if reduct: comm(' combine reduction data') IF('col == Plan->ncolors_owned-1') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if accs[g_m] == OP_INC or accs[g_m] == OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m] == OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') elif accs[g_m] == OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: error('internal error: invalid reduction option') ENDIF() ENDFOR() code('OP_kernels[' + str(nk) + '].transfer += Plan->transfer;') code('OP_kernels[' + str(nk) + '].transfer2 += Plan->transfer2;') ENDIF() code('') #zero set size issues if ninds > 0: IF('set_size == 0 || set_size == set->core_size || ncolors == 1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() # # combine reduction data from multiple OpenMP threads # comm(' combine reduction data') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if ninds == 0: #direct version only if accs[g_m] == OP_INC or accs[g_m] == OP_WRITE: code('ARGh[0] = ARG_l;') elif accs[g_m] == OP_MIN: code('ARGh[0] = MIN(ARGh[0],ARG_l);') elif accs[g_m] == OP_MAX: code('ARGh[0] = MAX(ARGh[0],ARG_l);') else: print 'internal error: invalid reduction option' if typs[g_m] == 'double': #need for both direct and indirect code('op_mpi_reduce_double(&ARG,ARGh);') elif typs[g_m] == 'float': code('op_mpi_reduce_float(&ARG,ARGh);') elif typs[g_m] == 'int': code('op_mpi_reduce_int(&ARG,ARGh);') else: print 'Type ' + typs[ g_m] + ' not supported in OpenMP4 code generator, please add it' exit(-1) code('op_mpi_set_dirtybit_cuda(nargs, args);') code('') # # update kernel record # code('if (OP_diags>1) deviceSync();') comm(' update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' + str(nk) + '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *' for g_m in range(0, nargs): if maps[g_m] <> OP_GBL: if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: code(line + ' ARG.size;') else: code(line + ' ARG.size * 2.0f;') depth -= 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('openmp4'): os.makedirs('openmp4') fid = open('openmp4/' + name + '_omp4kernel.cpp', 'w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() ############################################################## # generate ****_omp4kernel_func.cpp ############################################################## file_text = '' code(func_call_signaure_text + params + '){') code('') depth += 2 for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('TYP ARG_l = *ARG;') line = '#pragma omp target teams' if op2_compiler == 'clang': line += ' distribute parallel for schedule(static,1)\\\n' + ( depth + 2) * ' ' line += ' num_teams(num_teams) thread_limit(nthread) ' map_clause = '' if maptype == 'map': map_clause = 'map(to:' elif maptype == 'is_device_ptr': map_clause = 'is_device_ptr(' for g_m in range(0, nargs): if maps[g_m] == OP_ID: if maptype == 'map': map_clause += 'data' + str(g_m) + '[0:dat' + str( g_m) + 'size],' else: map_clause += 'data' + str(g_m) + ',' if map_clause != 'is_device_ptr(' and map_clause != 'map(to:': map_clause = map_clause[:-1] + ')' line += map_clause # mapping global consts if len(kernel_consts) != 0: line += ' \\\n' + (depth + 2) * ' ' + 'map(to:' for nc in kernel_consts: line += ' ' + consts[nc]['name'] + '_ompkernel,' if consts[nc]['dim'] != 1: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' line = line[:-1] + '[:' + num + '],' line = line[:-1] + ')' # prepare reduction reduction_string = '' reduction_mapping = '' if reduct: reduction_mapping = '\\\n' + (depth + 2) * ' ' + 'map(tofrom:' for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if accs[g_m] == OP_INC: reduction_string += ' reduction(+:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_MIN: reduction_string += ' reduction(min:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_MAX: reduction_string += ' reduction(max:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_WRITE: reduction_mapping += ' arg%d_l,' % g_m reduction_mapping = reduction_mapping[0:-1] + ')' # # map extra pointers for indirect version # if ninds > 0: if maptype == 'map': line += '\\\n' + (depth + 2) * ' ' + 'map(to:col_reord[0:set_size1],' else: line += '\\\n' + (depth + 2) * ' ' + 'map(to:col_reord,' if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m] - 1] in k): k = k + [invmapinds[inds[g_m] - 1]] if maptype == 'map': line = line + 'map' + str( mapinds[g_m]) + '[0:map' + str( mapinds[g_m]) + 'size],' else: line = line + 'map' + str(mapinds[g_m]) + ',' for m in range(1, ninds + 1): g_m = invinds[m - 1] if maptype == 'map': line = line + 'data' + str(g_m) + '[0:dat' + str( g_m) + 'size],' else: line = line + 'data' + str(g_m) + ',' line = line[:-1] + ')' # # write omp pragma # code(line + reduction_mapping + reduction_string) if op2_compiler != 'clang': line = '#pragma omp distribute parallel for schedule(static,1)' code(line + reduction_string) # # start for loop indirect version # if ninds > 0: FOR('e', 'start', 'end') code('int n_op = col_reord[e];') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ '[n_op + set_size1 * '+str(idxs[g_m])+'];') code('') for g_m in range(0, nargs): u = [ i for i in range(0, len(unique_args)) if unique_args[i] - 1 == g_m ] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const TYP* ARG_vec[] = {\n' else: line = 'TYP* ARG_vec[] = {\n' v = [ int(vectorised[i] == vectorised[g_m]) for i in range(0, len(vectorised)) ] first = [i for i in range(0, len(v)) if v[i] == 1] first = first[0] indent = ' ' * (depth + 2) for k in range(0, sum(v)): if soaflags[g_m]: line = line + indent + ' &data' + str( first) + '[map' + str( mapinds[g_m + k]) + 'idx],\n' else: line = line + indent + ' &data' + str( first) + '[DIM * map' + str( mapinds[g_m + k]) + 'idx],\n' line = line[:-2] + '};' code(line) # # direct version # else: FOR('n_op', '0', 'count') # # write inlined kernel function # comm('variable mapping') for g_m in range(0, nargs): line = kernel_params[g_m] + ' = ' if maps[g_m] == OP_ID: if soaflags[g_m]: line += '&data%d[n_op]' % g_m else: line += '&data' + str(g_m) + '[' + str( dims[g_m]) + '*n_op]' if maps[g_m] == OP_MAP: if vectorised[g_m]: if g_m + 1 in unique_args: line += 'arg' + str(g_m) + '_vec' else: line = '' else: if soaflags[g_m]: line += '&data' + str( invinds[inds[g_m] - 1]) + '[map' + str( mapinds[g_m]) + 'idx]' else: line += '&data' + str( invinds[inds[g_m] - 1]) + '[' + str( dims[g_m]) + ' * map' + str( mapinds[g_m]) + 'idx]' if maps[g_m] == OP_GBL: line += '&arg%d_l' % g_m if len(line): line += ';' code(line) code('') comm('inline function') indent = ' ' * (depth - 2) inline_body_text = '' for line in body_text.split('\n'): if len(line): inline_body_text += indent + line + '\n' else: inline_body_text += '\n' code(inline_body_text) comm('end inline func') ENDFOR() code('') # end kernel function for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('*ARG = ARG_l;') depth -= 2 code('}') ########################################################################## # output individual omp4kernel file ########################################################################## fid = open('openmp4/' + name + '_omp4kernel_func.cpp', 'w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text = '' comm(' header ') code('#include "op_lib_cpp.h" ') code('') comm(' user kernel files') for nk in range(0, len(kernels)): code('#include "' + kernels[nk]['name'] + '_omp4kernel.cpp"') master = master.split('.')[0] fid = open('openmp4/' + master.split('.')[0] + '_omp4kernels.cpp', 'w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() ########################################################################## # output omp4 master kernel file ########################################################################## file_text = '' comm(' header ') if os.path.exists('./user_types.h'): code('#include "../user_types.h"') code('#include "op_lib_cpp.h" ') code('') comm(' global constants ') for nc in range(0, len(consts)): if consts[nc]['dim'] == 1: code(consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + '_ompkernel;') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code(consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + '_ompkernel[' + num + '];') code('') code('void op_decl_const_char(int dim, char const *type,') code(' int size, char *dat, char const *name){') indent = ' ' * (2 + depth) line = ' ' for nc in range(0, len(consts)): varname = consts[nc]['name'] if nc > 0: line += ' else ' line += 'if(!strcmp(name, "%s")) {\n' % varname + indent + 2 * ' ' + 'memcpy(' if consts[nc]['dim'] == 1: line += '&' line += varname + '_ompkernel, dat, dim*size);\n' + indent + '#pragma omp target enter data map(to:' + varname + '_ompkernel' if consts[nc]['dim'] != 1: line += '[:%s]' % str( consts[nc] ['dim']) if consts[nc]['dim'] > 0 else 'MAX_CONST_SIZE' line += ')\n' + indent + '}' code(line) code('}') comm(' user kernel files') for nk in range(0, len(kernels)): code('#include "' + kernels[nk]['name'] + '_omp4kernel_func.cpp"') master = master.split('.')[0] fid = open('openmp4/' + master.split('.')[0] + '_omp4kernel_funcs.cpp', 'w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1; OP_GBL = 2; OP_MAP = 3; OP_READ = 1; OP_WRITE = 2; OP_RW = 3; OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] inc_stage=0 op_color2=0 op_color2_force=0 ########################################################################## # create new kernel file ########################################################################## for nk in range (0,len(kernels)): name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk], inc_stage) any_soa = 0 any_soa = any_soa or sum(soaflags) # # set logicals # j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_RW: j = i ind_rw = j >= 0 if ind_rw or op_color2_force: op_color2 = 1 else: op_color2 = 0 #no staging with 2 level colouring if op_color2: inc_stage=0 optidxs = [0]*nargs indopts = [-1]*nargs nopts = 0 for i in range(0,nargs): if optflags[i] == 1 and maps[i] == OP_ID: optidxs[i] = nopts nopts = nopts+1 elif optflags[i] == 1 and maps[i] == OP_MAP: if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination optidxs[i] = nopts indopts[inds[i]-1] = i nopts = nopts+1 else: optidxs[i] = optidxs[invinds[inds[i]-1]] j = -1 for i in range(0,nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE: j = i reduct = j >= 0 if inc_stage: ninds_staged = 0 inds_staged = [-1]*nargs for i in range(0,nargs): if maps[i]==OP_MAP and accs[i]==OP_INC: if inds_staged[invinds[inds[i]-1]] == -1: inds_staged[i] = ninds_staged ninds_staged = ninds_staged + 1 else: inds_staged[i] = inds_staged[invinds[inds[i]-1]] invinds_staged = [-1]*ninds_staged inddims_staged = [-1]*ninds_staged indopts_staged = [-1]*ninds_staged for i in range(0,nargs): if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: invinds_staged[inds_staged[i]] = i inddims_staged[inds_staged[i]] = dims[i] if optflags[i] == 1: indopts_staged[inds_staged[i]] = i for i in range(0,nargs): inds_staged[i] = inds_staged[i] + 1 ########################################################################## # start with CUDA kernel function ########################################################################## FORTRAN = 0; CPP = 1; g_m = 0; file_text = '' depth = 0 #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('__constant__ int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0,nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('__constant__ int direct_'+name+'_stride_OP2CONSTANT;') code('int direct_'+name+'_stride_OP2HOST=-1;') dir_soa = g_m break file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() f.close() if CPP: includes = op2_gen_common.extract_includes(kernel_text) if len(includes) > 0: for include in includes: code(include) code("") comm('user function') kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b'+name+'\\b') i = p.search(kernel_text).start() if(i < 0): print "\n********" print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') signature_text = kernel_text[i:i+j] l = signature_text[0:].find('(') head_text = signature_text[0:l].strip() #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l+1:m] body_text = kernel_text[i+j+1:k] ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>: body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master)) # check for number of arguments if len(signature_text.split(',')) != nargs_novec: print 'Error parsing user kernel('+name+'): must have '+str(nargs)+' arguments' return for i in range(0,nargs_novec): var = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i] and (op_color2 or not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)): var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) for nc in range(0,len(consts)): varname = consts[nc]['name'] body_text = re.sub('\\b'+varname+'\\b', varname+'_cuda',body_text) signature_text = '__device__ '+head_text + '_gpu( '+signature_text + ') {' file_text += signature_text + body_text + '}\n' comm('') comm(' CUDA kernel function') if FORTRAN: code('subroutine op_cuda_'+name+'(') elif CPP: code('__global__ void op_cuda_'+name+'(') depth = 2 if nopts > 0: code('int optflags,') for g_m in range(0,ninds): if (indaccs[g_m]==OP_READ): code('const <INDTYP> *__restrict <INDARG>,') else: code('<INDTYP> *__restrict <INDARG>,') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('const int *__restrict opDat'+str(invinds[inds[g_m]-1])+'Map, ') for g_m in range(0,nargs): if maps[g_m] == OP_ID: if accs[g_m] == OP_READ: code('const <TYP> *__restrict <ARG>,') else: code('<TYP> *<ARG>,') elif maps[g_m] == OP_GBL: if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE: code('<TYP> *<ARG>,') elif accs[g_m] == OP_READ: code('const <TYP> *<ARG>,') if ind_inc and inc_stage==1: code('int *ind_map,') code('short *arg_map,') code('int *ind_arg_sizes,') code('int *ind_arg_offs, ') if ninds>0: if not op_color2: code('int block_offset, ') code('int *blkmap, ') code('int *offset, ') code('int *nelems, ') code('int *ncolors, ') code('int *colors, ') code('int nblocks, ') else: code('int start, ') code('int end, ') code('int *col_reord, ') code('int set_size) { ') else: code('int set_size ) {') code('') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE: code('<TYP> <ARG>_l[<DIM>];') if accs[g_m] == OP_INC: FOR('d','0','<DIM>') code('<ARG>_l[d]=ZERO_<TYP>;') ENDFOR() else: FOR('d','0','<DIM>') code('<ARG>_l[d]=<ARG>[d+blockIdx.x*<DIM>];') ENDFOR() elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC and not op_color2: code('<TYP> <ARG>_l[<DIM>];') if not op_color2: for m in range (1,ninds+1): g_m = m -1 v = [int(inds[i]==m) for i in range(len(inds))] v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m] if sum(v)>1 and sum(v_i)>0: #check this sum(v_i) if indaccs[m-1] == OP_INC: ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 code('<INDTYP> *arg'+str(invinds[m-1])+'_vec['+str(ind)+'] = {'); depth += 2; for n in range(0,nargs): if inds[n] == m: g_m = n code('<ARG>_l,') depth -= 2 code('};') # # lengthy code for general case with indirection # if ninds>0 and not op_color2: code('') if inc_stage==1: for g_m in range (0,ninds): if indaccs[g_m] == OP_INC: code('__shared__ int *<INDARG>_map, <INDARG>_size;') code('__shared__ <INDTYP> *<INDARG>_s;') code('') if ind_inc: code('__shared__ int nelems2, ncolor;') code('__shared__ int nelem, offset_b;') code('') code('extern __shared__ char shared[];') code('') IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks') code('return;') ENDIF() IF('threadIdx.x==0') code('') comm('get sizes and shift pointers and direct-mapped data') code('') code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset];') code('') code('nelem = nelems[blockId];') code('offset_b = offset[blockId];') code('') if ind_inc: code('nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x);') code('ncolor = ncolors[blockId];') code('') if inc_stage==1 and ind_inc: for g_m in range (0,ninds_staged): if indopts_staged[g_m-1] > 0: IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]])) code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds_staged)+'];') if indopts_staged[g_m-1] > 0: ENDIF() code('') for m in range (1,ninds_staged+1): g_m = m - 1 c = [i for i in range(nargs) if inds_staged[i]==m] code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\ '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];') code('') comm('set shared memory pointers') code('int nbytes = 0;') for g_m in range(0,ninds_staged): code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_s = ('+typs[invinds_staged[g_m]]+' *) &shared[nbytes];') if g_m < ninds_staged-1: if indopts_staged[g_m-1] > 0: IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]])) code('nbytes += ROUND_UP(ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size*sizeof('+typs[invinds_staged[g_m]]+')*'+dims[invinds_staged[g_m]]+');') if indopts_staged[g_m-1] > 0: ENDIF() ENDIF() code('__syncthreads(); // make sure all of above completed') code('') if inc_stage==1: for g_m in range(0,ninds): if indaccs[g_m] == OP_INC: FOR_INC('n','threadIdx.x','ind_ARG_size*<INDDIM>','blockDim.x') code('ind_ARG_s[n] = ZERO_<INDTYP>;') ENDFOR() if ind_inc: code('') code('__syncthreads();') code('') if ind_inc: FOR_INC('n','threadIdx.x','nelems2','blockDim.x') code('int col2 = -1;') k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') IF('n<nelem') comm('initialise local variables') for g_m in range(0,nargs): if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: FOR('d','0','<DIM>') code('<ARG>_l[d] = ZERO_<TYP>;') ENDFOR() else: FOR_INC('n','threadIdx.x','nelem','blockDim.x') k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') #non-optional maps k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k): k = k + [(0*nargs+mapinds[g_m])] #non-opt k = k + [(1*nargs+mapinds[g_m])] #opt code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') #whatever didn't come up and is opt for g_m in range(0,nargs): if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): k = k + [(optflags[g_m]*nargs+mapinds[g_m])] if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') if optflags[g_m]==1: ENDIF() code('') for g_m in range (0,nargs): if accs[g_m] <> OP_INC: #TODO: add opt handling here u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const <TYP>* <ARG>_vec[] = {\n' else: line = '<TYP>* <ARG>_vec[] = {\n' v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] indent = ' '*(depth+2) for k in range(0,sum(v)): if soaflags[g_m]: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n' else: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n' line = line[:-2]+'};' code(line) # # simple version for global coloring # elif ninds>0: code('int tid = threadIdx.x + blockIdx.x * blockDim.x;') IF('tid + start < end') code('int n = col_reord[tid + start];') comm('initialise local variables') #mapidx declarations k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') #non-optional maps k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k): k = k + [(0*nargs+mapinds[g_m])] #non-opt k = k + [(1*nargs+mapinds[g_m])] #opt code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') #whatever didn't come up and is opt for g_m in range(0,nargs): if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): k = k + [(optflags[g_m]*nargs+mapinds[g_m])] if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') if optflags[g_m]==1: ENDIF() for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const <TYP>* <ARG>_vec[] = {\n' else: line = '<TYP>* <ARG>_vec[] = {\n' v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] indent = ' '*(depth+2) for k in range(0,sum(v)): if soaflags[g_m]: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n' else: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n' line = line[:-2]+'};' code(line) # # simple alternative when no indirection # else: code('') comm('process set elements') FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x') # # kernel call # code('') comm('user-supplied kernel call') line = name+'_gpu(' prefix = ' '*len(name) a = 0 #only apply indentation if its not the 0th argument indent ='' for m in range (0, nargs): if a > 0: indent = ' '+' '*len(name) if maps[m] == OP_GBL: if accs[m] == OP_READ or accs[m] == OP_WRITE: line += rep(indent+'<ARG>,\n',m) else: line += rep(indent+'<ARG>_l,\n',m); a =a+1 elif maps[m]==OP_MAP and accs[m]==OP_INC and not op_color2: if vectorised[m]: if m+1 in unique_args: line += rep(indent+'<ARG>_vec,\n',m) else: line += rep(indent+'<ARG>_l,\n',m) a =a+1 elif maps[m]==OP_MAP: if vectorised[m]: if m+1 in unique_args: line += rep(indent+'<ARG>_vec,\n',m) else: if soaflags[m]: line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx,'+'\n',m) else: line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx*<DIM>,'+'\n',m) a =a+1 elif maps[m]==OP_ID: if ninds>0 and not op_color2: if soaflags[m]: line += rep(indent+'<ARG>+(n+offset_b),\n',m) else: line += rep(indent+'<ARG>+(n+offset_b)*<DIM>,\n',m) a =a+1 else: if soaflags[m]: line += rep(indent+'<ARG>+n,\n',m) else: line += rep(indent+'<ARG>+n*<DIM>,\n',m) a =a+1 else: print 'internal error 1 ' code(line[0:-2]+');') #remove final ',' and \n # # updating for indirect kernels ... # if ninds>0 and not op_color2: if ind_inc: code('col2 = colors[n+offset_b];') ENDIF() code('') comm('store local variables') code('') if inc_stage==1: for g_m in range(0,nargs): if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: code('int <ARG>_map;') IF('col2>=0') for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: code('<ARG>_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];') ENDIF() code('') FOR('col','0','ncolor') IF('col2==col') if inc_stage==1: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size];') else: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>];') # for g_m in range(0,nargs): # if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size] = <ARG>_l['+str(d)+'];') else: code('ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>] = <ARG>_l['+str(d)+'];') if optflags[g_m]==1: ENDIF() else: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx];') else: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>];') # for g_m in range(0,nargs): # if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx] = <ARG>_l['+str(d)+'];') else: code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>] = <ARG>_l['+str(d)+'];') if optflags[g_m]==1: ENDIF() ENDFOR() code('__syncthreads();') ENDFOR() ENDFOR() if inc_stage: for g_m in range(0,ninds): if indaccs[g_m]==OP_INC: if indopts[g_m] > 0: IF('optflags & 1<<'+str(optidxs[indopts[g_m-1]])) if soaflags[invinds[g_m]]: FOR_INC('n','threadIdx.x','<INDARG>_size','blockDim.x') for d in range(0,int(dims[invinds[g_m]])): code('arg'+str(invinds[g_m])+'_l['+str(d)+'] = <INDARG>_s[n+'+str(d)+'*<INDARG>_size] + <INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'];') for d in range(0,int(dims[invinds[g_m]])): code('<INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] = arg'+str(invinds[g_m])+'_l['+str(d)+'];') ENDFOR() else: FOR_INC('n','threadIdx.x','<INDARG>_size*<INDDIM>','blockDim.x') code('<INDARG>[n%<INDDIM>+<INDARG>_map[n/<INDDIM>]*<INDDIM>] += <INDARG>_s[n];') ENDFOR() if indopts[g_m] > 0: ENDIF() # # global reduction # if reduct: code('') comm('global reductions') code('') for m in range (0,nargs): g_m = m if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE: FOR('d','0','<DIM>') if accs[m]==OP_INC: code('op_reduction<OP_INC>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);') elif accs[m]==OP_MIN: code('op_reduction<OP_MIN>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);') elif accs[m]==OP_MAX: code('op_reduction<OP_MAX>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);') else: print 'internal error: invalid reduction option' sys.exit(2); ENDFOR() depth -= 2 code('}') code('') ########################################################################## # then C++ stub function ########################################################################## code('') comm('host stub function') code('void op_par_loop_'+name+'(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args)-1]: code('op_arg <ARG>){') code('') else: code('op_arg <ARG>,') for g_m in range (0,nargs): if maps[g_m]==OP_GBL: code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;') code('int nargs = '+str(nargs)+';') code('op_arg args['+str(nargs)+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: code('<ARG>.idx = 0;') code('args['+str(g_m)+'] = <ARG>;') v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] if (optflags[g_m] == 1): argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' else: argtyp = 'op_arg_dat(' FOR('v','1',str(sum(v))) code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m]>0: pass else: code('args['+str(g_m)+'] = <ARG>;') if nopts>0: code('int optflags = 0;') for i in range(0,nargs): if optflags[i] == 1: IF('args['+str(i)+'].opt') code('optflags |= 1<<'+str(optidxs[i])+';') ENDIF() if nopts > 30: print 'ERROR: too many optional arguments to store flags in an integer' # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc('+str(nk)+');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' +str(nk)+ '].name = name;') code('OP_kernels[' +str(nk)+ '].count += 1;') code('') # # indirect bits # if ninds>0: code('') code('int ninds = '+str(ninds)+';') line = 'int inds['+str(nargs)+'] = {' for m in range(0,nargs): line += str(inds[m]-1)+',' code(line[:-1]+'};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: '+name+'\\n");') ENDIF() code('') comm('get plan') code('#ifdef OP_PART_SIZE_'+ str(nk)) code(' int part_size = OP_PART_SIZE_'+str(nk)+';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('') code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') IF('set->size > 0') code('') # # kernel call for indirect version # if ninds>0: if inc_stage==1 and ind_inc: code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);') elif op_color2: code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') else: code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);') code('') # # transfer constants # g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)] if len(g)>0: comm('transfer constants to GPU') code('int consts_bytes = 0;') for m in range(0,nargs): g_m = m if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m] == OP_WRITE): code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));') code('reallocConstArrays(consts_bytes);') code('consts_bytes = 0;') for m in range(0,nargs): if maps[m]==OP_GBL and (accs[m] == OP_READ or accs[m] == OP_WRITE): g_m = m code('<ARG>.data = OP_consts_h + consts_bytes;') code('<ARG>.data_d = OP_consts_d + consts_bytes;') FOR('d','0','<DIM>') code('((<TYP> *)<ARG>.data)[d] = <ARG>h[d];') ENDFOR() code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));') code('mvConstArraysToDevice(consts_bytes);') code('') #managing constants if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() if dir_soa<>-1: IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() # # transfer global reduction initial data # if ninds == 0: comm('set CUDA execution parameters') code('#ifdef OP_BLOCK_SIZE_'+str(nk)) code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code(' int nthread = OP_block_size;') comm(' int nthread = 128;') code('#endif') code('') code('int nblocks = 200;') code('') if reduct: comm('transfer global reduction data to GPU') if ninds>0: code('int maxblocks = 0;') FOR('col','0','Plan->ncolors') code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);') ENDFOR() else: code('int maxblocks = nblocks;') code('int reduct_bytes = 0;') code('int reduct_size = 0;') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE: code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));') code('reduct_size = MAX(reduct_size,sizeof(<TYP>));') code('reallocReductArrays(reduct_bytes);') code('reduct_bytes = 0;') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE: code('<ARG>.data = OP_reduct_h + reduct_bytes;') code('<ARG>.data_d = OP_reduct_d + reduct_bytes;') FOR('b','0','maxblocks') FOR('d','0','<DIM>') if accs[g_m]==OP_INC: code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = ZERO_<TYP>;') else: code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = <ARG>h[d];') ENDFOR() ENDFOR() code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));') code('mvReductArraysToDevice(reduct_bytes);') code('') # # kernel call for indirect version # if ninds>0: comm('execute plan') if not op_color2: code('') code('int block_offset = 0;') FOR('col','0','Plan->ncolors') IF('col==Plan->ncolors_core') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('#ifdef OP_BLOCK_SIZE_'+str(nk)) code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code('int nthread = OP_block_size;') code('#endif') code('') if op_color2: code('int start = Plan->col_offsets[0][col];') code('int end = Plan->col_offsets[0][col+1];') code('int nblocks = (end - start - 1)/nthread + 1;') else: code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],') code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);') IF('Plan->ncolblk[col] > 0') if reduct or (inc_stage==1 and ind_inc): if reduct and inc_stage==1: code('int nshared = MAX(Plan->nshared,reduct_size*nthread);') elif reduct: code('int nshared = reduct_size*nthread;') else: code('int nshared = Plan->nsharedCol[col];') code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_'+name+'<<<nblocks,nthread>>>(') if nopts > 0: code('optflags,') for m in range(1,ninds+1): g_m = invinds[m-1] code('(<TYP> *)<ARG>.data_d,') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('arg'+str(g_m)+'.map_data_d, ') for g_m in range(0,nargs): if inds[g_m]==0: code('(<TYP>*)<ARG>.data_d,') if inc_stage==1 and ind_inc: code('Plan->ind_map,') code('Plan->loc_map,') code('Plan->ind_sizes,') code('Plan->ind_offs,') if op_color2: code('start,') code('end,') code('Plan->col_reord,') else: code('block_offset,') code('Plan->blkmap,') code('Plan->offset,') code('Plan->nelems,') code('Plan->nthrcol,') code('Plan->thrcol,') code('Plan->ncolblk[col],') code('set->size+set->exec_size);') code('') if reduct: comm('transfer global reduction data back to CPU') IF('col == Plan->ncolors_owned-1') code('mvReductArraysToHost(reduct_bytes);') ENDIF() if not op_color2: ENDFOR() #TODO sztem ez forditva van... code('block_offset += Plan->ncolblk[col];') ENDIF() # # kernel call for direct version # else: if reduct: code('int nshared = reduct_size*nthread;') code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_'+name+'<<<nblocks,nthread>>>(') indent = ' '#*(len(name)+42) if nopts > 0: code(indent+'optflags,') for g_m in range(0,nargs): if g_m > 0: code(indent+'(<TYP> *) <ARG>.data_d,') else: code(indent+'(<TYP> *) <ARG>.data_d,') code(indent+'set->size );') if ninds>0: code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') # # transfer global reduction initial data # if reduct: if ninds == 0: comm('transfer global reduction data back to CPU') code('mvReductArraysToHost(reduct_bytes);') for m in range(0,nargs): g_m = m if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE: FOR('b','0','maxblocks') FOR('d','0','<DIM>') if accs[m]==OP_INC: code('<ARG>h[d] = <ARG>h[d] + ((<TYP> *)<ARG>.data)[d+b*<DIM>];') elif accs[m]==OP_MIN: code('<ARG>h[d] = MIN(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);') elif accs[m]==OP_MAX: code('<ARG>h[d] = MAX(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);') ENDFOR() ENDFOR() code('<ARG>.data = (char *)<ARG>h;') code('op_mpi_reduce(&<ARG>,<ARG>h);') for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: code('') code('mvConstArraysToHost(consts_bytes);') break for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: FOR('d','0','<DIM>') code('<ARG>h[d] = ((<TYP> *)<ARG>.data)[d];') ENDFOR() code('<ARG>.data = (char *)<ARG>h;') code('op_mpi_reduce(&<ARG>,<ARG>h);') ENDIF() code('op_mpi_set_dirtybit_cuda(nargs, args);') # # update kernel record # code('cutilSafeCall(cudaDeviceSynchronize());') comm('update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' for g_m in range (0,nargs): if optflags[g_m]==1: IF('<ARG>.opt') if maps[g_m]<>OP_GBL: if accs[g_m]==OP_READ: code(line+' <ARG>.size;') else: code(line+' <ARG>.size * 2.0f;') if optflags[g_m]==1: ENDIF() depth = depth - 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('cuda'): os.makedirs('cuda') fid = open('cuda/'+name+'_kernel.cu','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text = '' comm('global constants') code('#ifndef MAX_CONST_SIZE') code('#define MAX_CONST_SIZE 128') code('#endif') code('') for nc in range (0,len(consts)): if consts[nc]['dim']==1: code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda;') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda['+num+'];') code('') comm('header') if os.path.exists('./user_types.h'): code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif') code('#include "../user_types.h"') code('#include "op_lib_cpp.h"') code('#include "op_cuda_rt_support.h"') code('#include "op_cuda_reduction.h"') code('') code('void op_decl_const_char(int dim, char const *type,') code('int size, char *dat, char const *name){') depth = depth + 2 code('if (!OP_hybrid_gpu) return;') for nc in range(0,len(consts)): IF('!strcmp(name,"'+consts[nc]['name']+'")') if consts[nc]['dim'] < 0: IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {') code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);') ENDIF() code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*size));') ENDIF() code('else ') code('{') depth = depth + 2 code('printf("error: unknown const name\\n"); exit(1);') ENDIF() depth = depth - 2 code('}') code('') comm('user kernel files') for nk in range(0,len(kernels)): file_text = file_text +\ '#include "'+kernels[nk]['name']+'_kernel.cu"\n' master = master.split('.')[0] fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_cuda_simple(master, date, consts, kernels, sets): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1 OP_GBL = 2 OP_MAP = 3 OP_READ = 1 OP_WRITE = 2 OP_RW = 3 OP_INC = 4 OP_MAX = 5 OP_MIN = 6 accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN'] inc_stage = 1 ########################################################################## # create new kernel file ########################################################################## for nk in range(0, len(kernels)): name = kernels[nk]['name'] nargs = kernels[nk]['nargs'] dims = kernels[nk]['dims'] maps = kernels[nk]['maps'] var = kernels[nk]['var'] typs = kernels[nk]['typs'] accs = kernels[nk]['accs'] idxs = kernels[nk]['idxs'] inds = kernels[nk]['inds'] soaflags = kernels[nk]['soaflags'] ninds = kernels[nk]['ninds'] inddims = kernels[nk]['inddims'] indaccs = kernels[nk]['indaccs'] indtyps = kernels[nk]['indtyps'] invinds = kernels[nk]['invinds'] mapnames = kernels[nk]['mapnames'] invmapinds = kernels[nk]['invmapinds'] mapinds = kernels[nk]['mapinds'] nmaps = 0 if ninds > 0: nmaps = max(mapinds) + 1 vec = [ m for m in range(0, nargs) if int(idxs[m]) < 0 and maps[m] == OP_MAP ] if len(vec) > 0: unique_args = [1] vec_counter = 1 vectorised = [] new_dims = [] new_maps = [] new_vars = [] new_typs = [] new_accs = [] new_idxs = [] new_inds = [] new_soaflags = [] for m in range(0, nargs): if int(idxs[m]) < 0 and maps[m] == OP_MAP: if m > 0: unique_args = unique_args + [len(new_dims) + 1] temp = [0] * (-1 * int(idxs[m])) for i in range(0, -1 * int(idxs[m])): temp[i] = var[m] new_vars = new_vars + temp for i in range(0, -1 * int(idxs[m])): temp[i] = typs[m] new_typs = new_typs + temp for i in range(0, -1 * int(idxs[m])): temp[i] = dims[m] new_dims = new_dims + temp new_maps = new_maps + [maps[m]] * int(-1 * int(idxs[m])) new_soaflags = new_soaflags + [0] * int(-1 * int(idxs[m])) new_accs = new_accs + [accs[m]] * int(-1 * int(idxs[m])) for i in range(0, -1 * int(idxs[m])): new_idxs = new_idxs + [i] new_inds = new_inds + [inds[m]] * int(-1 * int(idxs[m])) vectorised = vectorised + [vec_counter] * int( -1 * int(idxs[m])) vec_counter = vec_counter + 1 else: if m > 0: unique_args = unique_args + [len(new_dims) + 1] new_dims = new_dims + [dims[m]] new_maps = new_maps + [maps[m]] new_accs = new_accs + [int(accs[m])] new_soaflags = new_soaflags + [soaflags[m]] new_idxs = new_idxs + [int(idxs[m])] new_inds = new_inds + [inds[m]] new_vars = new_vars + [var[m]] new_typs = new_typs + [typs[m]] vectorised = vectorised + [0] dims = new_dims maps = new_maps accs = new_accs idxs = new_idxs inds = new_inds var = new_vars typs = new_typs soaflags = new_soaflags nargs = len(vectorised) for i in range(1, ninds + 1): for index in range(0, len(inds) + 1): if inds[index] == i: invinds[i - 1] = index break else: vectorised = [0] * nargs unique_args = range(1, nargs + 1) cumulative_indirect_index = [-1] * nargs j = 0 for i in range(0, nargs): if maps[i] == OP_MAP and ((not inc_stage) or accs[i] == OP_INC): cumulative_indirect_index[i] = j j = j + 1 any_soa = 0 any_soa = any_soa or sum(soaflags) # # set two logicals # j = -1 for i in range(0, nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0, nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ: j = i reduct = j >= 0 if inc_stage: ninds_staged = 0 inds_staged = [-1] * nargs for i in range(0, nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: if inds_staged[invinds[inds[i] - 1]] == -1: inds_staged[i] = ninds_staged ninds_staged = ninds_staged + 1 else: inds_staged[i] = inds_staged[invinds[inds[i] - 1]] invinds_staged = [-1] * ninds_staged inddims_staged = [-1] * ninds_staged indopts_staged = [-1] * ninds_staged for i in range(0, nargs): if inds_staged[i] >= 0 and invinds_staged[ inds_staged[i]] == -1: invinds_staged[inds_staged[i]] = i inddims_staged[inds_staged[i]] = dims[i] # if optflags[i] == 1: # indopts_staged[inds_staged[i]] = i for i in range(0, nargs): inds_staged[i] = inds_staged[i] + 1 ########################################################################## # start with CUDA kernel function ########################################################################## FORTRAN = 0 CPP = 1 g_m = 0 file_text = '' depth = 0 #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('__constant__ int opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2CONSTANT;') code('int opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0, nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('__constant__ int direct_' + name + '_stride_OP2CONSTANT;') code('int direct_' + name + '_stride_OP2HOST=-1;') dir_soa = g_m break comm('user function') found = 0 for files in glob.glob("*.h"): f = open(files, 'r') for line in f: match = re.search(r'' + '\\b' + name + '\\b', line) if match: file_name = f.name found = 1 break if found == 1: break if found == 0: print "COUND NOT FIND KERNEL", name f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b' + name + '\\b') i = p.search(kernel_text).start() if (i < 0): print "\n********" print "Error: cannot locate user kernel function name: " + name + " - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i + j, '{', '}') signature_text = kernel_text[i:i + j] l = signature_text[0:].find('(') head_text = signature_text[0:l] #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l + 1:m] body_text = kernel_text[i + j + 1:k] # check for number of arguments if len(signature_text.split(',')) != nargs: print 'Error parsing user kernel(%s): must have %d arguments' \ % name, nargs return for i in range(0, nargs): var = signature_text.split(',')[i].strip() if soaflags[i] and not (maps[i] == OP_MAP and accs[i] == OP_INC): var = var.replace('*', '') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length - 1].strip() body_text = re.sub('\*\\b' + var2 + '\\b\\s*(?!\[)', var2 + '[0]', body_text) body_text = re.sub( r'' + var2 + '\[([A-Za-z0-9]*)\]' + '', var2 + r'[\1*' + op2_gen_common.get_stride_string(i, maps, mapnames, name) + ']', body_text) signature_text = '__device__ ' + head_text + '_gpu( ' + signature_text + ') {' file_text += signature_text + body_text + '}\n' comm('') comm(' CUDA kernel function') if FORTRAN: code('subroutine op_cuda_' + name + '(') elif CPP: code('__global__ void op_cuda_' + name + '(') depth = 2 for g_m in range(0, ninds): if (indaccs[g_m] == OP_READ): code('const INDTYP *__restrict INDARG,') else: code('INDTYP *__restrict INDARG,') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('const int *__restrict opDat' + str(invinds[inds[g_m] - 1]) + 'Map, ') for g_m in range(0, nargs): if maps[g_m] == OP_ID: if accs[g_m] == OP_READ: code('const TYP *__restrict ARG,') else: code('TYP *ARG,') elif maps[g_m] == OP_GBL: if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[ g_m] == OP_MAX: code('TYP *ARG,') elif accs[g_m] == OP_READ and dims[g_m].isdigit() and int( dims[g_m]) == 1: code('const TYP *ARG,') if ind_inc and inc_stage == 1: code('int *ind_map,') code('short *arg_map,') code('int *ind_arg_sizes,') code('int *ind_arg_offs, ') if ninds > 0: code('int block_offset, ') code('int *blkmap, ') code('int *offset, ') code('int *nelems, ') code('int *ncolors, ') code('int *colors, ') code('int nblocks, ') code('int set_size) { ') else: code('int set_size ) {') code('') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: code('TYP ARG_l[DIM];') if accs[g_m] == OP_INC: FOR('d', '0', 'DIM') code('ARG_l[d]=ZERO_TYP;') ENDFOR() else: FOR('d', '0', 'DIM') code('ARG_l[d]=ARG[d+blockIdx.x*DIM];') ENDFOR() elif maps[g_m] == OP_MAP and accs[g_m] == OP_INC: code('TYP ARG_l[DIM];') for m in range(1, ninds + 1): g_m = m - 1 v = [int(inds[i] == m) for i in range(len(inds))] v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m] if sum(v) > 1 and sum(v_i) > 0: #check this sum(v_i) if indaccs[m - 1] == OP_INC: ind = int( max([ idxs[i] for i in range(len(inds)) if inds[i] == m ])) + 1 code('INDTYP *ARG_vec[' + str(ind) + '] = {') depth += 2 for n in range(0, nargs): if inds[n] == m: g_m = n code('ARG_l,') depth -= 2 code('};') else: ind = int( max([ idxs[i] for i in range(len(inds)) if inds[i] == m ])) + 1 code('INDTYP *ARG_vec[' + str(ind) + '];') # # lengthy code for general case with indirection # if ninds > 0: code('') if inc_stage == 1: for g_m in range(0, ninds): if indaccs[g_m] == OP_INC: code('__shared__ int *INDARG_map, INDARG_size;') code('__shared__ INDTYP *INDARG_s;') code('') if ind_inc: code('__shared__ int nelems2, ncolor;') code('__shared__ int nelem, offset_b;') code('') code('extern __shared__ char shared[];') code('') IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks') code('return;') ENDIF() IF('threadIdx.x==0') code('') comm('get sizes and shift pointers and direct-mapped data') code('') code( 'int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset];' ) code('') code('nelem = nelems[blockId];') code('offset_b = offset[blockId];') code('') if ind_inc: code('nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x);') code('ncolor = ncolors[blockId];') code('') if inc_stage == 1 and ind_inc: for g_m in range(0, ninds_staged): code('ind_arg' + str(inds[invinds_staged[g_m]] - 1) + '_size = ind_arg_sizes[' + str(g_m) + '+blockId*' + str(ninds_staged) + '];') code('') for m in range(1, ninds_staged + 1): g_m = m - 1 c = [i for i in range(nargs) if inds_staged[i] == m] code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\ '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];') code('') comm('set shared memory pointers') code('int nbytes = 0;') for g_m in range(0, ninds_staged): code('ind_arg' + str(inds[invinds_staged[g_m]] - 1) + '_s = (' + typs[invinds_staged[g_m]] + ' *) &shared[nbytes];') if g_m < ninds_staged - 1: code('nbytes += ROUND_UP(ind_ARG_size*sizeof(' + typs[invinds_staged[g_m]] + ')*' + dims[invinds_staged[g_m]] + ');') ENDIF() code('__syncthreads(); // make sure all of above completed') code('') if inc_stage == 1: for g_m in range(0, ninds): if indaccs[g_m] == OP_INC: FOR_INC('n', 'threadIdx.x', 'ind_ARG_size*INDDIM', 'blockDim.x') code('ind_ARG_s[n] = ZERO_INDTYP;') ENDFOR() if ind_inc: code('') code('__syncthreads();') code('') if ind_inc: FOR_INC('n', 'threadIdx.x', 'nelems2', 'blockDim.x') code('int col2 = -1;') k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map' + str(mapinds[g_m]) + 'idx;') IF('n<nelem') comm('initialise local variables') for g_m in range(0, nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: FOR('d', '0', 'DIM') code('ARG_l[d] = ZERO_TYP;') ENDFOR() else: FOR_INC('n', 'threadIdx.x', 'nelem', 'blockDim.x') k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map' + str(mapinds[g_m]) + 'idx;') k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('map' + str(mapinds[g_m]) + 'idx = opDat' + str(invmapinds[inds[g_m] - 1]) + 'Map[n + offset_b + set_size * ' + str(int(idxs[g_m])) + '];') # # simple alternative when no indirection # else: code('') comm('process set elements') FOR_INC('n', 'threadIdx.x+blockIdx.x*blockDim.x', 'set_size', 'blockDim.x*gridDim.x') # # kernel call # code('') comm('user-supplied kernel call') line = name + '_gpu(' prefix = ' ' * len(name) a = 0 #only apply indentation if its not the 0th argument indent = '' for m in range(0, nargs): if a > 0: indent = ' ' + ' ' * len(name) if maps[m] == OP_GBL: if accs[m] == OP_READ: line += rep(indent + 'ARG,\n', m) else: line += rep(indent + 'ARG_l,\n', m) a = a + 1 elif maps[m] == OP_MAP and accs[m] == OP_INC: line += rep(indent + 'ARG_l,\n', m) a = a + 1 elif maps[m] == OP_MAP: if soaflags[m]: line += rep( indent + 'ind_arg' + str(inds[m] - 1) + '+map' + str(mapinds[m]) + 'idx,' + '\n', m) else: line += rep( indent + 'ind_arg' + str(inds[m] - 1) + '+map' + str(mapinds[m]) + 'idx*DIM,' + '\n', m) a = a + 1 elif maps[m] == OP_ID: if ninds > 0: if soaflags[m]: line += rep(indent + 'ARG+(n+offset_b),\n', m) else: line += rep(indent + 'ARG+(n+offset_b)*DIM,\n', m) a = a + 1 else: if soaflags[m]: line += rep(indent + 'ARG+n,\n', m) else: line += rep(indent + 'ARG+n*DIM,\n', m) a = a + 1 else: print 'internal error 1 ' code(line[0:-2] + ');') #remove final ',' and \n # # updating for indirect kernels ... # if ninds > 0: if ind_inc: code('col2 = colors[n+offset_b];') ENDIF() code('') comm('store local variables') code('') if inc_stage == 1: for g_m in range(0, nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: code('int ARG_map;') IF('col2>=0') for g_m in range(0, nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: code('ARG_map = arg_map[' + str(cumulative_indirect_index[g_m]) + '*set_size+n+offset_b];') ENDIF() code('') FOR('col', '0', 'ncolor') IF('col2==col') if inc_stage == 1: for g_m in range(0, nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0, int(dims[g_m])): if soaflags[invinds[g_m]]: code('ARG_l[' + str(d) + '] += ind_arg' + str(inds[g_m] - 1) + '_s[ARG_map+' + str(d) + '*ind_arg' + str(inds[g_m] - 1) + '_size];') else: code('ARG_l[' + str(d) + '] += ind_arg' + str(inds[g_m] - 1) + '_s[' + str(d) + '+ARG_map*DIM];') for g_m in range(0, nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0, int(dims[g_m])): if soaflags[invinds[g_m]]: code('ind_arg' + str(inds[g_m] - 1) + '_s[ARG_map+' + str(d) + '*ind_arg' + str(inds[g_m] - 1) + '_size] = ARG_l[' + str(d) + '];') else: code('ind_arg' + str(inds[g_m] - 1) + '_s[' + str(d) + '+ARG_map*DIM] = ARG_l[' + str(d) + '];') else: for g_m in range(0, nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0, int(dims[g_m])): if soaflags[g_m]: code('ARG_l[' + str(d) + '] += ind_arg' + str(inds[g_m] - 1) + '[' + str(d) + '*' + op2_gen_common.get_stride_string( g_m, maps, mapnames, name) + '+map' + str(mapinds[g_m]) + 'idx];') else: code('ARG_l[' + str(d) + '] += ind_arg' + str(inds[g_m] - 1) + '[' + str(d) + '+map' + str(mapinds[g_m]) + 'idx*DIM];') for g_m in range(0, nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0, int(dims[g_m])): if soaflags[g_m]: code('ind_arg' + str(inds[g_m] - 1) + '[' + str(d) + '*' + op2_gen_common.get_stride_string( g_m, maps, mapnames, name) + '+map' + str(mapinds[g_m]) + 'idx] = ARG_l[' + str(d) + '];') else: code('ind_arg' + str(inds[g_m] - 1) + '[' + str(d) + '+map' + str(mapinds[g_m]) + 'idx*DIM] = ARG_l[' + str(d) + '];') ENDFOR() code('__syncthreads();') ENDFOR() ENDFOR() if inc_stage: for g_m in range(0, ninds): if indaccs[g_m] == OP_INC: if soaflags[invinds[g_m]]: FOR_INC('n', 'threadIdx.x', 'INDARG_size', 'blockDim.x') for d in range(0, int(dims[invinds[g_m]])): code('arg' + str(invinds[g_m]) + '_l[' + str(d) + '] = INDARG_s[n+' + str(d) + '*INDARG_size] + INDARG[INDARG_map[n]+' + str(d) + '*' + op2_gen_common.get_stride_string( g_m, maps, mapnames, name) + '];') for d in range(0, int(dims[invinds[g_m]])): code('INDARG[INDARG_map[n]+' + str(d) + '*' + op2_gen_common.get_stride_string( g_m, maps, mapnames, name) + '] = arg' + str(invinds[g_m]) + '_l[' + str(d) + '];') ENDFOR() else: FOR_INC('n', 'threadIdx.x', 'INDARG_size*INDDIM', 'blockDim.x') code( 'INDARG[n%INDDIM+INDARG_map[n/INDDIM]*INDDIM] += INDARG_s[n];' ) ENDFOR() # # global reduction # if reduct: code('') comm('global reductions') code('') for m in range(0, nargs): g_m = m if maps[m] == OP_GBL and accs[m] <> OP_READ: FOR('d', '0', 'DIM') if accs[m] == OP_INC: code( 'op_reduction<OP_INC>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);' ) elif accs[m] == OP_MIN: code( 'op_reduction<OP_MIN>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);' ) elif accs[m] == OP_MAX: code( 'op_reduction<OP_MAX>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);' ) else: print 'internal error: invalid reduction option' sys.exit(2) ENDFOR() depth -= 2 code('}') code('') ########################################################################## # then C++ stub function ########################################################################## code('') comm('host stub function') code('void op_par_loop_' + name + '(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args) - 1]: code('op_arg ARG){') code('') else: code('op_arg ARG,') for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('TYP*ARGh = (TYP *)ARG.data;') code('int nargs = ' + str(nargs) + ';') code('op_arg args[' + str(nargs) + '];') code('') #print vectorised for g_m in range(0, nargs): u = [ i for i in range(0, len(unique_args)) if unique_args[i] - 1 == g_m ] if len(u) > 0 and vectorised[g_m] > 0: code('ARG.idx = 0;') code('args[' + str(g_m) + '] = ARG;') v = [ int(vectorised[i] == vectorised[g_m]) for i in range(0, len(vectorised)) ] first = [i for i in range(0, len(v)) if v[i] == 1] first = first[0] FOR('v', '1', str(sum(v))) code('args['+str(g_m)+' + v] = op_arg_dat(arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m] > 0: pass else: code('args[' + str(g_m) + '] = ARG;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc(' + str(nk) + ');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' + str(nk) + '].name = name;') code('OP_kernels[' + str(nk) + '].count += 1;') code('') # # indirect bits # if ninds > 0: code('') code('int ninds = ' + str(ninds) + ';') line = 'int inds[' + str(nargs) + '] = {' for m in range(0, nargs): line += str(inds[m] - 1) + ',' code(line[:-1] + '};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: ' + name + '\\n");') ENDIF() code('') comm('get plan') code('#ifdef OP_PART_SIZE_' + str(nk)) code(' int part_size = OP_PART_SIZE_' + str(nk) + ';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('') code( 'int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: ' + name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') IF('set->size > 0') code('') # # kernel call for indirect version # if ninds > 0: if inc_stage == 1 and ind_inc: code( 'op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);' ) else: code( 'op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);' ) code('') # # transfer constants # g = [ i for i in range(0, nargs) if maps[i] == OP_GBL and accs[i] == OP_READ ] if len(g) > 0: comm('transfer constants to GPU') code('int consts_bytes = 0;') for m in range(0, nargs): g_m = m if maps[m] == OP_GBL and accs[m] == OP_READ: code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));') code('reallocConstArrays(consts_bytes);') code('consts_bytes = 0;') for m in range(0, nargs): if maps[m] == OP_GBL and accs[m] == OP_READ: g_m = m code('ARG.data = OP_consts_h + consts_bytes;') code('ARG.data_d = OP_consts_d + consts_bytes;') FOR('d', '0', 'DIM') code('((TYP *)ARG.data)[d] = ARGh[d];') ENDFOR() code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));') code('mvConstArraysToDevice(consts_bytes);') code('') #managing constants if any_soa: if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' + str(nk) + '].count==1) || (opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST != getSetSizeFromOpArg(&arg' + str(g_m) + '))') code('opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST = getSetSizeFromOpArg(&arg' + str(g_m) + ');') code('cudaMemcpyToSymbol(opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2CONSTANT, &opDat' + str(invinds[inds[g_m] - 1]) + '_' + name + '_stride_OP2HOST,sizeof(int));') ENDIF() if dir_soa <> -1: IF('(OP_kernels[' + str(nk) + '].count==1) || (direct_' + name + '_stride_OP2HOST != getSetSizeFromOpArg(&arg' + str(dir_soa) + '))') code('direct_' + name + '_stride_OP2HOST = getSetSizeFromOpArg(&arg' + str(dir_soa) + ');') code('cudaMemcpyToSymbol(direct_' + name + '_stride_OP2CONSTANT,&direct_' + name + '_stride_OP2HOST,sizeof(int));') ENDIF() # # transfer global reduction initial data # if ninds == 0: comm('set CUDA execution parameters') code('#ifdef OP_BLOCK_SIZE_' + str(nk)) code(' int nthread = OP_BLOCK_SIZE_' + str(nk) + ';') code('#else') code(' int nthread = OP_block_size;') comm(' int nthread = 128;') code('#endif') code('') code('int nblocks = 200;') code('') if reduct: comm('transfer global reduction data to GPU') if ninds > 0: code('int maxblocks = 0;') FOR('col', '0', 'Plan->ncolors') code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);') ENDFOR() else: code('int maxblocks = nblocks;') code('int reduct_bytes = 0;') code('int reduct_size = 0;') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: code( 'reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));') code('reduct_size = MAX(reduct_size,sizeof(TYP));') code('reallocReductArrays(reduct_bytes);') code('reduct_bytes = 0;') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: code('ARG.data = OP_reduct_h + reduct_bytes;') code('ARG.data_d = OP_reduct_d + reduct_bytes;') FOR('b', '0', 'maxblocks') FOR('d', '0', 'DIM') if accs[g_m] == OP_INC: code('((TYP *)ARG.data)[d+b*DIM] = ZERO_TYP;') else: code('((TYP *)ARG.data)[d+b*DIM] = ARGh[d];') ENDFOR() ENDFOR() code( 'reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));') code('mvReductArraysToDevice(reduct_bytes);') code('') # # kernel call for indirect version # if ninds > 0: comm('execute plan') code('') code('int block_offset = 0;') FOR('col', '0', 'Plan->ncolors') IF('col==Plan->ncolors_core') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('#ifdef OP_BLOCK_SIZE_' + str(nk)) code('int nthread = OP_BLOCK_SIZE_' + str(nk) + ';') code('#else') code('int nthread = OP_block_size;') code('#endif') code('') code( 'dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],' ) code( 'Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);' ) IF('Plan->ncolblk[col] > 0') if reduct or (inc_stage == 1 and ind_inc): if reduct and inc_stage == 1: code( 'int nshared = MAX(Plan->nshared,reduct_size*nthread);' ) elif reduct: code('int nshared = reduct_size*nthread;') else: code('int nshared = Plan->nsharedCol[col];') code('op_cuda_' + name + '<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_' + name + '<<<nblocks,nthread>>>(') for m in range(1, ninds + 1): g_m = invinds[m - 1] code('(TYP *)ARG.data_d,') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('arg' + str(g_m) + '.map_data_d, ') for g_m in range(0, nargs): if inds[g_m] == 0: code('(TYP*)ARG.data_d,') if inc_stage == 1 and ind_inc: code('Plan->ind_map,') code('Plan->loc_map,') code('Plan->ind_sizes,') code('Plan->ind_offs,') code('block_offset,') code('Plan->blkmap,') code('Plan->offset,') code('Plan->nelems,') code('Plan->nthrcol,') code('Plan->thrcol,') code('Plan->ncolblk[col],') code('set->size+set->exec_size);') code('') if reduct: comm('transfer global reduction data back to CPU') IF('col == Plan->ncolors_owned-1') code('mvReductArraysToHost(reduct_bytes);') ENDIF() ENDFOR() code('block_offset += Plan->ncolblk[col];') ENDIF() # # kernel call for direct version # else: if reduct: code('int nshared = reduct_size*nthread;') code('op_cuda_' + name + '<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_' + name + '<<<nblocks,nthread>>>(') indent = ' ' #*(len(name)+42) for g_m in range(0, nargs): if g_m > 0: code(indent + '(TYP *) ARG.data_d,') else: code(indent + '(TYP *) ARG.data_d,') code(indent + 'set->size );') if ninds > 0: code('OP_kernels[' + str(nk) + '].transfer += Plan->transfer;') code('OP_kernels[' + str(nk) + '].transfer2 += Plan->transfer2;') # # transfer global reduction initial data # if reduct: if ninds == 0: comm('transfer global reduction data back to CPU') code('mvReductArraysToHost(reduct_bytes);') for m in range(0, nargs): g_m = m if maps[m] == OP_GBL and accs[m] <> OP_READ: FOR('b', '0', 'maxblocks') FOR('d', '0', 'DIM') if accs[m] == OP_INC: code('ARGh[d] = ARGh[d] + ((TYP *)ARG.data)[d+b*DIM];') elif accs[m] == OP_MIN: code( 'ARGh[d] = MIN(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);' ) elif accs[m] == OP_MAX: code( 'ARGh[d] = MAX(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);' ) ENDFOR() ENDFOR() code('ARG.data = (char *)ARGh;') code('op_mpi_reduce(&ARG,ARGh);') ENDIF() code('op_mpi_set_dirtybit_cuda(nargs, args);') # # update kernel record # code('cutilSafeCall(cudaDeviceSynchronize());') comm('update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' + str(nk) + '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *' for g_m in range(0, nargs): if maps[g_m] <> OP_GBL: if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: code(line + ' ARG.size;') else: code(line + ' ARG.size * 2.0f;') depth = depth - 2 code('}') ########################################################################## # output individual kernel file ########################################################################## fid = open(name + '_kernel.cu', 'w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text = '' comm('header') code('#include "op_lib_cpp.h"') code('#include "op_cuda_rt_support.h"') code('#include "op_cuda_reduction.h"') code('') comm('global constants') code('#ifndef MAX_CONST_SIZE') code('#define MAX_CONST_SIZE 128') code('#endif') code('') for nc in range(0, len(consts)): if consts[nc]['dim'] == 1: code('__constant__ ' + consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + ';') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('__constant__ ' + consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + '[' + num + '];') code('') code('void op_decl_const_char(int dim, char const *type,') code('int size, char *dat, char const *name){') depth = depth + 2 code('if (!OP_hybrid_gpu) return;') for nc in range(0, len(consts)): IF('!strcmp(name,"' + consts[nc]['name'] + '")') if consts[nc]['dim'] < 0: IF('!strcmp(name,"' + consts[nc]['name'] + '") && size>MAX_CONST_SIZE) {') code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);') ENDIF() code('cutilSafeCall(cudaMemcpyToSymbol(' + consts[nc]['name'] + ', dat, dim*size));') ENDIF() code('else ') code('{') depth = depth + 2 code('printf("error: unknown const name\\n"); exit(1);') ENDIF() depth = depth - 2 code('}') code('') comm('user kernel files') for nk in range(0, len(kernels)): file_text = file_text +\ '#include "'+kernels[nk]['name']+'_kernel.cu"\n' master = master.split('.')[0] fid = open(master.split('.')[0] + '_kernels.cu', 'w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()