def resized(suffix, dtp, b, last): global need_extent if (need_extent == True): yutils.display( OUTFILE, "uintptr_t extent%d = %s->extent / sizeof(%s);\n" % (suffix, dtp, b)) need_extent = False
def contig(suffix, b, blklen, last): global num_paren_open num_paren_open += 1 yutils.display( OUTFILE, "for (int j%d = 0; j%d < count%d; j%d++) {\n" % (suffix, suffix, suffix, suffix)) global s s += " + j%d * stride%d" % (suffix, suffix)
def hindexed(suffix, b, blklen, last): global num_paren_open num_paren_open += 2 yutils.display(OUTFILE, "for (int j%d = 0; j%d < count%d; j%d++) {\n" % (suffix, suffix, suffix, suffix)) yutils.display(OUTFILE, "for (int k%d = 0; k%d < array_of_blocklengths%d[j%d]; k%d++) {\n" % \ (suffix, suffix, suffix, suffix, suffix)) global s if (last != 1): s += " + array_of_displs%d[j%d] / sizeof(%s) + k%d * extent%d" % \ (suffix, suffix, b, suffix, suffix + 1) else: s += " + array_of_displs%d[j%d] / sizeof(%s) + k%d" % (suffix, suffix, b, suffix)
def contig(suffix, dtp, b, last): global s global idx global need_extent yutils.display( OUTFILE, "intptr_t stride%d = %s->u.contig.child->extent;\n" % (suffix, dtp)) if (need_extent == True): yutils.display(OUTFILE, "uintptr_t extent%d = %s->extent;\n" % (suffix, dtp)) need_extent = False s += " + x%d * stride%d" % (idx, suffix) idx = idx + 1
def hvector(suffix, dtp, b, last): global s global idx global need_extent yutils.display( OUTFILE, "intptr_t stride%d = %s->u.hvector.stride;\n" % (suffix, dtp)) if (need_extent == True): yutils.display(OUTFILE, "uintptr_t extent%d = %s->extent;\n" % (suffix, dtp)) if (last != 1): s += " + x%d * stride%d + x%d * extent%d" % (idx, suffix, idx + 1, suffix + 1) need_extent = True else: s += " + x%d * stride%d + x%d * sizeof(%s)" % (idx, suffix, idx + 1, b) need_extent = False idx = idx + 2
def switcher_builtin(backend, OUTFILE, blklens, builtin_types, builtin_maps, typelist, pupstr): yutils.display(OUTFILE, "switch (%s->id) {\n" % child_type_str(typelist)) for b in builtin_types: switcher_builtin_element(backend, OUTFILE, blklens, typelist, pupstr, "YAKSA_TYPE__%s" % b.replace(" ", "_"), b.replace(" ", "_")) for key in builtin_maps: switcher_builtin_element(backend, OUTFILE, blklens, typelist, pupstr, key, builtin_maps[key]) yutils.display(OUTFILE, "default:\n") yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, "}\n")
def hvector_decl(nesting, dtp, b): yutils.display( OUTFILE, "intptr_t count%d = %s->u.hvector.count;\n" % (nesting, dtp)) yutils.display( OUTFILE, "intptr_t blocklength%d ATTRIBUTE((unused)) = %s->u.hvector.blocklength;\n" % (nesting, dtp)) yutils.display( OUTFILE, "intptr_t stride%d = %s->u.hvector.stride;\n" % (nesting, dtp)) yutils.display( OUTFILE, "uintptr_t extent%d ATTRIBUTE((unused)) = %s->extent;\n" % (nesting, dtp))
def hindexed(suffix, dtp, b, last): global s global idx global need_extent yutils.display( OUTFILE, "intptr_t *array_of_displs%d = %s->u.hindexed.array_of_displs;\n" % (suffix, dtp)) if (need_extent == True): yutils.display(OUTFILE, "uintptr_t extent%d = %s->extent;\n" % (suffix, dtp)) if (last != 1): s += " + array_of_displs%d[x%d] + x%d * extent%d" % \ (suffix, idx, idx + 1, suffix + 1) need_extent = True else: s += " + array_of_displs%d[x%d] + x%d * sizeof(%s)" % (suffix, idx, idx + 1, b) need_extent = False idx = idx + 2
def hindexed_decl(nesting, dtp, b): yutils.display( OUTFILE, "intptr_t count%d = %s->u.hindexed.count;\n" % (nesting, dtp)) yutils.display( OUTFILE, "intptr_t *restrict array_of_blocklengths%d = %s->u.hindexed.array_of_blocklengths;\n" % (nesting, dtp)) yutils.display( OUTFILE, "intptr_t *restrict array_of_displs%d = %s->u.hindexed.array_of_displs;\n" % (nesting, dtp)) yutils.display( OUTFILE, "uintptr_t extent%d ATTRIBUTE((unused)) = %s->extent;\n" % (nesting, dtp))
def blkhindx_decl(nesting, dtp, b): yutils.display( OUTFILE, "intptr_t count%d = %s->u.blkhindx.count;\n" % (nesting, dtp)) yutils.display( OUTFILE, "intptr_t blocklength%d ATTRIBUTE((unused)) = %s->u.blkhindx.blocklength;\n" % (nesting, dtp)) yutils.display( OUTFILE, "intptr_t *restrict array_of_displs%d = %s->u.blkhindx.array_of_displs;\n" % (nesting, dtp)) yutils.display( OUTFILE, "uintptr_t extent%d ATTRIBUTE((unused)) = %s->extent;\n" % (nesting, dtp))
def contig_decl(nesting, dtp, b): yutils.display(OUTFILE, "intptr_t count%d = %s->u.contig.count;\n" % (nesting, dtp)) yutils.display( OUTFILE, "intptr_t stride%d = %s->u.contig.child->extent;\n" % (nesting, dtp)) yutils.display( OUTFILE, "uintptr_t extent%d ATTRIBUTE((unused)) = %s->extent;\n" % (nesting, dtp))
def hvector(suffix, b, blklen, last): global num_paren_open num_paren_open += 2 yutils.display(OUTFILE, "for (int j%d = 0; j%d < count%d; j%d++) {\n" % (suffix, suffix, suffix, suffix)) if (blklen == "generic"): yutils.display(OUTFILE, "for (int k%d = 0; k%d < blocklength%d; k%d++) {\n" % (suffix, suffix, suffix, suffix)) else: yutils.display(OUTFILE, "for (int k%d = 0; k%d < %s; k%d++) {\n" % (suffix, suffix, blklen, suffix)) global s if (last != 1): s += " + j%d * stride%d + k%d * extent%d" % (suffix, suffix, suffix, suffix + 1) else: s += " + j%d * stride%d + k%d" % (suffix, suffix, suffix)
def blkhindx(suffix, b, blklen, last): global num_paren_open num_paren_open += 2 yutils.display(OUTFILE, "for (int j%d = 0; j%d < count%d; j%d++) {\n" % (suffix, suffix, suffix, suffix)) if (blklen == "generic"): yutils.display(OUTFILE, "for (int k%d = 0; k%d < blocklength%d; k%d++) {\n" % (suffix, suffix, suffix, suffix)) else: yutils.display(OUTFILE, "for (int k%d = 0; k%d < %s; k%d++) {\n" % (suffix, suffix, blklen, suffix)) global s if (last != 1): s += " + array_of_displs%d[j%d] / sizeof(%s) + k%d * extent%d" % \ (suffix, suffix, b, suffix, suffix + 1) else: s += " + array_of_displs%d[j%d] / sizeof(%s) + k%d" % (suffix, suffix, b, suffix)
def write_headers(): yutils.display(OUTFILE, "typedef signed char int8_t;\n") yutils.display(OUTFILE, "typedef signed short int int16_t;\n") yutils.display(OUTFILE, "typedef signed int int32_t;\n") yutils.display(OUTFILE, "typedef signed long int64_t;\n") yutils.display(OUTFILE, "typedef unsigned char uint8_t;\n") yutils.display(OUTFILE, "typedef unsigned short int uint16_t;\n") yutils.display(OUTFILE, "typedef unsigned int uint32_t;\n") yutils.display(OUTFILE, "typedef unsigned long uint64_t;\n") yutils.display(OUTFILE, "#include \"yaksuri_zei_md.h\"\n") yutils.display(OUTFILE, "\n")
def generate_kernels(b, darray, op): global need_extent global s global idx # we need pup kernels for reduction of basic types funclist = [] funclist.append("pack_%s" % op) funclist.append("unpack_%s" % op) for func in funclist: ##### figure out the function name to use funcprefix = "%s_" % func for d in darray: funcprefix = funcprefix + "%s_" % d funcprefix = funcprefix + b.replace(" ", "_") ##### generate the ZE kernel yutils.display( OUTFILE, "__kernel void yaksuri_zei_kernel_%s(__global const void *inbuf, __global void *outbuf, unsigned long count, __global const yaksuri_zei_md_s *__restrict__ md)\n" % funcprefix) yutils.display(OUTFILE, "{\n") yutils.display( OUTFILE, "__global const char *__restrict__ sbuf = (__global char *) inbuf;\n" ) yutils.display( OUTFILE, "__global char *__restrict__ dbuf = (__global char *) outbuf;\n") if ("unpack" in func and len(darray) != 0): yutils.display(OUTFILE, "dbuf = dbuf - md->true_lb;\n") elif (len(darray) != 0): yutils.display( OUTFILE, "sbuf = (__global const char *) ((__global char *)sbuf - md->true_lb);\n" ) yutils.display(OUTFILE, "uintptr_t extent = md->extent;\n") yutils.display(OUTFILE, "uintptr_t idx = get_global_id(0);\n") yutils.display(OUTFILE, "uintptr_t res = idx;\n") yutils.display(OUTFILE, "uintptr_t inner_elements = md->num_elements;\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "if (idx >= (count * inner_elements))\n") yutils.display(OUTFILE, " return;\n") yutils.display(OUTFILE, "\n") # copy loop idx = 0 md = "md" for d in darray: if (d == "hvector" or d == "blkhindx" or d == "hindexed" or \ d == "contig"): yutils.display(OUTFILE, "uintptr_t x%d = res / inner_elements;\n" % idx) idx = idx + 1 yutils.display(OUTFILE, "res %= inner_elements;\n") yutils.display(OUTFILE, "inner_elements /= %s->u.%s.count;\n" % (md, d)) yutils.display(OUTFILE, "\n") if (d == "hvector" or d == "blkhindx"): yutils.display(OUTFILE, "uintptr_t x%d = res / inner_elements;\n" % idx) idx = idx + 1 yutils.display(OUTFILE, "res %= inner_elements;\n") yutils.display( OUTFILE, "inner_elements /= %s->u.%s.blocklength;\n" % (md, d)) elif (d == "hindexed"): yutils.display(OUTFILE, "uintptr_t x%d;\n" % idx) yutils.display( OUTFILE, "for (intptr_t i = 0; i < %s->u.%s.count; i++) {\n" % (md, d)) yutils.display( OUTFILE, " uintptr_t in_elems = %s->u.%s.array_of_blocklengths[i] *\n" % (md, d)) yutils.display( OUTFILE, " %s->u.%s.child->num_elements;\n" % (md, d)) yutils.display(OUTFILE, " if (res < in_elems) {\n") yutils.display(OUTFILE, " x%d = i;\n" % idx) yutils.display(OUTFILE, " res %= in_elems;\n") yutils.display( OUTFILE, " inner_elements = %s->u.%s.child->num_elements;\n" % (md, d)) yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, " } else {\n") yutils.display(OUTFILE, " res -= in_elems;\n") yutils.display(OUTFILE, " }\n") yutils.display(OUTFILE, "}\n") idx = idx + 1 yutils.display(OUTFILE, "\n") md = "%s->u.%s.child" % (md, d) yutils.display(OUTFILE, "uintptr_t x%d = res;\n" % idx) yutils.display(OUTFILE, "\n") dtp = "md" s = "x0 * extent" idx = 1 x = 1 need_extent = False for d in darray: if (x == len(darray)): last = 1 else: last = 0 getattr(sys.modules[__name__], d)(x, dtp, b, last) x = x + 1 dtp = dtp + "->u.%s.child" % d if (func == "pack_REPLACE"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = *((const %s *) (const void *) (sbuf + %s));\n" % (b, b, b, s)) elif (func == "pack_SUM"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) += *((const %s *) (const void *) (sbuf + %s));\n" % (b, b, b, s)) elif (func == "pack_PROD"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) *= *((const %s *) (const void *) (sbuf + %s));\n" % (b, b, b, s)) elif (func == "pack_BOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) |= *((const %s *) (const void *) (sbuf + %s));\n" % (b, b, b, s)) elif (func == "pack_BAND"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) &= *((const %s *) (const void *) (sbuf + %s));\n" % (b, b, b, s)) elif (func == "pack_BXOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) ^= *((const %s *) (const void *) (sbuf + %s));\n" % (b, b, b, s)) elif (func == "pack_LOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = (*((%s *) (void *) (dbuf + idx * sizeof(%s)))) || (*((const %s *) (const void *) (sbuf + %s)));\n" % (b, b, b, b, b, s)) elif (func == "pack_LAND"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = (*((%s *) (void *) (dbuf + idx * sizeof(%s)))) && (*((const %s *) (const void *) (sbuf + %s)));\n" % (b, b, b, b, b, s)) elif (func == "pack_LXOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = !(*((%s *) (void *) (dbuf + idx * sizeof(%s)))) != !(*((const %s *) (const void *) (sbuf + %s)));\n" % (b, b, b, b, b, s)) elif (func == "pack_MAX" and (b == "float" or b == "double")): yutils.display( OUTFILE, " %s x_[2] = {*((const %s *) (const void *) (sbuf + %s)), *((%s *) (void *) (dbuf + idx * sizeof(%s)))};\n" % (b, b, s, b, b)) yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = x_[*((const %s *) (const void *) (sbuf + %s)) < *((%s *) (void *) (dbuf + idx * sizeof(%s)))];\n" % (b, b, b, s, b, b)) elif (func == "pack_MAX"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = *((const %s *) (const void *) (sbuf + %s)) ^ ((*((const %s *) (const void *) (sbuf + %s)) ^ *((%s *) (void *) (dbuf + idx * sizeof(%s)))) & -( *((const %s *) (const void *) (sbuf + %s)) < *((%s *) (void *) (dbuf + idx * sizeof(%s)))));\n" % (b, b, b, s, b, s, b, b, b, s, b, b)) elif (func == "pack_MIN" and (b == "float" or b == "double")): yutils.display( OUTFILE, " %s x_[2] = {*((const %s *) (const void *) (sbuf + %s)), *((%s *) (void *) (dbuf + idx * sizeof(%s)))};\n" % (b, b, s, b, b)) yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = x_[*((const %s *) (const void *) (sbuf + %s)) > *((%s *) (void *) (dbuf + idx * sizeof(%s)))];\n" % (b, b, b, s, b, b)) elif (func == "pack_MIN"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + idx * sizeof(%s))) = *((%s *) (void *) (dbuf + idx * sizeof(%s))) ^ ((*((const %s *) (const void *) (sbuf + %s)) ^ *((%s *) (void *) (dbuf + idx * sizeof(%s)))) & -( *((const %s *) (const void *) (sbuf + %s)) < *((%s *) (void *) (dbuf + idx * sizeof(%s)))));\n" % (b, b, b, b, b, s, b, b, b, s, b, b)) elif (func == "unpack_REPLACE"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = *((const %s *) (const void *) (sbuf + idx * sizeof(%s)));\n" % (b, s, b, b)) elif (func == "unpack_SUM"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) += *((const %s *) (const void *) (sbuf + idx * sizeof(%s)));\n" % (b, s, b, b)) elif (func == "unpack_PROD"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) *= *((const %s *) (const void *) (sbuf + idx * sizeof(%s)));\n" % (b, s, b, b)) elif (func == "unpack_BOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) |= *((const %s *) (const void *) (sbuf + idx * sizeof(%s)));\n" % (b, s, b, b)) elif (func == "unpack_BAND"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) &= *((const %s *) (const void *) (sbuf + idx * sizeof(%s)));\n" % (b, s, b, b)) elif (func == "unpack_BXOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) ^= *((const %s *) (const void *) (sbuf + idx * sizeof(%s)));\n" % (b, s, b, b)) elif (func == "unpack_LOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = (*((%s *) (void *) (dbuf + %s))) || (*((const %s *) (const void *) (sbuf + idx * sizeof(%s))));\n" % (b, s, b, s, b, b)) elif (func == "unpack_LAND"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = (*((%s *) (void *) (dbuf + %s))) && (*((const %s *) (const void *) (sbuf + idx * sizeof(%s))));\n" % (b, s, b, s, b, b)) elif (func == "unpack_LXOR"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = !(*((%s *) (void *) (dbuf + %s))) != !(*((const %s *) (const void *) (sbuf + idx * sizeof(%s))));\n" % (b, s, b, s, b, b)) elif (func == "unpack_MAX" and (b == "float" or b == "double")): yutils.display( OUTFILE, " %s x_[2] = {*((const %s *) (const void *) (sbuf + idx * sizeof(%s))), *((%s *) (void *) (dbuf + %s))};\n" % (b, b, b, b, s)) yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = x_[*((const %s *) (const void *) (sbuf + idx * sizeof(%s))) < *((%s *) (void *) (dbuf + %s))];\n" % (b, s, b, b, b, s)) elif (func == "unpack_MAX"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = *((const %s *) (const void *) (sbuf + idx * sizeof(%s))) ^ ((*((const %s *) (const void *) (sbuf + idx * sizeof(%s))) ^ *((%s *) (void *) (dbuf + %s))) & -( *((const %s *) (const void *) (sbuf + idx * sizeof(%s))) < *((%s *) (void *) (dbuf + %s))));\n" % (b, s, b, b, b, b, b, s, b, b, b, s)) elif (func == "unpack_MIN" and (b == "float" or b == "double")): yutils.display( OUTFILE, " %s x_[2] = {*((const %s *) (const void *) (sbuf + idx * sizeof(%s))), *((%s *) (void *) (dbuf + %s))};\n" % (b, b, b, b, s)) yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = x_[*((const %s *) (const void *) (sbuf + idx * sizeof(%s))) > *((%s *) (void *) (dbuf + %s))];\n" % (b, s, b, b, b, s)) elif (func == "unpack_MIN"): yutils.display( OUTFILE, "*((%s *) (void *) (dbuf + %s)) = *((%s *) (void *) (dbuf + %s)) ^ ((*((const %s *) (const void *) (sbuf + idx * sizeof(%s))) ^ *((%s *) (void *) (dbuf + %s))) & -( *((const %s *) (const void *) (sbuf + idx * sizeof(%s))) < *((%s *) (void *) (dbuf + %s))));\n" % (b, s, b, s, b, b, b, s, b, b, b, s)) yutils.display(OUTFILE, "}\n\n")
def switcher_builtin_element(backend, OUTFILE, blklens, typelist, pupstr, key, val): yutils.display(OUTFILE, "case %s:\n" % key.upper()) if (len(typelist) == 0): t = "" else: t = typelist.pop() if (t == ""): nesting_level = 0 else: nesting_level = len(typelist) + 1 if ((t == "hvector" or t == "blkhindx") and (len(blklens) > 1)): yutils.display(OUTFILE, "switch (%s->u.%s.blocklength) {\n" % (child_type_str(typelist), t)) for blklen in blklens: if (blklen != "generic"): yutils.display(OUTFILE, "case %s:\n" % blklen) else: yutils.display(OUTFILE, "default:\n") yutils.display(OUTFILE, "if (max_nesting_level >= %d) {\n" % nesting_level) yutils.display(OUTFILE, "%s->pack = yaksuri_%si_%s_blklen_%s_%s;\n" % (backend, backend, pupstr, blklen, val)) yutils.display(OUTFILE, "%s->unpack = yaksuri_%si_un%s_blklen_%s_%s;\n" % (backend, backend, pupstr, blklen, val)) yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "break;\n") yutils.display(OUTFILE, "}\n") else: yutils.display(OUTFILE, "if (max_nesting_level >= %d) {\n" % nesting_level) yutils.display(OUTFILE, "%s->pack = yaksuri_%si_%s_%s;\n" % (backend, backend, pupstr, val)) yutils.display(OUTFILE, "%s->unpack = yaksuri_%si_un%s_%s;\n" % (backend, backend, pupstr, val)) yutils.display(OUTFILE, "}\n") if (t != ""): typelist.append(t) yutils.display(OUTFILE, "break;\n")
for darray in darraylist: darray.append(d1) darray.append(d2) for op in gencomm.type_ops[b]: generate_kernels(b, darray, op) darray.pop() darray.pop() OUTFILE.close() ##### generate code to load modules/kernels used by init_hook filename = "src/backend/ze/hooks/yaksuri_zei_init_kernels.c" yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include \"stdlib.h\"\n") yutils.display(OUTFILE, "#include \"yaksi.h\"\n") yutils.display(OUTFILE, "#include \"yaksuri_zei.h\"\n") yutils.display(OUTFILE, "#include \"level_zero/ze_api.h\"\n\n") num_modules = 0 for b in builtin_types: OUTFILE.write( "extern const unsigned char yaksuri_zei_pup_%s_str[];\n" % b.replace(" ", "_")) OUTFILE.write("extern const size_t yaksuri_zei_pup_%s_size;\n" % b.replace(" ", "_")) num_modules += 1 for d1 in gencomm.derived_types: ##### generate the core pack/unpack kernels (single level) OUTFILE.write(
def generate_host_function(b, darray): for func in "pack", "unpack": funcprefix = "%s_" % func for d in darray: funcprefix = funcprefix + "%s_" % d funcprefix = funcprefix + b.replace(" ", "_") yutils.display( OUTFILE, "void yaksuri_cudai_%s(const void *inbuf, void *outbuf, uintptr_t count, yaksa_op_t op, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)\n" % funcprefix) yutils.display(OUTFILE, "{\n") yutils.display(OUTFILE, "void *args[] = { &inbuf, &outbuf, &count, &md };\n") yutils.display(OUTFILE, "cudaError_t cerr;\n") yutils.display(OUTFILE, "switch (op) {\n") for op in gencomm.type_ops[b]: funcprefix = "%s_%s_" % (func, op) for d in darray: funcprefix = funcprefix + "%s_" % d funcprefix = funcprefix + b.replace(" ", "_") yutils.display(OUTFILE, "case YAKSA_OP__%s:\n" % op) yutils.display( OUTFILE, "cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_%s,\n" % funcprefix) yutils.display( OUTFILE, " dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);\n" ) yutils.display(OUTFILE, "YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);\n") yutils.display(OUTFILE, "break;\n\n") yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "}\n\n")
def generate_kernels(b, darray, op): global need_extent global s global idx for func in "pack", "unpack": ##### figure out the function name to use funcprefix = "%s_%s_" % (func, op) for d in darray: funcprefix = funcprefix + "%s_" % d funcprefix = funcprefix + b.replace(" ", "_") ##### generate the CUDA kernel yutils.display( OUTFILE, "__global__ void yaksuri_cudai_kernel_%s(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)\n" % funcprefix) yutils.display(OUTFILE, "{\n") yutils.display( OUTFILE, "const char *__restrict__ sbuf = (const char *) inbuf;\n") yutils.display(OUTFILE, "char *__restrict__ dbuf = (char *) outbuf;\n") yutils.display(OUTFILE, "uintptr_t extent = md->extent;\n") yutils.display( OUTFILE, "uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n") yutils.display(OUTFILE, "uintptr_t res = idx;\n") yutils.display(OUTFILE, "uintptr_t inner_elements = md->num_elements;\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "if (idx >= (count * inner_elements))\n") yutils.display(OUTFILE, " return;\n") yutils.display(OUTFILE, "\n") # copy loop idx = 0 md = "md" for d in darray: if (d == "hvector" or d == "blkhindx" or d == "hindexed" or \ d == "contig"): yutils.display(OUTFILE, "uintptr_t x%d = res / inner_elements;\n" % idx) idx = idx + 1 yutils.display(OUTFILE, "res %= inner_elements;\n") yutils.display(OUTFILE, "inner_elements /= %s->u.%s.count;\n" % (md, d)) yutils.display(OUTFILE, "\n") if (d == "hvector" or d == "blkhindx"): yutils.display(OUTFILE, "uintptr_t x%d = res / inner_elements;\n" % idx) idx = idx + 1 yutils.display(OUTFILE, "res %= inner_elements;\n") yutils.display( OUTFILE, "inner_elements /= %s->u.%s.blocklength;\n" % (md, d)) elif (d == "hindexed"): yutils.display(OUTFILE, "uintptr_t x%d;\n" % idx) yutils.display( OUTFILE, "for (intptr_t i = 0; i < %s->u.%s.count; i++) {\n" % (md, d)) yutils.display( OUTFILE, " uintptr_t in_elems = %s->u.%s.array_of_blocklengths[i] *\n" % (md, d)) yutils.display( OUTFILE, " %s->u.%s.child->num_elements;\n" % (md, d)) yutils.display(OUTFILE, " if (res < in_elems) {\n") yutils.display(OUTFILE, " x%d = i;\n" % idx) yutils.display(OUTFILE, " res %= in_elems;\n") yutils.display( OUTFILE, " inner_elements = %s->u.%s.child->num_elements;\n" % (md, d)) yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, " } else {\n") yutils.display(OUTFILE, " res -= in_elems;\n") yutils.display(OUTFILE, " }\n") yutils.display(OUTFILE, "}\n") idx = idx + 1 yutils.display(OUTFILE, "\n") md = "%s->u.%s.child" % (md, d) yutils.display(OUTFILE, "uintptr_t x%d = res;\n" % idx) yutils.display(OUTFILE, "\n") dtp = "md" s = "x0 * extent" idx = 1 x = 1 need_extent = False for d in darray: if (x == len(darray)): last = 1 else: last = 0 getattr(sys.modules[__name__], d)(x, dtp, b, last) x = x + 1 dtp = dtp + "->u.%s.child" % d if (func == "pack"): if ((b == "float" or b == "double") and (op == "MAX" or op == "MIN")): yutils.display( OUTFILE, "YAKSURI_CUDAI_OP_%s_FLOAT(%s, *((const %s *) (const void *) (sbuf + %s)), *((%s *) (void *) (dbuf + idx * sizeof(%s))));\n" % (op, b, b, s, b, b)) else: yutils.display( OUTFILE, "YAKSURI_CUDAI_OP_%s(*((const %s *) (const void *) (sbuf + %s)), *((%s *) (void *) (dbuf + idx * sizeof(%s))));\n" % (op, b, s, b, b)) else: if ((b == "float" or b == "double") and (op == "MAX" or op == "MIN")): yutils.display( OUTFILE, "YAKSURI_CUDAI_OP_%s_FLOAT(%s, *((const %s *) (const void *) (sbuf + idx * sizeof(%s))), *((%s *) (void *) (dbuf + %s)));\n" % (op, b, b, b, b, s)) else: yutils.display( OUTFILE, "YAKSURI_CUDAI_OP_%s(*((const %s *) (const void *) (sbuf + idx * sizeof(%s))), *((%s *) (void *) (dbuf + %s)));\n" % (op, b, b, b, s)) yutils.display(OUTFILE, "}\n\n")
def resized_decl(nesting, dtp, b): yutils.display(OUTFILE, "uintptr_t extent%d ATTRIBUTE((unused)) = %s->extent / sizeof(%s);\n" % (nesting, dtp, b))
def generate_kernels(b, darray): global need_extent global s global idx # we don't need pup kernels for basic types if (len(darray) == 0): return for func in "pack", "unpack": ##### figure out the function name to use funcprefix = "%s_" % func for d in darray: funcprefix = funcprefix + "%s_" % d funcprefix = funcprefix + b.replace(" ", "_") ##### generate the CUDA kernel yutils.display( OUTFILE, "__global__ void yaksuri_cudai_kernel_%s(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)\n" % funcprefix) yutils.display(OUTFILE, "{\n") yutils.display( OUTFILE, "const %s *__restrict__ sbuf = (const %s *) inbuf;\n" % (b, b)) yutils.display(OUTFILE, "%s *__restrict__ dbuf = (%s *) outbuf;\n" % (b, b)) yutils.display(OUTFILE, "uintptr_t extent = md->extent / sizeof(%s);\n" % b) yutils.display( OUTFILE, "uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n") yutils.display(OUTFILE, "uintptr_t res = idx;\n") yutils.display(OUTFILE, "uintptr_t inner_elements = md->num_elements;\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "if (idx >= (count * inner_elements))\n") yutils.display(OUTFILE, " return;\n") yutils.display(OUTFILE, "\n") # copy loop idx = 0 md = "md" for d in darray: if (d == "hvector" or d == "blkhindx" or d == "hindexed" or \ d == "contig"): yutils.display(OUTFILE, "uintptr_t x%d = res / inner_elements;\n" % idx) idx = idx + 1 yutils.display(OUTFILE, "res %= inner_elements;\n") yutils.display(OUTFILE, "inner_elements /= %s->u.%s.count;\n" % (md, d)) yutils.display(OUTFILE, "\n") if (d == "hvector" or d == "blkhindx"): yutils.display(OUTFILE, "uintptr_t x%d = res / inner_elements;\n" % idx) idx = idx + 1 yutils.display(OUTFILE, "res %= inner_elements;\n") yutils.display( OUTFILE, "inner_elements /= %s->u.%s.blocklength;\n" % (md, d)) elif (d == "hindexed"): yutils.display(OUTFILE, "uintptr_t x%d;\n" % idx) yutils.display( OUTFILE, "for (int i = 0; i < %s->u.%s.count; i++) {\n" % (md, d)) yutils.display( OUTFILE, " uintptr_t in_elems = %s->u.%s.array_of_blocklengths[i] *\n" % (md, d)) yutils.display( OUTFILE, " %s->u.%s.child->num_elements;\n" % (md, d)) yutils.display(OUTFILE, " if (res < in_elems) {\n") yutils.display(OUTFILE, " x%d = i;\n" % idx) yutils.display(OUTFILE, " res %= in_elems;\n") yutils.display( OUTFILE, " inner_elements = %s->u.%s.child->num_elements;\n" % (md, d)) yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, " } else {\n") yutils.display(OUTFILE, " res -= in_elems;\n") yutils.display(OUTFILE, " }\n") yutils.display(OUTFILE, "}\n") idx = idx + 1 yutils.display(OUTFILE, "\n") md = "%s->u.%s.child" % (md, d) yutils.display(OUTFILE, "uintptr_t x%d = res;\n" % idx) yutils.display(OUTFILE, "\n") dtp = "md" s = "x0 * extent" idx = 1 x = 1 need_extent = False for d in darray: if (x == len(darray)): last = 1 else: last = 0 getattr(sys.modules[__name__], d)(x, dtp, b, last) x = x + 1 dtp = dtp + "->u.%s.child" % d if (func == "pack"): yutils.display(OUTFILE, "dbuf[idx] = sbuf[%s];\n" % s) else: yutils.display(OUTFILE, "dbuf[%s] = sbuf[idx];\n" % s) yutils.display(OUTFILE, "}\n\n") # generate the host function yutils.display( OUTFILE, "void yaksuri_cudai_%s(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)\n" % funcprefix) yutils.display(OUTFILE, "{\n") yutils.display(OUTFILE, "void *args[] = { &inbuf, &outbuf, &count, &md };\n") yutils.display( OUTFILE, "cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_%s,\n" % funcprefix) yutils.display( OUTFILE, " dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);\n" ) yutils.display(OUTFILE, "YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);\n") yutils.display(OUTFILE, "}\n\n")
def switcher(backend, OUTFILE, blklens, builtin_types, builtin_maps, typelist, pupstr, nests): yutils.display(OUTFILE, "switch (%s->kind) {\n" % child_type_str(typelist)) for x in range(len(derived_types)): d = derived_types[x] if (nests > 1): yutils.display(OUTFILE, "case YAKSI_TYPE_KIND__%s:\n" % d.upper()) typelist.append(d) switcher(backend, OUTFILE, blklens, builtin_types, builtin_maps, typelist, pupstr + "_%s" % d, nests - 1) typelist.pop() yutils.display(OUTFILE, "break;\n") if (len(typelist)): yutils.display(OUTFILE, "case YAKSI_TYPE_KIND__BUILTIN:\n") switcher_builtin(backend, OUTFILE, blklens, builtin_types, builtin_maps, typelist, pupstr) yutils.display(OUTFILE, "break;\n") yutils.display(OUTFILE, "default:\n") yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, "}\n")
help='maximum nesting levels to generate') args = parser.parse_args() if (args.pup_max_nesting < 0): parser.print_help() print print("===> ERROR: pup-max-nesting must be positive") sys.exit(1) ##### generate the core pack/unpack kernels (single level) for b in builtin_types: for d in gencomm.derived_types: filename = "src/backend/cuda/pup/yaksuri_cudai_pup_%s_%s.cu" % ( d, b.replace(" ", "_")) yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <string.h>\n") yutils.display(OUTFILE, "#include <stdint.h>\n") yutils.display(OUTFILE, "#include <wchar.h>\n") yutils.display(OUTFILE, "#include <assert.h>\n") yutils.display(OUTFILE, "#include <cuda.h>\n") yutils.display(OUTFILE, "#include <cuda_runtime.h>\n") yutils.display(OUTFILE, "#include \"yaksuri_cudai.h\"\n") yutils.display(OUTFILE, "#include \"yaksuri_cudai_pup.h\"\n") yutils.display(OUTFILE, "\n") emptylist = [] emptylist.append(d) generate_kernels(b, emptylist) emptylist.pop() OUTFILE.close()
def populate_pupfns(pup_max_nesting, backend, blklens, builtin_types, builtin_maps): ##### generate the switching logic to select pup functions filename = "src/backend/%s/pup/yaksuri_%si_populate_pupfns.c" % (backend, backend) yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <stdio.h>\n") yutils.display(OUTFILE, "#include \"yaksi.h\"\n") yutils.display(OUTFILE, "#include \"yaksu.h\"\n") yutils.display(OUTFILE, "#include \"yaksuri_%si.h\"\n" % backend) yutils.display(OUTFILE, "#include \"yaksuri_%si_populate_pupfns.h\"\n" % backend) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "int yaksuri_%si_populate_pupfns(yaksi_type_s * type)\n" % backend) yutils.display(OUTFILE, "{\n") yutils.display(OUTFILE, "int rc = YAKSA_SUCCESS;\n") yutils.display(OUTFILE, "yaksuri_%si_type_s *%s = (yaksuri_%si_type_s *) type->backend.%s.priv;\n" \ % (backend, backend, backend, backend)) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "%s->pack = NULL;\n" % backend) yutils.display(OUTFILE, "%s->unpack = NULL;\n" % backend) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "switch (type->kind) {\n") for dtype1 in derived_types: yutils.display(OUTFILE, "case YAKSI_TYPE_KIND__%s:\n" % dtype1.upper()) yutils.display(OUTFILE, "switch (type->u.%s.child->kind) {\n" % dtype1) for dtype2 in derived_types: yutils.display(OUTFILE, "case YAKSI_TYPE_KIND__%s:\n" % dtype2.upper()) yutils.display(OUTFILE, "rc = yaksuri_%si_populate_pupfns_%s_%s(type);\n" % (backend, dtype1, dtype2)) yutils.display(OUTFILE, "break;\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "case YAKSI_TYPE_KIND__BUILTIN:\n") yutils.display(OUTFILE, "rc = yaksuri_%si_populate_pupfns_%s_builtin(type);\n" % (backend, dtype1)) yutils.display(OUTFILE, "break;\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "default:\n") yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "break;\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "default:\n") yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, " return rc;\n") yutils.display(OUTFILE, "}\n"); OUTFILE.close() for dtype1 in derived_types: for dtype2 in derived_types: filename = "src/backend/%s/pup/yaksuri_%si_populate_pupfns_%s_%s.c" % (backend, backend, dtype1, dtype2) yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <stdio.h>\n") yutils.display(OUTFILE, "#include <stdlib.h>\n") yutils.display(OUTFILE, "#include <wchar.h>\n") yutils.display(OUTFILE, "#include \"yaksi.h\"\n") yutils.display(OUTFILE, "#include \"yaksu.h\"\n") yutils.display(OUTFILE, "#include \"yaksuri_%si.h\"\n" % backend) yutils.display(OUTFILE, "#include \"yaksuri_%si_populate_pupfns.h\"\n" % backend) yutils.display(OUTFILE, "#include \"yaksuri_%si_pup.h\"\n" % backend) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "int yaksuri_%si_populate_pupfns_%s_%s(yaksi_type_s * type)\n" % (backend, dtype1, dtype2)) yutils.display(OUTFILE, "{\n") yutils.display(OUTFILE, "int rc = YAKSA_SUCCESS;\n") yutils.display(OUTFILE, "yaksuri_%si_type_s *%s = (yaksuri_%si_type_s *) type->backend.%s.priv;\n" \ % (backend, backend, backend, backend)) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "char *str = getenv(\"YAKSA_ENV_MAX_NESTING_LEVEL\");\n") yutils.display(OUTFILE, "int max_nesting_level;\n") yutils.display(OUTFILE, "if (str) {\n") yutils.display(OUTFILE, "max_nesting_level = atoi(str);\n") yutils.display(OUTFILE, "} else {\n") yutils.display(OUTFILE, "max_nesting_level = YAKSI_ENV_DEFAULT_NESTING_LEVEL;\n") yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "\n") pupstr = "pack_%s_%s" % (dtype1, dtype2) typelist = [ dtype1, dtype2 ] switcher(backend, OUTFILE, blklens, builtin_types, builtin_maps, typelist, pupstr, pup_max_nesting - 1) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "return rc;\n") yutils.display(OUTFILE, "}\n") OUTFILE.close() filename = "src/backend/%s/pup/yaksuri_%si_populate_pupfns_%s_builtin.c" % (backend, backend, dtype1) yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <stdio.h>\n") yutils.display(OUTFILE, "#include <stdlib.h>\n") yutils.display(OUTFILE, "#include <wchar.h>\n") yutils.display(OUTFILE, "#include \"yaksi.h\"\n") yutils.display(OUTFILE, "#include \"yaksu.h\"\n") yutils.display(OUTFILE, "#include \"yaksuri_%si.h\"\n" % backend) yutils.display(OUTFILE, "#include \"yaksuri_%si_populate_pupfns.h\"\n" % backend) yutils.display(OUTFILE, "#include \"yaksuri_%si_pup.h\"\n" % backend) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "int yaksuri_%si_populate_pupfns_%s_builtin(yaksi_type_s * type)\n" % (backend, dtype1)) yutils.display(OUTFILE, "{\n") yutils.display(OUTFILE, "int rc = YAKSA_SUCCESS;\n") yutils.display(OUTFILE, "yaksuri_%si_type_s *%s = (yaksuri_%si_type_s *) type->backend.%s.priv;\n" \ % (backend, backend, backend, backend)) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "char *str = getenv(\"YAKSA_ENV_MAX_NESTING_LEVEL\");\n") yutils.display(OUTFILE, "int max_nesting_level;\n") yutils.display(OUTFILE, "if (str) {\n") yutils.display(OUTFILE, "max_nesting_level = atoi(str);\n") yutils.display(OUTFILE, "} else {\n") yutils.display(OUTFILE, "max_nesting_level = YAKSI_ENV_DEFAULT_NESTING_LEVEL;\n") yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "\n") pupstr = "pack_%s" % dtype1 typelist = [ dtype1 ] switcher_builtin(backend, OUTFILE, blklens, builtin_types, builtin_maps, typelist, pupstr) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "return rc;\n") yutils.display(OUTFILE, "}\n") OUTFILE.close() ##### generate the Makefile for the pup function selection functions filename = "src/backend/%s/pup/Makefile.pup.mk" % backend yutils.copyright_makefile(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "libyaksa_la_SOURCES += \\\n") for dtype1 in derived_types: for dtype2 in derived_types: yutils.display(OUTFILE, "\tsrc/backend/%s/pup/yaksuri_%si_populate_pupfns_%s_%s.c \\\n" % (backend, backend, dtype1, dtype2)) yutils.display(OUTFILE, "\tsrc/backend/%s/pup/yaksuri_%si_populate_pupfns_%s_builtin.c \\\n" % (backend, backend, dtype1)) yutils.display(OUTFILE, "\tsrc/backend/%s/pup/yaksuri_%si_populate_pupfns.c\n" % (backend, backend)) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "noinst_HEADERS += \\\n") yutils.display(OUTFILE, "\tsrc/backend/%s/pup/yaksuri_%si_populate_pupfns.h \\\n" % (backend, backend)) yutils.display(OUTFILE, "\tsrc/backend/%s/pup/yaksuri_%si_pup.h\n" % (backend, backend)) OUTFILE.close() ##### generate the header file for the pup function selection functions filename = "src/backend/%s/pup/yaksuri_%si_populate_pupfns.h" % (backend, backend) yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#ifndef YAKSURI_%sI_POPULATE_PUPFNS_H_INCLUDED\n" % backend.upper()) yutils.display(OUTFILE, "#define YAKSURI_%sI_POPULATE_PUPFNS_H_INCLUDED\n" % backend.upper()) yutils.display(OUTFILE, "\n") for dtype1 in derived_types: for dtype2 in derived_types: yutils.display(OUTFILE, "int yaksuri_%si_populate_pupfns_%s_%s(yaksi_type_s * type);\n" % (backend, dtype1, dtype2)) yutils.display(OUTFILE, "int yaksuri_%si_populate_pupfns_%s_builtin(yaksi_type_s * type);\n" % (backend, dtype1)) yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "#endif /* YAKSURI_%sI_POPULATE_PUPFNS_H_INCLUDED */\n" % backend.upper()) OUTFILE.close()
print print("===> ERROR: pup-max-nesting must be positive") sys.exit(1) ##### generate the list of derived datatype arrays darraylist = [] yutils.generate_darrays(gencomm.derived_types, darraylist, args.pup_max_nesting) ##### generate the core pack/unpack kernels for b in builtin_types: filename = "src/backend/seq/pup/yaksuri_seqi_pup_%s.c" % b.replace( " ", "_") yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <string.h>\n") yutils.display(OUTFILE, "#include <stdint.h>\n") yutils.display(OUTFILE, "#include <wchar.h>\n") yutils.display(OUTFILE, "#include \"yaksuri_seqi_pup.h\"\n") yutils.display(OUTFILE, "\n") for darray in darraylist: for blklen in blklens: generate_kernels(b, darray, blklen) OUTFILE.close() ##### generate the core pack/unpack kernel declarations filename = "src/backend/seq/pup/yaksuri_seqi_pup.h" yutils.copyright_c(filename) OUTFILE = open(filename, "a")
def generate_kernels(b, darray, blklen): global num_paren_open global s # we don't need pup kernels for basic types if (len(darray) == 0): return # individual blocklength optimization is only for # hvector and blkhindx if (darray[-1] != "hvector" and darray[-1] != "blkhindx" and blklen != "generic"): return for func in "pack","unpack": ##### figure out the function name to use s = "int yaksuri_seqi_%s_" % func for d in darray: s = s + "%s_" % d # hvector and hindexed get blklen-specific function names if (darray[-1] != "hvector" and darray[-1] != "blkhindx"): s = s + b.replace(" ", "_") else: s = s + "blklen_%s_" % blklen + b.replace(" ", "_") yutils.display(OUTFILE, "%s(const void *inbuf, void *outbuf, uintptr_t count, yaksi_type_s * type)\n" % s), yutils.display(OUTFILE, "{\n") ##### variable declarations # generic variables yutils.display(OUTFILE, "int rc = YAKSA_SUCCESS;\n"); yutils.display(OUTFILE, "const %s *restrict sbuf = (const %s *) inbuf;\n" % (b, b)); yutils.display(OUTFILE, "%s *restrict dbuf = (%s *) outbuf;\n" % (b, b)); yutils.display(OUTFILE, "uintptr_t extent ATTRIBUTE((unused)) = type->extent / sizeof(%s);\n" % b) yutils.display(OUTFILE, "\n"); # variables specific to each nesting level s = "type" for x in range(len(darray)): getattr(sys.modules[__name__], "%s_decl" % darray[x])(x + 1, s, b) yutils.display(OUTFILE, "\n") s = s + "->u.%s.child" % darray[x] ##### non-hvector and non-blkhindx yutils.display(OUTFILE, "uintptr_t idx = 0;\n") yutils.display(OUTFILE, "for (int i = 0; i < count; i++) {\n") num_paren_open += 1 s = "i * extent" for x in range(len(darray)): if (x != len(darray) - 1): getattr(sys.modules[__name__], darray[x])(x + 1, b, "generic", 0) else: getattr(sys.modules[__name__], darray[x])(x + 1, b, blklen, 1) if (func == "pack"): yutils.display(OUTFILE, "dbuf[idx++] = sbuf[%s];\n" % s) else: yutils.display(OUTFILE, "dbuf[%s] = sbuf[idx++];\n" % s) for x in range(num_paren_open): yutils.display(OUTFILE, "}\n") num_paren_open = 0 yutils.display(OUTFILE, "\n"); yutils.display(OUTFILE, "return rc;\n") yutils.display(OUTFILE, "}\n\n")
def generate_kernels(b, darray, blklen): global num_paren_open global s # individual blocklength optimization is only for # hvector and blkhindx if (len(darray) and darray[-1] != "hvector" and darray[-1] != "blkhindx" and blklen != "generic"): return for func in "pack", "unpack": ##### figure out the function name to use s = "int yaksuri_seqi_%s_" % func for d in darray: s = s + "%s_" % d # hvector and hindexed get blklen-specific function names if (len(darray) and (darray[-1] == "hvector" or darray[-1] == "blkhindx")): s = s + "blklen_%s_" % blklen + b.replace(" ", "_") else: s = s + b.replace(" ", "_") yutils.display( OUTFILE, "%s(const void *inbuf, void *outbuf, uintptr_t count, yaksi_type_s * type, yaksa_op_t op)\n" % s), yutils.display(OUTFILE, "{\n") ##### variable declarations # generic variables yutils.display(OUTFILE, "int rc = YAKSA_SUCCESS;\n") yutils.display(OUTFILE, "const char *restrict sbuf = (const char *) inbuf;\n") yutils.display(OUTFILE, "char *restrict dbuf = (char *) outbuf;\n") yutils.display( OUTFILE, "uintptr_t extent ATTRIBUTE((unused)) = type->extent;\n") yutils.display(OUTFILE, "\n") # variables specific to each nesting level s = "type" for x in range(len(darray)): getattr(sys.modules[__name__], "%s_decl" % darray[x])(x + 1, s, b) yutils.display(OUTFILE, "\n") s = s + "->u.%s.child" % darray[x] yutils.display(OUTFILE, "uintptr_t idx = 0;\n") ##### non-hvector and non-blkhindx yutils.display(OUTFILE, "switch (op) {\n") for op in gencomm.type_ops[b]: yutils.display(OUTFILE, "case YAKSA_OP__%s:\n" % op) yutils.display(OUTFILE, "{\n") yutils.display(OUTFILE, "for (intptr_t i = 0; i < count; i++) {\n") num_paren_open += 1 s = "i * extent" for x in range(len(darray)): if (x != len(darray) - 1): getattr(sys.modules[__name__], darray[x])(x + 1, b, "generic", 0) else: getattr(sys.modules[__name__], darray[x])(x + 1, b, blklen, 1) if (func == "pack"): if ((b == "float" or b == "double" or b == "long double") and (op == "MAX" or op == "MIN")): yutils.display( OUTFILE, "YAKSURI_SEQI_OP_%s_FLOAT(%s, *((const %s *) (const void *) (sbuf + %s)), *((%s *) (void *) (dbuf + idx)));\n" % (op, b, b, s, b)) else: yutils.display( OUTFILE, "YAKSURI_SEQI_OP_%s(*((const %s *) (const void *) (sbuf + %s)), *((%s *) (void *) (dbuf + idx)));\n" % (op, b, s, b)) else: if ((b == "float" or b == "double" or b == "long double") and (op == "MAX" or op == "MIN")): yutils.display( OUTFILE, "YAKSURI_SEQI_OP_%s_FLOAT(%s, *((const %s *) (const void *) (sbuf + idx)), *((%s *) (void *) (dbuf + %s)));\n" % (op, b, b, b, s)) else: yutils.display( OUTFILE, "YAKSURI_SEQI_OP_%s(*((const %s *) (const void *) (sbuf + idx)), *((%s *) (void *) (dbuf + %s)));\n" % (op, b, b, s)) yutils.display(OUTFILE, "idx += sizeof(%s);\n" % b) for x in range(num_paren_open): yutils.display(OUTFILE, "}\n") num_paren_open = 0 yutils.display(OUTFILE, "break;\n") yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "default:\n") yutils.display(OUTFILE, " break;\n") yutils.display(OUTFILE, "}\n") yutils.display(OUTFILE, "\n") yutils.display(OUTFILE, "return rc;\n") yutils.display(OUTFILE, "}\n\n")
default=3, help='maximum nesting levels to generate') args = parser.parse_args() if (args.pup_max_nesting < 0): parser.print_help() print print("===> ERROR: pup-max-nesting must be positive") sys.exit(1) ##### generate the core pack/unpack kernels (zero levels) for b in builtin_types: filename = "src/backend/cuda/pup/yaksuri_cudai_pup_%s.cu" % b.replace( " ", "_") yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <string.h>\n") yutils.display(OUTFILE, "#include <stdint.h>\n") yutils.display(OUTFILE, "#include <wchar.h>\n") yutils.display(OUTFILE, "#include <assert.h>\n") yutils.display(OUTFILE, "#include <cuda.h>\n") yutils.display(OUTFILE, "#include <cuda_runtime.h>\n") yutils.display(OUTFILE, "#include \"yaksuri_cudai_base.h\"\n") yutils.display(OUTFILE, "#include \"yaksuri_cudai_pup.h\"\n") yutils.display(OUTFILE, "\n") emptylist = [] for op in gencomm.type_ops[b]: generate_kernels(b, emptylist, op) generate_host_function(b, emptylist) OUTFILE.close()
default=3, help='maximum nesting levels to generate') args = parser.parse_args() if (args.pup_max_nesting < 0): parser.print_help() print print("===> ERROR: pup-max-nesting must be positive") sys.exit(1) ##### generate the reduction kernels for contiguous types for b in builtin_types: filename = "src/backend/seq/pup/yaksuri_seqi_pup_%s.c" % b.replace( " ", "_") yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <string.h>\n") yutils.display(OUTFILE, "#include <stdint.h>\n") yutils.display(OUTFILE, "#include <wchar.h>\n") yutils.display(OUTFILE, "#include \"yaksuri_seqi_pup.h\"\n") yutils.display(OUTFILE, "\n") emptylist = [] generate_kernels(b, emptylist, 0) OUTFILE.close() ##### generate the core pack/unpack kernels (single level) for b in builtin_types: for d in gencomm.derived_types: filename = "src/backend/seq/pup/yaksuri_seqi_pup_%s_%s.c" % ( d, b.replace(" ", "_"))
parser = argparse.ArgumentParser() parser.add_argument('--pup-max-nesting', type=int, default=3, help='maximum nesting levels to generate') args = parser.parse_args() if (args.pup_max_nesting < 0): parser.print_help() print print("===> ERROR: pup-max-nesting must be positive") sys.exit(1) ##### generate the core pack/unpack kernels (single level) for b in builtin_types: for d in gencomm.derived_types: filename = "src/backend/seq/pup/yaksuri_seqi_pup_%s_%s.c" % (d, b.replace(" ","_")) yutils.copyright_c(filename) OUTFILE = open(filename, "a") yutils.display(OUTFILE, "#include <string.h>\n") yutils.display(OUTFILE, "#include <stdint.h>\n") yutils.display(OUTFILE, "#include <wchar.h>\n") yutils.display(OUTFILE, "#include \"yaksuri_seqi_pup.h\"\n") yutils.display(OUTFILE, "\n") emptylist = [ ] emptylist.append(d) for blklen in blklens: generate_kernels(b, emptylist, blklen) emptylist.pop() OUTFILE.close() ##### generate the core pack/unpack kernels (more than one level) darraylist = [ ]