def __init__(self, name):
    """Create an empty kernel description.

    Args:
        name: Identifier used as the kernel's function name (stored in
            ``kernelName``).
    """
    # code fragments filled in while the kernel is generated
    self.init = Code()
    self.body = Code()
    # kernel parameters: columns keyed by name, result attributes, extra variables
    self.inputColumns = {}
    self.outputAttributes = []
    self.variables = []
    self.kernelName = name
    # Kept in step with the Kernel class constructor in this file: annotate()
    # appends to this list, so it has to exist on every instance.
    self.annotations = []
class Kernel(object):
    """Collects the pieces of one generated ``__global__`` kernel function.

    The kernel text is assembled from two code fragments (``init`` followed by
    ``body``) and a parameter list made of the registered input columns, the
    output attributes and additional variables, in that order.
    """

    def __init__(self, name):
        """Create an empty kernel whose generated function is called ``name``."""
        self.init = Code()
        self.body = Code()
        self.inputColumns = {}
        self.outputAttributes = []
        self.variables = []
        self.kernelName = name
        self.annotations = []

    def add(self, code):
        """Append ``code`` to the kernel body."""
        self.body.add(code)

    def addVar(self, c):
        """Register ``c`` as a kernel parameter, keyed by ``c.get()``."""
        # resolve multiply added columns: the same key overwrites the old entry
        self.inputColumns[c.get()] = c

    def getParameters(self):
        """Return the call arguments in the same order as the declared parameters.

        Returns:
            list: ``getGPU()`` of every input column, then
            ``ident.gpuResultColumn(a)`` of every output attribute, then
            ``getGPU()`` of every variable.
        """
        params = [c.getGPU() for c in self.inputColumns.values()]
        params.extend(ident.gpuResultColumn(a) for a in self.outputAttributes)
        params.extend(v.getGPU() for v in self.variables)
        return params

    def _parameterDeclarations(self):
        """Return the comma-separated parameter declarations of the kernel.

        Every parameter is declared as a pointer (``<dataType>* <name>``).
        Joining the declarations avoids the stray leading ``", "`` that a
        separator flag tracked only for the input columns would produce when
        there are no input columns but output attributes or variables exist.
        """
        decls = [c.dataType + "* " + c.get()
                 for c in self.inputColumns.values()]
        decls.extend(a.dataType + "* " + ident.resultColumn(a)
                     for a in self.outputAttributes)
        decls.extend(v.dataType + "* " + v.get() for v in self.variables)
        return ", ".join(decls)

    def getKernelCode(self):
        """Assemble and return the complete source text of the kernel.

        Returns:
            The ``content`` of the assembled ``Code`` object: signature,
            ``init`` fragment, ``body`` fragment and the closing brace.
        """
        kernel = Code()
        # open kernel frame
        kernel.add("__global__ void " + self.kernelName + "(")
        kernel.add(self._parameterDeclarations() + ") {")
        # add initialisation code collected in self.init
        kernel.add(self.init.content)
        # add code generated by operator tree
        kernel.add(self.body.content)
        # close kernel frame
        kernel.add("}")
        return kernel.content

    def annotate(self, msg):
        """Attach the annotation ``msg`` to this kernel."""
        self.annotations.append(msg)
def __init__(self, decimalRepresentation):
    """Initialise an empty code generator.

    Args:
        decimalRepresentation: Stored as ``decimalType``; the type used for
            decimal values in the generated code.
    """
    # host-side code fragments
    self.read = Code()
    self.types = Code()
    self.declare = Code()
    self.finish = Code()

    # generated kernels and their invocations
    self.kernels = []
    self.kernelCalls = []
    self.currentKernel = None
    self.mirrorKernel = None

    # device memory management fragments
    self.gpumem = GpuIO()

    # running number for generated constants
    self.constCounter = 0
    self.decimalType = decimalRepresentation
def generic(kernelName, parameters, gridSize=1024, blockSize=128, templateParams=""):
    """Build the code fragment that launches ``kernelName``.

    Args:
        kernelName: Name of the kernel function to launch.
        parameters: Call arguments; each one is converted with ``str``.
        gridSize: Value emitted for the ``gridsize`` variable.
        blockSize: Value emitted for the ``blocksize`` variable.
        templateParams: Optional template argument text; when non-empty it is
            appended to the kernel name as ``<...>``.

    Returns:
        The ``Code`` object holding the launch statements.
    """
    code = Code()
    if templateParams != "":
        launchName = kernelName + "<" + templateParams + ">"
    else:
        launchName = kernelName

    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # the launch is kept inside the scope with the gridsize/blocksize
    # declarations it refers to - confirm against the original layout.
    with Scope(code):
        # kernel invocation parameters
        emit("int gridsize=" + str(gridSize), code)
        emit("int blocksize=" + str(blockSize), code)
        # input attributes, output attributes and additional variables
        arguments = ", ".join(str(p) for p in parameters)
        emit(launchName + "<<<gridsize, blocksize>>>(" + arguments + ")", code)
    return code
def translate(self, ctxt):
    """Generate code for this CASE expression and return its result variable.

    A variable named ``casevar<exprId>`` is declared, then one if / else-if
    clause is emitted per (when, then) pair in ``exprListWhenThen`` and,
    if ``exprElse`` is set, a final else clause. Each clause assigns the
    translated branch expression to the variable.

    Args:
        ctxt: Translation context; its ``codegen`` receives the emitted code.

    Returns:
        ``var.get()`` of the declared result variable.
    """
    var = Variable.val(ctxt.codegen.langType(self.type),
                       "casevar" + str(self.exprId))
    var.declare(ctxt.codegen)  # declare variable

    # the first pair opens the chain with a plain if
    firstWhen, firstThen = self.exprListWhenThen[0]
    with lang.IfClause(firstWhen.translate(ctxt), ctxt.codegen):
        lang.emit(lang.assign(var, firstThen.translate(ctxt)), ctxt.codegen)

    # all remaining pairs become else-if clauses
    for w, t in self.exprListWhenThen[1:]:
        with lang.ElseIfClause(w.translate(ctxt), ctxt.codegen):
            lang.emit(lang.assign(var, t.translate(ctxt)), ctxt.codegen)

    # identity test: None marks "no ELSE branch"
    if self.exprElse is not None:
        with lang.ElseClause(ctxt.codegen):
            lang.emit(lang.assign(var, self.exprElse.translate(ctxt)),
                      ctxt.codegen)
    return var.get()
class CodeGenerator(object):
    """Collects generated code pieces and turns them into a runnable program.

    Kernels are opened with ``openKernel``, filled through ``add`` and closed
    with ``closeKernel``. ``composeCode`` stitches includes, kernels and the
    host ``main`` together; ``compile_`` / ``compileCpu`` write and build the
    file and ``execute`` runs the resulting binary.
    """

    def __init__(self, decimalRepresentation):
        """Initialise an empty generator.

        Args:
            decimalRepresentation: Stored as ``decimalType``; ``langType``
                maps FLOAT and DOUBLE to it.
        """
        self.read = Code()
        self.types = Code()
        self.kernels = []
        self.currentKernel = None
        self.kernelCalls = []
        self.declare = Code()
        self.finish = Code()
        self.mirrorKernel = None
        self.gpumem = GpuIO()
        self.constCounter = 0
        self.decimalType = decimalRepresentation

    def langType(self, relDataType):
        """Map a relational ``Type`` to the type used in generated code.

        Raises:
            KeyError: If ``relDataType`` is not one of the mapped types.
        """
        internalTypeMap = {
            Type.INT: CType.INT,
            Type.DATE: CType.UINT,
            Type.CHAR: CType.CHAR,
            Type.FLOAT: self.decimalType,
            Type.DOUBLE: self.decimalType,
            Type.STRING: CType.STR_TYPE,
        }
        return internalTypeMap[relDataType]

    def stringConstant(self, token):
        """Declare a string constant for ``token`` in the current kernel's init code.

        Returns:
            The variable (named ``c<counter>``) holding the constant.
        """
        self.constCounter += 1
        c = Variable.val(CType.STR_TYPE, "c" + str(self.constCounter))
        # NOTE(review): token is pasted between quotes without escaping
        emit(
            assign(declare(c),
                   call("stringConstant", ["\"" + token + "\"", len(token)])),
            self.init())
        return c

    def openKernel(self, kernel):
        """Register ``kernel``, make it the current kernel and queue its call."""
        self.kernels.append(kernel)
        self.currentKernel = kernel
        self.kernelCalls.append(KernelCall.generated(kernel))
        return kernel

    # used for multiple passes e.g. (multi) hash build
    def openMirrorKernel(self, suffix):
        """Deep-copy the current kernel under the name ``<kernelName><suffix>``.

        The copy is registered and its call is queued; while it is open,
        ``add`` appends code to both the current and the mirror kernel.
        """
        kernel = copy.deepcopy(self.currentKernel)
        kernel.kernelName = self.currentKernel.kernelName + suffix
        self.kernels.append(kernel)
        self.mirrorKernel = kernel
        self.kernelCalls.append(KernelCall.generated(kernel))
        return kernel

    def closeKernel(self):
        """Close the current kernel and, if open, the mirror kernel."""
        self.currentKernel = None
        if self.mirrorKernel:
            self.mirrorKernel = None

    def add(self, string):
        """Append ``string`` to the current kernel and to the mirror kernel, if any."""
        self.currentKernel.add(string)
        if self.mirrorKernel:
            self.mirrorKernel.add(string)

    def init(self):
        """Return the ``init`` code fragment of the current kernel."""
        return self.currentKernel.init

    def warplane(self):
        """Return the current kernel's ``warplane`` variable, declaring it on first use.

        The declaration assigns ``threadIdx.x`` modulo 32 in the init code.
        """
        try:
            return self.currentKernel.warplane
        except AttributeError:
            # first request for this kernel: declare and cache on the kernel
            self.currentKernel.warplane = Variable.val(CType.UINT, "warplane")
            emit(
                assign(declare(self.currentKernel.warplane),
                       modulo(threadIdx_x(), intConst(32))),
                self.init())
            return self.currentKernel.warplane

    def warpid(self):
        """Return the current kernel's ``warpid`` variable, declaring it on first use.

        The declaration assigns ``threadIdx.x`` divided by 32 in the init code.
        """
        try:
            return self.currentKernel.warpid
        except AttributeError:
            # first request for this kernel: declare and cache on the kernel
            self.currentKernel.warpid = Variable.val(CType.UINT, "warpid")
            emit(
                assign(declare(self.currentKernel.warpid),
                       div(threadIdx_x(), intConst(32))),
                self.init())
            return self.currentKernel.warpid

    def newStatisticsCounter(self, varname, text):
        """Create a counter that is passed to the current kernel and printed at the end.

        The counter is declared with value 0, mapped for write-back,
        initialised with ``0u`` on the device, added to the current kernel's
        variables via ``addVar`` and printed (prefixed by ``text``) in the
        finish fragment.
        """
        counter = Variable.val(CType.UINT, varname)
        counter.declareAssign(intConst(0), self.declare)
        self.gpumem.mapForWrite(counter)
        self.gpumem.initVar(counter, "0u")
        self.currentKernel.addVar(counter)
        emit(printf(text + "%i\\n", [counter]), self.finish)
        return counter

    def prefixlanes(self):
        """Return the current kernel's ``prefixlanes`` variable, declaring it on first use.

        The declaration shifts ``bitmask32f()`` right by ``32 - warplane``.
        """
        try:
            return self.currentKernel.prefixlanes
        except AttributeError:
            # first request for this kernel: declare and cache on the kernel
            self.currentKernel.prefixlanes = Variable.val(
                CType.UINT, "prefixlanes")
            emit(
                assign(
                    declare(self.currentKernel.prefixlanes),
                    shiftRight(bitmask32f(),
                               sub(intConst(32), self.warplane()))),
                self.init())
            return self.currentKernel.prefixlanes

    def addDatabaseAccess(self, context, accessor):
        """Add the accessor's database access code to the read fragment and keep the accessor."""
        self.read.add(accessor.getCodeAccessDatabase(context.inputAttributes))
        self.accessor = accessor

    # build complete code file from generated pieces and add time measurements
    def composeCode(self, useCuda=True):
        """Assemble the complete program text.

        Args:
            useCuda: When true, the CUDA includes, the memory footprint
                output and the total kernel timestamp are added as well.

        Returns:
            The content of the assembled ``Code`` object.
        """
        # NOTE(review): block nesting below was reconstructed from a
        # whitespace-collapsed source - confirm against the original layout.
        code = Code()
        code.add(qlib.getIncludes())
        if useCuda:
            code.add(qlib.getCudaIncludes())
        code.addFragment(self.types)
        for k in self.kernels:
            code.add(k.getKernelCode())
        code.add("int main() {")
        code.addUntimedFragment(self.read, "import")
        code.addUntimedFragment(self.declare, "declare")
        if self.gpumem.cudaMalloc.hasCode:
            wakeup = Code()
            comment("wake up gpu", wakeup)
            code.addUntimedCudaFragment(wakeup, "wake up gpu")
        code.addUntimedCudaFragment(self.gpumem.cudaMalloc, "cuda malloc")
        if useCuda:
            printMemoryFootprint(code)
        code.addUntimedCudaFragment(self.gpumem.cudaMallocHT, "cuda mallocHT")
        if useCuda:
            printMemoryFootprint(code)
        code.addUntimedCudaFragment(self.gpumem.cudaMemcpyIn, "cuda memcpy in")
        tsKernels = Timestamp("totalKernelTime", code)
        for call in self.kernelCalls:
            code.addCudaFragment(call.get(), call.kernelName,
                                 call.getAnnotations())
        tsKernels.stop()
        code.addUntimedCudaFragment(self.gpumem.cudaMemcpyOut,
                                    "cuda memcpy out")
        code.addUntimedCudaFragment(self.gpumem.cudaFree, "cuda free")
        code.addTimedFragment(self.finish, "finish")
        if useCuda:
            code.timestamps.append(tsKernels)
        emit(printf("<timing>\\n"), code)
        for ts in code.timestamps:
            ts.printTime()
        emit(printf("</timing>\\n"), code)
        code.add("}")
        return code.content

    def writeCodeFile(self, code, filename):
        """Write ``code`` to ``filename`` and reformat the file with astyle."""
        with open(filename, 'w') as f:
            f.write(code)
        # format sourcecode
        # NOTE(review): filename is interpolated into a shell command line
        cmd = "astyle --indent-col1-comments " + filename
        subprocess.run(cmd, stdout=subprocess.DEVNULL, shell=True)

    def compile_(self, filename, arch="sm_52", debug=False):
        """Write ``<filename>.cu`` and compile it with nvcc into ``filename``.

        Args:
            filename: Path of the binary; the source goes to ``filename + ".cu"``.
            arch: Value passed to nvcc's ``-arch`` option.
            debug: Adds debug flags and keeps the compiler output visible.

        A failing compiler run does not raise; it is reported on stdout.
        """
        print("compilation...")
        sys.stdout.flush()
        self.filename = filename
        cuFilename = filename + ".cu"
        self.writeCodeFile(self.composeCode(), cuFilename)
        # compile
        nvccFlags = "-std=c++11 -arch=" + arch + " "
        hostFlags = "-pthread "
        if debug:
            nvccFlags += "-g -G "
            hostFlags += "-rdynamic "
        cmd = "nvcc " + cuFilename + " -o " + filename + " " + nvccFlags + " -Xcompiler=\"" + hostFlags + "\" "
        print(cmd)
        start = time.time()
        if debug:
            result = subprocess.run(cmd, shell=True)
        else:
            result = subprocess.run(cmd,
                                    stdout=subprocess.DEVNULL,
                                    stderr=subprocess.DEVNULL,
                                    shell=True)
        end = time.time()
        # The compiler output is discarded outside debug mode, so a failure
        # would otherwise go unnoticed and a stale binary might get executed.
        if result.returncode != 0:
            print("compilation failed: nvcc exited with code %d" %
                  result.returncode)
        print("compilation time: %.1f ms" % ((end - start) * 1000))

    def compileCpu(self, filename, debug=False):
        """Write ``<filename>.cpp`` (composed without CUDA) and compile it with g++.

        Raises:
            subprocess.CalledProcessError: If g++ exits with a non-zero status.
        """
        self.filename = filename
        cppFilename = filename + ".cpp"
        self.writeCodeFile(self.composeCode(False), cppFilename)
        # compile
        flags = "-std=c++11 -pthread "
        if debug:
            flags += " -g"
        cmd = "g++ " + cppFilename + " -o " + filename + " " + flags
        print(cmd)
        subprocess.check_output(cmd, shell=True)

    def execute(self):
        """Run the compiled binary, print its output and store it in ``<filename>.log``.

        Returns:
            The decoded standard output of the program.

        Raises:
            subprocess.CalledProcessError: If the program exits with a
                non-zero status.
        """
        print("\nexecution...")
        sys.stdout.flush()
        cmd = "./" + self.filename
        output = subprocess.check_output(cmd, shell=True).decode('utf-8')
        print(output)
        with open(self.filename + ".log", "w") as log_file:
            print(output, file=log_file)
        sys.stdout.flush()
        return (output)
def composeCode(self, useCuda=True):
    """Stitch all generated pieces into one program and return its text.

    Args:
        useCuda: When true, the CUDA includes, the memory footprint output
            and the total kernel timestamp are added as well.

    Returns:
        The content of the assembled ``Code`` object.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source - confirm against the original layout.
    code = Code()

    # includes, type definitions and kernel functions
    code.add(qlib.getIncludes())
    if useCuda:
        code.add(qlib.getCudaIncludes())
    code.addFragment(self.types)
    for kernel in self.kernels:
        code.add(kernel.getKernelCode())

    # host program
    code.add("int main() {")
    for fragment, label in ((self.read, "import"),
                            (self.declare, "declare")):
        code.addUntimedFragment(fragment, label)

    gpumem = self.gpumem
    if gpumem.cudaMalloc.hasCode:
        wakeup = Code()
        comment("wake up gpu", wakeup)
        code.addUntimedCudaFragment(wakeup, "wake up gpu")
    for fragment, label in ((gpumem.cudaMalloc, "cuda malloc"),
                            (gpumem.cudaMallocHT, "cuda mallocHT")):
        code.addUntimedCudaFragment(fragment, label)
        if useCuda:
            printMemoryFootprint(code)
    code.addUntimedCudaFragment(gpumem.cudaMemcpyIn, "cuda memcpy in")

    # kernel invocations, measured as a whole by tsKernels
    tsKernels = Timestamp("totalKernelTime", code)
    for kernelCall in self.kernelCalls:
        code.addCudaFragment(kernelCall.get(), kernelCall.kernelName,
                             kernelCall.getAnnotations())
    tsKernels.stop()

    for fragment, label in ((gpumem.cudaMemcpyOut, "cuda memcpy out"),
                            (gpumem.cudaFree, "cuda free")):
        code.addUntimedCudaFragment(fragment, label)
    code.addTimedFragment(self.finish, "finish")
    if useCuda:
        code.timestamps.append(tsKernels)

    # timing report
    emit(printf("<timing>\\n"), code)
    for stamp in code.timestamps:
        stamp.printTime()
    emit(printf("</timing>\\n"), code)

    code.add("}")
    return code.content
def __init__(self):
    """Create one empty ``Code`` fragment per device memory phase."""
    # created in this order: allocation, hash table allocation,
    # copy to device, copy back, release
    for fragmentName in ("cudaMalloc", "cudaMallocHT", "cudaMemcpyIn",
                         "cudaMemcpyOut", "cudaFree"):
        setattr(self, fragmentName, Code())
class GpuIO(object):
    """Collects the generated code for device memory handling.

    The code is kept in five fragments: ``cudaMalloc``, ``cudaMallocHT``,
    ``cudaMemcpyIn``, ``cudaMemcpyOut`` and ``cudaFree``.
    """

    def __init__(self):
        """Create the five empty code fragments."""
        self.cudaMalloc = Code()
        self.cudaMallocHT = Code()
        self.cudaMemcpyIn = Code()
        self.cudaMemcpyOut = Code()
        self.cudaFree = Code()

    def local(self, var, init=None):
        """Allocate ``var`` in the ``cudaMallocHT`` fragment and optionally initialise it.

        Args:
            var: Variable to declare and allocate.
            init: Initial value; when ``None`` no initialisation is generated.
        """
        self.declareAllocateHT(var)
        if init is not None:
            self.initVar(var, init)

    def mapForRead(self, var, blocked=False):
        """Allocate ``var`` and queue a copy-in of ``var.numElements`` elements.

        ``blocked`` is accepted but not used.
        """
        self.declareAllocate(var)
        self.cudaMemcpyIn.add(getCudaMemcpyIn(var, var.numElements))

    def mapForReadLit(self, deviceVar, hostVar, blocked=False):
        """Allocate ``deviceVar`` and queue a copy-in from ``hostVar``.

        The copy covers ``hostVar.numElements`` elements. ``blocked`` is
        accepted but not used.
        """
        self.declareAllocateLit(deviceVar)
        self.cudaMemcpyIn.add(
            getCudaMemcpyInLit(deviceVar, hostVar, hostVar.numElements))

    def mapForWrite(self, var, sizevar=None):
        """Allocate ``var`` and queue a copy-out.

        Args:
            var: Variable to declare, allocate and copy back.
            sizevar: Number of elements to copy; ``var.numElements`` is used
                when it is ``None``.
        """
        self.declareAllocate(var)
        # identity test: only a missing sizevar falls back to the full size
        count = var.numElements if sizevar is None else sizevar
        self.cudaMemcpyOut.add(getCudaMemcpyOut(var, count))

    def copyOut(self, var):
        """Call ``declareAllocateDevice`` for ``var`` and queue a copy-in.

        NOTE(review): this class defines no ``declareAllocateDevice``, so the
        call raises AttributeError unless the method is supplied elsewhere;
        also, despite the name, the copy goes into ``cudaMemcpyIn``. Left
        unchanged because the intended behaviour is not visible here.
        """
        self.declareAllocateDevice(var)
        self.cudaMemcpyIn.add(getCudaMemcpyIn(var, var.numElements))

    def initVar(self, var, init):
        """Queue an ``initArray`` library kernel call that fills ``var`` with ``init``.

        The call is added to the ``cudaMallocHT`` fragment.
        """
        call = KernelCall.library(
            "initArray", [var.getGPU(), str(init), var.numElements],
            var.dataType)
        self.cudaMallocHT.add(call.get())

    def declare(self, var, blocked=False):
        """Add only the device declaration of ``var``; ``blocked`` is not used."""
        self.cudaMalloc.add(var.declareGPU())

    def declareAllocate(self, var, blocked=False):
        """Declare and allocate ``var`` in ``cudaMalloc`` and queue its release.

        ``blocked`` is accepted but not used.
        """
        self.cudaMalloc.add(var.declareGPU())
        self.cudaMalloc.add(getCudaMalloc(var, var.numElements))
        self.cudaFree.add(getCudaFree(var))

    def declareAllocateHT(self, var):
        """Declare and allocate ``var`` in ``cudaMallocHT`` and queue its release."""
        self.cudaMallocHT.add(var.declareGPU())
        self.cudaMallocHT.add(getCudaMalloc(var, var.numElements))
        self.cudaFree.add(getCudaFree(var))

    def declareAllocateLit(self, var, blocked=False):
        """Declare ``var`` as a pointer, allocate it and queue its release.

        ``blocked`` is accepted but not used.
        """
        var.declarePointer(self.cudaMalloc)
        self.cudaMalloc.add(getCudaMallocLit(var, var.numElements))
        self.cudaFree.add(getCudaFreeLit(var))
def getIncludes():
    """Return a ``Code`` object holding the include lines of every generated program."""
    includeLines = (
        "#include <list>",
        "#include <unordered_map>",
        "#include <vector>",
        "#include <iostream>",
        "#include <ctime>",
        "#include <limits.h>",
        "#include <float.h>",
        "#include \"../dogqc/include/csv.h\"",
        "#include \"../dogqc/include/util.h\"",
        "#include \"../dogqc/include/mappedmalloc.h\"",
    )
    code = Code()
    for line in includeLines:
        code.add(line)
    return code
def getCudaIncludes():
    """Return a ``Code`` object holding the CUDA-specific include lines."""
    includeLines = (
        "#include \"../dogqc/include/util.cuh\"",
        "#include \"../dogqc/include/hashing.cuh\"",
    )
    code = Code()
    for line in includeLines:
        code.add(line)
    return code
def getKernelCode(self):
    """Assemble and return the complete source text of the kernel.

    The parameter list consists of the input columns, the output attributes
    and the variables, in that order; each is declared as a pointer
    (``<dataType>* <name>``).

    Returns:
        The ``content`` of the assembled ``Code`` object: signature,
        ``init`` fragment, ``body`` fragment and the closing brace.
    """
    kernel = Code()
    # open kernel frame
    kernel.add("__global__ void " + self.kernelName + "(")

    # Collect all declarations first and join them afterwards: a separator
    # flag tracked only for the input columns would leave a stray leading
    # ", " when there are no input columns but output attributes or
    # variables exist.
    decls = [c.dataType + "* " + c.get() for c in self.inputColumns.values()]
    decls.extend(a.dataType + "* " + ident.resultColumn(a)
                 for a in self.outputAttributes)
    decls.extend(v.dataType + "* " + v.get() for v in self.variables)
    kernel.add(", ".join(decls) + ") {")

    # add initialisation code collected in self.init
    kernel.add(self.init.content)
    # add code generated by operator tree
    kernel.add(self.body.content)
    # close kernel frame
    kernel.add("}")
    return kernel.content