Beispiel #1
0
 def __init__(self, name):
     """Create an empty kernel description called ``name``.

     ``name`` is stored unchanged in ``kernelName``; the two code
     containers and all collections start out empty.
     """
     # two separate Code objects: one for init code, one for body code
     self.init, self.body = Code(), Code()
     # input columns are kept in a dict, outputs and variables in lists
     self.inputColumns = {}
     self.outputAttributes, self.variables = [], []
     self.kernelName = name
Beispiel #2
0
class Kernel(object):
    """Collects the pieces of one generated ``__global__`` function.

    A kernel consists of init code, body code, and three groups of
    parameters (input columns, output attributes, additional variables).
    ``getKernelCode`` renders the function text, ``getParameters`` the
    matching argument list for a call.
    """

    def __init__(self, name):
        """Create an empty kernel whose generated function is called ``name``."""
        self.init = Code()
        self.body = Code()
        # maps c.get() -> c, see addVar
        self.inputColumns = {}
        self.outputAttributes = []
        self.variables = []
        self.kernelName = name
        self.annotations = []

    def add(self, code):
        """Append ``code`` to the kernel body."""
        self.body.add(code)

    def addVar(self, c):
        """Register ``c`` as an input parameter of the kernel.

        Entries are keyed by ``c.get()``, so registering the same
        identifier more than once keeps a single entry (the latest one).
        """
        self.inputColumns[c.get()] = c

    def getParameters(self):
        """Return the call arguments in the order used by ``getKernelCode``.

        Order: input columns, output attributes, additional variables.
        """
        params = [c.getGPU() for c in self.inputColumns.values()]
        params.extend(ident.gpuResultColumn(a) for a in self.outputAttributes)
        params.extend(v.getGPU() for v in self.variables)
        return params

    def getKernelCode(self):
        """Render the complete kernel function and return its text.

        Every parameter is declared as ``<dataType>* <identifier>``. The
        declarations are joined with ", " so the list never starts with a
        stray comma, even when there are no input columns.
        """
        kernel = Code()

        # open kernel frame
        kernel.add("__global__ void " + self.kernelName + "(")
        declarations = [
            c.dataType + "* " + c.get() for c in self.inputColumns.values()
        ]
        declarations += [
            a.dataType + "* " + ident.resultColumn(a)
            for a in self.outputAttributes
        ]
        declarations += [v.dataType + "* " + v.get() for v in self.variables]
        kernel.add(", ".join(declarations) + ") {")

        # add initialization code (collected in self.init)
        kernel.add(self.init.content)

        # add code generated by operator tree
        kernel.add(self.body.content)

        # close kernel frame
        kernel.add("}")
        return kernel.content

    def annotate(self, msg):
        """Attach the annotation ``msg`` to this kernel."""
        self.annotations.append(msg)
Beispiel #3
0
    def __init__(self, decimalRepresentation):
        """Create a generator with empty code sections and no open kernel.

        ``decimalRepresentation`` is stored unchanged in ``decimalType``.
        """
        # code containers filled during generation
        self.read, self.types = Code(), Code()

        # kernel bookkeeping: nothing generated, nothing open
        self.kernels = []
        self.currentKernel = None
        self.kernelCalls = []

        self.declare, self.finish = Code(), Code()
        self.mirrorKernel = None

        self.gpumem = GpuIO()
        # running number used to build constant names
        self.constCounter = 0

        self.decimalType = decimalRepresentation
Beispiel #4
0
    def generic(kernelName,
                parameters,
                gridSize=1024,
                blockSize=128,
                templateParams=""):
        """Build the code for one kernel launch and return it as a Code object.

        Inside a new Scope the code declares ``gridsize`` and ``blocksize``
        and then emits ``kernelName<<<gridsize, blocksize>>>(...)``. The
        arguments are the ``str()`` of each entry in ``parameters``,
        separated by ", ". A non-empty ``templateParams`` is appended to
        the kernel name as ``<templateParams>``.
        """
        code = Code()

        # kernel name, optionally with explicit template arguments
        launchTarget = kernelName
        if templateParams != "":
            launchTarget = kernelName + "<" + templateParams + ">"

        with Scope(code):
            emit("int gridsize=" + str(gridSize), code)
            emit("int blocksize=" + str(blockSize), code)
            # arguments: input attributes, output attributes and additional variables (output number)
            argumentList = ", ".join(str(p) for p in parameters)
            emit(launchTarget + "<<<gridsize, blocksize>>>(" + argumentList + ")",
                 code)
        return code
Beispiel #5
0
    def translate ( self, ctxt ):
        """Generate code for this case expression and return its result identifier.

        A variable named ``casevar<exprId>`` is declared with the language
        type of ``self.type``. The first WHEN/THEN pair is emitted inside
        ``lang.IfClause``, every further pair inside ``lang.ElseIfClause``,
        and ``exprElse`` (when present) inside ``lang.ElseClause``; each
        branch assigns its translated value to the variable.

        Returns ``var.get()`` for the declared variable.
        Raises IndexError when ``exprListWhenThen`` is empty.
        """
        # declare the variable that receives the value of the taken branch
        var = Variable.val ( ctxt.codegen.langType ( self.type ), "casevar" + str(self.exprId) )
        var.declare ( ctxt.codegen )

        # the first pair opens the chain, the remaining pairs extend it
        w0,t0 = self.exprListWhenThen[0]
        with lang.IfClause ( w0.translate ( ctxt ), ctxt.codegen ):
            lang.emit ( lang.assign ( var, t0.translate ( ctxt ) ), ctxt.codegen )
        for w,t in self.exprListWhenThen[1:]:
            with lang.ElseIfClause ( w.translate ( ctxt ), ctxt.codegen ):
                lang.emit ( lang.assign ( var, t.translate ( ctxt ) ), ctxt.codegen )
        # identity check: a missing ELSE branch is represented by None
        if self.exprElse is not None:
            with lang.ElseClause ( ctxt.codegen ):
                lang.emit ( lang.assign ( var, self.exprElse.translate ( ctxt ) ), ctxt.codegen )
        return var.get()
Beispiel #6
0
class CodeGenerator(object):
    """Collects generated code pieces and turns them into a runnable program.

    Kernels are opened with ``openKernel`` / ``openMirrorKernel`` and
    filled through ``add``. ``composeCode`` assembles the complete source
    text, ``compile_`` / ``compileCpu`` write and compile it, and
    ``execute`` runs the resulting binary.
    """

    def __init__(self, decimalRepresentation):
        """Create a generator with empty code sections and no open kernel.

        ``decimalRepresentation`` is the type that ``langType`` returns
        for ``Type.FLOAT`` and ``Type.DOUBLE``.
        """
        self.read = Code()
        self.types = Code()
        self.kernels = []
        self.currentKernel = None
        self.kernelCalls = []

        self.declare = Code()
        self.finish = Code()
        self.mirrorKernel = None

        self.gpumem = GpuIO()
        self.constCounter = 0

        self.decimalType = decimalRepresentation

    def langType(self, relDataType):
        """Map a relational ``Type`` to the type used in generated code.

        Raises KeyError for a type that has no mapping.
        """
        internalTypeMap = {}
        internalTypeMap[Type.INT] = CType.INT
        internalTypeMap[Type.DATE] = CType.UINT
        internalTypeMap[Type.CHAR] = CType.CHAR
        internalTypeMap[Type.FLOAT] = self.decimalType
        internalTypeMap[Type.DOUBLE] = self.decimalType
        internalTypeMap[Type.STRING] = CType.STR_TYPE
        return internalTypeMap[relDataType]

    def stringConstant(self, token):
        """Declare a string constant in the current kernel's init code.

        The constant is named ``c<N>`` with a running number and is
        assigned ``stringConstant("<token>", len(token))``. ``token`` is
        inserted between the quotes as is (no escaping is applied).

        Returns the declared variable.
        """
        self.constCounter += 1
        c = Variable.val(CType.STR_TYPE, "c" + str(self.constCounter))
        emit(
            assign(declare(c),
                   call("stringConstant",
                        ["\"" + token + "\"", len(token)])), self.init())
        return c

    def openKernel(self, kernel):
        """Register ``kernel``, make it the current kernel and queue its call."""
        self.kernels.append(kernel)
        self.currentKernel = kernel
        self.kernelCalls.append(KernelCall.generated(kernel))
        return kernel

    # used for multiple passes e.g. (multi) hash build
    def openMirrorKernel(self, suffix):
        """Open a deep copy of the current kernel named ``<kernelName><suffix>``.

        The copy is registered and its call queued like a regular kernel.
        While it is open, ``add`` appends to both the current kernel and
        the copy.
        """
        kernel = copy.deepcopy(self.currentKernel)
        kernel.kernelName = self.currentKernel.kernelName + suffix
        self.kernels.append(kernel)
        self.mirrorKernel = kernel
        self.kernelCalls.append(KernelCall.generated(kernel))
        return kernel

    def closeKernel(self):
        """Detach the current kernel and the mirror kernel, if any."""
        self.currentKernel = None
        self.mirrorKernel = None

    def add(self, string):
        """Append ``string`` to the current kernel and to an open mirror kernel."""
        self.currentKernel.add(string)
        if self.mirrorKernel:
            self.mirrorKernel.add(string)

    def init(self):
        """Return the init code container of the current kernel."""
        return self.currentKernel.init

    def warplane(self):
        """Return the current kernel's ``warplane`` variable.

        On first use for a kernel the variable is declared in the
        kernel's init code as ``threadIdx_x() modulo 32`` and cached as
        an attribute of the kernel object.
        """
        try:
            return self.currentKernel.warplane
        except AttributeError:
            self.currentKernel.warplane = Variable.val(CType.UINT, "warplane")
            emit(
                assign(declare(self.currentKernel.warplane),
                       modulo(threadIdx_x(), intConst(32))), self.init())
            return self.currentKernel.warplane

    def warpid(self):
        """Return the current kernel's ``warpid`` variable.

        On first use for a kernel the variable is declared in the
        kernel's init code as ``threadIdx_x() div 32`` and cached as an
        attribute of the kernel object.
        """
        try:
            return self.currentKernel.warpid
        except AttributeError:
            self.currentKernel.warpid = Variable.val(CType.UINT, "warpid")
            emit(
                assign(declare(self.currentKernel.warpid),
                       div(threadIdx_x(), intConst(32))), self.init())
            return self.currentKernel.warpid

    def newStatisticsCounter(self, varname, text):
        """Create a counter variable and print it in the finish section.

        The counter is declared with value 0 in the declare section,
        mapped for write in ``gpumem``, initialised with ``"0u"``, added
        to the current kernel's variables via ``addVar``, and printed as
        ``<text>%i`` followed by a newline.

        Returns the counter variable.
        """
        counter = Variable.val(CType.UINT, varname)
        counter.declareAssign(intConst(0), self.declare)
        self.gpumem.mapForWrite(counter)
        self.gpumem.initVar(counter, "0u")
        self.currentKernel.addVar(counter)
        emit(printf(text + "%i\\n", [counter]), self.finish)
        return counter

    def prefixlanes(self):
        """Return the current kernel's ``prefixlanes`` variable.

        On first use for a kernel the variable is declared in the
        kernel's init code as ``bitmask32f()`` shifted right by
        ``32 - warplane`` and cached as an attribute of the kernel object.
        """
        try:
            return self.currentKernel.prefixlanes
        except AttributeError:
            self.currentKernel.prefixlanes = Variable.val(
                CType.UINT, "prefixlanes")
            emit(
                assign(
                    declare(self.currentKernel.prefixlanes),
                    shiftRight(bitmask32f(), sub(intConst(32),
                                                 self.warplane()))),
                self.init())
            return self.currentKernel.prefixlanes

    def addDatabaseAccess(self, context, accessor):
        """Add the accessor's database access code to the read section.

        ``accessor`` is also stored as ``self.accessor``.
        """
        self.read.add(accessor.getCodeAccessDatabase(context.inputAttributes))
        self.accessor = accessor

    # build complete code file from generated pieces and add time measurements
    def composeCode(self, useCuda=True):
        """Assemble all generated pieces into one source text and return it.

        Layout: includes, type definitions, kernel definitions, then a
        ``main`` that runs import, declarations, allocation, copy-in,
        kernel calls, copy-out, free and the finish section, followed by
        a ``<timing>`` block printing the collected timestamps.

        With ``useCuda=False`` the CUDA includes, the memory footprint
        output and the total kernel timestamp are left out.
        """
        code = Code()
        code.add(qlib.getIncludes())
        if useCuda:
            code.add(qlib.getCudaIncludes())
        code.addFragment(self.types)
        for k in self.kernels:
            code.add(k.getKernelCode())
        code.add("int main() {")
        code.addUntimedFragment(self.read, "import")
        code.addUntimedFragment(self.declare, "declare")
        # the wake-up fragment is only added when there is allocation code
        if self.gpumem.cudaMalloc.hasCode:
            wakeup = Code()
            comment("wake up gpu", wakeup)
            code.addUntimedCudaFragment(wakeup, "wake up gpu")
        code.addUntimedCudaFragment(self.gpumem.cudaMalloc, "cuda malloc")
        if useCuda:
            printMemoryFootprint(code)
        code.addUntimedCudaFragment(self.gpumem.cudaMallocHT, "cuda mallocHT")
        if useCuda:
            printMemoryFootprint(code)
        code.addUntimedCudaFragment(self.gpumem.cudaMemcpyIn, "cuda memcpy in")
        # one timestamp spanning all kernel calls
        tsKernels = Timestamp("totalKernelTime", code)
        for call in self.kernelCalls:
            code.addCudaFragment(call.get(), call.kernelName,
                                 call.getAnnotations())
        tsKernels.stop()
        code.addUntimedCudaFragment(self.gpumem.cudaMemcpyOut,
                                    "cuda memcpy out")
        code.addUntimedCudaFragment(self.gpumem.cudaFree, "cuda free")
        code.addTimedFragment(self.finish, "finish")
        if useCuda:
            code.timestamps.append(tsKernels)

        emit(printf("<timing>\\n"), code)
        for ts in code.timestamps:
            ts.printTime()
        emit(printf("</timing>\\n"), code)

        code.add("}")
        return code.content

    def writeCodeFile(self, code, filename):
        """Write ``code`` to ``filename`` and format the file with astyle.

        The exit status of astyle is not checked; formatting is best effort.
        """
        import shlex  # local import: only needed to quote the path

        with open(filename, 'w') as f:
            f.write(code)

        # format sourcecode; the path is quoted so that spaces or shell
        # metacharacters in it do not break the command
        cmd = "astyle --indent-col1-comments " + shlex.quote(filename)
        subprocess.run(cmd, stdout=subprocess.DEVNULL, shell=True)

    def compile_(self, filename, arch="sm_52", debug=False):
        """Write ``<filename>.cu`` and compile it with nvcc to ``filename``.

        ``debug=True`` adds debug flags and shows the compiler output;
        otherwise the compiler output is discarded. A non-zero exit
        status of nvcc is reported on stdout; no exception is raised.
        """
        import shlex  # local import: only needed to quote the paths

        print("compilation...")
        sys.stdout.flush()
        self.filename = filename
        cuFilename = filename + ".cu"

        self.writeCodeFile(self.composeCode(), cuFilename)

        # compile
        nvccFlags = "-std=c++11 -arch=" + arch + " "
        hostFlags = "-pthread "
        if debug:
            nvccFlags += "-g -G "
            hostFlags += "-rdynamic "
        cmd = ("nvcc " + shlex.quote(cuFilename) + " -o " +
               shlex.quote(filename) + " " + nvccFlags + " -Xcompiler=\"" +
               hostFlags + "\" ")
        print(cmd)
        start = time.time()
        if debug:
            result = subprocess.run(cmd, shell=True)
        else:
            result = subprocess.run(cmd,
                                    stdout=subprocess.DEVNULL,
                                    stderr=subprocess.DEVNULL,
                                    shell=True)
        end = time.time()
        print("compilation time: %.1f ms" % ((end - start) * 1000))
        # without this message a failed build is invisible when the
        # compiler output is discarded
        if result.returncode != 0:
            print("compilation failed: nvcc exited with status %d" %
                  result.returncode)

    def compileCpu(self, filename, debug=False):
        """Write ``<filename>.cpp`` (composed without CUDA) and compile it with g++.

        Raises subprocess.CalledProcessError when g++ exits with a
        non-zero status.
        """
        import shlex  # local import: only needed to quote the paths

        self.filename = filename
        cppFilename = filename + ".cpp"

        self.writeCodeFile(self.composeCode(False), cppFilename)

        # compile
        flags = "-std=c++11  -pthread "
        if debug:
            flags += " -g"
        cmd = ("g++ " + shlex.quote(cppFilename) + " -o " +
               shlex.quote(filename) + " " + flags)
        print(cmd)
        subprocess.check_output(cmd, shell=True)

    def execute(self):
        """Run the compiled program and return its decoded stdout.

        The output is printed and also written to ``<filename>.log``.
        Raises subprocess.CalledProcessError when the program exits with
        a non-zero status.
        """
        import shlex  # local import: only needed to quote the path

        print("\nexecution...")
        sys.stdout.flush()
        cmd = shlex.quote("./" + self.filename)
        output = subprocess.check_output(cmd, shell=True).decode('utf-8')
        print(output)
        with open(self.filename + ".log", "w") as log_file:
            print(output, file=log_file)
        sys.stdout.flush()
        return output
Beispiel #7
0
    def composeCode(self, useCuda=True):
        """Assemble all generated pieces into one source text and return it.

        Layout: includes, type definitions, kernel definitions, then a
        ``main`` that runs import, declarations, allocation, copy-in,
        kernel calls, copy-out, free and the finish section, followed by
        a ``<timing>`` block printing the collected timestamps.

        With ``useCuda=False`` the CUDA includes, the memory footprint
        output and the total kernel timestamp are left out.
        """
        program = Code()

        # file header: includes, types, kernel definitions
        program.add(qlib.getIncludes())
        if useCuda:
            program.add(qlib.getCudaIncludes())
        program.addFragment(self.types)
        for kernel in self.kernels:
            program.add(kernel.getKernelCode())

        # main(): import and declarations
        program.add("int main() {")
        program.addUntimedFragment(self.read, "import")
        program.addUntimedFragment(self.declare, "declare")

        # allocation and copy-in; the wake-up fragment is only added when
        # there is allocation code
        mem = self.gpumem
        if mem.cudaMalloc.hasCode:
            wakeupCode = Code()
            comment("wake up gpu", wakeupCode)
            program.addUntimedCudaFragment(wakeupCode, "wake up gpu")
        program.addUntimedCudaFragment(mem.cudaMalloc, "cuda malloc")
        if useCuda:
            printMemoryFootprint(program)
        program.addUntimedCudaFragment(mem.cudaMallocHT, "cuda mallocHT")
        if useCuda:
            printMemoryFootprint(program)
        program.addUntimedCudaFragment(mem.cudaMemcpyIn, "cuda memcpy in")

        # kernel calls, wrapped in one timestamp spanning all of them
        kernelTimer = Timestamp("totalKernelTime", program)
        for kernelCall in self.kernelCalls:
            program.addCudaFragment(kernelCall.get(), kernelCall.kernelName,
                                    kernelCall.getAnnotations())
        kernelTimer.stop()

        # copy-out, cleanup and finish section
        program.addUntimedCudaFragment(mem.cudaMemcpyOut, "cuda memcpy out")
        program.addUntimedCudaFragment(mem.cudaFree, "cuda free")
        program.addTimedFragment(self.finish, "finish")
        if useCuda:
            program.timestamps.append(kernelTimer)

        # timing report
        emit(printf("<timing>\\n"), program)
        for stamp in program.timestamps:
            stamp.printTime()
        emit(printf("</timing>\\n"), program)

        program.add("}")
        return program.content
Beispiel #8
0
 def __init__(self):
     """Create the five empty code sections, each a separate Code object."""
     # allocation code: regular and "HT" section
     self.cudaMalloc, self.cudaMallocHT = Code(), Code()
     # transfer code: copy-in and copy-out
     self.cudaMemcpyIn, self.cudaMemcpyOut = Code(), Code()
     # release code
     self.cudaFree = Code()
Beispiel #9
0
class GpuIO(object):
    """Collects generated code for device memory handling in five sections.

    Sections: ``cudaMalloc``, ``cudaMallocHT``, ``cudaMemcpyIn``,
    ``cudaMemcpyOut`` and ``cudaFree``. The methods below append the
    declaration, allocation, copy and free code for a variable to the
    matching sections.
    """

    def __init__(self):
        """Create the five empty code sections."""
        self.cudaMalloc = Code()
        self.cudaMallocHT = Code()
        self.cudaMemcpyIn = Code()
        self.cudaMemcpyOut = Code()
        self.cudaFree = Code()

    def local(self, var, init=None):
        """Declare and allocate ``var`` in the HT section, optionally initialised.

        When ``init`` is not None, an ``initArray`` call with that value
        is added as well (see ``initVar``).
        """
        self.declareAllocateHT(var)
        if init is not None:
            self.initVar(var, init)

    def mapForRead(self, var, blocked=False):
        """Declare and allocate ``var`` and copy ``var.numElements`` elements in.

        ``blocked`` is accepted but not used here.
        """
        self.declareAllocate(var)
        self.cudaMemcpyIn.add(getCudaMemcpyIn(var, var.numElements))

    def mapForReadLit(self, deviceVar, hostVar, blocked=False):
        """Declare and allocate ``deviceVar`` and copy ``hostVar`` into it.

        The copy covers ``hostVar.numElements`` elements. ``blocked`` is
        accepted but not used here.
        """
        self.declareAllocateLit(deviceVar)
        self.cudaMemcpyIn.add(
            getCudaMemcpyInLit(deviceVar, hostVar, hostVar.numElements))

    def mapForWrite(self, var, sizevar=None):
        """Declare and allocate ``var`` and add code copying it out.

        The copy covers ``sizevar`` elements when given, otherwise
        ``var.numElements``.
        """
        self.declareAllocate(var)
        # identity check: only a missing sizevar selects the default size
        if sizevar is None:
            self.cudaMemcpyOut.add(getCudaMemcpyOut(var, var.numElements))
        else:
            self.cudaMemcpyOut.add(getCudaMemcpyOut(var, sizevar))

    def copyOut(self, var):
        # NOTE(review): declareAllocateDevice is not defined in this class
        # as shown, so this call raises AttributeError unless it is
        # provided elsewhere. The copy is also appended to cudaMemcpyIn
        # despite the method name - confirm the intended behavior.
        self.declareAllocateDevice(var)
        self.cudaMemcpyIn.add(getCudaMemcpyIn(var, var.numElements))

    def initVar(self, var, init):
        """Add an ``initArray`` library call for ``var`` to the HT section.

        The call receives the device variable, ``str(init)`` and
        ``var.numElements``, with ``var.dataType`` as template parameter.
        """
        call = KernelCall.library(
            "initArray",
            [var.getGPU(), str(init), var.numElements], var.dataType)
        self.cudaMallocHT.add(call.get())

    def declare(self, var, blocked=False):
        """Add only the device declaration of ``var`` (no allocation, no free).

        ``blocked`` is accepted but not used here.
        """
        self.cudaMalloc.add(var.declareGPU())

    def declareAllocate(self, var, blocked=False):
        """Add declaration, allocation and free code for ``var``.

        Declaration and allocation go to the ``cudaMalloc`` section.
        ``blocked`` is accepted but not used here.
        """
        self.cudaMalloc.add(var.declareGPU())
        self.cudaMalloc.add(getCudaMalloc(var, var.numElements))
        self.cudaFree.add(getCudaFree(var))

    def declareAllocateHT(self, var):
        """Like ``declareAllocate`` but using the ``cudaMallocHT`` section."""
        self.cudaMallocHT.add(var.declareGPU())
        self.cudaMallocHT.add(getCudaMalloc(var, var.numElements))
        self.cudaFree.add(getCudaFree(var))

    def declareAllocateLit(self, var, blocked=False):
        """Add pointer declaration, allocation and free code for ``var``.

        Uses ``declarePointer`` and the ``...Lit`` helper variants.
        ``blocked`` is accepted but not used here.
        """
        var.declarePointer(self.cudaMalloc)
        self.cudaMalloc.add(getCudaMallocLit(var, var.numElements))
        self.cudaFree.add(getCudaFreeLit(var))
Beispiel #10
0
def getIncludes():
    """Return a Code object holding the general ``#include`` lines.

    The lines are added one by one in the order listed below: standard
    headers first, then the dogqc headers.
    """
    includeLines = (
        "#include <list>",
        "#include <unordered_map>",
        "#include <vector>",
        "#include <iostream>",
        "#include <ctime>",
        "#include <limits.h>",
        "#include <float.h>",
        "#include \"../dogqc/include/csv.h\"",
        "#include \"../dogqc/include/util.h\"",
        "#include \"../dogqc/include/mappedmalloc.h\"",
    )
    code = Code()
    for line in includeLines:
        code.add(line)
    return code
Beispiel #11
0
def getCudaIncludes():
    """Return a Code object holding the ``#include`` lines for the .cuh headers."""
    cudaHeaders = (
        "#include \"../dogqc/include/util.cuh\"",
        "#include \"../dogqc/include/hashing.cuh\"",
    )
    code = Code()
    for line in cudaHeaders:
        code.add(line)
    return code
Beispiel #12
0
    def getKernelCode(self):
        """Render the complete kernel function and return its text.

        Parameters are listed in the order input columns, output
        attributes, additional variables, each declared as
        ``<dataType>* <identifier>``. The declarations are joined with
        ", " so the list never starts with a stray comma, even when there
        are no input columns.
        """
        kernel = Code()

        # open kernel frame
        kernel.add("__global__ void " + self.kernelName + "(")
        declarations = [
            c.dataType + "* " + c.get() for c in self.inputColumns.values()
        ]
        declarations += [
            a.dataType + "* " + ident.resultColumn(a)
            for a in self.outputAttributes
        ]
        declarations += [v.dataType + "* " + v.get() for v in self.variables]
        kernel.add(", ".join(declarations) + ") {")

        # add initialization code (collected in self.init)
        kernel.add(self.init.content)

        # add code generated by operator tree
        kernel.add(self.body.content)

        # close kernel frame
        kernel.add("}")
        return kernel.content