Example #1
def get_kernel_function_info(a, W1=0, W2=1, W3=1):
    """Show kernel information
    
    Including 
        1. max #threads per block, 
        2. active warps per MP, 
        3. thread block per MP, 
        4. usage of shared memory, 
        5. const memory , 
        6. local memory 
        7. registers
        8. hardware occupancy
        9. limitation of the hardware occupancy
    """

    import pycuda.tools as tl
    import pycuda.driver as dri
    dev = dri.Device(0)
    td = tl.DeviceData()
    if not W1:
        W1 = a.max_threads_per_block
    to = tl.OccupancyRecord(td, W1 * W2 * W3, a.shared_size_bytes, a.num_regs)

    print "***************************************"
    print "  Function Info    "
    print "   -> max threads per block: %d / %d / %d" % \
                (a.max_threads_per_block,
                        dev.max_threads_per_block,
                        dev.max_threads_per_multiprocessor)
    print "   -> shared mem : %d / %d" % (a.shared_size_bytes,
                                          td.shared_memory)
    print "   -> const mem : %d" % a.const_size_bytes
    print "   -> local mem : %d" % a.local_size_bytes
    print "   -> register : %d / %d" % (a.num_regs, td.registers)
    print "   -> thread block per MP %d / %d" % \
            (to.tb_per_mp, td.thread_blocks_per_mp)
    print "   -> warps per MP %d / %d" % (to.warps_per_mp, td.warps_per_mp)
    print "   -> occupancy %f" % to.occupancy
    print "   -> limitation %s" % to.limited_by
    print "  Block size : %dx%dx%d" % (W1, W2, W3)
    print "***************************************"
Example #2
    def run(self,
            parameters,
            initValues,
            timing=True,
            info=False,
            constant_sets=False,
            pairings=False):

        #NEEDS TO BE ADDED AGAIN
        ##########################################################################
        #check parameters and initValues for compatibility with the pre-defined parameterNumber and speciesNumber
        #if(len(parameters[0]) != self._parameterNumber):
        #    print "Error: Number of parameters specified (" + str(self._parameterNumber) + ") and given in parameter array (" + str(len(parameters[0])) + ") differ from each other!"
        #    exit()
        #elif(len(initValues[0]) != self._speciesNumber):
        #    print "Error: Number of species specified (" +  str(self._speciesNumber) + ") and given in species array (" + str(len(initValues[0])) + ") differ from each other!"
        #    exit()
        #elif(len(parameters) != len(initValues)):
        #    print "Error: Number of sets of parameters (" + str(len(parameters)) + ") and species (" + str(len(initValues)) + ") do not match!"
        #    exit()
        ##########################################################################

        #returnValue_final = [np.shape(parameters)[0],self._beta, self._resultNumber, self._speciesNumber]
        #returnValue_final = [np.zeros(returnValue_final) for x in self._stepCode]
        returnValue_final = [""] * len(self._cudafiles)

        if constant_sets == True:
            initValues_orig = initValues
            initValues_check = [x[0, :] for x in initValues_orig]
            parameters_orig = parameters

        total_time = 0.0

        for count, cuda in enumerate(self._stepCode):

            if constant_sets == True:
                initValues_ind = pairings[self._cudafiles[count]]
                initValues = np.zeros(
                    (np.shape(parameters)[0] * len(initValues_ind),
                     self._speciesNumber))
                n_param_sets = np.shape(parameters_orig)[0]
                for i, ICs in enumerate(initValues_ind):
                    # find the original initial-value set whose first row matches ICs in every species
                    index_IC = [sum(ICs == x)
                                for x in initValues_check].index(self._speciesNumber)
                    initValues[i * n_param_sets:(i + 1) * n_param_sets, :] = \
                        initValues_orig[index_IC]
                parameters = np.concatenate(
                    (parameters_orig, ) * len(initValues_ind), axis=0)
                #print parameters

            if self._compiledRunMethod is None and self._runtimeCompile:
                #compile to determine blocks and threads
                self._completeCode, self._compiledRunMethod = self._compileAtRuntime(
                    cuda, parameters)

            blocks, threads = self._getOptimalGPUParam(parameters)
            if info == True:
                print "cuda-sim: threads/blocks:", threads, blocks

            # real runtime compile

            #self._seedValue = seed
            #np.random.seed(self._seedValue)

            # make multiples of initValues
            initNew = np.zeros(
                (len(initValues) * self._beta, self._speciesNumber))
            for i in range(len(initValues)):
                for j in range(self._beta):
                    for k in range(self._speciesNumber):
                        initNew[i * self._beta + j][k] = initValues[i][k]
            initValues = initNew

            if info == True:
                print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs
                occ = tools.OccupancyRecord(
                    tools.DeviceData(),
                    threads=threads,
                    shared_mem=self._compiledRunMethod.shared_size_bytes,
                    registers=self._compiledRunMethod.num_regs)
                print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy

            if timing:
                start = time.time()

            # number of device calls
            runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE)))
            for i in range(runs):
                # for the last device call, calculate the number of remaining blocks to run
                if (i == runs - 1):
                    runblocks = int(blocks % self._MAXBLOCKSPERDEVICE)
                    if (runblocks == 0):
                        runblocks = self._MAXBLOCKSPERDEVICE
                else:
                    runblocks = int(self._MAXBLOCKSPERDEVICE)

                if info == True:
                    print "cuda-sim: Run", runblocks, "blocks."

                minIndex = self._MAXBLOCKSPERDEVICE * i * threads
                maxIndex = minIndex + threads * runblocks
                runParameters = parameters[minIndex / self._beta:maxIndex /
                                           self._beta]
                runInitValues = initValues[minIndex:maxIndex]

                # on the first run, store the return value
                if (i == 0):
                    returnValue = self._runSimulation(runParameters,
                                                      runInitValues, runblocks,
                                                      threads)
                else:
                    returnValue = np.append(returnValue,
                                            self._runSimulation(
                                                runParameters, runInitValues,
                                                runblocks, threads),
                                            axis=0)

            if timing:
                print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round(
                    (time.time() - start), 4), "s"
                total_time += time.time() - start

            if info:
                print ""

            returnValue_final[count] = returnValue

        print "cuda-sim: total running time:", round((total_time), 4), "s"

        if len(returnValue_final) == 1:
            return returnValue_final[0]
        else:
            return returnValue_final
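
A minimal call sketch, assuming a simulator instance sim that provides this run() method; the array shapes are illustrative and must match the instance's parameter and species counts:

import numpy as np

n_sets = 64      # number of parameter sets (illustrative)
n_params = 5     # must equal the instance's parameter count
n_species = 3    # must equal the instance's species count

parameters = np.random.uniform(0.1, 1.0, (n_sets, n_params))
initValues = np.random.uniform(0.0, 10.0, (n_sets, n_species))

result = sim.run(parameters, initValues, timing=True, info=True)
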
Example #3
    def run(self):

        # obtain a CUDA context
        driver.init()
        if self._card < 0:
            self._context = tools.make_default_context()
        else:
            self._context = driver.Device(self._card).make_context()

        if self._info:
            print "cuda-sim: running on device ", self._card, self._context.get_device().name(), \
                self._context.get_device().pci_bus_id()

        # hack for SDE code
        self._device = 0

        # compile code
        self._completeCode, self._compiledRunMethod = self._compile(
            self._stepCode)

        blocks, threads = self._get_optimal_gpu_param()
        if self._info:
            print "cuda-sim: threads/blocks:", threads, blocks

        # make multiples of initValues in case beta > 1
        init_new = np.zeros(
            (len(self._initValues) * self._beta, self._speciesNumber))
        for i in range(len(self._initValues)):
            for j in range(self._beta):
                for k in range(self._speciesNumber):
                    init_new[i * self._beta + j][k] = self._initValues[i][k]
        self._initValues = copy.deepcopy(init_new)

        if self._info:
            print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, \
                self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs
            occ = tools.OccupancyRecord(
                tools.DeviceData(),
                threads=threads,
                shared_mem=self._compiledRunMethod.shared_size_bytes,
                registers=self._compiledRunMethod.num_regs)
            print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy

        if self._timing:
            start = time.time()

        # number of device calls
        runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE)))
        for i in range(runs):
            # for the last device call, calculate the number of remaining blocks to run
            if i == runs - 1:
                runblocks = int(blocks % self._MAXBLOCKSPERDEVICE)
                if runblocks == 0:
                    runblocks = self._MAXBLOCKSPERDEVICE
            else:
                runblocks = int(self._MAXBLOCKSPERDEVICE)

            if self._info:
                print "cuda-sim: Run", runblocks, "blocks."

            min_index = self._MAXBLOCKSPERDEVICE * i * threads
            max_index = min_index + threads * runblocks
            run_parameters = self._parameters[min_index /
                                              self._beta:max_index /
                                              self._beta]
            run_init_values = self._initValues[min_index:max_index]

            # on the first run, store the return value
            if i == 0:
                self._returnValue = self._run_simulation(
                    run_parameters, run_init_values, runblocks, threads)
            else:
                self._returnValue = np.append(
                    self._returnValue,
                    self._run_simulation(run_parameters, run_init_values,
                                         runblocks, threads),
                    axis=0)

        self.output_cpu.put([self._card, self._returnValue])
        self.output_cpu.close()

        # if self._timing:
        #    print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round((time.time()-start),4), "s"

        if self._info:
            print ""

        # release the CUDA context
        self._context.pop()
        del self._context

        return self._returnValue
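
The nested loop above that replicates each row of self._initValues beta times is equivalent to a single numpy call; a standalone sketch with illustrative sizes:

import numpy as np

beta = 4
init_values = np.arange(6.0).reshape(2, 3)       # 2 initial-value sets, 3 species

# each row is repeated beta times in order, matching the triple loop above
init_new = np.repeat(init_values, beta, axis=0)  # shape (2 * beta, 3)
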
Example #4
    def run(self, parameters, initValues, timing=True, info=False):

        #check parameters and initValues for compatibility with the pre-defined parameterNumber and speciesNumber
        if len(parameters[0]) != self._parameterNumber:
            print "Error: Number of parameters specified (%s) and given in parameter array (%s) differ from each other!" \
                % (self._parameterNumber, len(parameters[0]))
            exit()
        elif len(initValues[0]) != self._speciesNumber:
            print "Error: Number of species specified (%s) and given in species array (%s) differ from each other!" \
                % (self._speciesNumber, len(initValues[0]))
            exit()
        elif len(parameters) != len(initValues):
            print "Error: Number of sets of parameters (%s) and species (%s) do not match!" \
                % (len(parameters), len(initValues))
            exit()

        if self._compiledRunMethod is None and self._runtimeCompile:
            #compile to determine blocks and threads
            self._completeCode, self._compiledRunMethod = self._compileAtRuntime(
                self._stepCode, parameters)

        blocks, threads = self._getOptimalGPUParam(parameters)
        if info == True:
            print "cuda-sim: threads/blocks:", threads, blocks

        # real runtime compile

        #self._seedValue = seed
        #np.random.seed(self._seedValue)

        # make multiples of initValues
        initNew = np.zeros((len(initValues) * self._beta, self._speciesNumber))
        for i in range(len(initValues)):
            for j in range(self._beta):
                for k in range(self._speciesNumber):
                    initNew[i * self._beta + j][k] = initValues[i][k]
        initValues = initNew

        if info == True:
            print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs
            occ = tools.OccupancyRecord(
                tools.DeviceData(),
                threads=threads,
                shared_mem=self._compiledRunMethod.shared_size_bytes,
                registers=self._compiledRunMethod.num_regs)
            print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy

        if timing:
            start = time.time()

        # number of device calls
        runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE)))
        for i in range(runs):
            # for the last device call, calculate the number of remaining blocks to run
            if (i == runs - 1):
                runblocks = int(blocks % self._MAXBLOCKSPERDEVICE)
                if (runblocks == 0):
                    runblocks = self._MAXBLOCKSPERDEVICE
            else:
                runblocks = int(self._MAXBLOCKSPERDEVICE)

            if info == True:
                print "cuda-sim: Run", runblocks, "blocks."

            minIndex = self._MAXBLOCKSPERDEVICE * i * threads
            maxIndex = minIndex + threads * runblocks
            runParameters = parameters[minIndex / self._beta:maxIndex /
                                       self._beta]
            runInitValues = initValues[minIndex:maxIndex]

            # on the first run, store the return value
            if (i == 0):
                returnValue = self._runSimulation(runParameters, runInitValues,
                                                  runblocks, threads)
            else:
                returnValue = np.append(returnValue,
                                        self._runSimulation(
                                            runParameters, runInitValues,
                                            runblocks, threads),
                                        axis=0)

        if timing:
            print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round(
                (time.time() - start), 4), "s"

        if info:
            print ""

        return returnValue
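
A standalone sketch of the block-splitting arithmetic used above; MAX_BLOCKS_PER_DEVICE and the sizes are illustrative stand-ins for the instance attributes:

import math

MAX_BLOCKS_PER_DEVICE = 500
blocks, threads, beta = 1250, 64, 1

runs = int(math.ceil(blocks / float(MAX_BLOCKS_PER_DEVICE)))  # 3 device calls here
for i in range(runs):
    if i == runs - 1:
        # the last call runs whatever blocks remain (or a full batch if it divides evenly)
        runblocks = blocks % MAX_BLOCKS_PER_DEVICE
        if runblocks == 0:
            runblocks = MAX_BLOCKS_PER_DEVICE
    else:
        runblocks = MAX_BLOCKS_PER_DEVICE
    min_index = MAX_BLOCKS_PER_DEVICE * i * threads
    max_index = min_index + threads * runblocks
    print("run %d: %d blocks, parameter rows %d:%d" %
          (i, runblocks, min_index // beta, max_index // beta))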