Code example #1
File: NMFskcuda.py Project: HKCaesar/nmf_cuda
    def setVariables(self):
        n, m, r = self.n, self.m, self.rank

        # compile the matrix separations and G update functions for CUDA
        G_size = m * r
        FTF_size = r**2
        max_threads = tools.DeviceData().max_threads
        self.block_G = int(np.min([G_size, max_threads]))
        self.grid_G = int(np.ceil(G_size / np.float32(self.block_G)))
        self.block_FTF = int(np.min([FTF_size, max_threads]))
        self.grid_FTF = int(np.ceil(FTF_size / np.float32(self.block_FTF)))
        mod_msepXTF = compiler.SourceModule(matrix_separation_code % G_size)
        mod_msepFTF = compiler.SourceModule(matrix_separation_code % FTF_size)
        mod_Gupdate = compiler.SourceModule(G_update_code % G_size)
        self.matrix_separationXTF = \
             mod_msepXTF.get_function("matrix_separation")
        self.matrix_separationFTF = \
             mod_msepFTF.get_function("matrix_separation")
        self.G_ew_update = mod_Gupdate.get_function("G_ew_update")

        # allocate the matrices on the GPU
        self.G_gpu = gpuarray.to_gpu(self.G)
        self.F_gpu = gpuarray.empty((n,r), np.float32)
        self.X_gpu = gpuarray.to_gpu(self.X)
        self.GTG_gpu = gpuarray.empty((r,r), np.float32)
        self.GTGinv_gpu = gpuarray.empty((r,r), np.float32)
        self.XG_gpu = gpuarray.empty((n,r), np.float32)
        self.XTF_gpu = gpuarray.empty((m,r), np.float32)
        self.FTF_gpu = gpuarray.empty((r,r), np.float32)
        self.XTFpos_gpu = gpuarray.empty((m,r), np.float32)
        self.XTFneg_gpu = gpuarray.empty((m,r), np.float32)
        self.FTFpos_gpu = gpuarray.empty((r,r), np.float32)
        self.FTFneg_gpu = gpuarray.empty((r,r), np.float32)
        self.GFTFneg_gpu = gpuarray.empty((m,r), np.float32)
        self.GFTFpos_gpu = gpuarray.empty((m,r), np.float32)
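The matrix_separation_code template referenced here is not part of this excerpt. For semi-NMF-style updates the separation splits a matrix elementwise into its positive and negative parts; a minimal sketch of such a kernel, with the element count substituted via the % operator (an assumption, not the project's actual template):

matrix_separation_code = """
__global__ void matrix_separation(float *A, float *Apos, float *Aneg)
{
    // the element count is substituted in by the host code at compile time
    const unsigned int size = %s;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        float v = A[idx];
        Apos[idx] = (v > 0.0f) ? v : 0.0f;   // (|A| + A) / 2
        Aneg[idx] = (v > 0.0f) ? 0.0f : -v;  // (|A| - A) / 2
    }
}
"""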
Code example #2
File: NMFskcuda.py Project: HKCaesar/nmf_cuda
    def setVariables(self):

        # compile the update functions for H and W, since an elementwise
        # multiply/divide kernel is not available in skcuda
        H_size = self.rank * self.m
        W_size = self.n * self.rank
        max_threads = tools.DeviceData().max_threads
        self.block_H = int(np.min([H_size, max_threads]))
        self.block_W = int(np.min([W_size, max_threads]))
        self.grid_H = int(np.ceil(H_size/np.float32(self.block_H)))
        self.grid_W = int(np.ceil(W_size/np.float32(self.block_W)))
        mod_H = compiler.SourceModule(update_kernel_code % H_size)
        mod_W = compiler.SourceModule(update_kernel_code % W_size)
        self.update_H = mod_H.get_function("ew_md")
        self.update_W = mod_W.get_function("ew_md")

        # allocate the matrices on the GPU
        self.H_gpu = gpuarray.to_gpu(self.H)
        self.W_gpu = gpuarray.to_gpu(self.W)
        self.X_gpu = gpuarray.to_gpu(self.X)
        self.WTW_gpu = gpuarray.empty((self.rank, self.rank), np.float32)
        self.WTWH_gpu = gpuarray.empty(self.H.shape, np.float32)
        self.WTX_gpu = gpuarray.empty(self.H.shape, np.float32)
        self.XHT_gpu = gpuarray.empty(self.W.shape, np.float32)
        self.WH_gpu = gpuarray.empty(self.X.shape, np.float32)
        self.WHHT_gpu = gpuarray.empty(self.W.shape, np.float32)
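The update_kernel_code template is likewise not shown. The buffers allocated above (WTX_gpu, WTWH_gpu, XHT_gpu, WHHT_gpu) match the classic multiplicative NMF updates H <- H * (W^T X) / (W^T W H) and W <- W * (X H^T) / (W H H^T), so a plausible sketch of the elementwise multiply-divide kernel named "ew_md" is (an assumption, not the project's actual template):

update_kernel_code = """
__global__ void ew_md(float *A, float *num, float *denom)
{
    // the element count is substituted in by the host code at compile time
    const unsigned int size = %s;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)
        A[idx] *= num[idx] / denom[idx];
}
"""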
Code example #3
File: filt.py Project: melvinlim/nnet
    def __init__(self, x=1, y=1, z=1, a=0, b=255, GPU=True):
        # np.reshape returns a new array rather than modifying in place,
        # so the result has to be assigned back
        self.output = np.zeros(x * y * z, dtype=np.float64)
        if x == 1:
            self.output = self.output.reshape(y)
        elif z == 1:
            self.output = self.output.reshape(x, y)
        else:
            self.output = self.output.reshape(x, y, z)
        self.x = x
        self.y = y
        self.z = z
        self.a = a
        self.b = b
        self.GPU = GPU

        if GPU:
            kernel_code = normalize1DTemplate % {
                'YDIM': self.y,
                'MIN': self.a,
                'MAX': self.b
            }
            module = compiler.SourceModule(kernel_code)
            self.normalize1DKernel = module.get_function("normalize1DKernel")
            kernel_code = normalize2DTemplate % {
                'XDIM': self.x,
                'YDIM': self.y,
                'MIN': self.a,
                'MAX': self.b
            }
            module = compiler.SourceModule(kernel_code)
            self.normalize2DKernel = module.get_function("normalize2DKernel")
Code example #4
def histogram(light):
    grid_gpu_template = """
    __global__ void grid(int *values, int size, int *temp_grid)
    {
         unsigned int id = threadIdx.x;
         int i,bin;
         for(i=id;i<size;i+=blockDim.x){
             bin=values[i];
             if (bin==%(interv)s){
                 bin=%(interv)s-1;  /* clamp the top edge into the last bin */
             }
             temp_grid[id*%(interv)s+bin]+=1;
         }
    }
    """

    reduction_gpu_template = """
    __global__ void reduction(int *temp_grid, int *his)
    {
         unsigned int id = blockIdx.x*blockDim.x+threadIdx.x;
         if(id<%(interv)s){
             for(int i=0;i<%(max_number_of_threads)s;i++){
                 his[id]+=temp_grid[id+%(interv)s*i];
             }
         }
    }
    """
    number_of_points = len(light)
    max_number_of_threads = 1024
    interv = 15626

    blocks = interv // max_number_of_threads  # grid dims must be integers
    if interv % max_number_of_threads != 0:
        blocks += 1

    grid_gpu = grid_gpu_template % {
        'interv': interv,
    }
    mod_grid = compiler.SourceModule(grid_gpu)
    grid = mod_grid.get_function("grid")

    reduction_gpu = reduction_gpu_template % {
        'interv': interv,
        'max_number_of_threads': max_number_of_threads,
    }
    mod_redt = compiler.SourceModule(reduction_gpu)
    redt = mod_redt.get_function("reduction")
    values_gpu = gpuarray.to_gpu(light.astype(np.int32))  # kernel expects int32
    temp_grid_gpu = gpuarray.zeros((max_number_of_threads, interv), dtype=np.int32)
    hist = np.zeros(interv, dtype=np.int32)
    hist_gpu = gpuarray.to_gpu(hist)
    grid(values_gpu, np.int32(number_of_points), temp_grid_gpu, grid=(1, 1), block=(max_number_of_threads, 1, 1))
    redt(temp_grid_gpu, hist_gpu, grid=(blocks, 1), block=(max_number_of_threads, 1, 1))
    hist = hist_gpu.get()
    return hist
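The function assumes its input already fits the hardcoded bin count (interv = 15626), so values outside that range would index out of bounds. A minimal driver, under that assumption:

import numpy as np
import pycuda.autoinit  # create a CUDA context
from pycuda import compiler, gpuarray

# hypothetical input: one million samples in the hardcoded range [0, 15626)
light = np.random.randint(0, 15626, size=1000000).astype(np.int32)
hist = histogram(light)
assert hist.sum() == len(light)  # every sample lands in exactly one bin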
Code example #5
File: Lsoda.py Project: ada586/cuda-sim
    def _compile_at_runtime(self, step_code, parameters):
        # set beta to 1: repeats are pointless as simulation is deterministic
        self._beta = 1

        with open(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'cuLsoda_all.cu'), 'r') as fc:
            _sourceFromFile_ = fc.read()

        _isize_ = "#define ISIZE " + repr(20 + self._speciesNumber) + "\n"
        _rsize_ = "#define RSIZE " + repr(
            22 + self._speciesNumber * max(16, self._speciesNumber + 9)) + "\n"

        _textures_ = "texture<float, 2, cudaReadModeElementType> param_tex;\n"
        _common_block_ = "__device__ struct cuLsodaCommonBlock common[" + repr(
            1 * 1) + "];\n"
        _code_ = _isize_ + _rsize_ + _textures_ + step_code + _sourceFromFile_ + _common_block_ + self._lsoda_source_

        if self._dump:
            with open("full_ode_code.cu", "w") as of:
                of.write(_code_)

        # dummy compile to determine optimal blockSize and gridSize
        compiled = compiler.SourceModule(_code_,
                                         nvcc="nvcc",
                                         options=[],
                                         no_extern_c=True,
                                         keep=False)

        blocks, threads = self._get_optimal_gpu_param(
            parameters, compiled.get_function("cuLsoda"))
        blocks = self._MAXBLOCKSPERDEVICE

        # real compile
        _common_block_ = "__device__ struct cuLsodaCommonBlock common[" + repr(
            blocks * threads) + "];\n"
        _code_ = _isize_ + _rsize_ + _textures_ + step_code + _sourceFromFile_ + _common_block_ + self._lsoda_source_

        if self._dump:
            with open("full_ode_code.cu", "w") as of:
                of.write(_code_)

        compiled = compiler.SourceModule(_code_,
                                         nvcc="nvcc",
                                         options=[],
                                         no_extern_c=True,
                                         keep=False)

        self._param_tex = compiled.get_texref("param_tex")

        lsoda_kernel = compiled.get_function("cuLsoda")
        return compiled, lsoda_kernel
Code example #6
File: snp-v06.01.11.py Project: QWestos/snp-gpgpu
def genCks( allValidSpikVec, MATRIX_SIZE, TILE_WIDTH, configVec_str, spikTransMatFile) :
	#using all generated valid spiking vector files, 'feed' the files to the CUDA C kernels to evaluate (1)
	for spikVec in  allValidSpikVec[ 0 ] :
		# string concatenation of the configVec, Ck-1, from configVec = [ '2', '2', '1', '0', '0', ...]
		# to configVec = 211 <string>
		Ck_1_str = configVec_str 
		#write into total list of Ckspri
		#allGenCk = addTotalCk( allGenCk, Ck_1_str )
		#print spikVec		
		#form the filenames of the Cks and the Sks
		Ck = 'c_' + Ck_1_str + '_' + spikVec
		Ck_1 = 'c_' + Ck_1_str
		Sk = 's_' + spikVec
		#print ' Ck, Ck_1, Sk: ', Ck, Ck_1, Sk
		#import the vectors/Matrix as numpy ND arrays 
		Ck_1 = toNumpyArr( Ck_1, MATRIX_SIZE )
		Sk = toNumpyArr( Sk, MATRIX_SIZE )
		M = toNumpyArr( spikTransMatFile, MATRIX_SIZE )
		#allocate memory in the GPU
		Ck_1gpu = gpuarray.to_gpu( Ck_1 )
		Skgpu = gpuarray.to_gpu( Sk )
		Mgpu = gpuarray.to_gpu( M )
		SkMgpu = gpuarray.empty( ( MATRIX_SIZE, MATRIX_SIZE), np.int32 )
		Ckgpu = gpuarray.empty( ( MATRIX_SIZE, MATRIX_SIZE), np.int32 )
		#get kernel code from template by specifying the constant MATRIX_SIZE
		#matmul_kernel = matmul_kernel_temp % { 'MATRIX_SIZE': MATRIX_SIZE}
		matmul_kernel = matmul_kernel_temp %{'MATRIX_SIZE': MATRIX_SIZE, 'TILE_WIDTH':TILE_WIDTH}
		#matadd_kernel = matadd_kernel_temp % { 'MATRIX_SIZE': MATRIX_SIZE}
		matadd_kernel = matadd_kernel_temp %{'MATRIX_SIZE': MATRIX_SIZE, 'TILE_WIDTH':TILE_WIDTH}
		# compile the kernel code 
		mulmod = compiler.SourceModule(matmul_kernel)
		addmod = compiler.SourceModule(matadd_kernel)
		matrixmul = mulmod.get_function( "MatrixMulKernel" )
		matrixadd = addmod.get_function( "MatrixAddKernel" )
		#call kernel functions
		#matrixmul( Skgpu, Mgpu, SkMgpu, block = ( MATRIX_SIZE, MATRIX_SIZE, 1 ), )
		#print ' BEFORE DEVICE CALLS. Time is '
		#print str(datetime.now())
		#create PyCUDA events to record time of kernel execution
		startTime = driver.Event()
		endTime = driver.Event()
		startTime.record( ) #start the timer
		matrixmul( Skgpu, Mgpu, SkMgpu, block = ( TILE_WIDTH, TILE_WIDTH, 1 ), grid = ( MATRIX_SIZE // TILE_WIDTH, MATRIX_SIZE // TILE_WIDTH ) )
		#matrixadd( Ck_1gpu, SkMgpu, Ckgpu, block = ( MATRIX_SIZE, MATRIX_SIZE, 1 ), )
		matrixadd( Ck_1gpu, SkMgpu, Ckgpu, block = ( TILE_WIDTH, TILE_WIDTH, 1 ), grid = ( MATRIX_SIZE // TILE_WIDTH, MATRIX_SIZE // TILE_WIDTH ) )
		endTime.record( ) #record the end-time event
		endTime.synchronize( ) # synchronize end of threads
		simTime = startTime.time_till( endTime ) * 1e-3
		print( " Kernel call exec time is ", simTime )
		#print ' AFTER DEVICE CALLS. Time is '
		#print str(datetime.now())
		#print Ck_1gpu.get()[ 4 ] #this is a numpy ND array
		#write ND array into a file
		NDarrToFile( Ck, Ckgpu )
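Neither matmul_kernel_temp nor matadd_kernel_temp appears in this excerpt. A standard tiled matrix-multiply kernel matching the launch above (TILE_WIDTH x TILE_WIDTH blocks, a MATRIX_SIZE/TILE_WIDTH grid, int matrices) would look roughly like this sketch (an assumption, not the project's actual template):

matmul_kernel_temp = """
__global__ void MatrixMulKernel(int *A, int *B, int *C)
{
    const int N = %(MATRIX_SIZE)s;
    const int TW = %(TILE_WIDTH)s;
    __shared__ int As[%(TILE_WIDTH)s][%(TILE_WIDTH)s];
    __shared__ int Bs[%(TILE_WIDTH)s][%(TILE_WIDTH)s];
    int row = blockIdx.y * TW + threadIdx.y;
    int col = blockIdx.x * TW + threadIdx.x;
    int acc = 0;
    // assumes MATRIX_SIZE is a multiple of TILE_WIDTH, as the launch implies
    for (int t = 0; t < N / TW; t++) {
        // stage one tile of A and one tile of B in shared memory
        As[threadIdx.y][threadIdx.x] = A[row * N + t * TW + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(t * TW + threadIdx.y) * N + col];
        __syncthreads();
        for (int k = 0; k < TW; k++)
            acc += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();
    }
    C[row * N + col] = acc;
}
"""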
Code example #7
File: snp-v04.16.1.py Project: QWestos/snp-gpgpu
def genCks(allValidSpikVec, MATRIX_SIZE, configVec_str, spikTransMatFile):
    #using all generated valid spiking vector files, 'feed' the files to the CUDA C kernels to evaluate (1)
    #(an earlier version shelled out to a compiled CUDA C binary, e.g. os.popen('./snp-v12.26.10.1 c_211 s0 M 5 c_211_s0'))
    for spikVec in allValidSpikVec[0]:
        # string concatenation of the configVec, Ck-1, from configVec = [ '2', '2', '1', '0', '0', ...]
        # to configVec = 211 <string>
        Ck_1_str = configVec_str
        #write into total list of Ckspri
        #allGenCk = addTotalCk( allGenCk, Ck_1_str )
        #print spikVec
        #form the filenames of the Cks and the Sks
        Ck = 'c_' + Ck_1_str + '_' + spikVec
        Ck_1 = 'c_' + Ck_1_str
        Sk = 's_' + spikVec
        #import the vectors/Matrix as numpy ND arrays
        Ck_1 = toNumpyArr(Ck_1, MATRIX_SIZE)
        Sk = toNumpyArr(Sk, MATRIX_SIZE)
        M = toNumpyArr(spikTransMatFile, MATRIX_SIZE)
        #allocate memory in the GPU
        Ck_1gpu = gpuarray.to_gpu(Ck_1)
        Skgpu = gpuarray.to_gpu(Sk)
        Mgpu = gpuarray.to_gpu(M)
        SkMgpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.int32)
        Ckgpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.int32)
        #get kernel code from template by specifying the constant MATRIX_SIZE
        matmul_kernel = matmul_kernel_temp % {'MATRIX_SIZE': MATRIX_SIZE}
        matadd_kernel = matadd_kernel_temp % {'MATRIX_SIZE': MATRIX_SIZE}
        # compile the kernel code
        mulmod = compiler.SourceModule(matmul_kernel)
        addmod = compiler.SourceModule(matadd_kernel)
        matrixmul = mulmod.get_function("MatrixMulKernel")
        matrixadd = addmod.get_function("MatrixAddKernel")
        #call kernel functions; a single block of MATRIX_SIZE x MATRIX_SIZE
        #threads, so this variant only works while MATRIX_SIZE**2 <= 1024
        matrixmul(
            Skgpu,
            Mgpu,
            SkMgpu,
            block=(MATRIX_SIZE, MATRIX_SIZE, 1),
        )
        matrixadd(
            Ck_1gpu,
            SkMgpu,
            Ckgpu,
            block=(MATRIX_SIZE, MATRIX_SIZE, 1),
        )
        #print Ck_1gpu.get()[ 4 ] #this is a numpy ND array
        #write ND array into a file
        NDarrToFile(Ck, Ckgpu)
Code example #8
File: gpu.py Project: Room-10/Opymize
def prepare_kernels(files, templates, constvars, blockvars=None):
    """ Compile and prepare CUDA kernel functions

    Args:
        files : list of CUDA source file handles
        templates : list of tuples describing the kernels
        constvars : dict of read-only variables
        blockvars : dict of block variables whose descriptions will be
                    included in the preamble as preprocessor macros
    Returns:
        kernels : dict of executable CUDA kernels
    """
    blockvars = {} if blockvars is None else blockvars  # no mutable default
    preamble, constvars = prepare_vars(constvars, blockvars)
    kernels_code = preamble
    for f in files:
        kernels_code += f.read().decode("utf-8")

    mod = compiler.SourceModule(kernels_code)
    kernels = {}
    for d in templates:
        kernels[d[0]] = prepare_kernelfun(mod, *d)

    for name, val in constvars.items():
        const_ptr, size_in_bytes = mod.get_global(name)
        pycuda.driver.memcpy_htod(const_ptr, val)
        # WARNING: The gpudata argument in gpuarray.GPUArray usually requires a
        # pycuda.driver.DeviceAllocation and const_ptr is an int generated from
        # casting a CUdeviceptr to an int.
        # However, since DeviceAllocation is a simple wrapper around CUdeviceptr
        # (that gives a CUdeviceptr when cast to an int), it works like this.
        constvars[name] = gpuarray.GPUArray(val.shape,
                                            val.dtype,
                                            gpudata=const_ptr)

    return kernels
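The constant-memory trick in the loop above — uploading into a __constant__ symbol and wrapping its device pointer in a GPUArray — can be exercised on its own. A minimal self-contained sketch (kernel and variable names are illustrative, not from the project):

import numpy as np
import pycuda.autoinit
import pycuda.driver
from pycuda import compiler, gpuarray

mod = compiler.SourceModule("""
__constant__ float coeffs[4];
__global__ void scale(float *x) { x[threadIdx.x] *= coeffs[threadIdx.x]; }
""")
val = np.arange(4, dtype=np.float32)
const_ptr, size_in_bytes = mod.get_global("coeffs")
pycuda.driver.memcpy_htod(const_ptr, val)      # upload into constant memory
# read back through a GPUArray view, as prepare_kernels does above
view = gpuarray.GPUArray(val.shape, val.dtype, gpudata=const_ptr)
x = gpuarray.to_gpu(np.ones(4, dtype=np.float32))
mod.get_function("scale")(x, block=(4, 1, 1))
print(x.get())                                 # -> [0. 1. 2. 3.]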
Code example #9
def blur(source_array, standard_deviation, filter_width):
    result_array = np.empty_like(source_array)
    red_channel = source_array[:, :, 0].copy()
    green_channel = source_array[:, :, 1].copy()
    blue_channel = source_array[:, :, 2].copy()

    height, width = source_array.shape[:2]

    dim_grid_x = math.ceil(width / BLOCK_SIZE)
    dim_grid_y = math.ceil(height / BLOCK_SIZE)

    gaussian_kernel = create_gaussian_kernel(filter_width, standard_deviation)

    with open('./blur.cu') as f:
        mod = compiler.SourceModule(f.read())
    gaussian_blur = mod.get_function('gaussian_blur')

    for channel in (red_channel, green_channel, blue_channel):
        gaussian_blur(driver.In(channel),
                      driver.Out(channel),
                      np.uint32(width),
                      np.uint32(height),
                      driver.In(gaussian_kernel),
                      np.uint32(filter_width),
                      block=(BLOCK_SIZE, BLOCK_SIZE, 1),
                      grid=(dim_grid_x, dim_grid_y))

    result_array[:, :, 0] = red_channel
    result_array[:, :, 1] = green_channel
    result_array[:, :, 2] = blue_channel

    return result_array
Code example #10
File: cudakde.py Project: andrvch/gwmcmc
def kde_gauss_cuda1d(x, nbins1D):
    nsmpl = len(x)
    nbins = nbins1D
    sigmax = np.std(x)
    bandwidth = 1.06 * sigmax * nsmpl**(-1. / 5.)
    xi = np.linspace(x.min(), x.max(), nbins1D)
    x_gpu = gpuarray.to_gpu(x.astype(np.float32))
    xi_gpu = gpuarray.to_gpu(xi.astype(np.float32))
    pdf_gpu = gpuarray.zeros(nbins, np.float32)
    b_s = 16
    # get the kernel code from the template
    kernel_code = kernel_code_template_cov_1d % {
        'DATA_SIZE': nsmpl,
        'BAND_W': bandwidth,
    }
    # compile the kernel code
    mod = compiler.SourceModule(kernel_code)
    # get the kernel function from the compiled module
    cuda_gauss_kde = mod.get_function("gauss_kde1d")
    # call the kernel on the card
    cuda_gauss_kde(
        # inputs
        xi_gpu,
        x_gpu,
        # output
        pdf_gpu,
        #
        grid=(nbins // b_s, 1),
        # b_s threads per block; assumes nbins1D is a multiple of b_s
        block=(b_s, 1, 1),
    )
    return xi, pdf_gpu.get()  #*(xi[1]-xi[0])
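A quick driver (illustrative; the sample data and bin count are arbitrary, and 128 keeps nbins a multiple of b_s = 16):

import numpy as np

x = np.random.normal(loc=0.0, scale=2.0, size=100000)
xi, pdf = kde_gauss_cuda1d(x, 128)
density = pdf / (pdf.sum() * (xi[1] - xi[0]))  # normalize to unit area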
Code example #11
File: pcaGPU.py Project: Parall-UD/sallfus
def componentes_principales_panchromartic(r_s , g_s, b_s, q, size, block_size):
    nb1_temp = []
    nb2_temp = []
    nb3_temp = []
    kernel_code = kernel_componentes_principales_pancromatica % {
        'BLOCK_SIZE': block_size,  # use the block_size argument, not a global
        }
    mod = compiler.SourceModule(kernel_code)
    kernel = mod.get_function("componentesPrincipalesPancromatica")
    s1_gpu = gpuarray.zeros((block_size,block_size),np.float32)
    s2_gpu = gpuarray.zeros((block_size,block_size),np.float32)
    s3_gpu = gpuarray.zeros((block_size,block_size),np.float32)
    Rs_gpu_t = gpuarray.to_gpu(r_s)
    Gs_gpu_t = gpuarray.to_gpu(g_s)
    Bs_gpu_t = gpuarray.to_gpu(b_s)
    q_gpu = gpuarray.to_gpu(q)
    for i in range(len(r_s)):
        kernel(
            # inputs
            Rs_gpu_t[i], Gs_gpu_t[i], Bs_gpu_t[i], q_gpu,
            # output
            s1_gpu, s2_gpu, s3_gpu,
            # block of multiple threads
            block = (block_size, block_size, 1),
            )
        nb1_temp.append(s1_gpu.get())
        nb2_temp.append(s2_gpu.get())
        nb3_temp.append(s3_gpu.get())
    nb1 = stack_values(nb1_temp, g_s, size, block_size)
    nb2 = stack_values(nb2_temp, g_s, size, block_size)
    nb3 = stack_values(nb3_temp, g_s, size, block_size)
    return nb1, nb2, nb3
Code example #12
    def col_update(self, itr, A, X_device, P, sin, cos, iterBlock):
        self.A_device = gpuarray.to_gpu(A)
        self.X_device = gpuarray.to_gpu(X_device)
        self.dev_sin = gpuarray.to_gpu(sin)
        self.dev_cos = gpuarray.to_gpu(cos)
        self.iterBlock_device = gpuarray.to_gpu(iterBlock)

        # grid dims must be integers: ceil(P / 2)
        grid_size = (P + 1) // 2

        mod2 = compiler.SourceModule(self.col_update_kernel_code)
        col_update_code = mod2.get_function("kernel_col_update")

        col_update_code(itr,
                        self.A_device,
                        self.X_device,
                        P,
                        self.device_eigenvectors,
                        self.dev_sin,
                        self.dev_cos,
                        self.iterBlock_device,
                        block=(int(P), int(P), 1),
                        grid=(int(grid_size), int(grid_size), 1))

        return self.device_eigenvectors.get()
Code example #13
def dotp(a_cpu, b_cpu):

    print(a_cpu.shape, b_cpu.shape)

    # transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)

    # create empty gpu array for the result (C = A * B)
    c_gpu = gpuarray.empty((1, 1), np.float32)

    MATRIX_SIZE = len(a_cpu)

    # get the kernel code from the template
    # by specifying the constant MATRIX_SIZE
    kernel_code = kernel_code_template % {'MATRIX_SIZE': MATRIX_SIZE}

    # compile the kernel code
    mod = compiler.SourceModule(kernel_code)

    # get the kernel function from the compiled module
    matrixmul = mod.get_function("MatrixMulKernel")

    # call the kernel on the card
    matrixmul(
        # inputs
        a_gpu,
        b_gpu,
        # output
        c_gpu,
        # (only one) block of MATRIX_SIZE x MATRIX_SIZE threads
        block=(MATRIX_SIZE, MATRIX_SIZE, 1),
    )

    return c_gpu.get()
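Because the launch uses a single MATRIX_SIZE x MATRIX_SIZE block, len(a_cpu) is capped at 32 (the 1024 threads-per-block limit). A hypothetical call, assuming the unseen kernel_code_template computes the inner product of two length-n vectors:

import numpy as np

n = 16  # must satisfy n * n <= 1024
a_cpu = np.random.rand(n).astype(np.float32)
b_cpu = np.random.rand(n).astype(np.float32)
c = dotp(a_cpu, b_cpu)
print(c[0, 0], np.dot(a_cpu, b_cpu))  # should agree up to float error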
Code example #14
File: svd_cuda.py Project: mythrandire/pycuda-svd
    def MatMul(self, A, rA, cA, B, rB, cB):

        self.C_gpu = gpuarray.empty((A.shape[0], B.shape[1]), dtype=np.float32)
        self.A_gpu = gpuarray.to_gpu(A)
        self.B_gpu = gpuarray.to_gpu(B)

        mod = compiler.SourceModule(self.mul_kernel_code)
        dev_mul = mod.get_function("kernel_MatMul")

        grid_x = int(np.ceil(cB * 1.0 / 16))
        grid_y = int(np.ceil(rA * 1.0 / 16))

        dev_mul(self.A_gpu,
                rA,
                cA,
                self.B_gpu,
                rB,
                cB,  # was passed as cA, which mismatched B's column count
                self.C_gpu,
                block=(16, 16, 1),
                grid=(grid_x, grid_y, 1))
        return self.C_gpu.get()
Code example #15
    def initKernels(self, *args, **kwargs):
        super(tanhLayer, self).initKernels(*args, **kwargs)
        m = self.A.shape[0]
        n = self.A.shape[1]
        kernel_code = cudaModules.forwardTemplate % {'NROWS': m, 'NCOLS': n}
        module = compiler.SourceModule(kernel_code)
        self.kernels.forwardKernel = module.get_function("forwardKernel")
Code example #16
File: utils.py Project: twistedmove/pyhawkes
def compile_kernels(srcFile, kernelNames, srcParams=None):
    """
    Load the GPU kernels from the specified CUDA C file 
    """
    import pycuda.compiler as nvcc

    # Read the src file into a string
    custr = ""
    with io.open(srcFile, 'r') as file:
        for l in file:
            custr += l

    # Replace consts in cu file
    if srcParams is not None:
        custr = custr % srcParams

    # Compile the CUDA Kernel
    cu_kernel_source_module = nvcc.SourceModule(custr)

    # Load the kernels into a dictionary
    kernels = {}
    for name in kernelNames:
        try:
            kernels[name] = cu_kernel_source_module.get_function(name)
        except Exception:
            log.error("Failed to find kernel function: %s", name)
            exit()

    return kernels
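A hypothetical call (the source file name, kernel name, and parameters below are illustrative; srcParams fills %(...)s placeholders in the CUDA source before compilation):

import numpy as np
from pycuda import gpuarray

kernels = compile_kernels("hawkes_kernels.cu", ["scale_vector"],
                          srcParams={"BLOCK_SIZE": 256})
x_gpu = gpuarray.to_gpu(np.ones(256, dtype=np.float32))
kernels["scale_vector"](x_gpu, np.float32(2.0),
                        block=(256, 1, 1), grid=(1, 1))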
Code example #17
    def parallel_transpose(self):
        # return: the transpose of input matrix
        # TODO:
        # Memory copy to device
        # Function call and measuring time here

        mod = compiler.SourceModule(self.kernel_code)  #compiling the kernel
        transpose = mod.get_function("transpose")  #getting the function

        start = time.time()  #start timer
        transpose(
            # inputs
            self.a_gpu,
            # output
            self.b_gpu,
            np.float32(self.a_cpu.shape[1]),
            np.float32(self.a_cpu.shape[0]),

            # block size
            block=(16, 16, 1))  #kernel call
        self.times_gpu = time.time() - start  #get time
        # Memory copy to host
        self.b_cpu = self.b_gpu.get()  #copy result to host variable
        # Return output and measured time

        return self.b_cpu, self.times_gpu
Code example #18
    def matrix_mul_naive(self):
        TILE_WIDTH = self.TILE_WIDTH
        self.kernel_code = self.kernel_code_template % {
            'TILE_WIDTH': TILE_WIDTH
        }
        n = int(np.ceil(self.b_cpu.shape[1] / TILE_WIDTH))
        m = int(np.ceil(self.a_cpu.shape[0] / TILE_WIDTH))
        mod = compiler.SourceModule(self.kernel_code)
        matmul = mod.get_function("MatrixMulKernel_naive")
        start = time.time()
        matmul(self.a_gpu,
               self.b_gpu,
               self.c_gpu,
               np.int32(self.a_cpu.shape[0]),
               np.int32(self.a_cpu.shape[1]),
               np.int32(self.b_cpu.shape[1]),
               block=(TILE_WIDTH, TILE_WIDTH, 1),
               grid=(n, m, 1))
        times_gpu_ = time.time() - start

        self.c_cpu = self.c_gpu.get()

        return self.c_cpu, times_gpu_
Code example #19
    def _get_func(self):
        """Build, compile, and return the CUDA kernel for the gradient of Ws."""

        kernel_template = """
        
        #define Ws_W $Ws_W
        #define Ws_H $Ws_H
        
        __global__ void grad_Ws(double*d, double*x, double*grad){
        
            const size_t tx = threadIdx.x;
            const size_t ty = threadIdx.y;
            
            __shared__ double sd[Ws_H];
            __shared__ double sx[Ws_W];
            
            if (tx < Ws_W && ty < Ws_H){

                sx[tx] = x[tx];
                sd[ty] = d[ty];

                // NB: only safe when the block is exactly Ws_W x Ws_H threads;
                // otherwise this __syncthreads() would be divergent
                __syncthreads();

                grad[ty * Ws_W + tx] += sd[ty] * sx[tx];
            }
        }
        """
        kernel_template = string.Template(kernel_template)
        kernel_code = kernel_template.substitute(Ws_W = self.params['Ws_w'],
                                                 Ws_H = self.params['Ws_h'])
        module = compiler.SourceModule(kernel_code)
        return module.get_function('grad_Ws')
Code example #20
def rgb2gray(image, height, width, channels=3):
    """
    Convert an RGB image to grayscale
    """
    # Allocate the host arrays
    a_cpu = np.array(image).astype(np.float32)
    b_cpu = np.zeros((height, width)).astype(np.float32)

    # Allocate the device memory needed for processing
    a_gpu = cuda.mem_alloc(a_cpu.nbytes)
    b_gpu = cuda.mem_alloc(b_cpu.nbytes)

    # Copy the input from the CPU to the GPU
    cuda.memcpy_htod(a_gpu, a_cpu)

    # Fill the kernel template with the required values
    kernel_code = kernel_code_template % {
        'width': str(width),
        'height': str(height),
        'channels': str(channels)
    }

    # Compile the kernel
    mod = compiler.SourceModule(kernel_code)
    matrixmul = mod.get_function('rgb2gray')
    # Launch the kernel (block and grid are hardcoded for 600x288 images)
    matrixmul(b_gpu, a_gpu, block=(6, 36, 1), grid=(100, 8, 1))

    # Copy the results processed by the kernel back to the CPU
    cuda.memcpy_dtoh(b_cpu, b_gpu)
    return b_cpu
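Since the launch covers exactly 6*100 = 600 threads in x and 36*8 = 288 in y, the function only handles images of that size. An illustrative call:

import numpy as np

# hypothetical 288x600 RGB image (height, width, channels)
image = np.random.randint(0, 256, size=(288, 600, 3)).astype(np.float32)
gray = rgb2gray(image, height=288, width=600)
print(gray.shape)  # (288, 600)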
Code example #21
        def col_update(self, iter, A, X, P, sin, cos, iterBlock):
            self.A_device = gpuarray.to_gpu(A)
            self.dev_sin = gpuarray.to_gpu(sin)
            self.dev_cos = gpuarray.to_gpu(cos)
            self.iterBlock_device = gpuarray.to_gpu(iterBlock)
            self.X_device = gpuarray.to_gpu(X)
            self.device_eigenvectors = gpuarray.empty((P, P), np.float32)

            # grid dims must be integers: ceil(P / 2)
            grid_size = (P + 1) // 2

            mod2 = compiler.SourceModule(self.col_update_kernel_code)
            col_update_code = mod2.get_function("kernel_col_update")

            col_update_code(
                iter, self.A_device,
                self.X_device, P,
                self.device_eigenvectors,
                self.dev_sin, self.dev_cos,
                self.iterBlock_device,
                block = (int(P), int(P), 1),
                grid = (int(grid_size), int(grid_size))
            )

            return self.device_eigenvectors.get()
Code example #22
    def __init__(self, idata):
        # idata: an array of lowercase characters.
        # TODO:
        # Declare host variables
        # Device memory allocation
        # Kernel code
        # -- initialize the device
        self.a_cpu = idata
        self.L = len(idata)

        import pycuda.autoinit

        self.kernel_code = """
        __global__ void CapCharKernel(char *a, char *b)
        {
            // 1D Thread ID (assuming that only *one* block will be executed)
            int tx = threadIdx.x;
            b[tx] = a[tx] - 32;
        }
        """

        # compile the kernel code
        mod = compiler.SourceModule(self.kernel_code)

        # get the kernel function from the compiled module
        self.capchar = mod.get_function("CapCharKernel")

        self.b_gpu = gpuarray.empty(self.L, 'S1')
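A standalone run of the same launch pattern (a sketch: the kernel is repeated here so the snippet is self-contained; in the class above it lives in self.capchar and self.b_gpu):

import numpy as np
import pycuda.autoinit
from pycuda import compiler, gpuarray

mod = compiler.SourceModule("""
__global__ void CapCharKernel(char *a, char *b)
{
    int tx = threadIdx.x;
    b[tx] = a[tx] - 32;
}
""")
capchar = mod.get_function("CapCharKernel")

idata = np.frombuffer(b"hello", dtype='S1')  # lowercase input characters
a_gpu = gpuarray.to_gpu(idata)
b_gpu = gpuarray.empty(len(idata), 'S1')
capchar(a_gpu, b_gpu, block=(len(idata), 1, 1))  # one thread per character
print(b_gpu.get().tobytes())  # b'HELLO'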
Code example #23
    def compute_params(self, A, P, itr, iterblock):
        self.A_gpu = gpuarray.to_gpu(A)
        self.iterBlock_device = gpuarray.to_gpu(iterblock)
        self.dev_sin = gpuarray.empty((P, P), np.float32)
        self.dev_cos = gpuarray.empty((P, P), np.float32)
        if (P % 2 == 0):
            grid_size = P // 2
        else:
            grid_size = P // 2 + 1
        mod = compiler.SourceModule(self.compute_params_kernel_code)
        compute_params_code = mod.get_function("kernel_compute_params")
        compute_params_code(self.A_gpu,
                            P,
                            itr,
                            self.dev_sin,
                            self.dev_cos,
                            self.iterBlock_device,
                            block=(grid_size, grid_size, 1))
        # block size?
        dc = self.dev_cos.get()
        ds = self.dev_sin.get()
        self.A_gpu.get()
        self.iterBlock_device.get()

        return ds, dc
Code example #24
    def runAdd_parallel(self):
        # return: an array containing capitalized characters from idata and running time.
        # TODO:
        # Memory copy to device
        input_gpu = gpuarray.to_gpu(self.a)  #allocate for input
        #        print("Parallel Input completed")

        # Function call and measuring time here

        #        print("Starting Kernel")
        kernel_code = self.capitalize_kernel  #get kernel code from template
        mod = compiler.SourceModule(kernel_code)  #compile the kernel code
        capitalize = mod.get_function("Capitalize")  # get the kernel function

        start = datetime.datetime.now()
        capitalize(input_gpu,
                   self.output_gpu,
                   block=(1024, 1, 1),
                   grid=(self.num_blocks + 1, 1, 1))
        total_time = datetime.datetime.now() - start

        # Memory copy to host
        result = self.output_gpu.get()

        # Return output and measured time
        #        print(result, total_time)
        return result, total_time
Code example #25
File: grid_traverse.py Project: mickvav/fdfd
    def __init__(self, shape, code, *params):
        """Return a cuda function that will execute on a grid.

        Input arguments:
        shape -- the size of the grid.
        code -- the CUDA source code to be executed at every cell.
        *params -- (type, name) tuples of the input parameters.

        Output arguments:
        wrapped_fun -- a function that accepts a list of pycuda.gpuarray.GPUArray
            objects as well as pycuda.driver.Function.__call__ keyword arguments.
        """

        # Initialize parameters.
        self.shape = shape  # Size of the simulation.
        self.block_shapes, self.grid_shapes = _get_shapes(shape)

        # Get the template and render it using jinja2.
        template = jinja_env.get_template('traverse.cu')
        cuda_source = template.render(params=params,
                                      dims=self.shape,
                                      loop_code=code,
                                      flat_tag='_f')
        # Dump the rendered source for debugging.
        with open('/tmp/code', 'w') as f:
            f.write(cuda_source)

        # Compile the code into a callable cuda function.
        mod = compiler.SourceModule(cuda_source)
        self.fun = mod.get_function('traverse')
Code example #26
File: pcaGPU.py Project: Parall-UD/sallfus
def componentes_principales_original(r_s , g_s, b_s, q, size, block_size):
    cp1_temp = []
    cp2_temp = []
    cp3_temp = []
    kernel_code = kernel_componentes_principales_original % {
        'BLOCK_SIZE': block_size,  # use the block_size argument, not a global
        }
    mod = compiler.SourceModule(kernel_code)
    kernel = mod.get_function("componentesPrincipalesOriginal")
    s1_gpu = gpuarray.zeros((block_size,block_size),np.float32)
    s2_gpu = gpuarray.zeros((block_size,block_size),np.float32)
    s3_gpu = gpuarray.zeros((block_size,block_size),np.float32)
    q_gpu = gpuarray.to_gpu(q)
    Rs_gpu_t = gpuarray.to_gpu(r_s)
    Gs_gpu_t = gpuarray.to_gpu(g_s)
    Bs_gpu_t = gpuarray.to_gpu(b_s)
    for i in range(len(r_s)):
        kernel(
            # inputs
            Rs_gpu_t[i], Gs_gpu_t[i], Bs_gpu_t[i], q_gpu,
            # output
            s1_gpu, s2_gpu, s3_gpu,
            # block of multiple threads
            block = (block_size, block_size, 1),
            )
        cp1_temp.append(s1_gpu.get())
        cp2_temp.append(s2_gpu.get())
        cp3_temp.append(s3_gpu.get())
    cp1 = stack_values(cp1_temp, r_s, size, block_size)
    cp2 = stack_values(cp2_temp, r_s, size, block_size)
    cp3 = stack_values(cp3_temp, r_s, size, block_size)
    return cp1, cp2, cp3
Code example #27
    def generate_forward_euler_code(self):
        eqs = self.eqs
        M = len(eqs._diffeq_names)
        all_variables = eqs._eq_names + eqs._diffeq_names + list(eqs._alias.keys()) + ['t']
        clines = '__global__ void stateupdate(int N, SCALAR t, SCALAR *S)\n'
        clines += '{\n'
        clines += '    int i = blockIdx.x * blockDim.x + threadIdx.x;\n'
        clines += '    if(i>=N) return;\n'
        for j, name in enumerate(eqs._diffeq_names):
            clines += '    int _index_' + name + ' = i+' + str(j) + '*N;\n'
        for j, name in enumerate(eqs._diffeq_names):
#            clines += '    SCALAR &' + name + ' = S[i+'+str(j)+'*N];\n'
            clines += '    SCALAR ' + name + ' = S[_index_' + name + '];\n'
        for j, name in enumerate(eqs._diffeq_names):
            namespace = eqs._namespace[name]
            expr = optimiser.freeze(eqs._string[name], all_variables, namespace)
            expr = rewrite_to_c_expression(expr)
            print(expr)
            if name in eqs._diffeq_names_nonzero:
                clines += '    SCALAR ' + name + '__tmp = ' + expr + ';\n'
        for name in eqs._diffeq_names_nonzero:
#            clines += '    '+name+' += '+str(self.clock_dt)+'*'+name+'__tmp;\n'
            clines += '    S[_index_' + name + '] = ' + name + '+' + str(self.clock_dt) + '*' + name + '__tmp;\n'
        clines += '}\n'
        clines = clines.replace('SCALAR', self.precision)
        self.gpu_mod = compiler.SourceModule(clines)
        self.gpu_func = self.gpu_mod.get_function("stateupdate")
        return clines
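For a single state variable v with dv/dt = -v/tau, precision 'float', tau frozen to 0.01, and clock_dt = 0.0001, the string assembled above would come out roughly as (an illustrative reconstruction, not captured output):

__global__ void stateupdate(int N, float t, float *S)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i>=N) return;
    int _index_v = i+0*N;
    float v = S[_index_v];
    float v__tmp = -v/0.01;
    S[_index_v] = v+0.0001*v__tmp;
}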
Code example #28
def genModulo():
    """
    Calculates corners of objects across all frames
    Returns: GPU kernel
    """

    # Setup Kernel
    kernels = compiler.SourceModule("""
    #include <stdio.h>

    __global__ void FindObj(int* a, int* frameOrigin, int* c, int img_height, int img_width)
    {
       // Setup Indexing
       int tx = threadIdx.x;
       int ty = threadIdx.y;
       int row_o = blockIdx.y*blockDim.y + ty;
       int col_o = blockIdx.x*blockDim.x + tx;

       // printf("%i %i \\n", tx, ty);
       // printf("row_o:%i col_o:%i \\n", row_o, col_o);
       // printf("blockIdx.y:%i blockDim.y:%i \\n", blockIdx.y, blockDim.y);
       // printf("blockIdx.x:%i blockDim.x:%i \\n", blockIdx.x, blockDim.x);

       int pixelColor = a[row_o*img_width + col_o];

        //Gather all criteria for corner
        bool row_plus1 = false; bool row_plus2 = false;
        bool col_plus1 = false; bool col_plus2 = false;
        bool row_minus1 = true; bool col_minus1 = true;

        //Next rows in same column have same color and are in bounds
        //(NB: the modulo makes the bounds test below always true, so it does
        //not actually guard reads across the bottom frame boundary)
        if(((row_o + 1) %  img_height) < img_height && a[(row_o + 1)*img_width + col_o] == pixelColor){row_plus1 = true;}
        if(((row_o + 2) %  img_height) < img_height && a[(row_o + 2)*img_width + col_o] == pixelColor){row_plus2 = true;}

        //Next cols in same row have same color and are in bounds
        if((col_o + 1) < img_width && a[row_o*img_width + col_o + 1] == pixelColor){col_plus1 = true;}
        if((col_o + 2) < img_width && a[row_o*img_width + col_o + 2] == pixelColor){col_plus2 = true;}

        //Confirm not in the middle of an edge and on a vertex
        if((row_o - 1) >= 0 && a[(row_o - 1)*img_width + col_o] == pixelColor){row_minus1 = false;}
        if((col_o - 1) >= 0 && a[row_o*img_width + col_o - 1] == pixelColor){col_minus1 = false;}

        //printf("row: %i col: %i pixelColor: %i, row_plus1: %i, row_plus2: %i, col_plus1: %i, col_plus2: %i, row_minus1: %i, col_minus1: %i \\n",row_o, col_o, pixelColor, row_plus1, row_plus2, col_plus1, col_plus2, row_minus1, col_minus1);
        //If vertex, do all calculations for global array
        if(row_plus1 && row_plus2 && col_plus1 && col_plus2 && row_minus1 && col_minus1){
            int frame = (int) ((row_o*img_width + col_o)/(img_width*img_height-1));
            //printf("Found corner. row_o:%i col_o:%i pixelColor: %i, frame: %i\\n",row_o, col_o, pixelColor, frame);
            frameOrigin[frame*2] = col_o;
            frameOrigin[frame*2+1] = row_o  %  img_height;
        }

        //Check all criteria for validity
        //Modulo absolute thread location by frame size and store x val/y val to according locations in global array
        //After all global array values set, compare set of 2 x/y pairs to determine movement and set movement indicator in other global array
        //Return final global array
    }

    """)

    return kernels
Code example #29
    def compile_kernel(self):
        """Compile the file containing the CUDA kernels."""

        t0 = time()
        with open(self.kernel, 'r') as f:
            kernel_code = f.read()
        self.mod = compiler.SourceModule(kernel_code)
        print('GPU - Kern : %f' % (time() - t0))
Code example #30
    def __init__(self, n, nrhs, coeffs):
        '''
        Parameters
        ----------
        n: The size of the tridiagonal system.
        nrhs: The number of right hand sides
        coeffs: A list of coefficients that make up the tridiagonal matrix:
            (b1, c1, ai, bi, ci, an, bn)
        '''
        self.n = n
        self.nrhs = nrhs
        self.coeffs = coeffs

        # check that system_size is a power of 2:
        assert int(np.log2(self.n)) == np.log2(self.n)

        # compute coefficients a, b, etc.,
        a, b, c, k1, k2, b_first, k1_first, k1_last = _precompute_coefficients(
            self.n, self.coeffs)

        # copy coefficients to buffers:
        self.a_d = gpuarray.to_gpu(a)
        self.b_d = gpuarray.to_gpu(b)
        self.c_d = gpuarray.to_gpu(c)
        self.k1_d = gpuarray.to_gpu(k1)
        self.k2_d = gpuarray.to_gpu(k2)
        self.b_first_d = gpuarray.to_gpu(b_first)
        self.k1_first_d = gpuarray.to_gpu(k1_first)
        self.k1_last_d = gpuarray.to_gpu(k1_last)

        tpl = jinja2.Template(kernel_template)
        rendered_kernel = tpl.render(n=self.n, shared_size=self.n // 2)
        module = compiler.SourceModule(rendered_kernel, options=['-O2'])
        self.cyclic_reduction = module.get_function('sharedMemCyclicReduction')
        self.cyclic_reduction.prepare('PPPPPPPPPddddd')
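prepare('PPPPPPPPPddddd') registers nine pointer arguments followed by five doubles, so launches go through prepared_call rather than the usual __call__. The solve method is not shown; a hypothetical invocation consistent with that signature (the argument order, the x_d buffer, and the scalar list are guesses, not the project's actual call):

# a sketch: x_d is assumed to be a gpuarray holding the right-hand sides
# and receiving the solution; b1, c1, ai, bi, ci come from self.coeffs
self.cyclic_reduction.prepared_call(
    (self.nrhs, 1), (self.n // 2, 1, 1),
    self.a_d.gpudata, self.b_d.gpudata, self.c_d.gpudata,
    self.k1_d.gpudata, self.k2_d.gpudata, self.b_first_d.gpudata,
    self.k1_first_d.gpudata, self.k1_last_d.gpudata, x_d.gpudata,
    b1, c1, ai, bi, ci)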