Beispiel #1
0
def diffuse_pycuda(u):
    """Integrate a 2-D diffusion problem on the GPU and return the final field.

    The first row and first column of ``u`` are set to 200 in place, so the
    caller's array is modified.  A ``float32`` copy is then uploaded and the
    ``copy_array`` / ``update`` kernels are launched once per time step.

    Args:
        u: 2-D numpy array holding the initial field.
            NOTE(review): only ``nx`` and ``dx`` are handed to the kernels,
            so they presumably expect a square grid -- confirm against the
            kernel source.

    Returns:
        A ``float32`` array of the same shape holding the field after the
        last time step.
    """
    nx, ny = np.int32(u.shape)  # unpacking fails unless u is 2-D
    alpha = np.float32(0.645)
    dx = np.float32(3.5 / (nx - 1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time / dt))

    # Fixed values on the first row and first column.
    u[0, :] = 200
    u[:, 0] = 200

    u = u.astype(np.float32)
    u_prev = u.copy()

    u_d = cuda.mem_alloc(u.nbytes)
    u_prev_d = cuda.mem_alloc(u_prev.nbytes)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # Integer ceiling division.  np.ceil(nx / BLOCKSIZE) rounds *after* the
    # division has already truncated when "/" is integer division, which
    # leaves the last partial tile of the domain without any threads.
    # NOTE(review): this relies on the kernels bounds-checking against nx.
    blocks_per_axis = (int(nx) + BLOCKSIZE - 1) // BLOCKSIZE
    gridSize = (blocks_per_axis, blocks_per_axis, 1)
    blockSize = (BLOCKSIZE, BLOCKSIZE, 1)

    for _ in range(nt + 1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE),
                   block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE),
               block=blockSize, grid=gridSize)

    cuda.memcpy_dtoh(u, u_d)

    return u
Beispiel #2
0
    def __compute_sub_gaussian_gpu(self, sub_partitions):
        """Process ``self.pts`` on the GPU in ``sub_partitions`` equal chunks.

        The points are first normalised in place using ``self.axis``
        (column 0 with ``axis[0:2]``, column 1 with ``axis[2:4]``).  Each
        chunk is handed to ``__compute_guassian_on_pts`` while
        ``self.pts_gpu`` holds a device buffer sized for one chunk.  Points
        left over after the even split are processed in one final call with
        a buffer of their own.

        NOTE(review): the normalisation overwrites ``self.pts``, so calling
        this twice on the same points normalises them twice -- confirm that
        callers only invoke it once per data set.

        Raises:
            Exception: if ``sub_partitions`` is below 1 or exceeds the
                number of points.
        """
        n_pts = self.pts.shape[0]
        if sub_partitions < 1:
            raise Exception("You can't have less than 1 partition")
        elif sub_partitions > n_pts:
            raise Exception("sub partitions need to be smaller than pts size")

        # Floor division keeps the chunk length an int, so it stays usable
        # as a slice index and as an allocation size on Python 3 as well.
        d_part = n_pts // sub_partitions
        leftover = n_pts % sub_partitions
        bytes_per_point = 2 * self.pts.itemsize

        self.pts_gpu = cuda.mem_alloc(d_part * bytes_per_point)
        try:
            self.pts[:, 0] = (self.pts[:, 0] - self.axis[0])/(self.axis[1] - self.axis[0])
            self.pts[:, 1] = (self.pts[:, 1] - self.axis[2])/(self.axis[3] - self.axis[2])

            for partition in range(sub_partitions):
                sub_pts = self.pts[partition*d_part:(partition+1)*d_part, :]
                self.__compute_guassian_on_pts(sub_pts)
        finally:
            # Release the device buffer even if a chunk fails.
            self.pts_gpu.free()

        # Handle the points that did not fit into the even split.
        if leftover:
            self.pts_gpu = cuda.mem_alloc(leftover * bytes_per_point)
            try:
                self.__compute_guassian_on_pts(self.pts[sub_partitions*d_part:, :])
            finally:
                self.pts_gpu.free()
Beispiel #3
0
def main():
    """Set up launch dimensions and device buffers, then run ``do_work``.

    The image geometry is fixed at 826 x 1169 pixels with 3 channels.  With
    ``LINEAR`` set, the launch is one-dimensional over every channel value;
    otherwise it is a 2-D grid of 16 x 8 x d thread blocks.  Three device
    buffers are allocated and passed to ``do_work`` together with the 50
    ``form1.N.png`` / ``form2.N.png`` path pairs.
    """
    (h, w), d = (826, 1169), 3  # img1.size, len(img1_arr[0][0])
    n_values = h * w * d

    # Ceiling divisions written with "//" so the block counts stay ints
    # regardless of the interpreter's "/" semantics.
    if LINEAR:
        thread_x, thread_y, thread_z = 128, 1, 1
        block_x, block_y = (n_values + thread_x - 1) // thread_x, 1
    else:
        thread_x, thread_y, thread_z = 16, 8, d
        block_x = (h + thread_x - 1) // thread_x
        block_y = (w + thread_y - 1) // thread_y

    # 4 bytes per channel value (826 * 1169 * 3 == 2896782 values).
    image_data_size = n_values * 4
    a_gpu = cuda.mem_alloc(image_data_size)
    b_gpu = cuda.mem_alloc(image_data_size)
    c_gpu = cuda.mem_alloc(image_data_size)

    image_path_pairs = [
        ('form1.%d.png' % page_num, 'form2.%d.png' % page_num)
        for page_num in range(1, 51)
    ]

    do_work(image_path_pairs, a_gpu, b_gpu, c_gpu,
            (thread_x, thread_y, thread_z), (block_x, block_y))
Beispiel #4
0
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    """Run ``n`` launches of the Poisson blending kernel and return the image.

    Args:
        source_im: host array uploaded as the kernel's source image.
        dest_im: host array uploaded as the destination image; its shape
            supplies the template's WIDTH (axis 1) and HEIGHT (axis 0).
        b_size: CUDA block dimensions; the first two entries fill the
            template's BLOCK_DIM_X / BLOCK_DIM_Y.
        g_size: CUDA grid dimensions.
        RGB: value substituted for the template's RGB variable.
        neighbors: value substituted for the template's NEIGHBORS variable.
        interior_buffer: host array uploaded as the kernel's third argument.
        n: number of kernel launches.

    Returns:
        ``uint8`` array with the shape of ``dest_im`` holding the device
        destination buffer after the last launch.
        NOTE(review): the copy back assumes ``dest_im`` is itself ``uint8``
        so that both buffers have the same byte size -- confirm with callers.
    """
    # Create the Cheetah template and fill in the kernel's variables.
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # Compile the CUDA kernel.
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # Allocate device memory and upload the inputs.
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # Launch the blending kernel n times.
    for _ in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer,
                                block=b_size, grid=g_size)

    # Retrieve the final output image.
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
Beispiel #5
0
    def calc_psd(self,bitloads,xtalk):
        """Run the ``k_calcpsd`` kernel and return its P output on the host.

        Args:
            bitloads: bit loading matrix; flattened with ``util.mat2arr``
                and uploaded as ``int32``.
            xtalk: crosstalk gains, uploaded as ``self.type``.
                NOTE(review): the device buffer holds K*N*N elements, so
                ``xtalk`` presumably has that many -- confirm with callers.

        Returns:
            ``(K, N)`` array of ``float64`` copied back from the device.
        """
        # Number of expected permutations
        Ncombinations = self.K

        # Only the thread and block counts of the workload split are needed.
        (_, _, threadCount, blockCount) = self._workload_calc(Ncombinations)

        threadshare_grid = (blockCount, 1)
        threadshare_block = (threadCount, 1, 1)

        # Byte sizes are computed arithmetically rather than by building
        # host-side zero arrays just to read their .nbytes.
        fp_size = np.dtype(self.type).itemsize
        int_size = np.dtype(np.int32).itemsize
        h_p = np.zeros((self.K, self.N), dtype=self.type)

        buffers = []
        try:
            # Memory (we get away with Ncombinations because the kernel
            # checks against it)
            d_a = cuda.mem_alloc(int(Ncombinations * self.N * self.N) * fp_size)
            buffers.append(d_a)
            d_p = cuda.mem_alloc(int(Ncombinations * self.N) * fp_size)
            buffers.append(d_p)
            d_bitload = cuda.mem_alloc(int(self.K * self.N) * int_size)
            buffers.append(d_bitload)
            d_XTG = cuda.mem_alloc(int(self.K * self.N * self.N) * fp_size)
            buffers.append(d_XTG)

            cuda.memcpy_htod(d_bitload, util.mat2arr(bitloads).astype(np.int32))
            cuda.memcpy_htod(d_XTG, xtalk.astype(self.type))

            # __global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N)
            self.k_calcpsd(d_a, d_p, d_XTG, d_bitload, np.int32(Ncombinations),
                           block=threadshare_block, grid=threadshare_grid)
            cuda.Context.synchronize()
            cuda.memcpy_dtoh(h_p, d_p)
        finally:
            # Release whatever was allocated, including on failure.
            for buf in buffers:
                buf.free()
        return h_p.astype(np.float64)
Beispiel #6
0
def cuda_crossOver(sola, solb):
    """Apply the ``crossOver`` kernel to two solutions and return the results.

    Both inputs are uploaded, the kernel is launched with one thread per
    element of ``sola`` in a single block, and both device buffers are
    copied back into fresh arrays; the inputs themselves are left untouched.

    NOTE(review): a single block limits ``len(sola)`` to the device's
    maximum threads per block -- confirm that solutions stay below it.

    Args:
        sola: first solution (numpy array).
        solb: second solution (numpy array).

    Returns:
        Tuple ``(a_new, b_new)`` with the contents of the two device
        buffers after the kernel has run.
    """
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)

    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)

    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        # The inputs are sola/solb; the previous code printed the
        # undefined names a/b here.
        print("a: %s" % (sola,))
        print("b: %s" % (solb,))
        print("new a: %s" % (a_new,))
        print("new b: %s" % (b_new,))

    return a_new, b_new
def get_spharms_l_eq_2(theta, phi, selected_Modes_gpu, rslt_gpu):
    """Launch the ``compute_sph_harmonics_l_eq_2`` kernel.

    The sorted second entries of the ``selected_modes`` items, their count
    and ``nsamps`` are uploaded to the device, and the kernel is launched
    with one block of 1024 threads.  Nothing is returned.

    NOTE(review): ``selected_modes`` and ``nsamps`` are not parameters and
    are resolved from the enclosing module; ``selected_Modes_gpu`` is
    accepted but never used.  ``theta``, ``phi`` and ``rslt_gpu`` are passed
    straight to the kernel, so they are presumably device allocations --
    confirm with callers.

    Args:
        theta: passed to the kernel as its first argument.
        phi: passed to the kernel as its second argument.
        selected_Modes_gpu: unused.
        rslt_gpu: passed to the kernel as its last argument.
    """
    modelist = np.array(sorted(mode[1] for mode in selected_modes)).astype(np.int32)
    nmodeslen = np.array(len(modelist), ndmin=1).astype(np.int32)

    modelist_gpu = cuda.mem_alloc(modelist.nbytes)
    nsamps_gpu = cuda.mem_alloc(nsamps.nbytes)
    nmodes_gpu = cuda.mem_alloc(nmodeslen.nbytes)

    cuda.memcpy_htod(nsamps_gpu, nsamps)
    cuda.memcpy_htod(nmodes_gpu, nmodeslen)
    cuda.memcpy_htod(modelist_gpu, modelist)

    # The kernel writes into rslt_gpu; the separate result buffer that used
    # to be allocated here was never passed to it and has been dropped.
    sph = mod.get_function("compute_sph_harmonics_l_eq_2")
    blk = (1024, 1, 1)
    grd = (1, 1, 1)
    sph(theta, phi, modelist_gpu, nmodes_gpu, nsamps_gpu, rslt_gpu,
        block=blk, grid=grd)
Beispiel #8
0
    def alloc(self, dim, stream=None):
        """
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.

        ``dim`` supplies ``ah`` and ``astride``; their product is the number
        of bins, and each of the three buffers gets 16 bytes per bin.

        Raises ``cuda.MemoryError`` if the device cannot satisfy the request;
        in that case the buffers are released before re-raising.
        """
        nbins = dim.ah * dim.astride
        # The explicit None test avoids comparing None with an int, which
        # is a TypeError on Python 3.
        if self.nbins is not None and self.nbins >= nbins:
            return
        if self.nbins is not None:
            # Hand the stream on so the old buffers are released as the
            # docstring promises.
            self.free(stream)
        try:
            self.d_front = cuda.mem_alloc(16 * nbins)
            self.d_back = cuda.mem_alloc(16 * nbins)
            self.d_side = cuda.mem_alloc(16 * nbins)
            self.nbins = nbins
        except cuda.MemoryError:
            # If a frame that's too large sneaks by the task distributor, we
            # don't want to kill the server, but we also don't want to leave
            # it stuck without any free memory to complete the next alloc.
            # TODO: measure free mem and only take tasks that fit (but that
            # should be done elsewhere)
            self.free(stream)
            raise
    def prepare_device_arrays(self):
        """Upload the layer tables and the fine bin centres to the device.

        ``self.grid_prop`` fills three host arrays (layer count, density per
        layer and distance per layer), which are then copied into freshly
        allocated device buffers stored on ``self``.  ``self.ecen_fine`` and
        ``self.czcen_fine`` are uploaded the same way.
        """
        self.maxLayers = self.grid_prop.GetMaxLayers()
        n_cz = len(self.czcen_fine)
        n_layer_slots = n_cz * self.maxLayers

        layer_counts = np.zeros(n_cz, dtype=np.int32)
        layer_density = np.zeros(n_layer_slots, dtype=self.FTYPE)
        layer_distance = np.zeros(n_layer_slots, dtype=self.FTYPE)

        self.grid_prop.GetNumberOfLayers(layer_counts)
        self.grid_prop.GetDensityInLayer(layer_density)
        self.grid_prop.GetDistanceInLayer(layer_distance)

        # Each group is allocated in full before any of its copies start.
        upload_groups = (
            (
                ("d_numLayers", layer_counts),
                ("d_densityInLayer", layer_density),
                ("d_distanceInLayer", layer_distance),
            ),
            (
                ("d_ecen_fine", self.ecen_fine),
                ("d_czcen_fine", self.czcen_fine),
            ),
        )
        for group in upload_groups:
            for attr_name, host_array in group:
                setattr(self, attr_name, cuda.mem_alloc(host_array.nbytes))
            for attr_name, host_array in group:
                cuda.memcpy_htod(getattr(self, attr_name), host_array)
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
  '''
  Evaluate the blob-blob forces with the "calc_blob_blob_force" kernel.

  Reads periodic_length, repulsion_strength, debye_length and blob_radius
  from kwargs and returns an array of shape (number of blobs, 3) holding
  the kernel output.
  '''
  # Launch configuration
  n_blobs = np.int32(len(r_vectors))
  threads_per_block, num_blocks = set_number_of_threads_and_blocks(n_blobs)

  # Interaction parameters
  box = kwargs.get('periodic_length')
  strength = kwargs.get('repulsion_strength')
  debye = kwargs.get('debye_length')
  radius = kwargs.get('blob_radius')

  # Flat host buffers for the positions and the forces
  positions = np.reshape(r_vectors, n_blobs * 3)
  forces = np.empty_like(positions)

  # Device buffers; only the positions need uploading
  positions_gpu = cuda.mem_alloc(positions.nbytes)
  forces_gpu = cuda.mem_alloc(forces.nbytes)
  cuda.memcpy_htod(positions_gpu, positions)

  kernel = mod.get_function("calc_blob_blob_force")

  # Scalars go to the kernel as doubles, in the order it declares them
  scalars = tuple(np.float64(value)
                  for value in (strength, debye, radius, box[0], box[1], box[2]))
  kernel_args = (positions_gpu, forces_gpu) + scalars + (n_blobs,)
  kernel(*kernel_args, block=(threads_per_block, 1, 1), grid=(num_blocks, 1))

  # Fetch the forces and restore the per-blob layout
  cuda.memcpy_dtoh(forces, forces_gpu)
  return np.reshape(forces, (n_blobs, 3))
Beispiel #11
0
    def __init__(self, max_size, offsets=None):
        """
        Create a sorter. The sorter will hold on to internal resources for as
        long as it is alive, including an 'offsets' array of size 4*max_size.
        To share this cost, you may pass in an array of at least this size to
        __init__ (to, for instance, share across different bit-widths in a
        multi-pass sort).

        ``max_size`` must be a multiple of ``self.group_size``.
        """
        self.init_mod()
        self.max_size = max_size
        assert max_size % self.group_size == 0, \
            "max_size must be a multiple of group_size"
        # Floor division keeps the group count (and the allocation sizes
        # derived from it) an int on Python 3 as well.
        max_grids = max_size // self.group_size

        if offsets is None:
            self.doffsets = cuda.mem_alloc(self.max_size * 4)
        else:
            self.doffsets = offsets
        self.dpfxs = cuda.mem_alloc(max_grids * self.radix_size * 4)
        self.dlocals = cuda.mem_alloc(max_grids * self.radix_size * 4)

        # There are probably better ways to choose how many condensation
        # groups to launch. TODO: maybe pick one if I care
        self.ncond = 32
        self.dcond = cuda.mem_alloc(self.radix_size * self.ncond * 4)
        self.dglobal = cuda.mem_alloc(self.radix_size * 4)
Beispiel #12
0
    def __init__(self, init_data, n_generators):
        """Create the CUDA context, device buffers, kernels and RNG states.

        ``init_data`` is uploaded unchanged; a zeroed ``float32`` array of
        the same shape and a zeroed ``float32`` vector with one entry per
        row of ``init_data`` are uploaded next to it.  ``n_generators``
        cuRAND XORWOW states are allocated and seeded through the
        ``init_rng`` kernel.  Free device memory is printed along the way.
        """

        def report_free_memory():
            # Print how much global device memory is still available.
            (free, total) = cuda.mem_get_info()
            print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
            print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        self.ctx = curr_gpu.make_context()
        self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
        report_free_memory()

        # ---- device copies of the input and work buffers --------------------
        self.width_mat = np.int32(init_data.shape[0])
        self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
        cuda.memcpy_htod(self.gpu_init_data, init_data)

        self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
        print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
        report_free_memory()

        self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
        cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)

        self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
        self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
        cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)

        # ---- kernel handles --------------------------------------------------
        get_kernel = self.module.get_function
        self.init_rng = get_kernel("init_rng")
        self.gen_rand_mat = get_kernel("gen_rand_mat")
        self.sum_along_axis = get_kernel("sum_along_axis")
        self.norm_along_axis = get_kernel("norm_along_axis")
        self.init_vect_sum = get_kernel("init_vect_sum")
        self.copy_mat = get_kernel("copy_mat")

        # ---- random number generator states ---------------------------------
        self.n_generators = n_generators
        seed = 1
        state_bytes = characterize.sizeof(
            "curandStateXORWOW", "#include <curand_kernel.h>"
        )
        self.rng_states = cuda.mem_alloc(n_generators * state_bytes)
        self.init_rng(
            np.int32(n_generators),
            self.rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(n_generators // 64 + 1, 1),
        )
        cuda.mem_get_info()

        # ---- launch geometry for the matrix kernels -------------------------
        tile_x = 32
        tile_y = 32
        width = int(self.width_mat)
        self.grid = (width // tile_x + 1, width // tile_y + 1, 1)
        self.block = (tile_x, tile_y, 1)
Beispiel #13
0
def confirmInitialization(featuresForSOM,somMatrix):
    """Plot how many training samples hit each SOM node.

    For up to ``somMatrixRows * somMatrixColumns`` samples (or all of them,
    if there are fewer) the best matching unit is found with the
    ``computeBMU`` kernel and a per-node hit count is kept.  The counts are
    shown as an image: black = 0, green = 1, blue <= 10, red <= 100,
    white > 100.

    NOTE(review): ``somMatrixRows``, ``somMatrixColumns``, ``features``,
    ``MAX_FEAT``, ``INF``, ``metric``, ``blk`` and ``computeBMU`` are not
    parameters and are resolved from the enclosing module.  Samples are
    fetched with ``featuresForSOM.loc[i]`` for ``i`` in ``range(...)``,
    which presumably assumes a default integer index -- confirm.

    Args:
        featuresForSOM: pandas DataFrame, one sample per row.
        somMatrix: 2-D numpy array holding the SOM; uploaded to the device.
    """
    # Upload the SOM and describe its layout to the kernel.
    somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes)
    somBytesPerRow = np.int32(somMatrix.strides[0])
    somNumberOfRows = np.int32(somMatrix.shape[0])
    somNumberOfColumns = np.int32(somMatrix.shape[1])
    pycuda.memcpy_htod(somMatrixPtr, somMatrix)

    # Per-row best distance and the column index where it was found.
    bmu = np.zeros(somMatrixRows, dtype=np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr, bmu)
    bmuIndex = np.zeros(somMatrixRows, dtype=np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr, bmuIndex)

    # NOTE(review): neither offset is used below; the lookups are kept
    # because get_loc raises KeyError when a column is missing.
    intraDayOffset = features.columns.get_loc('Ret_121')
    dayOffset = features.columns.get_loc('Ret_PlusOne')

    nodeHitMatrix = np.zeros((somMatrixRows, somMatrixColumns, 3), dtype=np.float32)
    hitCountDict = defaultdict(list)
    featureCount = np.int32(len(featuresForSOM.columns))

    nSamples = min(somMatrixRows * somMatrixColumns, len(featuresForSOM))
    for i in range(nSamples):
        # .values replaces DataFrame/Series.as_matrix(), which newer pandas
        # releases no longer provide.
        feats = featuresForSOM.loc[i].values.astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr, feats)
        # find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, featureCount,
                   somBytesPerRow, somNumberOfRows, somNumberOfColumns,
                   np.float32(MAX_FEAT), np.float32(INF), np.int32(metric),
                   block=(blk, 1, 1), grid=(somNumberOfRows, 1))
        pycuda.memcpy_dtoh(bmu, bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex, bmuIndexPtr)
        block = np.argmin(bmu)
        thread = bmuIndex[block]

        # Each entry is [hit count, index of the first sample that hit].
        node = (block, thread)
        if not hitCountDict[node]:
            hitCountDict[node] = [1, i]
        else:
            hitCountDict[node][0] += 1
        hits = hitCountDict[node][0]

        # Colour code as (r, g, b) in [0, 1].
        if hits == 1:
            colour = [0.0, 1.0, 0.0]
        elif hits <= 10:
            colour = [0.0, 0.0, 1.0]
        elif hits <= 100:
            colour = [1.0, 0.0, 0.0]
        else:
            colour = [1.0, 1.0, 1.0]
        nodeHitMatrix[block][thread] = colour

    fig20 = plt.figure(20, figsize=(6*3.13, 4*3.13))
    fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20)
    plt.subplot(111)
    plt.imshow(nodeHitMatrix, interpolation="none")
    plt.show()
    plt.pause(0.1)
Beispiel #14
0
    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Install the reference SMILES set on the GPU.

        *refsmilesmat* is the Lingo matrix, *refcountsmat* the matching count
        matrix and *reflengths* the length vector.  When *refmags* is given it
        is uploaded as the magnitude vector; otherwise the magnitudes are
        computed on the GPU from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat*
        and *refcountsmat*) must have no more than 32,768 rows (molecules) and
        65,536 columns (Lingos). Larger computations must be performed in tiles.
        A ``ValueError`` is raised when the padded Lingo matrix exceeds this.
        """
        gpu = self.gpu

        # Lingo and count matrices, bound to the two texture references #{{{
        if not self.usePycudaArray:
            # Linear device memory, bound to the textures by hand
            padded_lingos = self._padded_array(refsmilesmat)
            too_wide = padded_lingos.shape[1] > 65536
            if too_wide or padded_lingos.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%padded_lingos.shape)
            gpu.rsmiles = cuda.mem_alloc(padded_lingos.nbytes)
            cuda.memcpy_htod_async(gpu.rsmiles, padded_lingos, stream=gpu.stream)

            padded_counts = self._padded_array(refcountsmat)
            gpu.rcounts = cuda.mem_alloc(padded_counts.nbytes)
            cuda.memcpy_htod_async(gpu.rcounts, padded_counts, stream=gpu.stream)

            # One descriptor, sized from the count matrix, serves both textures
            tex_desc = cuda.ArrayDescriptor()
            tex_desc.width = padded_counts.shape[1]
            tex_desc.height = padded_counts.shape[0]
            tex_desc.format = cuda.array_format.UNSIGNED_INT32
            tex_desc.num_channels = 1
            gpu.tex2lr.set_address_2d(gpu.rsmiles, tex_desc, padded_lingos.strides[0])
            gpu.tex2cr.set_address_2d(gpu.rcounts, tex_desc, padded_counts.strides[0])
            # The host copies may only be dropped once the uploads are done
            gpu.stream.synchronize()
            del padded_lingos
            del padded_counts
        else:
            # PyCUDA's CUDA array support does the layout work
            gpu.rsmiles = cuda.matrix_to_array(refsmilesmat, order='C')
            gpu.rcounts = cuda.matrix_to_array(refcountsmat, order='C')
            gpu.tex2lr.set_array(gpu.rsmiles)
            gpu.tex2cr.set_array(gpu.rcounts)
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Reference lengths go to the device as they are
        gpu.rl_gpu = cuda.to_device(reflengths)

        # Magnitude buffer, sized like the length vector
        gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is None:
            # No magnitudes supplied: derive them on the GPU
            threads_per_block = 256
            gpu.refMagKernel(gpu.rmag_gpu, gpu.rl_gpu, numpy.int32(self.nref),
                             block=(threads_per_block, 1, 1), grid=(30, 1),
                             shared=threads_per_block*4, texrefs=[gpu.tex2cr])
        else:
            cuda.memcpy_htod(gpu.rmag_gpu, refmags)
        return
Beispiel #15
0
def computeAvgDistancetoBMU(currentIter,iterationDistance, features, nodeHitMatrix, somMatrixPtr, somMatrix, featureStatsMatrix, featuresPtr, featureCount, somBytesPerRow, somNumberOfRows, somNumberOfColumns):
    """Return the mean distance from a set of samples to their BMU.

    The ``computeBMU`` kernel is run for each sampled row of
    ``featuresForSOM``.  Along the way ``nodeHitMatrix`` is reset and
    refilled with a colour per node according to its hit count (green = 1,
    blue <= 10, red <= 100, white > 100), the module-level ``hitCountDict``
    is rebuilt, and the mean distance and ``currentIter`` are appended to
    ``iterationDistance`` and ``iterationCount``.

    Sample count: all rows when there are fewer than 100 or when
    ``currentIter`` is not below the row count; otherwise
    ``int(currentIter)`` rows, or ``min(rows * columns, row count)`` when
    that is zero.

    NOTE(review): ``featuresForSOM``, ``hitCountDict``, ``iterationCount``,
    ``somMatrixRows``, ``MAX_FEAT``, ``INF``, ``metric``, ``blk`` and
    ``computeBMU`` are resolved from the enclosing module.  The parameters
    ``features``, ``somMatrix``, ``featureStatsMatrix`` and ``featuresPtr``
    are not read.  An empty ``featuresForSOM`` leads to a division by zero.

    Returns:
        The cumulative BMU distance divided by the number of samples.
    """
    adjustNodes = {}
    cumDistance = 0.0
    nodeHitMatrix.fill(0)
    hitCountDict.clear()

    nFeatures = len(featuresForSOM)
    if nFeatures >= 100 and currentIter < nFeatures:
        sampSize = int(currentIter)
        if sampSize == 0:
            sampSize = min(somNumberOfRows*somNumberOfColumns, nFeatures)
    else:
        sampSize = nFeatures

    # Per-row best distance and the column index where it was found.
    bmu = np.zeros(somMatrixRows, dtype=np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr, bmu)
    bmuIndex = np.zeros(somMatrixRows, dtype=np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr, bmuIndex)

    for i in range(0, sampSize):
        # .values replaces DataFrame/Series.as_matrix(), which newer pandas
        # releases no longer provide.
        feats = featuresForSOM.loc[i].values.astype(np.float32)
        sampleFeaturesPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(sampleFeaturesPtr, feats)
        # find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, sampleFeaturesPtr,
                   np.int32(featureCount), somBytesPerRow, somNumberOfRows,
                   somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),
                   np.int32(metric), block=(blk, 1, 1), grid=(somNumberOfRows, 1))
        pycuda.memcpy_dtoh(bmu, bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex, bmuIndexPtr)
        cumDistance += np.min(bmu)
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        adjustNodes[i] = [block, thread]

        # Each entry is [hit count, index of the first sample that hit].
        node = (block, thread)
        if not hitCountDict[node]:
            hitCountDict[node] = [1, i]
        else:
            hitCountDict[node][0] += 1
        hits = hitCountDict[node][0]

        # Colour code as (r, g, b) in [0, 1].
        if hits == 1:
            colour = [0.0, 1.0, 0.0]
        elif hits <= 10:
            colour = [0.0, 0.0, 1.0]
        elif hits <= 100:
            colour = [1.0, 0.0, 0.0]
        else:
            colour = [1.0, 1.0, 1.0]
        nodeHitMatrix[block][thread] = colour

    avgDistance = cumDistance/sampSize
    iterationDistance.append(avgDistance)
    iterationCount.append(currentIter)
    return avgDistance
Beispiel #16
0
	def gfx_init( self ) :
		"""Compile the shader, create the GL buffer and set up the CUDA side.

		The 'shad/balls' program is compiled and its uniform locations are
		cached.  A GL array buffer is filled with self.pos.  Two device
		buffers the size of self.f are allocated (the first initialised from
		self.f, the second zeroed) and the three kernels are loaded from
		'lbm_kernel.cubin' and prepared.

		If shader compilation raises ValueError the process exits with a
		non-zero status.

		NOTE(review): the zero fill writes NUM*Q 32-bit words, which presumably
		matches self.f.nbytes for a float32 array -- confirm.  The block is
		(1, 1, BOX), so BOX is bounded by the device's z block limit.
		"""
		try :
			print('compiling')
			self.prog = sh.compile_program_vfg( 'shad/balls' )

			print('compiled')

			self.loc_mmv = sh.get_loc(self.prog,'modelview' )
			self.loc_mp  = sh.get_loc(self.prog,'projection')
			self.l_color = sh.get_loc(self.prog,'color'     )
			self.l_size  = sh.get_loc(self.prog,'ballsize'  )

		except ValueError as ve :
			print("Shader compilation failed: " + str(ve))
			# A failed compile is an error; do not report success to the caller.
			sys.exit(1)

		#
		# cuda init
		#
		self.grid = (int(self.BOX),int(self.BOX))
		self.block = (1,1,int(self.BOX))

		print('CUDA: block %s , grid %s' % (str(self.block),str(self.grid)))

		self.gpos = glGenBuffers(1)
		glBindBuffer( GL_ARRAY_BUFFER , self.gpos )
		glBufferData( GL_ARRAY_BUFFER , self.pos.nbytes, self.pos, GL_STREAM_DRAW )
		glBindBuffer( GL_ARRAY_BUFFER , 0 )

		self.df1 = cuda_driver.mem_alloc( self.f.nbytes )
		self.df2 = cuda_driver.mem_alloc( self.f.nbytes )

		cuda_driver.memcpy_htod( self.df1 , self.f )
		cuda_driver.memset_d32( self.df2 , 0 , self.NUM*self.Q )

		mod = cuda_driver.module_from_file( 'lbm_kernel.cubin' )

		self.collision = mod.get_function("collision_step")
		self.collision.prepare( "Piii" )

		self.streaming = mod.get_function("streaming_step")
		self.streaming.prepare( "PPiii" )

		self.colors = mod.get_function("colors")
		self.colors.prepare( "PPiii" )
Beispiel #17
0
    def __init__(self, filename):
        """Load the patterns from ``filename`` and stage them on the GPU.

        Raises ``ValueError`` unless the number of patterns is a multiple of
        1024.  Besides the uploaded patterns, an input buffer and a
        ``(40, n)`` ``float32`` result array are allocated on the device.
        """
        Pattern.__init__(self, filename)

        # The low ten bits are all clear exactly for multiples of 1024.
        leftover = self.n & 1023
        if leftover != 0:
            raise ValueError('Number of patterns must be a multiple of 1024.')

        pattern_bytes = self.patterns.nbytes
        self.patterns_gpu = cuda.mem_alloc(pattern_bytes)
        cuda.memcpy_htod(self.patterns_gpu, self.patterns)

        input_bytes = 4 * (40 * 8 + 16)
        self.input_gpu = cuda.mem_alloc(input_bytes)
        result_shape = (40, self.n)
        self.result_gpu = gpuarray.empty(result_shape, dtype=numpy.float32,
                                         allocator=cuda.mem_alloc)
Beispiel #18
0
    def _initME(self):
        """Initializes the MotionEnergy CUDA functions.

        Registers the device kernels and filter globals of ``mod`` as
        attributes of the same name, and allocates the device buffers, each
        sized as a multiple of ``self.szXY`` (the byte size of nrX*nrY
        floats).  The stimulus frame buffer is zero-initialised.
        """
        logging.debug('initME')

        # register all device functions for easy access
        # imported from motion_energy_device.py
        kernel_names = (
            "dev_conv1",
            "dev_convn",
            "dev_accumDiffStims",
            "dev_filt2dir",
            "dev_edges",
            "dev_fullRect2",
            "dev_mean3",
            "dev_normalize",
            "dev_split_gray",
            "dev_split_RGB",
            "dev_sub",
            "dev_ave",
            "dev_sum",
            "dev_scaleHalfRect",
            "dev_scale",
            "dev_memcpy_dtod",
        )
        for kernel_name in kernel_names:
            setattr(self, kernel_name, mod.get_function(kernel_name))

        # for quick access: the size in bytes of nrX*nrY floats
        self.szXY = self.sizeofFloat * self.nrX * self.nrY

        # V1 filter responses
        self.d_resp = cuda.mem_alloc(self.szXY*self.nrFilters*self.nrScales)

        # V1 complex cell responses
        self.d_respV1c = cuda.mem_alloc(self.szXY*self.nrDirs)

        # stim frame
        self.d_stim = cuda.mem_alloc(self.szXY*self.nrC)

        # stim frame buffer (last nrT frames)
        self.d_stimBuf = cuda.mem_alloc(self.szXY*self.nrT)
        # I'm not sure if memset_d32 works as expected here... for now,
        # memcpy an array of zeros
        zeros = np.zeros(self.nrX*self.nrY*self.nrT, dtype=np.float32)
        cuda.memcpy_htod(self.d_stimBuf, zeros)

        self.d_diffV1GausBufT = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize)

        self.d_scalingStimBuf = cuda.mem_alloc(self.szXY*self.nrT)
        self.d_v1GausBuf = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize)
        self.d_diffV1GausBuf = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize)
        self.d_pop = cuda.mem_alloc(self.szXY*self.nrScales)

        # get_global returns a tuple; only its first entry is kept
        global_names = (
            "d_scalingFilt",
            "d_v1GaussFilt",
            "d_complexV1Filt",
            "d_normV1filt",
            "d_diff1filt",
            "d_diff2filt",
            "d_diff3filt",
        )
        for global_name in global_names:
            setattr(self, global_name, mod.get_global(global_name)[0])
    def stepN(self,positions,velocities,n):
        """Advance the particles with the ``cuBoris`` kernel, in place.

        ``positions`` and ``velocities`` are uploaded, the kernel is launched
        with ``n`` as its third argument, and both arrays are overwritten
        with the device results.  Nothing is returned.

        Args:
            positions: numpy array of particle positions (updated in place).
            velocities: numpy array of particle velocities (updated in place).
            n: passed to the kernel as ``int32``.
                NOTE(review): presumably the number of steps -- confirm
                against the kernel source.
        """
        import numpy as np

        x_gpu = cuda.mem_alloc(positions.nbytes)
        v_gpu = cuda.mem_alloc(velocities.nbytes)

        cuda.memcpy_htod(x_gpu,positions)
        cuda.memcpy_htod(v_gpu,velocities)

        # Floor division keeps the block count an int on Python 3, where
        # "/" would produce a float that cannot be used as a grid dimension.
        n_blocks = int(self.numParts) // 1024 + 1
        self.cuBoris(x_gpu, v_gpu, np.int32(n), block=(1024,1,1), grid=(n_blocks,1))

        cuda.memcpy_dtoh(positions,x_gpu)
        cuda.memcpy_dtoh(velocities,v_gpu)
Beispiel #20
0
def CudaRPN(inPath, outPath, mycode, mydata, **kw):
    """CudaRPN implements the interface to the CUDA run environment.

    The image at ``inPath`` is loaded as ``float32``, ``mycode`` / ``mydata``
    are assembled into a ``Function``, the generated kernel source is
    written to ``RPN_sourceCode.c``, compiled and run with one thread per
    pixel, and the result is saved to ``outPath`` as an RGB image.

    Keyword args:
        verbose: when true, print the target image and the collected
            timings at the end (default False).
        handcode: forwarded to ``Function``.

    NOTE(review): ``assemble`` / ``disassemble`` are always called with
    ``verbose=True`` regardless of the ``verbose`` keyword -- confirm that
    this is intended.
    """
    verbose = kw.get('verbose', False)
    BLOCK_SIZE = 1024  # Kernel grid and block size
    STACK_SIZE = 64
    function = Function(
        start=len(hardcase),
        bss=64,
        handcode=kw.get('handcode'))

    with Timing('Total execution time'):
        with Timing('Get and convert image data to gpu ready'):
            im = Image.open(inPath)
            px = array(im).astype(float32)
            function.assemble(mycode, mydata, verbose=True)
            function.disassemble(verbose=True)
            cx = array(function.final).astype(int32)
            dx = array(function.data).astype(float32)
        with Timing('Allocate mem to gpu'):
            d_px = mem_alloc(px.nbytes)
            memcpy_htod(d_px, px)
            d_cx = mem_alloc(cx.nbytes)
            memcpy_htod(d_cx, cx)
            d_dx = mem_alloc(dx.nbytes)
            memcpy_htod(d_dx, dx)
        with Timing('Kernel execution time'):
            n_pixels = im.size[0] * im.size[1]
            block = (BLOCK_SIZE, 1, 1)
            checkSize = int32(n_pixels)
            # One spare block covers the remainder; checkSize is handed to
            # the kernel alongside it.
            grid = (n_pixels // BLOCK_SIZE + 1, 1, 1)
            kernel = INCLUDE + HEAD + function.body + convolve + TAIL
            sourceCode = kernel % {
                'pixelwidth': 3,
                'stacksize': STACK_SIZE,
                'case': function.case}
            # Keep a copy of the generated source for inspection.
            with open("RPN_sourceCode.c", "w") as target:
                target.write(sourceCode + "\n")
            module = SourceModule(sourceCode)
            func = module.get_function("RPN")
            func(d_px, d_cx, d_dx, checkSize, block=block, grid=grid)
        with Timing('Get data from gpu and convert'):
            RPNPx = empty_like(px)
            memcpy_dtoh(RPNPx, d_px)
            RPNPx = uint8(RPNPx)
        with Timing('Save image time'):
            pil_im = Image.fromarray(RPNPx, mode="RGB")
            pil_im.save(outPath)
    # Output final statistics
    if verbose:
        print('%40s: %s%s' % ('Target image', outPath, im.size))
        print(Timing.text)
 def add(slice_a, slice_b):
     """Run ``func`` on two slices on the GPU and time the kernel.

     Both inputs are uploaded, ``func`` is launched with a single
     BLOCK_SIZE x BLOCK_SIZE block, and the third device buffer is copied
     back into an array shaped like ``slice_a``.

     NOTE(review): no grid is given, so only one block runs; the slices
     presumably fit within BLOCK_SIZE x BLOCK_SIZE elements -- confirm.

     Returns:
         Tuple ``(slice_c, seconds)``: the result array and the wall-clock
         time of the kernel, excluding the host/device copies.
     """
     slice_c = np.empty_like(slice_a)
     a_gpu = cuda.mem_alloc(slice_a.nbytes)
     cuda.memcpy_htod(a_gpu, slice_a)
     b_gpu = cuda.mem_alloc(slice_b.nbytes)
     cuda.memcpy_htod(b_gpu, slice_b)
     c_gpu = cuda.mem_alloc(slice_c.nbytes)
     start = time.time()
     func(a_gpu, b_gpu, c_gpu, block=(BLOCK_SIZE, BLOCK_SIZE, 1))
     # A kernel launch returns before the kernel has finished; wait for it
     # so the interval covers the execution and not just the launch call.
     cuda.Context.synchronize()
     end = time.time()
     cuda.memcpy_dtoh(slice_c, c_gpu)
     return (slice_c, end-start)
Beispiel #22
0
    def advect(self, b, d, d_0, u, v, dt):
        """Advect ``d_0`` along ``(u, v)`` on the GPU and store it in ``d``.

        The interiors of ``u`` and ``v`` (border cells stripped) and the
        whole of ``d_0`` are uploaded, ``self.func_easy`` is launched with
        one thread per interior cell in a single block, and the kernel
        output is written into the interior of ``d``.  Finally
        ``self.set_boundary(b, d)`` is applied.

        NOTE(review): ``dt`` is accepted but not passed to the kernel.  The
        single (size-2) x (size-2) block limits the grid to the device's
        maximum threads per block.  The kernel is assumed to write every
        interior cell, since the output buffer is not initialised from ``d``.

        Args:
            b: boundary selector forwarded to ``set_boundary``.
            d: 2-D field of shape ``(self.size, self.size)``; interior is
                overwritten with the result.
            d_0: 2-D source field of the same shape.
            u: 2-D field; its interior is uploaded as the first argument.
            v: 2-D field; its interior is uploaded as the second argument.
            dt: unused.
        """
        size = self.size - 2
        n_inner = size**2

        bX = size
        bY = size
        gX = 1
        gY = 1

        u_gpu = cuda.mem_alloc(u[1:-1, 1:-1].nbytes)
        cuda.memcpy_htod(u_gpu, u[1:-1, 1:-1].reshape(n_inner))

        if self.debug and not np.array_equal(u, np.zeros((self.size, self.size))):
            print(">>> U")
            print(u)
            print("<<< U")

        v_gpu = cuda.mem_alloc(v[1:-1, 1:-1].nbytes)
        cuda.memcpy_htod(v_gpu, v[1:-1, 1:-1].reshape(n_inner))

        d_0_gpu = cuda.mem_alloc(d_0.nbytes)
        cuda.memcpy_htod(d_0_gpu, d_0.reshape(self.size**2))

        # The interior of d is a strided view, so flattening it yields a
        # copy.  Keep a handle on that copy: it is what cuda.Out fills, and
        # it has to be written back into d explicitly afterwards.
        d_inner = np.ascontiguousarray(d[1:-1, 1:-1]).reshape(n_inner)
        d_gpu = cuda.Out(d_inner)

        if self.debug:
            print(">>> Entry >>>")
            print(d[1:-1, 1:-1])
            print("<<< Kernel Launch <<<")

        self.func_easy(u_gpu, v_gpu, d_gpu, d_0_gpu,
                       block=(bX, bY, 1), grid=(gX, gY))

        d[1:-1, 1:-1] = d_inner.reshape(size, size)

        if self.debug:
            print(">>> Result >>>")
            print(d[1:-1, 1:-1])
            print("<<< End Result <<<")

        self.set_boundary(b, d)
def blobs_potential(r_vectors, *args, **kwargs):
  '''
  Compute the total energy of the blobs on the GPU.

  The positions are flattened to a vector of length 3 * N, uploaded, and the
  CUDA function "potential_from_position_blobs" (taken from the module-level
  `mod`) writes one value per blob; the return value is the sum of them.

  r_vectors: sequence of N blob positions (3 coordinates each).
  kwargs read: periodic_length (indexable, entries 0 and 1 are used),
  debye_length_wall, repulsion_strength_wall, debye_length,
  repulsion_strength, weight, blob_radius. All are passed to the kernel as
  float64 scalars.

  Returns 0.0 when r_vectors is empty (nothing to launch).
  '''

  number_of_blobs = np.int32(len(r_vectors))
  if number_of_blobs == 0:
    # A zero-byte device allocation is not valid; no blobs means no energy.
    return 0.0

  # Determine number of threads and blocks for the GPU
  threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

  # Get parameters from arguments
  periodic_length = kwargs.get('periodic_length')
  debye_length_wall = kwargs.get('debye_length_wall')
  eps_wall = kwargs.get('repulsion_strength_wall')
  debye_length = kwargs.get('debye_length')
  eps = kwargs.get('repulsion_strength')
  weight = kwargs.get('weight')
  blob_radius = kwargs.get('blob_radius')

  # Flatten the positions. Every scalar handed to the kernel is a float64,
  # so force the position buffer to contiguous float64 as well; a float32 or
  # strided input would otherwise be uploaded with the wrong layout/size.
  x = np.ascontiguousarray(np.reshape(r_vectors, number_of_blobs * 3),
                           dtype=np.float64)

  # Allocate CPU memory for the per-blob results
  U = np.empty(number_of_blobs, dtype=np.float64)

  # Allocate GPU memory
  x_gpu = cuda.mem_alloc(x.nbytes)
  u_gpu = cuda.mem_alloc(U.nbytes)

  # Copy data to the GPU (host to device)
  cuda.memcpy_htod(x_gpu, x)

  # Get pair interaction function
  potential_from_position_blobs = mod.get_function("potential_from_position_blobs")

  # Compute pair interactions
  potential_from_position_blobs(x_gpu, u_gpu,
                                number_of_blobs,
                                np.float64(periodic_length[0]),
                                np.float64(periodic_length[1]),
                                np.float64(debye_length_wall),
                                np.float64(eps_wall),
                                np.float64(debye_length),
                                np.float64(eps),
                                np.float64(weight),
                                np.float64(blob_radius),
                                block=(threads_per_block, 1, 1),
                                grid=(num_blocks, 1))

  # Copy data from GPU to CPU (device to host)
  cuda.memcpy_dtoh(U, u_gpu)
  return np.sum(U)
Beispiel #24
0
 def prepare(self, P):
     """Allocate state buffers and compile one ``applyFun`` kernel per nonzero equation.

     Fills ``index_to_varname``, ``varname_to_index`` and ``index_nonzero``
     from ``self.eqs``, creates page-locked host arrays and device buffers of
     shape ``(n, var_len)`` (float64), and stores a prepared ``applyFun``
     kernel in ``self.applyFun`` for every index in ``self.index_nonzero``.
     """
     eqs = self.eqs
     n = len(P.state_(eqs._diffeq_names_nonzero[0]))
     # One column per distinct variable name, +1 needed to store t
     var_len = len(dict.fromkeys(eqs._diffeq_names)) + 1

     # Build the index <-> name lookups and remember the nonzero equations.
     for position, name in enumerate(eqs._diffeq_names):
         self.index_to_varname.append(name)
         self.varname_to_index[name] = position
         if name in eqs._diffeq_names_nonzero:
             self.index_nonzero.append(position)

     # Page-locked host arrays.
     shape = (n, var_len)
     self.S_in = cuda.pagelocked_zeros(shape, numpy.float64)
     self.S_out = cuda.pagelocked_zeros(shape, numpy.float64)

     # Device buffers; A and B start out zero-filled.
     nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize
     self.S_in_gpu = cuda.mem_alloc(nbytes)
     self.S_out_gpu = cuda.mem_alloc(nbytes)

     host_zeros = zeros(shape)
     self.A_gpu = cuda.mem_alloc(nbytes)
     cuda.memcpy_htod(self.A_gpu, host_zeros)
     self.B_gpu = cuda.mem_alloc(nbytes)
     cuda.memcpy_htod(self.B_gpu, host_zeros)
     self.S_temp_gpu = cuda.mem_alloc(nbytes)

     # The argument list passed to f() is the same for every equation: the
     # var_len entries of the current block's row of S_temp.
     call_args = ",".join(
         ["S_temp["+str(i)+" + blockIdx.x * var_len]" for i in xrange(var_len)])

     compiled = {}
     self.applyFun = {}
     for x in self.index_nonzero:
         rhs_source = eqs._function_C_String[self.index_to_varname[x]]
         # The kernel evaluates f with S_temp[idx] set to 0 and then to 1,
         # stores the results in B and A as written below, and finally
         # restores S_temp[idx] from S_in.
         compiled[x] = SourceModule("""
             __device__ double f"""+ rhs_source +"""
             
             __global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len)
             { 
                 
                 int idx = x + blockIdx.x * var_len;
                 S_temp[idx] = 0;
                 B[idx] = f("""+call_args+""");
                 S_temp[idx] = 1;
                 A[idx] = f("""+call_args+""") - B[idx];
                 B[idx] /= A[idx];
                 S_temp[idx] = S_in[idx];
             }
             """)
         kernel = compiled[x].get_function("applyFun")
         self.applyFun[x] = kernel
         kernel.prepare(['P','P','P','P','i','i'],block=(1,1,1))

     self.calc_dict = {}
     self.already_calc = {}
Beispiel #25
0
	def processor(frame):
		"""Run ``filter_kernel`` on ``frame`` on the GPU.

		The frame is uploaded to both the input and the output device buffer,
		the kernel is launched with ``b_size`` / ``g_size``, and the output
		buffer is copied back over ``frame``, which is also returned.
		"""
		nbytes = frame.nbytes

		# device buffers: kernel input and kernel output
		d_frame_in = cu.mem_alloc(nbytes)
		d_frame_out = cu.mem_alloc(nbytes)

		# host -> device; the output buffer starts as a copy of the input
		cu.memcpy_htod(d_frame_in, frame)
		cu.memcpy_htod(d_frame_out, frame)

		filter_kernel(d_frame_in, d_frame_out, block=b_size, grid= g_size)

		# device -> host, overwriting the caller's frame
		cu.memcpy_dtoh(frame, d_frame_out)
		return frame
Beispiel #26
0
def execute(positions, num_particles, num_frames):
    """Run ``num_frames`` launches of the ``runframe`` kernel and collect positions.

    Args:
        positions: Initial positions; converted to a float32 array and
            uploaded to the device.
        num_particles: Number of particles. Sizes the velocity buffer
            (2 float32 values per particle) and the in-bounds flag buffer
            (1 byte per particle) and is passed to the kernel.
        num_frames: Number of kernel launches.

    Returns:
        list: One float32 array per frame, each a copy of the positions read
        back from the device after that launch. (Earlier versions built this
        list but returned nothing.)
    """
    # Host positions
    cpuPos = numpy.array(positions, dtype=numpy.float32)
    float_size = numpy.dtype(numpy.float32).itemsize
    # numpy.bool_ replaces the removed numpy.bool8 alias (same 1-byte type).
    flag_size = numpy.dtype(numpy.bool_).itemsize

    devPos = cuda.mem_alloc(cpuPos.nbytes)
    devVels = None
    devInBounds = None
    try:
        cuda.memcpy_htod(devPos, cpuPos)

        # Device velocities, zero-initialised
        devVels = cuda.mem_alloc(2 * num_particles * float_size)
        cuda.memset_d32(devVels, 0, 2 * num_particles)

        # Device in-bounds flags, every byte initialised to 1 (True)
        devInBounds = cuda.mem_alloc(num_particles * flag_size)
        cuda.memset_d8(devInBounds, True, num_particles)

        grid_dim = ((num_particles // NUM_THREADS) + 1, 1)
        print(grid_dim)
        runframe = module.get_function("runframe")

        frames = []
        for _ in range(num_frames):
            runframe(devPos, devVels, devInBounds,
                     numpy.int32(num_particles),
                     grid=grid_dim,
                     block=(NUM_THREADS, 1, 1))
            # Read the positions back and keep a snapshot for this frame
            cuda.memcpy_dtoh(cpuPos, devPos)
            frames.append(cpuPos.copy())
    finally:
        # Release device memory even if an upload or launch failed.
        for buf in (devPos, devVels, devInBounds):
            if buf is not None:
                buf.free()

    return frames
Beispiel #27
0
    def _gpuAlloc(self):
        """Size and allocate the page-locked host buffers and the GPU arrays.

        Sets ``freeMem``, ``maxPossRows``, ``to_gpu_buffer``,
        ``from_gpu_buffer``, ``data_gpu`` and ``result_gpu``. The row count is
        capped by ``self.totalRows``.
        """
        # Memory budget: 0.5 * 0.8 = 40% of the currently free device memory.
        # (Presumably halved because two equally sized device arrays are
        # allocated below, each then limited to 80% of its share.)
        self.freeMem = cuda.mem_get_info()[0] * .5 * .8
        # 4 bytes per float32 element.  Plain int() replaces np.int, which was
        # an alias of the builtin and has been removed from NumPy.
        self.maxPossRows = int(np.floor(self.freeMem / (4 * self.totalCols)))
        # set max rows to smaller number to save memory usage
        if self.totalRows < self.maxPossRows:
            print("reducing max rows to reduce memory use on GPU")
            self.maxPossRows = self.totalRows

        # create pagelocked buffers and GPU arrays
        self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
        self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
Beispiel #28
0
    def test_multi_context(self):
        """Allocate memory in two contexts and release both allocations.

        Skipped when the driver version is below 2.0.0, or when it is at
        least 2.2.0 and the current device is in EXCLUSIVE compute mode.
        """
        if drv.get_version() < (2,0,0):
            return
        if (drv.get_version() >= (2,2,0)
                and drv.Context.get_device().compute_mode
                == drv.compute_mode.EXCLUSIVE):
            return

        # One allocation before and one after creating the second context.
        first_alloc = drv.mem_alloc(50)
        second_ctx = drv.Context.get_device().make_context()
        second_alloc = drv.mem_alloc(60)

        # Drop both allocations before detaching the second context.
        del first_alloc
        del second_alloc
        second_ctx.detach()
Beispiel #29
0
    def test(cls, count, correctness=False):
        """Time ``cls(count).multisort`` on random keys and optionally verify it.

        ``count`` random uint32 keys in ``[0, 1 << cls.radix_bits)`` are
        uploaded, then ``multisort`` is timed for every shift in
        ``range(cls.radix_bits - 3)``. With ``correctness`` set, the shift-0
        result is compared against ``np.sort``; otherwise one extra timing
        run with wider keys and ``rounds=2`` is performed.
        """
        keys = np.uint32(np.random.randint(0, 1<<cls.radix_bits, size=count))
        dkeys = cuda.to_device(keys)
        # Two device output buffers of count * 4 bytes each.
        dout_a = cuda.mem_alloc(count * 4)
        dout_b = cuda.mem_alloc(count * 4)

        sorter = cls(count)
        stream = cuda.Stream()

        def test_stub(shift, trials=10, rounds=1):
            """Time ``trials`` multisort calls at ``shift`` and print the averages."""
            # Run once so that evt_a doesn't include initialization time
            sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                             rounds, stream=stream)
            evt_a = cuda.Event().record(stream)
            for i in range(trials):
                buf = sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                             rounds, stream=stream)
            evt_b = cuda.Event().record(stream)
            evt_b.synchronize()
            # Elapsed time between the two events, averaged per round and trial
            # (time_since presumably reports milliseconds -- confirm in PyCUDA docs).
            dur = evt_b.time_since(evt_a) / (rounds * trials)
            print '%6.1f,\t%4.0f,\t%4.0f' % (dur, count / (dur * 1000),
                    count * sorter.radix_bits / (dur * 32 * 1000))

            if shift == 0 and correctness:
                print '\nTesting correctness'
                # buf is whatever the last multisort call returned; it is read
                # back as `count` uint32 values and compared with a host sort.
                out = cuda.from_device(buf, (count,), np.uint32)
                sort = np.sort(keys)
                if np.all(out == sort):
                    print 'Correct'
                else:
                    nz = np.nonzero(out != sort)[0]
                    # Mismatching indices grouped into chunks of 8192 (1 << 13).
                    print sorted(set(nz >> 13))
                    for i in nz:
                        print i, out[i-1:i+2], sort[i-1:i+2]
                    assert False, 'Oh no'


        # One timing row per shift value; the trailing comma keeps the label
        # and the numbers printed by test_stub on the same line (Python 2).
        for b in range(cls.radix_bits - 3):
            print '%2d (%2d sig bits),\t' % (cls.radix_bits, cls.radix_bits - b),
            test_stub(b)

        if not correctness:
            # range(2,3) yields only r = 2: keys with radix_bits * 2 random
            # bits, timed with rounds = 2.
            for r in range(2,3):
                keys[:] = np.uint32(
                        np.random.randint(0, 1<<(cls.radix_bits*r), count))
                cuda.memcpy_htod(dkeys, keys)
                print '%2d x %d,\t\t\t' % (cls.radix_bits, r),
                test_stub(0, rounds=r)
        print
Beispiel #30
0
	def set_round_drill( self , size ) :
		sx , sy = self.get_scale()
		nx , ny = int(size / sx + .5) , int(size / sy + .5)

		self.drillflat = False

		print 'Setting round drill:'
		print size
		print sx , sy 
		print nx , ny

		self.hdrill = np.zeros( (nx,ny) , np.float32 )

		size /= 2.0
		for x in range(nx) :
			for y in range(ny) :
				fx = (x-int(nx/2+.5)) * sx
				fy = (y-int(ny/2+.5)) * sy 
				ts = size*size - fx*fx - fy*fy
				self.hdrill[x,y] = -m.sqrt( ts ) + size if ts > 0 else size*2

		self.drillrad = size

		print self.hdrill
		print self.drillrad

		self.cdrill = cuda_driver.mem_alloc( self.hdrill.nbytes )
		cuda_driver.memcpy_htod( self.cdrill , self.hdrill )

		self.grid = map( int , ( m.ceil(nx/22.0) , m.ceil(ny/22.0) ) )
		self.block = ( min(nx,22) , min(ny,22) , 1 )

		print self.grid 
		print self.block
Beispiel #31
0
# Upload the three s1pf acceptance arrays to the GPU and keep the resulting
# GPU arrays in d_plotting_information (lb / mean / ub presumably stand for
# lower bound / mean / upper bound -- confirm against where they are built).
d_plotting_information['gpu_s1pf_lb_acc'] = pycuda.gpuarray.to_gpu(
    a_s1pf_lb_acc)

d_plotting_information['gpu_s1pf_mean_acc'] = pycuda.gpuarray.to_gpu(
    a_s1pf_mean_acc)

d_plotting_information['gpu_s1pf_ub_acc'] = pycuda.gpuarray.to_gpu(
    a_s1pf_ub_acc)

# get random seeds setup
# Compile the CUDA source and fetch its 'setup_kernel' entry point.
local_gpu_setup_kernel = pycuda.compiler.SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
    no_extern_c=True).get_function('setup_kernel')
# One curandStateXORWOW per thread: num_blocks * block_dim states, with the
# struct size queried from the compiler via pycuda.characterize.sizeof.
local_rng_states = drv.mem_alloc(
    np.int32(num_blocks * block_dim) * pycuda.characterize.sizeof(
        'curandStateXORWOW', '#include <curand_kernel.h>'))
# Launch the setup kernel over all threads. The two uint64 zeros are
# presumably seed and offset -- confirm against the kernel signature.
local_gpu_setup_kernel(np.int32(int(num_blocks * block_dim)),
                       local_rng_states,
                       np.uint64(0),
                       np.uint64(0),
                       grid=(int(num_blocks), 1),
                       block=(int(block_dim), 1, 1))

# get observables function
# The same CUDA source is compiled again here to obtain the production kernel.
gpu_observables_func = SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
    no_extern_c=True).get_function(
        'gpu_full_observables_production_with_log_hist_no_fv')
gpu_observables_func_arrays = SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
Beispiel #32
0
# Build a test image as the product of the two coordinate grids.
X, Y      = np.meshgrid(x, y)

img       = X * Y
# Python `float` means float64 here; the kernel presumably expects doubles --
# confirm against the fftshift2D source.
img       = np.asarray(img, float)

# Show the input image.
plt.figure(1)
plt.imshow(img)
plt.colorbar()
plt.title('Input image')

print(img.dtype)

"""Moving the data to the device and allocating space for the result."""

# --- Move the image from host to device
d_img  = cuda.mem_alloc(img.nbytes)
cuda.memcpy_htod(d_img, img)
# Device buffer of the same size for the shifted result.
d_img2 = cuda.mem_alloc(img.nbytes)

"""Operating the 2D fftshift."""

# Launch the kernel: input buffer, output buffer, then M and N as int32.
fftshift2D(d_img, d_img2, np.int32(M), np.int32(N), block = blockDim, grid = gridDim)

# Copy the result back into a host array shaped like the input.
img2   = np.empty_like(img)
cuda.memcpy_dtoh(img2, d_img2)

# Show the output image.
plt.figure(2)
plt.imshow(img2)
plt.colorbar()
plt.title('Output image')
Beispiel #33
0
def cudaArrayMalloc(state,sourceMod):
    """Upload ``state`` to the GPU and look up the step kernel.

    Sets two module globals: ``stateCUDA`` (device buffer holding a copy of
    ``state``) and ``cudaStep`` (the ``stepTestInterface`` function of
    ``sourceMod``).
    """
    global stateCUDA, cudaStep
    state_bytes = state.nbytes
    stateCUDA = cuda.mem_alloc(state_bytes)
    # host -> device copy of the state
    cuda.memcpy_htod(stateCUDA, state)
    step_kernel = sourceMod.get_function("stepTestInterface")
    cudaStep = step_kernel
Beispiel #34
0
# prepare engine
# Deserialize the TensorRT engine stored at cfg.weight and allocate one
# host/device buffer pair per engine binding.
with open(cfg.weight,
          'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        # Element count of this binding for the largest batch.
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Append the device buffer to device bindings.
        # (the device allocation is stored as its integer address)
        bindings.append(int(device_mem))

        # Keep each buffer pair in the list matching the binding's direction.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

# ------------------------------------------------------------------------------------------------------------
# Inference itself runs on the GPU, so any other CUDA-related operation
# (e.g. CUDA operations in PyTorch) should be avoided here; otherwise unexpected errors may occur.
# ------------------------------------------------------------------------------------------------------------

# detect images
Beispiel #35
0
    def _runSimulation(self,
                       parameters,
                       initValues,
                       blocks,
                       threads,
                       in_atol=1e-12,
                       in_rtol=1e-6):
        """Integrate all experiments on the GPU and return their trajectories.

        One GPU thread is used per experiment slot (``threads * blocks`` slots).
        For every entry of ``self._timepoints`` the compiled run method is
        launched and the state vector is copied back into the result array.

        Args:
            parameters: Per-experiment parameter sequences; ``len(parameters)``
                determines how many rows of the result are returned.
            initValues: Per-experiment initial species values.
            blocks: Number of blocks in the launch grid.
            threads: Number of threads per block.
            in_atol: Value written to every entry of the ``atol`` array.
            in_rtol: Value written to every entry of the ``rtol`` array.

        Returns:
            The first ``len(parameters)`` rows of an array created with shape
            ``(threads * blocks, 1, self._resultNumber, self._speciesNumber)``.

        NOTE(review): the solver argument names (neq, itol, itask, istate,
        iopt, rwork/lrw, iwork/liw, jt) look like the ODEPACK LSODA calling
        convention -- confirm against the CUDA source.
        """

        totalThreads = threads * blocks
        experiments = len(parameters)

        neqn = self._speciesNumber

        # compile
        timer = time.time()
        ## print "Init Common..",
        init_common_Kernel = self._completeCode.get_function("init_common")
        init_common_Kernel(block=(threads, 1, 1), grid=(blocks, 1))
        ## print "finished in", round(time.time()-timer,4), "s"

        start_time = time.time()
        # output array
        ret_xt = np.zeros(
            [totalThreads, 1, self._resultNumber, self._speciesNumber])

        # calculate sizes of work spaces
        isize = 20 + self._speciesNumber
        rsize = 22 + self._speciesNumber * max(16, self._speciesNumber + 9)

        # local variables
        # One entry per thread for every scalar solver argument.
        t = np.zeros([totalThreads], dtype=np.float64)
        jt = np.zeros([totalThreads], dtype=np.int32)
        neq = np.zeros([totalThreads], dtype=np.int32)
        itol = np.zeros([totalThreads], dtype=np.int32)
        iopt = np.zeros([totalThreads], dtype=np.int32)
        rtol = np.zeros([totalThreads], dtype=np.float64)
        iout = np.zeros([totalThreads], dtype=np.int32)
        tout = np.zeros([totalThreads], dtype=np.float64)
        itask = np.zeros([totalThreads], dtype=np.int32)
        istate = np.zeros([totalThreads], dtype=np.int32)
        atol = np.zeros([totalThreads], dtype=np.float64)

        # Work-space lengths and the work spaces themselves; the work spaces
        # and the state vector y hold one contiguous segment per thread.
        liw = np.zeros([totalThreads], dtype=np.int32)
        lrw = np.zeros([totalThreads], dtype=np.int32)
        iwork = np.zeros([isize * totalThreads], dtype=np.int32)
        rwork = np.zeros([rsize * totalThreads], dtype=np.float64)
        y = np.zeros([self._speciesNumber * totalThreads], dtype=np.float64)

        # Give every thread the same solver settings.
        for i in range(totalThreads):
            neq[i] = neqn
            #t[i] = self._timepoints[0]
            t[i] = 0
            itol[i] = 1
            itask[i] = 1
            istate[i] = 1
            iopt[i] = 0
            jt[i] = 2
            atol[i] = in_atol
            rtol[i] = in_rtol

            liw[i] = isize
            lrw[i] = rsize

            # Threads without (complete) initial values keep zeros: the
            # IndexError from a missing entry is deliberately ignored.
            try:
                # initial conditions
                for j in range(self._speciesNumber):
                    # loop over species
                    y[i * self._speciesNumber + j] = initValues[i][j]
                    ret_xt[i, 0, 0, j] = initValues[i][j]
            except IndexError:
                pass

        # allocate on device
        d_t = driver.mem_alloc(t.size * t.dtype.itemsize)
        d_jt = driver.mem_alloc(jt.size * jt.dtype.itemsize)
        d_neq = driver.mem_alloc(neq.size * neq.dtype.itemsize)
        d_liw = driver.mem_alloc(liw.size * liw.dtype.itemsize)
        d_lrw = driver.mem_alloc(lrw.size * lrw.dtype.itemsize)
        d_itol = driver.mem_alloc(itol.size * itol.dtype.itemsize)
        d_iopt = driver.mem_alloc(iopt.size * iopt.dtype.itemsize)
        d_rtol = driver.mem_alloc(rtol.size * rtol.dtype.itemsize)
        d_iout = driver.mem_alloc(iout.size * iout.dtype.itemsize)
        d_tout = driver.mem_alloc(tout.size * tout.dtype.itemsize)
        d_itask = driver.mem_alloc(itask.size * itask.dtype.itemsize)
        d_istate = driver.mem_alloc(istate.size * istate.dtype.itemsize)
        d_y = driver.mem_alloc(y.size * y.dtype.itemsize)
        d_atol = driver.mem_alloc(atol.size * atol.dtype.itemsize)
        d_iwork = driver.mem_alloc(iwork.size * iwork.dtype.itemsize)
        d_rwork = driver.mem_alloc(rwork.size * rwork.dtype.itemsize)

        # copy to device
        driver.memcpy_htod(d_t, t)
        driver.memcpy_htod(d_jt, jt)
        driver.memcpy_htod(d_neq, neq)
        driver.memcpy_htod(d_liw, liw)
        driver.memcpy_htod(d_lrw, lrw)
        driver.memcpy_htod(d_itol, itol)
        driver.memcpy_htod(d_iopt, iopt)
        driver.memcpy_htod(d_rtol, rtol)
        driver.memcpy_htod(d_iout, iout)
        driver.memcpy_htod(d_tout, tout)
        driver.memcpy_htod(d_itask, itask)
        driver.memcpy_htod(d_istate, istate)
        driver.memcpy_htod(d_y, y)
        driver.memcpy_htod(d_atol, atol)
        driver.memcpy_htod(d_iwork, iwork)
        driver.memcpy_htod(d_rwork, rwork)

        # Parameter table, one float32 row per thread. Filling stops at the
        # first IndexError, so all remaining entries stay zero.
        param = np.zeros((totalThreads, self._parameterNumber),
                         dtype=np.float32)
        try:
            for i in range(len(parameters)):
                for j in range(self._parameterNumber):
                    param[i][j] = parameters[i][j]
        except IndexError:
            pass

        # parameter texture
        # Row width is given in bytes (4 per float32 parameter).
        ary = sim.create_2D_array(param)
        sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4,
                                 totalThreads)
        self._param_tex.set_array(ary)

        if self._dt <= 0:
            # No intermediate step size: one solver launch per time point.
            start_time = time.time()
            #for i in range(1,self._resultNumber):
            for i in range(0, self._resultNumber):

                # Every thread integrates up to the same target time.
                for j in range(totalThreads):
                    tout[j] = self._timepoints[i]
                driver.memcpy_htod(d_tout, tout)

                self._compiledRunMethod(d_neq,
                                        d_y,
                                        d_t,
                                        d_tout,
                                        d_itol,
                                        d_rtol,
                                        d_atol,
                                        d_itask,
                                        d_istate,
                                        d_iopt,
                                        d_rwork,
                                        d_lrw,
                                        d_iwork,
                                        d_liw,
                                        d_jt,
                                        block=(threads, 1, 1),
                                        grid=(blocks, 1))

                # Read back time, state and solver status.
                driver.memcpy_dtoh(t, d_t)
                driver.memcpy_dtoh(y, d_y)
                driver.memcpy_dtoh(istate, d_istate)

                # Store the state of every thread for this time point.
                for j in range(totalThreads):
                    for k in range(self._speciesNumber):
                        ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k]

            # end of loop over time points

        else:
            # Positive step size: advance in steps of at most self._dt until
            # each requested time point is reached.
            tt = self._timepoints[0]

            start_time = time.time()
            #for i in range(1,self._resultNumber):
            for i in range(0, self._resultNumber):
                while 1:

                    # Never step past the requested time point.
                    next_time = min(tt + self._dt, self._timepoints[i])

                    for j in range(totalThreads):
                        tout[j] = next_time
                    driver.memcpy_htod(d_tout, tout)

                    self._compiledRunMethod(d_neq,
                                            d_y,
                                            d_t,
                                            d_tout,
                                            d_itol,
                                            d_rtol,
                                            d_atol,
                                            d_itask,
                                            d_istate,
                                            d_iopt,
                                            d_rwork,
                                            d_lrw,
                                            d_iwork,
                                            d_liw,
                                            d_jt,
                                            block=(threads, 1, 1),
                                            grid=(blocks, 1))

                    driver.memcpy_dtoh(t, d_t)
                    driver.memcpy_dtoh(y, d_y)
                    driver.memcpy_dtoh(istate, d_istate)

                    # Time point reached (within 1e-5): leave the stepping loop.
                    if np.abs(next_time - self._timepoints[i]) < 1e-5:
                        tt = next_time
                        break

                    tt = next_time

                # Store the state of every thread for this time point.
                for j in range(totalThreads):
                    for k in range(self._speciesNumber):
                        ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k]

            # end of loop over time points

        # Only the rows that belong to supplied parameter sets are returned.
        return ret_xt[0:experiments]
def Decrypt():
    """Decrypt the image at ``cfg.ENC_OUT`` on the GPU and write ``cfg.DEC_OUT``.

    Stages, in order: intra-row/column rotation kernel ("Dec_GenCatMap",
    ``cfg.PERM_ROUNDS`` times), fractal XOR kernel ("FracXOR"), lookup-table
    generation ("ArMapTable", ``rounds`` times, read from ``cfg.LOG``) and
    table application ("ArMapTabletoImg"). Timing statistics are printed when
    ``cfg.DEBUG_TIMER`` is set; intermediate images are written when
    ``cfg.DEBUG_IMAGES`` is set.

    Raises:
        SystemExit: with code 0 when the input image cannot be read.
    """
    #Initialize Timers
    # misc_timer slots: 0 I/O, 1 log read, 2 permute misc, 3 FracXOR misc,
    # 4 LUT misc, 5 debug images (only present when DEBUG_IMAGES is set).
    if cfg.DEBUG_IMAGES:
        misc_timer = np.zeros(6)
    else:
        misc_timer = np.zeros(5)

    # perf_timer slots: 0 shuffle generation, 1 permutation kernel,
    # 2 XOR kernel, 3 LUT kernel, 4 mapping kernel.
    perf_timer = np.zeros(5)
    overall_time = perf_counter()

    # Read input image
    misc_timer[0] = overall_time
    img = cv2.imread(cfg.ENC_OUT, 1)
    if img is None:
        print("File does not exist!")
        raise SystemExit(0)
    dim = img.shape

    misc_timer[1] = perf_counter()
    # Read log file
    # The first three lines hold width, height and the number of rounds.
    with open(cfg.LOG, "r") as f:
        width = int(f.readline())
        height = int(f.readline())
        rounds = int(f.readline())
        #fracID = int(f.readline())
    misc_timer[1] = perf_counter() - misc_timer[1]
    
    # Flatten image to vector and send to GPU
    imgArr  = np.asarray(img).reshape(-1)
    gpuimgIn  = cuda.mem_alloc(imgArr.nbytes)
    gpuimgOut = cuda.mem_alloc(imgArr.nbytes)
    cuda.memcpy_htod(gpuimgIn, imgArr)
    # I/O time so far, excluding the log read measured separately above.
    misc_timer[0] = perf_counter() - misc_timer[0] - misc_timer[1]

    # Warm-Up GPU for accurate benchmarking
    if cfg.DEBUG_TIMER:
        funcTemp = cf.mod.get_function("WarmUp")
        funcTemp(grid=(1,1,1), block=(1,1,1))
    
    # Inverse Permutation: Intra-row/column rotation
    perf_timer[0] = perf_counter()
    U = cf.genRelocVec(dim[0],dim[1],cfg.P1LOG, ENC=False) # Col-rotation | len(U)=n, values from 0->m
    V = cf.genRelocVec(dim[1],dim[0],cfg.P2LOG, ENC=False) # Row-rotation | len(V)=m, values from 0->n
    perf_timer[0] = perf_counter() - perf_timer[0]
    
    # Upload both rotation vectors and fetch the permutation kernel.
    misc_timer[2] = perf_counter()
    gpuU = cuda.mem_alloc(U.nbytes)
    gpuV = cuda.mem_alloc(V.nbytes)
    cuda.memcpy_htod(gpuU, U)
    cuda.memcpy_htod(gpuV, V)
    func = cf.mod.get_function("Dec_GenCatMap")
    misc_timer[2] = perf_counter() - misc_timer[2]

    perf_timer[1] = perf_counter()
    for i in range(cfg.PERM_ROUNDS):
        func(gpuimgIn, gpuimgOut, gpuU, gpuV, grid=(dim[0],dim[1],1), block=(3,1,1))
        # Swap buffers so the latest result is always in gpuimgIn.
        gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn
    perf_timer[1] = perf_counter() - perf_timer[1]

    if cfg.DEBUG_IMAGES:
        misc_timer[5] += cf.interImageWrite(gpuimgIn, "OUT_1", len(imgArr), dim)

    # Inverse Fractal XOR Phase
    temp_timer = perf_counter()
    # The time returned by getFractal is overwritten a few lines below by the
    # duration of this whole setup section.
    fractal, misc_timer[3] = cf.getFractal(dim[0])
    fracArr  = np.asarray(fractal).reshape(-1)
    gpuFrac = cuda.mem_alloc(fracArr.nbytes)
    cuda.memcpy_htod(gpuFrac, fracArr)
    func = cf.mod.get_function("FracXOR")
    misc_timer[3] = perf_counter() - temp_timer

    perf_timer[2] = perf_counter()
    func(gpuimgIn, gpuimgOut, gpuFrac, grid=(dim[0]*dim[1],1,1), block=(3,1,1))
    perf_timer[2] = perf_counter() - perf_timer[2]

    # The XOR result becomes the input of the next stage.
    gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn

    if cfg.DEBUG_IMAGES:
        misc_timer[5] += cf.interImageWrite(gpuimgIn, "OUT_2", len(imgArr), dim)

    # Ar Phase: Cat-map Iterations
    misc_timer[4] = perf_counter()
    # Identity table with len(imgArr)/3 entries (imgArr is the flattened
    # image read with three channels, so presumably one entry per pixel).
    imgShuffle = np.arange(start=0, stop=len(imgArr)/3, dtype=np.uint32)
    gpuShuffIn = cuda.mem_alloc(imgShuffle.nbytes)
    gpuShuffOut = cuda.mem_alloc(imgShuffle.nbytes)
    cuda.memcpy_htod(gpuShuffIn, imgShuffle)
    func = cf.mod.get_function("ArMapTable")
    misc_timer[4] = perf_counter() - misc_timer[4]

    # Recalculate mapping to generate lookup table
    perf_timer[3] = perf_counter()
    for i in range(rounds):
        func(gpuShuffIn, gpuShuffOut, grid=(dim[0],dim[1],1), block=(1,1,1))
        gpuShuffIn, gpuShuffOut = gpuShuffOut, gpuShuffIn
    perf_timer[3] = perf_counter() - perf_timer[3]

    # Apply mapping
    # After the final swap the finished table is in gpuShuffIn.
    gpuShuffle = gpuShuffIn
    func = cf.mod.get_function("ArMapTabletoImg")

    perf_timer[4] = perf_counter()
    func(gpuimgIn, gpuimgOut, gpuShuffle, grid=(dim[0]*dim[1],1,1), block=(3,1,1))
    perf_timer[4] = perf_counter() - perf_timer[4]

    if cfg.DEBUG_IMAGES:
        misc_timer[5] += cf.interImageWrite(gpuimgOut, "OUT_3", len(imgArr), dim)

    # Transfer vector back to host and reshape into original dimensions if needed
    temp_timer = perf_counter()
    cuda.memcpy_dtoh(imgArr, gpuimgOut)
    img = (np.reshape(imgArr,dim)).astype(np.uint8)

    # NOTE(review): cv2.resize takes dsize as (width, height); here the values
    # are passed as (height, width) -- confirm against the order in which the
    # encryptor writes them to the log.
    if height!=width:
        img = cv2.resize(img,(height,width),interpolation=cv2.INTER_CUBIC)
        dim = img.shape

    cv2.imwrite(cfg.DEC_OUT, img)
    # Add the read-back and write time to the I/O total.
    misc_timer[0] += perf_counter() - temp_timer

    # Print timing statistics
    if cfg.DEBUG_TIMER:
        overall_time = perf_counter() - overall_time
        perf = np.sum(perf_timer)
        misc = np.sum(misc_timer)

        # NOTE(review): the label uses cfg.ENC_IN although this function reads
        # cfg.ENC_OUT -- confirm this is intended.
        print("\nTarget: {} ({}x{})".format(cfg.ENC_IN, dim[1], dim[0]))

        print("\nPERF. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(perf, perf/overall_time*100))
        print("Shuffle Gen:   \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[0], perf_timer[0]/overall_time*100))
        print("Perm. Kernel:  \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[1], perf_timer[1]/overall_time*100))
        print("XOR Kernel:   \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[2], perf_timer[2]/overall_time*100))
        print("LUT Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[3], perf_timer[3]/overall_time*100))
        print("Mapping Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[4], perf_timer[4]/overall_time*100))
        
        print("\nMISC. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(misc, misc/overall_time*100))
        print("I/O:\t\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[0], misc_timer[0]/overall_time*100)) 
        print("Log Read:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[1], misc_timer[1]/overall_time*100))
        print("Permute Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[2], misc_timer[2]/overall_time*100)) 
        print("FracXOR Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[3], misc_timer[3]/overall_time*100)) 
        print("LUT Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[4], misc_timer[4]/overall_time*100)) 

        if cfg.DEBUG_IMAGES:
            print("Debug Images:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[5], misc_timer[5]/overall_time*100))

        print("\nNET TIME:\t{0:7.5f}s\n".format(overall_time))
Beispiel #37
0
    def __call__(self,
                 input_im,
                 row_kernel,
                 col_kernel,
                 result=None,
                 input_shape=None,
                 row_shape=None,
                 col_shape=None,
                 **kwargs):

        self.ctx.push()
        use_cached_buffers = kwargs.get("use_cached_buffers", True)

        if input_im.__class__ == numpy.ndarray and input_im.dtype != numpy.float32:
            raise KernelMustUseFloat32Exception

        (input_dev, input_shape,
         input_type) = self.transfer_to_device(input_im)
        (row_dev, row_shape, row_type) = self.transfer_to_device(row_kernel)
        (col_dev, col_shape, col_type) = self.transfer_to_device(col_kernel)

        if input_shape is None:
            raise UnknownArrayShapeException

        if row_shape is None:
            raise UnknownArrayShapeException

        if col_shape is None:
            raise UnknownArrayShapeException

        row_tile_width = 128
        col_tile_width = 16
        col_tile_height = 48
        col_hstride = 8
        assert numpy.mod(row_shape[0], 2) == 1, "Kernels must be of odd width"
        row_kernel_radius = row_shape[0] / 2

        coallescing_quantum = 16
        row_kernel_radius_aligned = (row_kernel_radius /
                                     coallescing_quantum) * coallescing_quantum
        if row_kernel_radius_aligned == 0:
            row_kernel_radius_aligned = coallescing_quantum

        assert numpy.mod(col_shape[0], 2) == 1, "Kernels must be of odd width"
        col_kernel_radius = col_shape[0] / 2

        #build_args = (im_shape, row_kernel_radius, row_kernel_radius_aligned, row_tile_width, col_kernel_radius, col_tile_width, col_tile_height, col_hstride
        build_args = (input_type, input_shape, row_kernel_radius,
                      row_kernel_radius_aligned, row_tile_width,
                      col_kernel_radius, col_tile_width, col_tile_height,
                      col_hstride)
        if build_args in self.cached_programs:
            prg = self.cached_programs[build_args]
        else:
            prg = self.build_program(*build_args)

        row_local_size = (row_kernel_radius_aligned + row_tile_width +
                          row_kernel_radius, 1, 1)
        row_group_size = (int_div_up(input_shape[1],
                                     row_tile_width), input_shape[0])
        row_global_size = (row_local_size[0] * row_group_size[0],
                           row_local_size[1] * row_group_size[1])

        col_local_size = (col_tile_width, col_hstride, 1)
        col_group_size = (int_div_up(input_shape[1], col_tile_width),
                          int_div_up(input_shape[0], col_tile_height))
        col_global_size = (col_local_size[0] * col_group_size[0],
                           col_local_size[1] * col_group_size[1])

        #print col_local_size
        #print col_group_size
        #print col_global_size

        # a device buffer for the intermediate result
        intermediate_dev = None
        if (use_cached_buffers) and ((input_shape, input_type)
                                     in self.cached_intermediate_buffers):
            intermediate_dev = self.cached_intermediate_buffers[(input_shape,
                                                                 input_type)]
        else:
            dummy = numpy.array([1], dtype=input_type)
            intermediate_dev = cuda.mem_alloc(input_shape[0] * input_shape[1] *
                                              dummy.itemsize)
            self.cached_intermediate_buffers[(input_shape,
                                              input_type)] = intermediate_dev

        # a device buffer for the result, if not already supplied
        result_dev = None
        if result is None or result.__class__ == numpy.ndarray:
            # need to make or repurpose a device buffer
            if (use_cached_buffers) and ((input_shape, input_type)
                                         in self.cached_result_buffers):
                result_dev = self.cached_result_buffers[(input_shape,
                                                         input_type)]
                #print "Here"
                #print(result_dev)
            else:
                dummy = numpy.array([1], dtype=input_type)
                result_dev = cuda.mem_alloc(input_shape[0] * input_shape[1] *
                                            dummy.itemsize)
                self.cached_result_buffers[(input_shape,
                                            input_type)] = result_dev
                self.cached_shapes[result_dev] = input_shape
                self.cached_types[result_dev] = input_type
        else:
            # assume that result is a device buffer already (possibly not a safe assumption)
            result_dev = result

        #t = Timer()
        try:
            f = prg.get_function("separable_convolution_row")
            f(intermediate_dev,
              input_dev,
              row_dev,
              grid=[int(e) for e in row_group_size],
              block=[int(e) for e in row_local_size])
            self.ctx.synchronize()
        except Exception as e:
            print(input_shape)
            print(intermediate_dev)
            print(input_dev)
            print(row_dev)
            print(row_global_size)
            print(row_local_size)
            raise e

        try:
            f = prg.get_function("separable_convolution_col")
            f(result_dev,
              intermediate_dev,
              col_dev,
              grid=[int(e) for e in col_group_size],
              block=[int(e) for e in col_local_size])
            self.ctx.synchronize()
        except Exception as e:
            print(input_shape)
            print(result_dev)
            print(intermediate_dev)
            print(row_dev)
            print(row_shape)
            raise e

        #print("Elapsed: %f" % t.elapsed)

        if kwargs.get("readback_from_device", False):
            if result is None:
                result = self.transfer_from_device(result_dev,
                                                   shape=input_shape)
            else:
                self.transfer_from_device(result_dev, result)
        else:
            result = result_dev

        self.ctx.pop()
        return result
Beispiel #38
0
 def allocate_GPU_mem(self):
     """Allocate the three device buffers used by this object.

     ``pyramid_d`` is sized from ``self.pyramid``; ``pyrlevel_d`` and
     ``fov_d`` are both sized from ``self.pyrlevelCones``.
     """
     buffer_sources = (
         ("pyramid_d", "pyramid"),
         ("pyrlevel_d", "pyrlevelCones"),
         ("fov_d", "pyrlevelCones"),
     )
     for device_attr, host_attr in buffer_sources:
         host_array = getattr(self, host_attr)
         setattr(self, device_attr, cuda.mem_alloc(host_array.nbytes))
Beispiel #39
0
  def __init__(self, path, workspace):
    """Load the model configuration and obtain a TensorRT engine for it.

    The engine is deserialized from ``<path>/model.trt`` when that file can
    be read. Otherwise it is built from ``<path>/model.onnx`` and written
    back to ``model.trt`` on a best-effort basis. Afterwards an execution
    context, one input and one output device buffer, and a CUDA stream are
    created.

    Args:
      path: directory holding ``cfg.yaml`` plus ``model.trt`` and/or
        ``model.onnx``.
      workspace: value assigned to the builder's ``max_workspace_size``.

    Note:
      A failure while reading the config, parsing the ONNX model or building
      the engine prints a message and calls ``quit()``.
    """
    # parameters
    self.path = path

    # config from path
    try:
      yaml_path = self.path + "/cfg.yaml"
      print("Opening config file %s" % yaml_path)
      # NOTE(review): yaml.load without an explicit Loader can construct
      # arbitrary Python objects; switch to yaml.safe_load if cfg.yaml may
      # come from an untrusted source.
      with open(yaml_path, 'r') as cfg_file:
        self.CFG = yaml.load(cfg_file)
    except Exception as e:
      print(e)
      print("Error opening cfg.yaml file from trained model.")
      quit()

    # get the data
    parserModule = imp.load_source("parserModule",
                                   booger.TRAIN_PATH + '/tasks/classification/dataset/' +
                                   self.CFG["dataset"]["name"] + '/parser.py')
    self.parser = parserModule.Parser(img_prop=self.CFG["dataset"]["img_prop"],
                                      img_means=self.CFG["dataset"]["img_means"],
                                      img_stds=self.CFG["dataset"]["img_stds"],
                                      classes=self.CFG["dataset"]["labels"],
                                      train=False)

    # some useful data
    self.data_h, self.data_w, self.data_d = self.parser.get_img_size()
    self.means, self.stds = self.parser.get_means_stds()
    self.means = np.array(self.means, dtype=np.float32)
    self.stds = np.array(self.stds, dtype=np.float32)
    self.nclasses = self.parser.get_n_classes()

    # try to deserialize the engine first
    self.engine = None
    self.engine_serialized_path = path + "/model.trt"
    try:
      with open(self.engine_serialized_path, "rb") as f:
        self.runtime = trt.Runtime(TRT_LOGGER)
        self.engine = self.runtime.deserialize_cuda_engine(f.read())
    except Exception as e:
      print("Could not deserialize engine. Generate instead. Error: ", e)
      self.engine = None

    # architecture definition from onnx if no engine is there
    if self.engine is None:
      try:
        # basic stuff for onnx parser
        self.model_path = path + "/model.onnx"
        self.builder = trt.Builder(TRT_LOGGER)
        self.network = self.builder.create_network()
        self.onnxparser = trt.OnnxParser(self.network, TRT_LOGGER)
        # Close the model file as soon as its bytes are handed to the parser;
        # self.model stays an attribute so existing users still find it.
        with open(self.model_path, 'rb') as self.model:
          parsed = self.onnxparser.parse(self.model.read())
        # parse() reports failure through its return value, so do not claim
        # success (or try to build an engine) from a half-parsed network.
        if not parsed:
          raise RuntimeError("ONNX parser rejected " + self.model_path)
        print("Successfully ONNX weights from ", self.model_path)
      except Exception as e:
        print("Couldn't load ONNX network. Error: ", e)
        quit()

      print("Wait while tensorRT profiles the network and build engine")
      # trt parameters
      try:
        self.builder.max_batch_size = 1
        self.builder.max_workspace_size = workspace
        self.builder.fp16_mode = self.builder.platform_has_fast_fp16
        print("Platform has fp16 mode: ", self.builder.platform_has_fast_fp16)
        print("Calling build_cuda_engine")
        self.engine = self.builder.build_cuda_engine(self.network)
        assert(self.engine is not None)
      except Exception as e:
        print("Failed creating engine for TensorRT. Error: ", e)
        quit()
      print("Done generating tensorRT engine.")

      # serialize for later; failure here is deliberately non-fatal
      print("Serializing tensorRT engine for later (for example in the C++ interface)")
      try:
        self.serialized_engine = self.engine.serialize()
        with open(self.engine_serialized_path, "wb") as f:
          f.write(self.serialized_engine)
      except Exception as e:
        print("Couln't serialize engine. Not critical, so I continue. Error: ", e)
    else:
      print("Successfully opened engine from inference directory.")
      print("WARNING: IF YOU WANT TO PROFILE FOR THIS COMPUTER DELETE model.trt FROM THAT DIRECTORY")

    # create execution context
    self.context = self.engine.create_execution_context()

    # Determine dimensions and create CUDA memory buffers
    # to hold host inputs/outputs (4 bytes per element).
    self.d_input_size = self.data_h * self.data_w * self.data_d * 4
    self.d_output_size = self.nclasses * 4
    # Allocate device memory for inputs and outputs.
    self.d_input = cuda.mem_alloc(self.d_input_size)
    self.d_output = cuda.mem_alloc(self.d_output_size)
    # Create a stream in which to copy inputs/outputs and run inference.
    self.stream = cuda.Stream()
    # h_input = cuda.pagelocked_empty(engine.get_binding_shape(0).volume(), dtype=np.float32)

    # h_output = cuda.pagelocked_empty(engine.get_binding_shape(1).volume(), dtype=np.float32)

    # d_input = cuda.mem_alloc(h_input.nbytes)

    # d_output = cuda.mem_alloc(h_output.nbytes)


    with builder.build_cuda_engine(network) as engine:
        output = np.empty(10, dtype = np.float32)


        # Alocate device memory
        d_input = cuda.mem_alloc(1 * img.nbytes)
        d_output = cuda.mem_alloc(1 * output.nbytes)
        bindings=[int(d_input), int(d_output)]

        stream = cuda.Stream()

        with engine.create_execution_context() as context:
            cuda.memcpy_htod_async(d_input, img, stream)

            context.execute_async(bindings = bindings, stream_handle=stream.handle)

            cuda.memcpy_dtoh_async(output, d_output, stream)

            stream.synchronize()

            print("true label : ", label)
Beispiel #41
0
    def _run_simulation(self, parameters, init_values, blocks, threads):
        """Launch the compiled kernel once and return its results.

        Args:
            parameters: one row of parameter values per experiment;
                ``len(parameters)`` is taken as the number of experiments.
            init_values: one row of initial species values per entry.
            blocks: number of blocks in the launch grid.
            threads: number of threads per block.

        Returns:
            A float32 array of shape ``(experiments, self._beta,
            self._resultNumber, self._speciesNumber)``.

        All size computations use floor division so that array shapes and
        the shared-memory byte count stay integers under Python 3 as well
        as Python 2.
        """
        total_threads = blocks * threads
        experiments = len(parameters)

        # simulation specific parameters
        param_rows = total_threads // self._beta + 1
        param = np.zeros((param_rows, self._parameterNumber),
                         dtype=np.float32)
        # Best effort: rows shorter than expected simply stop the copy.
        try:
            for i in range(experiments):
                for j in range(self._parameterNumber):
                    param[i][j] = parameters[i][j]
        except IndexError:
            pass

        if not self._putIntoShared:
            # parameter texture
            ary = sim.create_2D_array(param)
            sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4,
                                     param_rows)
            self._param_tex.set_array(ary)
            shared_memory_parameters = 0
        else:
            # parameter shared Mem
            shared_memory_parameters = self._parameterNumber * (
                threads // self._beta + 2) * 4

        shared_memory_per_block_for_rng = (threads // self._warp_size *
                                           self._state_words * 4)
        shared_tot = shared_memory_per_block_for_rng + shared_memory_parameters

        if self._putIntoShared:
            parameters_input = np.zeros(self._parameterNumber * total_threads //
                                        self._beta,
                                        dtype=np.float32)
        species_input = np.zeros(self._speciesNumber * total_threads,
                                 dtype=np.float32)
        result = np.zeros(self._speciesNumber * total_threads *
                          self._resultNumber,
                          dtype=np.float32)

        # non coalesced
        try:
            for i in range(len(init_values)):
                for j in range(self._speciesNumber):
                    species_input[i * self._speciesNumber +
                                  j] = init_values[i][j]
        except IndexError:
            pass
        if self._putIntoShared:
            try:
                for i in range(experiments):
                    for j in range(self._parameterNumber):
                        parameters_input[i * self._parameterNumber +
                                         j] = parameters[i][j]
            except IndexError:
                pass

        # set seeds using python rng
        seeds = np.zeros(total_threads // self._warp_size * self._state_words,
                         dtype=np.uint32)
        for i in range(len(seeds)):
            seeds[i] = np.uint32(4294967296 * np.random.uniform(0, 1))

        species_gpu = driver.mem_alloc(species_input.nbytes)
        if self._putIntoShared:
            parameters_gpu = driver.mem_alloc(parameters_input.nbytes)
        seeds_gpu = driver.mem_alloc(seeds.nbytes)
        result_gpu = driver.mem_alloc(result.nbytes)

        driver.memcpy_htod(species_gpu, species_input)
        if self._putIntoShared:
            driver.memcpy_htod(parameters_gpu, parameters_input)
        driver.memcpy_htod(seeds_gpu, seeds)
        driver.memcpy_htod(result_gpu, result)

        # run code; the shared-memory variant takes the parameters as an
        # extra kernel argument
        if self._putIntoShared:
            self._compiledRunMethod(species_gpu,
                                    parameters_gpu,
                                    seeds_gpu,
                                    result_gpu,
                                    block=(threads, 1, 1),
                                    grid=(blocks, 1),
                                    shared=shared_tot)
        else:
            self._compiledRunMethod(species_gpu,
                                    seeds_gpu,
                                    result_gpu,
                                    block=(threads, 1, 1),
                                    grid=(blocks, 1),
                                    shared=shared_tot)

        # fetch from GPU memory
        driver.memcpy_dtoh(result, result_gpu)

        # keep only the entries belonging to real experiments, then reshape
        result = result[0:experiments * self._beta * self._resultNumber *
                        self._speciesNumber]
        result.shape = (experiments, self._beta, self._resultNumber,
                        self._speciesNumber)

        return result
Beispiel #42
0
def get_ptr(array):
    """Allocate device memory for one ``MatrixStruct`` and wrap ``array`` in it.

    Returns:
        A ``(ptr, mat)`` tuple: the device allocation of
        ``MatrixStruct.mem_size`` bytes and the ``MatrixStruct`` built from
        ``array`` and that allocation.
    """
    device_ptr = cuda.mem_alloc(MatrixStruct.mem_size)
    return device_ptr, MatrixStruct(array, device_ptr)
Beispiel #43
0
def calculation(in_queue, out_queue):
    """Worker: run the templated CUDA calculation on one device.

    Takes a single ``(device_num, params)`` item from ``in_queue``, creates a
    CUDA context on that device, renders and compiles ``TEMPLATE``, runs the
    ``initialize`` / ``calculate`` / ``decoherence`` kernels and puts one
    result object on ``out_queue``:

    * without binning: a dict mapping each quantity name to a nested list of
      real means with ``decoherence_steps + 1`` rows and ``chunks_num``
      columns;
    * with binning: a list of ``[accumulated_histogram, edges]`` pairs, one
      per binning entry.

    ``params`` must provide the keys ``chunk_size``, ``chunks_num``,
    ``particles``, ``state``, ``representation``, ``quantities``,
    ``decoherence`` (``None`` or a dict with ``steps`` and ``coeff``) and
    ``binning`` (``None`` or an iterable of 3-tuples).

    The context is detached at the end of a successful run.
    """

    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    # When binning is requested, the quantities to compute are replaced by
    # the sorted union of the names referenced by the binning entries.
    binning = params['binning']
    if binning is not None:
        s = set()
        for names, _, _ in binning:
            s.update(names)
        quantities = sorted(list(s))

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_dtype = numpy.float64
    s_ctype = 'double'
    Fs = []

    cuda.init()

    device = cuda.Device(device_num)
    ctx = device.make_context()
    free, total = cuda.mem_get_info()
    # Upper bound on elements per buffer: total device memory split across
    # one complex buffer per quantity, with a 1.1 divisor as headroom ...
    max_chunk_size = float(total) / len(quantities) / numpy.dtype(
        c_dtype).itemsize / 1.1
    # ... then rounded down to a power of ten.
    max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10))
    #print free, total, max_chunk_size

    # Split a chunk into equally sized subchunks if it does not fit.
    if max_chunk_size > chunk_size:
        subchunk_size = chunk_size
        subchunks_num = 1
    else:
        assert chunk_size % max_chunk_size == 0
        subchunk_size = max_chunk_size
        subchunks_num = chunk_size / subchunk_size

    # One device buffer per quantity, in sorted-name order; the same order
    # is used below when reading the buffers back.
    buffers = []
    for quantity in sorted(quantities):
        buffers.append(GPUArray(subchunk_size, c_dtype))

    stream = cuda.Stream()

    # compile code
    try:
        source = TEMPLATE.render(c_ctype=c_ctype,
                                 s_ctype=s_ctype,
                                 particles=particles,
                                 state=state,
                                 representation=representation,
                                 quantities=quantities,
                                 decoherence_coeff=decoherence_coeff)
    except:
        # print the template engine's formatted error, then re-raise
        print exceptions.text_error_template().render()
        raise

    try:
        module = SourceModule(source, no_extern_c=True)
    except:
        # dump the rendered source with line numbers, then re-raise
        for i, l in enumerate(source.split("\n")):
            print i + 1, ":", l
        raise

    kernel_initialize = module.get_function("initialize")
    kernel_calculate = module.get_function("calculate")
    kernel_decoherence = module.get_function("decoherence")

    # prepare call parameters

    # Block size both kernels can run with; one block per multiprocessor.
    gen_block_size = min(kernel_initialize.max_threads_per_block,
                         kernel_calculate.max_threads_per_block)
    gen_grid_size = device.get_attribute(
        cuda.device_attribute.MULTIPROCESSOR_COUNT)
    gen_block = (gen_block_size, 1, 1)
    gen_grid = (gen_grid_size, 1, 1)

    num_gen = gen_block_size * gen_grid_size
    # NOTE(review): the reason for the 20000 limit is not visible here.
    assert num_gen <= 20000

    # prepare RNG states

    #seeds = to_gpu(numpy.ones(size, dtype=numpy.uint32))
    # one random 32-bit seed and one curand state per generator thread
    seeds = to_gpu(
        numpy.random.randint(0, 2**32 - 1, size=num_gen).astype(numpy.uint32))
    state_type_size = sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
    states = cuda.mem_alloc(num_gen * state_type_size)

    #prev_stack_size = cuda.Context.get_limit(cuda.limit.STACK_SIZE)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, 1<<14) # 16k
    kernel_initialize(states,
                      seeds.gpudata,
                      block=gen_block,
                      grid=gen_grid,
                      stream=stream)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, prev_stack_size)

    # run calculation
    # Kernel arguments: RNG states, every quantity buffer, element count.
    args = [states] + [buf.gpudata
                       for buf in buffers] + [numpy.int32(subchunk_size)]

    if binning is None:

        # results[quantity][k, i]: mean of the buffer after k decoherence
        # steps for subchunk i
        results = {
            quantity: numpy.zeros(
                (decoherence_steps + 1, chunks_num * subchunks_num), c_dtype)
            for quantity in quantities
        }
        for i in xrange(chunks_num * subchunks_num):
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)

            for k in xrange(decoherence_steps + 1):
                # step 0 records the freshly calculated values
                if k > 0:
                    kernel_decoherence(*args,
                                       block=gen_block,
                                       grid=gen_grid,
                                       stream=stream)

                for j, quantity in enumerate(sorted(quantities)):
                    F = (gpuarray.sum(buffers[j], stream=stream) /
                         buffers[j].size).get()
                    results[quantity][k, i] = F

        # average the subchunks of each chunk and keep the real part
        for quantity in sorted(quantities):
            results[quantity] = results[quantity].reshape(
                decoherence_steps + 1, chunks_num,
                subchunks_num).mean(2).real.tolist()

        out_queue.put(results)

    else:

        # one integer histogram accumulator per binning entry
        bin_accums = [
            numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
            for vals, binnum, _ in binning
        ]
        bin_edges = [None] * len(binning)

        for i in xrange(chunks_num * subchunks_num):
            # edges are rebuilt every iteration; only the last set is kept
            bin_edges = []
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)
            results = {
                quantity: buffers[j].get().real
                for j, quantity in enumerate(sorted(quantities))
            }

            for binparam, bin_accum in zip(binning, bin_accums):
                qnames, binnum, ranges = binparam
                sample_lines = [results[quantity] for quantity in qnames]
                # stack the selected quantities as columns: one row per sample
                sample = numpy.concatenate(
                    [arr.reshape(subchunk_size, 1) for arr in sample_lines],
                    axis=1)

                hist, edges = numpy.histogramdd(sample, binnum, ranges)
                bin_accum += hist
                bin_edges.append(numpy.array(edges))

        results = [[acc.tolist(), edges.tolist()]
                   for acc, edges in zip(bin_accums, bin_edges)]

        out_queue.put(results)

    #ctx.pop()
    ctx.detach()
Beispiel #44
0
def run_pnpoly(context, cc):
    """Compile ``pnpoly.cu``, time ``cn_pnpoly`` and compare it to the reference.

    Args:
        context: CUDA context object; only ``synchronize()`` is called on it.
        cc: compute capability string appended to ``compute_`` / ``sm_``.

    Prints the elapsed time of both kernels and either ``ok!`` or the two
    differing result arrays.
    """
    # load the CUDA source from disk
    with open('pnpoly.cu', 'r') as source_file:
        kernel_string = source_file.read()

    # build the module for the requested architecture
    module = SourceModule(kernel_string,
                          arch='compute_' + cc,
                          code='sm_' + cc,
                          cache_dir=False,
                          no_extern_c=True)
    pnpoly_kernel = module.get_function("cn_pnpoly")

    # problem size: number of points and number of polygon vertices
    size = numpy.int32(2e7)
    num_vertices = 600

    # page-locked, device-mapped host buffers
    points = allocate(2 * size, numpy.float32)
    bitmap = allocate(size, numpy.int32)
    vertices = allocate(2 * num_vertices, numpy.float32)
    # HINT: need to reference constant memory
    #
    d_bitmap = numpy.intp(bitmap.base.get_device_pointer())
    d_points = numpy.intp(points.base.get_device_pointer())

    # fill the inputs: random points, vertices read from file
    numpy.copyto(points, numpy.random.randn(2 * size).astype(numpy.float32))
    numpy.copyto(vertices, numpy.fromfile("vertices.dat", dtype=numpy.float32))

    # the vertices live in ordinary device memory
    d_vertices = drv.mem_alloc(vertices.nbytes)
    drv.memcpy_htod(d_vertices, vertices)
    # HINT: need to also copy memory to constant array
    #
    gpu_args = [d_bitmap, d_points, d_vertices, size]

    # launch configuration: 256 threads per block, enough blocks for all points
    threads = (256, 1, 1)
    grid = (int(numpy.ceil(size / float(threads[0]))), 1)

    # events used for both timed launches
    start = drv.Event()
    end = drv.Event()

    def elapsed_ms(kernel, kernel_args):
        # Record events around a single launch and return the time between them.
        start.record()
        kernel(*kernel_args, block=threads, grid=grid)
        end.record()
        context.synchronize()
        return end.time_since(start)

    # warm up the device a bit before measurement
    context.synchronize()
    for _ in range(5):
        pnpoly_kernel(*gpu_args, block=threads, grid=grid)
    context.synchronize()

    print("cn_pnpoly took", elapsed_ms(pnpoly_kernel, gpu_args), "ms.")

    # same measurement for the reference kernel, writing to its own bitmap
    reference = allocate(size, numpy.int32)
    d_reference = numpy.intp(reference.base.get_device_pointer())
    reference_kernel = module.get_function("cn_pnpoly_reference_kernel")
    ref_args = [d_reference, d_points, d_vertices, size]
    context.synchronize()
    print("reference kernel took", elapsed_ms(reference_kernel, ref_args),
          "ms.")

    # the two bitmaps must agree element for element
    total_difference = numpy.sum(numpy.absolute(bitmap - reference))
    if total_difference == 0:
        print("ok!")
    else:
        print("answer:")
        print(bitmap)
        print("reference:")
        print(reference)
def get_cov(A, blocks=None, threads=None):
    """Run the covariance kernel ``func`` on ``A`` and time it.

    Args:
        A: 2-D host array; its shape is read as ``(rows, cols)``.
        blocks: optional grid size. Used only when ``threads`` is also given
            and both are at most 1024.
        threads: optional block size, see ``blocks``.

    Returns:
        ``(covariances, ms)``: the kernel output resized to ``(cols, cols)``
        and the elapsed time between the two CUDA events.
    """
    n_rows, n_cols = (int(extent) for extent in A.shape)

    # Launch shape: honour the caller's values when both are usable,
    # otherwise derive them from the matrix shape, capped at 1024.
    if blocks and threads and blocks <= 1024 and threads <= 1024:
        grid_x, block_x = blocks, threads
    else:
        block_x = min(n_rows, 1024)
        grid_x = min(n_cols, 1024)

    # Host-side output buffers, zero-initialised float32
    means = np.zeros(n_cols, dtype=np.float32)
    covariances = np.zeros(n_cols * n_cols, dtype=np.float32)

    # Device buffers for the input and both outputs
    d_A = cuda.mem_alloc(A.size * A.dtype.itemsize)
    d_means = cuda.mem_alloc(means.nbytes)
    d_covariances = cuda.mem_alloc(covariances.nbytes)

    # Upload everything, including the zeroed outputs
    transfers = ((d_A, A), (d_means, means), (d_covariances, covariances))
    for device_buf, host_buf in transfers:
        cuda.memcpy_htod(device_buf, host_buf)

    # Time the kernel with a pair of events
    start = cuda.Event()
    end = cuda.Event()
    start.record()

    func(
        np.int32(n_cols),
        np.int32(n_rows),
        d_A,
        d_means,
        d_covariances,
        block=(block_x, 1, 1),
        grid=(grid_x, 1),
        shared=block_x * A.dtype.itemsize,
    )

    end.record()
    end.synchronize()
    ms = start.time_till(end)

    # Download the result and hand it back as a square matrix
    cuda.memcpy_dtoh(covariances, d_covariances)
    cov_matrix = np.resize(covariances, (n_cols, n_cols))
    return cov_matrix, ms
Beispiel #46
0
        numpy.getbuffer() needed due to lack of new-style buffer interface for
        scalar numpy arrays as of numpy version 1.9.1

        see: https://github.com/inducer/pycuda/pull/60
        """
        cuda.memcpy_htod(int(struct_arr_ptr),
                         numpy.getbuffer(numpy.int32(array.size)))
        cuda.memcpy_htod(
            int(struct_arr_ptr) + 8,
            numpy.getbuffer(numpy.uintp(int(self.data))))

    def __str__(self):
        """Return the string form of the data read back from the device."""
        host_copy = cuda.from_device(self.data, self.shape, self.dtype)
        return str(host_copy)


# One device allocation large enough for two DoubleOpStruct records;
# the second record starts mem_size bytes after the first.
struct_arr = cuda.mem_alloc(2 * DoubleOpStruct.mem_size)
do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size

# Wrap two float32 arrays, one per record slot.
array1 = DoubleOpStruct(numpy.array([1, 2, 3], dtype=numpy.float32),
                        struct_arr)
array2 = DoubleOpStruct(numpy.array([0, 4], dtype=numpy.float32), do2_ptr)

# Show the contents before any kernel runs.
print("original arrays")
print(array1)
print(array2)

mod = SourceModule("""
    struct DoubleOperation {
        int datalen, __padding; // so 64-bit ptrs can be aligned
        float *ptr;
    };
#print "#### Data length:", len(data)


#dataLength = len(data)
# The data length is taken from dataTimeSize rather than len(data).
dataLength = dataTimeSize

pool = genome.Pool(db, 'sell', 'AUDUSD', endDate)


# Convert the input to a float32 array before uploading it.
data = numpy.array(data).astype(numpy.float32)

# Report free memory before and after printing the data size.
printFreeMemory()
print "Data size ", data.nbytes/1024, " KB"
printFreeMemory()

# Upload the whole data array to the device once.
data_gpu = cuda.mem_alloc(data.nbytes)
cuda.memcpy_htod(data_gpu, data)


# Initial population: poolSize random trees of length treeLength.
trees = []
for x in range(poolSize):
	trees.append( genome.randomTree(treeLength) )
	
### Main Loop
generations = 0
# Number of 64-element groups in the data, as a floored float.
dataDim = math.floor(dataLength/64.0)

# State initialised to None here.
evalArray = None
lastTrees = None
winCount = None
lossCount = None
Beispiel #48
0
# pycuda tutorial from: https://documen.tician.de/pycuda/tutorial.html

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy

# Host inputs: a is (4, 4), b is (4, 4, 2).
a = numpy.random.randn(4, 4)
b = numpy.random.randn(4, 4, 2)

# The kernel works on single-precision data.
a = a.astype(numpy.float32)
b = b.astype(numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

# b_gpu is one flat buffer of floats, not an array of pointers, so the
# kernel takes it as `float *` and computes the C-order offset itself.
# Declaring it `float ***` would dereference float bit patterns as addresses.
mod = SourceModule("""
  __global__ void doublify(float *a, float *b)
  {
    int idx = threadIdx.x + threadIdx.y * blockDim.x;
    a[idx] *= 2;

    // b has shape (4, 4, 2) in C order: offset of b[x][y][0]
    int b_idx = (threadIdx.x * 4 + threadIdx.y) * 2;
    b[b_idx] = (float) (threadIdx.x + 100.0 * threadIdx.y);
  }
  """)

func = mod.get_function("doublify")
Beispiel #49
0
file_data.close()

start = time.time()

# Fill dataT with the transpose of the first numPoints rows and
# numDims + 1 columns of data.
for i in range(0,numPoints):
    for j in range(0, numDims+1):
        dataT[j][i] = data[i][j]



###
## allocate memory on device
###

# Data and its transpose
X_gpu = cuda.mem_alloc(data.nbytes)
X_t_gpu = cuda.mem_alloc(dataT.nbytes)
# Three buffers sized like the weights array
weights_gpu = cuda.mem_alloc(weights.nbytes)
old_weights_gpu = cuda.mem_alloc(weights.nbytes)
distances_gpu = cuda.mem_alloc(weights.nbytes)
# Three buffers sized like the labels array
labels_gpu = cuda.mem_alloc(labels.nbytes)
error_gpu = cuda.mem_alloc(labels.nbytes)
prob_gpu = cuda.mem_alloc(labels.nbytes)

###
## transfer data to gpu
###
cuda.memcpy_htod(X_gpu, data)
cuda.memcpy_htod(X_t_gpu, dataT)

cuda.memcpy_htod(weights_gpu, weights)
Beispiel #50
0
# Three float64 host arrays of different lengths.
a1 = np.zeros(1, dtype=np.float64)
b1 = np.zeros(10, dtype=np.float64)
c1 = np.zeros(100, dtype=np.float64)

print a1
print b1
print c1

# Upload each array; to_device returns the device allocation.
a1_addr = drv.to_device(a1)
b1_addr = drv.to_device(b1)
c1_addr = drv.to_device(c1)

#print int(a1_addr)
#print sys.getsizeof(int(b1_addr))

# A 3-entry table of 8-byte device addresses, so the kernel receives a
# single pointer to the three arrays.
twod_gpu = drv.mem_alloc(3 * 8)
address = np.array([int(a1_addr), int(b1_addr),
                    int(c1_addr)]).astype(np.uint64)
#print address

drv.memcpy_htod(twod_gpu, address)

#diag_kernel(drv.InOut(a1), drv.InOut(b1), drv.InOut(c1), block=(32,1,1))
diag_kernel(twod_gpu, block=(32, 1, 1))

# Read each array back and print what the kernel wrote.
drv.memcpy_dtoh(a1, a1_addr)
print a1
drv.memcpy_dtoh(b1, b1_addr)
print b1
drv.memcpy_dtoh(c1, c1_addr)
print c1
 def __init__(self, stream, cache_file=""):
     """Set up the calibrator around a calibration data stream.

     Allocates a device buffer as large as ``stream.calibration_data``,
     remembers ``cache_file`` and resets the stream.
     """
     trt.IInt8MinMaxCalibrator.__init__(self)
     self.stream = stream
     calibration_bytes = stream.calibration_data.nbytes
     self.d_input = cuda.mem_alloc(calibration_bytes)
     self.cache_file = cache_file
     stream.reset()
Beispiel #52
0
    def get_distance(self, msa):
        """Compute the pairwise distance matrix of an alignment on the GPU.

        Every unordered pair of records is handled by one CUDA thread, which
        scores the two sequences with ``self.scoring_matrix`` and stores
        ``1 - score / max_score`` (or ``1`` when ``max_score`` is zero).

        Args:
            msa: a ``MultipleSeqAlignment``. As a side effect every record
                gets an ``index`` attribute holding its position.

        Returns:
            A ``DistanceMatrix`` over the record ids. With fewer than two
            records there is nothing to compare and the freshly created
            matrix is returned unchanged.

        Raises:
            TypeError: if ``msa`` is not a ``MultipleSeqAlignment``.
        """
        if not isinstance(msa, MultipleSeqAlignment):
            raise TypeError("Must provide a MultipleSeqAlignment object.")
        for position, record in enumerate(msa):
            record.index = position
        names = [record.id for record in msa]

        dm = DistanceMatrix(names)
        pair_combinations = list(itertools.combinations(msa, 2))
        combinations = len(pair_combinations)
        if combinations == 0:
            # No pairs: avoid indexing an empty list and a zero-byte
            # device allocation.
            return dm
        seqLength = len(pair_combinations[0][0])

        # host arrays: the two sequences of each pair, back to back
        host_combinations = []
        for first, second in pair_combinations:
            host_combinations.append("%s" % (first.seq))
            host_combinations.append("%s" % (second.seq))

        host_names = self.scoring_matrix.names
        attributes = len(host_names)

        # flatten the scoring matrix row by row
        hst_scoring_matrix = []
        for name in host_names:
            hst_scoring_matrix.extend(self.scoring_matrix[name])

        host_scoring_matrix = np.array(hst_scoring_matrix).astype(np.float64)
        host_d_matrix = np.zeros((combinations, ), dtype=np.float64)
        # The kernel reads both text buffers as `char[]`, so they must hold
        # one byte per character; a plain asarray of str yields 4-byte
        # unicode elements on Python 3.
        host_names = np.asarray(host_names, dtype='S')
        host_combinations = np.asarray(host_combinations, dtype='S')

        ###GPU code
        start = cuda.Event()
        end = cuda.Event()

        # get the block size based on dataset size
        if (combinations < 128):
            BLOCKSIZE = 128
        elif (combinations < 256):
            BLOCKSIZE = 256
        elif (combinations < 512):
            BLOCKSIZE = 512
        else:
            BLOCKSIZE = 1024

        # Allocate GPU device memory. Sizes come from nbytes (the actual
        # array payload); sys.getsizeof would add Python object overhead.
        device_scoring_matrix = cuda.mem_alloc(host_scoring_matrix.nbytes)
        device_names = cuda.mem_alloc(host_names.nbytes)
        device_combinations = cuda.mem_alloc(host_combinations.nbytes)
        device_d_matrix = cuda.mem_alloc(host_d_matrix.nbytes)

        # Memcopy from host to device
        cuda.memcpy_htod(device_combinations, host_combinations)
        cuda.memcpy_htod(device_names, host_names)
        cuda.memcpy_htod(device_scoring_matrix, host_scoring_matrix)

        mod = SourceModule("""
          #include <stdio.h>
          #include <string.h>
          #include <stdlib.h>
          __global__ void DeviceDM(char device_combinations[] , char device_names[], int n,  int N, const int seqLength, double *device_scoring_matrix, double *device_d_matrix)
          {
            const int tid = threadIdx.y + blockIdx.y* blockDim.y;
            if (tid >= N) return;



            int start1= (tid*2)*(seqLength);
            int start2= (tid*2+1)*(seqLength);

            char skip_letters[] = {'-', '*'};

            int score = 0;
            int max_score = 0;
            
            if(device_scoring_matrix){
                double max_score1 = 0.0;
                double max_score2 = 0.0;
                
                for(int i=0; i < seqLength; i++){
                    char l1 = device_combinations[start1+i];
                    char l2 = device_combinations[start2+i];
                    int l1rank = 0;
                    int l2rank = 0;
                    if(!(l1==skip_letters[0] || l1==skip_letters[1] || l2==skip_letters[0] || l2==skip_letters[1])){
                        for(int i=0; i< n; i++){
                            if(l1==device_names[i]){
                                l1rank=i;
                            }
                            if(l2==device_names[i]){
                                l2rank=i;
                            }
                            if(l1rank!=0 && l2rank!=0){
                                break;
                            }                                   
                        }

                        max_score1 = max_score1 + device_scoring_matrix[l1rank* n + l1rank];
                        max_score2 = max_score2 + device_scoring_matrix[l2rank* n + l2rank];
                        score += device_scoring_matrix[l1rank*n + l2rank];


                    }

                }
               
                if(max_score1>=max_score2){
                    max_score= max_score1;	
                }else{
                    max_score= max_score2;
                }

            }else{
                for(int i=0; i < seqLength; i++){
                    char l1 = device_combinations[start1+i];
                    char l2 = device_combinations[start2+i];
                    if(!(l1==skip_letters[0] || l1==skip_letters[1] || l2==skip_letters[0] || l2==skip_letters[1])){
                        if(l1==l2){
                            score= score + 1;
                        }
                    }

                }
                max_score = seqLength;   

             }   
            if(max_score == 0){
                device_d_matrix[tid]=1;
            }else{
                device_d_matrix[tid]=1 - (score * 1.0 / max_score);
            }


          } 
        """)

        # One thread per pair along y; floor division keeps the grid
        # dimension an integer on Python 3 as well.
        DeviceDM = mod.get_function("DeviceDM")
        blockDim = (1, BLOCKSIZE, 1)
        gridDim = (1, combinations // BLOCKSIZE + 1, 1)

        start.record()

        DeviceDM(device_combinations,
                 device_names,
                 np.int32(attributes),
                 np.int32(combinations),
                 np.int32(seqLength),
                 device_scoring_matrix,
                 device_d_matrix,
                 block=blockDim,
                 grid=gridDim)
        end.record()
        # wait for the kernel before reading the result back
        end.synchronize()

        distance_matrix_list = np.empty_like(host_d_matrix)
        cuda.memcpy_dtoh(distance_matrix_list, device_d_matrix)
        device_d_matrix.free()
        device_combinations.free()
        device_names.free()
        device_scoring_matrix.free()
        final_distance_matrix = distance_matrix_list.tolist()

        # thread k handled pair k, so results map back by position
        for (first, second), distance in zip(pair_combinations,
                                             final_distance_matrix):
            dm[first.id, second.id] = distance

        return dm
Beispiel #53
0
 def __init__(self, input_layers, stream):
     """Set up the entropy calibrator for the given layers and data stream.

     Stores both arguments, allocates a device buffer as large as
     ``stream.calibration_data`` and resets the stream.
     """
     trt.infer.EntropyCalibrator.__init__(self)
     self.input_layers, self.stream = input_layers, stream
     calibration_bytes = stream.calibration_data.nbytes
     self.d_input = cuda.mem_alloc(calibration_bytes)
     stream.reset()
Beispiel #54
0
							out[idx] = 0;
						}
						else{
							out[idx] = in[gidx];
						}
					}
				}
			}
		}
	}

""")

# Random float32 input of shape (25, 37, 12) and an output buffer of
# shape (25, 37, 12, 60, 5, 5).
b = np.random.randn(25, 37, 12).astype(np.float32)
out = np.random.randn(25, 37, 12, 60, 5, 5).astype(np.float32)
b_gpu = cuda.mem_alloc(b.nbytes)
out_gpu = cuda.mem_alloc(out.nbytes)
# Only the input is uploaded; out_gpu is left uninitialised on the device.
cuda.memcpy_htod(b_gpu, b)

# One single-thread block per (37, 12, 25) grid cell.
# NOTE(review): the timed span also covers get_function, and there is no
# explicit synchronisation before `end` -- confirm that the launch blocks
# if this is meant to measure kernel run time.
start = time.time()
func = mod.get_function("UnrollBKernel")
func(b_gpu, out_gpu, grid=(37, 12, 25), block=(1, 1, 1))
end = time.time()

# Copy the kernel output back into a host array shaped like `out`.
a_doubled = np.empty_like(out)
cuda.memcpy_dtoh(a_doubled, out_gpu)
x = 0

# for i in a_doubled[0,0,0]:
# 	print(i)
# 	print(x)
#

# The statement above creates double-precision data; convert it to single
# precision (float32) before sending it to the device.
h_list_a = h_list_a.astype(np.float32)
h_list_b = h_list_b.astype(np.float32)

# Host buffer for the result, with the same shape and dtype as h_list_a.
h_list_out = np.empty_like(h_list_a)

# Pass this data from host to device.

# Step 1: allocate memory on the device first.
d_list_a = cuda_driver.mem_alloc(h_list_a.nbytes)
d_list_b = cuda_driver.mem_alloc(h_list_b.nbytes)
# Size the output allocation from the host output array it pairs with rather
# than from an input, so the two stay consistent if the inputs ever differ.
d_list_out = cuda_driver.mem_alloc(h_list_out.nbytes)

# Step 2: send the input data to the allocated device memory.
cuda_driver.memcpy_htod(d_list_a, h_list_a)
cuda_driver.memcpy_htod(d_list_b, h_list_b)

# Write the CUDA kernel and compile it.
Beispiel #56
0
        binding_idx_offset = selected_profile * num_binding_per_profile

        # Specify input shapes. These must be within the min/max bounds of the active profile
        # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
        input_shape = (args.batch_size, max_seq_length)
        input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
        for binding in range(3):
            context.set_binding_shape(binding_idx_offset + binding,
                                      input_shape)
        assert context.all_binding_shapes_specified

        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()

        # Allocate device memory for inputs.
        d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]

        # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
        h_output = cuda.pagelocked_empty(tuple(
            context.get_binding_shape(binding_idx_offset + 3)),
                                         dtype=np.float32)
        d_output = cuda.mem_alloc(h_output.nbytes)

        def inference(features, tokens):
            global h_output

            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
                "NetworkOutput",
                ["start_logits", "end_logits", "feature_index"])
            networkOutputs = []
Beispiel #57
0
{

__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
{
        int id = blockIdx.x*blockDim.x + threadIdx.x;

        if (id >= nthreads)
            return;

        curand_init(seed+id, id, offset, &s[id]);
}

} // extern "C"
"""
# One curand state per thread: NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK in total.
num_rng_states = NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK
rng_states_gpu = cuda.mem_alloc(
    num_rng_states *
    characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
module = SourceModule(init_rng_src, no_extern_c=True)
init_rng = module.get_function('init_rng')
# init_rng computes its state index from blockIdx.x, blockDim.x and threadIdx.x
# only. A 2-D block of (32, NUM_RUNS_PER_BLOCK) therefore only ever reaches the
# first NUM_RUNS * 32 states. Launch a 1-D configuration with exactly one
# thread per state so every allocated state is initialised.
# The seed is passed as a 64-bit value to match the kernel's
# `unsigned long long seed` parameter.
init_rng(np.int32(num_rng_states),
         rng_states_gpu,
         np.uint64(int(time.time())),
         np.uint64(0),
         block=(32, 1, 1),
         grid=(NUM_RUNS * NUM_RUNS_PER_BLOCK, 1))

# Always define the flag: 1 for a single run, 0 otherwise.
is_simulation = 1 if NUM_RUNS == 1 else 0
defines = "#define NUM_ROUTES " + str(NUM_ROUTES) + "\n" +\
          "#define NUM_STOPS " + str(NUM_STOPS) + "\n" +\
Beispiel #58
0
    def convert_image_rgb(self, image):
        """Convert a packed-pixel image into three planes on the GPU.

        The packed pixels are uploaded to the device, ``self.kernel_function``
        is launched over the image, and the three output planes are copied
        back to host memory. Each plane keeps the row pitch of its GPU
        allocation as its stride.

        Args:
            image: source image; must use the ``ImageWrapper.PACKED`` layout
                and have pixel format ``self.src_format``.

        Returns:
            An ``ImageWrapper`` of ``self.dst_width`` x ``self.dst_height`` in
            ``self.dst_format`` holding the three planes and their strides.

        Raises:
            AssertionError: if the image is not packed, or its pixel format
                differs from ``self.src_format``.
        """
        start = time.time()
        iplanes = image.get_planes()
        w = image.get_width()
        h = image.get_height()
        stride = image.get_rowstride()
        pixels = image.get_pixels()
        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
        assert iplanes==ImageWrapper.PACKED, "must use packed format as input"
        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
        divs = get_subsampling_divs(self.dst_format)

        #copy packed rgb pixels to GPU:
        upload_start = time.time()
        stream = driver.Stream()
        mem = numpy.frombuffer(pixels, dtype=numpy.byte)
        in_buf = driver.mem_alloc(len(pixels))
        #pin the host memory for the duration of the asynchronous upload:
        hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
        driver.memcpy_htod_async(in_buf, mem, stream)

        #allocate one pitched GPU buffer per output plane:
        out_bufs = []
        out_strides = []
        out_sizes = []
        for i in range(3):
            x_div, y_div = divs[i]
            #floor division keeps the dimensions integral (true division
            #would produce floats on Python 3):
            out_stride = roundup(self.dst_width//x_div, 4)
            out_height = roundup(self.dst_height//y_div, 2)
            #mem_alloc_pitch returns the pitch actually used, which becomes the stride:
            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
            out_bufs.append(out_buf)
            out_strides.append(out_stride)
            out_sizes.append((out_stride, out_height))
        #ensure uploading has finished:
        stream.synchronize()
        #we can now unpin the host memory:
        hmem.base.unregister()
        debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start))

        kstart = time.time()
        #kernel arguments: source buffer and stride, then (buffer, stride) for each plane:
        kargs = [in_buf, numpy.int32(stride)]
        for out_buf, out_stride in zip(out_bufs, out_strides):
            kargs.append(out_buf)
            kargs.append(numpy.int32(out_stride))
        blockw, blockh = 16, 16
        #figure out how many pixels we process at a time in each dimension:
        xdiv = max([x[0] for x in divs])
        ydiv = max([x[1] for x in divs])
        #grid dimensions must be integers, hence the floor division.
        #NOTE(review): the coverage checks below use a fixed factor of 2 rather
        #than xdiv / ydiv - confirm this is intended when the divisor is not 2.
        gridw = max(1, w//blockw//xdiv)
        if gridw*2*blockw<w:
            gridw += 1
        gridh = max(1, h//blockh//ydiv)
        if gridh*2*blockh<h:
            gridh += 1
        debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
        self.kernel_function(*kargs, block=(blockw,blockh,1), grid=(gridw, gridh))

        #we can now free the GPU source buffer:
        in_buf.free()
        kend = time.time()
        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
        self.frames += 1

        #copy output YUV channel data to host memory:
        read_start = time.time()
        pixels = []
        strides = []
        for out_buf, out_size, out_stride in zip(out_bufs, out_sizes, out_strides):
            #direct full plane async copy keeping current GPU padding:
            plane = driver.aligned_empty(out_size, dtype=numpy.byte)
            driver.memcpy_dtoh_async(plane, out_buf, stream)
            pixels.append(plane.data)
            strides.append(out_stride)
        stream.synchronize()
        #the copying has finished, we can now free the YUV GPU memory:
        #(the host memory will be freed by GC when 'pixels' goes out of scope)
        for out_buf in out_bufs:
            out_buf.free()
        self.cuda_context.synchronize()
        read_end = time.time()
        debug("strides=%s", strides)
        debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))
        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
Beispiel #59
0
        float fa = a*M_PI/8.0;
        // go around traction circle and try different moves
        float v1 = max(min(v + AxMAX*dt*cos(fa), (float)VMAX), (float)VMIN);
        float k1 = max(min(sin(fa)*AyMAX/(v*v), STEER_LIMIT_K), -STEER_LIMIT_K);
        float t1 = theta + k1*v*dt;
        float c = vlookup(Vprev, x+v1*cos(t1)*dt, y+v1*sin(t1)*dt, t1, v1, dt);
        bestcost = min(bestcost, c);
    }

    V[di] = pathcost + penalty + bestcost;
}
  """ % (STEER_LIMIT_K, NANGLES, NSPEEDS, GRID_RES, xsize, ysize, VMIN, VMAX,
         AxMAX, AyMAX, TIMESTEP, homex, homey))

# Handle to the "valueiter" kernel in the compiled module.
valueiter = mod.get_function("valueiter")

# Host table with every cell set to 1000 (float32), mirrored into a device
# allocation of the same byte size.
V = np.full((NSPEEDS, NANGLES, ysize, xsize), 1000., dtype=np.float32)
V0_gpu = cuda.mem_alloc(V.nbytes)
cuda.memcpy_htod(V0_gpu, V)

# Upload ye, tk and tang to the device, then remove the host-side names.
ye_in, tk_in, tang_in = (gpuarray.to_gpu(host) for host in (ye, tk, tang))
del ye, tk, tang

s = trange(110)
# Sum of the initial table, accumulated in float64.
v0 = V.sum(dtype=np.float64)
for j in s:
    for i in range(20):
        valueiter(V0_gpu,
        z[n] = x[n] + y[n];
    }
}
""".replace('real', real_cpp))
# Handle to the "add" kernel in the compiled module.
add = mod.get_function("add")

# Constants for the run.
EPSILON = 1e-15
NUM_REPEATS = 10
a, b, c = 1.23, 2.34, 3.57
N = 10 ** 8

# Host arrays of shape (N, 1): x filled with a, y filled with b, z all zeros.
h_x = numpy.full(shape=(N, 1), fill_value=a, dtype=real_py)
h_y = numpy.full(shape=(N, 1), fill_value=b, dtype=real_py)
h_z = numpy.zeros(h_x.shape, dtype=real_py)

# One device allocation per host array; only x and y are uploaded.
d_x, d_y, d_z = (drv.mem_alloc(host.nbytes) for host in (h_x, h_y, h_z))
drv.memcpy_htod(d_x, h_x)
drv.memcpy_htod(d_y, h_y)

# Two accumulators, both starting at zero.
t_sum = t2_sum = 0
for repeat in range(NUM_REPEATS + 1):
    start = drv.Event()
    stop = drv.Event()
    start.record()

    add(d_x,
        d_y,
        d_z,
        numpy.int32(N),