Exemple #1
0
	def _debug_print(self):
		"""Copy the device buffer ``self.df1`` into ``self.f`` and print it.

		The numpy print options are changed globally: 3 digits of precision,
		a summarisation threshold of 10000 elements, 200-character lines and
		no scientific notation for small values.
		"""
		cuda_driver.memcpy_dtoh(self.f, self.df1)

		np.set_printoptions(precision=3, threshold=10000, linewidth=200, suppress=True)

		separator = '#' * 80
		print(separator)
		print(self.f)
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
  """Evaluate the blob-blob forces with the ``calc_blob_blob_force`` GPU kernel.

  ``r_vectors`` is flattened to ``3 * len(r_vectors)`` values, uploaded to the
  device and passed to the kernel obtained from the module-level ``mod``.

  Keyword arguments read through ``kwargs.get``:
    periodic_length    -- indexed at 0, 1 and 2; each entry is sent as float64
    repulsion_strength -- sent as float64
    debye_length       -- sent as float64
    blob_radius        -- sent as float64

  Returns the kernel output reshaped to ``(number_of_blobs, 3)``.
  """
  # Launch configuration derived from the number of blobs
  number_of_blobs = np.int32(len(r_vectors))
  threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

  # Interaction parameters
  periodic_length = kwargs.get('periodic_length')
  strength = kwargs.get('repulsion_strength')
  debye = kwargs.get('debye_length')
  radius = kwargs.get('blob_radius')

  # Flat host buffers: input coordinates and output forces
  coords = np.reshape(r_vectors, number_of_blobs * 3)
  forces = np.empty_like(coords)

  # Device buffers, with the coordinates uploaded (host to device)
  coords_gpu = cuda.mem_alloc(coords.nbytes)
  forces_gpu = cuda.mem_alloc(forces.nbytes)
  cuda.memcpy_htod(coords_gpu, coords)

  kernel = mod.get_function("calc_blob_blob_force")

  # Scalars are converted right before the launch, in kernel-argument order
  scalars = [np.float64(value) for value in (strength, debye, radius)]
  scalars += [np.float64(periodic_length[axis]) for axis in range(3)]
  kernel_args = [coords_gpu, forces_gpu] + scalars + [number_of_blobs]
  kernel(*kernel_args, block=(threads_per_block, 1, 1), grid=(num_blocks, 1))

  # Fetch the forces (device to host)
  cuda.memcpy_dtoh(forces, forces_gpu)

  return np.reshape(forces, (number_of_blobs, 3))
def loop(iterations):
    """Advance the simulation by ``iterations`` time steps.

    Every step copies the host array ``F`` into ``T``, uploads ``T`` and then
    launches the ``prop``, ``density``, ``eq`` and ``bounceback`` kernels in
    that order before copying ``F_gpu`` back into ``F``.  Arrays, device
    buffers, kernels and launch dimensions are all module-level names.
    """
    step = 0
    while step < iterations:
        # To avoid overwrites a temporary copy is made of F
        T[:] = F
        cuda.memcpy_htod(T_gpu, T)

        # Same launch geometry for every kernel of this step
        launch = dict(block=(blockDimX, blockDimY, 1),
                      grid=(gridDimX, gridDimY))

        # Propagate
        prop(F_gpu, T_gpu, **launch)

        # Calculate density and get bounceback from obstacle nodes
        density(F_gpu, BOUND_gpu, BOUNCEBACK_gpu, DENSITY_gpu, UX_gpu, UY_gpu,
                **launch)

        # Calculate equilibrium
        eq(F_gpu, FEQ_gpu, DENSITY_gpu, UX_gpu, UY_gpu, U_SQU_gpu, U_C2_gpu,
           U_C4_gpu, U_C6_gpu, U_C8_gpu, **launch)

        # Transfer bounceback to obstacle nodes
        bounceback(F_gpu, BOUNCEBACK_gpu, BOUND_gpu, **launch)

        # Copy F to host for copy to T in beginning of loop
        cuda.memcpy_dtoh(F, F_gpu)

        step += 1
def convolution_cuda(sourceImage,  filterx,  filtery):
    """Separable convolution of a float32, row-major image using CUDA.

    ``filterx`` and ``filtery`` must both have shape ``(KERNEL_W,)``; they are
    converted to float32 and uploaded to the module-level device symbols
    ``d_Kernel_rows`` and ``d_Kernel_columns``.  The row kernel writes into an
    intermediate buffer, the column kernel writes the final image, which is
    copied back and returned as a new array shaped like ``sourceImage``.
    """
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32',  'source image must be float32'
    imageHeight, imageWidth = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ) ,  'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    row_filter = numpy.float32(filterx)
    column_filter = numpy.float32(filtery)

    # The kernels are told a width rounded up to a multiple of 16.
    # NOTE(review): the device buffers below are sized from the unpadded
    # image -- confirm the kernels stay in bounds when the width is not a
    # multiple of 16.
    padded_width = iAlignUp(imageWidth, 16)

    # Device arrays: destination, source and the row-pass intermediate
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    # The filters go into memory referenced by symbols defined in the kernel module
    cuda.memcpy_htod(d_Kernel_rows, row_filter)
    cuda.memcpy_htod(d_Kernel_columns, column_filter)

    # Launch geometry for each direction
    row_grid = (iDivUp(padded_width, ROW_TILE_W), imageHeight)
    column_grid = (iDivUp(padded_width, COLUMN_TILE_W),
                   iDivUp(imageHeight, COLUMN_TILE_H))
    row_block = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    column_block = (COLUMN_TILE_W, 8, 1)

    height_arg = numpy.int32(imageHeight)
    width_arg = numpy.int32(padded_width)

    convolutionRowGPU(
        intermediateImage_gpu, sourceImage_gpu, width_arg, height_arg,
        grid=[int(v) for v in row_grid],
        block=[int(v) for v in row_block])
    convolutionColumnGPU(
        destImage_gpu, intermediateImage_gpu, width_arg, height_arg,
        numpy.int32(COLUMN_TILE_W * column_block[1]),
        numpy.int32(width_arg * column_block[1]),
        grid=[int(v) for v in column_grid],
        block=[int(v) for v in column_block])

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
def fromSourceFile():
    """Compile ``simple.cu``, run its ``doublify`` kernel and print the result.

    A seeded random 4x4 float32 array is uploaded, processed by a single
    4x4x1 thread block, downloaded again and printed together with its
    element-wise ratio to twice the input.
    """
    import numpy as np
    import pycuda.driver as cuda
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

    # random data (fixed seed so runs are repeatable)
    np.random.seed(1)
    a = np.random.randn(4, 4).astype(np.float32)

    # read code and get function; the file is closed as soon as it is read
    with open('simple.cu') as source_file:
        mod = SourceModule(source_file.read())
    func = mod.get_function("doublify")

    # allocate memory on the GPU
    a_gpu = cuda.mem_alloc(a.nbytes)

    # transfer to the GPU memory
    cuda.memcpy_htod(a_gpu, a)

    # execute
    func(a_gpu, block=(4, 4, 1))

    # collect results
    a_doubled = np.empty_like(a)
    cuda.memcpy_dtoh(a_doubled, a_gpu)

    print(a_doubled)
    print(a_doubled / (a * 2))
 def test_pycuda(self):
     """
     Test pycuda installation with small example.

     A random 4x4 array is converted to float32, doubled in place on the GPU
     by a 4x4x1 thread block and copied back.  Any exception raised while
     importing pycuda or running the kernel fails the test with the original
     error included in the message; otherwise the result is checked against
     ``2 * a``.
     """
     try:
         import pycuda.driver as cuda
         import pycuda.autoinit
         from pycuda.compiler import SourceModule
         import numpy as np
         a = np.random.randn(4, 4)
         print(a)
         a = a.astype(np.float32)
         a_gpu = cuda.mem_alloc(a.nbytes)
         cuda.memcpy_htod(a_gpu, a)
         mod = SourceModule(
             """
             __global__ void doublify(float *a)
             {
             int idx = threadIdx.x + threadIdx.y*4;
             a[idx] *= 2;
             }
             """
         )
         func = mod.get_function("doublify")
         func(a_gpu, block=(4, 4, 1))
         a_doubled = np.empty_like(a)
         cuda.memcpy_dtoh(a_doubled, a_gpu)
     except Exception as exc:
         # Keep the underlying error visible instead of a bare failure message.
         self.fail('Still not working: %r' % (exc,))
     else:
         # Doubling a float32 is exact, so an exact comparison is appropriate.
         self.assertTrue(np.array_equal(a_doubled, 2 * a))
Exemple #7
0
    def test_constant_memory(self):
        """Round-trip 32 floats through a ``__constant__`` array.

        The host array is written to the device symbol ``const_array``, a
        32-thread kernel copies it into a global buffer, and the buffer read
        back must equal the host array element for element.
        """
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function("copy_constant_into_global")
        # get_global returns a pair; only the first item (the address) is used
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0,255,(32,)).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(
                global_result_array,
                grid=(1, 1), block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        # .all must be *called*; the bare bound method is always truthy
        assert (host_result_array == host_array).all()
Exemple #8
0
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    """Run ``n`` launches of the Poisson blending kernel and return the image.

    Parameters
    ----------
    source_im, dest_im, interior_buffer : arrays exposing ``nbytes``
        Uploaded unchanged to the device before the first launch.
    b_size, g_size : tuple
        CUDA block and grid dimensions; ``b_size[0]`` and ``b_size[1]`` are
        also substituted into the kernel template.
    RGB, neighbors
        Values substituted into the kernel template.
    n : int
        Number of kernel launches.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array created from ``dest_im`` and overwritten with the
        contents of the device destination buffer.
    """
    # create Cheetah template and fill in variables for Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # alloc memory in GPU
    # NOTE(review): out_image is uint8 while the device buffer holds
    # dest_im.nbytes bytes -- presumably dest_im is uint8 as well; confirm.
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # calls CUDA for Poisson Blending n # of times
    for _ in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

    # retrieves the final output image and returns
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
    def runTest(self):
        """Check that a direct incident source writes the expected field values.

        ``self.args`` supplies the grid size, the field name, the two corner
        points and whether the source value is an array or a scalar.  After one
        ``update_e``/``update_h`` cycle, the field buffer inside the slice
        between the two points must equal ``value * tfunc(1)`` exactly.
        """
        nx, ny, nz, str_f, pt0, pt1, is_array = self.args
        slice_xyz = common.slices_two_points(pt0, pt1)

        # generate random source
        if is_array:
            shape = common.shape_two_points(pt0, pt1)
            value = np.random.rand(*shape).astype(np.float32)
        else:
            value = np.random.ranf()

        # instance
        fields = Fields(0, nx, ny, nz, '', 'single')
        try:
            tfunc = lambda tstep: np.sin(0.03*tstep)
            # Constructed for its effect on `fields`; the object itself is not used again.
            IncidentDirect(fields, str_f, pt0, pt1, tfunc, value)

            # host allocations
            eh = np.zeros(fields.ns_pitch, dtype=fields.dtype)

            # verify
            eh[slice_xyz] = fields.dtype(value) * fields.dtype(tfunc(1))
            fields.update_e()
            fields.update_h()

            copy_eh_buf = fields.get_buf(str_f)
            copy_eh = np.zeros_like(eh)
            cuda.memcpy_dtoh(copy_eh, copy_eh_buf)

            original = eh[slice_xyz]
            copy = copy_eh[slice_xyz]
            norm = np.linalg.norm(original - copy)
            self.assertEqual(norm, 0, '%s, %g' % (self.args, norm))
        finally:
            # Pop the context even when the assertion fails, so a failing
            # case does not leave it pushed.
            fields.context_pop()
Exemple #10
0
 def fromGPU(self, shared_mem, buff_dtype=np.float32 ):
     """Copy ``self.gpu_data`` into the shared buffer and return the 2-D view.

     The object returned by ``shared_mem.get_obj()`` is viewed as an array of
     ``buff_dtype``, cut to ``buffer_nnets * buffer_nsamples`` elements and
     reshaped to ``(buffer_nnets, buffer_nsamples)`` before the device data
     is copied into it.
     """
     flat = np.frombuffer(shared_mem.get_obj(), dtype=buff_dtype)
     n_nets = self.buffer_nnets
     n_samples = self.buffer_nsamples
     view = flat[:n_nets * n_samples].reshape(n_nets, n_samples)
     cuda.memcpy_dtoh(view, self.gpu_data)
     return view
Exemple #11
0
    def calcV1complex(self, stim, speed):
        """Compute V1 complex cell responses of a frame.

        Runs the processing stages in order (load input, linear filtering,
        rectification, blur, normalization, direction steering for ``speed``)
        and returns the contents of ``self.d_respV1c`` as a flat float32 array
        of ``nrX * nrY * nrDirs`` values.
        """
        # put the stimulus on the device
        self._loadInput(stim)

        # linear V1 filter responses
        self._calcV1linear()

        # rectification -> V1 simple cell firing rate
        self._calcV1rect()

        # spatial pooling -> V1 complex
        self._calcV1blur()

        # divisive normalization
        self._calcV1normalize()

        # steer filters in the requested directions
        self._calcV1direction(speed)

        # fetch the responses from the device
        n_responses = self.nrX*self.nrY*self.nrDirs
        responses = np.zeros(n_responses, dtype=np.float32)
        cuda.memcpy_dtoh(responses, self.d_respV1c)

        return responses
Exemple #12
0
def scenario_inplace_padded_C2R(batch,tic,toc):

  n = array([2*BENG_CHANNELS_],int32)
  inembed = array([16*(BENG_CHANNELS//16+1)],int32)
  onembed = array([2*inembed[0]],int32)
  plan = cufft.cufftPlanMany(1, n.ctypes.data, inembed.ctypes.data, 1, inembed[0],
  	                                       onembed.ctypes.data, 1, onembed[0],
  					       cufft.CUFFT_C2R, batch)

  data_shape = (batch,inembed[0])
  cpu_data = standard_normal(data_shape) + 1j * standard_normal(data_shape)
  cpu_data = cpu_data.astype(complex64)
  gpu_data  = cuda.mem_alloc(8*batch*inembed[0])		# complex64
  cuda.memcpy_htod(gpu_data,cpu_data)

  tic.record()
  cufft.cufftExecC2R(plan,int(gpu_data),int(gpu_data))
  toc.record()
  toc.synchronize()

  cpu_result = np.empty(batch*onembed[0],dtype=np.float32)
  cuda.memcpy_dtoh(cpu_result,gpu_data)
  cpu_result = cpu_result.reshape((batch,onembed[0]))[:,:2*BENG_CHANNELS_]/(2*BENG_CHANNELS_)
  result = irfft(cpu_data[:,:BENG_CHANNELS],axis=-1)
  print 'Batched in-place scenario'
  print 'test passed:',np.allclose(cpu_result,result)
  print 'GPU time:', tic.time_till(toc),' ms =  ',tic.time_till(toc)/(batch*0.5*13.128e-3),' x real (both SB)' 
    def calculate (self, data, f_high, f_bins):
        """Run ``culsp_kernel`` over ``data`` on the GPU and return the result.

        ``data`` is uploaded, the kernel compiled from ``self._kernelStr`` is
        launched with a single 4x4x1 block, and the device buffer is copied
        into a new array shaped like ``data``.  ``f_high`` and ``f_bins`` are
        accepted but not used by this body.
        """
        import pycuda.driver as driver
        import pycuda.compiler as compiler
        import pycuda.autoinit

        log = logging.getLogger("astroplpython.function.signal")
        log.debug("CULSP.calculate() called")

        log.debug("Orig Data:"+str(data))

        log.debug(" TODO: Calculate blocksize")

        log.debug("set up GPU, allocate memory for working")
        n_bytes = data.size * data.dtype.itemsize
        device_buf = driver.mem_alloc(n_bytes)

        log.debug("push data into GPU memory")
        driver.memcpy_htod(device_buf, data)

        log.debug("compile and run the culsp_kernel on data in the GPU")
        module = compiler.SourceModule(self._kernelStr)
        culsp_func = module.get_function("culsp_kernel")
        culsp_func(device_buf, block=(4, 4, 1))

        log.debug("pull data from GPU back into main memory")
        result = np.empty_like(data)
        driver.memcpy_dtoh(result, device_buf)

        log.debug("return result")
        return result
Exemple #14
0
def cuda_crossOver(sola, solb):
    """Run the ``crossOver`` kernel on two solutions and return the outcome.

    Both arrays are uploaded, the kernel from the module-level ``mod`` is
    launched with one block of ``len(sola)`` threads, and both device buffers
    are copied back into new arrays.

    Returns a tuple ``(a_new, b_new)`` shaped like ``sola`` and ``solb``.
    When the module-level ``debug`` flag is set, the inputs and outputs are
    printed.
    """
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)

    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)

    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        # Print the actual inputs; the former names `a` and `b` were undefined.
        print("a: %s" % (sola,))
        print("b: %s" % (solb,))
        print("new a: %s" % (a_new,))
        print("new b: %s" % (b_new,))

    return a_new, b_new
	def calc_bandwidth_d2h( s ):
		"""Time one device-to-host copy and return the measured bandwidth.

		Copies ``s.dev_a`` into ``s.a`` and returns ``s.nbytes`` divided by the
		elapsed wall-clock seconds and by the module-level ``gbytes``.
		"""
		started = datetime.now()
		cuda.memcpy_dtoh(s.a, s.dev_a)
		elapsed = datetime.now() - started
		# seconds plus microseconds of the timedelta, as a float
		elapsed_seconds = elapsed.seconds + elapsed.microseconds*1e-6

		return s.nbytes/elapsed_seconds/gbytes
Exemple #16
0
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
    """Run the mask kernel on ``source_im`` and return the resulting buffer.

    Parameters
    ----------
    source_im : array exposing ``nbytes``
        Uploaded to the device; the kernel receives only this buffer.
    dest_im : array
        Supplies the template ``WIDTH``/``HEIGHT`` and the shape of the
        returned array.
    b_size, g_size : tuple
        CUDA block and grid dimensions; ``b_size[0]`` and ``b_size[1]`` are
        also substituted into the kernel template.
    RGB, neighbors
        Values substituted into the kernel template.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array created from ``dest_im`` and overwritten with the
        device buffer contents after the kernel has run.
    """
    # create Cheetah template and fill in variables for mask kernel
    mask_template = Template(mask_source)
    mask_template.BLOCK_DIM_X = b_size[0]
    mask_template.BLOCK_DIM_Y = b_size[1]
    mask_template.WIDTH = dest_im.shape[1]
    mask_template.HEIGHT = dest_im.shape[0]
    mask_template.RGB = RGB
    mask_template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    mask_kernel = cuda_compile(mask_template, "mask_kernel")

    # alloc memory to GPU
    d_source = cu.mem_alloc(source_im.nbytes)
    cu.memcpy_htod(d_source, source_im)

    # sends to GPU filter out interior points in the mask
    mask_kernel(d_source, block=b_size, grid=g_size)

    # retrieves interior point buffer from GPU
    # NOTE(review): the host array is sized from dest_im (as uint8) while the
    # device buffer was sized from source_im -- presumably both have the same
    # byte size; confirm.
    inner_buffer = np.array(dest_im, dtype=np.uint8)
    cu.memcpy_dtoh(inner_buffer, d_source)

    # returns the interior buffer
    return inner_buffer
Exemple #17
0
    def test_prepared_invocation(self):
        """Exercise ``prepare``/``prepared_call`` with and without a pointer offset.

        The first launch doubles all 16 elements; the second launch starts one
        element into the buffer with 15 threads, so elements 1..15 end up at
        four times their original value.
        """
        host = np.random.randn(4,4).astype(np.float32)
        device = drv.mem_alloc(host.nbytes)

        drv.memcpy_htod(device, host)

        mod = SourceModule("""
            __global__ void doublify(float *a)
            {
              int idx = threadIdx.x + threadIdx.y*blockDim.x;
              a[idx] *= 2;
            }
            """)

        doublify = mod.get_function("doublify")
        doublify.prepare("P")
        doublify.prepared_call((1, 1), (4,4,1), device, shared_size=20)
        doubled = np.empty_like(host)
        drv.memcpy_dtoh(doubled, device)
        print(host)
        print(doubled)
        assert la.norm(doubled-2*host) == 0

        # now with offsets: skip the first element of the buffer
        doublify.prepare("P")
        quadrupled = np.empty_like(host)
        offset_ptr = int(device)+host.dtype.itemsize
        doublify.prepared_call((1, 1), (15,1,1), offset_ptr)
        drv.memcpy_dtoh(quadrupled, device)
        assert la.norm(quadrupled[1:]-4*host[1:]) == 0
Exemple #18
0
    def calc_psd(self,bitloads,xtalk):
        """Launch ``k_calcpsd`` for ``self.K`` combinations and return the P matrix.

        ``bitloads`` (flattened through ``util.mat2arr`` and cast to int32) and
        ``xtalk`` (cast to ``self.type``) are uploaded, the kernel is launched
        with the geometry from ``_workload_calc``, and the ``(K, N)`` result is
        returned as float64.  Device buffers are released even if the launch or
        a copy raises.
        """
        #Number of expected permutations
        Ncombinations=self.K

        #Check if this is getting hairy and assign grid/block dimensions
        (_warpcount,_warpperblock,threadCount,blockCount) = self._workload_calc(Ncombinations)

        threadshare_grid=(blockCount,1)
        threadshare_block=(threadCount,1,1)

        # Byte sizes are computed directly instead of building throw-away
        # zero arrays just to read their nbytes.
        fp_size=np.dtype(self.type).itemsize
        int_size=np.dtype(np.int32).itemsize

        #Memory (We get away with the NCombinations because calpsd checks against it)
        d_a=cuda.mem_alloc(int(Ncombinations*self.N*self.N*fp_size))
        d_p=cuda.mem_alloc(int(Ncombinations*self.N*fp_size))
        d_bitload=cuda.mem_alloc(int(self.K*self.N*int_size))
        d_XTG=cuda.mem_alloc(int(self.K*self.N*self.N*fp_size))
        try:
            h_p=np.zeros((self.K,self.N)).astype(self.type)
            cuda.memcpy_htod(d_bitload,util.mat2arr(bitloads).astype(np.int32))
            cuda.memcpy_htod(d_XTG,xtalk.astype(self.type))
            #Go solve
            #__global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){

            self.k_calcpsd(d_a,d_p,d_XTG,d_bitload,np.int32(Ncombinations),block=threadshare_block,grid=threadshare_grid)
            cuda.Context.synchronize()
            cuda.memcpy_dtoh(h_p,d_p)
        finally:
            d_a.free()
            d_bitload.free()
            d_XTG.free()
            d_p.free()
        return h_p.astype(np.float64)
Exemple #19
0
def test_compare_order():
    '''
    compare_order between C(row-major), F(column-major)
    '''
    compare_order = mod_cu.get_function('compare_order')


    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx,ny), order='C')
    f_2d_F = f_1d.reshape((nx,ny), order='F')

    print ''
    print 'f_1d_C\n\n', f_1d
    print 'f_2d_C\n', f_2d_C
    print 'f_2d_F\n', f_2d_F

    print ''
    print 'after cuda'
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_C\n', ret_f_1d

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_F\n', ret_f_1d
Exemple #20
0
def GPU():
    """Load the image named by ``sys.argv[1]`` and run ``process_pixel`` on it.

    The image is printed (metadata and pixel data), uploaded to the device as
    ``np.array(im)``, processed by one 4x4x1 thread block and copied back into
    ``newpixels``.  Nothing is returned or saved.
    """
    im = Image.open(sys.argv[1])
    print sys.argv[1], ": ", im.format, im.size, im.mode, '\n'

    # First conversion is only used for the printout below
    pixels = np.array(im.getdata())
    #r, g, b = im.split()
    print pixels
    #print pixels[:,0].nbytes
    # This is the array that is actually uploaded
    pixels = np.array(im)
    
    gpu = cuda.mem_alloc(pixels.nbytes)
    cuda.memcpy_htod(gpu, pixels)
    # NOTE(review): the kernel declares three int* parameters (r, g, b) but is
    # launched below with a single buffer -- confirm the intended signature
    # and the element type of `pixels`.
    kernel = SourceModule("""
        #define MAX_PIXEL_VALUE 255
        #define THRESHOLD 50

        __global__ void process_pixel(int *r, int *g, int *b)
        {
            int id = blockDim.x * blockIdx.x + threadIdx.x;

            if ((r[id] > THRESHOLD) && (g[id] > THRESHOLD) && (b[id] > THRESHOLD)) {
                r[id] = MAX_PIXEL_VALUE;
                g[id] = MAX_PIXEL_VALUE;
                b[id] = MAX_PIXEL_VALUE;
            }
        }
        """)

    func = kernel.get_function("process_pixel")
    # Single block of 4x4 threads; no grid argument is given
    func(gpu, block=(4,4,1))

    # Result is downloaded but not used further in this function
    newpixels = np.zeros_like(pixels)
    cuda.memcpy_dtoh(newpixels, gpu)
Exemple #21
0
def diffuse_pycuda(u):
    """Run the diffusion kernels on ``u`` and return the float32 result.

    The first row and first column of the *input* array are set to 200 in
    place, then a float32 copy is uploaded.  ``copy_array`` and ``update`` are
    launched ``nt + 1`` times, where ``nt = ceil(0.4 / 1e-5)``, and the device
    array is copied back and returned.

    ``u`` must be two-dimensional.  Only ``nx`` is passed to the kernels and
    used for the grid, so they presumably expect a square array -- confirm.
    """
    nx, ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5/(nx-1))
    dt = np.float32(1e-05)
    total_time = np.float32(0.4)
    nt = np.int32(np.ceil(total_time/dt))

    # Boundary values (this writes into the caller's array)
    u[0,:]=200
    u[:,0]=200

    u = u.astype(np.float32)
    u_prev = u.copy()

    u_d = cuda.mem_alloc(u.nbytes)
    u_prev_d = cuda.mem_alloc(u_prev.nbytes)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # Integer ceiling division: np.ceil(nx/BLOCKSIZE) floors first under
    # Python 2 integer division and leaves the grid one block short whenever
    # nx is not a multiple of BLOCKSIZE.
    blocks_per_side = (int(nx) + BLOCKSIZE - 1) // BLOCKSIZE
    gridSize = (blocks_per_side, blocks_per_side, 1)
    blockSize = (BLOCKSIZE,BLOCKSIZE,1)

    for _ in range(nt+1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)

    cuda.memcpy_dtoh(u, u_d)

    return u
Exemple #22
0
def main(context, stream, plan1, N1, N2, g_buf1, g_buf2):
    """Run batched forward/inverse FFTs of ``N1`` sine signals and plot them.

    ``N1`` sines of length ``N2`` are built, zero-padded to
    ``nearest_2power(N2)`` samples and stacked as columns of ``y``.  The
    transforms are computed with numpy and with ``plan1.execute`` -- once on
    gpuarrays and once on the raw device buffers ``g_buf1``/``g_buf2``.  If the
    module-level ``Plot`` flag is set, the input and the four GPU results are
    plotted and the program exits via ``SystemExit``.

    ``context`` and ``stream`` are not used in this body.
    """
    #N1 = # ffts applied
    #N2 = dim of ffts
    x = np.linspace(0, 2 * np.pi, N2)

    # First signal: unpadded copy goes to ys, zero-padded copy to y
    y = np.sin(2 * x)
    ys = y
    ys = ys.reshape(1,N2)
    y = np.concatenate((y,np.zeros(nearest_2power(N2)-N2)))
    y = y.reshape(1,nearest_2power(N2))

    for i in xrange(N1-1): #append N1-1 sines
        yi = np.sin(2 * (i+2) * x)
        yis = yi
        yis = yis.reshape(1,N2)
        ys = np.concatenate(((ys),(yis)),0)
        yi = np.concatenate((yi,np.zeros(nearest_2power(N2)-N2)))
        yi = yi.reshape(1,nearest_2power(N2))
        y = np.concatenate(((y),(yi)),0)
        
    # From here on each signal is a column of y; yim is an all-zero companion array
    y = y.transpose()
    yim= np.zeros(y.shape)
    y = np.array(y,np.float64)
    yw = y.transpose()
    yimw = yim.transpose()

    # numpy reference transforms (computed but not compared or plotted below)
    aw = np.fft.fft(ys,int(nearest_2power(N2)),1)
    bw = np.real(np.fft.ifft(aw,int(nearest_2power(N2)),1))
    aw0 = np.fft.fft(y,int(nearest_2power(N2)),0)
    bw0 = np.real(np.fft.ifft(aw0,int(nearest_2power(N2)),0))
    
    # Variant 1: gpuarray inputs; results are read back from gpu_testmat
    gpu_testmat = gpuarray.to_gpu(y)
    gpu_testmatim = gpuarray.to_gpu(yim)
    plan1.execute(gpu_testmat, gpu_testmatim, batch=N1) 
    gfft = gpu_testmat.get() #get fft result
    plan1.execute(gpu_testmat, gpu_testmatim, inverse=True, batch=N1) 
    gifft = np.real(gpu_testmat.get()) #get ifft result
    
    # Variant 2: caller-supplied raw device buffers
    cuda.memcpy_htod(g_buf1, y)
    cuda.memset_d32(g_buf2, 0, yim.size*2) # double all zero bits should be zero again. This function only works for 32 bit, so we need twice as many
    plan1.execute(g_buf1, g_buf2, batch=N1) 
    grfft=np.empty_like(y)
    cuda.memcpy_dtoh(grfft, g_buf1)  #fft result
    plan1.execute(g_buf1, g_buf2, inverse=True, batch=N1) 
    grifft=np.empty_like(y)
    cuda.memcpy_dtoh(grifft, g_buf1) #ifft result

    if Plot:
        # NOTE(review): newer numpy releases may reject threshold=np.nan -- confirm
        np.set_printoptions(threshold=np.nan)
        
        #plot cuda fft results
        f, axarr = plt.subplots(5, sharex=False)
        axarr[0].plot(y)
        axarr[1].plot(gfft)
        axarr[2].plot(gifft)
        axarr[3].plot(grfft)
        axarr[4].plot(grifft)
        plt.show()
        raise SystemExit    
Exemple #23
0
def save_data(data, data_package):
	"""Register ``data`` under its data package and optionally dump it as a PNG.

	A ``numpy.ndarray`` is uploaded through ``to_device_with_time``; a
	``cuda.DeviceAllocation`` is stored as is.  In both cases the package is
	marked as holding a device pointer.  Any other input type trips an
	assertion.  The package is then recorded in the module-level ``gpu_list``
	and ``data_list``.  When the module-level ``log_type`` is ``'image'`` or
	``'all'`` and the package shape is two-dimensional, the device data is
	copied back and written to ``./result/``.
	"""
	# if data is numpy.ndarray, copy to GPU and save only devptr
	dp = data_package

	data_name = dp.data_name
	data_range = dp.data_range
	shape = dp.data_shape
	u, ss, sp = dp.get_id()
	
	#log("rank%d, \"%s\", u=%d, saved"%(rank, data_name, u),'general',log_type)
	buf_dtype = type(data)

	if buf_dtype == numpy.ndarray:
		dp.memory_type = 'devptr'
		dp.data_dtype = 'cuda_memory'
		dp.devptr = to_device_with_time(data, data_name, u)
	elif buf_dtype == cuda.DeviceAllocation:
		dp.memory_type = 'devptr'
		dp.data_dtype = 'cuda_memory'
		dp.devptr = data
	elif buf_dtype == 'cuda_memory':
		# NOTE(review): buf_dtype is a type object here, so comparing it with
		# a string looks like it can never match -- confirm intent.
		pass
	else:
		assert(False)

	if dp.devptr == None:
		assert(False)
	# Remember which (u, ss, sp) ids live on the GPU
	target = (u,ss,sp)
	if target not in gpu_list:
		gpu_list.append(target)

	# save image
	if log_type in ['image', 'all']:
		if len(dp.data_shape) == 2 and dp.memory_type == 'devptr':
			image_data_shape = dp.data_memory_shape
			md = dp.data_contents_memory_dtype
			a = numpy.empty(image_data_shape,dtype=md)
#			cuda.memcpy_dtoh_async(a, dp.devptr, stream=stream[1])
			cuda.memcpy_dtoh(a, dp.devptr)
			ctx.synchronize()

			buf = a
			extension = 'png'
			dtype = dp.data_contents_dtype
			chan = dp.data_contents_memory_shape
			buf = buf.astype(numpy.uint8)
				
			# Image mode follows the channel count; any other value leaves
			# `img` unassigned before the save below.
			if chan == [1]: img = Image.fromarray(buf, 'L')
			elif chan == [3]: img = Image.fromarray(buf, 'RGB')
			elif chan == [4]: img = Image.fromarray(buf, 'RGBA')
					
			e = os.system("mkdir -p result")
			img.save('./result/%s%s%s.png'%(u,ss,sp), format=extension)


	# From here on `u` is the unique id, used as the first-level key of data_list
	u = dp.get_unique_id()
	if u not in data_list: data_list[u] = {}
	if ss not in data_list[u]:data_list[u][ss] = {}
	data_list[u][ss][sp] = dp
Exemple #24
0
	def dtoh(self,nam):
		"""Copy the device buffer of the argument named ``nam`` to its host array.

		``nam`` is looked up in ``self.nam_args``; the matching entries of
		``self.cu_args`` (source) and ``self.args`` (destination) are used.
		Returns ``None`` without doing anything when pycuda cannot be imported
		or initialised.
		"""
		try:
			import pycuda.driver as cuda
			import pycuda.autoinit
		except Exception:
			# Best effort when no usable CUDA stack is present; unlike a bare
			# except this lets KeyboardInterrupt and SystemExit propagate.
			return
		var_id = self.nam_args.index(nam)
		cuda.memcpy_dtoh(self.args[var_id], self.cu_args[var_id])
Exemple #25
0
 def get_from_gpu(self):
     """Refresh ``self.C`` and ``self.num`` from the device buffer ``self._cptr``.

     The buffer of ``self.nbytes()`` bytes is downloaded in one piece.  Its
     first ``self.C.nbytes`` bytes are reinterpreted with the dtype and shape
     of the current ``self.C``; the remainder is reinterpreted with the dtype
     of the current ``self.num``.  Does nothing when ``self._cptr`` is None.
     """
     if self._cptr is None:
         return
     # Raw byte staging buffer for the whole device allocation
     raw = np.empty(self.nbytes(), dtype=np.uint8)
     cuda.memcpy_dtoh(raw, self._cptr)
     # Read the split point before self.C is replaced
     c_nbytes = self.C.nbytes
     # reshape returns the array; ndarray.resize works in place and returns
     # None, which used to leave self.C set to None.
     self.C = np.frombuffer(raw[:c_nbytes],
                            dtype=self.C.dtype).reshape(self.C.shape).copy()
     self.num = np.frombuffer(raw[c_nbytes:],
                              dtype=self.num.dtype).copy()
Exemple #26
0
def confirmInitialization(featuresForSOM,somMatrix):
    """Plot how often each SOM node is the best-matching unit for the samples.

    For up to ``somMatrixRows * somMatrixColumns`` rows of ``featuresForSOM``
    the ``computeBMU`` kernel is run, the winning (block, thread) pair is
    taken from the returned distance and index arrays, and its hit count is
    incremented.  The counts are shown as a colour image.

    ``featuresForSOM`` is accessed with ``.loc``, ``.columns`` and ``len``
    (presumably a pandas DataFrame -- confirm).  Besides its arguments the
    function reads the module-level names ``somMatrixRows``,
    ``somMatrixColumns``, ``features``, ``computeBMU``, ``MAX_FEAT``, ``INF``,
    ``metric`` and ``blk``.  The name ``pycuda`` is used for device memory
    calls here (presumably an alias of ``pycuda.driver`` -- confirm).
    """
    #allocate memory for the somcuda on the device
    somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes)
    somBytesPerRow = np.int32(somMatrix.strides[0])
    somNumberOfRows = np.int32(somMatrix.shape[0])
    somNumberOfColumns = np.int32(somMatrix.shape[1])
    pycuda.memcpy_htod(somMatrixPtr,somMatrix)
    #allocate space for bmu index
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr,bmu)
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr,bmuIndex)
    # The next four values are not used again in this function
    intraDayOffset = features.columns.get_loc('Ret_121')
    dayOffset = features.columns.get_loc('Ret_PlusOne')
    objVal = 0.0;
    objSampSize=0.0
    # rows x columns x 3 image of zeros, filled with one colour per node below
    r = [[[0.0 for k in range(0,3)] for i in range(somMatrixColumns)] for j in range (somMatrixRows)] 
    nodeHitMatrix = np.array(r).astype(np.float32)
    hitCountDict = defaultdict(list)
    # Use at most one sample per SOM node, and never more than are available
    samples = [x for x in range (0, somMatrixRows*somMatrixColumns)]
    if len(samples) >= len(featuresForSOM):
        samples = [x for x in range (0, len(featuresForSOM))]       
    for i in samples:
        feats = featuresForSOM.loc[i].as_matrix().astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr,feats)
        #find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, np.int32(len(featuresForSOM.columns)),  somBytesPerRow, somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),np.int32(metric),block=(blk,1,1),grid=(somNumberOfRows,1))
        pycuda.memcpy_dtoh(bmu,bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex,bmuIndexPtr)
        # Winning node: entry with the smallest bmu value, and the index stored for it
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        # hitCountDict entries are [count, first sample index]
        val = hitCountDict[(block,thread)]
        if val == None or len(val) == 0:
            hitCountDict[(block,thread)] = [1,i]
        else:
            hitCountDict[(block,thread)][0] += 1
        val = np.int32(hitCountDict[(block,thread)])[0]
        # Map the hit count to a packed 0x00RRGGBB colour (see the plot title)
        if val == 1:
            val = 0x0000ff00
        elif val <= 10:
            val = 0x000000ff
        elif val <= 100:
            val = 0x00ff0000
        else:
            val = 0x00ffffff
        bval = (val & 0x000000ff)
        gval = ((val & 0x0000ff00) >> 8)
        rval = ((val & 0x00ff0000) >> 16)
        nodeHitMatrix[block][thread] = [rval/255.0,gval/255.0,bval/255.0]
    fig20 = plt.figure(20,figsize=(6*3.13,4*3.13))
    fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20)
    ax = plt.subplot(111)
    somplot = plt.imshow(nodeHitMatrix,interpolation="none")
    plt.show()
    plt.pause(0.1)
Exemple #27
0
def GenerateFractal(dimensions,position,zoom,iterations,block=(20,20,1), report=False, silent=False):
	"""Render a fractal with the ``genChunk`` kernel and return it as int32 array.

	Parameters
	----------
	dimensions : sequence of two ints
		Shape of the result array.
	position : sequence of two numbers
		Scaled by ``zoom`` and shifted by half the result shape before use.
	zoom, iterations
		Converted to float32 / int32 and passed to the kernel.
	block : tuple
		Used both to derive the chunk size and as the launch *grid*; every
		block runs a single thread.
	report : bool
		Poll the progress counter and draw a progress bar while it is below
		``dimensions[0] * dimensions[1]``.
	silent : bool
		Suppress the status and timing output.

	Returns
	-------
	numpy.ndarray
		The result array, with its centre element set to ``iterations + 1``.
	"""
	chunkSize = numpy.array([dimensions[0]/block[0],dimensions[1]/block[1]],dtype=numpy.int32)
	zoom = numpy.float32(zoom)
	iterations = numpy.int32(iterations)
	blockDim = numpy.array([block[0],block[1]],dtype=numpy.int32)
	result = numpy.zeros(dimensions,dtype=numpy.int32)

	#Center position
	position = Vector(position[0]*zoom,position[1]*zoom)
	position = position - (Vector(result.shape[0],result.shape[1])/2)
	position = numpy.array([int(position.x),int(position.y)]).astype(numpy.float32)

	#For progress reporting:
	ppc = cuda.pagelocked_zeros((1,1),numpy.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) #pagelocked progress counter
	ppc[0,0] = 0
	ppc_ptr = numpy.intp(ppc.base.get_device_pointer()) #pagelocked memory counter, device pointer to
	#End progress reporting

	#Copy parameters over to device
	chunkS = In(chunkSize)
	posit = In(position)
	blockD = In(blockDim)
	zoo = In(zoom)
	iters = In(iterations)
	res = In(result)

	if not silent:
		print("Calling CUDA function. Starting timer. progress starting at: "+str(ppc[0,0]))
	start_time = time.time()

	# One thread per block; the `block` argument supplies the grid dimensions
	genChunk(chunkS, posit, blockD, zoo, iters, res, ppc_ptr, block=(1,1,1), grid=block)
	
	if report:
		total = (dimensions[0]*dimensions[1])
		print "Reporting up to "+str(total)+", "+str(ppc[0,0])
		# Poll the host-visible counter until it reaches the pixel count
		while ppc[0,0] < ((dimensions[0]*dimensions[1])):
			pct = (ppc[0,0]*100)/(total)
			hashes = "#"*pct
			dashes = "-"*(100-pct)
			print "\r["+hashes+dashes+"] "+locale.format("%i",ppc[0,0],grouping=True)+"/"+locale.format("%i",total,grouping=True),
			time.sleep(0.00001)


	cuda.Context.synchronize()
	if not silent:
		print "Done. "+str(ppc[0,0])

	#Copy result back from device
	# NOTE(review): `res` is the In(result) wrapper rather than an explicit
	# device allocation -- confirm this copy-back works as intended.
	cuda.memcpy_dtoh(result, res)

	if not silent: 
		end_time = time.time()
		elapsed_time = end_time-start_time
		print("Done with call. Took "+str(elapsed_time)+" seconds. Here's the repr'd arary:\n")
		print(result)
		
	result[result.shape[0]/2,result.shape[1]/2]=iterations+1 #mark center of image
	return result
Exemple #28
0
  def __gini(self, n_samples, indices_offset, si_gpu_in):
    """Find the best split for ``n_samples`` samples using the GPU kernels.

    Five prepared kernels run in sequence (label totals, their reduction,
    impurity computation, its reduction, and the final minimum search).  The
    buffer behind ``self.impurity_left`` is then copied into
    ``self.min_imp_info``.

    Returns ``(min_left, min_right, row, col)`` where the first two are
    entries 0 and 1 of ``min_imp_info``, ``col`` is entry 2 as int and
    ``row`` is ``self.features_array`` indexed by entry 3 as int.
    """
    n_block, n_range = self.__get_block_size(n_samples)

    # Launch geometries shared by the kernels below
    grid_per_block = (self.max_features, n_block)
    grid_per_feature = (self.max_features, 1)
    compute_threads = (self.COMPUTE_THREADS_PER_BLOCK, 1, 1)
    reduce_threads = (32, 1, 1)

    # Start of the sample indices for this call
    indices_ptr = si_gpu_in.ptr + indices_offset

    self.scan_total_2d.prepared_call(
        grid_per_block,
        compute_threads,
        indices_ptr,
        self.labels_gpu.ptr,
        self.label_total_2d.ptr,
        self.features_array_gpu.ptr,
        n_range,
        n_samples)

    self.scan_reduce.prepared_call(
        grid_per_feature,
        reduce_threads,
        self.label_total_2d.ptr,
        n_block)

    self.comput_total_2d.prepared_call(
        grid_per_block,
        compute_threads,
        indices_ptr,
        self.samples_gpu.ptr,
        self.labels_gpu.ptr,
        self.impurity_2d.ptr,
        self.label_total_2d.ptr,
        self.min_split_2d.ptr,
        self.features_array_gpu.ptr,
        n_range,
        n_samples)

    self.reduce_2d.prepared_call(
        grid_per_feature,
        reduce_threads,
        self.impurity_2d.ptr,
        self.impurity_left.ptr,
        self.impurity_right.ptr,
        self.min_split_2d.ptr,
        self.min_split.ptr,
        n_block)

    self.find_min_kernel.prepared_call(
        (1, 1),
        reduce_threads,
        self.impurity_left.ptr,
        self.impurity_right.ptr,
        self.min_split.ptr,
        self.max_features)

    cuda.memcpy_dtoh(self.min_imp_info, self.impurity_left.ptr)
    info = self.min_imp_info
    min_left, min_right = info[0], info[1]
    col = int(info[2])
    row = self.features_array[int(info[3])]
    return min_left, min_right, row, col
Exemple #29
0
    def get(self):
        """Return the device data as a host array trimmed to the I/O shape.

        A buffer of ``self.datashape``/``self.dtype`` receives ``self.data``;
        the last axis is then cut to ``self.ioshape[-1]`` entries.
        """
        # Host-side staging buffer with the full stored shape
        host = np.empty(self.datashape, dtype=self.dtype)

        # Device -> host
        cuda.memcpy_dtoh(host, self.data)

        # Trim the last axis to the expected I/O width
        io_width = self.ioshape[-1]
        return host[..., :io_width]
    def wait(self):
        """Gather every source buffer into the target and copy it to the host.

        ``kernel_copy`` is launched once per entry of ``self.source_bufs`` with
        the entry's index as int32; afterwards ``self.target_buf`` is copied
        into ``self.host_array``.
        """
        nx, ny, nz_pitch = self.mainf.ns_pitch

        for shift_idx, source_buf in enumerate(self.source_bufs):
            shift = np.int32(shift_idx)
            self.kernel_copy(
                nx, ny, nz_pitch, shift, self.target_buf, source_buf,
                grid=self.mainf.gs, block=self.mainf.bs)

        cuda.memcpy_dtoh(self.host_array, self.target_buf)
Exemple #31
0
def chambolle_pock_TVl1_CUDA(image, clambda, tau, sigma, iters=100):
    r"""2D TV-l1 CUDA solver using the Chambolle-Pock primal-dual method.

    Parameters
    ----------
    image : numpy array
        The noisy 2D image to process. It is converted to float32 before
        being uploaded to the device.
    clambda : float
        The non-negative weight in the optimization problem
        (passed to the ``dual`` kernel).
    tau : float
        Parameter of the proximal operator (passed to the ``dual`` kernel).
    sigma : float
        Step parameter passed to the ``primal`` kernel.
    iters : int
        Number of iterations allowed.

    Returns
    -------
    tuple
        ``(result, 0)`` where ``result`` is the float32 image copied back
        from the device after the ``solution`` kernel ran; the second entry
        is always the constant ``0``.
    """
    print("2D Primal-Dual TV-l1 CUDA solver using Chambolle-Pock method")

    start_time = timeit.default_timer()

    (h, w) = image.shape
    nc = 1

    def _load_module(filename):
        # Read the kernel source inside a context manager so the file handle
        # is closed deterministically instead of being left to the GC.
        path = '../bilevel_imaging_toolbox/cuda/tvl1/' + filename
        with open(path, 'r') as source_file:
            return SourceModule(source_file.read())

    # Load Modules
    init_module = _load_module('tvl1_init.cu')
    primal_module = _load_module('tvl1_primal.cu')
    dual_module = _load_module('tvl1_dual.cu')
    extrapolate_module = _load_module('tvl1_extrapolate.cu')
    solution_module = _load_module('tvl1_solution.cu')

    # Host copy of the image; converted once and reused for sizing, upload
    # and download.
    h_img = image.astype(np.float32)
    nbyted = h_img.nbytes

    # Memory Allocation
    d_imgInOut = drv.mem_alloc(nbyted)
    d_x = drv.mem_alloc(nbyted)
    d_xbar = drv.mem_alloc(nbyted)
    d_xcur = drv.mem_alloc(nbyted)
    d_y1 = drv.mem_alloc(nbyted)
    d_y2 = drv.mem_alloc(nbyted)

    # Copy host memory
    drv.memcpy_htod(d_imgInOut, h_img)

    # Launch configuration. Integer ceil-division: the previous
    # np.ceil((w + 15) / 16) rounded up twice under true division and
    # launched a spare block whenever a dimension was a multiple of 16.
    block = (16, 16, 1)
    grid = (int((w + block[0] - 1) // block[0]),
            int((h + block[1] - 1) // block[1]))

    # Function definition
    init_func = init_module.get_function('init')
    primal_func = primal_module.get_function('primal')
    dual_func = dual_module.get_function('dual')
    extrapolate_func = extrapolate_module.get_function('extrapolate')
    solution_func = solution_module.get_function('solution')

    # Scalars must be passed to the kernels as sized numpy types.
    w = np.int32(w)
    h = np.int32(h)
    nc = np.int32(nc)
    sigma = np.float32(sigma)
    tau = np.float32(tau)
    clambda = np.float32(clambda)

    # Initialization
    init_func(d_xbar, d_xcur, d_x, d_y1, d_y2, d_imgInOut,
              w, h, nc,
              block=block, grid=grid)

    for _ in range(iters):
        primal_func(d_y1, d_y2, d_xbar,
                    sigma, w, h, nc,
                    block=block, grid=grid)
        dual_func(d_x, d_xcur, d_y1, d_y2, d_imgInOut,
                  tau, clambda, w, h, nc,
                  block=block, grid=grid)
        extrapolate_func(d_xbar, d_xcur, d_x,
                         np.float32(0.5), w, h, nc,
                         block=block, grid=grid)
    solution_func(d_imgInOut, d_x, w, h, nc, block=block, grid=grid)

    drv.memcpy_dtoh(h_img, d_imgInOut)

    print(
        "Finished Chambolle-Pock TV-l1 CUDA denoising in %d iterations and %f sec"
        % (iters, timeit.default_timer() - start_time))

    return (h_img, 0)
Exemple #32
0
    def compute(self, col_idx, prefix_sums, cpm_resolution):
        """Iterate the CPM kernels on the GPU and return the number of classified hits.

        ``col_idx`` and ``prefix_sums`` are handed to ``ready_input`` and then
        forwarded to the kernels; ``cpm_resolution`` is converted to float32.
        The loop stops once the relative change of the summed partial CPM
        scores drops to ``cpm_thresh`` or the iteration counter exceeds
        ``iterations_limit``. The return value is the sum of the per-node
        hit classes copied back from the device.
        """

        def launch(kernel, *kernel_args):
            # every kernel in this method shares the same launch configuration
            kernel(*kernel_args,
                   block=self.threads,
                   grid=self.grid,
                   stream=None,
                   shared=0)

        # check input
        cpm_resolution = np.float32(cpm_resolution)
        d_col_idx = ready_input(col_idx)
        d_prefix_sums = ready_input(prefix_sums)

        # prepare GPU memory; each per-node array has a committed copy and,
        # where needed, a temporary copy that the current iteration works on

        # community assignment per node
        community_idx = np.arange(self.N).astype(np.int32)
        d_community_idx = allocate_and_copy(community_idx)
        d_tmp_community_idx = allocate_and_copy(community_idx)

        # community sizes
        community_sizes = np.ones(self.N).astype(np.int32)
        d_community_sizes = allocate_and_copy(community_sizes)
        d_tmp_community_sizes = allocate_and_copy(community_sizes)

        # community inter connecting edges
        community_inter = np.zeros(self.N).astype(np.int32)
        d_community_inter = allocate_and_copy(community_inter)
        d_tmp_community_inter = allocate_and_copy(community_inter)

        # partial cpm for each community
        part_cpm = np.zeros(self.N).astype(np.float32)
        d_part_cpm = allocate_and_copy(part_cpm)

        # config
        iterations_limit = 15  # limit of cpm iterations
        cpm_thresh = 0.02  # relative-change threshold deciding if another cpm iteration is needed

        iterations = 0  # counter
        iter_cpm_score = 0  # cpm score of the previous accepted iteration

        # size of one int32 per-node array, used for the device-to-device copies
        per_node_nbytes = community_idx.nbytes

        # CPM execution:
        while True:

            # calculate best community assignment
            launch(self.move_nodes,
                   self.N, d_col_idx, d_prefix_sums, d_community_idx,
                   d_community_sizes, d_tmp_community_idx,
                   d_tmp_community_sizes, cpm_resolution)

            # memory reset: fresh zeroed device arrays for this iteration
            d_tmp_community_inter = allocate_and_copy(
                np.zeros(self.N).astype(np.int32))
            d_part_cpm = allocate_and_copy(np.zeros(self.N).astype(np.float32))

            # calculate interconnecting edges per community
            launch(self.community_internal_edges,
                   self.N, d_col_idx, d_prefix_sums, d_tmp_community_idx,
                   d_tmp_community_inter)

            # calculate partial cpm per community
            launch(self.calculate_part_cpm,
                   self.N, d_tmp_community_inter, d_tmp_community_sizes,
                   d_part_cpm, cpm_resolution)

            # calculate overall cpm score
            drv.memcpy_dtoh(part_cpm, d_part_cpm)
            current_cpm_score = sum(part_cpm)

            # relative improvement against the previous iteration; the very
            # first pass has no reference score and always continues
            cpm_diff = (abs((current_cpm_score - iter_cpm_score) /
                            iter_cpm_score)
                        if iter_cpm_score != 0 else 1)

            # NOTE(review): because of the strict `>`, the loop can run up to
            # iterations_limit + 2 passes -- confirm this is intended
            if cpm_diff <= cpm_thresh or iterations > iterations_limit:
                # terminate if improvement below threshold or iteration limit reached
                break

            # prepare next iteration
            iterations += 1
            iter_cpm_score = current_cpm_score

            # commit the temporary results of this iteration
            drv.memcpy_dtod(d_community_idx, d_tmp_community_idx,
                            per_node_nbytes)
            drv.memcpy_dtod(d_community_sizes, d_tmp_community_sizes,
                            per_node_nbytes)
            drv.memcpy_dtod(d_community_inter, d_tmp_community_inter,
                            per_node_nbytes)

        # classify communities
        community_class = np.zeros(self.N).astype(np.int32)
        d_community_class = allocate_and_copy(community_class)
        launch(self.classify_communities,
               self.N, d_community_inter, d_community_sizes,
               d_community_class)

        # classify hits
        hit_class = np.zeros(self.N).astype(np.int32)
        d_hit_class = allocate_and_copy(hit_class)
        launch(self.classify_hits,
               self.N, d_community_idx, d_community_class, d_hit_class)

        # get classified hits
        drv.memcpy_dtoh(hit_class, d_hit_class)
        return sum(hit_class)
Exemple #33
0
import atexit

# Demo script: report the PyCUDA / driver versions, create a context on
# device 0, and round-trip a small array through GPU memory.
print("pycuda module version : ",pycuda.VERSION)
# print(pycuda.VERSION_TEXT)
# Initialise the CUDA driver API before querying it.
drv.init()
print("CUDA toolkit driver version : ",drv.get_version())
print("cuda gpu in this system : ",drv.Device.count())

# Everything below uses the first device only.
dev = drv.Device(0)
print("GPU name : ",dev.name())
# Only the type of the attribute container is printed, not its contents.
print("If you want to check device attributes,\
	check this dictionary : ",type(dev.get_attributes()))
# Create a context on the device; it is popped automatically at interpreter
# exit through the atexit hook registered below.
ctx = dev.make_context()
print(ctx.get_device())
# ctx.pop()
atexit.register(ctx.pop,)

# mem_get_info() is queried once per printed entry; each value is divided by
# 1024 twice before being labelled 'MB'.
print("global memory (free,total) : ",\
	[drv.mem_get_info()[i]/1024/1024 for i in range(2)], 'MB')

# Host -> device -> host round trip of a 10-element array.
a = np.arange(10)
# print(a)
a_gpu = drv.mem_alloc(a.nbytes)
drv.memcpy_htod(a_gpu,a)
# empty_like does not initialise its contents, so the "before" print shows
# whatever happened to be in that memory.
a_rcv = np.empty_like(a)
print("a_rcv before: ",a_rcv)
drv.memcpy_dtoh(a_rcv,a_gpu)
print("a_rcv after: ",a_rcv)
# ctx.push()
# ctx.detach()
Exemple #34
0
	start, stop = cuda.Event(), cuda.Event()
	exec_time = {'update_h':np.zeros(tmax), 'mpi_recv_h':np.zeros(tmax), 'memcpy_htod_h':np.zeros(tmax), 'mpi_send_h':np.zeros(tmax), 'memcpy_dtoh_h':np.zeros(tmax), 
			'update_e':np.zeros(tmax), 'mpi_recv_e':np.zeros(tmax), 'memcpy_htod_e':np.zeros(tmax), 'mpi_send_e':np.zeros(tmax), 'memcpy_dtoh_e':np.zeros(tmax), 
			'src_e':np.zeros(tmax)}

# main loop
# Host staging buffers, one (ny, nz) single-precision plane per field
# component; only hy_tmp and hz_tmp are used in the part of the loop below.
ey_tmp = np.zeros((ny,nz),'f')
ez_tmp = np.zeros_like(ey_tmp)
hy_tmp = np.zeros_like(ey_tmp)
hz_tmp = np.zeros_like(ey_tmp)
for tn in xrange(1, tmax+1):
	# rank 1 is the process that collects timings
	if rank == 1: start.record()
	# launch update_h once per entry of bpg_list; the second argument is the
	# entry's index times MBy, passed as int32
	for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args)

	if rank == 0:
		# copy one (ny, nz) float32 plane from the device, starting
		# (nx-1)*ny*nz elements into hy_gpu / hz_gpu
		# (presumably the last x-plane of an (nx, ny, nz) array -- confirm)
		cuda.memcpy_dtoh(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) 
		cuda.memcpy_dtoh(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) 
		# ship both planes to rank 1 (tags 20 and 21)
		comm.Send(hy_tmp, 1, 20)
		comm.Send(hz_tmp, 1, 21)
	elif rank == 1:
		# close the update_h timing interval and store it for this step
		stop.record()
		stop.synchronize()
		exec_time['update_h'][tn-1] = stop.time_since(start)
		start.record()

		# receive the planes sent by rank 0 (matching tags 20 and 21)
		comm.Recv(hy_tmp, 0, 20)
		comm.Recv(hz_tmp, 0, 21)
		# close the MPI receive timing interval and restart the timer
		stop.record()
		stop.synchronize()
		exec_time['mpi_recv_h'][tn-1] = stop.time_since(start)
		start.record()
Exemple #35
0
    #funBC(ftemp_g, feq_g, fin_g,block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY))     
#     
#     stop.record()
#     stop.synchronize()
    #cudaDeviceSynchronize();

#    time_gpu = start.time_till(stop)
    
    # Every Pinterval iterations, and only when VTK or plot output is
    # requested, pull the fields back from the GPU for post-processing.
    # NOTE(review): `&` and `|` are bitwise operators; they act as logical
    # and/or here only if SaveVTK and SavePlot are booleans -- confirm.
    if( (It%Pinterval == 0) & (SaveVTK | SavePlot)) :

#         print('time spent in gpu (in millisecs) at this iteration is ',time_gpu)
#         print('So, Gpu bandwidth ( in GBPS) is ', 35*8*0.000001/(time_gpu))
#         
        # keep the previously downloaded velocity before it is overwritten
        u_past = u.copy()
        #cuda.memcpy_dtoh(fin,fin_g)
        # download density and velocity from the device
        cuda.memcpy_dtoh(rho,rho_g)
        cuda.memcpy_dtoh(u,u_g)
        # transpose the density and both velocity components
        # (the in-place assignment requires each u component to be square)
        rho = rho.transpose()
        u[0,:,:] = u[0,:,:].transpose()
        u[1,:,:] = u[1,:,:].transpose()   

        print ('current iteration :', It)
        #print (np.mean(u[0,:,0])/uLB)
        # squared velocity magnitude, normalised by uLB squared
        Usquare = u[0,:,:]**2 + u[1,:,:]**2
        
        Usquare = Usquare/(uLB**2)
        # width of the border band that is masked out below
        BCoffset = int(xsize/40)
        # replacing all boundaries with nan to get location of vortices
        Usquare[0:BCoffset,:] = nan ; Usquare[:,0:BCoffset] = nan
        Usquare[xsize_max-BCoffset:xsize,:] = nan;Usquare[:,ysize_max-BCoffset:ysize] = nan
        # 2D index of the smallest remaining (non-NaN) normalised speed
        Loc1 = np.unravel_index(np.nanargmin(Usquare),Usquare.shape)
Exemple #36
0
def _walk_on_current_gpu(raw, slices, allLabels, indices, nbrw, sorw, name,
                         foundAxis):
    """Run the walk kernel for every label and return the per-label maps.

    For each axis flagged in ``foundAxis`` the labelled slices, their plane
    indices and a per-slice ``Beta`` array are uploaded to the GPU. Then, for
    every entry of ``allLabels``, the accumulator volume is reset with the
    fill kernel, the walk kernel is launched once per flagged axis, and the
    accumulator is added into that label's slot of the result.

    Returns a float32 array of shape ``(len(allLabels),) + raw.shape``.
    """
    walkmap = np.zeros((len(allLabels), ) + raw.shape, dtype=np.float32)

    # pick the kernel matching the input type; uint8 data is shifted by 128
    # and handed over as int8, everything else is processed as float32
    if raw.dtype == 'uint8':
        kernel = _build_kernel_int8()
        raw = (raw - 128).astype('int8')
    else:
        kernel = _build_kernel_float32()
        raw = raw.astype(np.float32)

    fill_gpu = _build_kernel_fill()

    # volume extents, also as int32 kernel arguments
    depth, height, width = raw.shape
    xsh_gpu = np.int32(width)
    ysh_gpu = np.int32(height)
    zsh_gpu = np.int32(depth)

    # launch geometry covering the whole volume (used by the fill kernel)
    block = (32, 32, 1)
    grid2 = (int((width // 32) + 1), int((height // 32) + 1), int(depth))

    # per-axis bookkeeping; entries stay None for axes without labels
    n_slices = [None] * 3
    indices_gpu = [None] * 3
    beta_gpu = [None] * 3
    slices_gpu = [None] * 3
    slice_h = [None] * 3
    slice_w = [None] * 3

    print(indices)

    for axis, found in enumerate(foundAxis):
        if not found:
            continue
        axis_indices = np.array(indices[axis], dtype=np.int32)
        axis_slices = slices[axis].astype(np.int32)
        n_slices[axis], slice_h[axis], slice_w[axis] = axis_slices.shape
        indices_gpu[axis] = gpuarray.to_gpu(axis_indices)
        slices_gpu[axis] = gpuarray.to_gpu(axis_slices)

        # accumulate _calc_var over all labels for every labelled slice
        Beta = np.zeros(axis_slices.shape, dtype=np.float32)
        for m in range(n_slices[axis]):
            for label in allLabels:
                area = _calc_label_walking_area(axis_slices[m], label)
                plane = axis_indices[m]
                # take the raw plane this slice refers to along the axis
                if axis == 0:
                    raw_tmp = raw[plane]
                elif axis == 1:
                    raw_tmp = raw[:, plane]
                elif axis == 2:
                    raw_tmp = raw[:, :, plane]
                Beta[m] += _calc_var(raw_tmp.astype(float), area)
        beta_gpu[axis] = gpuarray.to_gpu(Beta)

    sorw = np.int32(sorw)
    nbrw = np.int32(nbrw)
    raw_gpu = gpuarray.to_gpu(raw)

    # host/device accumulator reused for every label
    a = np.empty(raw.shape, dtype=np.float32)
    a_gpu = cuda.mem_alloc(a.nbytes)

    for label_counter, segment in enumerate(allLabels):
        progress = ('%s:' % (name) + ' ' + str(label_counter + 1) + '/' +
                    str(len(allLabels)))
        print(progress)

        # reset the accumulator before walking this label
        fill_gpu(a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)
        segment_gpu = np.int32(segment)

        for axis, found in enumerate(foundAxis):
            if not found:
                continue
            axis_gpu = np.int32(axis)
            # launch geometry sized to this axis' labelled slices
            grid = (int((slice_w[axis] // 32) + 1),
                    int((slice_h[axis] // 32) + 1),
                    int(n_slices[axis]))
            kernel(axis_gpu, segment_gpu, raw_gpu, slices_gpu[axis], a_gpu,
                   xsh_gpu, ysh_gpu, zsh_gpu, indices_gpu[axis], sorw,
                   beta_gpu[axis], nbrw,
                   block=block, grid=grid)

        cuda.memcpy_dtoh(a, a_gpu)
        walkmap[label_counter] += a

    return walkmap
Exemple #37
0
    def process(self, img, **kwargs):
        """Segment an image by clustering pre-processed pixel intensities on the GPU.

        Parameters
        ----------
        img : Dataset or 2D array
            A ``Dataset`` is replaced by its ``pixel_array``.
        **kwargs
            all_segments (bool, default True): return the full label map;
                otherwise return a binary mask of the cluster whose center
                is largest.
            max_it (int, default 100): number of clustering iterations.
            n_clusters (int, default 2): number of clusters.
            numpass (default 5), median_radius (default 10): forwarded to
                ``median_otsu``.
            high_intensity_threshold (default 0.1): threshold applied to the
                difference between the image and its 15x15 blur.
            blur_radius (default 5): kernel size of the final blur.

        Returns
        -------
        numpy array of shape (h, w)
            int32 labels when ``all_segments`` is true, uint8 mask otherwise.

        Raises
        ------
        ValueError
            If ``max_it`` or ``n_clusters`` is not a positive ``int``.
        """
        if isinstance(img, Dataset):
            img = img.pixel_array
        w = img.shape[1]
        h = img.shape[0]
        n = w * h

        all_segments = kwargs.get('all_segments', True)
        max_it = kwargs.get('max_it', 100)
        # the checks reject zero and non-integers too, so the messages say so
        if not isinstance(max_it, int) or max_it <= 0:
            raise ValueError('Number of iterations should be a positive integer')
        n_clusters = kwargs.get('n_clusters', 2)
        if not isinstance(n_clusters, int) or n_clusters <= 0:
            raise ValueError('Number of clusters should be a positive integer')

        numpass = kwargs.get('numpass', 5)
        median_radius = kwargs.get('median_radius', 10)
        high_intensity_threshold = kwargs.get('high_intensity_threshold', 0.1)
        blur_radius = kwargs.get('blur_radius', 5)

        # Pre-processing: mask, normalise to [0, 1], then remove pixels that
        # stand out from their blurred neighbourhood before smoothing.
        img, _ = median_otsu(img, numpass=numpass, median_radius=median_radius)
        # NOTE(review): a constant image makes this denominator zero
        img = (img - np.min(img)) / (np.max(img) - np.min(img))
        blurred = cv.blur(img, (15, 15))
        edges = np.clip(img - blurred, 0.0, 1.0)
        edges[edges > high_intensity_threshold] = 1.0
        edges[edges <= high_intensity_threshold] = 0.0
        edges = cv.dilate(edges, np.ones((3, 3)), iterations=1)
        img = np.clip(img - edges, 0.0, 1.0)
        img = cv.erode(img, np.ones((3, 3)), iterations=1)
        img = cv.blur(img, (blur_radius, blur_radius))

        # flat float32 view of the image for the kernels
        src = img.astype(np.float32)
        src = src.reshape((-1))

        # random initial cluster centers
        centers = np.random.rand(n_clusters).astype(np.float32)

        # Image
        src_gpu = cuda.mem_alloc(src.nbytes)
        cuda.memcpy_htod(src_gpu, src)

        # Cluster centers
        centers_gpu = cuda.mem_alloc(centers.nbytes)
        cuda.memcpy_htod(centers_gpu, centers)

        # Labels: allocated directly as int32 rather than casting
        # uninitialised float memory to integers
        labels = np.empty(src.shape, dtype=np.int32)
        labels_gpu = cuda.mem_alloc(labels.nbytes)
        cuda.memcpy_htod(labels_gpu, labels)

        module = SourceModule(Template(SRC).render(N=n))
        relabel = module.get_function('relabel')
        calculate_clusters = module.get_function('calculateClusters')
        find_centers = module.get_function('findCenters')

        # launch geometry and scalar arguments shared by all iterations
        n_blocks = (n + BLOCKDIM - 1) // BLOCKDIM
        n_gpu = np.int32(n)
        k_gpu = np.int32(n_clusters)

        for _ in range(max_it):
            relabel(src_gpu,
                    centers_gpu,
                    n_gpu,
                    k_gpu,
                    labels_gpu,
                    block=(BLOCKDIM, 1, 1),
                    grid=(n_blocks, 1))
            for c in range(n_clusters):
                c_gpu = np.int32(c)
                calculate_clusters(src_gpu,
                                   labels_gpu,
                                   n_gpu,
                                   c_gpu,
                                   block=(BLOCKDIM, 1, 1),
                                   grid=(n_blocks, 1),
                                   shared=8 * BLOCKDIM)
                # NOTE(review): this launches n_blocks threads in a single
                # block, which exceeds the per-block thread limit for large
                # images -- confirm the supported image sizes.
                # Shared memory is 8 bytes per launched thread; the previous
                # `8 * (n + BLOCKDIM - 1) // BLOCKDIM` multiplied before the
                # floor division and did not yield that size.
                find_centers(n_gpu,
                             c_gpu,
                             centers_gpu,
                             block=(n_blocks, 1, 1),
                             grid=(1, 1),
                             shared=8 * n_blocks)

        cuda.memcpy_dtoh(labels, labels_gpu)
        cuda.memcpy_dtoh(centers, centers_gpu)

        if all_segments:
            return labels.reshape((h, w))

        # binary mask of the cluster with the largest center
        c_index = np.argmax(centers)
        flat = np.zeros(n, dtype=np.uint8)
        flat[labels == c_index] = 1
        return flat.reshape((h, w))
Exemple #38
0
	def pre_filter( self, array, return_gpu=True ):
		""" res = spline.pre_filter( array, return_gpu = True )
		
		Pre-filter a data array to prepare for spline interpolation.
		
		array is a 2D array; it is forced to little endian, uploaded to
		the GPU, multiplied by the gain and then filtered row by row.
		
		Returns an allocation object for the array in gpu memory if
		return_gpu = true, otherwise the pre filtered array.  Returns
		False if CUDA could not be prepared. """
		
		# make sure we are ready to execute
		if not self._prepare_cuda():
			return False
		
		import time
		now = time.time()
		
		# make sure the data array is little endian type
		array = self.force_le( array )
		
		# array size
		( ncols, nrows ) = array.shape
		
		# allocate memory for array and copy to gpu
		array_gpu = drv.mem_alloc( array.nbytes )
		drv.memcpy_htod( array_gpu, array )
		
		# NOTE: the second pass (transpose -> apply gain -> filter rows ->
		# transpose back) is currently disabled, so only one direction is
		# filtered.  The scratch buffer for the transposed array is
		# therefore no longer allocated: it was never used and never
		# freed.  Allocate it again if that pass is re-enabled.
		
		# first apply gain.  Might as well apply it with GPU since
		# the array is already there
		blocks = int( np.ceil( float( array.size )/self.nthreads ) )
		self.cuda_apply_gain( array_gpu, np.int64( array.size ), grid=(blocks,1), block=(self.nthreads,1,1) )
		
		# now filter.  Rows are filtered individually.  Therefore
		# Each GPU thread will filter one row
		blocks = int( np.ceil( float( nrows )/self.nthreads ) )
		self.cuda_pre_filter_rows( array_gpu, np.int32( nrows ), np.int32( ncols ), grid=(blocks,1), block=(self.nthreads,1,1) )
		
		# return gpu array; the caller then owns the allocation
		if return_gpu:
			return array_gpu
		
		# otherwise fetch the array back out of GPU memory
		output = np.empty_like( array )
		drv.memcpy_dtoh( output, array_gpu )
		array_gpu.free()
		
		# elapsed wall-clock time; single-argument form prints the same
		# under Python 2 and Python 3
		print( time.time()-now )
		
		# and return
		return output