Example #1
    def batch_indexing(self, planes, data_points):

        data_size = data_points.shape[0] // 128

        self.benchmark_begin('preparing')

        gpu_alloc_objs = []

        # for data points

        #addresses = [] 
        #for point in data_points:
        #    point_addr = drv.to_device(point)
        #    gpu_alloc_objs.append(point_addr)
        #    addresses.append(int(point_addr))

        #np_addresses = numpy.array(addresses).astype(numpy.uint64)

        # 64 bit addressing space. each point costs 8 bytes
        #arrays_gpu = drv.mem_alloc(np_addresses.shape[0] * 8)
        #drv.memcpy_htod(arrays_gpu, np_addresses)

        # for planes

        planes_addresses = [] 
        for plane in planes:
            plane_addr = drv.to_device(plane)
            gpu_alloc_objs.append(plane_addr)
            planes_addresses.append(int(plane_addr))

        planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64)

        # 64 bit addressing space. each plane pointer costs 8 bytes
        planes_arrays_gpu = drv.mem_alloc(planes_np_addresses.shape[0] * 8)
        drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses)

        # projections
 
        projections = numpy.zeros(data_size).astype(numpy.uint64)

        length = numpy.array([data_size]).astype(numpy.uint64)
 
        print "total: " + str(data_size) + " data points to indexing." 

        self.benchmark_end('preparing')
        self.benchmark_begin('cudaing')

        self.indexing_kernel(
            planes_arrays_gpu, drv.In(data_points), drv.Out(projections), drv.In(length),
            block = self.block, grid = self.grid)
        
        self.benchmark_end('cudaing')

        #count = 0
        #for pro in projections:
        #    print "count: " + str(count) + " " + str(pro)
        #    count += 1
        #print projections.shape

        return projections
Example #2
    def _set(self, ary):
        # Allocate a new buffer with suitable padding and assign
        buf = np.zeros(self.datashape, dtype=self.dtype)
        buf[...,:self.ioshape[-1]] = ary

        # Copy
        cuda.memcpy_htod(self.data, buf)
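For symmetry, a minimal sketch of the read-back path (a hypothetical `_get`, not in the original source, assuming the same padded `datashape`/`ioshape` layout):

    def _get(self):
        # Read back the padded device buffer and strip the padding columns
        buf = np.empty(self.datashape, dtype=self.dtype)
        cuda.memcpy_dtoh(buf, self.data)
        return buf[..., :self.ioshape[-1]]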
Example #3
	def set_round_drill( self , size ) :
		sx , sy = self.get_scale()
		nx , ny = int(size / sx + .5) , int(size / sy + .5)

		self.drillflat = False

		print 'Setting round drill:'
		print size
		print sx , sy 
		print nx , ny

		self.hdrill = np.zeros( (nx,ny) , np.float32 )

		size /= 2.0
		for x in range(nx) :
			for y in range(ny) :
				fx = (x-int(nx/2+.5)) * sx
				fy = (y-int(ny/2+.5)) * sy 
				ts = size*size - fx*fx - fy*fy
				self.hdrill[x,y] = -m.sqrt( ts ) + size if ts > 0 else size*2

		self.drillrad = size

		print self.hdrill
		print self.drillrad

		self.cdrill = cuda_driver.mem_alloc( self.hdrill.nbytes )
		cuda_driver.memcpy_htod( self.cdrill , self.hdrill )

		self.grid = map( int , ( m.ceil(nx/22.0) , m.ceil(ny/22.0) ) )
		self.block = ( min(nx,22) , min(ny,22) , 1 )

		print self.grid 
		print self.block
Example #4
    def __init__(self, view_tile, size, sigma, debug=False):
        self.debug = debug
        if size[0] < 2 or size[1] < 2:
            raise ValueError("Split needs to be at least 2x2")

        self.data_sets = view_tile.get_Data()
        for dset in self.data_sets:
            data = dset.getDataSet()
            if not data.flags['C_CONTIGUOUS']:
                print "NOT CONTIGUOUS, trying to reformat the points"
                data = np.require(data, dtype=data.dtype, requirements=['C'])
                if not data.flags['C_CONTIGUOUS']:
                    raise Exception("Points are not contiguous")
                dset.setDataSet(data)

        self.view_tile = view_tile
        self.sigma = sigma
        self.pts_gpu = None

        # Initiates all of cuda stuff
        self.grid = np.zeros(size).astype(np.float32)
        self.grid_gpu = cuda.mem_alloc_like(self.grid)
        cuda.memcpy_htod(self.grid_gpu, self.grid)

        kernel = SourceModule(self.__cuda_code)
        self.gpu_gaussian = kernel.get_function("gpu_gaussian")

        self.view = self.view_tile.get_View()

        self.grid_size, self.block_size = self.__setup_cuda_sizes(size)

        self.dx = 1 / float(size[1] - 1)
        self.dy = 1 / float(size[0] - 1)
Example #5
def edgetaper_gpu(y_gpu, sf, win='barthann'):

  shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(shape[1])/block_size[0])),
               int(np.ceil(float(shape[0])/block_size[1])))

  # Ensure that sf is odd
  sf = sf+(1-np.mod(sf,2))
  wx = scipy.signal.get_window(win, sf[1])
  wy = scipy.signal.get_window(win, sf[0])
  maxw = wx.max() * wy.max()
  
  hsf = np.floor(sf/2).astype(np.int32)
  wx = (wx[0:hsf[1]] / maxw).astype(dtype)
  wy = (wy[0:hsf[0]] / maxw).astype(dtype)

  preproc = _generate_preproc(dtype, shape)
  preproc += '#define wx_size %d\n' % wx.size
  preproc += '#define wy_size %d\n' % wy.size
  mod = SourceModule(preproc + edgetaper_code, keep=True)
  edgetaper_gpu = mod.get_function("edgetaper")
  wx_gpu, wx_size = mod.get_global('wx')
  wy_gpu, wy_size = mod.get_global('wy')

  cu.memcpy_htod(wx_gpu, wx)
  cu.memcpy_htod(wy_gpu, wy)

  edgetaper_gpu(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]),
                block=block_size, grid=grid_size)
Example #6
    def test_prepared_invocation(self):
        a = np.random.randn(4,4).astype(np.float32)
        a_gpu = drv.mem_alloc(a.size * a.dtype.itemsize)

        drv.memcpy_htod(a_gpu, a)

        mod = SourceModule("""
            __global__ void doublify(float *a)
            {
              int idx = threadIdx.x + threadIdx.y*blockDim.x;
              a[idx] *= 2;
            }
            """)

        func = mod.get_function("doublify")
        func.prepare("P")
        func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20)
        a_doubled = np.empty_like(a)
        drv.memcpy_dtoh(a_doubled, a_gpu)
        print (a)
        print (a_doubled)
        assert la.norm(a_doubled-2*a) == 0

        # now with offsets
        func.prepare("P")
        a_quadrupled = np.empty_like(a)
        func.prepared_call((1, 1), (15,1,1), int(a_gpu)+a.dtype.itemsize)
        drv.memcpy_dtoh(a_quadrupled, a_gpu)
        assert la.norm(a_quadrupled[1:]-4*a[1:]) == 0
Example #7
    def test_constant_memory(self):
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function("copy_constant_into_global")
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0,255,(32,)).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(
                global_result_array,
                grid=(1, 1), block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        assert (host_result_array == host_array).all()
Example #8
def loop(iterations):
    ts = 0
    while ts < iterations:
        # To avoid overwrites, a temporary copy is made of F
        T[:] = F
        cuda.memcpy_htod(T_gpu, T)

        # Propagate
        prop(F_gpu, T_gpu,
             block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY))

        # Calculate density and get bounceback from obstacle nodes
        density(F_gpu, BOUND_gpu, BOUNCEBACK_gpu, DENSITY_gpu, UX_gpu, UY_gpu,
                block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY))

        # Calculate equilibrium
        eq(F_gpu, FEQ_gpu, DENSITY_gpu, UX_gpu, UY_gpu, U_SQU_gpu, U_C2_gpu,
           U_C4_gpu, U_C6_gpu, U_C8_gpu, block=(blockDimX,blockDimY,1),
           grid=(gridDimX,gridDimY))

        # Transfer bounceback to obstacle nodes
        bounceback(F_gpu, BOUNCEBACK_gpu, BOUND_gpu,
                   block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY))

        # Copy F to host for copy to T in beginning of loop
        cuda.memcpy_dtoh(F, F_gpu)

        ts += 1
Example #9
File: types.py Project: pv101/PyFR
    def _set(self, ary):
        # Allocate a new buffer with suitable padding and pack it
        buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
        buf[:, :self.ncol] = self._pack(ary)

        # Copy
        cuda.memcpy_htod(self.data, buf)
Example #10
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
   
  # Determine number of threads and blocks for the GPU
  number_of_blobs = np.int32(len(r_vectors))
  threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

  # Get parameters from arguments
  L = kwargs.get('periodic_length')
  eps = kwargs.get('repulsion_strength')
  b = kwargs.get('debye_length')
  blob_radius = kwargs.get('blob_radius')

  # Reshape arrays
  x = np.reshape(r_vectors, number_of_blobs * 3)
  f = np.empty_like(x)
        
  # Allocate GPU memory
  x_gpu = cuda.mem_alloc(x.nbytes)
  f_gpu = cuda.mem_alloc(f.nbytes)
    
  # Copy data to the GPU (host to device)
  cuda.memcpy_htod(x_gpu, x)
    
  # Get blob-blob force function
  force = mod.get_function("calc_blob_blob_force")

  # Compute mobility force product
  force(x_gpu, f_gpu, np.float64(eps), np.float64(b), np.float64(blob_radius), np.float64(L[0]), np.float64(L[1]), np.float64(L[2]), number_of_blobs, block=(threads_per_block, 1, 1), grid=(num_blocks, 1)) 
   
  # Copy data from GPU to CPU (device to host)
  cuda.memcpy_dtoh(f, f_gpu)

  return np.reshape(f, (number_of_blobs, 3))
Example #11
    def prepare_device_arrays(self):

        self.maxLayers  = self.grid_prop.GetMaxLayers()
        nczbins_fine    = len(self.czcen_fine)
        numLayers       = np.zeros(nczbins_fine,dtype=np.int32)
        densityInLayer  = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE)
        distanceInLayer = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE)

        self.grid_prop.GetNumberOfLayers(numLayers)
        self.grid_prop.GetDensityInLayer(densityInLayer)
        self.grid_prop.GetDistanceInLayer(distanceInLayer)

        # Copy all these earth info arrays to device:
        self.d_numLayers       = cuda.mem_alloc(numLayers.nbytes)
        self.d_densityInLayer  = cuda.mem_alloc(densityInLayer.nbytes)
        self.d_distanceInLayer = cuda.mem_alloc(distanceInLayer.nbytes)
        cuda.memcpy_htod(self.d_numLayers,numLayers)
        cuda.memcpy_htod(self.d_densityInLayer,densityInLayer)
        cuda.memcpy_htod(self.d_distanceInLayer,distanceInLayer)

        self.d_ecen_fine = cuda.mem_alloc(self.ecen_fine.nbytes)
        self.d_czcen_fine = cuda.mem_alloc(self.czcen_fine.nbytes)
        cuda.memcpy_htod(self.d_ecen_fine,self.ecen_fine)
        cuda.memcpy_htod(self.d_czcen_fine,self.czcen_fine)

        return
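A matching cleanup sketch (a hypothetical `free_device_arrays`, assuming the attributes allocated above; PyCUDA's DeviceAllocation objects also free themselves when garbage-collected):

    def free_device_arrays(self):
        # Release the device buffers allocated in prepare_device_arrays()
        for buf in (self.d_numLayers, self.d_densityInLayer,
                    self.d_distanceInLayer, self.d_ecen_fine, self.d_czcen_fine):
            buf.free()
        return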
Example #12
    def _read_LPU_input(self, in_gpot_dict, in_spike_dict):
        """
        Put inputs from other LPUs to buffer.

        """

        for other_lpu, gpot_data in in_gpot_dict.iteritems():
            i = self.other_lpu_map[other_lpu]
            if self.num_input_gpot_neurons[i] > 0:
                cuda.memcpy_htod(int(int(self.buffer.gpot_buffer.gpudata) \
                    +(self.buffer.gpot_current * self.buffer.gpot_buffer.ld \
                    + self.my_num_gpot_neurons + self.cum_virtual_gpot_neurons[i]) \
                    * self.buffer.gpot_buffer.dtype.itemsize), gpot_data)
                if self.debug:
                    self.in_gpot_files[other_lpu].root.array.append(gpot_data.reshape(1,-1))
            
        
        #Will need to change this if only spike indexes are transmitted
        for other_lpu, sparse_spike in in_spike_dict.iteritems():
            i = self.other_lpu_map[other_lpu]
            if self.num_input_spike_neurons[i] > 0:
                full_spike = np.zeros(self.num_input_spike_neurons[i],dtype=np.int32)
                if len(sparse_spike)>0:
                    idx = np.asarray([self.input_spike_idx_map[i][k] \
                                      for k in sparse_spike], dtype=np.int32)
                    full_spike[idx] = 1

                cuda.memcpy_htod(int(int(self.buffer.spike_buffer.gpudata) \
                    +(self.buffer.spike_current * self.buffer.spike_buffer.ld \
                    + self.my_num_spike_neurons + self.cum_virtual_spike_neurons[i]) \
                    * self.buffer.spike_buffer.dtype.itemsize), full_spike)
Example #13
def cuda_crossOver(sola, solb):
    """ """
    
    sol_len = len(sola);
    
    a_gpu = cuda.mem_alloc(sola.nbytes);
    b_gpu = cuda.mem_alloc(solb.nbytes);
    
    cuda.memcpy_htod(a_gpu, sola);
    cuda.memcpy_htod(b_gpu, solb);
    
    func = mod.get_function("crossOver");
    func(a_gpu,b_gpu, block=(sol_len,1,1));
    
    a_new = numpy.empty_like(sola);
    b_new = numpy.empty_like(solb);
    
    cuda.memcpy_dtoh(a_new, a_gpu);
    cuda.memcpy_dtoh(b_new, b_gpu);
    
    if debug:
        print "a:", sola;
        print "b:", solb;
        print "new a:", a_new;
        print "new b:", b_new;
        
    return a_new,b_new;
Example #14
def compile_for_GPU(function_package, kernel_function_name='default'):
	kernel_code = ''
	if kernel_function_name == 'default':
		kernel_code = attachment
		source_module_dict[kernel_function_name] = CustomSourceModule(kernel_code)
	else:
		fp = function_package
		
		from vivaldi_translator import translate_to_CUDA
		function_name = fp.function_name

		Vivaldi_code = function_code_dict[function_name]
		
		function_code = translate_to_CUDA(Vivaldi_code=Vivaldi_code, function_name=function_name, function_arguments=fp.function_args)
		
		kernel_code = attachment + 'extern "C"{\n'
		kernel_code += function_code
		kernel_code += '\n}'

		if True: # print for debugging
			f = open('asdf.cu','w')
			f.write(kernel_code)
			f.close()

		#print function_code
		args = [kernel_code]
		source_module_dict[kernel_function_name] = CustomSourceModule(kernel_code)

		temp,_ = source_module_dict[kernel_function_name].get_global('DEVICE_NUMBER')
		cuda.memcpy_htod(temp, numpy.int32(device_number))
		
		func_dict[kernel_function_name] = source_module_dict[kernel_function_name].get_function(kernel_function_name)
		
		create_helper_textures(source_module_dict[kernel_function_name])
Example #15
    def from_np(np_data):
        cudabuf = cuda.mem_alloc(np_data.nbytes)
        cuda.memcpy_htod(cudabuf, np_data)
#        self.cpudata = np_data
        tensor = MyTensor(cudabuf, shape=np_data.shape, size=np_data.size)
        tensor.cpudata = np_data
        return tensor
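A complementary read-back sketch (a hypothetical `to_np`, not in the original source; it assumes the device buffer is reachable as `tensor.gpudata` and reuses `tensor.cpudata` for the dtype):

    def to_np(tensor):
        # Copy the device buffer back into a fresh host array
        out = np.empty(tensor.shape, dtype=tensor.cpudata.dtype)
        cuda.memcpy_dtoh(out, tensor.gpudata)
        return out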
Example #16
    def calc_psd(self,bitloads,xtalk):
        #Number of expected permutations
        Ncombinations=self.K
        
        #Check if this is getting hairy and assign grid/block dimensions
        (warpcount,warpperblock,threadCount,blockCount) = self._workload_calc(Ncombinations)

        #How many individual lk's
        memdim=blockCount*threadCount

        threadshare_grid=(blockCount,1)
        threadshare_block=(threadCount,1,1)
        
        #Memory (We get away with the NCombinations because calpsd checks against it)
        d_a=cuda.mem_alloc(np.zeros((Ncombinations*self.N*self.N)).astype(self.type).nbytes)
        d_p=cuda.mem_alloc(np.zeros((Ncombinations*self.N)).astype(self.type).nbytes)
        d_bitload=cuda.mem_alloc(np.zeros((self.K*self.N)).astype(np.int32).nbytes)
        d_XTG=cuda.mem_alloc(np.zeros((self.K*self.N*self.N)).astype(self.type).nbytes)
        h_p=np.zeros((self.K,self.N)).astype(self.type)
        cuda.memcpy_htod(d_bitload,util.mat2arr(bitloads).astype(np.int32))
        cuda.memcpy_htod(d_XTG,xtalk.astype(self.type))
        #Go solve
        #__global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){

        self.k_calcpsd(d_a,d_p,d_XTG,d_bitload,np.int32(Ncombinations),block=threadshare_block,grid=threadshare_grid)
        cuda.Context.synchronize()
        cuda.memcpy_dtoh(h_p,d_p)
        d_a.free()
        d_bitload.free()
        d_XTG.free()
        d_p.free()
        return h_p.astype(np.float64)
Example #17
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
	# create Cheetah template and fill in variables for the mask kernel
	mask_template = Template(mask_source)
	mask_template.BLOCK_DIM_X = b_size[0]
	mask_template.BLOCK_DIM_Y = b_size[1]
	mask_template.WIDTH = dest_im.shape[1]
	mask_template.HEIGHT = dest_im.shape[0]
	mask_template.RGB = RGB
	mask_template.NEIGHBORS = neighbors

	# compile the CUDA kernel
	mask_kernel = cuda_compile(mask_template, "mask_kernel")

	# allocate memory on the GPU and copy the source image over
	d_source = cu.mem_alloc(source_im.nbytes)
	cu.memcpy_htod(d_source, source_im)

	# run the kernel on the GPU to filter out interior points in the mask
	mask_kernel(d_source, block=b_size, grid=g_size)

	# retrieve the interior point buffer from the GPU
	inner_buffer = np.array(dest_im, dtype=np.uint8)
	cu.memcpy_dtoh(inner_buffer, d_source)

	# return the interior buffer
	return inner_buffer
Example #18
    def test_pycuda(self):
        """
        Test the pycuda installation with a small example.
        """
        try:
            import pycuda.driver as cuda
            import pycuda.autoinit
            from pycuda.compiler import SourceModule
            import numpy as np
            a = np.random.randn(4, 4)
            print(a)
            a = a.astype(np.float32)
            a_gpu = cuda.mem_alloc(a.nbytes)
            cuda.memcpy_htod(a_gpu, a)
            mod = SourceModule(
                """
                __global__ void doublify(float *a)
                {
                    int idx = threadIdx.x + threadIdx.y*4;
                    a[idx] *= 2;
                }
                """
            )
            func = mod.get_function("doublify")
            func(a_gpu, block=(4, 4, 1))
            a_doubled = np.empty_like(a)
            cuda.memcpy_dtoh(a_doubled, a_gpu)
            # verify the kernel actually doubled the input
            np.testing.assert_allclose(a_doubled, 2 * a)
        except Exception:
            self.fail('Still not working')
Example #19
    def __init__(self, n_dict, V, dt, debug=False, LPU_id=None):
        self.num_neurons = len(n_dict['id'])
        self.dt = np.double(dt)
        self.steps = max(int(round(dt / 1e-5)), 1)
        self.debug = debug
        self.LPU_id = LPU_id
        self.ddt = dt / self.steps

        self.V = V
        self.X0 = garray.to_gpu( np.asarray( n_dict['X0'], dtype=np.float64 ))
        self.X1 = garray.to_gpu( np.asarray( n_dict['X1'], dtype=np.float64 ))
        self.X2 = garray.to_gpu( np.asarray( n_dict['X2'], dtype=np.float64 ))

        # Copies an initial V into V
        cuda.memcpy_htod(int(self.V), np.asarray(n_dict['Vinit'], dtype=np.double))
        self.update = self.get_gpu_kernel()
        if self.debug:
            if self.LPU_id is None:
                self.LPU_id = "anon"
            self.I_file = tables.openFile(self.LPU_id + "_I.h5", mode="w")
            self.I_file.createEArray("/","array", \
                                     tables.Float64Atom(), (0,self.num_neurons))
            self.V_file = tables.openFile(self.LPU_id + "_V.h5", mode="w")
            self.V_file.createEArray("/","array", \
                                     tables.Float64Atom(), (0,self.num_neurons))
Example #20
    def __init__(self, pts, axis, split, sigma):
        if split[0] < 2 or split[1] < 2:
            raise ValueError("Split needs to be at least 2x2")

        if not pts.flags['C_CONTIGUOUS']:
            pts = np.require(pts, dtype=pts.dtype, requirements=['C'])
            if not pts.flags['C_CONTIGUOUS']:
                raise Exception("Points are not contiguous")

        self.axis = axis
        self.sigma = sigma
        self.pts = pts
        self.pts_gpu = None

        # Initiates all of cuda stuff
        self.grid = np.zeros(split).astype(pts.dtype)
        self.grid_gpu = cuda.mem_alloc_like(self.grid)
        cuda.memcpy_htod(self.grid_gpu, self.grid)

        kernel = SourceModule(self.__cuda_code)
        self.gpu_gaussian = kernel.get_function("gpu_gaussian")

        self.dx = 1 / float(split[0] - 1)
        self.dy = 1 / float(split[1] - 1)

        self.grid_size, self.block_size = self.__setup_cuda_sizes(split)
Example #21
def GPU():
    im = Image.open(sys.argv[1])
    print sys.argv[1], ": ", im.format, im.size, im.mode, '\n'

    # Split the (assumed RGB) image into separate, contiguous int32 channel
    # arrays so the host data matches the kernel's (int *r, int *g, int *b)
    # signature.
    pixels = np.array(im, dtype=np.int32)
    print pixels
    r = np.ascontiguousarray(pixels[:, :, 0])
    g = np.ascontiguousarray(pixels[:, :, 1])
    b = np.ascontiguousarray(pixels[:, :, 2])

    r_gpu = cuda.mem_alloc(r.nbytes)
    g_gpu = cuda.mem_alloc(g.nbytes)
    b_gpu = cuda.mem_alloc(b.nbytes)
    cuda.memcpy_htod(r_gpu, r)
    cuda.memcpy_htod(g_gpu, g)
    cuda.memcpy_htod(b_gpu, b)

    kernel = SourceModule("""
        #define MAX_PIXEL_VALUE 255
        #define THRESHOLD 50

        __global__ void process_pixel(int *r, int *g, int *b, int n)
        {
            int id = blockDim.x * blockIdx.x + threadIdx.x;
            if (id >= n)
                return;

            if ((r[id] > THRESHOLD) && (g[id] > THRESHOLD) && (b[id] > THRESHOLD)) {
                r[id] = MAX_PIXEL_VALUE;
                g[id] = MAX_PIXEL_VALUE;
                b[id] = MAX_PIXEL_VALUE;
            }
        }
        """)

    func = kernel.get_function("process_pixel")
    n = r.size
    # one thread per pixel, with a bounds check in the kernel
    func(r_gpu, g_gpu, b_gpu, np.int32(n),
         block=(256, 1, 1), grid=((n + 255) // 256, 1))

    new_r = np.empty_like(r)
    new_g = np.empty_like(g)
    new_b = np.empty_like(b)
    cuda.memcpy_dtoh(new_r, r_gpu)
    cuda.memcpy_dtoh(new_g, g_gpu)
    cuda.memcpy_dtoh(new_b, b_gpu)
    newpixels = np.dstack((new_r, new_g, new_b))
Example #22
    def __init__(self, n_dict, V, dt, debug=False, cuda_verbose=False):
        if cuda_verbose:
            self.compile_options = ['--ptxas-options=-v']
        else:
            self.compile_options = []

        self.num_neurons = len(n_dict['id'])
        self.dt = np.double(dt)
        self.steps = max(int(round(dt / 1e-5)),1)
        self.debug = debug

        self.ddt = dt / self.steps

        self.V = V

        self.n = garray.to_gpu(np.asarray(n_dict['initn'], dtype=np.float64))

        self.V_1 = garray.to_gpu(np.asarray(n_dict['V1'], dtype=np.float64))
        self.V_2 = garray.to_gpu(np.asarray(n_dict['V2'], dtype=np.float64))
        self.V_3 = garray.to_gpu(np.asarray(n_dict['V3'], dtype=np.float64))
        self.V_4 = garray.to_gpu(np.asarray(n_dict['V4'], dtype=np.float64))
        self.Tphi = garray.to_gpu(np.asarray(n_dict['phi'], dtype=np.float64))
        self.offset = garray.to_gpu(np.asarray(n_dict['offset'],
                                               dtype=np.float64))

        cuda.memcpy_htod(int(self.V), np.asarray(n_dict['initV'], dtype=np.double))
        self.update = self.get_euler_kernel()
Example #23
    def _to_device(self, module):
        ptr, size = module.get_global(self.name)
        if size != self.data.nbytes:
            raise RuntimeError("Const %s needs %d bytes, but only space for %d"
                               % (self, self.data.nbytes, size))
        if self.state is DeviceDataMixin.HOST:
            driver.memcpy_htod(ptr, self._data)
            self.state = DeviceDataMixin.BOTH
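The opposite direction follows the same pattern (a hedged sketch of a hypothetical `_from_device`; the `DeviceDataMixin.DEVICE` state name is an assumption):

    def _from_device(self, module):
        ptr, size = module.get_global(self.name)
        # pull the constant back to the host only if the device copy is newer
        if self.state is DeviceDataMixin.DEVICE:
            driver.memcpy_dtoh(self._data, ptr)
            self.state = DeviceDataMixin.BOTH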
Example #24
  def evaluate(self, params, returnOutputs=False):
    """Evaluate several networks (with given params) on training set.
    
    @param params: network params
    @type params: list of Parameters
    @param returnOutputs: return network output values (debug)
    @type returnOutputs: bool, default False
    
    @return output matrix if returnOutputs=True, else None
    """
    if self.popSize != len(params):
      raise ValueError("Need %d Parameter structures (provided %d)" % (
        self.popSize, len(params)))
    
    paramArrayType = Parameters * len(params)
    driver.memcpy_htod(self.params, paramArrayType(*params))

    # TODO: remove
    driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4)
    
    self.evaluateKernel.prepared_call(self.evaluateGridDim,
                                      self.trainSetDev,
                                      self.trainSet.size,
                                      self.params,
                                      self.popSize,
                                      self.outputs)

    driver.Context.synchronize()

    self.outputsMat = driver.from_device(self.outputs,
                                         shape=(self.popSize, self.trainSet.size),
                                         dtype=np.float32)
    
    if returnOutputs:
      return self.outputsMat
Example #25
    def __init__(self, n_dict, V, dt, debug=False, cuda_verbose=False):
        if cuda_verbose:
            self.compile_options = ["--ptxas-options=-v"]
        else:
            self.compile_options = []

        self.num_neurons = len(n_dict["id"])
        self.dt = np.double(dt)
        self.steps = max(int(round(dt / 1e-5)), 1)
        self.debug = debug

        self.ddt = dt / self.steps

        self.V = V

        self.n = garray.to_gpu(np.asarray(n_dict["initn"], dtype=np.float64))

        self.V_1 = garray.to_gpu(np.asarray(n_dict["V1"], dtype=np.float64))
        self.V_2 = garray.to_gpu(np.asarray(n_dict["V2"], dtype=np.float64))
        self.V_3 = garray.to_gpu(np.asarray(n_dict["V3"], dtype=np.float64))
        self.V_4 = garray.to_gpu(np.asarray(n_dict["V4"], dtype=np.float64))
        self.V_l = garray.to_gpu(np.asarray(n_dict["V_l"], dtype=np.float64))
        self.V_ca = garray.to_gpu(np.asarray(n_dict["V_ca"], dtype=np.float64))
        self.V_k = garray.to_gpu(np.asarray(n_dict["V_k"], dtype=np.float64))
        self.G_l = garray.to_gpu(np.asarray(n_dict["G_l"], dtype=np.float64))
        self.G_ca = garray.to_gpu(np.asarray(n_dict["G_ca"], dtype=np.float64))
        self.G_k = garray.to_gpu(np.asarray(n_dict["G_k"], dtype=np.float64))
        self.Tphi = garray.to_gpu(np.asarray(n_dict["phi"], dtype=np.float64))
        self.offset = garray.to_gpu(np.asarray(n_dict["offset"], dtype=np.float64))

        cuda.memcpy_htod(int(self.V), np.asarray(n_dict["initV"], dtype=np.double))
        self.update = self.get_euler_kernel()
Example #26
	def calc_bandwidth_h2d( s ):
		t1 = datetime.now()
		cuda.memcpy_htod( s.dev_a, s.a )
		dt = datetime.now() - t1
		dt_float = dt.seconds + dt.microseconds*1e-6

		return s.nbytes/dt_float/gbytes
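The device-to-host direction can be timed the same way (a sketch assuming the same `s.a`, `s.dev_a`, `s.nbytes` and `gbytes` names):

	def calc_bandwidth_d2h( s ):
		t1 = datetime.now()
		cuda.memcpy_dtoh( s.a, s.dev_a )
		dt = datetime.now() - t1
		dt_float = dt.seconds + dt.microseconds*1e-6

		return s.nbytes/dt_float/gbytes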
Example #27
  def __compile_kernels(self):
    """ DFS module """
    f = self.forest
    self.find_min_kernel = f.find_min_kernel  
    self.fill_kernel = f.fill_kernel 
    self.scan_reshuffle_tex = f.scan_reshuffle_tex 
    self.comput_total_2d = f.comput_total_2d 
    self.reduce_2d = f.reduce_2d
    self.scan_total_2d = f.scan_total_2d 
    self.scan_reduce = f.scan_reduce 
    
    """ BFS module """
    self.scan_total_bfs = f.scan_total_bfs
    self.comput_bfs_2d = f.comput_bfs_2d
    self.fill_bfs = f.fill_bfs 
    self.reshuffle_bfs = f.reshuffle_bfs 
    self.reduce_bfs_2d = f.reduce_bfs_2d 
    self.get_thresholds = f.get_thresholds 

    """ Other """
    self.predict_kernel = f.predict_kernel 
    self.mark_table = f.mark_table
    const_sorted_indices = f.bfs_module.get_global("sorted_indices_1")[0]
    const_sorted_indices_ = f.bfs_module.get_global("sorted_indices_2")[0]
    cuda.memcpy_htod(const_sorted_indices, np.uint64(self.sorted_indices_gpu.ptr)) 
    cuda.memcpy_htod(const_sorted_indices_, np.uint64(self.sorted_indices_gpu_.ptr)) 
Example #28
    def __init__(self, n_dict, V, dt, debug=False):

        self.num_neurons = len(n_dict['id'])
        self.dt = np.double(dt)
        self.steps = max(int(round(dt / 1e-5)), 1)
        self.debug = debug

        self.ddt = dt / self.steps

        self.V = V

        self.n = garray.to_gpu(np.asarray(n_dict['initn'], dtype=np.float64))

        self.V_1 = garray.to_gpu(np.asarray(n_dict['V1'], dtype=np.float64))
        self.V_2 = garray.to_gpu(np.asarray(n_dict['V2'], dtype=np.float64))
        self.V_3 = garray.to_gpu(np.asarray(n_dict['V3'], dtype=np.float64))
        self.V_4 = garray.to_gpu(np.asarray(n_dict['V4'], dtype=np.float64))
        self.V_l = garray.to_gpu(np.asarray(n_dict['V_l'], dtype = np.float64))
        self.V_ca = garray.to_gpu(np.asarray(n_dict['V_ca'], dtype = np.float64))
        self.V_k = garray.to_gpu(np.asarray(n_dict['V_k'], dtype = np.float64))
        self.G_l = garray.to_gpu(np.asarray(n_dict['G_l'], dtype = np.float64))
        self.G_ca = garray.to_gpu(np.asarray(n_dict['G_ca'], dtype = np.float64))
        self.G_k = garray.to_gpu(np.asarray(n_dict['G_k'], dtype = np.float64))
        self.Tphi = garray.to_gpu(np.asarray(n_dict['phi'], dtype=np.float64))
        self.offset = garray.to_gpu(np.asarray(n_dict['offset'],
                                               dtype=np.float64))

        cuda.memcpy_htod(int(self.V), np.asarray(n_dict['initV'], 
                         dtype=np.double))
        self.update = self.get_euler_kernel()
Example #29
    def __compute_guassian_on_pts(self):
        view = self.view_tile.get_View()

        for dset in self.data_sets:
            _data = np.array(dset.getDataSet(), copy=True)
            _data[:, 0] = (_data[:, 0] - view.left)/view.width()
            _data[:, 1] = (_data[:, 1] - view.bottom)/view.height()

            for row in range(self.grid_size[0]):
                for col in range(self.grid_size[1]):
                    # 3 * SIGMA give the 95%
                    left = 1 / float(self.grid_size[1]) * col - (3 * self.sigma)
                    right = 1 / float(self.grid_size[1]) * (col + 1) + (3 * self.sigma)
                    bottom = 1 / float(self.grid_size[0]) * row - (3 * self.sigma)
                    top = 1 / float(self.grid_size[0]) * (row + 1) + (3 * self.sigma)
                    pts = getFilteredDataSet(_data, (left, right, bottom, top))

                    if len(pts) > 0:
                        self.pts_gpu = cuda.mem_alloc_like(pts)
                        cuda.memcpy_htod(self.pts_gpu, pts)

                        self.gpu_gaussian(self.grid_gpu,  # Grid
                                          self.pts_gpu,  # Points
                                          np.int32(col),  # Block Index x
                                          np.int32(row),  # Block Index y
                                          np.int32(self.grid_size[1]),  # Grid Dimensions x
                                          np.int32(self.grid_size[0]),  # Grid Dimensions y
                                          np.int32(pts.shape[0]),  # Point Length
                                          np.float32(self.dx),  # dx
                                          np.float32(self.dy),  # dy
                                          np.float32(self.sigma),  # Sigma
                                          block=self.block_size)

                        self.pts_gpu.free()
Example #30
    def set(self, ary, device=None):
        """
        copy host array to device.
        Arguments:
            ary: host array, needs to be contiguous
            device: device id, if not the one attached to current context
        Returns:
            self
        """
        assert ary.size == self.size
        assert self.is_contiguous, "Array in set() must be contiguous"
        if ary.dtype is not self.dtype:
            ary = ary.astype(self.dtype)
        assert ary.strides == self.strides

        if device is None:
            drv.memcpy_htod(self.gpudata, ary)
        else:
            # with multithreaded datasets, make a context before copying
            # and destroy it again once done.
            ctx = drv.Device(device).make_context()
            drv.memcpy_htod(self.gpudata, ary)
            ctx.pop()
            del ctx

        return self
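A matching read-back sketch (a hypothetical `get`, not part of the original class; it assumes the same `gpudata`, `shape` and `dtype` attributes and that numpy is imported as `np`):

    def get(self):
        # copy the device buffer back into a new contiguous host array
        ary = np.empty(self.shape, dtype=self.dtype)
        drv.memcpy_dtoh(ary, self.gpudata)
        return ary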
Example #31
h_b = np.random.randn(1, N)

"""Cast ```h_a``` and ```h_b``` to single precision (```float32```)."""

h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)

"""Allocate ```h_a.nbytes```, ```h_b.nbytes``` and ```h_c.nbytes``` of GPU device memory space pointed to by ```d_a```, ```d_b``` and ```d_c```, respectively."""

d_a = cuda.mem_alloc(h_a.nbytes)
d_b = cuda.mem_alloc(h_b.nbytes)
d_c = cuda.mem_alloc(h_a.nbytes)

"""Copy the ```h_a``` and ```h_b``` arrays from host to the device arrays ```d_a``` and ```d_b```, respectively."""

cuda.memcpy_htod(d_a, h_a)
cuda.memcpy_htod(d_b, h_b)

"""Define the CUDA kernel function ```deviceAdd``` as a string. ```deviceAdd``` performs the elementwise summation of ```d_a``` and ```d_b``` and puts the result in ```d_c```."""

mod = SourceModule("""
  #include <stdio.h>
  __global__ void deviceAdd(float * __restrict__ d_c, const float * __restrict__ d_a, const float * __restrict__ d_b, const int N)
  {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= N) return;
    d_c[tid] = d_a[tid] + d_b[tid];
  } 
  """)

"""Define a reference to the ```__global__``` function ```deviceAdd```."""
Example #32
# matrix B
b = np.random.randn(n, n) * 10
b = b.astype(np.float32)

# matrix C (result)
c = np.empty([n, n])
c = c.astype(np.float32)

# allocate memory on device
a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

# copy matrix to memory
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

# compile kernel
mod = SourceModule("""
__global__ void matmul2(int n, float *a, float *b, float *c)
{
  int row_num_a,col_num_a,row_num_b,col_num_b,row_num_c,col_num_c;
  row_num_a=col_num_a=row_num_b=col_num_b=row_num_c=col_num_c = n;
  __shared__ float A[32][32];//2D array for storing shared matrix values of A & B
  __shared__ float B[32][32];//Process subMatrix in block

  
  int Col=blockIdx.x*blockDim.x+threadIdx.x;//Col and Row Ids of threads
  int Row=blockIdx.y*blockDim.y+threadIdx.y;
  double temp = 0;
Example #33
    def sampleC(self):
        """
        Sample the process affiliations for each event. These must be done sequentially 
        since they are made dependent through the parent relationships Z.
        """
        N = self.base.data.N
        K = self.modelParams["proc_id_model", "K"]
        D = self.base.data.D
        gridx = int(np.ceil(np.float32(N) / self.params["blockSz"]))

        for n in np.arange(self.base.data.N):
            # Sample c_n conditioned upon all other C's
            self.gpuKernels["computePerSpikePrCn"](
                np.int32(n),
                np.int32(N),
                np.int32(K),
                np.int32(D),
                self.base.data.gpu.X.gpudata,
                self.gpuPtrs["proc_id_model", "Xmean"].gpudata,
                self.gpuPtrs["proc_id_model", "Xprec"].gpudata,
                self.gpuPtrs["parent_model", "Z"].gpudata,
                self.gpuPtrs["proc_id_model", "C"].gpudata,
                self.gpuPtrs["graph_model", "A"].gpudata,
                self.gpuPtrs["weight_model", "W"].gpudata,
                self.gpuPtrs["bkgd_model", "lam"].gpudata,
                self.gpuPtrs["proc_id_model", "Xstats"].gpudata,
                block=(self.params["blockSz"], 1, 1),
                grid=(gridx, K))

            # Sum the log prob for each process and sample a new cn
            prcn = np.zeros((K, ), dtype=np.float32)
            blockLogPrSum = self.gpuPtrs["proc_id_model", "Xstats"].get()
            prcn[:] = np.sum(blockLogPrSum, 1)

            try:
                cn = log_sum_exp_sample(prcn)

            except Exception as ex:
                log.info("Exception on spike %d!", n)
                log.info(ex)

                log.info("K=%d", K)

                log.info("X[n]:")
                log.info(self.base.data.gpu.X.get()[:, n])

                WGS = self.gpuPtrs["graph_model", "WGS"].get()
                log.info("WGS[:,%d]:", n)
                log.info(WGS[:, n])

                C = self.gpuPtrs["proc_id_model", "C"].get()
                Z = self.gpuPtrs["parent_model", "Z"].get()
                A = self.gpuPtrs["graph_model", "A"].get()
                W = self.gpuPtrs["weight_model", "W"].get()

                log.info("W")
                log.info(W)

                if Z[n] > -1:
                    log.info("Spike %d (c=%d) parented by spike %d (c=%d)", n,
                             C[n], Z[n], C[Z[n]])

                    #                    log.info ("A")
                    #                    log.info(self.gpuPtrs["graph_model","A"].get())
                    log.info("Edge exists from parent? ")
                    log.info(A[C[Z[n]], C[n]])
                    #                    log.info ("W")
                    #                    log.info(self.gpuPtrs["weight_model","A"].get())
                    log.info("Weight from parent:")
                    log.info(W[C[Z[n]], C[n]])

                log.info("Spikes parented by n")
                log.info(np.count_nonzero(Z == n))
                for ch in np.nonzero(Z == n)[0]:
                    log.info("Spike %d (c=%d) parented spike %d (c=%d)", n,
                             C[n], ch, C[ch])
                    log.info("Edge exists to child? ")
                    log.info(A[C[n], C[ch]])

                    if not A[C[n], C[ch]]:
                        log.info("WGS[:,%d]:", ch)
                        log.info(WGS[:, ch])

                log.info("lam:")
                log.info(self.gpuPtrs["bkgd_model", "lam"].get()[:, n])
                log.info("gaussians")
                log.info(self.gpuPtrs["proc_id_model", "Xmean"].get()[C[n], :])
                log.info(self.gpuPtrs["proc_id_model",
                                      "Xprec"].get()[C[n], :, :])

                log.info("blockLogPrSum")
                log.info(blockLogPrSum)
                log.info("prcn")
                log.info(prcn)

                exit()

            # Copy the new cn to the GPU
            cn_buff = np.array([cn], dtype=np.int32)
            cuda.memcpy_htod(
                self.gpuPtrs["proc_id_model", "C"].ptr +
                int(n * cn_buff.itemsize), cn_buff)

        # Update Ns and C
        C = self.gpuPtrs["proc_id_model", "C"].get()
        self.modelParams["proc_id_model", "C"] = C

        Ns = np.zeros((K, ), dtype=np.int32)
        for n in np.arange(N):
            Ns[C[n]] += 1

        self.modelParams["proc_id_model", "Ns"] = Ns
        self.gpuPtrs["proc_id_model",
                     "Ns"].set(self.modelParams["proc_id_model", "Ns"])
Example #34
def solve_gpu(currentmodelrun, modelend, G):
    """Solving using FDTD method on GPU. Implemented using Nvidia CUDA.

    Args:
        currentmodelrun (int): Current model run number.
        modelend (int): Number of last model to run.
        G (class): Grid class instance - holds essential parameters describing the model.

    Returns:
        tsolve (float): Time taken to execute solving.
        memsolve (int): GPU memory (bytes) in use on the final iteration.
    """

    import pycuda.driver as drv
    from pycuda.compiler import SourceModule
    drv.init()

    # Create device handle and context on specific GPU device (and make it current context)
    dev = drv.Device(G.gpu.deviceID)
    ctx = dev.make_context()

    # Electric and magnetic field updates - prepare kernels, and get kernel functions
    if Material.maxpoles > 0:
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3]))
    else:   # Set to one any substitutions for dispersive materials
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1))
    update_e_gpu = kernels_fields.get_function("update_e")
    update_h_gpu = kernels_fields.get_function("update_h")

    # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels
    updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0]
    updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0]
    if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem:
        raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name))
    else:
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)

    # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU
    if Material.maxpoles > 0:  # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values).
        update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A")
        update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B")
        G.gpu_initialise_dispersive_arrays()

    # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU
    G.gpu_set_blocks_per_grid()
    G.gpu_initialise_arrays()

    # PML updates
    if G.pmls:
        # Prepare kernels
        kernels_pml = SourceModule(kernels_template_pml.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_R=G.pmls[0].ERA.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]))
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels
        updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        # Set block per grid, initialise arrays on GPU, and get kernel functions
        for pml in G.pmls:
            pml.gpu_set_blocks_per_grid(G)
            pml.gpu_initialise_arrays()
            pml.gpu_get_update_funcs(kernels_pml)

    # Receivers
    if G.rxs:
        # Initialise arrays on GPU
        rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G)
        # Prepare kernel and get kernel function
        kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]))
        store_outputs_gpu = kernel_store_outputs.get_function("store_outputs")

    # Sources - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.voltagesources + G.hertziandipoles + G.magneticdipoles:
        kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]))
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels
        updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        if G.hertziandipoles:
            srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G)
            update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole")
        if G.magneticdipoles:
            srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G)
            update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole")
        if G.voltagesources:
            srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G)
            update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source")

    # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.snapshots:
        # Initialise arrays on GPU
        snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G)
        # Prepare kernel and get kernel function
        kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]))
        store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot")

    # Iteration loop timer
    iterstart = drv.Event()
    iterend = drv.Event()
    iterstart.record()

    for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=G.tqdmdisable):

        # Get GPU memory usage on final iteration
        if iteration == G.iterations - 1:
            memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0]

        # Store field component values for every receiver
        if G.rxs:
            store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration),
                              rxcoords_gpu.gpudata, rxs_gpu.gpudata,
                              G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                              G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                              block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1))

        # Store any snapshots
        for i, snap in enumerate(G.snapshots):
            if snap.time == iteration + 1:
                store_snapshot_gpu(np.int32(i), np.int32(snap.xs),
                                   np.int32(snap.xf), np.int32(snap.ys),
                                   np.int32(snap.yf), np.int32(snap.zs),
                                   np.int32(snap.zf), np.int32(snap.dx),
                                   np.int32(snap.dy), np.int32(snap.dz),
                                   G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                   G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                   snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata,
                                   snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata,
                                   block=Snapshot.tpb, grid=Snapshot.bpg)
                if G.snapsgpu2cpu:
                    gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                           snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

        # Update magnetic field components
        update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                     G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata,
                     G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata,
                     G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg)

        # Update magnetic field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_magnetic(G)

        # Update magnetic field components for magnetic dipole sources
        if G.magneticdipoles:
            update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata,
                                       srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1))

        # Update electric field components
        # If all materials are non-dispersive do standard update
        if Material.maxpoles == 0:
            update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata,
                         G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                         G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                         block=G.tpb, grid=G.bpg)
        # If there are any dispersive materials do 1st part of dispersive update
        # (it is split into two parts as it requires present and updated electric field values).
        else:
            update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

        # Update electric field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_electric(G)

        # Update electric field components for voltage sources
        if G.voltagesources:
            update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration),
                                      floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                      srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata,
                                      srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1))

        # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last)
        if G.hertziandipoles:
            update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata,
                                       srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1))

        # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates.
        if Material.maxpoles > 0:
            update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

    # Copy output from receivers array back to correct receiver objects
    if G.rxs:
        gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G)

    # Copy data from any snapshots back to correct snapshot objects
    if G.snapshots and not G.snapsgpu2cpu:
        for i, snap in enumerate(G.snapshots):
            gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                   snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

    iterend.record()
    iterend.synchronize()
    tsolve = iterstart.time_till(iterend) * 1e-3

    # Remove context from top of stack and delete
    ctx.pop()
    del ctx

    return tsolve, memsolve
Example #35
    def _run_simulation(self,
                        parameters,
                        init_values,
                        blocks,
                        threads,
                        in_atol=1e-6,
                        in_rtol=1e-6):

        total_threads = threads * blocks
        experiments = len(parameters)

        neqn = self._speciesNumber

        # compile
        init_common_kernel = self._completeCode.get_function("init_common")
        init_common_kernel(block=(threads, 1, 1), grid=(blocks, 1))

        # output array
        ret_xt = np.zeros(
            [total_threads, 1, self._resultNumber, self._speciesNumber])
        ret_istate = np.ones([total_threads], dtype=np.int32)

        # calculate sizes of work spaces
        isize = 20 + self._speciesNumber
        rsize = 22 + self._speciesNumber * max(16, self._speciesNumber + 9)

        # local variables
        t = np.zeros([total_threads], dtype=np.float64)
        jt = np.zeros([total_threads], dtype=np.int32)
        neq = np.zeros([total_threads], dtype=np.int32)
        itol = np.zeros([total_threads], dtype=np.int32)
        iopt = np.zeros([total_threads], dtype=np.int32)
        rtol = np.zeros([total_threads], dtype=np.float64)
        iout = np.zeros([total_threads], dtype=np.int32)
        tout = np.zeros([total_threads], dtype=np.float64)
        itask = np.zeros([total_threads], dtype=np.int32)
        istate = np.zeros([total_threads], dtype=np.int32)
        atol = np.zeros([total_threads], dtype=np.float64)

        liw = np.zeros([total_threads], dtype=np.int32)
        lrw = np.zeros([total_threads], dtype=np.int32)
        iwork = np.zeros([isize * total_threads], dtype=np.int32)
        rwork = np.zeros([rsize * total_threads], dtype=np.float64)
        y = np.zeros([self._speciesNumber * total_threads], dtype=np.float64)

        for i in range(total_threads):
            neq[i] = neqn
            t[i] = 0
            itol[i] = 1
            itask[i] = 1
            istate[i] = 1
            iopt[i] = 0
            jt[i] = 2
            atol[i] = in_atol
            rtol[i] = in_rtol

            liw[i] = isize
            lrw[i] = rsize

            try:
                # initial conditions
                for j in range(self._speciesNumber):
                    # loop over species
                    y[i * self._speciesNumber + j] = init_values[i][j]
                    ret_xt[i, 0, 0, j] = init_values[i][j]
            except IndexError:
                pass

        # allocate on device
        d_t = driver.mem_alloc(t.size * t.dtype.itemsize)
        d_jt = driver.mem_alloc(jt.size * jt.dtype.itemsize)
        d_neq = driver.mem_alloc(neq.size * neq.dtype.itemsize)
        d_liw = driver.mem_alloc(liw.size * liw.dtype.itemsize)
        d_lrw = driver.mem_alloc(lrw.size * lrw.dtype.itemsize)
        d_itol = driver.mem_alloc(itol.size * itol.dtype.itemsize)
        d_iopt = driver.mem_alloc(iopt.size * iopt.dtype.itemsize)
        d_rtol = driver.mem_alloc(rtol.size * rtol.dtype.itemsize)
        d_iout = driver.mem_alloc(iout.size * iout.dtype.itemsize)
        d_tout = driver.mem_alloc(tout.size * tout.dtype.itemsize)
        d_itask = driver.mem_alloc(itask.size * itask.dtype.itemsize)
        d_istate = driver.mem_alloc(istate.size * istate.dtype.itemsize)
        d_y = driver.mem_alloc(y.size * y.dtype.itemsize)
        d_atol = driver.mem_alloc(atol.size * atol.dtype.itemsize)
        d_iwork = driver.mem_alloc(iwork.size * iwork.dtype.itemsize)
        d_rwork = driver.mem_alloc(rwork.size * rwork.dtype.itemsize)

        # copy to device
        driver.memcpy_htod(d_t, t)
        driver.memcpy_htod(d_jt, jt)
        driver.memcpy_htod(d_neq, neq)
        driver.memcpy_htod(d_liw, liw)
        driver.memcpy_htod(d_lrw, lrw)
        driver.memcpy_htod(d_itol, itol)
        driver.memcpy_htod(d_iopt, iopt)
        driver.memcpy_htod(d_rtol, rtol)
        driver.memcpy_htod(d_iout, iout)
        driver.memcpy_htod(d_tout, tout)
        driver.memcpy_htod(d_itask, itask)
        driver.memcpy_htod(d_istate, istate)
        driver.memcpy_htod(d_y, y)
        driver.memcpy_htod(d_atol, atol)
        driver.memcpy_htod(d_iwork, iwork)
        driver.memcpy_htod(d_rwork, rwork)

        param = np.zeros((total_threads, self._parameterNumber),
                         dtype=np.float32)
        try:
            for i in range(len(parameters)):
                for j in range(self._parameterNumber):
                    param[i][j] = parameters[i][j]
        except IndexError:
            pass

        # parameter texture
        ary = sim.create_2D_array(param)
        sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4,
                                 total_threads)
        self._param_tex.set_array(ary)

        if self._dt <= 0:
            for i in range(self._resultNumber):

                for j in range(total_threads):
                    tout[j] = self._timepoints[i]
                driver.memcpy_htod(d_tout, tout)

                self._compiledRunMethod(d_neq,
                                        d_y,
                                        d_t,
                                        d_tout,
                                        d_itol,
                                        d_rtol,
                                        d_atol,
                                        d_itask,
                                        d_istate,
                                        d_iopt,
                                        d_rwork,
                                        d_lrw,
                                        d_iwork,
                                        d_liw,
                                        d_jt,
                                        block=(threads, 1, 1),
                                        grid=(blocks, 1))

                driver.memcpy_dtoh(t, d_t)
                driver.memcpy_dtoh(y, d_y)
                driver.memcpy_dtoh(istate, d_istate)

                for j in range(total_threads):
                    for k in range(self._speciesNumber):
                        ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k]

                    if istate[j] < 0:
                        ret_istate[j] = 0

                        # end of loop over time points

        else:
            tt = self._timepoints[0]

            for i in range(self._resultNumber):
                while 1:

                    next_time = min(tt + self._dt, self._timepoints[i])

                    for j in range(total_threads):
                        tout[j] = next_time
                    driver.memcpy_htod(d_tout, tout)

                    self._compiledRunMethod(d_neq,
                                            d_y,
                                            d_t,
                                            d_tout,
                                            d_itol,
                                            d_rtol,
                                            d_atol,
                                            d_itask,
                                            d_istate,
                                            d_iopt,
                                            d_rwork,
                                            d_lrw,
                                            d_iwork,
                                            d_liw,
                                            d_jt,
                                            block=(threads, 1, 1),
                                            grid=(blocks, 1))

                    driver.memcpy_dtoh(t, d_t)
                    driver.memcpy_dtoh(y, d_y)
                    driver.memcpy_dtoh(istate, d_istate)

                    if np.abs(next_time - self._timepoints[i]) < 1e-5:
                        tt = next_time
                        break

                    tt = next_time

                for j in range(total_threads):
                    for k in range(self._speciesNumber):
                        ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k]

                    if istate[j] < 0:
                        ret_istate[j] = 0

        # loop over and check ret_istate;
        # it will be zero if there were problems
        for j in range(total_threads):
            if ret_istate[j] == 0:
                for i in range(self._resultNumber):
                    for k in range(self._speciesNumber):
                        ret_xt[j, 0, i, k] = float('NaN')

        return ret_xt[0:experiments]
Example #36
0
 print "Loading matrices to device..."
 
 # Matrices dimensions
 N = 8192
 A_rows = N
 A_cols = N
 B_cols = N
 # A_rows = 784
 # A_cols = 1024
 # B_cols = 4096
 
 # A is a matrix randomly filled with 1 and -1
 A = sign(np.random.randn(A_rows,A_cols))
 A = A.astype(np.float32)
 A_gpu = cuda.mem_alloc(A.nbytes)
 cuda.memcpy_htod(A_gpu, A)
 
 # B is a matrix randomly filled with 1 and -1
 B = sign(np.random.randn(A_cols,B_cols))
 B = B.astype(np.float32)
 B_gpu = cuda.mem_alloc(B.nbytes)
 cuda.memcpy_htod(B_gpu, B)
 
 # C is the resulting matrix
 C1 = np.zeros((A_rows,B_cols)).astype(np.float32)
 C2 = np.zeros((A_rows,B_cols)).astype(np.float32)
 C_gpu = cuda.mem_alloc(C1.nbytes)
 
 print "XNOR kernel..."
 
 # wait until the GPU is done with the work
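 # Hedged sketch (the original example is truncated here): pycuda events
 # both time the kernel and block until the GPU is done with the work.
 # Only the launch itself is elided, since the kernel handle is not shown
 # above; every call below is standard pycuda.driver API.
 start_ev = cuda.Event()
 end_ev = cuda.Event()
 start_ev.record()
 # ... launch the XNOR GEMM kernel on A_gpu, B_gpu, C_gpu here ...
 end_ev.record()
 end_ev.synchronize()
 print "Elapsed: %f ms" % start_ev.time_till(end_ev)
 cuda.memcpy_dtoh(C1, C_gpu)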
Example #37
0
feq_g6 = cuda.mem_alloc(rho.size * rho.dtype.itemsize)
feq_g7 = cuda.mem_alloc(rho.size * rho.dtype.itemsize)
feq_g8 = cuda.mem_alloc(rho.size * rho.dtype.itemsize)

rho_g = cuda.mem_alloc(rho.size * rho.dtype.itemsize)
taus_g = cuda.mem_alloc(tauS.size * tauS.dtype.itemsize)
u_g = cuda.mem_alloc(u.nbytes)
c_g = cuda.mem_alloc(c.nbytes)
t_g = cuda.mem_alloc(t.nbytes)

#fpost_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize)
#Pinning memory for faster transfers between cpu and gpu
#fin_pin = cuda.register_host_memory(fin)
#this didn't seem to make a difference, so using fin only for transfers

cuda.memcpy_htod(fin_g0, fin[0])
cuda.memcpy_htod(fin_g1, fin[1])
cuda.memcpy_htod(fin_g2, fin[2])
cuda.memcpy_htod(fin_g3, fin[3])
cuda.memcpy_htod(fin_g4, fin[4])
cuda.memcpy_htod(fin_g5, fin[5])
cuda.memcpy_htod(fin_g6, fin[6])
cuda.memcpy_htod(fin_g7, fin[7])
cuda.memcpy_htod(fin_g8, fin[8])

cuda.memcpy_htod(feq_g0, fin[0])
cuda.memcpy_htod(feq_g1, fin[1])
cuda.memcpy_htod(feq_g2, fin[2])
cuda.memcpy_htod(feq_g3, fin[3])
cuda.memcpy_htod(feq_g4, fin[4])
cuda.memcpy_htod(feq_g5, fin[5])
Example #38
0
def expand_cuda(sourceImage):
    sourceImage = np.float32(sourceImage)
    # Perform separable convolution on sourceImage using CUDA.
    destImage = sourceImage.copy()
    destImage = np.float32(destImage)
    fil = np.zeros((3, 3))
    (imageHeight, imageWidth) = destImage.shape
    # print(imageWidth,imageHeight)
    fil = np.float32(fil)
    DATA_H = imageHeight
    DATA_W = imageWidth
    DATA_H = np.int32(DATA_H)
    DATA_W = np.int32(DATA_W)
    # Prepare device arrays

    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    fil_gpu = cuda.mem_alloc_like(fil)
    destImage_gpu = cuda.mem_alloc_like(sourceImage)

    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(fil_gpu, fil)
    minGPU(destImage_gpu,
           sourceImage_gpu,
           fil_gpu,
           DATA_W,
           DATA_H,
           block=(imageHeight, 1, 1),
           grid=(1, imageWidth))
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
Example #39
0
def initB(CR, npoints, ncoeffsLO, ncoeffsHI, R_SS):

	pycuda.tools.clear_context_caches()

	# Read in g and h coefficients as 2D arrays
	g2D = np.zeros([ncoeffsLO+1, ncoeffsLO+1])
	h2D = np.zeros([ncoeffsLO+1, ncoeffsLO+1])
	#fname = 'CR' + str(CR) + '_MDI' + str(ncoeffs) + '.dat'
	fname = '/home/cdkay/MagnetoPickles/SCScoeffs'+ str(CR) + '.pkl'
	f1 = open(fname, 'rb')
	GH = pickle.load(f1)
	f1.close()
	gLO2D = GH[0]
	hLO2D = GH[1]
	gHI2D = GH[2]
	hHI2D = GH[3]
		
	# Convert to 1D arrays
	global gLO, hLO, gHI, hHI 
	nelemsLO = np.sum(range(ncoeffsLO+2))
	gLO = np.zeros(nelemsLO)
	hLO = np.zeros(nelemsLO)
	nelemsHI = np.sum(range(ncoeffsHI+2))
	gHI = np.zeros(nelemsHI)
	hHI = np.zeros(nelemsHI)

	# No more normalizing here
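	# myID packs the lower-triangular (l, m) coefficients column by column:
	# column m starts at offset m*(ncoeffs+1) - (0+1+...+(m-1)) and entry l
	# sits at position l - m within that column (descriptive comment added)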
	for l in range(ncoeffsLO+1):
		for m in range(ncoeffsLO+1):
			myID = m * (ncoeffsLO+1) - int(np.sum(range(m))) + l - m
			gLO[myID] = gLO2D[l,m]

	for l in range(ncoeffsLO+1):
		for m in range(ncoeffsLO+1):
			myID = m * (ncoeffsLO+1) - int(np.sum(range(m))) + l - m
			if m==0: hLO[myID] = 0
			else:
				hLO[myID] = hLO2D[l,m]
	for l in range(ncoeffsHI+1):
		for m in range(ncoeffsHI+1):
			myID = m * (ncoeffsHI+1) - int(np.sum(range(m))) + l - m
			gHI[myID] = gHI2D[l,m]

	for l in range(ncoeffsHI+1):
		for m in range(ncoeffsHI+1):
			myID = m * (ncoeffsHI+1) - int(np.sum(range(m))) + l - m
			if m==0: hHI[myID] = 0
			else:
				hHI[myID] = hHI2D[l,m]


	# Allocate g/h space on GPU and transfer LO (assumes HI needs no more memory than LO)
	gLO = gLO.astype(np.float32)
	hLO = hLO.astype(np.float32)
	gHI = gHI.astype(np.float32)
	hHI = hHI.astype(np.float32)
	global g_gpu, h_gpu
	g_gpu = cuda.mem_alloc(gLO.nbytes)
	cuda.memcpy_htod(g_gpu, gLO)
	h_gpu = cuda.mem_alloc(hLO.nbytes)
	cuda.memcpy_htod(h_gpu, hLO)

	# Set up memory for r, colat, lon
	global theta, theta_gpu, r, r_gpu, phi, phi_gpu
	theta = np.zeros(npoints)
	theta = theta.astype(np.float32)
	theta_gpu = cuda.mem_alloc(theta.nbytes)
	r = np.zeros(npoints)
	r = r.astype(np.float32)
	r_gpu = cuda.mem_alloc(r.nbytes)
	phi = np.zeros(npoints)
	phi = phi.astype(np.float32)
	phi_gpu = cuda.mem_alloc(phi.nbytes)
	
	# Set up memory for atomic sum and output
	a = np.zeros(ncoeffsLO+1)
	a = a.astype(np.float32)
	global ssize
	ssize = npoints * a.nbytes 

	# Set up outs to contain Br, Btheta, Bphi and B for each point
	global outs, outs_gpu
	outs = np.zeros([4*npoints])
	outs = outs.astype(np.float32)
	outs_gpu = cuda.mem_alloc(outs.nbytes)
	cuda.memcpy_htod(outs_gpu, outs)

	# Set up justB to contain only B (less copying if not needed)
	global justB, justB_gpu
	justB = np.zeros([npoints])
	justB = justB.astype(np.float32)
	justB_gpu = cuda.mem_alloc(justB.nbytes)
	cuda.memcpy_htod(justB_gpu, justB)

	global RSS_gpu # ratio of rstar to rsun
	RSSar = np.array(R_SS, dtype=np.float32)
	RSS_gpu = cuda.mem_alloc(RSSar.size * RSSar.dtype.itemsize)
	cuda.memcpy_htod(RSS_gpu, RSSar)

	# Get GPU kernel
	global func
	func = mod.get_function("PFSS_kern")	

	# Calculate magnetic field at 2.5 Rsun
	#B105 = np.zeros([180,360])

	#Set up r and a
	global aSCS_gpu
	r = np.array([2.5]*npoints)
	r = r.astype(np.float32)
	r_gpu = cuda.mem_alloc(r.nbytes)
	cuda.memcpy_htod(r_gpu, r)
	aSCS = np.array([0.2]*npoints)
	aSCS = aSCS.astype(np.float32)
	aSCS_gpu = cuda.mem_alloc(aSCS.nbytes)
	cuda.memcpy_htod(aSCS_gpu, aSCS)
Example #40
0
    print("Beta function Random Sampler using curand_kernel.h")
    print("g_a ~ Gamma[a], beta is generated by g_a/(g_a + g_b)")
    print("********************************************")

    alpha = 1.2
    beta = 1.3

    nw = 1
    nt = 100000
    nq = 1
    nb = nw * nt * nq
    sharedsize = 0  #byte
    x = np.zeros(nb)
    x = x.astype(np.float32)
    dev_x = cuda.mem_alloc(x.nbytes)
    cuda.memcpy_htod(dev_x, x)

    source_module = gabcrm_module()
    pkernel = source_module.get_function("betagen")
    pkernel(dev_x,
            np.float32(alpha),
            np.float32(beta),
            block=(int(nw), 1, 1),
            grid=(int(nt), int(nq)),
            shared=sharedsize)
    cuda.memcpy_dtoh(x, dev_x)

    plt.hist(x, bins=30, density=True)
    xl = np.linspace(betafunc.ppf(0.001, alpha, beta),
                     betafunc.ppf(0.999, alpha, beta), 100)
    plt.plot(xl, betafunc.pdf(xl, alpha, beta))
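    # Hedged sanity check (not in the original): a Beta(alpha, beta) sample
    # mean should approach alpha/(alpha+beta) = 1.2/2.5 = 0.48 for large nt.
    print("sample mean:", np.mean(x), "analytic:", alpha / (alpha + beta))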
Example #41
0
    #    Q * [r1,r2]

    print("********************************************")
    print("Gaussian Random Sampler using curand_kernel.h")
    print("********************************************")

    nw = 1
    nt = 100000
    nq = 1
    nb = nw * nt * nq
    sharedsize = 0  #byte

    x1 = np.zeros(nb)
    x1 = x1.astype(np.float32)
    dev_x1 = cuda.mem_alloc(x1.nbytes)
    cuda.memcpy_htod(dev_x1, x1)

    x2 = np.zeros(nb)
    x2 = x2.astype(np.float32)
    dev_x2 = cuda.mem_alloc(x2.nbytes)
    cuda.memcpy_htod(dev_x2, x2)

    source_module = norm2d_module()
    pkernel = source_module.get_function("norm2d")
    pkernel(dev_x1,
            dev_x2,
            np.float32(a),
            np.float32(b),
            np.float32(c),
            block=(int(nw), 1, 1),
            grid=(int(nt), int(nq)),
            shared=sharedsize)
Example #42
0
def calcB(R_in, Lat_in, Lon_in, dlon, npoints, LOorHI, ncoeffs, aSCS_in):
	if LOorHI == 0:
		cuda.memcpy_htod(g_gpu, gLO)
		cuda.memcpy_htod(h_gpu, hLO)
	if LOorHI == 1:
		cuda.memcpy_htod(g_gpu, gHI)
		cuda.memcpy_htod(h_gpu, hHI)

	#ncoeffs = 90
	#Transfer the thetas and phis to GPU
	r = np.array([R_in] * npoints, dtype=np.float32)
	theta = np.array([(90. - Lat_in) ] * npoints, dtype=np.float32) * dtor
	Lons = [Lon_in + dlon * i for i in range(npoints)]
	#print Lons
	phi = np.array(Lons, dtype=np.float32) * dtor
	cuda.memcpy_htod(theta_gpu, theta)
	cuda.memcpy_htod(phi_gpu, phi)
	cuda.memcpy_htod(r_gpu, r)
	cuda.memcpy_htod(aSCS_gpu, np.array([aSCS_in], dtype=np.float32))
	
	a = np.zeros(ncoeffs+1)
	a = a.astype(np.float32)
	ssize = a.nbytes * npoints

	func = mod.get_function("PFSS_kern")
	# Theta and phi should be in rads going into the kernel
	# R should be in solar radii
	func(theta_gpu, phi_gpu, r_gpu, g_gpu, h_gpu, outs_gpu, justB_gpu, aSCS_gpu, block=(ncoeffs+1, 1, 1), grid=(npoints,1,1), shared=ssize)
	cuda.memcpy_dtoh(outs, outs_gpu)
	result = outs
	#for i in range(npoints):		
	#	temp = SPHVEC2CART(Lat_in, Lons[i], outs[0 + 4 * i], outs[1 + 4 * i], outs[2 + 4 * i])
	#	result.append(temp[0]) 
	#	result.append(temp[1])
	#	result.append(temp[2])
	#	result.append(outs[3 + 4 * i]) 
	return result
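# Hedged helper (not part of the original): per the commented-out loop and
# the comment where outs is set up, the flat output appears to pack four
# floats per point, Br, Btheta, Bphi and B. This unpacks them for reuse.
def unpackB(outs, npoints):
    return [(outs[4 * i], outs[4 * i + 1], outs[4 * i + 2], outs[4 * i + 3])
            for i in range(npoints)]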
def CudaNegative(inPath, outPath):

    totalT0 = time.clock()

    im = Image.open(inPath)
    px = numpy.array(im)
    px = px.astype(numpy.float32)

    getAndConvertT1 = time.clock()

    allocT0 = time.clock()
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)

    allocT1 = time.clock()

    #Kernel declaration
    kernelT0 = time.clock()

    #Kernel grid and block size
    BLOCK_SIZE = 1024
    block = (1024, 1, 1)
    checkSize = numpy.int32(im.size[0] * im.size[1])
    grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)

    #Kernel text
    kernel = """
 
    __global__ void ng( float *inIm, int check ){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
 
        if(idx *3 < check*3)
        { 
        	inIm[idx*3]= 255-inIm[idx*3];
        	inIm[idx*3+1]= 255-inIm[idx*3+1];
        	inIm[idx*3+2]= 255-inIm[idx*3+2];
        }
    }
    """

    #Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("ng")
    func(d_px, checkSize, block=block, grid=grid)

    kernelT1 = time.clock()

    #Get back data from gpu
    backDataT0 = time.clock()

    ngPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(ngPx, d_px)
    ngPx = (numpy.uint8(ngPx))

    backDataT1 = time.clock()

    #Save image
    storeImageT0 = time.clock()
    pil_im = Image.fromarray(ngPx, mode="RGB")

    pil_im.save(outPath)

    totalT1 = time.clock()

    getAndConvertTime = getAndConvertT1 - totalT0
    allocTime = allocT1 - allocT0
    kernelTime = kernelT1 - kernelT0
    backDataTime = backDataT1 - backDataT0
    storeImageTime = totalT1 - storeImageT0
    totalTime = totalT1 - totalT0

    print "Negative image"
    print "Image size: ", im.size
    print "Time taken to get and convert image data: ", getAndConvertTime
    print "Time taken to allocate memory on the GPU: ", allocTime
    print "Kernel execution time: ", kernelTime
    print "Time taken to get image data from GPU and convert it: ", backDataTime
    print "Time taken to save the image: ", storeImageTime
    print "Total execution time: ", totalTime
    print
def Encrypt():
    #Initialize perf_timer
    perf_timer = np.zeros(4)
    overall_time = perf_counter()
    
    # Read image & clear temp directories
    img, dim, misc_timer = PreProcess()

    # Resize image for Arnold Mapping
    misc_timer[3] = perf_counter()
    if dim[0]!=dim[1]:
        N = max(dim[0], dim[1])
        img = cv2.resize(img,(N,N), interpolation=cv2.INTER_CUBIC)
        dim = img.shape

    # Calculate no. of rounds
    rounds = randint(8,16)
    misc_timer[3] = perf_counter() - misc_timer[3]
    
    # Flatten image to vector, transfer to GPU
    temp_timer = perf_counter()
    imgArr = np.asarray(img).reshape(-1)
    gpuimgIn = cuda.mem_alloc(imgArr.nbytes)
    gpuimgOut = cuda.mem_alloc(imgArr.nbytes)
    cuda.memcpy_htod(gpuimgIn, imgArr)
    func = cf.mod.get_function("ArMapImg")
    misc_timer[1] += perf_counter() - temp_timer

    # Warm-Up GPU for accurate benchmarking
    if cfg.DEBUG_TIMER:
        funcTemp = cf.mod.get_function("WarmUp")
        funcTemp(grid=(1,1,1), block=(1,1,1))

    # Log no. of rounds of ArMapping
    temp_timer = perf_counter()
    with open(cfg.LOG, 'a+') as f:
        f.write(str(rounds)+"\n")
    misc_timer[2] += perf_counter() - temp_timer

    # Perform Arnold Mapping
    perf_timer[0] = perf_counter()
    for i in range (max(rounds,5)):
        func(gpuimgIn, gpuimgOut, grid=(dim[0],dim[1],1), block=(3,1,1))
        gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn
    perf_timer[0] = perf_counter() - perf_timer[0]

    if cfg.DEBUG_IMAGES:
        misc_timer[6] += cf.interImageWrite(gpuimgIn, "IN_1", len(imgArr), dim)

    # Fractal XOR Phase
    temp_timer = perf_counter()
    fractal, misc_timer[4] = cf.getFractal(dim[0])
    fracArr  = np.asarray(fractal).reshape(-1)
    gpuFrac = cuda.mem_alloc(fracArr.nbytes)
    cuda.memcpy_htod(gpuFrac, fracArr)
    func = cf.mod.get_function("FracXOR")
    misc_timer[4] = perf_counter() - temp_timer

    perf_timer[1] = perf_counter()
    func(gpuimgIn, gpuimgOut, gpuFrac, grid=(dim[0]*dim[1],1,1), block=(3,1,1))
    perf_timer[1] = perf_counter() - perf_timer[1]

    gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn

    if cfg.DEBUG_IMAGES:
        misc_timer[6] += cf.interImageWrite(gpuimgIn, "IN_2", len(imgArr), dim)

    # Permutation: ArMap-based intra-row/column rotation
    perf_timer[2] = perf_counter()
    U = cf.genRelocVec(dim[0],dim[1],cfg.P1LOG, ENC=True) # Col-rotation | len(U)=n, values from 0->m
    V = cf.genRelocVec(dim[1],dim[0],cfg.P2LOG, ENC=True) # Row-rotation | len(V)=m, values from 0->n
    perf_timer[2] = perf_counter() - perf_timer[2]
    
    # Transfer rotation-vectors to GPU
    misc_timer[5] = perf_counter()
    gpuU = cuda.mem_alloc(U.nbytes)
    gpuV = cuda.mem_alloc(V.nbytes)
    cuda.memcpy_htod(gpuU, U)
    cuda.memcpy_htod(gpuV, V)
    func = cf.mod.get_function("Enc_GenCatMap")
    misc_timer[5] = perf_counter() - misc_timer[5]

    # Perform permutation
    perf_timer[3] = perf_counter()
    for i in range(cfg.PERM_ROUNDS):
        func(gpuimgIn, gpuimgOut, gpuU, gpuV, grid=(dim[0],dim[1],1), block=(3,1,1))
        gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn
    perf_timer[3] = perf_counter() - perf_timer[3]

    if cfg.DEBUG_IMAGES:
        misc_timer[6] += cf.interImageWrite(gpuimgIn, "IN_3", len(imgArr), dim)

    # Transfer vector back to host and reshape into encrypted output
    temp_timer = perf_counter()
    cuda.memcpy_dtoh(imgArr, gpuimgIn)
    img = (np.reshape(imgArr,dim)).astype(np.uint8)
    cv2.imwrite(cfg.ENC_OUT, img)
    misc_timer[1] += perf_counter() - temp_timer
    
    # Print timing statistics
    if cfg.DEBUG_TIMER:
        overall_time = perf_counter() - overall_time
        perf = np.sum(perf_timer)
        misc = np.sum(misc_timer)

        print("\nTarget: {} ({}x{})".format(cfg.ENC_IN, dim[1], dim[0]))    

        print("\nPERF. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(perf, perf/overall_time*100))
        print("ArMap Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[0], perf_timer[0]/overall_time*100))   
        print("XOR Kernel: \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[1], perf_timer[1]/overall_time*100))
        print("Shuffle Gen: \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[2], perf_timer[2]/overall_time*100))
        print("Perm. Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[3], perf_timer[3]/overall_time*100))

        print("\nMISC. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(misc, misc/overall_time*100))
        print("Dir. Cleanup:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[0], misc_timer[0]/overall_time*100)) 
        print("I/O:\t\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[1], misc_timer[1]/overall_time*100))
        print("Logging:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[2], misc_timer[2]/overall_time*100))
        print("ArMap Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[3], misc_timer[3]/overall_time*100)) 
        print("FracXOR Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[4], misc_timer[4]/overall_time*100)) 
        print("Permute Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[5], misc_timer[5]/overall_time*100))

        if cfg.DEBUG_IMAGES:
            print("Debug Images:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[6], misc_timer[6]/overall_time*100))

        print("\nNET TIME:\t{0:7.5f}s\n".format(overall_time))
def build_sparse_transition_model(filename = 'Transition_dict', n_actions = 16, nt = None, dt =None, F =None, startpos = None, endpos = None, Test_grid =False):
    
    global state_list
    global base_path
    global save_path

    print("Building Sparse Model")
    t1 = time.time()
    #setup grid
    print("input to build_sparse_trans_model:\n")
    print("n_actions", n_actions)
    print("nt, dt", nt, dt)

    g, xs, ys, X, Y, vel_field_data, nmodes, num_rzns, path_mat, setup_params, setup_param_str = setup_grid(num_actions = n_actions, nt = nt, Test_grid= Test_grid)

    print("xs: ",xs)
    print("ys", ys)

    all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi = vel_field_data
    check_nt, check_nrzns, nmodes = all_Yi.shape

    all_u_mat = all_u_mat.astype(np.float32)
    all_v_mat = all_v_mat.astype(np.float32)
    all_ui_mat = all_ui_mat.astype(np.float32)
    all_vi_mat = all_vi_mat.astype(np.float32)
    all_Yi = all_Yi.astype(np.float32)


    #setup_params = [num_actions, nt, dt, F, startpos, endpos] reference from setup grid
    nT = setup_params[1]  # total no. of time steps TODO: check default value
    print("****CHECK: ", nt, nT, check_nt)
    # assert (nt == nT), "nt and nT are not the same!"
    #if nt specified in runner is within nT from param file, then use nt. i.e. model will be built for nt timesteps.
    if nt != None and nt <= nT:
        nT = nt
    is_stationary = 0  # 0 is false; any other number is true. is_stationary = 0 (false) means the flow is NOT stationary
    #  and S2 will be indexed by T+1. if is_stationary = x (true), then S2 is indexed by 0, same as S1.
    # list_size = 10     #predefined size of list for each S2
    # if nt > 1:
    #     is_stationary = 0
    gsize = g.ni  # size of grid along 1 direction. ASSUMING square grid.
    num_actions = setup_params[0]
    nrzns = num_rzns
    bDimx = nrzns # for small test cases
    if nrzns>=1000:
        bDimx = 1000   #for large problems     
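    # a CUDA block is capped at 1024 threads, so realizations beyond bDimx
    # are spread across blockIdx.z (see rzn_id in the kernel further below)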
    dt = setup_params[2]
    F = setup_params[3]
    r_outbound = g.r_outbound
    r_terminal = g.r_terminal
    i_term = g.endpos[0]  # terminal state indices
    j_term = g.endpos[1]

    #name of output pickle file containing transition prob in dictionary format
    if nT > 1:
        prefix = '3D_' + str(nT) + 'nT_a'
    else:
        prefix = '2D_a'
    filename =  filename + prefix + str(n_actions) #TODO: change filename
    base_path = join(ROOT_DIR,'DP/Trans_matxs_3D/')
    save_path = base_path + filename
    if exists(save_path):
        print("Folder Already Exists !!\n")
        return
    # TODO: remove z from params. it is only for checks
    z=-9999
    params = np.array(
        [gsize, num_actions, nrzns, F, dt, r_outbound, r_terminal, nmodes, i_term, j_term, nT, is_stationary, z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z]).astype(
        np.float32)
    st_sp_size = (gsize ** 2) # size of spatial state space
    print("check stsp_size", gsize, nT, st_sp_size)
    save_file_for_each_a = False

    print("params")
    print("gsize ", params[0], "\n",
        "num_actions ", params[1], "\n",
        "nrzns ", params[2], "\n",
        "F ", params[3], "\n",
        "dt ", params[4], "\n",
        "r_outbound ", params[5], "\n",
        "r_terminal ", params[6], "\n",
        "nmodes ", params[7], "\n",
        "i_term ", params[8], "\n",
        "j_term ", params[9], "\n",
        "nT", params[10], "\n",
        "is_stationary ", params[11], ""
    
        )


    # cpu initialisations.
    # dummy initialisations to copy size to gpu
    # vxrzns = np.zeros((nrzns, gsize, gsize), dtype=np.float32)
    # vyrzns = np.zeros((nrzns, gsize, gsize), dtype=np.float32)

    results = -1 * np.ones(((gsize ** 2) * nrzns), dtype=np.float32)
    sumR_sa = np.zeros(st_sp_size).astype(np.float32)
    Tdummy = np.zeros(2, dtype = np.float32)

    #  informational initialisations
    ac_angles = np.linspace(0, 2 * pi, num_actions, endpoint =  False, dtype=np.float32)
    print("action angles:\n", ac_angles)

    ac_angle = ac_angles[0].astype(np.float32) # just for allocating memory
    # xs = np.arange(gsize, dtype=np.float32)
    # ys = np.arange(gsize, dtype=np.float32)
    xs = xs.astype(np.float32)
    ys = ys.astype(np.float32)
    print("params: \n", params, "\n\n")

    t1 = time.time()
    # allocates memory on gpu. vxrzns and vyrzns need to be allocated just once and will be overwritten for each timestep
    # vxrzns_gpu = cuda.mem_alloc(vxrzns.nbytes)
    # vyrzns_gpu = cuda.mem_alloc(vyrzns.nbytes)
    all_u_mat_gpu = cuda.mem_alloc(all_u_mat.nbytes)
    all_v_mat_gpu = cuda.mem_alloc(all_v_mat.nbytes)
    all_ui_mat_gpu = cuda.mem_alloc(all_ui_mat.nbytes)
    all_vi_mat_gpu = cuda.mem_alloc(all_vi_mat.nbytes)
    all_Yi_gpu = cuda.mem_alloc(all_Yi.nbytes)    
    vel_data_gpu = [all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu]

    ac_angles_gpu = cuda.mem_alloc(ac_angles.nbytes)
    ac_angle_gpu = cuda.mem_alloc(ac_angle.nbytes)
    xs_gpu = cuda.mem_alloc(xs.nbytes)
    ys_gpu = cuda.mem_alloc(ys.nbytes)
    params_gpu = cuda.mem_alloc(params.nbytes)
    T_gpu = cuda.mem_alloc(Tdummy.nbytes)


    # copies contents of a to  allocated memory on gpu
    cuda.memcpy_htod(all_u_mat_gpu, all_u_mat)
    cuda.memcpy_htod(all_v_mat_gpu, all_v_mat)
    cuda.memcpy_htod(all_ui_mat_gpu, all_ui_mat)
    cuda.memcpy_htod(all_vi_mat_gpu, all_vi_mat)
    cuda.memcpy_htod(all_Yi_gpu, all_Yi)

    cuda.memcpy_htod(ac_angle_gpu, ac_angle)
    cuda.memcpy_htod(xs_gpu, xs)
    cuda.memcpy_htod(ys_gpu, ys)
    cuda.memcpy_htod(params_gpu, params)

    for T in range(nT):
        print("*** Computing data for timestep, T = ", T, '\n')
        # params[7] = T
        # cuda.memcpy_htod(params_gpu, params)
        Tdummy[0] = T
        # Load Velocities
        # vxrzns = np.zeros((nrzns, gsize, gsize), dtype = np.float32)
        # #expecting to see probs of 0.5 in stream area
        # for i in range(int(nrzns/2)):
        #     vxrzns[i,int(gsize/2 -1):int(gsize/2 +1),:] = 1
        # vyrzns = np.zeros((nrzns, gsize, gsize), dtype = np.float32)
        # vxrzns = np.load('/home/rohit/Documents/Research/ICRA_2020/DDDAS_2D_Highway/Input_data_files/Velx_5K_rlzns.npy')
        # vyrzns = np.load('/home/rohit/Documents/Research/ICRA_2020/DDDAS_2D_Highway/Input_data_files/Vely_5K_rlzns.npy')
        # vxrzns = Vx_rzns
        # vyrzns = Vy_rzns
        # vxrzns = vxrzns.astype(np.float32)
        # vyrzns = vyrzns.astype(np.float32)
        Tdummy = Tdummy.astype(np.float32)

        # TODO: sanity check on dimensions: compare loaded matrix shape with gsize, numrzns

        # copy loaded velocities to gpu
        # cuda.memcpy_htod(vxrzns_gpu, vxrzns)
        # cuda.memcpy_htod(vyrzns_gpu, vyrzns)
        cuda.memcpy_htod(T_gpu, Tdummy)

        print("pre func")

        coo_list_a, Rs_list_a = build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu,
                                                                   xs_gpu, ys_gpu,
                                                                   ac_angles, results, sumR_sa,
                                                                   save_file_for_each_a=False)

        # print("R_s_a0 \n", Rs_list_a[0][0:200])
        print("post func")


        # TODO: end loop over timesteps here and concatenate COOs and R_sas over timesteps for each action
        # full_coo_list and full_Rs_list are lists with each element containing coo and R_s for an action of the same index
        if T > 0:
            full_coo_list_a, full_Rs_list_a = concatenate_results_across_time(coo_list_a, Rs_list_a, full_coo_list_a,
                                                                              full_Rs_list_a)
            # TODO: finish concatenate...() function
        else:
            full_coo_list_a = coo_list_a
            full_Rs_list_a = Rs_list_a

    t2 = time.time()
    build_time = t2 - t1
    print("build_time ", build_time)

    #save data to file
    # data = setup_params, setup_param_str, g.reward_structure, build_time
    # write_files(full_coo_list_a, filename + '_COO', data)

    # print("Pickled sparse files !")

    #build probability transition dictionary
    state_list = g.ac_state_space()
    init_transition_dict = initialise_dict(g)
    transition_dict = convert_COO_to_dict(init_transition_dict, g, full_coo_list_a, full_Rs_list_a)
    print("conversion COO to dict done")

    #save dictionary to file
    data = setup_params, setup_param_str, g.reward_structure, build_time
    write_files(transition_dict, filename, data)
    pickleFile(full_coo_list_a, save_path + '/' + filename + '_COO')
    pickleFile(full_Rs_list_a, save_path + '/' + filename + '_Rsa')
def CudaBrightness(inPath, outPath):

    totalT0 = time.clock()

    im = Image.open(inPath)
    px = numpy.array(im)
    px = px.astype(numpy.float32)

    getAndConvertT1 = time.clock()

    allocT0 = time.clock()
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)

    allocT1 = time.clock()

    #Kernel declaration
    kernelT0 = time.clock()

    #Kernel grid and block size
    BLOCK_SIZE = 1024
    block = (1024, 1, 1)
    checkSize = numpy.int32(im.size[0] * im.size[1])
    grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)

    #Kernel text
    kernel = """
 
    __global__ void br( float *inIm, int check, int brightness ){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
        if(idx *3 < check*3)
        { 
			if(inIm[idx*3]+brightness > 255)
				inIm[idx*3] = 255;
			else 
        		inIm[idx*3]= inIm[idx*3]+brightness;
        	
        	if(inIm[idx*3+1]+brightness > 255)
				inIm[idx*3+1] = 255;
			else 
        		inIm[idx*3+1]= inIm[idx*3+1]+brightness;
        	
        	if(inIm[idx*3+2]+brightness > 255)
				inIm[idx*3+2] = 255;
			else 
        		inIm[idx*3+2]= inIm[idx*3+2]+brightness;
        }
    }
    """

    brightness = int(
        raw_input("Enter the level of brightness (-255 to 255): "))
    print
    if brightness > 255:
        brightness = 255
    if brightness < -255:
        brightness = -255
    brightness = numpy.int32(brightness)
    #Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("br")
    func(d_px, checkSize, brightness, block=block, grid=grid)

    kernelT1 = time.clock()

    #Get back data from gpu
    backDataT0 = time.clock()

    brPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(brPx, d_px)
    brPx = (numpy.uint8(brPx))

    backDataT1 = time.clock()

    #Save image
    storeImageT0 = time.clock()
    pil_im = Image.fromarray(brPx, mode="RGB")

    pil_im.save(outPath)

    totalT1 = time.clock()

    getAndConvertTime = getAndConvertT1 - totalT0
    allocTime = allocT1 - allocT0
    kernelTime = kernelT1 - kernelT0
    backDataTime = backDataT1 - backDataT0
    storeImageTime = totalT1 - storeImageT0
    totalTime = totalT1 - totalT0

    print "Brightness filter"
    print "Image size : ", im.size
    print "Time taken to get and convert image data: ", getAndConvertTime
    print "Time taken to allocate memory on the GPU: ", allocTime
    print "Kernel execution time: ", kernelTime
    print "Time taken to get image data from GPU and convert it: ", backDataTime
    print "Time taken to save the image: ", storeImageTime
    print "Total execution time: ", totalTime
    print
Example #47
0
    def do_inference(self,
                     image,
                     score_threshold=0.4,
                     top_k=10000,
                     NMS_threshold=0.4,
                     NMS_flag=True,
                     skip_scale_branch_list=[]):

        if image.ndim != 3 or image.shape[2] != 3:
            print('Only RGB images are supported.')
            return None
        input_height = self.input_shape[2]
        input_width = self.input_shape[3]
        if image.shape[0] != input_height or image.shape[1] != input_width:
            logging.info(
                'The size of input image is not %dx%d.\nThe input image will be resized keeping the aspect ratio.'
                % (input_height, input_width))

        input_batch = numpy.zeros(
            (1, input_height, input_width, self.input_shape[1]),
            dtype=numpy.float32)
        left_pad = 0
        top_pad = 0
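        # Letterbox resize: scale by the limiting dimension and centre the
        # image on the input canvas; left_pad/top_pad record the offsets so
        # box coordinates can be shifted back after decoding.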
        if image.shape[0] / image.shape[1] > input_height / input_width:
            resize_scale = input_height / image.shape[0]
            input_image = cv2.resize(image, (0, 0),
                                     fx=resize_scale,
                                     fy=resize_scale)
            left_pad = int((input_width - input_image.shape[1]) / 2)
            input_batch[0, :, left_pad:left_pad +
                        input_image.shape[1], :] = input_image
        else:
            resize_scale = input_width / image.shape[1]
            input_image = cv2.resize(image, (0, 0),
                                     fx=resize_scale,
                                     fy=resize_scale)
            top_pad = int((input_height - input_image.shape[0]) / 2)
            input_batch[0, top_pad:top_pad +
                        input_image.shape[0], :, :] = input_image

        input_batch = input_batch.transpose([0, 3, 1, 2])
        input_batch = numpy.array(input_batch, dtype=numpy.float32, order='C')
        self.inputs[0].host = input_batch

        [cuda.memcpy_htod(inp.device, inp.host) for inp in self.inputs]
        self.executor.execute(batch_size=self.engine.max_batch_size,
                              bindings=self.bindings)
        [
            cuda.memcpy_dtoh(output.host, output.device)
            for output in self.outputs
        ]
        outputs = [out.host for out in self.outputs]
        outputs = [
            numpy.squeeze(output.reshape(shape))
            for output, shape in zip(outputs, self.output_shapes)
        ]

        bbox_collection = []
        for i in range(self.num_output_scales):
            if i in skip_scale_branch_list:
                continue

            score_map = numpy.squeeze(outputs[i * 2])

            # show feature maps-------------------------------
            # score_map_show = score_map * 255
            # score_map_show[score_map_show < 0] = 0
            # score_map_show[score_map_show > 255] = 255
            # cv2.imshow('score_map' + str(i), cv2.resize(score_map_show.astype(dtype=numpy.uint8), (0, 0), fx=2, fy=2))
            # cv2.waitKey()

            bbox_map = numpy.squeeze(outputs[i * 2 + 1])

            RF_center_Xs = numpy.array([
                self.receptive_field_center_start[i] +
                self.receptive_field_stride[i] * x
                for x in range(score_map.shape[1])
            ])
            RF_center_Xs_mat = numpy.tile(RF_center_Xs,
                                          [score_map.shape[0], 1])
            RF_center_Ys = numpy.array([
                self.receptive_field_center_start[i] +
                self.receptive_field_stride[i] * y
                for y in range(score_map.shape[0])
            ])
            RF_center_Ys_mat = numpy.tile(RF_center_Ys,
                                          [score_map.shape[1], 1]).T

            x_lt_mat = RF_center_Xs_mat - bbox_map[0, :, :] * self.constant[i]
            y_lt_mat = RF_center_Ys_mat - bbox_map[1, :, :] * self.constant[i]
            x_rb_mat = RF_center_Xs_mat - bbox_map[2, :, :] * self.constant[i]
            y_rb_mat = RF_center_Ys_mat - bbox_map[3, :, :] * self.constant[i]
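            # Boxes are decoded as offsets from each receptive-field centre,
            # scaled by the per-branch constant, giving left-top/right-bottom
            # corners in input-image pixel coordinates.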

            x_lt_mat[x_lt_mat < 0] = 0
            y_lt_mat[y_lt_mat < 0] = 0
            x_rb_mat[x_rb_mat > input_width] = input_width
            y_rb_mat[y_rb_mat > input_height] = input_height

            select_index = numpy.where(score_map > score_threshold)
            for idx in range(select_index[0].size):
                bbox_collection.append(
                    (x_lt_mat[select_index[0][idx], select_index[1][idx]] -
                     left_pad,
                     y_lt_mat[select_index[0][idx], select_index[1][idx]] -
                     top_pad,
                     x_rb_mat[select_index[0][idx], select_index[1][idx]] -
                     left_pad,
                     y_rb_mat[select_index[0][idx], select_index[1][idx]] -
                     top_pad, score_map[select_index[0][idx],
                                        select_index[1][idx]]))

        # NMS
        bbox_collection = sorted(bbox_collection,
                                 key=lambda item: item[-1],
                                 reverse=True)
        if len(bbox_collection) > top_k:
            bbox_collection = bbox_collection[0:top_k]
        bbox_collection_numpy = numpy.array(bbox_collection,
                                            dtype=numpy.float32)
        bbox_collection_numpy = bbox_collection_numpy / resize_scale

        if NMS_flag:
            final_bboxes = NMS(bbox_collection_numpy, NMS_threshold)
            final_bboxes_ = []
            for i in range(final_bboxes.shape[0]):
                final_bboxes_.append(
                    (final_bboxes[i, 0], final_bboxes[i, 1],
                     final_bboxes[i, 2], final_bboxes[i, 3], final_bboxes[i,
                                                                          4]))

            return final_bboxes_
        else:
            return bbox_collection_numpy
def CudaColor(inPath, outPath):

    totalT0 = time.clock()

    im = Image.open(inPath)
    px = numpy.array(im)
    px = px.astype(numpy.float32)

    getAndConvertT1 = time.clock()

    allocT0 = time.clock()
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)

    allocT1 = time.clock()

    #Kernel declaration
    kernelT0 = time.clock()

    #Kernel grid and block size
    BLOCK_SIZE = 1024
    block = (1024, 1, 1)
    checkSize = numpy.int32(im.size[0] * im.size[1])
    grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)

    #Kernel text
    kernel = """
 
    __global__ void co( float *inIm, int check, int color){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
        if(idx*3 < check*3)
        { 
			if(color == 0)
			{
				inIm[idx*3+1] = inIm[idx*3+1]-255;
				inIm[idx*3+2] = inIm[idx*3+2]-255;
			}
			else if(color == 1)
			{
				inIm[idx*3] = inIm[idx*3]-255;
				inIm[idx*3+2] = inIm[idx*3+2]-255;
			}
			else if(color == 2)
			{
				inIm[idx*3] = inIm[idx*3]-255;
				inIm[idx*3+1] = inIm[idx*3+1]-255;
			}
			
			if(inIm[idx*3] < 0)
				inIm[idx*3] = 0;
			if(inIm[idx*3] > 255)
				inIm[idx*3] = 255;
				
			if(inIm[idx*3+1] < 0)
				inIm[idx*3+1] = 0;
			if(inIm[idx*3+1] > 255)
				inIm[idx*3+1] = 255;
				
			if(inIm[idx*3+2] < 0)
				inIm[idx*3+2] = 0;
			if(inIm[idx*3+2] > 255)
				inIm[idx*3+2] = 255;
        }
    }
    """

    color = int(
        raw_input("Enter the color of the filter (0-Red;1-Green;2-Blue): "))
    print
    color = numpy.int32(color)
    #Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("co")
    func(d_px, checkSize, color, block=block, grid=grid)

    kernelT1 = time.clock()

    #Get back data from gpu
    backDataT0 = time.clock()

    coPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(coPx, d_px)
    coPx = (numpy.uint8(coPx))

    backDataT1 = time.clock()

    #Save image
    storeImageT0 = time.clock()
    pil_im = Image.fromarray(coPx, mode="RGB")

    pil_im.save(outPath)

    totalT1 = time.clock()

    getAndConvertTime = getAndConvertT1 - totalT0
    allocTime = allocT1 - allocT0
    kernelTime = kernelT1 - kernelT0
    backDataTime = backDataT1 - backDataT0
    storeImageTime = totalT1 - storeImageT0
    totalTime = totalT1 - totalT0

    print "Color Filter"
    print "Image size : ", im.size
    print "Time taken to get and convert image data: ", getAndConvertTime
    print "Time taken to allocate memory on the GPU: ", allocTime
    print "Kernel execution time: ", kernelTime
    print "Time taken to get image data from GPU and convert it: ", backDataTime
    print "Time taken to save the image: ", storeImageTime
    print "Total execution time: ", totalTime
    print
Example #49
0
def walk(comm, raw, slices, indices, nbrw, sorw, blockmin, blockmax, name,
         allLabels, smooth, uncertainty):

    rank = comm.Get_rank()
    size = comm.Get_size()

    if raw.dtype == 'uint8':
        kernel = _build_kernel_int8()
        raw = (raw - 128).astype('int8')
    else:
        kernel = _build_kernel_float32()
        raw = raw.astype(np.float32)

    foundAxis = [0] * 3
    for k in range(3):
        if indices[k]:
            foundAxis[k] = 1

    zsh, ysh, xsh = raw.shape
    fill_gpu = _build_kernel_fill()

    block = (32, 32, 1)
    x_grid = (xsh // 32) + 1
    y_grid = (ysh // 32) + 1
    grid2 = (int(x_grid), int(y_grid), int(zsh))

    a = np.empty(raw.shape, dtype=np.float32)
    final = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8)
    segment_npy = np.empty(1, dtype=np.uint8)

    memory_error = False

    try:
        raw_gpu = gpuarray.to_gpu(raw)
        a_gpu = cuda.mem_alloc(a.nbytes)

        if smooth:
            update_gpu = _build_update_gpu()
            curvature_gpu = _build_curvature_gpu()
            b_gpu = gpuarray.zeros(raw.shape, dtype=np.float32)

        zshape = np.int32(zsh)
        yshape = np.int32(ysh)
        xshape = np.int32(xsh)
        sorw = np.int32(sorw)
        nbrw = np.int32(nbrw)

        slshape = [None] * 3
        indices_gpu = [None] * 3
        beta_gpu = [None] * 3
        slices_gpu = [None] * 3
        ysh = [None] * 3
        xsh = [None] * 3
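        # note: ysh/xsh are reused as per-axis lists from here on; the
        # scalar shapes were already captured above as yshape/xshape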

        for k, found in enumerate(foundAxis):
            if found:
                indices_tmp = np.array(indices[k], dtype=np.int32)
                slices_tmp = slices[k].astype(np.int32)
                slices_tmp = reduceBlocksize(slices_tmp)
                slshape[k], ysh[k], xsh[k] = slices_tmp.shape
                indices_gpu[k] = gpuarray.to_gpu(indices_tmp)
                slices_gpu[k] = gpuarray.to_gpu(slices_tmp)
                Beta = np.zeros(slices_tmp.shape, dtype=np.float32)
                for m in range(slshape[k]):
                    for n in allLabels:
                        A = _calc_label_walking_area(slices_tmp[m], n)
                        plane = indices_tmp[m]
                        if k == 0: raw_tmp = raw[plane]
                        if k == 1: raw_tmp = raw[:, plane]
                        if k == 2: raw_tmp = raw[:, :, plane]
                        Beta[m] += _calc_var(raw_tmp.astype(float), A)
                beta_gpu[k] = gpuarray.to_gpu(Beta)

        sendbuf = np.zeros(1, dtype=np.int32)
        recvbuf = np.zeros(1, dtype=np.int32)
        comm.Barrier()
        comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX)

    except Exception as e:
        print('Error: GPU out of memory. Data too large.')
        sendbuf = np.zeros(1, dtype=np.int32) + 1
        recvbuf = np.zeros(1, dtype=np.int32)
        comm.Barrier()
        comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX)

    if recvbuf > 0:
        memory_error = True
        try:
            a_gpu.free()
        except:
            pass
        return memory_error, None, None, None

    if smooth:
        try:
            update_gpu = _build_update_gpu()
            curvature_gpu = _build_curvature_gpu()
            b_npy = np.zeros(raw.shape, dtype=np.float32)
            b_gpu = cuda.mem_alloc(b_npy.nbytes)
            cuda.memcpy_htod(b_gpu, b_npy)
            final_smooth = np.zeros((blockmax - blockmin, yshape, xshape),
                                    dtype=np.uint8)
            sendbuf_smooth = np.zeros(1, dtype=np.int32)
            recvbuf_smooth = np.zeros(1, dtype=np.int32)
            comm.Barrier()
            comm.Allreduce([sendbuf_smooth, MPI.INT],
                           [recvbuf_smooth, MPI.INT],
                           op=MPI.MAX)
        except Exception as e:
            print(
                'Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.'
            )
            sendbuf_smooth = np.zeros(1, dtype=np.int32) + 1
            recvbuf_smooth = np.zeros(1, dtype=np.int32)
            comm.Barrier()
            comm.Allreduce([sendbuf_smooth, MPI.INT],
                           [recvbuf_smooth, MPI.INT],
                           op=MPI.MAX)
        if recvbuf_smooth > 0:
            smooth = 0
            try:
                b_gpu.free()
            except:
                pass

    if uncertainty:
        try:
            max_npy = np.zeros((3, ) + raw.shape, dtype=np.float32)
            max_gpu = cuda.mem_alloc(max_npy.nbytes)
            cuda.memcpy_htod(max_gpu, max_npy)
            kernel_uncertainty = _build_kernel_uncertainty()
            kernel_max = _build_kernel_max()
            sendbuf_uq = np.zeros(1, dtype=np.int32)
            recvbuf_uq = np.zeros(1, dtype=np.int32)
            comm.Barrier()
            comm.Allreduce([sendbuf_uq, MPI.INT], [recvbuf_uq, MPI.INT],
                           op=MPI.MAX)
        except Exception as e:
            print(
                'Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.'
            )
            sendbuf_uq = np.zeros(1, dtype=np.int32) + 1
            recvbuf_uq = np.zeros(1, dtype=np.int32)
            comm.Barrier()
            comm.Allreduce([sendbuf_uq, MPI.INT], [recvbuf_uq, MPI.INT],
                           op=MPI.MAX)
        if recvbuf_uq > 0:
            uncertainty = False
            try:
                max_gpu.free()
            except:
                pass

    for label_counter, segment in enumerate(allLabels):
        print('%s:' % (name) + ' ' + str(label_counter + 1) + '/' +
              str(len(allLabels)))
        fill_gpu(a_gpu, xshape, yshape, block=block, grid=grid2)
        segment_gpu = np.int32(segment)
        segment_npy.fill(segment)
        for k, found in enumerate(foundAxis):
            if found:
                axis_gpu = np.int32(k)
                x_grid = (xsh[k] // 32) + 1
                y_grid = (ysh[k] // 32) + 1
                grid = (int(x_grid), int(y_grid), int(slshape[k]))
                kernel(axis_gpu,
                       segment_gpu,
                       raw_gpu,
                       slices_gpu[k],
                       a_gpu,
                       xshape,
                       yshape,
                       zshape,
                       indices_gpu[k],
                       sorw,
                       beta_gpu[k],
                       nbrw,
                       block=block,
                       grid=grid)
        cuda.memcpy_dtoh(a, a_gpu)

        if size > 1:
            a = sendrecv(a, blockmin, blockmax, comm, rank, size)

        if smooth or uncertainty:
            cuda.memcpy_htod(a_gpu, a)

        if uncertainty:
            kernel_max(max_gpu, a_gpu, xshape, yshape, block=block, grid=grid2)

        if smooth:
            for k in range(smooth):
                curvature_gpu(a_gpu,
                              b_gpu,
                              xshape,
                              yshape,
                              block=block,
                              grid=grid2)
                update_gpu(a_gpu,
                           b_gpu,
                           xshape,
                           yshape,
                           block=block,
                           grid=grid2)
            a_smooth = np.empty_like(a)
            cuda.memcpy_dtoh(a_smooth, a_gpu)
            if label_counter == 0:
                a_smooth[a_smooth < 0] = 0
                walkmap_smooth = np.copy(a_smooth)
            else:
                walkmap_smooth, final_smooth = max_to_label(
                    a_smooth, walkmap_smooth, final_smooth, blockmin, blockmax,
                    segment)

        if label_counter == 0:
            a[a < 0] = 0
            walkmap = np.copy(a)
        else:
            walkmap, final = max_to_label(a, walkmap, final, blockmin,
                                          blockmax, segment)

    if uncertainty:
        kernel_uncertainty(max_gpu,
                           a_gpu,
                           xshape,
                           yshape,
                           block=block,
                           grid=grid2)
        final_uncertainty = np.empty_like(a)
        cuda.memcpy_dtoh(final_uncertainty, a_gpu)
        final_uncertainty = final_uncertainty[blockmin:blockmax]
    else:
        final_uncertainty = None

    if not smooth:
        final_smooth = None

    try:
        a_gpu.free()
    except:
        pass

    return memory_error, final, final_uncertainty, final_smooth
def build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu, xs_gpu, ys_gpu, ac_angles,
                                       results, sumR_sa, save_file_for_each_a=False):
    gsize = int(params[0])
    num_actions = int(params[1])
    nrzns = int(params[2])

    all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu = vel_data_gpu

    results_gpu_list = []
    sumR_sa_gpu_list = []
    for i in range(num_actions):
        results_gpu_list.append(cuda.mem_alloc(results.nbytes))
        sumR_sa_gpu_list.append(cuda.mem_alloc(sumR_sa.nbytes))
    for i in range(num_actions):
        cuda.memcpy_htod(results_gpu_list[i], results)
        cuda.memcpy_htod(sumR_sa_gpu_list[i], sumR_sa)

    print("alloted mem in inner func")


    # let one thread access a state centre, access corresponding velocities, run all actions
    # TODO: dt may not be int for a general-purpose code

    mod = SourceModule("""
    __device__ int32_t get_thread_idx()
            // assigns idx to thread with which it accesses the flattened 3d vxrzns matrix
            // for a given T and a given action. 
            // runs for both 2d and 3d grid
            // TODO: may have to change this considering cache locality
        {
            // here i, j, k refer to a general matrix M[i][j][k]
            int32_t i = threadIdx.x;
            int32_t j = blockIdx.y;
            int32_t k = blockIdx.x;
            int32_t idx = k + (j*gridDim.x)  + (i*gridDim.x*gridDim.y)+ blockIdx.z*blockDim.x*gridDim.x*gridDim.y;
            return idx;
        }
    __device__ int32_t state1D_from_thread(int32_t T)
    {   
        // j ~ blockIdx.x
        // i ~ blockIdx.y 
        // These constitute a spatial state index from i and j of the grid;
        // last term is for including time index as well.
        return (blockIdx.x + (blockIdx.y*gridDim.x) + (T*gridDim.x*gridDim.y) ); 
    }
    __device__ int32_t state1D_from_ij(int32_t*  posid, int32_t T)
    {
        // posid = {i , j}
        // state id = j + i*dim(i) + T*dim(i)*dim(j)
        return (posid[1] + posid[0]*gridDim.x + (T*gridDim.x*gridDim.y) ) ; 
    }
    __device__ bool is_edge_state(int32_t i, int32_t j)
    {
        // n = gsize -1 that is the last index of the domain assuming square domain
        int32_t n = gridDim.x - 1;
        if (i == 0 || i == n || j == 0 || j == n ) 
            {
                return true;
            }
        else return false;
    }
    __device__ bool is_terminal(int32_t i, int32_t j, float* params)
    {
        int32_t i_term = params[8];         // terminal state indices
        int32_t j_term = params[9];
        if(i == i_term && j == j_term)
        {
            return true;
        }
        else return false;
    }
    __device__ bool my_isnan(int s)
    {
    // By IEEE 754 rule, NaN is not equal to NaN
    return s != s;
    }
    __device__ void get_xypos_from_ij(int32_t i, int32_t j, float* xs, float* ys, float* x, float* y)
    {
        *x = xs[j];
        *y = ys[gridDim.x - 1 - i];
        return;
    }
    __device__ float get_angle_in_0_2pi(float theta)
    {
        float f_pi = 3.141592;
        if (theta < 0)
        {
            return theta + (2*f_pi);
        }
        else
        {
            return theta;
        }  
    }
    __device__ float calculate_reward_const_dt(float* xs, float* ys, int32_t i_old, int32_t j_old, float xold, float yold, int32_t* newposids, float* params, float vnet_x, float vnet_y )
    {
        // xold and yold are centre of old state (i_old, j_old)
        float dt = params[4];
        float r1, r2, theta1, theta2, theta, h;
        float dt_new;
        float xnew, ynew;
        if (newposids[0] == i_old && newposids[1] == j_old)
        {
            dt_new = dt;
        }
        else
        {
            get_xypos_from_ij(newposids[0], newposids[1], xs, ys, &xnew, &ynew); //get centre of new states
            h = sqrtf((xnew - xold)*(xnew - xold) + (ynew - yold)*(ynew - yold));
            r1 = h/(sqrtf((vnet_x*vnet_x) + (vnet_y*vnet_y)));
            theta1 = get_angle_in_0_2pi(atan2f(vnet_y, vnet_x));
            theta2 = get_angle_in_0_2pi(atan2f(ynew - yold, xnew - xold));
            theta = fabsf(theta1 -theta2);
            r2 = fabsf(sinf(theta));
            dt_new = r1 + r2;
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[24] = r1;
                params[25] = r2;
            }
        }
        return -dt_new;
    }
    __device__ void move(float ac_angle, float vx, float vy, float* xs, float* ys, int32_t* posids, float* params, float* r )
    {
            int32_t n = params[0] - 1;      // gsize - 1
            // int32_t num_actions = params[1];
            // int32_t nrzns = params[2];
            float F = params[3];
            float dt = params[4];
            float r_outbound = params[5];
            float r_terminal = params[6];
            float Dj = fabsf(xs[1] - xs[0]);
            float Di = fabsf(ys[1] - ys[0]);
            float r_step = 0;
            *r = 0;
            int32_t i0 = posids[0];
            int32_t j0 = posids[1];
            float vnetx = F*cosf(ac_angle) + vx;
            float vnety = F*sinf(ac_angle) + vy;
            float x, y;
            get_xypos_from_ij(i0, j0, xs, ys, &x, &y); // x, y stores centre coords of state i0,j0
            float xnew = x + (vnetx * dt);
            float ynew = y + (vnety * dt);
            //checks TODO: remove checks once verified
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[12] = x;
                params[13] = y;
                params[14] = vnetx;
                params[15] = vnety;
                params[16] = xnew;
                params[17] = ynew;
                params[18] = ac_angle;
            }
            if (xnew > xs[n])
                {
                    xnew = xs[n];
                    *r += r_outbound;
                }
            else if (xnew < xs[0])
                {
                    xnew = xs[0];
                    *r += r_outbound;
                }
            if (ynew > ys[n])
                {
                    ynew =  ys[n];
                    *r += r_outbound;
                }
            else if (ynew < ys[0])
                {
                    ynew =  ys[0];
                    *r += r_outbound;
                }
            // TODO (done): check logic wrt remainderf; remquof had an issue
            int32_t xind, yind;
            //float remx = remquof((xnew - xs[0]), Dj, &xind);
            //float remy = remquof(-(ynew - ys[n]), Di, &yind);
            float remx = remainderf((xnew - xs[0]), Dj);
            float remy = remainderf(-(ynew - ys[n]), Di);
            xind = ((xnew - xs[0]) - remx)/Dj;
            yind = (-(ynew - ys[n]) - remy)/Di;
            if ((remx >= 0.5 * Dj) && (remy >= 0.5 * Di))
                {
                    xind += 1;
                    yind += 1;
                }
            else if ((remx >= 0.5 * Dj && remy < 0.5 * Di))
                {
                    xind += 1;
                }
            else if ((remx < 0.5 * Dj && remy >= 0.5 * Di))
                {
                    yind += 1;
                }
            if (!(my_isnan(xind) || my_isnan(yind)))
                {
                    posids[0] = yind;
                    posids[1] = xind;
                    if (is_edge_state(posids[0], posids[1]))     //line 110
                        {
                            *r += r_outbound;
                        }
                    
                    if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
                    {
                        params[26] = 9999;
                    }
                }
            r_step = calculate_reward_const_dt(xs, ys, i0, j0, x, y, posids, params, vnetx, vnety);
            //TODO: change back to normal when needed
            //r_step = -dt;
            *r += r_step; //TODO: numerical check remaining
            if (is_terminal(posids[0], posids[1], params))
                {
                    *r += r_terminal;
                }
            
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[19] = xnew;
                params[20] = ynew;
                params[21] = yind;
                params[22] = xind;
                params[23] = *r;
                //params[17] = ynew;
                //params[18] = ac_angle;
            }
    }


    __device__ void extract_velocity(float* vx, float* vy, int32_t T, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi, float* params)
    {
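        // Note: this function reconstructs the realization-specific velocity at the
        // current grid cell as  v = v_mean + sum_i ui[i] * Yi[i],  i.e. a mean field
        // plus a linear combination of spatial modes weighted by per-realization
        // coefficients (a reduced-order representation of the stochastic flow field).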
        int32_t nrzns = params[2];
        int32_t nmodes = params[7];              
        
        int32_t sp_uvi, str_uvi, sp_Yi, str_Yi; //startpoints and strides for accessing all_ui_mat, all_vi_mat and all_Yi
        float sum_x = 0;
        float sum_y = 0;
        float vx_mean, vy_mean;

        //thread index. also used to access resultant vxrzns[nrzns, gsize, gsize]
        int32_t idx = get_thread_idx();

        //rzn index to identify which of the 5k rzn it is. used to access all_Yi.
        int32_t rzn_id = (blockIdx.z * blockDim.x)  + threadIdx.x ;

        //mean_id is the index used to access the flattened all_u_mat[t,i,j].
        int32_t mean_id = state1D_from_thread(T);

        //to access all_ui_mat and all_vi_mat
        str_uvi = gridDim.x * gridDim.y;
        sp_uvi = (T * nmodes * str_uvi) + (gridDim.x * blockIdx.y) + (blockIdx.x);

        // to access all_Yi
        sp_Yi = (T * nrzns * nmodes) + (rzn_id * nmodes);

        vx_mean = all_u_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_x += all_ui_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }

        vy_mean = all_v_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_y += all_vi_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }
        
        *vx = vx_mean + sum_x;
        *vy = vy_mean + sum_y;
     
        return;
    }


    //test: changer from float* to float ac_angle
    __global__ void transition_calc(float* T_arr, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi,
                                    float ac_angle, float* xs, float* ys, float* params, float* sumR_sa, float* results)
                                            // results directions - 1: along S2;  2: along S1;  3: along columns towards count
    {
        int32_t gsize = params[0];          // size of grid along 1 direction. ASSUMING square grid.
        int32_t num_actions = params[1];    
        int32_t nrzns = params[2];
        float F = params[3];
        float dt = params[4];
        float r_outbound = params[5];
        float r_terminal = params[6];
        int32_t nmodes = params[7];              
        int32_t i_term = params[8];         // terminal state indices
        int32_t j_term = params[9];
        int32_t nT = params[10];
        int32_t is_stationary = params[11];
        int32_t T = (int32_t)T_arr[0];
        int32_t idx = get_thread_idx();

        float vx, vy;

        if(idx < gridDim.x*gridDim.y*nrzns)
        {
            int32_t posids[2] = {blockIdx.y, blockIdx.x};    //static declaration of array of size 2 to hold i and j values of S1. 
            int32_t sp_id;      //sp_id is space_id. S1%(gsize*gsize)

            //  After move() these will be overwritten by i and j values of S2
            float r;              // to store immediate reward

            extract_velocity(&vx, &vy, T, all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi, params);

            //move(*ac_angle, vx, vy, xs, ys, posids, params, &r);
            move(ac_angle, vx, vy, xs, ys, posids, params, &r);
            int32_t S1, S2;
            if (is_stationary == 1)
            {
                T = 0;
                S1 = state1D_from_thread(T);       //get init state number corresponding to thread id
                S2 = state1D_from_ij(posids, T);   //get successor state number corresponding to posids at the same timestep
            }
            else
            {
                S1 = state1D_from_thread(T);       //get init state number corresponding to thread id
                S2 = state1D_from_ij(posids, T+1); //get successor state number corresponding to posids and next timestep T+1
            }
            sp_id = S1%(gsize*gsize);   // space id; assigned on both branches so the atomicAdd below never reads it uninitialized
            //writing to sumR_sa. this array will later be divided by num_rzns, to get the avg
            float a = atomicAdd(&sumR_sa[sp_id], r); //TODO: try reduction if this is slow overall
            results[idx] = S2;
            __syncthreads();
            /*if (threadIdx.x == 0 && blockIdx.z == 0)
            {
                sumR_sa[S1] = sumR_sa[S1]/nrzns;    //TODO: change name to R_sa from sumR_sa since were not storing sum anymore
            }
           */
        }//if ends
        return;
    }
        """)

    # sumR_sa2 = np.empty_like(sumR_sa, dtype = np.float32)
    # cuda.memcpy_dtoh(sumR_sa2, sumR_sa_gpu)
    # print("sumR_sa",sumR_sa)
    # print("sumR_sa",sumR_sa2[0:10001])
    # T = np.array(T64, dtype = np.float32)
    params2 = np.empty_like(params).astype(np.float32)
    func = mod.get_function("transition_calc")
    for i in range(num_actions):
        print('T', T, " call kernel for action: ",i)
        func(T_gpu, all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu, ac_angles[i], xs_gpu, ys_gpu, params_gpu, sumR_sa_gpu_list[i], results_gpu_list[i],
             block=(bDimx, 1, 1), grid=(gsize, gsize, (nrzns // bDimx) + 1))
        if i == 0:
            cuda.memcpy_dtoh(params2, params_gpu)
            print("params check:",)
            print(  '\nangle= ', params2[18],
                    '\nx =' ,params2[12],
                '\ny =' ,params2[13] ,
                    '\nvnetx = ',params2[14],
                    '\nvnety =', params2[15],
                    '\nxnew =', params2[16],
                    '\nynew =', params2[17],
                    '\nxnewupd =', params2[19],
                    '\nynewupd =', params2[20],
                    '\nyind i=', params2[21],
                    '\nxind j=', params2[22],
                    '\nr- =', params2[23],
                    '\nr1+ =', params2[24],
                    '\nr2+ =', params2[25],
                    '\nenter_isnan =', params2[26]
                )

    results2_list = []
    sum_Rsa2_list = []
    for i in range(num_actions):
        results2_list.append(np.empty_like(results))
        sum_Rsa2_list.append(np.empty_like(sumR_sa))

    # SYNCHRONISATION - pycuda does it implicitly.

    for i in range(num_actions):
        cuda.memcpy_dtoh(results2_list[i], results_gpu_list[i])
        cuda.memcpy_dtoh(sum_Rsa2_list[i], sumR_sa_gpu_list[i])
        print("memcpy_dtoh for action: ", i)


    for i in range(num_actions):
        sum_Rsa2_list[i] = sum_Rsa2_list[i] / nrzns

    # print("sumR_sa2\n",sumR_sa2,"\n\n")

    # print("results_a0\n",results2_list[0].T[50::int(gsize**2)])
    print("OK REACHED END OF cuda relevant CODE\n")

    # make a list of inputs, one element per action, and run get_COO_ for each action in parallel
    # if save_file_for_each_a is true then each file must be named appropriately.
    if save_file_for_each_a:
        f1 = 'COO_Highway2D_T' + str(T) + '_a'
        f3 = '_of_' + str(num_actions) + 'A.npy'
        inputs = [(results2_list[i], nrzns, T, f1 + str(i) + f3) for i in range(num_actions)]
    else:
        inputs = [(results2_list[i], nrzns, T, None) for i in range(num_actions)]

    # coo_list_a is a list of coo for each action for the given timestep.
    with Pool(num_actions) as p:
        coo_list_a = p.starmap(get_COO_, inputs)
    # print("coo print\n", coo.T[4880:4900, :])
    print("\n\n")
    # print("time taken by cuda compute and transfer\n", (t2 - t1) / 60)
    # print("time taken for post processing to coo on cpu\n",(t3 - t2) / 60)

    return coo_list_a, sum_Rsa2_list
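The post-processing above depends on get_COO_, whose definition is not part of this
listing. A minimal sketch of what such a helper could look like, assuming results
holds one successor state S2 per (source state, realization) thread in source-major
order; the chunking convention and the save format are assumptions, not taken from
the listing:

import numpy as np

def get_COO_sketch(results, nrzns, T, fname=None):
    # results[s1*nrzns:(s1+1)*nrzns] is assumed to hold the successor state
    # sampled in each realization for source state s1.
    num_states = results.shape[0] // nrzns
    triples = []
    for s1 in range(num_states):
        s2s, counts = np.unique(results[s1 * nrzns:(s1 + 1) * nrzns],
                                return_counts=True)
        for s2, c in zip(s2s, counts):
            triples.append((s1, s2, c / nrzns))  # empirical transition probability
    coo = np.asarray(triples, dtype=np.float32).T  # shape (3, nnz)
    if fname is not None:
        np.save(fname, coo)
    return coo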
Example #51
0
def image_iterator_gpu(image_volume,
                       roi=None,
                       radius=2,
                       gray_levels=None,
                       binwidth=None,
                       dx=1,
                       dy=0,
                       dz=0,
                       ndev=2,
                       cadd=(0, 0, 0),
                       sadd=3,
                       csub=(0, 0, 0),
                       ssub=3,
                       i=0,
                       feature_kernel='glcm_plugin_gpu',
                       stat_name='glcm_stat_contrast_gpu'):
    """Uses PyCuda to parallelize the computation of the voxel-wise image entropy using a variable \
            neighborhood radius

    Args:
	radius -- neighborhood radius; where neighborhood size is isotropic and calculated as 2*radius+1
    """
    # initialize cuda context
    cuda.init()
    cudacontext = cuda.Device(NVDEVICE).make_context()

    parent_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(parent_dir, 'local_features.cuh'), mode='r') as f:
        cuda_template = Template(f.read())

    roimask = None
    if isinstance(image_volume, np.ndarray):
        toBaseVolume = False
        logger.debug('recognized as an np.ndarray')
        if image_volume.ndim == 3:
            d, r, c = image_volume.shape
        elif image_volume.ndim == 2:
            d, r, c = (1, *image_volume.shape)
        image = image_volume.flatten()
        # # use stat based GLCM quantization
        # quantize_mode=QMODE_STAT
    else:
        toBaseVolume = True
        logger.debug('recognized as a BaseVolume')
        image = image_volume
        if roi:
            image = image.conformTo(roi.frameofreference)
        d, r, c = image.frameofreference.size[::-1]
        image = image.vectorize()
        # if not image_volume.modality.lower() == 'ct':
        #     # use stat based GLCM quantization
        #     quantize_mode=QMODE_STAT

        # mask to roi
        if (roi):
            roimask = roi.makeDenseMask().vectorize()

    logger.debug('d:{:d}, r:{:d}, c:{:d}'.format(d, r, c))
    if d == 1:
        z_radius = 0
    elif d > 1:
        z_radius = radius

    # enforce quantization mode selection
    # fixed_start, fixed_end = -150, 350
    fixed_start, fixed_end = -175, 75
    if gray_levels and binwidth:
        raise ValueError(
            'must exclusively specify "binwidth" or "gray_levels" to select glcm quantization mode'
        )
    elif binwidth:
        quantize_mode = QMODE_FIXEDHU
        nbins = int(math.floor((fixed_end - fixed_start) / binwidth)) + 2
        gray_levels = -1
    elif gray_levels:
        quantize_mode = QMODE_STAT
        nbins = gray_levels
        binwidth = -1
    else:
        # kernel doesn't use glcm
        quantize_mode = -1
        nbins = 1
        gray_levels = -1
        binwidth = -1

    maxrunlength = math.ceil(
        math.sqrt(2 * (radius * 2 + 1) * (radius * 2 + 1) +
                  (z_radius * 2 + 1)))

    cuda_source = cuda_template.substitute({
        'RADIUS': radius,
        'Z_RADIUS': z_radius,
        'IMAGE_DEPTH': d,
        'IMAGE_HEIGHT': r,
        'IMAGE_WIDTH': c,
        'QUANTIZE_MODE': quantize_mode,
        'GRAY_LEVELS': gray_levels,
        'FIXED_BINWIDTH': binwidth,
        'FIXED_START': fixed_start,
        'NBINS': nbins,
        'MAXRUNLENGTH': maxrunlength,
        'DX': dx,
        'DY': dy,
        'DZ': dz,
        'NDEV': ndev,
        'CADD_X': cadd[0],
        'CADD_Y': cadd[1],
        'CADD_Z': cadd[2],
        'SADD': sadd,
        'CSUB_X': csub[0],
        'CSUB_Y': csub[1],
        'CSUB_Z': csub[2],
        'SSUB': ssub,
        'KERNEL': feature_kernel,
        'STAT': stat_name
    })
    mod2 = SourceModule(
        cuda_source,
        options=['-I {!s}'.format(parent_dir), '-g', '-G', '-lineinfo'])
    func = mod2.get_function('image_iterator_gpu')

    # allocate image on device in global memory
    image = image.astype(np.float32)
    image_gpu = cuda.mem_alloc(image.nbytes)
    result = np.zeros_like(image)
    result_gpu = cuda.mem_alloc(result.nbytes)
    # transfer image to device
    cuda.memcpy_htod(image_gpu, image)
    cuda.memcpy_htod(result_gpu, result)
    # call device kernel
    blocksize = 256
    gridsize = math.ceil(r * c * d / blocksize)
    func(image_gpu, result_gpu, block=(blocksize, 1, 1), grid=(gridsize, 1, 1))
    # get result from device
    cuda.memcpy_dtoh(result, result_gpu)

    # detach from cuda context
    # cudacontext.synchronize()
    # cudacontext.detach()
    cudacontext.pop()
    # required to successfully free device memory for created context
    del cudacontext
    gc.collect()
    pycuda.tools.clear_context_caches()

    logger.debug('feature result shape: {!s}'.format(result.shape))
    logger.debug('GPU done')

    # clean invalid values from result
    result = np.nan_to_num(result)

    if (roimask is not None):
        result = np.multiply(result, roimask)

    if d == 1:
        result = result.reshape(r, c)
    elif d > 1:
        result = result.reshape(d, r, c)

    if toBaseVolume:
        if roi:
            FOR = roi.frameofreference
        else:
            FOR = image_volume.frameofreference
        outvolume = MaskableVolume().fromArray(result, FOR)
        outvolume.modality = image_volume.modality
        return outvolume
    else:
        return result
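Example #51 combines two reusable PyCuda patterns: compile-time constants injected
into the CUDA source via string.Template, and the allocate / copy / launch / copy-back
cycle around a raw kernel. A stripped-down sketch of the same flow with an invented
scale kernel (the kernel, constant name, and sizes are illustrative only):

import math
from string import Template

import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a default context)
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

kernel_template = Template("""
__global__ void scale(const float* src, float* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = ${FACTOR}f * src[i];
}
""")
mod = SourceModule(kernel_template.substitute({'FACTOR': 2.5}))
scale = mod.get_function('scale')

x = np.arange(1024, dtype=np.float32)
y = np.zeros_like(x)
x_gpu = cuda.mem_alloc(x.nbytes)
y_gpu = cuda.mem_alloc(y.nbytes)
cuda.memcpy_htod(x_gpu, x)
blocksize = 256
gridsize = math.ceil(x.size / blocksize)
scale(x_gpu, y_gpu, np.int32(x.size),
      block=(blocksize, 1, 1), grid=(gridsize, 1, 1))
cuda.memcpy_dtoh(y, y_gpu)
assert np.allclose(y, 2.5 * x)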
Example #52
0
    def __init__(self, vol_bnds, voxel_size):

        # Define voxel volume parameters.
        self._vol_bnds = vol_bnds  # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters
        self._voxel_size = voxel_size  # in meters (determines volume discretization and resolution)
        self._trunc_margin = self._voxel_size * 5  # truncation on SDF

        # Adjust volume bounds.
        self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) /
                                self._voxel_size).copy(order='C').astype(int)  # ensure C-order contiguous
        self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
        self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype(np.float32)  # ensure C-order contiguous
        print("Voxel volume size: {:d} x {:d} x {:d}".format(
            self._vol_dim[0], self._vol_dim[1], self._vol_dim[2]))

        # Initialize pointers to voxel volume in CPU memory.
        self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32)
        self._weight_vol_cpu = np.zeros(self._vol_dim).astype(
            np.float32
        )  # for computing the cumulative moving average of observations per voxel
        self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)

        # Copy voxel volumes to GPU.
        if TSDF_GPU_MODE:
            self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
            cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
            self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
            cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
            self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
            cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)

            # Cuda kernel function (C++)
            self._cuda_src_mod = SourceModule("""
              __global__ void integrate(float * tsdf_vol,
                                        float * weight_vol,
                                        float * color_vol,
                                        float * vol_dim,
                                        float * vol_origin,
                                        float * cam_intr,
                                        float * cam_pose,
                                        float * other_params,
                                        float * color_im,
                                        float * depth_im) {

                // Get voxel index.
                int gpu_loop_idx = (int) other_params[0];
                int max_threads_per_block = blockDim.x;
                int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
                int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block + block_idx * max_threads_per_block + threadIdx.x;
                
                int vol_dim_x = (int)vol_dim[0];
                int vol_dim_y = (int)vol_dim[1];
                int vol_dim_z = (int)vol_dim[2];

                if (voxel_idx >= vol_dim_x * vol_dim_y * vol_dim_z)
                    return;

                // Get voxel grid coordinates.
                float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z)));
                float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z));
                float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z);

                // Voxel grid coordinates to world coordinates.
                float voxel_size = other_params[1];
                float pt_x = vol_origin[0] + voxel_x * voxel_size;
                float pt_y = vol_origin[1] + voxel_y * voxel_size;
                float pt_z = vol_origin[2] + voxel_z * voxel_size;

                // World coordinates to camera coordinates.
                float tmp_pt_x = pt_x - cam_pose[0*4+3];
                float tmp_pt_y = pt_y - cam_pose[1*4+3];
                float tmp_pt_z = pt_z - cam_pose[2*4+3];
                float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z;
                float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z;
                float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z;

                // Camera coordinates to image pixels.
                int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]);
                int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]);

                // Skip if outside view frustum.
                int im_h = (int) other_params[2];
                int im_w = (int) other_params[3];
                if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0)
                    return;

                // Skip invalid depth.
                float depth_value = depth_im[pixel_y*im_w+pixel_x];
                if (depth_value == 0)
                    return;

                // Integrate TSDF.
                float trunc_margin = other_params[4];
                float depth_diff = depth_value-cam_pt_z;
                if (depth_diff < -trunc_margin)
                    return;
                float dist = fmin(1.0f, depth_diff / trunc_margin);
                float w_old = weight_vol[voxel_idx];
                float obs_weight = other_params[5];
                float w_new = w_old + obs_weight;
                weight_vol[voxel_idx] = w_new;
                tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new;

                // Integrate color.
                float old_color = color_vol[voxel_idx];
                float old_b = floorf(old_color / (256 * 256));
                float old_g = floorf((old_color - old_b * 256 * 256) / 256);
                float old_r = old_color - old_b * 256 * 256 - old_g * 256;
                float new_color = color_im[pixel_y*im_w+pixel_x];
                float new_b = floorf(new_color / (256 * 256));
                float new_g = floorf((new_color - new_b * 256 * 256) / 256);
                float new_r = new_color - new_b * 256 * 256 - new_g * 256;
                new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f);
                new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f);
                new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f);
                color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r;

              }""")

            self._cuda_integrate = self._cuda_src_mod.get_function("integrate")

            # Determine block/grid size on GPU.
            gpu_dev = cuda.Device(0)
            self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
            n_blocks = int(
                np.ceil(
                    float(np.prod(self._vol_dim)) /
                    float(self._max_gpu_threads_per_block)))
            grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X,
                             int(np.floor(np.cbrt(n_blocks))))
            grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y,
                             int(np.floor(np.sqrt(n_blocks / grid_dim_x))))
            grid_dim_z = min(
                gpu_dev.MAX_GRID_DIM_Z,
                int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))))
            self._max_gpu_grid_dim = np.array(
                [grid_dim_x, grid_dim_y, grid_dim_z]).astype(int)
            self._n_gpu_loops = int(
                np.ceil(
                    float(np.prod(self._vol_dim)) / float(
                        np.prod(self._max_gpu_grid_dim) *
                        self._max_gpu_threads_per_block)))
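The integrate kernel linearizes voxels as voxel_idx = x*(dim_y*dim_z) + y*dim_z + z
and inverts that with floorf divisions. A host-side sketch (with made-up dimensions)
confirming that the kernel's decomposition recovers the original coordinates:

import numpy as np

dim_x, dim_y, dim_z = 4, 5, 6          # hypothetical volume dimensions
idx = np.arange(dim_x * dim_y * dim_z)

# Same arithmetic as the kernel's floorf-based decomposition
vx = idx // (dim_y * dim_z)
vy = (idx - vx * dim_y * dim_z) // dim_z
vz = idx - vx * dim_y * dim_z - vy * dim_z

ref = np.stack(np.meshgrid(np.arange(dim_x), np.arange(dim_y),
                           np.arange(dim_z), indexing='ij'),
               axis=-1).reshape(-1, 3)
assert np.array_equal(np.stack([vx, vy, vz], axis=-1), ref)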
Example #53
0
    def mh_sample_A(self, is_symmetric=False):
        """
        Sample new adjacency matrix and relevant spike parents using MH 
        Determine whether or not to propose a birth
        Choose an edge to propose
        Determine whether to accept using cuComputeProdQratio to do the parallel computation. 
        If accept, use cuSampleSingleProcessZ to choose new parents
        """
        N = self.base.data.N

        K = self.modelParams["proc_id_model", "K"]
        Ns = self.modelParams["proc_id_model", "Ns"]

        # Determine whether to propose a new edge or a removal of an existing edge
        op = MH_ADD if np.random.rand() < self.params["gamma"] else MH_DEL

        # Choose a row (ki) and column (kj) to update
        # They must be selected randomly, otherwise the transition probabilities
        # do not cancel properly as derived in the paper, and the distribution is
        # not left invariant after the MH operation
        #        ki = np.random.randint(K)
        #        # If this is a symmetric graph model, only choose from the upper diagonal
        #        if is_symmetric:
        #            kj = np.random.randint(ki,K)
        #        else:
        #            kj = np.random.randint(K)

        # Choose one of the unspecified edges in the graph
        if self.modelParams["graph_model", "mask"] == None:
            ki = np.random.randint(K)
            # If this is a symmetric graph model, only choose from the upper diagonal
            if is_symmetric:
                kj = np.random.randint(ki, K)
            else:
                kj = np.random.randint(K)
        else:
            (kis, kjs) = np.nonzero(self.modelParams["graph_model",
                                                     "mask"] == -1)
            ind = np.random.randint(0, len(kis))
            ki = kis[ind]
            kj = kjs[ind]
            assert self.modelParams["graph_model", "mask"][ki, kj] == -1

        #        # Check if this entry is set in the mask already
        #        if self.modelParams["graph_model","mask"] != None:
        #            if self.modelParams["graph_model","mask"][ki,kj] != -1:
        #                return

        # Get the current weight for this entry
        currWBuffer = np.zeros((1, ), dtype=np.float32)
        cuda.memcpy_dtoh(
            currWBuffer, self.gpuPtrs["weight_model", "W"].ptr + int(
                (ki * K + kj) * currWBuffer.itemsize))
        currW = currWBuffer[0]

        # Compute WGS using the current set of weights
        grid_w = int(np.ceil(np.float32(N) / self.params["blockSz"]))

        #        startPerfTimer(perfDict, "computeWGSForAllSpikes")
        #        log.info(self.gpuPtrs["graph_model","WGS"].get())
        self.gpuKernels["computeWGSForAllSpikes"](
            np.int32(K),
            np.int32(N),
            self.gpuPtrs["proc_id_model", "C"].gpudata,
            self.gpuPtrs["impulse_model", "GS"].gpudata,
            self.base.dSS["colPtrs"].gpudata,
            self.base.dSS["rowIndices"].gpudata,
            self.gpuPtrs["weight_model", "W"].gpudata,
            self.gpuPtrs["graph_model", "A"].gpudata,
            self.gpuPtrs["graph_model", "WGS"].gpudata,
            block=(1024, 1, 1),
            grid=(grid_w, K))
        #        stopPerfTimer(perfDict, "computeWGSForAllSpikes")

        # Now determine whether or not to accept the change
        # If the proposal does not change the adjacency matrix
        # then this is easy, otherwise we enlist the GPU
        logQratio = 0.0
        logPratio = 0.0

        if op == MH_ADD:
            # We are proposing a new edge. Calculate WGS with the
            # new edge and determine the P and Q ratios

            # This now runs on all N spikes but only counts spikes affected by the new edge
            # namely, those on process kj
            self.gpuKernels["computeWGSForNewEdge"](
                np.int32(ki),
                np.int32(kj),
                np.int32(K),
                np.int32(N),
                self.gpuPtrs["proc_id_model", "C"].gpudata,
                self.gpuPtrs["impulse_model", "GS"].gpudata,
                self.base.dSS["colPtrs"].gpudata,
                self.base.dSS["rowIndices"].gpudata,
                self.gpuPtrs["weight_model", "W"].gpudata,
                self.gpuPtrs["graph_model", "WGS"].gpudata,
                block=(1024, 1, 1),
                grid=(grid_w, 1))

            if is_symmetric and ki != kj:
                self.gpuKernels["computeWGSForNewEdge"](
                    np.int32(kj),
                    np.int32(ki),
                    np.int32(K),
                    np.int32(N),
                    self.gpuPtrs["proc_id_model", "C"].gpudata,
                    self.gpuPtrs["impulse_model", "GS"].gpudata,
                    self.base.dSS["colPtrs"].gpudata,
                    self.base.dSS["rowIndices"].gpudata,
                    self.gpuPtrs["weight_model", "W"].gpudata,
                    self.gpuPtrs["graph_model", "WGS"].gpudata,
                    block=(1024, 1, 1),
                    grid=(grid_w, 1))
            #            stopPerfTimer(perfDict, "computeWGSForNewEdge")

            # Compute the P ratio
            rho = self.getConditionalEdgePr(ki, kj)
            if rho == 1.0:
                logPratio = np.Inf
            elif rho == 0.0:
                logPratio = -np.Inf
            elif is_symmetric and ki != kj:
                logPratio = (-1.0 * Ns[ki] * currW) + (-1.0 * Ns[kj] * currW) \
                            + np.log(rho) - np.log(1 - rho)
            else:
                logPratio = -1.0 * Ns[ki] * currW + np.log(rho) - np.log(1 - rho)

            # Compute the Q ratio
            if ki == kj and not self.params["allow_self_excitation"]:
                # Do not allow self-excitation
                logQratio = np.float64(-1.0 * np.Inf)
            else:
                logQratio = np.log(1 - self.params["gamma"]) - np.log(
                    self.params["gamma"])
                self.gpuKernels["computeProdQratio"](
                    np.int32(kj),
                    np.int32(K),
                    np.int32(N),
                    self.gpuPtrs["proc_id_model", "C"].gpudata,
                    self.gpuPtrs["graph_model", "WGS"].gpudata,
                    self.gpuPtrs["bkgd_model", "lam"].gpudata,
                    np.int32(ki),
                    np.int32(MH_ADD),
                    self.gpuPtrs["graph_model", "qratio"].gpudata,
                    block=(1024, 1, 1),
                    grid=(grid_w, 1))

                blockLogQratio = self.gpuPtrs["graph_model", "qratio"].get()
                logQratio += np.sum(blockLogQratio)

                if is_symmetric and ki != kj:
                    self.gpuKernels["computeProdQratio"](
                        np.int32(ki),
                        np.int32(K),
                        np.int32(N),
                        self.gpuPtrs["proc_id_model", "C"].gpudata,
                        self.gpuPtrs["graph_model", "WGS"].gpudata,
                        self.gpuPtrs["bkgd_model", "lam"].gpudata,
                        np.int32(kj),
                        np.int32(MH_ADD),
                        self.gpuPtrs["graph_model", "qratio"].gpudata,
                        block=(1024, 1, 1),
                        grid=(grid_w, 1))

                    blockLogQratio = self.gpuPtrs["graph_model",
                                                  "qratio"].get()
                    logQratio += np.sum(blockLogQratio)

            # Decide whether or not to accept this change
            logPrAccept = logPratio + logQratio
            accept = np.log(np.random.rand()) < logPrAccept
            if accept:
                #                if is_symmetric and ki!=kj:
                #                    log.debug("+A[%d,%d],+A[%d,%d]",ki,kj,kj,ki)
                #                else:
                #                    log.debug("+A[%d,%d]",ki,kj)

                # Update the adjacency matrix on host and GPU
                self.modelParams["graph_model", "A"][ki, kj] = True
                A_buff = np.array([True], dtype=np.bool_)
                cuda.memcpy_htod(
                    self.gpuPtrs["graph_model", "A"].ptr + int(
                        (ki * K + kj) * A_buff.itemsize), A_buff)

                if is_symmetric and ki != kj:
                    self.modelParams["graph_model", "A"][kj, ki] = True
                    A_buff = np.array([True], dtype=np.bool_)
                    cuda.memcpy_htod(
                        self.gpuPtrs["graph_model", "A"].ptr + int(
                            (kj * K + ki) * A_buff.itemsize), A_buff)

            else:
                # Clear the WGS changes
                self.gpuKernels["clearWGSForDeletedEdge"](
                    np.int32(ki),
                    np.int32(kj),
                    np.int32(N),
                    self.gpuPtrs["proc_id_model", "C"].gpudata,
                    self.gpuPtrs["graph_model", "WGS"].gpudata,
                    block=(1024, 1, 1),
                    grid=(grid_w, 1))

                if is_symmetric and ki != kj:
                    self.gpuKernels["clearWGSForDeletedEdge"](
                        np.int32(kj),
                        np.int32(ki),
                        np.int32(N),
                        self.gpuPtrs["proc_id_model", "C"].gpudata,
                        self.gpuPtrs["graph_model", "WGS"].gpudata,
                        block=(1024, 1, 1),
                        grid=(grid_w, 1))

        if op == MH_DEL:
            # We are proposing to delete an edge. WGS was calculated
            # with the edge present.

            # Compute the P ratio

            rho = self.getConditionalEdgePr(ki, kj)
            if rho == 1.0:
                logPratio = -np.Inf
            elif rho == 0.0:
                logPratio = np.Inf
            elif is_symmetric and ki != kj:
                logPratio = Ns[ki] * currW + Ns[kj] * currW \
                            + np.log(1 - rho) - np.log(rho)
            else:
                logPratio = Ns[ki] * currW + np.log(1 - rho) - np.log(rho)

            # Compute the Q ratio
            logQratio = np.log(
                self.params["gamma"]) - np.log(1 - self.params["gamma"])
            self.gpuKernels["computeProdQratio"](
                np.int32(kj),
                np.int32(K),
                np.int32(N),
                self.gpuPtrs["proc_id_model", "C"].gpudata,
                self.gpuPtrs["graph_model", "WGS"].gpudata,
                self.gpuPtrs["bkgd_model", "lam"].gpudata,
                np.int32(ki),
                np.int32(MH_DEL),
                self.gpuPtrs["graph_model", "qratio"].gpudata,
                block=(1024, 1, 1),
                grid=(grid_w, 1))
            blockLogQratio = self.gpuPtrs["graph_model", "qratio"].get()
            logQratio += np.sum(blockLogQratio)

            if is_symmetric and ki != kj:
                self.gpuKernels["computeProdQratio"](
                    np.int32(ki),
                    np.int32(K),
                    np.int32(N),
                    self.gpuPtrs["proc_id_model", "C"].gpudata,
                    self.gpuPtrs["graph_model", "WGS"].gpudata,
                    self.gpuPtrs["bkgd_model", "lam"].gpudata,
                    np.int32(kj),
                    np.int32(MH_DEL),
                    self.gpuPtrs["graph_model", "qratio"].gpudata,
                    block=(1024, 1, 1),
                    grid=(grid_w, 1))
                blockLogQratio = self.gpuPtrs["graph_model", "qratio"].get()
                logQratio += np.sum(blockLogQratio)

            # Decide whether or not to accept this change
            logPrAccept = logPratio + logQratio
            accept = np.log(np.random.rand()) < logPrAccept
            if accept:
                #                if is_symmetric and ki!=kj:
                #                    log.debug("-A[%d,%d],-A[%d,%d]",ki,kj,kj,ki)
                #                else:
                #                    log.debug("-A[%d,%d]",ki,kj)

                # Update the adjacency matrix
                self.modelParams["graph_model", "A"][ki, kj] = False
                A_buff = np.array([False], dtype=np.bool_)
                cuda.memcpy_htod(
                    self.gpuPtrs["graph_model", "A"].ptr + int(
                        (ki * K + kj) * A_buff.itemsize), A_buff)

                if is_symmetric and ki != kj:
                    self.modelParams["graph_model", "A"][kj, ki] = False
                    A_buff = np.array([False], dtype=np.bool_)
                    cuda.memcpy_htod(
                        self.gpuPtrs["graph_model", "A"].ptr + int(
                            (kj * K + ki) * A_buff.itemsize), A_buff)

                # Clear the WGS changes
                self.gpuKernels["clearWGSForDeletedEdge"](
                    np.int32(ki),
                    np.int32(kj),
                    np.int32(N),
                    self.gpuPtrs["proc_id_model", "C"].gpudata,
                    self.gpuPtrs["graph_model", "WGS"].gpudata,
                    block=(1024, 1, 1),
                    grid=(grid_w, 1))

                if is_symmetric and ki != kj:
                    self.gpuKernels["clearWGSForDeletedEdge"](
                        np.int32(kj),
                        np.int32(ki),
                        np.int32(N),
                        self.gpuPtrs["proc_id_model", "C"].gpudata,
                        self.gpuPtrs["graph_model", "WGS"].gpudata,
                        block=(1024, 1, 1),
                        grid=(grid_w, 1))

            else:
                # Nothing changes if we do not delete the edge
                pass
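Both branches of mh_sample_A end in the same Metropolis-Hastings accept test:
accept when log(u) < logPratio + logQratio with u ~ Uniform(0, 1). A self-contained
sketch of that decision rule on a toy exponential target (nothing below comes from
the Hawkes-process model itself; it only isolates the accept step):

import numpy as np

def mh_accept(log_p_ratio, log_q_ratio, rng):
    # Generic MH accept/reject decision, as used in mh_sample_A above.
    return np.log(rng.rand()) < log_p_ratio + log_q_ratio

rng = np.random.RandomState(0)
x, samples = 1.0, []
for _ in range(20000):
    x_prop = x + rng.normal(scale=0.5)      # symmetric proposal -> log_q_ratio = 0
    if x_prop > 0:                          # target p(x) ~ exp(-x) on x > 0
        if mh_accept(-(x_prop - x), 0.0, rng):
            x = x_prop
    samples.append(x)
print("empirical mean (should be near 1):", np.mean(samples[2000:]))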
Example #54
0
import pycuda.driver as drv
import pycuda.autoinit
import numpy
from pycuda.compiler import SourceModule

mod = SourceModule("""
    __global__ void square(float *d_a)
    {
        int idx = threadIdx.x + threadIdx.y*5;
        d_a[idx] = d_a[idx]*d_a[idx];
    }
""")
square = mod.get_function("square")

# ---------------Using mem_alloc--------------- #
start = drv.Event()
end = drv.Event()
h_a = numpy.random.randint(1, 5, (5, 5))
h_a = h_a.astype(numpy.float32)
h_b = h_a.copy()
start.record()
d_a = drv.mem_alloc(h_a.size * h_a.dtype.itemsize)
drv.memcpy_htod(d_a, h_a)
# Calling kernel
square(d_a, block=(5, 5, 1), grid=(1, 1), shared=0)
h_result = numpy.empty_like(h_a)
drv.memcpy_dtoh(h_result, d_a)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Time of Squaring on GPU without inout")
print("%fs" % (secs))
print("original array:")
print(h_a)
print("Square with kernel:")
print(h_result)

# ---------------Using inout functionality of driver class--------------- #
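The listing breaks off here, but the InOut wrapper the comment refers to lets the
same square kernel run without explicit mem_alloc/memcpy calls: PyCuda copies the
array in before launch and back out afterwards. A plausible sketch of the missing
section, reusing the names defined above (the timing scaffolding simply mirrors the
mem_alloc section):

start.record()
square(drv.InOut(h_b), block=(5, 5, 1), grid=(1, 1), shared=0)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Time of Squaring on GPU with inout")
print("%fs" % (secs))
print("Square with InOut:")
print(h_b)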
Example #55
0
def frame_eraser():
    files = getData()
    if (files == 0):
        return 0

    path = files['path'] + files['number']

    resolution_x = files['resolution'][0]
    resolution_y = files['resolution'][1]

    global BATCH_SIZE
    BATCH_SIZE = files['batch_size']

    global NUM_THREADS
    NUM_THREADS = 16
    imageNames = __getImageNames(path)

    for batch in range(0, math.floor(len(imageNames) / BATCH_SIZE) + 1):
        print(
            f"\nBatch: {batch + 1} of {math.floor(len(imageNames) / BATCH_SIZE) + 1}"
        )
        batch_host = np.empty(0, dtype=object)
        threads = []
        start = batch * BATCH_SIZE
        end = (batch + 1) * BATCH_SIZE
        if (end > len(imageNames)):
            end = len(imageNames)
        imageName_Batch = imageNames[start:end]
        batch_length = len(imageName_Batch)
        threadBatch = math.floor(batch_length / NUM_THREADS)

        for i in range(0, NUM_THREADS):
            if i == NUM_THREADS - 1:
                imageLoader = ImageLoader(
                    imageName_Batch[i * threadBatch:batch_length], path)
            else:
                imageLoader = ImageLoader(
                    imageName_Batch[i * threadBatch:(i + 1) * threadBatch],
                    path)
            threads.append(imageLoader)
        for t in threads:
            t.start()
        done = False
        print("Copying from Disk to RAM")
        with progressbar.ProgressBar(max_value=batch_length) as bar:
            while (not done):
                done = all(t.done == True for t in threads)
                progress = 0
                for t in threads:
                    progress += t.progress
                bar.update(progress)
                time.sleep(0.1)

        for t in threads:
            temp = np.copy(t.getBatch())
            batch_host = np.append(batch_host, temp)
        for t in threads:
            t.join()
        batch_device = np.zeros_like(batch_host)
        print("Copying from RAM to GPU")
        with progressbar.ProgressBar(max_value=batch_length) as bar:
            for i in range(0, batch_length):
                batch_device[i] = driver.mem_alloc(batch_host[i].nbytes)  # pylint: disable=no-member, unsupported-assignment-operation
                driver.memcpy_htod(batch_device[i], batch_host[i])  # pylint: disable=no-member
                bar.update(i)

        # CUDA Absolute Image Subtraction
        diffBlock = (8, 8, 3)
        diffGrid = (int(resolution_x / 8), int(resolution_y / 8), 1)

        h_diffImage_int = np.zeros_like(batch_host[0], dtype=np.uint8)
        d_diffImage_int = driver.mem_alloc(h_diffImage_int.nbytes)  # pylint: disable=no-member
        getImgDiff = __module.get_function("cuda_GetImgDiff")

        # CUDA Sum Image
        num_block = int(resolution_x * resolution_y * 3 / 512)
        block = (512, 1, 1)
        grid = (num_block, 1, 1)

        h_sum = np.zeros(num_block, dtype=np.float32)  # float32 to match the kernel's partial sums
        d_sum = driver.mem_alloc(h_sum.nbytes)  # pylint: disable=no-member
        sumPixels = __module.get_function("cuda_SumPixels")

        # CUDA Int to Float image converstion
        h_diffImage_float = h_diffImage_int.astype(np.float32)  # pylint: disable=no-member
        d_diffImage_float = driver.mem_alloc(h_diffImage_float.nbytes)  # pylint: disable=no-member
        byteToFloat = __module.get_function("cuda_ByteToFloat")

        imagesToDelete = []
        print("Processing")
        pixelSum = 0
        with progressbar.ProgressBar(max_value=batch_length) as bar:
            pivot = 0

            for i in range(0, batch_length - 1):
                getImgDiff(d_diffImage_int,
                           batch_device[pivot],
                           batch_device[i + 1],
                           np.int32(resolution_x),
                           block=diffBlock,
                           grid=diffGrid)
                byteToFloat(d_diffImage_float,
                            d_diffImage_int,
                            block=block,
                            grid=grid)
                sumPixels(d_diffImage_float, d_sum, block=block, grid=grid)
                driver.memcpy_dtoh(h_sum, d_sum)  # pylint: disable=no-member
                pixelSum = h_sum.sum()

                if (pixelSum > threshold):
                    pivot = i
                else:
                    imagesToDelete.append(i)
                bar.update(i)

        for i in imagesToDelete:
            os.remove(path + imageName_Batch[i])
        print(f'Deleted: {len(imagesToDelete)} images\n')

        #getImgDiff(d_diffImage_int, batch_device[1000], batch_device[1001], block=diffBlock, grid=diffGrid)
        #driver.memcpy_dtoh(h_diffImage_int, d_diffImage_int)
        #displayImage(h_diffImage_int)
        #byteToFloat(d_diffImage_float, d_diffImage_int, block=block, grid=grid)

        #if batch >= 5:
        #    return

    return 1
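The loop in frame_eraser keeps a pivot frame, computes the absolute per-pixel
difference against each following frame on the GPU, sums the pixels, and deletes
frames whose difference stays at or below a global threshold. A numpy reference of
the same decision logic (frame shapes and the threshold value are invented; it
reproduces the original's choice of advancing the pivot to i and deleting index i):

import numpy as np

def near_duplicate_indices(frames, threshold):
    to_delete, pivot = [], 0
    for i in range(len(frames) - 1):
        diff = np.abs(frames[pivot].astype(np.float32)
                      - frames[i + 1].astype(np.float32))
        if diff.sum() > threshold:
            pivot = i                 # scene changed enough: move the pivot
        else:
            to_delete.append(i)       # frame i judged a near-duplicate
    return to_delete

rng = np.random.RandomState(0)
base = rng.randint(0, 256, (8, 8, 3)).astype(np.uint8)
frames = [base, base.copy(), rng.randint(0, 256, (8, 8, 3)).astype(np.uint8)]
print(near_duplicate_indices(frames, threshold=1000.0))   # -> [0]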
Example #56
0
def elementwiseMean_gpu(feature_volume_list):
    """computes the elementwise mean of the like-shaped volumes in feature_volume_list"""
    # initialize cuda context
    cuda.init()
    cudacontext = cuda.Device(NVDEVICE).make_context()

    parent_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(parent_dir, 'feature_compositions.cuh'),
              mode='r') as f:
        mod = SourceModule(
            f.read(),
            options=[
                '-I {!s}'.format(parent_dir),
                # '-g', '-G', '-lineinfo'
            ])
    func = mod.get_function('elementwiseMean')

    # combine volumes into linearized array
    FOR = feature_volume_list[0].frameofreference
    vols = []
    for vol in feature_volume_list:
        vols.append(vol.vectorize())
    array_length = np.prod(FOR.size).item()
    while len(vols) > 1:
        num_arrays = 2
        cat = np.concatenate([vols.pop() for x in range(num_arrays)], axis=0)

        # allocate image on device in global memory
        cat = cat.astype(np.float32)
        cat_gpu = cuda.mem_alloc(cat.nbytes)
        result = np.zeros((array_length)).astype(np.float32)
        result_gpu = cuda.mem_alloc(result.nbytes)
        # transfer cat to device
        cuda.memcpy_htod(cat_gpu, cat)
        cuda.memcpy_htod(result_gpu, result)
        # call device kernel
        blocksize = 512
        gridsize = math.ceil(array_length / blocksize)
        func(cat_gpu,
             result_gpu,
             np.int32(array_length),
             np.int32(num_arrays),
             block=(blocksize, 1, 1),
             grid=(gridsize, 1, 1))
        # get result from device
        cuda.memcpy_dtoh(result, result_gpu)
        vols.append(result.reshape((-1, 1)))
    result = vols[0]

    # detach from cuda context
    # cudacontext.synchronize()
    # cudacontext.detach()
    cudacontext.pop()
    # required to successfully free device memory for created context
    del cudacontext
    gc.collect()
    pycuda.tools.clear_context_caches()

    x = MaskableVolume().fromArray(result, FOR)
    x.modality = feature_volume_list[0].modality
    return x
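The while loop in elementwiseMean_gpu repeatedly pops two volumes and replaces them
with their elementwise mean, so the output is a sequential fold of pairwise means
rather than a uniform mean: the inputs receive geometrically decaying weights
(1/2, 1/4, 1/8, ...), and the result equals the true elementwise mean only for one
or two volumes, which looks like a quirk worth knowing about. A host-side sketch of
the reduction order, assuming the elementwiseMean kernel averages its num_arrays
stacked inputs elementwise:

import numpy as np

def pairwise_fold_mean(vols):
    # Reproduces the pop-two/append-mean order of elementwiseMean_gpu.
    vols = list(vols)
    while len(vols) > 1:
        b, a = vols.pop(), vols.pop()
        vols.append((a + b) / 2.0)    # what the kernel computes for two arrays
    return vols[0]

# Feed unit vectors to read off the effective weight given to each input volume.
n = 4
basis = [np.eye(n, dtype=np.float32)[i] for i in range(n)]
print(pairwise_fold_mean(basis))      # -> [0.5   0.25  0.125 0.125]
print(np.full(n, 1.0 / n))            # uniform weights of a true mean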
Example #57
0
  def __dfs_construct(self,
                        depth, 
                        error_rate, 
                        start_idx, 
                        stop_idx, 
                        si_gpu_in, 
                        si_gpu_out):
    def check_terminate():
      return error_rate == 0.0

    n_samples = stop_idx - start_idx 
    indices_offset =  start_idx * self.dtype_indices.itemsize    
    nid = self.n_nodes
    self.n_nodes += 1

    if check_terminate():
      turn_to_leaf(nid, 
                    start_idx, 
                    si_gpu_in.idx, 
                    self.values_idx_array, 
                    self.values_si_idx_array
                    )
      return
    
    if n_samples < self.min_samples_split:
      turn_to_leaf(nid, 
                    start_idx, 
                    si_gpu_in.idx, 
                    self.values_idx_array, 
                    self.values_si_idx_array
                    )
      return
    
    if n_samples <= self.bfs_threshold:
      self.idx_array[self.queue_size * 2] = start_idx
      self.idx_array[self.queue_size * 2 + 1] = stop_idx
      self.si_idx_array[self.queue_size] = si_gpu_in.idx
      self.nid_array[self.queue_size] = nid
      self.queue_size += 1
      return
    
    cuda.memcpy_htod(self.features_array_gpu.ptr, self.features_array)
    min_left, min_right, row, col = self.__gini(n_samples, 
                                                indices_offset, 
                                                si_gpu_in) 
    if min_left + min_right == 4:
      turn_to_leaf(nid, 
                  start_idx, 
                  si_gpu_in.idx, 
                  self.values_idx_array, 
                  self.values_si_idx_array) 
      return
    
    cuda.memcpy_dtoh(self.threshold_value_idx, 
                    si_gpu_in.ptr + int(indices_offset) + \
                    int(row * self.stride + col) * \
                    int(self.dtype_indices.itemsize)) 

    self.feature_idx_array[nid] = row
    self.feature_threshold_array[nid] = (float(self.samples[row, self.threshold_value_idx[0]]) +
                                         self.samples[row, self.threshold_value_idx[1]]) / 2
    

    self.fill_kernel.prepared_call(
                      (1, 1),
                      (512, 1, 1),
                      si_gpu_in.ptr + row * self.stride * \
                          self.dtype_indices.itemsize + \
                          indices_offset, 
                      n_samples, 
                      col, 
                      self.mark_table.ptr) 


    block = (self.RESHUFFLE_THREADS_PER_BLOCK, 1, 1)
    
    self.scan_reshuffle_tex.prepared_call(
                      (self.n_features, 1),
                      block,
                      si_gpu_in.ptr + indices_offset,
                      si_gpu_out.ptr + indices_offset,
                      n_samples,
                      col)

    self.__shuffle_feature_indices() 

    self.left_children[nid] = self.n_nodes
    self.__dfs_construct(depth + 1, min_left, 
        start_idx, start_idx + col + 1, si_gpu_out, si_gpu_in)
    
    self.right_children[nid] = self.n_nodes
    self.__dfs_construct(depth + 1, min_right, 
        start_idx + col + 1, stop_idx, si_gpu_out, si_gpu_in)
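The recursion at the end of __dfs_construct records the tree topology implicitly:
whatever node id is allocated next becomes the left (and, after the left subtree is
finished, the right) child of the current node. A toy sketch of that bookkeeping,
independent of the GPU splitting logic:

class ToyTreeBuilder:
    # Mimics the nid / left_children / right_children pattern above.
    def __init__(self, max_nodes=64):
        self.n_nodes = 0
        self.left_children = [0] * max_nodes
        self.right_children = [0] * max_nodes

    def build(self, depth, max_depth=2):
        nid = self.n_nodes            # this node's id
        self.n_nodes += 1
        if depth == max_depth:        # leaf: children stay 0
            return
        self.left_children[nid] = self.n_nodes    # next id to be allocated
        self.build(depth + 1, max_depth)
        self.right_children[nid] = self.n_nodes   # first id after left subtree
        self.build(depth + 1, max_depth)

b = ToyTreeBuilder()
b.build(0)
print(b.left_children[:b.n_nodes])    # [1, 2, 0, 0, 5, 0, 0]
print(b.right_children[:b.n_nodes])   # [4, 3, 0, 0, 6, 0, 0]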
Example #58
0
  def __init__(self,instance,algorithm,verbose=False):
    """Initializes a direct parallel worker

    :param instance: an instance of class from_scatterers_direct(\
cctbx.xray.structure_factors.manager.managed_calculation_base)
    :type instance: cctbx.xray.structure_factors.from_scatterers_direct
    :param algorithm: an instance of class direct_summation_cuda_platform(\
direct_summation_simple) with algorithm set to "simple" or "pycuda"
    :type algorithm: cctbx.xray.structure_factors.direct_summation_cuda_platform
    """
    self.scatterers = instance._xray_structure.scatterers()
    self.registry = instance._xray_structure.scattering_type_registry()
    self.miller_indices = instance._miller_set.indices()
    self.unit_cell = instance._miller_set.unit_cell()
    self.space_group = instance._miller_set.space_group().make_tidy()

    if verbose: self.print_diagnostics() # some diagnostics used for development

    if hasattr(algorithm,"simple"):
      instance._results = ext.structure_factors_simple(
      self.unit_cell,
      instance._miller_set.space_group(),
      self.miller_indices,
      self.scatterers,
      self.registry); return

    if hasattr(algorithm,"pycuda"):
      import pycuda.driver as cuda
      from pycuda.compiler import SourceModule

      self.validate_the_inputs(instance,cuda,algorithm)

      self.prepare_miller_arrays_for_cuda(algorithm)

      self.prepare_scattering_sites_for_cuda(algorithm)

      self.prepare_gaussians_symmetries_cell(algorithm)

      assert cuda.Device.count() >= 1

      device = cuda.Device(0)
      WARPSIZE=device.get_attribute(cuda.device_attribute.WARP_SIZE) # 32
      MULTIPROCESSOR_COUNT=device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)

      sort_mod = SourceModule((mod_fhkl_sorted%(self.gaussians.shape[0],
                         self.symmetry.shape[0],
                         self.sym_stride,
                         self.g_stride,
                         int(self.use_debye_waller),
                         self.order_z,self.order_p)).replace("floating_point_t",algorithm.float_t)
                         )

      r_m_m_address = sort_mod.get_global("reciprocal_metrical_matrix")[0]
      cuda.memcpy_htod(r_m_m_address, self.reciprocal_metrical_matrix)

      gaussian_address = sort_mod.get_global("gaussians")[0]
      cuda.memcpy_htod(gaussian_address, self.gaussians)

      symmetry_address = sort_mod.get_global("symmetry")[0]
      cuda.memcpy_htod(symmetry_address, self.symmetry)

      CUDA_fhkl = sort_mod.get_function("CUDA_fhkl")

      intermediate_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)
      intermediate_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)
      for x in range(self.scatterers.number_of_types):
        fhkl_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)
        fhkl_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)

        CUDA_fhkl(cuda.InOut(fhkl_real),
                 cuda.InOut(fhkl_imag),
                 cuda.In(self.flat_sites),
                 cuda.In(self.weights),
                 cuda.In(self.u_iso),
                 algorithm.numpy.uint32(self.scatterers.increasing_order[x]),
                 algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][0]),
                 algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][1]),
                 cuda.In(self.flat_mix),
                 block=(FHKL_BLOCKSIZE,1,1),
                 grid=((self.n_flat_hkl//FHKL_BLOCKSIZE,1)))

        intermediate_real += fhkl_real
        intermediate_imag += fhkl_imag

      flex_fhkl_real = flex.double(intermediate_real[0:len(self.miller_indices)].astype(algorithm.numpy.float64))
      flex_fhkl_imag = flex.double(intermediate_imag[0:len(self.miller_indices)].astype(algorithm.numpy.float64))

      instance._results = fcalc_container(flex.complex_double(flex_fhkl_real,flex_fhkl_imag))

      return
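Both this example and Example #60 below upload small parameter tables with
mod.get_global(name) followed by memcpy_htod, writing directly into the module's
__constant__ (or __device__) symbols instead of passing them as kernel arguments.
A minimal self-contained sketch of the pattern (the kernel and symbol names are
invented):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__constant__ float coeffs[4];

__global__ void apply_coeffs(float* out)
{
    int i = threadIdx.x;
    if (i < 4)
        out[i] = 10.0f * coeffs[i];
}
""")

coeffs = np.array([1, 2, 3, 4], dtype=np.float32)
coeffs_addr, coeffs_bytes = mod.get_global("coeffs")  # (device ptr, size in bytes)
assert coeffs_bytes == coeffs.nbytes
cuda.memcpy_htod(coeffs_addr, coeffs)                 # write into constant memory

out = np.zeros(4, dtype=np.float32)
mod.get_function("apply_coeffs")(cuda.Out(out), block=(4, 1, 1))
print(out)   # [10. 20. 30. 40.]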
Example #59
0
  def __bfs(self):
    block_per_split = int(math.ceil(float(self.MAX_BLOCK_BFS) / self.queue_size))
    
    if block_per_split > self.max_features:
      n_blocks = self.max_features
    else:
      n_blocks = block_per_split

    idx_array_gpu = gpuarray.to_gpu(
                    self.idx_array[0 : self.queue_size * 2])
    
    si_idx_array_gpu = gpuarray.to_gpu(
                    self.si_idx_array[0 : self.queue_size])
    
    self.label_total = gpuarray.empty(self.queue_size * self.n_labels, 
                                      dtype = self.dtype_counts)  
    
    threshold_value = gpuarray.empty(self.queue_size, dtype = np.float32)
    
    impurity_gpu = gpuarray.empty(self.queue_size * 2, dtype = np.float32)
    self.min_split = gpuarray.empty(self.queue_size, dtype = self.dtype_indices) 
    min_feature_idx_gpu = gpuarray.empty(self.queue_size, dtype = np.uint16)
    
    impurity_gpu_2d = gpuarray.empty(self.queue_size * 2 * n_blocks, 
                                      dtype = np.float32)
    
    min_split_2d = gpuarray.empty(self.queue_size * n_blocks, 
                                      dtype = self.dtype_indices) 

    min_feature_idx_gpu_2d = gpuarray.empty(self.queue_size * n_blocks, 
                                      dtype = np.uint16)
    
    cuda.memcpy_htod(self.features_array_gpu.ptr, self.features_array) 
    
      
    self.scan_total_bfs.prepared_call(
            (self.queue_size, 1),
            (self.BFS_THREADS, 1, 1),
            self.labels_gpu.ptr,
            self.label_total.ptr,
            si_idx_array_gpu.ptr,
            idx_array_gpu.ptr)
    

    self.comput_bfs_2d.prepared_call(
          (self.queue_size, n_blocks),
          (self.BFS_THREADS, 1, 1),
          self.samples_gpu.ptr,
          self.labels_gpu.ptr,
          idx_array_gpu.ptr,
          si_idx_array_gpu.ptr,
          self.label_total.ptr,
          self.features_array_gpu.ptr,
          impurity_gpu_2d.ptr,
          min_split_2d.ptr,
          min_feature_idx_gpu_2d.ptr)
    
    self.reduce_bfs_2d.prepared_call(
          (self.queue_size, 1),
          (1, 1, 1),
          impurity_gpu_2d.ptr,
          min_split_2d.ptr,
          min_feature_idx_gpu_2d.ptr,
          impurity_gpu.ptr,
          self.min_split.ptr,
          min_feature_idx_gpu.ptr,
          n_blocks)

    self.fill_bfs.prepared_call(
          (self.queue_size, 1),
          (self.BFS_THREADS, 1, 1),
          si_idx_array_gpu.ptr,
          min_feature_idx_gpu.ptr,
          idx_array_gpu.ptr,
          self.min_split.ptr,
          self.mark_table.ptr)


    if block_per_split > self.n_features:
      n_blocks = self.n_features
    else:
      n_blocks = block_per_split
      
    self.reshuffle_bfs.prepared_call(
          (self.queue_size, n_blocks),
          (self.BFS_THREADS, 1, 1),
          si_idx_array_gpu.ptr,
          idx_array_gpu.ptr,
          self.min_split.ptr)
    
    self.__shuffle_feature_indices()
    
    self.get_thresholds.prepared_call(
          (self.queue_size, 1),
          (1, 1, 1),
          si_idx_array_gpu.ptr,
          self.samples_gpu.ptr,
          threshold_value.ptr,
          min_feature_idx_gpu.ptr,
          self.min_split.ptr) 
    
    new_idx_array = np.empty(self.queue_size * 2 * 2, dtype = np.uint32)
    idx_array = self.idx_array
    new_si_idx_array = np.empty(self.queue_size * 2, dtype = np.uint8)
    new_nid_array = np.empty(self.queue_size * 2, dtype = np.uint32)
    left_children = self.left_children
    right_children = self.right_children
    feature_idx_array = self.feature_idx_array
    feature_threshold_array = self.feature_threshold_array
    nid_array = self.nid_array
    
    imp_min = np.empty(self.queue_size * 2, np.float32)
    min_split = np.empty(self.queue_size, self.dtype_indices)
    feature_idx = np.empty(self.queue_size, np.uint16)
    threshold = np.empty(self.queue_size, np.float32) 
    cuda.memcpy_dtoh(imp_min, impurity_gpu.ptr)
    cuda.memcpy_dtoh(min_split, self.min_split.ptr)
    cuda.memcpy_dtoh(feature_idx, min_feature_idx_gpu.ptr)
    cuda.memcpy_dtoh(threshold, threshold_value.ptr) 
    
    si_idx_array = self.si_idx_array 

    self.n_nodes, self.queue_size, self.idx_array, self.si_idx_array, self.nid_array =\
        bfs_loop(self.queue_size, 
                  self.n_nodes, 
                  self.max_features, 
                  new_idx_array, 
                  idx_array, 
                  new_si_idx_array, 
                  new_nid_array, 
                  left_children, 
                  right_children, 
                  feature_idx_array, 
                  feature_threshold_array, 
                  nid_array, 
                  imp_min, 
                  min_split, 
                  feature_idx, 
                  si_idx_array, 
                  threshold, 
                  self.min_samples_split, 
                  self.values_idx_array, 
                  self.values_si_idx_array)

    self.n_nodes = int(self.n_nodes)
    self.queue_size = int(self.queue_size)
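Examples #57 and #59 both launch kernels through prepare()/prepared_call(), which
fixes the argument signature once (as a struct-style format string) and then takes
raw device pointers, avoiding per-call argument marshalling. A small sketch with an
invented kernel; "Pi" below means one pointer followed by one 32-bit int:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void add_one(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] += 1.0f;
}
""")
add_one = mod.get_function("add_one")
add_one.prepare("Pi")                  # P = pointer, i = int

data = gpuarray.to_gpu(np.zeros(1000, dtype=np.float32))
add_one.prepared_call((4, 1), (256, 1, 1), data.ptr, np.int32(data.size))
print(data.get()[:5])                  # [1. 1. 1. 1. 1.]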
Example #60
0
 def __init__(self, eta, beta, L, no_angles, no_pulses, order=5):
     self.mod_K = SourceModule(
         RADON_KERNEL.format(order, no_angles, no_pulses))
     self.K_gpu = self.mod_K.get_function("K_l")
     self.mod_reduction = SourceModule(REDUCTION_KERNEL)
     self.reduction_gpu = self.mod_reduction.get_function("reduction")
     self.eta = eta
     self.gamma = gamma(eta)
     self.beta = beta
     self.L = L
     self.h = calc_h(L, beta, eta)
     drv.memcpy_htod(
         self.mod_K.get_global("rsq4pi")[0],
         scipy.array([1. / sqrt(4. * pi)], dtype=scipy.float32))
     drv.memcpy_htod(
         self.mod_K.get_global("sqeta")[0],
         scipy.array([sqrt(self.eta)], dtype=scipy.float32))
     drv.memcpy_htod(
         self.mod_K.get_global("h")[0],
         scipy.array([self.h], dtype=scipy.float32))
     drv.memcpy_htod(
         self.mod_K.get_global("four_pi_gamma")[0],
         scipy.array([4. * pi * self.gamma], dtype=scipy.float32))
     y = sqrt(self.gamma) / self.h
     drv.memcpy_htod(
         self.mod_K.get_global("y")[0], scipy.array([y],
                                                    dtype=scipy.float32))
     n = scipy.arange(1, order + 1, dtype=scipy.float32)
     n2 = n**2
     ex = exp(-n2 / 4.)
     pre_s2 = ex * cosh(n * y)
     pre_s3 = ex * n * sinh(n * y)
     drv.memcpy_htod(self.mod_K.get_global("n2")[0], n2)
     drv.memcpy_htod(self.mod_K.get_global("pre_s1")[0], ex)
     drv.memcpy_htod(self.mod_K.get_global("pre_s2")[0], pre_s2)
     drv.memcpy_htod(self.mod_K.get_global("pre_s3")[0], pre_s3)