def batch_indexing(self, planes, data_points): data_size = data_points.shape[0] / 128 self.benchmark_begin('preparing') gpu_alloc_objs = [] # for data points #addresses = [] #for point in data_points: # point_addr = drv.to_device(point) # gpu_alloc_objs.append(point_addr) # addresses.append(int(point_addr)) #np_addresses = numpy.array(addresses).astype(numpy.uint64) # 64 bit addressing space. each point costs 8 bytes #arrays_gpu = drv.mem_alloc(np_addresses.shape[0] * 8) #drv.memcpy_htod(arrays_gpu, np_addresses) # for planes planes_addresses = [] for plane in planes: plane_addr = drv.to_device(plane) gpu_alloc_objs.append(plane_addr) planes_addresses.append(int(plane_addr)) planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64) # 64 bit addressing space. each point costs 8 bytes planes_arrays_gpu = drv.mem_alloc(planes_np_addresses.shape[0] * 8) drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses) # projections projections = numpy.zeros(data_size).astype(numpy.uint64) length = numpy.array([data_size]).astype(numpy.uint64) print "total: " + str(data_size) + " data points to indexing." self.benchmark_end('preparing') self.benchmark_begin('cudaing') self.indexing_kernel( planes_arrays_gpu, drv.In(data_points), drv.Out(projections), drv.In(length), block = self.block, grid = self.grid) self.benchmark_end('cudaing') #count = 0 #for pro in projections: # print "count: " + str(count) + " " + str(pro) # count += 1 #print projections.shape return projections
def _set(self, ary): # Allocate a new buffer with suitable padding and assign buf = np.zeros(self.datashape, dtype=self.dtype) buf[...,:self.ioshape[-1]] = ary # Copy cuda.memcpy_htod(self.data, buf)
def set_round_drill( self , size ) : sx , sy = self.get_scale() nx , ny = int(size / sx + .5) , int(size / sy + .5) self.drillflat = False print 'Setting round drill:' print size print sx , sy print nx , ny self.hdrill = np.zeros( (nx,ny) , np.float32 ) size /= 2.0 for x in range(nx) : for y in range(ny) : fx = (x-int(nx/2+.5)) * sx fy = (y-int(ny/2+.5)) * sy ts = size*size - fx*fx - fy*fy self.hdrill[x,y] = -m.sqrt( ts ) + size if ts > 0 else size*2 self.drillrad = size print self.hdrill print self.drillrad self.cdrill = cuda_driver.mem_alloc( self.hdrill.nbytes ) cuda_driver.memcpy_htod( self.cdrill , self.hdrill ) self.grid = map( int , ( m.ceil(nx/22.0) , m.ceil(ny/22.0) ) ) self.block = ( min(nx,22) , min(ny,22) , 1 ) print self.grid print self.block
def __init__(self, view_tile, size, sigma, debug=False): self.debug = debug if size[0] < 2 or size[1] < 2: raise ValueError("Split needs to be at least 2x2") self.data_sets = view_tile.get_Data() for dset in self.data_sets: data = dset.getDataSet() if not data.flags['C_CONTIGUOUS']: print "NOT CONTIGUOUS, trying to reformat the points" data = np.require(data, dtype=data.dtype, requirements=['C']) if not data.flags['C_CONTIGUOUS']: raise Exception("Points are not contiguous") dset.setDataSet(data) self.view_tile = view_tile self.sigma = sigma self.pts_gpu = None # Initiates all of cuda stuff self.grid = np.zeros(size).astype(np.float32) self.grid_gpu = cuda.mem_alloc_like(self.grid) cuda.memcpy_htod(self.grid_gpu, self.grid) kernel = SourceModule(self.__cuda_code) self.gpu_gaussian = kernel.get_function("gpu_gaussian") self.view = self.view_tile.get_View() self.grid_size, self.block_size = self.__setup_cuda_sizes(size) self.dx = 1 / float(size[1] - 1) self.dy = 1 / float(size[0] - 1)
def edgetaper_gpu(y_gpu, sf, win='barthann'): shape = np.array(y_gpu.shape).astype(np.uint32) dtype = y_gpu.dtype block_size = (16,16,1) grid_size = (int(np.ceil(float(shape[1])/block_size[0])), int(np.ceil(float(shape[0])/block_size[1]))) # Ensure that sf is odd sf = sf+(1-np.mod(sf,2)) wx = scipy.signal.get_window(win, sf[1]) wy = scipy.signal.get_window(win, sf[0]) maxw = wx.max() * wy.max() hsf = np.floor(sf/2) wx = (wx[0:hsf[1]] / maxw).astype(dtype) wy = (wy[0:hsf[0]] / maxw).astype(dtype) preproc = _generate_preproc(dtype, shape) preproc += '#define wx_size %d\n' % wx.size preproc += '#define wy_size %d\n' % wy.size mod = SourceModule(preproc + edgetaper_code, keep=True) edgetaper_gpu = mod.get_function("edgetaper") wx_gpu, wx_size = mod.get_global('wx') wy_gpu, wy_size = mod.get_global('wy') cu.memcpy_htod(wx_gpu, wx) cu.memcpy_htod(wy_gpu, wy) edgetaper_gpu(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]), block=block_size, grid=grid_size)
def test_prepared_invocation(self): a = np.random.randn(4,4).astype(np.float32) a_gpu = drv.mem_alloc(a.size * a.dtype.itemsize) drv.memcpy_htod(a_gpu, a) mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*blockDim.x; a[idx] *= 2; } """) func = mod.get_function("doublify") func.prepare("P") func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20) a_doubled = np.empty_like(a) drv.memcpy_dtoh(a_doubled, a_gpu) print (a) print (a_doubled) assert la.norm(a_doubled-2*a) == 0 # now with offsets func.prepare("P") a_quadrupled = np.empty_like(a) func.prepared_call((1, 1), (15,1,1), int(a_gpu)+a.dtype.itemsize) drv.memcpy_dtoh(a_quadrupled, a_gpu) assert la.norm(a_quadrupled[1:]-4*a[1:]) == 0
def test_constant_memory(self): # contributed by Andrew Wagner module = SourceModule(""" __constant__ float const_array[32]; __global__ void copy_constant_into_global(float* global_result_array) { global_result_array[threadIdx.x] = const_array[threadIdx.x]; } """) copy_constant_into_global = module.get_function("copy_constant_into_global") const_array, _ = module.get_global('const_array') host_array = np.random.randint(0,255,(32,)).astype(np.float32) global_result_array = drv.mem_alloc_like(host_array) drv.memcpy_htod(const_array, host_array) copy_constant_into_global( global_result_array, grid=(1, 1), block=(32, 1, 1)) host_result_array = np.zeros_like(host_array) drv.memcpy_dtoh(host_result_array, global_result_array) assert (host_result_array == host_array).all
def loop(iterations): ts = 0 while(ts<iterations): ' To avoid overwrites a temporary copy is made of F ' T[:] = F cuda.memcpy_htod(T_gpu, T) ' Propagate ' prop(F_gpu, T_gpu, block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY)) ' Calculate density and get bounceback from obstacle nodes ' density(F_gpu, BOUND_gpu, BOUNCEBACK_gpu, DENSITY_gpu, UX_gpu, UY_gpu, block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY)) ' Calculate equilibrium ' eq(F_gpu, FEQ_gpu, DENSITY_gpu, UX_gpu, UY_gpu, U_SQU_gpu, U_C2_gpu, U_C4_gpu, U_C6_gpu, U_C8_gpu, block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY)) ' Transfer bounceback to obstacle nodes ' bounceback(F_gpu, BOUNCEBACK_gpu, BOUND_gpu, block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY)) ' Copy F to host for copy to T in beginning of loop ' cuda.memcpy_dtoh(F, F_gpu) ts += 1
def _set(self, ary): # Allocate a new buffer with suitable padding and pack it buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype) buf[:, :self.ncol] = self._pack(ary) # Copy cuda.memcpy_htod(self.data, buf)
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs): # Determine number of threads and blocks for the GPU number_of_blobs = np.int32(len(r_vectors)) threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs) # Get parameters from arguments L = kwargs.get('periodic_length') eps = kwargs.get('repulsion_strength') b = kwargs.get('debye_length') blob_radius = kwargs.get('blob_radius') # Reshape arrays x = np.reshape(r_vectors, number_of_blobs * 3) f = np.empty_like(x) # Allocate GPU memory x_gpu = cuda.mem_alloc(x.nbytes) f_gpu = cuda.mem_alloc(f.nbytes) # Copy data to the GPU (host to device) cuda.memcpy_htod(x_gpu, x) # Get blob-blob force function force = mod.get_function("calc_blob_blob_force") # Compute mobility force product force(x_gpu, f_gpu, np.float64(eps), np.float64(b), np.float64(blob_radius), np.float64(L[0]), np.float64(L[1]), np.float64(L[2]), number_of_blobs, block=(threads_per_block, 1, 1), grid=(num_blocks, 1)) # Copy data from GPU to CPU (device to host) cuda.memcpy_dtoh(f, f_gpu) return np.reshape(f, (number_of_blobs, 3))
def prepare_device_arrays(self): self.maxLayers = self.grid_prop.GetMaxLayers() nczbins_fine = len(self.czcen_fine) numLayers = np.zeros(nczbins_fine,dtype=np.int32) densityInLayer = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE) distanceInLayer = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE) self.grid_prop.GetNumberOfLayers(numLayers) self.grid_prop.GetDensityInLayer(densityInLayer) self.grid_prop.GetDistanceInLayer(distanceInLayer) # Copy all these earth info arrays to device: self.d_numLayers = cuda.mem_alloc(numLayers.nbytes) self.d_densityInLayer = cuda.mem_alloc(densityInLayer.nbytes) self.d_distanceInLayer = cuda.mem_alloc(distanceInLayer.nbytes) cuda.memcpy_htod(self.d_numLayers,numLayers) cuda.memcpy_htod(self.d_densityInLayer,densityInLayer) cuda.memcpy_htod(self.d_distanceInLayer,distanceInLayer) self.d_ecen_fine = cuda.mem_alloc(self.ecen_fine.nbytes) self.d_czcen_fine = cuda.mem_alloc(self.czcen_fine.nbytes) cuda.memcpy_htod(self.d_ecen_fine,self.ecen_fine) cuda.memcpy_htod(self.d_czcen_fine,self.czcen_fine) return
def _read_LPU_input(self, in_gpot_dict, in_spike_dict): """ Put inputs from other LPUs to buffer. """ for other_lpu, gpot_data in in_gpot_dict.iteritems(): i = self.other_lpu_map[other_lpu] if self.num_input_gpot_neurons[i] > 0: cuda.memcpy_htod(int(int(self.buffer.gpot_buffer.gpudata) \ +(self.buffer.gpot_current * self.buffer.gpot_buffer.ld \ + self.my_num_gpot_neurons + self.cum_virtual_gpot_neurons[i]) \ * self.buffer.gpot_buffer.dtype.itemsize), gpot_data) if self.debug: self.in_gpot_files[other_lpu].root.array.append(gpot_data.reshape(1,-1)) #Will need to change this if only spike indexes are transmitted for other_lpu, sparse_spike in in_spike_dict.iteritems(): i = self.other_lpu_map[other_lpu] if self.num_input_spike_neurons[i] > 0: full_spike = np.zeros(self.num_input_spike_neurons[i],dtype=np.int32) if len(sparse_spike)>0: idx = np.asarray([self.input_spike_idx_map[i][k] \ for k in sparse_spike], dtype=np.int32) full_spike[idx] = 1 cuda.memcpy_htod(int(int(self.buffer.spike_buffer.gpudata) \ +(self.buffer.spike_current * self.buffer.spike_buffer.ld \ + self.my_num_spike_neurons + self.cum_virtual_spike_neurons[i]) \ * self.buffer.spike_buffer.dtype.itemsize), full_spike)
def cuda_crossOver(sola, solb): """ """ sol_len = len(sola); a_gpu = cuda.mem_alloc(sola.nbytes); b_gpu = cuda.mem_alloc(solb.nbytes); cuda.memcpy_htod(a_gpu, sola); cuda.memcpy_htod(b_gpu, solb); func = mod.get_function("crossOver"); func(a_gpu,b_gpu, block=(sol_len,1,1)); a_new = numpy.empty_like(sola); b_new = numpy.empty_like(solb); cuda.memcpy_dtoh(a_new, a_gpu); cuda.memcpy_dtoh(b_new, b_gpu); if debug == True: print "a:", a; print "b:",b; print "new a:",a_new; print "new b:",b_new; return a_new,b_new;
def compile_for_GPU(function_package, kernel_function_name='default'): kernel_code = '' if kernel_function_name == 'default': kernel_code = attachment source_module_dict[kernel_function_name] = CustomSourceModule(kernel_code) else: fp = function_package from vivaldi_translator import translate_to_CUDA function_name = fp.function_name Vivaldi_code = function_code_dict[function_name] function_code = translate_to_CUDA(Vivaldi_code=Vivaldi_code, function_name=function_name, function_arguments=fp.function_args) kernel_code = attachment + 'extern "C"{\n' kernel_code += function_code kernel_code += '\n}' if True: # print for debugging f = open('asdf.cu','w') f.write(kernel_code) f.close() #print function_code args = [kernel_code] source_module_dict[kernel_function_name] = CustomSourceModule(kernel_code) temp,_ = source_module_dict[kernel_function_name].get_global('DEVICE_NUMBER') cuda.memcpy_htod(temp, numpy.int32(device_number)) func_dict[kernel_function_name] = source_module_dict[kernel_function_name].get_function(kernel_function_name) create_helper_textures(source_module_dict[kernel_function_name])
def from_np(np_data): cudabuf = cuda.mem_alloc(np_data.nbytes) cuda.memcpy_htod(cudabuf, np_data) # self.cpudata = np_data tensor = MyTensor(cudabuf, shape=np_data.shape, size=np_data.size) tensor.cpudata = np_data return tensor
def calc_psd(self,bitloads,xtalk): #Number of expected permutations Ncombinations=self.K #Check if this is getting hairy and assign grid/block dimensions (warpcount,warpperblock,threadCount,blockCount) = self._workload_calc(Ncombinations) #How many individual lk's memdim=blockCount*threadCount threadshare_grid=(blockCount,1) threadshare_block=(threadCount,1,1) #Memory (We get away with the NCombinations because calpsd checks against it) d_a=cuda.mem_alloc(np.zeros((Ncombinations*self.N*self.N)).astype(self.type).nbytes) d_p=cuda.mem_alloc(np.zeros((Ncombinations*self.N)).astype(self.type).nbytes) d_bitload=cuda.mem_alloc(np.zeros((self.K*self.N)).astype(np.int32).nbytes) d_XTG=cuda.mem_alloc(np.zeros((self.K*self.N*self.N)).astype(self.type).nbytes) h_p=np.zeros((self.K,self.N)).astype(self.type) cuda.memcpy_htod(d_bitload,util.mat2arr(bitloads).astype(np.int32)) cuda.memcpy_htod(d_XTG,xtalk.astype(self.type)) #Go solve #__global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){ self.k_calcpsd(d_a,d_p,d_XTG,d_bitload,np.int32(Ncombinations),block=threadshare_block,grid=threadshare_grid) cuda.Context.synchronize() cuda.memcpy_dtoh(h_p,d_p) d_a.free() d_bitload.free() d_XTG.free() d_p.free() return h_p.astype(np.float64)
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors): # create Cheetah template and fill in variables for mask kernel mask_template = Template(mask_source) mask_template.BLOCK_DIM_X = b_size[0] mask_template.BLOCK_DIM_Y = b_size[1] mask_template.WIDTH = dest_im.shape[1] mask_template.HEIGHT = dest_im.shape[0] mask_template.RGB = RGB mask_template.NEIGHBORS = neighbors # compile the CUDA kernel mask_kernel = cuda_compile(mask_template, "mask_kernel") # alloc memory to GPU d_source = cu.mem_alloc(source_im.nbytes) cu.memcpy_htod(d_source, source_im) # sends to GPU filter out interior points in the mask mask_kernel(d_source, block=b_size, grid=g_size) # retrieves interior point buffer from GPU inner_buffer = np.array(dest_im, dtype =np.uint8) cu.memcpy_dtoh(inner_buffer, d_source) # returns the interior buffer return inner_buffer
def test_pycuda(self): """ Test pycuda installation with small example. :return: :rtype: """ try: import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import numpy as np a = np.random.randn(4, 4) print(a) a= a.astype(np.float32) a_gpu = cuda.mem_alloc(a.nbytes) cuda.memcpy_htod(a_gpu, a) mod = SourceModule( """ __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*4; a[idx] *= 2; } """ ) func = mod.get_function("doublify") func(a_gpu, block=(4,4,1)) a_doubled = np.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) #print(a_doubled) #print(a) except Exception: self.fail('Still not working')
def __init__(self, n_dict, V, dt, debug=False, LPU_id=None): self.num_neurons = len(n_dict['id']) self.dt = np.double(dt) self.steps = max(int(round(dt / 1e-5)), 1) self.debug = debug self.LPU_id = LPU_id self.ddt = dt / self.steps self.V = V self.X0 = garray.to_gpu( np.asarray( n_dict['X0'], dtype=np.float64 )) self.X1 = garray.to_gpu( np.asarray( n_dict['X1'], dtype=np.float64 )) self.X2 = garray.to_gpu( np.asarray( n_dict['X2'], dtype=np.float64 )) # Copies an initial V into V cuda.memcpy_htod(int(self.V), np.asarray(n_dict['Vinit'], dtype=np.double)) self.update = self.get_gpu_kernel() if self.debug: if self.LPU_id is None: self.LPU_id = "anon" self.I_file = tables.openFile(self.LPU_id + "_I.h5", mode="w") self.I_file.createEArray("/","array", \ tables.Float64Atom(), (0,self.num_neurons)) self.V_file = tables.openFile(self.LPU_id + "_V.h5", mode="w") self.V_file.createEArray("/","array", \ tables.Float64Atom(), (0,self.num_neurons))
def __init__(self, pts, axis, split, sigma): if split[0] < 2 or split[1] < 2: raise ValueError("Split needs to be at least 2x2") if not pts.flags['C_CONTIGUOUS']: pts = np.require(pts, dtype=pts.dtype, requirements=['C']) if not pts.flags['C_CONTIGUOUS']: raise Exception("Points are not contiguous") self.axis = axis self.sigma = sigma self.pts = pts self.pts_gpu = None # Initiates all of cuda stuff self.grid = np.zeros(split).astype(pts.dtype) self.grid_gpu = cuda.mem_alloc_like(self.grid) cuda.memcpy_htod(self.grid_gpu, self.grid) kernel = SourceModule(self.__cuda_code) self.gpu_gaussian = kernel.get_function("gpu_gaussian") self.dx = 1 / float(split[0] - 1) self.dy = 1 / float(split[1] - 1) self.grid_size, self.block_size = self.__setup_cuda_sizes(split)
def GPU(): im = Image.open(sys.argv[1]) print sys.argv[1], ": ", im.format, im.size, im.mode, '\n' pixels = np.array(im.getdata()) #r, g, b = im.split() print pixels #print pixels[:,0].nbytes pixels = np.array(im) gpu = cuda.mem_alloc(pixels.nbytes) cuda.memcpy_htod(gpu, pixels) kernel = SourceModule(""" #define MAX_PIXEL_VALUE 255 #define THRESHOLD 50 __global__ void process_pixel(int *r, int *g, int *b) { int id = blockDim.x * blockIdx.x + threadIdx.x; if ((r[id] > THRESHOLD) && (g[id] > THRESHOLD) && (b[id] > THRESHOLD)) { r[id] = MAX_PIXEL_VALUE; g[id] = MAX_PIXEL_VALUE; b[id] = MAX_PIXEL_VALUE; } } """) func = kernel.get_function("process_pixel") func(gpu, block=(4,4,1)) newpixels = np.zeros_like(pixels) cuda.memcpy_dtoh(newpixels, gpu)
def __init__(self, n_dict, V, dt, debug=False, cuda_verbose=False): if cuda_verbose: self.compile_options = ['--ptxas-options=-v'] else: self.compile_options = [] self.num_neurons = len(n_dict['id']) self.dt = np.double(dt) self.steps = max(int(round(dt / 1e-5)),1) self.debug = debug self.ddt = dt / self.steps self.V = V self.n = garray.to_gpu(np.asarray(n_dict['initn'], dtype=np.float64)) self.V_1 = garray.to_gpu(np.asarray(n_dict['V1'], dtype=np.float64)) self.V_2 = garray.to_gpu(np.asarray(n_dict['V2'], dtype=np.float64)) self.V_3 = garray.to_gpu(np.asarray(n_dict['V3'], dtype=np.float64)) self.V_4 = garray.to_gpu(np.asarray(n_dict['V4'], dtype=np.float64)) self.Tphi = garray.to_gpu(np.asarray(n_dict['phi'], dtype=np.float64)) self.offset = garray.to_gpu(np.asarray(n_dict['offset'], dtype=np.float64)) cuda.memcpy_htod(int(self.V), np.asarray(n_dict['initV'], dtype=np.double)) self.update = self.get_euler_kernel()
def _to_device(self, module): ptr, size = module.get_global(self.name) if size != self.data.nbytes: raise RuntimeError("Const %s needs %d bytes, but only space for %d" % (self, self.data.nbytes, size)) if self.state is DeviceDataMixin.HOST: driver.memcpy_htod(ptr, self._data) self.state = DeviceDataMixin.BOTH
def evaluate(self, params, returnOutputs=False): """Evaluate several networks (with given params) on training set. @param params: network params @type params: list of Parameters @param returnOutputs: return network output values (debug) @type returnOutputs: bool, default False @return output matrix if returnOutputs=True, else None """ if self.popSize != len(params): raise ValueError("Need %d Parameter structures (provided %d)" % ( self.popSize, len(params))) paramArrayType = Parameters * len(params) driver.memcpy_htod(self.params, paramArrayType(*params)) # TODO: remove driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4) self.evaluateKernel.prepared_call(self.evaluateGridDim, self.trainSetDev, self.trainSet.size, self.params, self.popSize, self.outputs) driver.Context.synchronize() self.outputsMat = driver.from_device(self.outputs, shape=(self.popSize, self.trainSet.size), dtype=np.float32) if returnOutputs: return self.outputsMat
def __init__(self, n_dict, V, dt, debug=False, cuda_verbose=False): if cuda_verbose: self.compile_options = ["--ptxas-options=-v"] else: self.compile_options = [] self.num_neurons = len(n_dict["id"]) self.dt = np.double(dt) self.steps = max(int(round(dt / 1e-5)), 1) self.debug = debug self.ddt = dt / self.steps self.V = V self.n = garray.to_gpu(np.asarray(n_dict["initn"], dtype=np.float64)) self.V_1 = garray.to_gpu(np.asarray(n_dict["V1"], dtype=np.float64)) self.V_2 = garray.to_gpu(np.asarray(n_dict["V2"], dtype=np.float64)) self.V_3 = garray.to_gpu(np.asarray(n_dict["V3"], dtype=np.float64)) self.V_4 = garray.to_gpu(np.asarray(n_dict["V4"], dtype=np.float64)) self.V_l = garray.to_gpu(np.asarray(n_dict["V_l"], dtype=np.float64)) self.V_ca = garray.to_gpu(np.asarray(n_dict["V_ca"], dtype=np.float64)) self.V_k = garray.to_gpu(np.asarray(n_dict["V_k"], dtype=np.float64)) self.G_l = garray.to_gpu(np.asarray(n_dict["G_l"], dtype=np.float64)) self.G_ca = garray.to_gpu(np.asarray(n_dict["G_ca"], dtype=np.float64)) self.G_k = garray.to_gpu(np.asarray(n_dict["G_k"], dtype=np.float64)) self.Tphi = garray.to_gpu(np.asarray(n_dict["phi"], dtype=np.float64)) self.offset = garray.to_gpu(np.asarray(n_dict["offset"], dtype=np.float64)) cuda.memcpy_htod(int(self.V), np.asarray(n_dict["initV"], dtype=np.double)) self.update = self.get_euler_kernel()
def calc_bandwidth_h2d( s ): t1 = datetime.now() cuda.memcpy_htod( s.dev_a, s.a ) dt = datetime.now() - t1 dt_float = dt.seconds + dt.microseconds*1e-6 return s.nbytes/dt_float/gbytes
def __compile_kernels(self): """ DFS module """ f = self.forest self.find_min_kernel = f.find_min_kernel self.fill_kernel = f.fill_kernel self.scan_reshuffle_tex = f.scan_reshuffle_tex self.comput_total_2d = f.comput_total_2d self.reduce_2d = f.reduce_2d self.scan_total_2d = f.scan_total_2d self.scan_reduce = f.scan_reduce """ BFS module """ self.scan_total_bfs = f.scan_total_bfs self.comput_bfs_2d = f.comput_bfs_2d self.fill_bfs = f.fill_bfs self.reshuffle_bfs = f.reshuffle_bfs self.reduce_bfs_2d = f.reduce_bfs_2d self.get_thresholds = f.get_thresholds """ Other """ self.predict_kernel = f.predict_kernel self.mark_table = f.mark_table const_sorted_indices = f.bfs_module.get_global("sorted_indices_1")[0] const_sorted_indices_ = f.bfs_module.get_global("sorted_indices_2")[0] cuda.memcpy_htod(const_sorted_indices, np.uint64(self.sorted_indices_gpu.ptr)) cuda.memcpy_htod(const_sorted_indices_, np.uint64(self.sorted_indices_gpu_.ptr))
def __init__(self, n_dict, V, dt, debug=False): self.num_neurons = len(n_dict['id']) self.dt = np.double(dt) self.steps = max(int(round(dt / 1e-5)), 1) self.debug = debug self.ddt = dt / self.steps self.V = V self.n = garray.to_gpu(np.asarray(n_dict['initn'], dtype=np.float64)) self.V_1 = garray.to_gpu(np.asarray(n_dict['V1'], dtype=np.float64)) self.V_2 = garray.to_gpu(np.asarray(n_dict['V2'], dtype=np.float64)) self.V_3 = garray.to_gpu(np.asarray(n_dict['V3'], dtype=np.float64)) self.V_4 = garray.to_gpu(np.asarray(n_dict['V4'], dtype=np.float64)) self.V_l = garray.to_gpu(np.asarray(n_dict['V_l'], dtype = np.float64)) self.V_ca = garray.to_gpu(np.asarray(n_dict['V_ca'], dtype = np.float64)) self.V_k = garray.to_gpu(np.asarray(n_dict['V_k'], dtype = np.float64)) self.G_l = garray.to_gpu(np.asarray(n_dict['G_l'], dtype = np.float64)) self.G_ca = garray.to_gpu(np.asarray(n_dict['G_ca'], dtype = np.float64)) self.G_k = garray.to_gpu(np.asarray(n_dict['G_k'], dtype = np.float64)) self.Tphi = garray.to_gpu(np.asarray(n_dict['phi'], dtype=np.float64)) self.offset = garray.to_gpu(np.asarray(n_dict['offset'], dtype=np.float64)) cuda.memcpy_htod(int(self.V), np.asarray(n_dict['initV'], dtype=np.double)) self.update = self.get_euler_kernel()
def __compute_guassian_on_pts(self): view = self.view_tile.get_View() for dset in self.data_sets: _data = np.array(dset.getDataSet(), copy=True) _data[:, 0] = (_data[:, 0] - view.left)/view.width() _data[:, 1] = (_data[:, 1] - view.bottom)/view.height() for row in range(self.grid_size[0]): for col in range(self.grid_size[1]): # 3 * SIGMA give the 95% left = 1 / float(self.grid_size[1]) * col - (3 * self.sigma) right = 1 / float(self.grid_size[1]) * (col + 1) + (3 * self.sigma) bottom = 1 / float(self.grid_size[0]) * row - (3 * self.sigma) top = 1 / float(self.grid_size[0]) * (row + 1) + (3 * self.sigma) pts = getFilteredDataSet(_data, (left, right, bottom, top)) if len(pts) > 0: self.pts_gpu = cuda.mem_alloc_like(pts) cuda.memcpy_htod(self.pts_gpu, pts) self.gpu_gaussian(self.grid_gpu, # Grid self.pts_gpu, # Points np.int32(col), # Block Index x np.int32(row), # Block Index y np.int32(self.grid_size[1]), # Grid Dimensions x np.int32(self.grid_size[0]), # Grid Dimensions y np.int32(pts.shape[0]), # Point Length np.float32(self.dx), # dx np.float32(self.dy), # dy np.float32(self.sigma), # Sigma block=self.block_size) self.pts_gpu.free()
def set(self, ary, device=None): """ copy host array to device. Arguments: ary: host array, needs to be contiguous device: device id, if not the one attached to current context Returns: self """ assert ary.size == self.size assert self.is_contiguous, "Array in set() must be contiguous" if ary.dtype is not self.dtype: ary = ary.astype(self.dtype) assert ary.strides == self.strides if device is None: drv.memcpy_htod(self.gpudata, ary) else: # with multithreaded datasets, make a context before copying # and destroy it again once done. ctx = drv.Device(device).make_context() drv.memcpy_htod(self.gpudata, ary) ctx.pop() del ctx return self
h_b = np.random.randn(1, N) """Cast ```h_a``` and ```h_b``` to single precision (```float32```).""" h_a = h_a.astype(np.float32) h_b = h_b.astype(np.float32) """Allocate ```h_a.nbytes```, ```h_b.nbytes``` and ```h_c.nbytes``` of GPU device memory space pointed to by ```d_a```, ```d_b``` and ```d_c```, respectively.""" d_a = cuda.mem_alloc(h_a.nbytes) d_b = cuda.mem_alloc(h_b.nbytes) d_c = cuda.mem_alloc(h_a.nbytes) """Copy the ```h_a``` and ```h_b``` arrays from host to the device arrays ```d_a``` and ```d_b```, respectively.""" cuda.memcpy_htod(d_a, h_a) cuda.memcpy_htod(d_b, h_b) """Define the CUDA kernel function ```deviceAdd``` as a string. ```deviceAdd``` performs the elementwise summation of ```d_a``` and ```d_b``` and puts the result in ```d_c```.""" mod = SourceModule(""" #include <stdio.h> __global__ void deviceAdd(float * __restrict__ d_c, const float * __restrict__ d_a, const float * __restrict__ d_b, const int N) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= N) return; d_c[tid] = d_a[tid] + d_b[tid]; } """) """Define a reference to the ```__global__``` function ```deviceAdd```."""
# matrix B b = np.random.randn(n, n) * 10 b = b.astype(np.float32) # matrix B c = np.empty([n, n]) c = c.astype(np.float32) # allocate memory on device a_gpu = cuda.mem_alloc(a.nbytes) b_gpu = cuda.mem_alloc(b.nbytes) c_gpu = cuda.mem_alloc(c.nbytes) # copy matrix to memory cuda.memcpy_htod(a_gpu, a) cuda.memcpy_htod(b_gpu, b) # compile kernel mod = SourceModule(""" __global__ void matmul2(int n, float *a, float *b, float *c) { int row_num_a,col_num_a,row_num_b,col_num_b,row_num_c,col_num_c; row_num_a=col_num_a=row_num_b=col_num_b=row_num_c=col_num_c = n; __shared__ float A[32][32];//2D array for storing shared matrix values of A & B __shared__ float B[32][32];//Process subMatrix in block int Col=blockIdx.x*blockDim.x+threadIdx.x;//Col and Row Ids of threads int Row=blockIdx.y*blockDim.y+threadIdx.y; double temp = 0;
def sampleC(self): """ Sample the process affiliations for each event. These must be done sequentially since they are made dependent through the parent relationships Z. """ N = self.base.data.N K = self.modelParams["proc_id_model", "K"] D = self.base.data.D gridx = int(np.ceil(np.float32(N) / self.params["blockSz"])) for n in np.arange(self.base.data.N): # Sample c_n conditioned upon all other C's self.gpuKernels["computePerSpikePrCn"]( np.int32(n), np.int32(N), np.int32(K), np.int32(D), self.base.data.gpu.X.gpudata, self.gpuPtrs["proc_id_model", "Xmean"].gpudata, self.gpuPtrs["proc_id_model", "Xprec"].gpudata, self.gpuPtrs["parent_model", "Z"].gpudata, self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "A"].gpudata, self.gpuPtrs["weight_model", "W"].gpudata, self.gpuPtrs["bkgd_model", "lam"].gpudata, self.gpuPtrs["proc_id_model", "Xstats"].gpudata, block=(self.params["blockSz"], 1, 1), grid=(gridx, K)) # Sum the log prob for each process and sample a new cn prcn = np.zeros((K, ), dtype=np.float32) blockLogPrSum = self.gpuPtrs["proc_id_model", "Xstats"].get() prcn[:] = np.sum(blockLogPrSum, 1) try: cn = log_sum_exp_sample(prcn) except Exception as ex: log.info("Exception on spike %d!", n) log.info(ex) log.info("K=%d", K) log.info("X[n]:") log.info(self.base.data.gpu.X.get()[:, n]) WGS = self.gpuPtrs["graph_model", "WGS"].get() log.info("WGS[:,%d]:", n) log.info(WGS[:, n]) C = self.gpuPtrs["proc_id_model", "C"].get() Z = self.gpuPtrs["parent_model", "Z"].get() A = self.gpuPtrs["graph_model", "A"].get() W = self.gpuPtrs["weight_model", "W"].get() log.info("W") log.info(W) if Z[n] > -1: log.info("Spike %d (c=%d) parented by spike %d (c=%d)", n, C[n], Z[n], C[Z[n]]) # log.info ("A") # log.info(self.gpuPtrs["graph_model","A"].get()) log.info("Edge exists from parent? ") log.info(A[C[Z[n]], C[n]]) # log.info ("W") # log.info(self.gpuPtrs["weight_model","A"].get()) log.info("Weight from parent:") log.info(W[C[Z[n]], C[n]]) log.info("Spikes parented by n") log.info(np.count_nonzero(Z == n)) for ch in np.nonzero(Z == n)[0]: log.info("Spike %d (c=%d) parented spike %d (c=%d)", n, C[n], ch, C[ch]) log.info("Edge exists to child? ") log.info(A[C[n], C[ch]]) if not A[C[n], C[ch]]: log.info("WGS[:,%d]:", ch) log.info(WGS[:, ch]) log.info("lam:") log.info(self.gpuPtrs["bkgd_model", "lam"].get()[:, n]) log.info("gaussians") log.info(self.gpuPtrs["proc_id_model", "Xmean"].get()[C[n], :]) log.info(self.gpuPtrs["proc_id_model", "Xprec"].get()[C[n], :, :]) log.info("blockLogPrSum") log.info(blockLogPrSum) log.info("prcn") log.info(prcn) exit() # Copy the new cn to the GPU cn_buff = np.array([cn], dtype=np.int32) cuda.memcpy_htod( self.gpuPtrs["proc_id_model", "C"].ptr + int(n * cn_buff.itemsize), cn_buff) # Update Ns and C C = self.gpuPtrs["proc_id_model", "C"].get() self.modelParams["proc_id_model", "C"] = C Ns = np.zeros((K, ), dtype=np.int32) for n in np.arange(N): Ns[C[n]] += 1 self.modelParams["proc_id_model", "Ns"] = Ns self.gpuPtrs["proc_id_model", "Ns"].set(self.modelParams["proc_id_model", "Ns"])
def solve_gpu(currentmodelrun, modelend, G): """Solving using FDTD method on GPU. Implemented using Nvidia CUDA. Args: currentmodelrun (int): Current model run number. modelend (int): Number of last model to run. G (class): Grid class instance - holds essential parameters describing the model. Returns: tsolve (float): Time taken to execute solving """ import pycuda.driver as drv from pycuda.compiler import SourceModule drv.init() # Create device handle and context on specifc GPU device (and make it current context) dev = drv.Device(G.gpu.deviceID) ctx = dev.make_context() # Electric and magnetic field updates - prepare kernels, and get kernel functions if Material.maxpoles > 0: kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3])) else: # Set to one any substitutions for dispersive materials kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1)) update_e_gpu = kernels_fields.get_function("update_e") update_h_gpu = kernels_fields.get_function("update_h") # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0] if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem: raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name)) else: drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU if Material.maxpoles > 0: # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values). update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A") update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B") G.gpu_initialise_dispersive_arrays() # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU G.gpu_set_blocks_per_grid() G.gpu_initialise_arrays() # PML updates if G.pmls: # Prepare kernels kernels_pml = SourceModule(kernels_template_pml.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_R=G.pmls[0].ERA.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3])) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Set block per grid, initialise arrays on GPU, and get kernel functions for pml in G.pmls: pml.gpu_set_blocks_per_grid(G) pml.gpu_initialise_arrays() pml.gpu_get_update_funcs(kernels_pml) # Receivers if G.rxs: # Initialise arrays on GPU rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G) # Prepare kernel and get kernel function kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2])) store_outputs_gpu = kernel_store_outputs.get_function("store_outputs") # Sources - initialise arrays on GPU, prepare kernel and get kernel functions if G.voltagesources + G.hertziandipoles + G.magneticdipoles: kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3])) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) if G.hertziandipoles: srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G) update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole") if G.magneticdipoles: srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G) update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole") if G.voltagesources: srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G) update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source") # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions if G.snapshots: # Initialise arrays on GPU snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G) # Prepare kernel and get kernel function kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2])) store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot") # Iteration loop timer iterstart = drv.Event() iterend = drv.Event() iterstart.record() for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=G.tqdmdisable): # Get GPU memory usage on final iteration if iteration == G.iterations - 1: memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0] # Store field component values for every receiver if G.rxs: store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration), rxcoords_gpu.gpudata, rxs_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1)) # Store any snapshots for i, snap in enumerate(G.snapshots): if snap.time == iteration + 1: store_snapshot_gpu(np.int32(i), np.int32(snap.xs), np.int32(snap.xf), np.int32(snap.ys), np.int32(snap.yf), np.int32(snap.zs), np.int32(snap.zf), np.int32(snap.dx), np.int32(snap.dy), np.int32(snap.dz), G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata, snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata, block=Snapshot.tpb, grid=Snapshot.bpg) if G.snapsgpu2cpu: gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) # Update magnetic field components update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update magnetic field components with the PML correction for pml in G.pmls: pml.gpu_update_magnetic(G) # Update magnetic field components for magetic dipole sources if G.magneticdipoles: update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata, srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1)) # Update electric field components # If all materials are non-dispersive do standard update if Material.maxpoles == 0: update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # If there are any dispersive materials do 1st part of dispersive update # (it is split into two parts as it requires present and updated electric field values). else: update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update electric field components with the PML correction for pml in G.pmls: pml.gpu_update_electric(G) # Update electric field components for voltage sources if G.voltagesources: update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata, srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1)) # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last) if G.hertziandipoles: update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata, srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1)) # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates. if Material.maxpoles > 0: update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Copy output from receivers array back to correct receiver objects if G.rxs: gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G) # Copy data from any snapshots back to correct snapshot objects if G.snapshots and not G.snapsgpu2cpu: for i, snap in enumerate(G.snapshots): gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) iterend.record() iterend.synchronize() tsolve = iterstart.time_till(iterend) * 1e-3 # Remove context from top of stack and delete ctx.pop() del ctx return tsolve, memsolve
def _run_simulation(self, parameters, init_values, blocks, threads, in_atol=1e-6, in_rtol=1e-6): total_threads = threads * blocks experiments = len(parameters) neqn = self._speciesNumber # compile init_common_kernel = self._completeCode.get_function("init_common") init_common_kernel(block=(threads, 1, 1), grid=(blocks, 1)) # output array ret_xt = np.zeros( [total_threads, 1, self._resultNumber, self._speciesNumber]) ret_istate = np.ones([total_threads], dtype=np.int32) # calculate sizes of work spaces isize = 20 + self._speciesNumber rsize = 22 + self._speciesNumber * max(16, self._speciesNumber + 9) # local variables t = np.zeros([total_threads], dtype=np.float64) jt = np.zeros([total_threads], dtype=np.int32) neq = np.zeros([total_threads], dtype=np.int32) itol = np.zeros([total_threads], dtype=np.int32) iopt = np.zeros([total_threads], dtype=np.int32) rtol = np.zeros([total_threads], dtype=np.float64) iout = np.zeros([total_threads], dtype=np.int32) tout = np.zeros([total_threads], dtype=np.float64) itask = np.zeros([total_threads], dtype=np.int32) istate = np.zeros([total_threads], dtype=np.int32) atol = np.zeros([total_threads], dtype=np.float64) liw = np.zeros([total_threads], dtype=np.int32) lrw = np.zeros([total_threads], dtype=np.int32) iwork = np.zeros([isize * total_threads], dtype=np.int32) rwork = np.zeros([rsize * total_threads], dtype=np.float64) y = np.zeros([self._speciesNumber * total_threads], dtype=np.float64) for i in range(total_threads): neq[i] = neqn t[i] = 0 itol[i] = 1 itask[i] = 1 istate[i] = 1 iopt[i] = 0 jt[i] = 2 atol[i] = in_atol rtol[i] = in_rtol liw[i] = isize lrw[i] = rsize try: # initial conditions for j in range(self._speciesNumber): # loop over species y[i * self._speciesNumber + j] = init_values[i][j] ret_xt[i, 0, 0, j] = init_values[i][j] except IndexError: pass # allocate on device d_t = driver.mem_alloc(t.size * t.dtype.itemsize) d_jt = driver.mem_alloc(jt.size * jt.dtype.itemsize) d_neq = driver.mem_alloc(neq.size * neq.dtype.itemsize) d_liw = driver.mem_alloc(liw.size * liw.dtype.itemsize) d_lrw = driver.mem_alloc(lrw.size * lrw.dtype.itemsize) d_itol = driver.mem_alloc(itol.size * itol.dtype.itemsize) d_iopt = driver.mem_alloc(iopt.size * iopt.dtype.itemsize) d_rtol = driver.mem_alloc(rtol.size * rtol.dtype.itemsize) d_iout = driver.mem_alloc(iout.size * iout.dtype.itemsize) d_tout = driver.mem_alloc(tout.size * tout.dtype.itemsize) d_itask = driver.mem_alloc(itask.size * itask.dtype.itemsize) d_istate = driver.mem_alloc(istate.size * istate.dtype.itemsize) d_y = driver.mem_alloc(y.size * y.dtype.itemsize) d_atol = driver.mem_alloc(atol.size * atol.dtype.itemsize) d_iwork = driver.mem_alloc(iwork.size * iwork.dtype.itemsize) d_rwork = driver.mem_alloc(rwork.size * rwork.dtype.itemsize) # copy to device driver.memcpy_htod(d_t, t) driver.memcpy_htod(d_jt, jt) driver.memcpy_htod(d_neq, neq) driver.memcpy_htod(d_liw, liw) driver.memcpy_htod(d_lrw, lrw) driver.memcpy_htod(d_itol, itol) driver.memcpy_htod(d_iopt, iopt) driver.memcpy_htod(d_rtol, rtol) driver.memcpy_htod(d_iout, iout) driver.memcpy_htod(d_tout, tout) driver.memcpy_htod(d_itask, itask) driver.memcpy_htod(d_istate, istate) driver.memcpy_htod(d_y, y) driver.memcpy_htod(d_atol, atol) driver.memcpy_htod(d_iwork, iwork) driver.memcpy_htod(d_rwork, rwork) param = np.zeros((total_threads, self._parameterNumber), dtype=np.float32) try: for i in range(len(parameters)): for j in range(self._parameterNumber): param[i][j] = parameters[i][j] except IndexError: pass # parameter texture ary = sim.create_2D_array(param) sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4, total_threads) self._param_tex.set_array(ary) if self._dt <= 0: for i in range(self._resultNumber): for j in range(total_threads): tout[j] = self._timepoints[i] driver.memcpy_htod(d_tout, tout) self._compiledRunMethod(d_neq, d_y, d_t, d_tout, d_itol, d_rtol, d_atol, d_itask, d_istate, d_iopt, d_rwork, d_lrw, d_iwork, d_liw, d_jt, block=(threads, 1, 1), grid=(blocks, 1)) driver.memcpy_dtoh(t, d_t) driver.memcpy_dtoh(y, d_y) driver.memcpy_dtoh(istate, d_istate) for j in range(total_threads): for k in range(self._speciesNumber): ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k] if istate[j] < 0: ret_istate[j] = 0 # end of loop over time points else: tt = self._timepoints[0] for i in range(self._resultNumber): while 1: next_time = min(tt + self._dt, self._timepoints[i]) for j in range(total_threads): tout[j] = next_time driver.memcpy_htod(d_tout, tout) self._compiledRunMethod(d_neq, d_y, d_t, d_tout, d_itol, d_rtol, d_atol, d_itask, d_istate, d_iopt, d_rwork, d_lrw, d_iwork, d_liw, d_jt, block=(threads, 1, 1), grid=(blocks, 1)) driver.memcpy_dtoh(t, d_t) driver.memcpy_dtoh(y, d_y) driver.memcpy_dtoh(istate, d_istate) if np.abs(next_time - self._timepoints[i]) < 1e-5: tt = next_time break tt = next_time for j in range(total_threads): for k in range(self._speciesNumber): ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k] if istate[j] < 0: ret_istate[j] = 0 # loop over and check ret_istate # it will will be zero if there was problems for j in range(total_threads): if ret_istate[j] == 0: for i in range(self._resultNumber): for k in range(self._speciesNumber): ret_xt[j, 0, i, k] = float('NaN') return ret_xt[0:experiments]
print "Loading matrices to device..." # Matrices dimensions N = 8192 A_rows = N A_cols = N B_cols = N # A_rows = 784 # A_cols = 1024 # B_cols = 4096 # A is a matrix randomly filled with 1 and -1 A = sign(np.random.randn(A_rows,A_cols)) A = A.astype(np.float32) A_gpu = cuda.mem_alloc(A.nbytes) cuda.memcpy_htod(A_gpu, A) # B is a matrix randomly filled with 1 and -1 B = sign(np.random.randn(A_cols,B_cols)) B = B.astype(np.float32) B_gpu = cuda.mem_alloc(B.nbytes) cuda.memcpy_htod(B_gpu, B) # C is the resulting matrix C1 = np.zeros((A_rows,B_cols)).astype(np.float32) C2 = np.zeros((A_rows,B_cols)).astype(np.float32) C_gpu = cuda.mem_alloc(C1.nbytes) print "XNOR kernel..." # wait until the GPU is done with the work
feq_g6 = cuda.mem_alloc(rho.size * rho.dtype.itemsize) feq_g7 = cuda.mem_alloc(rho.size * rho.dtype.itemsize) feq_g8 = cuda.mem_alloc(rho.size * rho.dtype.itemsize) rho_g = cuda.mem_alloc(rho.size * rho.dtype.itemsize) taus_g = cuda.mem_alloc(tauS.size * tauS.dtype.itemsize) u_g = cuda.mem_alloc(u.nbytes) c_g = cuda.mem_alloc(c.nbytes) t_g = cuda.mem_alloc(t.nbytes) #fpost_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize) #Pinning memory for faster transfers between cpu and gpu #fin_pin = cuda.register_host_memory(fin) #this didnt seem to make difference, so using fin only for transfers cuda.memcpy_htod(fin_g0, fin[0]) cuda.memcpy_htod(fin_g1, fin[1]) cuda.memcpy_htod(fin_g2, fin[2]) cuda.memcpy_htod(fin_g3, fin[3]) cuda.memcpy_htod(fin_g4, fin[4]) cuda.memcpy_htod(fin_g5, fin[5]) cuda.memcpy_htod(fin_g6, fin[6]) cuda.memcpy_htod(fin_g7, fin[7]) cuda.memcpy_htod(fin_g8, fin[8]) cuda.memcpy_htod(feq_g0, fin[0]) cuda.memcpy_htod(feq_g1, fin[1]) cuda.memcpy_htod(feq_g2, fin[2]) cuda.memcpy_htod(feq_g3, fin[3]) cuda.memcpy_htod(feq_g4, fin[4]) cuda.memcpy_htod(feq_g5, fin[5])
def expand_cuda(sourceImage): sourceImage = np.float32(sourceImage) # Perform separable convolution on sourceImage using CUDA. destImage = sourceImage.copy() destImage = np.float32(destImage) fil = np.zeros((3, 3)) (imageHeight, imageWidth) = destImage.shape # print(imageWidth,imageHeight) fil = np.float32(fil) DATA_H = imageHeight DATA_W = imageWidth DATA_H = np.int32(DATA_H) DATA_W = np.int32(DATA_W) # Prepare device arrays sourceImage_gpu = cuda.mem_alloc_like(sourceImage) fil_gpu = cuda.mem_alloc_like(fil) destImage_gpu = cuda.mem_alloc_like(sourceImage) cuda.memcpy_htod(sourceImage_gpu, sourceImage) cuda.memcpy_htod(fil_gpu, fil) minGPU(destImage_gpu, sourceImage_gpu, fil_gpu, DATA_W, DATA_H, block=(imageHeight, 1, 1), grid=(1, imageWidth)) # Pull the data back from the GPU. cuda.memcpy_dtoh(destImage, destImage_gpu) return destImage sourceImage = np.float32(sourceImage) # Perform separable convolution on sourceImage using CUDA. destImage = sourceImage.copy() destImage = np.float32(destImage) fil = np.zeros((3, 3)) (imageHeight, imageWidth) = destImage.shape # print(imageWidth,imageHeight) fil = np.float32(fil) DATA_H = imageHeight DATA_W = imageWidth DATA_H = np.int32(DATA_H) DATA_W = np.int32(DATA_W) # Prepare device arrays sourceImage_gpu = cuda.mem_alloc_like(sourceImage) fil_gpu = cuda.mem_alloc_like(fil) destImage_gpu = cuda.mem_alloc_like(sourceImage) cuda.memcpy_htod(sourceImage_gpu, sourceImage) cuda.memcpy_htod(fil_gpu, fil) minGPU(destImage_gpu, sourceImage_gpu, fil_gpu, DATA_W, DATA_H, block=(imageHeight, 1, 1), grid=(1, imageWidth)) # Pull the data back from the GPU. cuda.memcpy_dtoh(destImage, destImage_gpu) return destImage
def initB(CR, npoints, ncoeffsLO, ncoeffsHI, R_SS): pycuda.tools.clear_context_caches() # Read in g and h coefficients as 2D arrays g2D = np.zeros([ncoeffsLO+1, ncoeffsLO+1]) h2D = np.zeros([ncoeffsLO+1, ncoeffsLO+1]) #fname = 'CR' + str(CR) + '_MDI' + str(ncoeffs) + '.dat' fname = '/home/cdkay/MagnetoPickles/SCScoeffs'+ str(CR) + '.pkl' f1 = open(fname, 'rb') GH = pickle.load(f1) f1.close() gLO2D = GH[0] hLO2D = GH[1] gHI2D = GH[2] hHI2D = GH[3] # Convert to 1D arrays global gLO, hLO, gHI, hHI nelemsLO = np.sum(range(ncoeffsLO+2)) gLO = np.zeros(nelemsLO) hLO = np.zeros(nelemsLO) nelemsHI = np.sum(range(ncoeffsHI+2)) gHI = np.zeros(nelemsHI) hHI = np.zeros(nelemsHI) # No more normalizing here for l in range(ncoeffsLO+1): for m in range(ncoeffsLO+1): myID = m * (ncoeffsLO+1) - int(np.sum(range(m))) + l - m gLO[myID] = gLO2D[l,m] for l in range(ncoeffsLO+1): for m in range(ncoeffsLO+1): myID = m * (ncoeffsLO+1) - int(np.sum(range(m))) + l - m if m==0: hLO[myID] = 0 else: hLO[myID] = hLO2D[l,m] for l in range(ncoeffsHI+1): for m in range(ncoeffsHI+1): myID = m * (ncoeffsHI+1) - int(np.sum(range(m))) + l - m gHI[myID] = gHI2D[l,m] for l in range(ncoeffsHI+1): for m in range(ncoeffsHI+1): myID = m * (ncoeffsHI+1) - int(np.sum(range(m))) + l - m if m==0: hHI[myID] = 0 else: hHI[myID] = hHI2D[l,m] # Allocate g/h space on GPU and transfer LO (assume HI <= than in memory) gLO = gLO.astype(np.float32) hLO = hLO.astype(np.float32) gHI = gHI.astype(np.float32) hHI = hHI.astype(np.float32) global g_gpu, h_gpu g_gpu = cuda.mem_alloc(gLO.nbytes) cuda.memcpy_htod(g_gpu, gLO) h_gpu = cuda.mem_alloc(hLO.nbytes) cuda.memcpy_htod(h_gpu, hLO) # Set up memory for r, colat, lon global theta, theta_gpu, r, r_gpu, phi, phi_gpu theta = np.zeros(npoints) theta = theta.astype(np.float32) theta_gpu = cuda.mem_alloc(theta.nbytes) r = np.zeros(npoints) r = r.astype(np.float32) r_gpu = cuda.mem_alloc(r.nbytes) phi = np.zeros(npoints) phi = phi.astype(np.float32) phi_gpu = cuda.mem_alloc(phi.nbytes) # Set up memory for atomic sum and output a = np.zeros(ncoeffsLO+1) a = a.astype(np.float32) global ssize ssize = npoints * a.nbytes # Set up outs to contain Br, Btheta, Bphi and Br for each point global outs, outs_gpu outs = np.zeros([4*npoints]) outs = outs.astype(np.float32) outs_gpu = cuda.mem_alloc(outs.nbytes) cuda.memcpy_htod(outs_gpu, outs) # Set up justB to contain only B (less copying if not needed) global justB, justB_gpu justB = np.zeros([npoints]) justB = justB.astype(np.float32) justB_gpu = cuda.mem_alloc(justB.nbytes) cuda.memcpy_htod(justB_gpu, justB) global RSS_gpu # ratio of rstar to rsun RSSar = np.array(R_SS, dtype=np.float32) RSS_gpu = cuda.mem_alloc(RSSar.size * RSSar.dtype.itemsize) cuda.memcpy_htod(RSS_gpu, RSSar) # Get GPU kernel global func func = mod.get_function("PFSS_kern") # Calculate magnetic field at 2.5 Rsun #B105 = np.zeros([180,360]) #Set up r and a global aSCS_gpu r = np.array([2.5]*npoints) r = r.astype(np.float32) r_gpu = cuda.mem_alloc(r.nbytes) cuda.memcpy_htod(r_gpu, r) aSCS = np.array([0.2]*npoints) aSCS = aSCS.astype(np.float32) aSCS_gpu = cuda.mem_alloc(aSCS.nbytes) cuda.memcpy_htod(aSCS_gpu, aSCS)
print("Beta function Random Sampler using curand_kernel.h") print("g_a ~ Gamma[a], beta is generated by g_a/(g_a + g_b)") print("********************************************") alpha = 1.2 beta = 1.3 nw = 1 nt = 100000 nq = 1 nb = nw * nt * nq sharedsize = 0 #byte x = np.zeros(nb) x = x.astype(np.float32) dev_x = cuda.mem_alloc(x.nbytes) cuda.memcpy_htod(dev_x, x) source_module = gabcrm_module() pkernel = source_module.get_function("betagen") pkernel(dev_x, np.float32(alpha), np.float32(beta), block=(int(nw), 1, 1), grid=(int(nt), int(nq)), shared=sharedsize) cuda.memcpy_dtoh(x, dev_x) plt.hist(x, bins=30, density=True) xl = np.linspace(betafunc.ppf(0.001, alpha, beta), betafunc.ppf(0.999, alpha, beta), 100) plt.plot(xl, betafunc.pdf(xl, alpha, beta))
# Q * [r1,r2] print("********************************************") print("Gaussian Random Sampler using curand_kernel.h") print("********************************************") nw = 1 nt = 100000 nq = 1 nb = nw * nt * nq sharedsize = 0 #byte x1 = np.zeros(nb) x1 = x1.astype(np.float32) dev_x1 = cuda.mem_alloc(x1.nbytes) cuda.memcpy_htod(dev_x1, x1) x2 = np.zeros(nb) x2 = x2.astype(np.float32) dev_x2 = cuda.mem_alloc(x2.nbytes) cuda.memcpy_htod(dev_x2, x2) source_module = norm2d_module() pkernel = source_module.get_function("norm2d") pkernel(dev_x1, dev_x2, np.float32(a), np.float32(b), np.float32(c), block=(int(nw), 1, 1), grid=(int(nt), int(nq)),
def calcB(R_in, Lat_in, Lon_in, dlon, npoints, LOorHI, ncoeffs, aSCS_in): if LOorHI == 0: cuda.memcpy_htod(g_gpu, gLO) cuda.memcpy_htod(h_gpu, hLO) if LOorHI == 1: cuda.memcpy_htod(g_gpu, gHI) cuda.memcpy_htod(h_gpu, hHI) #ncoeffs = 90 #Transfer the thetas and phis to GPU r = np.array([R_in] * npoints, dtype=np.float32) theta = np.array([(90. - Lat_in) ] * npoints, dtype=np.float32) * dtor Lons = [Lon_in + dlon * i for i in range(npoints)] #print Lons phi = np.array(Lons, dtype=np.float32) * dtor cuda.memcpy_htod(theta_gpu, theta) cuda.memcpy_htod(phi_gpu, phi) cuda.memcpy_htod(r_gpu, r) cuda.memcpy_htod(aSCS_gpu, np.array([aSCS_in], dtype=np.float32)) a = np.zeros(ncoeffs+1) a = a.astype(np.float32) ssize = a.nbytes * npoints func = mod.get_function("PFSS_kern") # Theta and phi should be in rads going into the kernel # R should be in solar radii func(theta_gpu, phi_gpu, r_gpu, g_gpu, h_gpu, outs_gpu, justB_gpu, aSCS_gpu, block=(ncoeffs+1, 1, 1), grid=(npoints,1,1), shared=ssize) cuda.memcpy_dtoh(outs, outs_gpu) result = outs#[] #for i in range(npoints): # temp = SPHVEC2CART(Lat_in, Lons[i], outs[0 + 4 * i], outs[1 + 4 * i], outs[2 + 4 * i]) # result.append(temp[0]) # result.append(temp[1]) # result.append(temp[2]) # result.append(outs[3 + 4 * i]) return result
def CudaNegative(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void ng( float *inIm, int check ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { inIm[idx*3]= 255-inIm[idx*3]; inIm[idx*3+1]= 255-inIm[idx*3+1]; inIm[idx*3+2]= 255-inIm[idx*3+2]; } } """ #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("ng") func(d_px, checkSize, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() ngPx = numpy.empty_like(px) cuda.memcpy_dtoh(ngPx, d_px) ngPx = (numpy.uint8(ngPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(ngPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Negative image" print "Image size: ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
def Encrypt(): #Initialize perf_timer perf_timer = np.zeros(4) overall_time = perf_counter() # Read image & clear temp directories img, dim, misc_timer = PreProcess() # Resize image for Arnold Mapping misc_timer[3] = perf_counter() if dim[0]!=dim[1]: N = max(dim[0], dim[1]) img = cv2.resize(img,(N,N), interpolation=cv2.INTER_CUBIC) dim = img.shape # Calculate no. of rounds rounds = randint(8,16) misc_timer[3] = perf_counter() - misc_timer[3] # Flatten image to vector,transfer to GPU temp_timer = perf_counter() imgArr = np.asarray(img).reshape(-1) gpuimgIn = cuda.mem_alloc(imgArr.nbytes) gpuimgOut = cuda.mem_alloc(imgArr.nbytes) cuda.memcpy_htod(gpuimgIn, imgArr) func = cf.mod.get_function("ArMapImg") misc_timer[1] += perf_counter() - temp_timer # Warm-Up GPU for accurate benchmarking if cfg.DEBUG_TIMER: funcTemp = cf.mod.get_function("WarmUp") funcTemp(grid=(1,1,1), block=(1,1,1)) # Log no. of rounds of ArMapping temp_timer = perf_counter() with open(cfg.LOG, 'a+') as f: f.write(str(rounds)+"\n") misc_timer[2] += perf_counter() - temp_timer # Perform Arnold Mapping perf_timer[0] = perf_counter() for i in range (max(rounds,5)): func(gpuimgIn, gpuimgOut, grid=(dim[0],dim[1],1), block=(3,1,1)) gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn perf_timer[0] = perf_counter() - perf_timer[0] if cfg.DEBUG_IMAGES: misc_timer[6] += cf.interImageWrite(gpuimgIn, "IN_1", len(imgArr), dim) # Fractal XOR Phase temp_timer = perf_counter() fractal, misc_timer[4] = cf.getFractal(dim[0]) fracArr = np.asarray(fractal).reshape(-1) gpuFrac = cuda.mem_alloc(fracArr.nbytes) cuda.memcpy_htod(gpuFrac, fracArr) func = cf.mod.get_function("FracXOR") misc_timer[4] = perf_counter() - temp_timer perf_timer[1] = perf_counter() func(gpuimgIn, gpuimgOut, gpuFrac, grid=(dim[0]*dim[1],1,1), block=(3,1,1)) perf_timer[1] = perf_counter() - perf_timer[1] gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn if cfg.DEBUG_IMAGES: misc_timer[6] += cf.interImageWrite(gpuimgIn, "IN_2", len(imgArr), dim) # Permutation: ArMap-based intra-row/column rotation perf_timer[2] = perf_counter() U = cf.genRelocVec(dim[0],dim[1],cfg.P1LOG, ENC=True) # Col-rotation | len(U)=n, values from 0->m V = cf.genRelocVec(dim[1],dim[0],cfg.P2LOG, ENC=True) # Row-rotation | len(V)=m, values from 0->n perf_timer[2] = perf_counter() - perf_timer[2] # Transfer rotation-vectors to GPU misc_timer[5] = perf_counter() gpuU = cuda.mem_alloc(U.nbytes) gpuV = cuda.mem_alloc(V.nbytes) cuda.memcpy_htod(gpuU, U) cuda.memcpy_htod(gpuV, V) func = cf.mod.get_function("Enc_GenCatMap") misc_timer[5] = perf_counter() - misc_timer[5] # Perform permutation perf_timer[3] = perf_counter() for i in range(cfg.PERM_ROUNDS): func(gpuimgIn, gpuimgOut, gpuU, gpuV, grid=(dim[0],dim[1],1), block=(3,1,1)) gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn perf_timer[3] = perf_counter() - perf_timer[3] if cfg.DEBUG_IMAGES: misc_timer[6] += cf.interImageWrite(gpuimgIn, "IN_3", len(imgArr), dim) # Transfer vector back to host and reshape into encrypted output temp_timer = perf_counter() cuda.memcpy_dtoh(imgArr, gpuimgIn) img = (np.reshape(imgArr,dim)).astype(np.uint8) cv2.imwrite(cfg.ENC_OUT, img) misc_timer[1] += perf_counter() - temp_timer # Print timing statistics if cfg.DEBUG_TIMER: overall_time = perf_counter() - overall_time perf = np.sum(perf_timer) misc = np.sum(misc_timer) print("\nTarget: {} ({}x{})".format(cfg.ENC_IN, dim[1], dim[0])) print("\nPERF. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(perf, perf/overall_time*100)) print("ArMap Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[0], perf_timer[0]/overall_time*100)) print("XOR Kernel: \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[1], perf_timer[1]/overall_time*100)) print("Shuffle Gen: \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[2], perf_timer[2]/overall_time*100)) print("Perm. Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[3], perf_timer[3]/overall_time*100)) print("\nMISC. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(misc, misc/overall_time*100)) print("Dir. Cleanup:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[0], misc_timer[0]/overall_time*100)) print("I/O:\t\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[1], misc_timer[1]/overall_time*100)) print("Logging:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[2], misc_timer[2]/overall_time*100)) print("ArMap Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[3], misc_timer[3]/overall_time*100)) print("FracXOR Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[4], misc_timer[4]/overall_time*100)) print("Permute Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[5], misc_timer[5]/overall_time*100)) if cfg.DEBUG_IMAGES: print("Debug Images:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[6], misc_timer[6]/overall_time*100)) print("\nNET TIME:\t{0:7.5f}s\n".format(overall_time))
def build_sparse_transition_model(filename = 'Transition_dict', n_actions = 16, nt = None, dt =None, F =None, startpos = None, endpos = None, Test_grid =False): global state_list global base_path global save_path print("Building Sparse Model") t1 = time.time() #setup grid print("input to build_sparse_trans_model:\n") print("n_actions", n_actions) print("nt, dt", nt, dt) g, xs, ys, X, Y, vel_field_data, nmodes, num_rzns, path_mat, setup_params, setup_param_str = setup_grid(num_actions = n_actions, nt = nt, Test_grid= Test_grid) print("xs: ",xs) print("ys", ys) all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi = vel_field_data check_nt, check_nrzns, nmodes = all_Yi.shape all_u_mat = all_u_mat.astype(np.float32) all_v_mat = all_v_mat.astype(np.float32) all_ui_mat = all_ui_mat.astype(np.float32) all_vi_mat = all_vi_mat.astype(np.float32) all_Yi = all_Yi.astype(np.float32) #setup_params = [num_actions, nt, dt, F, startpos, endpos] reference from setup grid nT = setup_params[1] # total no. of time steps TODO: check default value print("****CHECK: ", nt, nT, check_nt) # assert (nt == nT), "nt and nT are not the same!" #if nt specified in runner is within nT from param file, then use nt. i.e. model will be built for nt timesteps. if nt != None and nt <= nT: nT = nt is_stationary = 0 # 0 is false. any other number is true. is_stationry = 0 (false) means that flow is NOT stationary # and S2 will be indexed by T+1. if is_stationary = x (true), then S2 is indexed by 0, same as S1. # list_size = 10 #predefined size of list for each S2 # if nt > 1: # is_stationary = 0 gsize = g.ni # size of grid along 1 direction. ASSUMING square grid. num_actions = setup_params[0] nrzns = num_rzns bDimx = nrzns # for small test cases if nrzns>=1000: bDimx = 1000 #for large problems dt = setup_params[2] F = setup_params[3] r_outbound = g.r_outbound r_terminal = g.r_terminal i_term = g.endpos[0] # terminal state indices j_term = g.endpos[1] #name of output pickle file containing transtion prob in dictionary format if nT > 1: prefix = '3D_' + str(nT) + 'nT_a' else: prefix = '2D_a' filename = filename + prefix + str(n_actions) #TODO: change filename base_path = join(ROOT_DIR,'DP/Trans_matxs_3D/') save_path = base_path + filename if exists(save_path): print("Folder Already Exists !!\n") return # TODO: remove z from params. it is only for chekcs z=-9999 params = np.array( [gsize, num_actions, nrzns, F, dt, r_outbound, r_terminal, nmodes, i_term, j_term, nT, is_stationary, z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z,z]).astype( np.float32) st_sp_size = (gsize ** 2) # size of spatial state space print("check stsp_size", gsize, nT, st_sp_size) save_file_for_each_a = False print("params") print("gsize ", params[0], "\n", "num_actions ", params[1], "\n", "nrzns ", params[2], "\n", "F ", params[3], "\n", "dt ", params[4], "\n", "r_outbound ", params[5], "\n", "r_terminal ", params[6], "\n", "nmodes ", params[7], "\n", "i_term ", params[8], "\n", "j_term ", params[9], "\n", "nT", params[10], "\n", "is_stationary ", params[11], "" ) # cpu initialisations. # dummy intialisations to copy size to gpu # vxrzns = np.zeros((nrzns, gsize, gsize), dtype=np.float32) # vyrzns = np.zeros((nrzns, gsize, gsize), dtype=np.float32) results = -1 * np.ones(((gsize ** 2) * nrzns), dtype=np.float32) sumR_sa = np.zeros(st_sp_size).astype(np.float32) Tdummy = np.zeros(2, dtype = np.float32) # informational initialisations ac_angles = np.linspace(0, 2 * pi, num_actions, endpoint = False, dtype=np.float32) print("action angles:\n", ac_angles) ac_angle = ac_angles[0].astype(np.float32) # just for allocating memory # xs = np.arange(gsize, dtype=np.float32) # ys = np.arange(gsize, dtype=np.float32) xs = xs.astype(np.float32) ys = ys.astype(np.float32) print("params: \n", params, "\n\n") t1 = time.time() # allocates memory on gpu. vxrzns and vyrzns nees be allocated just once and will be overwritten for each timestep # vxrzns_gpu = cuda.mem_alloc(vxrzns.nbytes) # vyrzns_gpu = cuda.mem_alloc(vyrzns.nbytes) all_u_mat_gpu = cuda.mem_alloc(all_u_mat.nbytes) all_v_mat_gpu = cuda.mem_alloc(all_v_mat.nbytes) all_ui_mat_gpu = cuda.mem_alloc(all_ui_mat.nbytes) all_vi_mat_gpu = cuda.mem_alloc(all_vi_mat.nbytes) all_Yi_gpu = cuda.mem_alloc(all_Yi.nbytes) vel_data_gpu = [all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu] ac_angles_gpu = cuda.mem_alloc(ac_angles.nbytes) ac_angle_gpu = cuda.mem_alloc(ac_angle.nbytes) xs_gpu = cuda.mem_alloc(xs.nbytes) ys_gpu = cuda.mem_alloc(ys.nbytes) params_gpu = cuda.mem_alloc(params.nbytes) T_gpu = cuda.mem_alloc(Tdummy.nbytes) # copies contents of a to allocated memory on gpu cuda.memcpy_htod(all_u_mat_gpu, all_u_mat) cuda.memcpy_htod(all_v_mat_gpu, all_v_mat) cuda.memcpy_htod(all_ui_mat_gpu, all_ui_mat) cuda.memcpy_htod(all_vi_mat_gpu, all_vi_mat) cuda.memcpy_htod(all_Yi_gpu, all_Yi) cuda.memcpy_htod(ac_angle_gpu, ac_angle) cuda.memcpy_htod(xs_gpu, xs) cuda.memcpy_htod(ys_gpu, ys) cuda.memcpy_htod(params_gpu, params) for T in range(nT): print("*** Computing data for timestep, T = ", T, '\n') # params[7] = T # cuda.memcpy_htod(params_gpu, params) Tdummy[0] = T # Load Velocities # vxrzns = np.zeros((nrzns, gsize, gsize), dtype = np.float32) # #expectinf to see probs of 0.5 in stream area # for i in range(int(nrzns/2)): # vxrzns[i,int(gsize/2 -1):int(gsize/2 +1),:] = 1 # vyrzns = np.zeros((nrzns, gsize, gsize), dtype = np.float32) # vxrzns = np.load('/home/rohit/Documents/Research/ICRA_2020/DDDAS_2D_Highway/Input_data_files/Velx_5K_rlzns.npy') # vyrzns = np.load('/home/rohit/Documents/Research/ICRA_2020/DDDAS_2D_Highway/Input_data_files/Vely_5K_rlzns.npy') # vxrzns = Vx_rzns # vyrzns = Vy_rzns # vxrzns = vxrzns.astype(np.float32) # vyrzns = vyrzns.astype(np.float32) Tdummy = Tdummy.astype(np.float32) # TODO: sanity check on dimensions: compare loaded matrix shape with gsize, numrzns # copy loaded velocities to gpu # cuda.memcpy_htod(vxrzns_gpu, vxrzns) # cuda.memcpy_htod(vyrzns_gpu, vyrzns) cuda.memcpy_htod(T_gpu, Tdummy) print("pre func") coo_list_a, Rs_list_a = build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu, xs_gpu, ys_gpu, ac_angles, results, sumR_sa, save_file_for_each_a=False) # print("R_s_a0 \n", Rs_list_a[0][0:200]) print("post func") # TODO: end loop over timesteps here and comcatenate COOs and R_sas over timesteps for each action # full_coo_list and full_Rs_list are lists with each element containing coo and R_s for an action of the same index if T > 0: full_coo_list_a, full_Rs_list_a = concatenate_results_across_time(coo_list_a, Rs_list_a, full_coo_list_a, full_Rs_list_a) # TODO: finish concatenate...() function else: full_coo_list_a = coo_list_a full_Rs_list_a = Rs_list_a t2 = time.time() build_time = t2 - t1 print("build_time ", build_time) #save data to file # data = setup_params, setup_param_str, g.reward_structure, build_time # write_files(full_coo_list_a, filename + '_COO', data) # print("Pickled sparse files !") #build probability transition dictionary state_list = g.ac_state_space() init_transition_dict = initialise_dict(g) transition_dict = convert_COO_to_dict(init_transition_dict, g, full_coo_list_a, full_Rs_list_a) print("conversion COO to dict done") #save dictionary to file data = setup_params, setup_param_str, g.reward_structure, build_time write_files(transition_dict, filename, data) pickleFile(full_coo_list_a, save_path + '/' + filename + '_COO') pickleFile(full_Rs_list_a, save_path + '/' + filename + '_Rsa')
def CudaBrightness(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void br( float *inIm, int check, int brightness ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { if(inIm[idx*3]+brightness > 255) inIm[idx*3] = 255; else inIm[idx*3]= inIm[idx*3]+brightness; if(inIm[idx*3+1]+brightness > 255) inIm[idx*3+1] = 255; else inIm[idx*3+1]= inIm[idx*3+1]+brightness; if(inIm[idx*3+2]+brightness > 255) inIm[idx*3+2] = 255; else inIm[idx*3+2]= inIm[idx*3+2]+brightness; } } """ brightness = int( raw_input("Enter the level of brightness (-255 to 255): ")) print if brightness > 255: brightness = 255 if brightness < -255: brightness = -255 brightness = numpy.int32(brightness) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("br") func(d_px, checkSize, brightness, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() brPx = numpy.empty_like(px) cuda.memcpy_dtoh(brPx, d_px) brPx = (numpy.uint8(brPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(brPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Brightness filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
def do_inference(self, image, score_threshold=0.4, top_k=10000, NMS_threshold=0.4, NMS_flag=True, skip_scale_branch_list=[]): if image.ndim != 3 or image.shape[2] != 3: print('Only RGB images are supported.') return None input_height = self.input_shape[2] input_width = self.input_shape[3] if image.shape[0] != input_height or image.shape[1] != input_width: logging.info( 'The size of input image is not %dx%d.\nThe input image will be resized keeping the aspect ratio.' % (input_height, input_width)) input_batch = numpy.zeros( (1, input_height, input_width, self.input_shape[1]), dtype=numpy.float32) left_pad = 0 top_pad = 0 if image.shape[0] / image.shape[1] > input_height / input_width: resize_scale = input_height / image.shape[0] input_image = cv2.resize(image, (0, 0), fx=resize_scale, fy=resize_scale) left_pad = int((input_width - input_image.shape[1]) / 2) input_batch[0, :, left_pad:left_pad + input_image.shape[1], :] = input_image else: resize_scale = input_width / image.shape[1] input_image = cv2.resize(image, (0, 0), fx=resize_scale, fy=resize_scale) top_pad = int((input_height - input_image.shape[0]) / 2) input_batch[0, top_pad:top_pad + input_image.shape[0], :, :] = input_image input_batch = input_batch.transpose([0, 3, 1, 2]) input_batch = numpy.array(input_batch, dtype=numpy.float32, order='C') self.inputs[0].host = input_batch [cuda.memcpy_htod(inp.device, inp.host) for inp in self.inputs] self.executor.execute(batch_size=self.engine.max_batch_size, bindings=self.bindings) [ cuda.memcpy_dtoh(output.host, output.device) for output in self.outputs ] outputs = [out.host for out in self.outputs] outputs = [ numpy.squeeze(output.reshape(shape)) for output, shape in zip(outputs, self.output_shapes) ] bbox_collection = [] for i in range(self.num_output_scales): if i in skip_scale_branch_list: continue score_map = numpy.squeeze(outputs[i * 2]) # show feature maps------------------------------- # score_map_show = score_map * 255 # score_map_show[score_map_show < 0] = 0 # score_map_show[score_map_show > 255] = 255 # cv2.imshow('score_map' + str(i), cv2.resize(score_map_show.astype(dtype=numpy.uint8), (0, 0), fx=2, fy=2)) # cv2.waitKey() bbox_map = numpy.squeeze(outputs[i * 2 + 1]) RF_center_Xs = numpy.array([ self.receptive_field_center_start[i] + self.receptive_field_stride[i] * x for x in range(score_map.shape[1]) ]) RF_center_Xs_mat = numpy.tile(RF_center_Xs, [score_map.shape[0], 1]) RF_center_Ys = numpy.array([ self.receptive_field_center_start[i] + self.receptive_field_stride[i] * y for y in range(score_map.shape[0]) ]) RF_center_Ys_mat = numpy.tile(RF_center_Ys, [score_map.shape[1], 1]).T x_lt_mat = RF_center_Xs_mat - bbox_map[0, :, :] * self.constant[i] y_lt_mat = RF_center_Ys_mat - bbox_map[1, :, :] * self.constant[i] x_rb_mat = RF_center_Xs_mat - bbox_map[2, :, :] * self.constant[i] y_rb_mat = RF_center_Ys_mat - bbox_map[3, :, :] * self.constant[i] x_lt_mat = x_lt_mat x_lt_mat[x_lt_mat < 0] = 0 y_lt_mat = y_lt_mat y_lt_mat[y_lt_mat < 0] = 0 x_rb_mat = x_rb_mat x_rb_mat[x_rb_mat > input_width] = input_width y_rb_mat = y_rb_mat y_rb_mat[y_rb_mat > input_height] = input_height select_index = numpy.where(score_map > score_threshold) for idx in range(select_index[0].size): bbox_collection.append( (x_lt_mat[select_index[0][idx], select_index[1][idx]] - left_pad, y_lt_mat[select_index[0][idx], select_index[1][idx]] - top_pad, x_rb_mat[select_index[0][idx], select_index[1][idx]] - left_pad, y_rb_mat[select_index[0][idx], select_index[1][idx]] - top_pad, score_map[select_index[0][idx], select_index[1][idx]])) # NMS bbox_collection = sorted(bbox_collection, key=lambda item: item[-1], reverse=True) if len(bbox_collection) > top_k: bbox_collection = bbox_collection[0:top_k] bbox_collection_numpy = numpy.array(bbox_collection, dtype=numpy.float32) bbox_collection_numpy = bbox_collection_numpy / resize_scale if NMS_flag: final_bboxes = NMS(bbox_collection_numpy, NMS_threshold) final_bboxes_ = [] for i in range(final_bboxes.shape[0]): final_bboxes_.append( (final_bboxes[i, 0], final_bboxes[i, 1], final_bboxes[i, 2], final_bboxes[i, 3], final_bboxes[i, 4])) return final_bboxes_ else: return bbox_collection_numpy
def CudaColor(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void co( float *inIm, int check, int color){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx*3 < check*3) { if(color == 0) { inIm[idx*3+1] = inIm[idx*3+1]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 1) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 2) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+1] = inIm[idx*3+1]-255; } if(inIm[idx*3] < 0) inIm[idx*3] = 0; if(inIm[idx*3] > 255) inIm[idx*3] = 255; if(inIm[idx*3+1] < 0) inIm[idx*3+1] = 0; if(inIm[idx*3+1] > 255) inIm[idx*3+1] = 255; if(inIm[idx*3+2] < 0) inIm[idx*3+2] = 0; if(inIm[idx*3+2] > 255) inIm[idx*3+2] = 255; } } """ color = int( raw_input("Enter the color of the filter (0-Red;1-Green;2-Blue): ")) print color = numpy.int32(color) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("co") func(d_px, checkSize, color, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() coPx = numpy.empty_like(px) cuda.memcpy_dtoh(coPx, d_px) coPx = (numpy.uint8(coPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(coPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Color Filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
def walk(comm, raw, slices, indices, nbrw, sorw, blockmin, blockmax, name, allLabels, smooth, uncertainty): rank = comm.Get_rank() size = comm.Get_size() if raw.dtype == 'uint8': kernel = _build_kernel_int8() raw = (raw - 128).astype('int8') else: kernel = _build_kernel_float32() raw = raw.astype(np.float32) foundAxis = [0] * 3 for k in range(3): if indices[k]: foundAxis[k] = 1 zsh, ysh, xsh = raw.shape fill_gpu = _build_kernel_fill() block = (32, 32, 1) x_grid = (xsh // 32) + 1 y_grid = (ysh // 32) + 1 grid2 = (int(x_grid), int(y_grid), int(zsh)) a = np.empty(raw.shape, dtype=np.float32) final = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8) segment_npy = np.empty(1, dtype=np.uint8) memory_error = False try: raw_gpu = gpuarray.to_gpu(raw) a_gpu = cuda.mem_alloc(a.nbytes) if smooth: update_gpu = _build_update_gpu() curvature_gpu = _build_curvature_gpu() b_gpu = gpuarray.zeros(raw.shape, dtype=np.float32) zshape = np.int32(zsh) yshape = np.int32(ysh) xshape = np.int32(xsh) sorw = np.int32(sorw) nbrw = np.int32(nbrw) slshape = [None] * 3 indices_gpu = [None] * 3 beta_gpu = [None] * 3 slices_gpu = [None] * 3 ysh = [None] * 3 xsh = [None] * 3 for k, found in enumerate(foundAxis): if found: indices_tmp = np.array(indices[k], dtype=np.int32) slices_tmp = slices[k].astype(np.int32) slices_tmp = reduceBlocksize(slices_tmp) slshape[k], ysh[k], xsh[k] = slices_tmp.shape indices_gpu[k] = gpuarray.to_gpu(indices_tmp) slices_gpu[k] = gpuarray.to_gpu(slices_tmp) Beta = np.zeros(slices_tmp.shape, dtype=np.float32) for m in range(slshape[k]): for n in allLabels: A = _calc_label_walking_area(slices_tmp[m], n) plane = indices_tmp[m] if k == 0: raw_tmp = raw[plane] if k == 1: raw_tmp = raw[:, plane] if k == 2: raw_tmp = raw[:, :, plane] Beta[m] += _calc_var(raw_tmp.astype(float), A) beta_gpu[k] = gpuarray.to_gpu(Beta) sendbuf = np.zeros(1, dtype=np.int32) recvbuf = np.zeros(1, dtype=np.int32) comm.Barrier() comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX) except Exception as e: print('Error: GPU out of memory. Data too large.') sendbuf = np.zeros(1, dtype=np.int32) + 1 recvbuf = np.zeros(1, dtype=np.int32) comm.Barrier() comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX) if recvbuf > 0: memory_error = True try: a_gpu.free() except: pass return memory_error, None, None, None if smooth: try: update_gpu = _build_update_gpu() curvature_gpu = _build_curvature_gpu() b_npy = np.zeros(raw.shape, dtype=np.float32) b_gpu = cuda.mem_alloc(b_npy.nbytes) cuda.memcpy_htod(b_gpu, b_npy) final_smooth = np.zeros((blockmax - blockmin, yshape, xshape), dtype=np.uint8) sendbuf_smooth = np.zeros(1, dtype=np.int32) recvbuf_smooth = np.zeros(1, dtype=np.int32) comm.Barrier() comm.Allreduce([sendbuf_smooth, MPI.INT], [recvbuf_smooth, MPI.INT], op=MPI.MAX) except Exception as e: print( 'Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.' ) sendbuf_smooth = np.zeros(1, dtype=np.int32) + 1 recvbuf_smooth = np.zeros(1, dtype=np.int32) comm.Barrier() comm.Allreduce([sendbuf_smooth, MPI.INT], [recvbuf_smooth, MPI.INT], op=MPI.MAX) if recvbuf_smooth > 0: smooth = 0 try: b_gpu.free() except: pass if uncertainty: try: max_npy = np.zeros((3, ) + raw.shape, dtype=np.float32) max_gpu = cuda.mem_alloc(max_npy.nbytes) cuda.memcpy_htod(max_gpu, max_npy) kernel_uncertainty = _build_kernel_uncertainty() kernel_max = _build_kernel_max() sendbuf_uq = np.zeros(1, dtype=np.int32) recvbuf_uq = np.zeros(1, dtype=np.int32) comm.Barrier() comm.Allreduce([sendbuf_uq, MPI.INT], [recvbuf_uq, MPI.INT], op=MPI.MAX) except Exception as e: print( 'Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.' ) sendbuf_uq = np.zeros(1, dtype=np.int32) + 1 recvbuf_uq = np.zeros(1, dtype=np.int32) comm.Barrier() comm.Allreduce([sendbuf_uq, MPI.INT], [recvbuf_uq, MPI.INT], op=MPI.MAX) if recvbuf_uq > 0: uncertainty = False try: max_gpu.free() except: pass for label_counter, segment in enumerate(allLabels): print('%s:' % (name) + ' ' + str(label_counter + 1) + '/' + str(len(allLabels))) fill_gpu(a_gpu, xshape, yshape, block=block, grid=grid2) segment_gpu = np.int32(segment) segment_npy.fill(segment) for k, found in enumerate(foundAxis): if found: axis_gpu = np.int32(k) x_grid = (xsh[k] // 32) + 1 y_grid = (ysh[k] // 32) + 1 grid = (int(x_grid), int(y_grid), int(slshape[k])) kernel(axis_gpu, segment_gpu, raw_gpu, slices_gpu[k], a_gpu, xshape, yshape, zshape, indices_gpu[k], sorw, beta_gpu[k], nbrw, block=block, grid=grid) cuda.memcpy_dtoh(a, a_gpu) if size > 1: a = sendrecv(a, blockmin, blockmax, comm, rank, size) if smooth or uncertainty: cuda.memcpy_htod(a_gpu, a) if uncertainty: kernel_max(max_gpu, a_gpu, xshape, yshape, block=block, grid=grid2) if smooth: for k in range(smooth): curvature_gpu(a_gpu, b_gpu, xshape, yshape, block=block, grid=grid2) update_gpu(a_gpu, b_gpu, xshape, yshape, block=block, grid=grid2) a_smooth = np.empty_like(a) cuda.memcpy_dtoh(a_smooth, a_gpu) if label_counter == 0: a_smooth[a_smooth < 0] = 0 walkmap_smooth = np.copy(a_smooth) else: walkmap_smooth, final_smooth = max_to_label( a_smooth, walkmap_smooth, final_smooth, blockmin, blockmax, segment) if label_counter == 0: a[a < 0] = 0 walkmap = np.copy(a) else: walkmap, final = max_to_label(a, walkmap, final, blockmin, blockmax, segment) if uncertainty: kernel_uncertainty(max_gpu, a_gpu, xshape, yshape, block=block, grid=grid2) final_uncertainty = np.empty_like(a) cuda.memcpy_dtoh(final_uncertainty, a_gpu) final_uncertainty = final_uncertainty[blockmin:blockmax] else: final_uncertainty = None if not smooth: final_smooth = None try: a_gpu.free() except: pass return memory_error, final, final_uncertainty, final_smooth
def build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu, xs_gpu, ys_gpu, ac_angles, results, sumR_sa, save_file_for_each_a=False): gsize = int(params[0]) num_actions = int(params[1]) nrzns = int(params[2]) all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu = vel_data_gpu results_gpu_list = [] sumR_sa_gpu_list = [] for i in range(num_actions): results_gpu_list.append(cuda.mem_alloc(results.nbytes)) sumR_sa_gpu_list.append(cuda.mem_alloc(sumR_sa.nbytes)) for i in range(num_actions): cuda.memcpy_htod(results_gpu_list[i], results) cuda.memcpy_htod(sumR_sa_gpu_list[i], sumR_sa) print("alloted mem in inner func") # let one thread access a state centre. access coresponding velocities, run all actions # TODO: dt may not be int for a genral purpose code mod = SourceModule(""" __device__ int32_t get_thread_idx() // assigns idx to thread with which it accesses the flattened 3d vxrzns matrix // for a given T and a given action. // runs for both 2d and 3d grid // TODO: may have to change this considering cache locality { // here i, j, k refer to a general matrix M[i][j][k] int32_t i = threadIdx.x; int32_t j = blockIdx.y; int32_t k = blockIdx.x; int32_t idx = k + (j*gridDim.x) + (i*gridDim.x*gridDim.y)+ blockIdx.z*blockDim.x*gridDim.x*gridDim.y; return idx; } __device__ int32_t state1D_from_thread(int32_t T) { // j ~ blockIdx.x // i ~ blockIdx.y // The above three consitute a spatial state index from i and j of grid // last term is for including time index as well. return (blockIdx.x + (blockIdx.y*gridDim.x) + (T*gridDim.x*gridDim.y) ); } __device__ int32_t state1D_from_ij(int32_t* posid, int32_t T) { // posid = {i , j} // state id = j + i*dim(i) + T*dim(i)*dim(j) return (posid[1] + posid[0]*gridDim.x + (T*gridDim.x*gridDim.y) ) ; } __device__ bool is_edge_state(int32_t i, int32_t j) { // n = gsize -1 that is the last index of the domain assuming square domain int32_t n = gridDim.x - 1; if (i == 0 || i == n || j == 0 || j == n ) { return true; } else return false; } __device__ bool is_terminal(int32_t i, int32_t j, float* params) { int32_t i_term = params[8]; // terminal state indices int32_t j_term = params[9]; if(i == i_term && j == j_term) { return true; } else return false; } __device__ bool my_isnan(int s) { // By IEEE 754 rule, NaN is not equal to NaN return s != s; } __device__ void get_xypos_from_ij(int32_t i, int32_t j, float* xs, float* ys, float* x, float* y) { *x = xs[j]; *y = ys[gridDim.x - 1 - i]; return; } __device__ float get_angle_in_0_2pi(float theta) { float f_pi = 3.141592; if (theta < 0) { return theta + (2*f_pi); } else { return theta; } } __device__ float calculate_reward_const_dt(float* xs, float* ys, int32_t i_old, int32_t j_old, float xold, float yold, int32_t* newposids, float* params, float vnet_x, float vnet_y ) { // xold and yold are centre of old state (i_old, j_old) float dt = params[4]; float r1, r2, theta1, theta2, theta, h; float dt_new; float xnew, ynew; if (newposids[0] == i_old && newposids[1] == j_old) { dt_new = dt; } else { get_xypos_from_ij(newposids[0], newposids[1], xs, ys, &xnew, &ynew); //get centre of new states h = sqrtf((xnew - xold)*(xnew - xold) + (ynew - yold)*(ynew - yold)); r1 = h/(sqrtf((vnet_x*vnet_x) + (vnet_y*vnet_y))); theta1 = get_angle_in_0_2pi(atan2f(vnet_y, vnet_x)); theta2 = get_angle_in_0_2pi(atan2f(ynew - yold, xnew - xold)); theta = fabsf(theta1 -theta2); r2 = fabsf(sinf(theta)); dt_new = r1 + r2; if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[24] = r1; params[25] = r2; } } return -dt_new; } __device__ void move(float ac_angle, float vx, float vy, float* xs, float* ys, int32_t* posids, float* params, float* r ) { int32_t n = params[0] - 1; // gsize - 1 // int32_t num_actions = params[1]; // int32_t nrzns = params[2]; float F = params[3]; float dt = params[4]; float r_outbound = params[5]; float r_terminal = params[6]; float Dj = fabsf(xs[1] - xs[0]); float Di = fabsf(ys[1] - ys[0]); float r_step = 0; *r = 0; int32_t i0 = posids[0]; int32_t j0 = posids[1]; float vnetx = F*cosf(ac_angle) + vx; float vnety = F*sinf(ac_angle) + vy; float x, y; get_xypos_from_ij(i0, j0, xs, ys, &x, &y); // x, y stores centre coords of state i0,j0 float xnew = x + (vnetx * dt); float ynew = y + (vnety * dt); //checks TODO: remove checks once verified if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[12] = x; params[13] = y; params[14] = vnetx; params[15] = vnety; params[16] = xnew; params[17] = ynew; params[18] = ac_angle; } if (xnew > xs[n]) { xnew = xs[n]; *r += r_outbound; } else if (xnew < xs[0]) { xnew = xs[0]; *r += r_outbound; } if (ynew > ys[n]) { ynew = ys[n]; *r += r_outbound; } else if (ynew < ys[0]) { ynew = ys[0]; *r += r_outbound; } // TODO:xxDONE check logic wrt remainderf. remquof had issue int32_t xind, yind; //float remx = remquof((xnew - xs[0]), Dj, &xind); //float remy = remquof(-(ynew - ys[n]), Di, &yind); float remx = remainderf((xnew - xs[0]), Dj); float remy = remainderf(-(ynew - ys[n]), Di); xind = ((xnew - xs[0]) - remx)/Dj; yind = (-(ynew - ys[n]) - remy)/Di; if ((remx >= 0.5 * Dj) && (remy >= 0.5 * Di)) { xind += 1; yind += 1; } else if ((remx >= 0.5 * Dj && remy < 0.5 * Di)) { xind += 1; } else if ((remx < 0.5 * Dj && remy >= 0.5 * Di)) { yind += 1; } if (!(my_isnan(xind) || my_isnan(yind))) { posids[0] = yind; posids[1] = xind; if (is_edge_state(posids[0], posids[1])) //line 110 { *r += r_outbound; } if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[26] = 9999; } } r_step = calculate_reward_const_dt(xs, ys, i0, j0, x, y, posids, params, vnetx, vnety); //TODO: change back to normal when needed //r_step = -dt; *r += r_step; //TODO: numerical check remaining if (is_terminal(posids[0], posids[1], params)) { *r += r_terminal; } if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[19] = xnew; params[20] = ynew; params[21] = yind; params[22] = xind; params[23] = *r; //params[17] = ynew; //params[18] = ac_angle; } } __device__ void extract_velocity(float* vx, float* vy, int32_t T, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi, float* params) { int32_t nrzns = params[2]; int32_t nmodes = params[7]; int32_t sp_uvi, str_uvi, sp_Yi, str_Yi; //startpoints and strides for accessing all_ui_mat, all_vi_mat and all_Yi float sum_x = 0; float sum_y = 0; float vx_mean, vy_mean; //thread index. also used to access resultant vxrzns[nrzns, gsize, gsize] int32_t idx = get_thread_idx(); //rzn index to identify which of the 5k rzn it is. used to access all_Yi. int32_t rzn_id = (blockIdx.z * blockDim.x) + threadIdx.x ; //mean_id is the index used to access the flattened all_u_mat[t,i,j]. int32_t mean_id = state1D_from_thread(T); //to access all_ui_mat and all_vi_mat str_uvi = gridDim.x * gridDim.y; sp_uvi = (T * nmodes * str_uvi) + (gridDim.x * blockIdx.y) + (blockIdx.x); // to access all_Yi sp_Yi = (T * nrzns * nmodes) + (rzn_id * nmodes); vx_mean = all_u_mat[mean_id]; for(int i = 0; i < nmodes; i++) { sum_x += all_ui_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i]; } vy_mean = all_v_mat[mean_id]; for(int i = 0; i < nmodes; i++) { sum_y += all_vi_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i]; } *vx = vx_mean + sum_x; *vy = vy_mean + sum_y; return; } //test: changer from float* to float ac_angle __global__ void transition_calc(float* T_arr, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi, float ac_angle, float* xs, float* ys, float* params, float* sumR_sa, float* results) // resutls directions- 1: along S2; 2: along S1; 3: along columns towards count { int32_t gsize = params[0]; // size of grid along 1 direction. ASSUMING square grid. int32_t num_actions = params[1]; int32_t nrzns = params[2]; float F = params[3]; float dt = params[4]; float r_outbound = params[5]; float r_terminal = params[6]; int32_t nmodes = params[7]; int32_t i_term = params[8]; // terminal state indices int32_t j_term = params[9]; int32_t nT = params[10]; int32_t is_stationary = params[11]; int32_t T = (int32_t)T_arr[0]; int32_t idx = get_thread_idx(); float vx, vy; if(idx < gridDim.x*gridDim.y*nrzns) { int32_t posids[2] = {blockIdx.y, blockIdx.x}; //static declaration of array of size 2 to hold i and j values of S1. int32_t sp_id; //sp_id is space_id. S1%(gsize*gsize) // Afer move() these will be overwritten by i and j values of S2 float r; // to store immediate reward extract_velocity(&vx, &vy, T, all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi, params); //move(*ac_angle, vx, vy, xs, ys, posids, params, &r); move(ac_angle, vx, vy, xs, ys, posids, params, &r); int32_t S1, S2; if (is_stationary == 1) { T = 0; S1 = state1D_from_thread(T); //get init state number corresponding to thread id S2 = state1D_from_ij(posids, T); //get successor state number corresponding to posid and next timestep T+1 } else { S1 = state1D_from_thread(T); //get init state number corresponding to thread id S2 = state1D_from_ij(posids, T+1); //get successor state number corresponding to posid and next timestep T+1 sp_id = S1%(gsize*gsize); } //writing to sumR_sa. this array will later be divided by num_rzns, to get the avg float a = atomicAdd(&sumR_sa[sp_id], r); //TODO: try reduction if this is slow overall results[idx] = S2; __syncthreads(); /*if (threadIdx.x == 0 && blockIdx.z == 0) { sumR_sa[S1] = sumR_sa[S1]/nrzns; //TODO: change name to R_sa from sumR_sa since were not storing sum anymore } */ }//if ends return; } """) # sumR_sa2 = np.empty_like(sumR_sa, dtype = np.float32) # cuda.memcpy_dtoh(sumR_sa2, sumR_sa_gpu) # print("sumR_sa",sumR_sa) # print("sumR_sa",sumR_sa2[0:10001]) # T = np.array(T64, dtype = np.float32) params2 = np.empty_like(params).astype(np.float32) func = mod.get_function("transition_calc") for i in range(num_actions): print('T', T, " call kernel for action: ",i) func(T_gpu, all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu, ac_angles[i], xs_gpu, ys_gpu, params_gpu, sumR_sa_gpu_list[i], results_gpu_list[i], block=(bDimx, 1, 1), grid=(gsize, gsize, (nrzns // bDimx) + 1)) if i == 0: cuda.memcpy_dtoh(params2, params_gpu) print("params check:",) print( '\nangle= ', params2[18], '\nx =' ,params2[12], '\ny =' ,params2[13] , '\nvnetx = ',params2[14], '\nvnety =', params2[15], '\nxnew =', params2[16], '\nynew =', params2[17], '\nxnewupd =', params2[19], '\nynewupd =', params2[20], '\nyind i=', params2[21], '\nxind j=', params2[22], '\nr- =', params2[23], '\nr1+ =', params2[24], '\nr2+ =', params2[25], '\nenter_isnan =', params2[26] ) results2_list = [] sum_Rsa2_list = [] for i in range(num_actions): results2_list.append(np.empty_like(results)) sum_Rsa2_list.append(np.empty_like(sumR_sa)) # SYNCHRONISATION - pycuda does it implicitly. for i in range(num_actions): cuda.memcpy_dtoh(results2_list[i], results_gpu_list[i]) cuda.memcpy_dtoh(sum_Rsa2_list[i], sumR_sa_gpu_list[i]) print("memcpy_dtoh for action: ", i) for i in range(num_actions): sum_Rsa2_list[i] = sum_Rsa2_list[i] / nrzns # print("sumR_sa2\n",sumR_sa2,"\n\n") # print("results_a0\n",results2_list[0].T[50::int(gsize**2)]) print("OK REACHED END OF cuda relevant CODE\n") # make a list of inputs, each elelment for an action. and run parallal get_coo_ for each action # if save_file_for_each_a is true then each file must be named appopriately. if save_file_for_each_a == True: f1 = 'COO_Highway2D_T' + str(T) + '_a' f3 = '_of_' + str(num_actions) + 'A.npy' inputs = [(results2_list[i], nrzns, T, f1 + str(i) + f3) for i in range(num_actions)] else: inputs = [(results2_list[i], nrzns, T, None) for i in range(num_actions)] # coo_list_a is a list of coo for each each action for the given timestep. with Pool(num_actions) as p: coo_list_a = p.starmap(get_COO_, inputs) # print("coo print\n", coo.T[4880:4900, :]) print("\n\n") # print("time taken by cuda compute and transfer\n", (t2 - t1) / 60) # print("time taken for post processing to coo on cpu\n",(t3 - t2) / 60) return coo_list_a, sum_Rsa2_list
def image_iterator_gpu(image_volume, roi=None, radius=2, gray_levels=None, binwidth=None, dx=1, dy=0, dz=0, ndev=2, cadd=(0, 0, 0), sadd=3, csub=(0, 0, 0), ssub=3, i=0, feature_kernel='glcm_plugin_gpu', stat_name='glcm_stat_contrast_gpu'): """Uses PyCuda to parallelize the computation of the voxel-wise image entropy using a variable \ neighborhood radius Args: radius -- neighborhood radius; where neighborhood size is isotropic and calculated as 2*radius+1 """ # initialize cuda context cuda.init() cudacontext = cuda.Device(NVDEVICE).make_context() parent_dir = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(parent_dir, 'local_features.cuh'), mode='r') as f: cuda_template = Template(f.read()) roimask = None if isinstance(image_volume, np.ndarray): toBaseVolume = False logger.debug('recognized as an np.ndarray') if image_volume.ndim == 3: d, r, c = image_volume.shape elif image_volume.ndim == 2: d, r, c = (1, *image_volume.shape) image = image_volume.flatten() # # use stat based GLCM quantization # quantize_mode=QMODE_STAT else: toBaseVolume = True logger.debug('recognized as a BaseVolume') image = image_volume if roi: image = image.conformTo(roi.frameofreference) d, r, c = image.frameofreference.size[::-1] image = image.vectorize() # if not image_volume.modality.lower() == 'ct': # # use stat based GLCM quantization # quantize_mode=QMODE_STAT # mask to roi if (roi): roimask = roi.makeDenseMask().vectorize() logger.debug('d:{:d}, r:{:d}, c:{:d}'.format(d, r, c)) if d == 1: z_radius = 0 elif d > 1: z_radius = radius # enforce quantization mode selection # fixed_start, fixed_end = -150, 350 fixed_start, fixed_end = -175, 75 if gray_levels and binwidth: logger.exception( 'must exclusively specify "binwidth" or "gray_levels" to select glcm quantization mode' ) elif binwidth: quantize_mode = QMODE_FIXEDHU nbins = int(math.floor((fixed_end - fixed_start) / binwidth)) + 2 gray_levels = -1 elif gray_levels: quantize_mode = QMODE_STAT nbins = gray_levels binwidth = -1 else: # kernel doesn't use glcm quantize_mode = -1 nbins = 1 gray_levels = -1 binwidth = -1 maxrunlength = math.ceil( math.sqrt(2 * (radius * 2 + 1) * (radius * 2 + 1) + (z_radius * 2 + 1))) cuda_source = cuda_template.substitute({ 'RADIUS': radius, 'Z_RADIUS': z_radius, 'IMAGE_DEPTH': d, 'IMAGE_HEIGHT': r, 'IMAGE_WIDTH': c, 'QUANTIZE_MODE': quantize_mode, 'GRAY_LEVELS': gray_levels, 'FIXED_BINWIDTH': binwidth, 'FIXED_START': fixed_start, 'NBINS': nbins, 'MAXRUNLENGTH': maxrunlength, 'DX': dx, 'DY': dy, 'DZ': dz, 'NDEV': ndev, 'CADD_X': cadd[0], 'CADD_Y': cadd[1], 'CADD_Z': cadd[2], 'SADD': sadd, 'CSUB_X': csub[0], 'CSUB_Y': csub[1], 'CSUB_Z': csub[2], 'SSUB': ssub, 'KERNEL': feature_kernel, 'STAT': stat_name }) mod2 = SourceModule( cuda_source, options=['-I {!s}'.format(parent_dir), '-g', '-G', '-lineinfo']) func = mod2.get_function('image_iterator_gpu') # allocate image on device in global memory image = image.astype(np.float32) image_gpu = cuda.mem_alloc(image.nbytes) result = np.zeros_like(image) result_gpu = cuda.mem_alloc(result.nbytes) # transfer image to device cuda.memcpy_htod(image_gpu, image) cuda.memcpy_htod(result_gpu, result) # call device kernel blocksize = 256 gridsize = math.ceil(r * c * d / blocksize) func(image_gpu, result_gpu, block=(blocksize, 1, 1), grid=(gridsize, 1, 1)) # get result from device cuda.memcpy_dtoh(result, result_gpu) # detach from cuda context # cudacontext.synchronize() # cudacontext.detach() cudacontext.pop() # required to successfully free device memory for created context del cudacontext gc.collect() pycuda.tools.clear_context_caches() logger.debug('feature result shape: {!s}'.format(result.shape)) logger.debug('GPU done') # clean invalid values from result result = np.nan_to_num(result) if (roimask is not None): result = np.multiply(result, roimask) if d == 1: result = result.reshape(r, c) elif d > 1: result = result.reshape(d, r, c) if toBaseVolume: if roi: FOR = roi.frameofreference else: FOR = image_volume.frameofreference outvolume = MaskableVolume().fromArray(result, FOR) outvolume.modality = image_volume.modality return outvolume else: return result
def __init__(self, vol_bnds, voxel_size): # Define voxel volume parameters. self._vol_bnds = vol_bnds # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters self._voxel_size = voxel_size # in meters (determines volume discretization and resolution) self._trunc_margin = self._voxel_size * 5 # truncation on SDF # Adjust volume bounds. self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size).copy(order='C').astype( int) # ensure C-order contigous self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype( np.float32) # ensure C-order contigous print("Voxel volume size: {:d} x {:d} x {:d}".format( self._vol_dim[0], self._vol_dim[1], self._vol_dim[2])) # Initialize pointers to voxel volume in CPU memory. self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32) self._weight_vol_cpu = np.zeros(self._vol_dim).astype( np.float32 ) # for computing the cumulative moving average of observations per voxel self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) # Copy voxel volumes to GPU. if TSDF_GPU_MODE: self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes) cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu) self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes) cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu) self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes) cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu) # Cuda kernel function (C++) self._cuda_src_mod = SourceModule(""" __global__ void integrate(float * tsdf_vol, float * weight_vol, float * color_vol, float * vol_dim, float * vol_origin, float * cam_intr, float * cam_pose, float * other_params, float * color_im, float * depth_im) { // Get voxel index. int gpu_loop_idx = (int) other_params[0]; int max_threads_per_block = blockDim.x; int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x; int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block + block_idx * max_threads_per_block + threadIdx.x; int vol_dim_x = (int)vol_dim[0]; int vol_dim_y = (int)vol_dim[1]; int vol_dim_z = (int)vol_dim[2]; if (voxel_idx > vol_dim_x * vol_dim_y * vol_dim_z) return; // Get voxel grid coordinates. float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z))); float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z)); float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z); // Voxel grid coordinates to world coordinates. float voxel_size = other_params[1]; float pt_x = vol_origin[0] + voxel_x * voxel_size; float pt_y = vol_origin[1] + voxel_y * voxel_size; float pt_z = vol_origin[2] + voxel_z * voxel_size; // World coordinates to camera coordinates. float tmp_pt_x = pt_x - cam_pose[0*4+3]; float tmp_pt_y = pt_y - cam_pose[1*4+3]; float tmp_pt_z = pt_z - cam_pose[2*4+3]; float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z; float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z; float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z; // Camera coordinates to image pixels. int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]); int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]); // Skip if outside view frustum. int im_h = (int) other_params[2]; int im_w = (int) other_params[3]; if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0) return; // Skip invalid depth. float depth_value = depth_im[pixel_y*im_w+pixel_x]; if (depth_value == 0) return; // Integrate TSDF. float trunc_margin = other_params[4]; float depth_diff = depth_value-cam_pt_z; if (depth_diff < -trunc_margin) return; float dist = fmin(1.0f, depth_diff / trunc_margin); float w_old = weight_vol[voxel_idx]; float obs_weight = other_params[5]; float w_new = w_old + obs_weight; weight_vol[voxel_idx] = w_new; tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new; // Integrate color. float old_color = color_vol[voxel_idx]; float old_b = floorf(old_color / (256 * 256)); float old_g = floorf((old_color - old_b * 256 * 256) / 256); float old_r = old_color - old_b * 256 * 256 - old_g * 256; float new_color = color_im[pixel_y*im_w+pixel_x]; float new_b = floorf(new_color / (256 * 256)); float new_g = floorf((new_color - new_b * 256 * 256) / 256); float new_r = new_color - new_b * 256 * 256 - new_g * 256; new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f); new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f); new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f); color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r; }""") self._cuda_integrate = self._cuda_src_mod.get_function("integrate") # Determine block/grid size on GPU. gpu_dev = cuda.Device(0) self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK n_blocks = int( np.ceil( float(np.prod(self._vol_dim)) / float(self._max_gpu_threads_per_block))) grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks)))) grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))) grid_dim_z = min( gpu_dev.MAX_GRID_DIM_Z, int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y)))) self._max_gpu_grid_dim = np.array( [grid_dim_x, grid_dim_y, grid_dim_z]).astype(int) self._n_gpu_loops = int( np.ceil( float(np.prod(self._vol_dim)) / float( np.prod(self._max_gpu_grid_dim) * self._max_gpu_threads_per_block)))
def mh_sample_A(self, is_symmetric=False): """ Sample new adjacency matrix and relevant spike parents using MH Determine whether or not to propose a birth Choose an edge to propose Determine whether to accept using cuComputeProdQratio to do the parallel computation. If accept, use cuSampleSingleProcessZ to choose new parents """ N = self.base.data.N K = self.modelParams["proc_id_model", "K"] Ns = self.modelParams["proc_id_model", "Ns"] # Determine whether to propose a new edge or a removal of an existing edge op = MH_ADD if np.random.rand() < self.params["gamma"] else MH_DEL # Choose a row (ki) and column (kj) to update # They must be selected randomly, otherwise the transition probabilities # do not cancel properly as derived in the paper, and the distribution is # not left invariant after the MH operation # ki = np.random.randint(K) # # If this is a symmetric graph model, only choose from the upper diagonal # if is_symmetric: # kj = np.random.randint(ki,K) # else: # kj = np.random.randint(K) # Choose one of the unspecified edges in the graph if self.modelParams["graph_model", "mask"] == None: ki = np.random.randint(K) # If this is a symmetric graph model, only choose from the upper diagonal if is_symmetric: kj = np.random.randint(ki, K) else: kj = np.random.randint(K) else: (kis, kjs) = np.nonzero(self.modelParams["graph_model", "mask"] == -1) ind = np.random.randint(0, len(kis)) ki = kis[ind] kj = kjs[ind] assert self.modelParams["graph_model", "mask"][ki, kj] == -1 # # Check if this entry is set in the mask already # if self.modelParams["graph_model","mask"] != None: # if self.modelParams["graph_model","mask"][ki,kj] != -1: # return # Get the current weight for this entry currWBuffer = np.zeros((1, ), dtype=np.float32) cuda.memcpy_dtoh( currWBuffer, self.gpuPtrs["weight_model", "W"].ptr + int( (ki * K + kj) * currWBuffer.itemsize)) currW = currWBuffer[0] # Compute WGS using the current set of weights grid_w = int(np.ceil(np.float32(N) / self.params["blockSz"])) # startPerfTimer(perfDict, "computeWGSForAllSpikes") # log.info(self.gpuPtrs["graph_model","WGS"].get()) self.gpuKernels["computeWGSForAllSpikes"]( np.int32(K), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["impulse_model", "GS"].gpudata, self.base.dSS["colPtrs"].gpudata, self.base.dSS["rowIndices"].gpudata, self.gpuPtrs["weight_model", "W"].gpudata, self.gpuPtrs["graph_model", "A"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, block=(1024, 1, 1), grid=(grid_w, K)) # stopPerfTimer(perfDict, "computeWGSForAllSpikes") # Now determine whether or not to accept the change # If the proposal does not change the adjacency matrix # then this is easy, otherwise we enlist the GPU logQratio = 0.0 logPratio = 0.0 if op == MH_ADD: # We are proposing a new edge. Calculate WGS with the # new edge and determine the P and Q ratios # This now runs on all N spikes but only counts spikes affected by the new edge # namely, those on process kj self.gpuKernels["computeWGSForNewEdge"]( np.int32(ki), np.int32(kj), np.int32(K), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["impulse_model", "GS"].gpudata, self.base.dSS["colPtrs"].gpudata, self.base.dSS["rowIndices"].gpudata, self.gpuPtrs["weight_model", "W"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) if is_symmetric and ki != kj: self.gpuKernels["computeWGSForNewEdge"]( np.int32(kj), np.int32(ki), np.int32(K), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["impulse_model", "GS"].gpudata, self.base.dSS["colPtrs"].gpudata, self.base.dSS["rowIndices"].gpudata, self.gpuPtrs["weight_model", "W"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) # stopPerfTimer(perfDict, "computeWGSForNewEdge") # Compute the P ratio rho = self.getConditionalEdgePr(ki, kj) if rho == 1.0: logPratio = np.Inf elif rho == 0.0: logPratio = -np.Inf elif is_symmetric and ki != kj: logPratio = -1.0 * Ns[ki] * currW + -1.0 * Ns[ kj] * currW + np.log(rho) - np.log(1 - rho) else: logPratio = -1.0 * Ns[ki] * currW + np.log(rho) - np.log(1 - rho) # Compute the Q ratio if ki == kj and not self.params["allow_self_excitation"]: # Do not allow self-excitation logQratio = np.float64(-1.0 * np.Inf) else: logQratio = np.log(1 - self.params["gamma"]) - np.log( self.params["gamma"]) self.gpuKernels["computeProdQratio"]( np.int32(kj), np.int32(K), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, self.gpuPtrs["bkgd_model", "lam"].gpudata, np.int32(ki), np.int32(MH_ADD), self.gpuPtrs["graph_model", "qratio"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) blockLogQratio = self.gpuPtrs["graph_model", "qratio"].get() logQratio += np.sum(blockLogQratio) if is_symmetric and ki != kj: self.gpuKernels["computeProdQratio"]( np.int32(ki), np.int32(K), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, self.gpuPtrs["bkgd_model", "lam"].gpudata, np.int32(kj), np.int32(MH_ADD), self.gpuPtrs["graph_model", "qratio"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) blockLogQratio = self.gpuPtrs["graph_model", "qratio"].get() logQratio += np.sum(blockLogQratio) # Decide whether or not to accept this change logPrAccept = logPratio + logQratio accept = np.log(np.random.rand()) < logPrAccept if accept: # if is_symmetric and ki!=kj: # log.debug("+A[%d,%d],+A[%d,%d]",ki,kj,kj,ki) # else: # log.debug("+A[%d,%d]",ki,kj) # Update the adjacency matrix on host and GPU self.modelParams["graph_model", "A"][ki, kj] = True A_buff = np.array([True], dtype=np.bool) cuda.memcpy_htod( self.gpuPtrs["graph_model", "A"].ptr + int( (ki * K + kj) * A_buff.itemsize), A_buff) if is_symmetric and ki != kj: self.modelParams["graph_model", "A"][kj, ki] = True A_buff = np.array([True], dtype=np.bool) cuda.memcpy_htod( self.gpuPtrs["graph_model", "A"].ptr + int( (kj * K + ki) * A_buff.itemsize), A_buff) else: # Clear the WGS changes self.gpuKernels["clearWGSForDeletedEdge"]( np.int32(ki), np.int32(kj), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) if is_symmetric and ki != kj: self.gpuKernels["clearWGSForDeletedEdge"]( np.int32(kj), np.int32(ki), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) if op == MH_DEL: # We are proposing to delete an edge. WGS was calculated # with the edge present. # Compute the P ratio rho = self.getConditionalEdgePr(ki, kj) if rho == 1.0: logPratio = -np.Inf elif rho == 0.0: logPratio = np.Inf elif is_symmetric and ki != kj: logPratio = Ns[ki] * currW + Ns[kj] * currW + np.log( 1 - rho) - np.log(rho) else: logPratio = Ns[ki] * currW + np.log(1 - rho) - np.log(rho) # Compute the Q ratio logQratio = np.log( self.params["gamma"]) - np.log(1 - self.params["gamma"]) self.gpuKernels["computeProdQratio"]( np.int32(kj), np.int32(K), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, self.gpuPtrs["bkgd_model", "lam"].gpudata, np.int32(ki), np.int32(MH_DEL), self.gpuPtrs["graph_model", "qratio"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) blockLogQratio = self.gpuPtrs["graph_model", "qratio"].get() logQratio += np.sum(blockLogQratio) if is_symmetric and ki != kj: self.gpuKernels["computeProdQratio"]( np.int32(ki), np.int32(K), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, self.gpuPtrs["bkgd_model", "lam"].gpudata, np.int32(kj), np.int32(MH_DEL), self.gpuPtrs["graph_model", "qratio"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) blockLogQratio = self.gpuPtrs["graph_model", "qratio"].get() logQratio += np.sum(blockLogQratio) # Decide whether or not to accept this change logPrAccept = logPratio + logQratio accept = np.log(np.random.rand()) < logPrAccept if accept: # if is_symmetric and ki!=kj: # log.debug("-A[%d,%d],-A[%d,%d]",ki,kj,kj,ki) # else: # log.debug("-A[%d,%d]",ki,kj) # Update the adjacency matrix self.modelParams["graph_model", "A"][ki, kj] = False A_buff = np.array([False], dtype=np.bool) cuda.memcpy_htod( self.gpuPtrs["graph_model", "A"].ptr + int( (ki * K + kj) * A_buff.itemsize), A_buff) if is_symmetric and ki != kj: self.modelParams["graph_model", "A"][kj, ki] = False A_buff = np.array([False], dtype=np.bool) cuda.memcpy_htod( self.gpuPtrs["graph_model", "A"].ptr + int( (kj * K + ki) * A_buff.itemsize), A_buff) # Clear the WGS changes self.gpuKernels["clearWGSForDeletedEdge"]( np.int32(ki), np.int32(kj), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) if is_symmetric and ki != kj: self.gpuKernels["clearWGSForDeletedEdge"]( np.int32(kj), np.int32(ki), np.int32(N), self.gpuPtrs["proc_id_model", "C"].gpudata, self.gpuPtrs["graph_model", "WGS"].gpudata, block=(1024, 1, 1), grid=(grid_w, 1)) else: # Nothing changes if we do not delete the edge pass
{ int idx = threadIdx.x + threadIdx.y*5; d_a[idx] = d_a[idx]*d_a[idx]; } """) square = mod.get_function("square") # ---------------Using mem_alloc--------------- # start = drv.Event() end = drv.Event() h_a = numpy.random.randint(1, 5, (5, 5)) h_a = h_a.astype(numpy.float32) h_b = h_a.copy() start.record() d_a = drv.mem_alloc(h_a.size * h_a.dtype.itemsize) drv.memcpy_htod(d_a, h_a) # Calling kernel square(d_a, block=(5, 5, 1), grid=(1, 1), shared=0) h_result = numpy.empty_like(h_a) drv.memcpy_dtoh(h_result, d_a) end.record() end.synchronize() secs = start.time_till(end) * 1e-3 print("Time of Squaring on GPU without inout") print("%fs" % (secs)) print("original array:") print(h_a) print("Square with kernel:") print(h_result) # ---------------Using inout functionality of driver class--------------- #
def frame_eraser(): files = getData() if (files == 0): return 0 path = files['path'] + files['number'] resolution_x = files['resolution'][0] resolution_y = files['resolution'][1] global BATCH_SIZE BATCH_SIZE = files['batch_size'] global NUM_THREADS NUM_THREADS = 16 imageNames = __getImageNames(path) for batch in range(0, math.floor(len(imageNames) / BATCH_SIZE) + 1): print( f"\nBatch: {batch + 1} of {math.floor(len(imageNames) / BATCH_SIZE) + 1}" ) batch_host = np.empty(0, dtype=object) threads = [] start = batch * BATCH_SIZE end = (batch + 1) * BATCH_SIZE if (end > len(imageNames)): end = len(imageNames) imageName_Batch = imageNames[start:end] batch_length = len(imageName_Batch) threadBatch = math.floor(batch_length / NUM_THREADS) for i in range(0, NUM_THREADS): if i == NUM_THREADS - 1: imageLoader = ImageLoader( imageName_Batch[i * threadBatch:batch_length], path) else: imageLoader = ImageLoader( imageName_Batch[i * threadBatch:(i + 1) * threadBatch], path) threads.append(imageLoader) for t in threads: t.start() done = False print("Copying from Disk to RAM") with progressbar.ProgressBar(max_value=batch_length) as bar: while (not done): done = all(t.done == True for t in threads) progress = 0 for t in threads: progress += t.progress bar.update(progress) time.sleep(0.1) for t in threads: temp = np.copy(t.getBatch()) batch_host = np.append(batch_host, temp) for t in threads: t.join() batch_device = np.zeros_like(batch_host) print("Copying from RAM to GPU") with progressbar.ProgressBar(max_value=batch_length) as bar: for i in range(0, batch_length): batch_device[i] = driver.mem_alloc(batch_host[i].nbytes) # pylint: disable=no-member, unsupported-assignment-operation driver.memcpy_htod(batch_device[i], batch_host[i]) # pylint: disable=no-member bar.update(i) # CUDA Absolute Image Subtraction diffBlock = (8, 8, 3) diffGrid = (int(resolution_x / 8), int(resolution_y / 8), 1) h_diffImage_int = np.zeros_like(batch_host[0], dtype=np.uint8) d_diffImage_int = driver.mem_alloc(h_diffImage_int.nbytes) # pylint: disable=no-member getImgDiff = __module.get_function("cuda_GetImgDiff") # CUDA Sum Image num_block = int(resolution_x * resolution_y * 3 / 512) block = (512, 1, 1) grid = (num_block, 1, 1) h_sum = np.zeros(num_block, dtype=np.float) d_sum = driver.mem_alloc(h_sum.nbytes) # pylint: disable=no-member sumPixels = __module.get_function("cuda_SumPixels") # CUDA Int to Float image converstion h_diffImage_float = h_diffImage_int.astype(np.float32) # pylint: disable=no-member d_diffImage_float = driver.mem_alloc(h_diffImage_float.nbytes) # pylint: disable=no-member byteToFloat = __module.get_function("cuda_ByteToFloat") imagesToDelete = [] print("Processing") pixelSum = 0 with progressbar.ProgressBar(max_value=batch_length) as bar: pivot = 0 for i in range(0, batch_length - 1): getImgDiff(d_diffImage_int, batch_device[pivot], batch_device[i + 1], np.int32(resolution_x), block=diffBlock, grid=diffGrid) byteToFloat(d_diffImage_float, d_diffImage_int, block=block, grid=grid) sumPixels(d_diffImage_float, d_sum, block=block, grid=grid) driver.memcpy_dtoh(h_sum, d_sum) # pylint: disable=no-member pixelSum = h_sum.sum() if (pixelSum > threshold): pivot = i else: imagesToDelete.append(i) bar.update(i) for i in imagesToDelete: os.remove(path + imageName_Batch[i]) pass print(f'Deleted: {len(imagesToDelete)} images\n') #getImgDiff(d_diffImage_int, batch_device[1000], batch_device[1001], block=diffBlock, grid=diffGrid) #driver.memcpy_dtoh(h_diffImage_int, d_diffImage_int) #displayImage(h_diffImage_int) #byteToFloat(d_diffImage_float, d_diffImage_int, block=block, grid=grid) #if batch >= 5: # return return 1
def elementwiseMean_gpu(feature_volume_list): """computes the elementwise mean of the like-shaped volumes in feature_volume_list""" # initialize cuda context cuda.init() cudacontext = cuda.Device(NVDEVICE).make_context() parent_dir = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(parent_dir, 'feature_compositions.cuh'), mode='r') as f: mod = SourceModule( f.read(), options=[ '-I {!s}'.format(parent_dir), # '-g', '-G', '-lineinfo' ]) func = mod.get_function('elementwiseMean') # combine volumes into linearized array FOR = feature_volume_list[0].frameofreference vols = [] for vol in feature_volume_list: vols.append(vol.vectorize()) array_length = np.product(FOR.size).item() while len(vols) > 1: num_arrays = 2 cat = np.concatenate([vols.pop() for x in range(num_arrays)], axis=0) # allocate image on device in global memory cat = cat.astype(np.float32) cat_gpu = cuda.mem_alloc(cat.nbytes) result = np.zeros((array_length)).astype(np.float32) result_gpu = cuda.mem_alloc(result.nbytes) # transfer cat to device cuda.memcpy_htod(cat_gpu, cat) cuda.memcpy_htod(result_gpu, result) # call device kernel blocksize = 512 gridsize = math.ceil(array_length / blocksize) func(cat_gpu, result_gpu, np.int32(array_length), np.int32(num_arrays), block=(blocksize, 1, 1), grid=(gridsize, 1, 1)) # get result from device cuda.memcpy_dtoh(result, result_gpu) vols.append(result.reshape((-1, 1))) result = vols[0] # detach from cuda context # cudacontext.synchronize() # cudacontext.detach() cudacontext.pop() # required to successfully free device memory for created context del cudacontext gc.collect() pycuda.tools.clear_context_caches() x = MaskableVolume().fromArray(result, FOR) x.modality = feature_volume_list[0].modality return x
def __dfs_construct(self, depth, error_rate, start_idx, stop_idx, si_gpu_in, si_gpu_out): def check_terminate(): if error_rate == 0.0: return True else: return False n_samples = stop_idx - start_idx indices_offset = start_idx * self.dtype_indices.itemsize nid = self.n_nodes self.n_nodes += 1 if check_terminate(): turn_to_leaf(nid, start_idx, si_gpu_in.idx, self.values_idx_array, self.values_si_idx_array ) return if n_samples < self.min_samples_split: turn_to_leaf(nid, start_idx, si_gpu_in.idx, self.values_idx_array, self.values_si_idx_array ) return if n_samples <= self.bfs_threshold: self.idx_array[self.queue_size * 2] = start_idx self.idx_array[self.queue_size * 2 + 1] = stop_idx self.si_idx_array[self.queue_size] = si_gpu_in.idx self.nid_array[self.queue_size] = nid self.queue_size += 1 return cuda.memcpy_htod(self.features_array_gpu.ptr, self.features_array) min_left, min_right, row, col = self.__gini(n_samples, indices_offset, si_gpu_in) if min_left + min_right == 4: turn_to_leaf(nid, start_idx, si_gpu_in.idx, self.values_idx_array, self.values_si_idx_array) return cuda.memcpy_dtoh(self.threshold_value_idx, si_gpu_in.ptr + int(indices_offset) + \ int(row * self.stride + col) * \ int(self.dtype_indices.itemsize)) self.feature_idx_array[nid] = row self.feature_threshold_array[nid] = (float(self.samples[row, \ self.threshold_value_idx[0]]) + self.samples[row, \ self.threshold_value_idx[1]]) / 2 self.fill_kernel.prepared_call( (1, 1), (512, 1, 1), si_gpu_in.ptr + row * self.stride * \ self.dtype_indices.itemsize + \ indices_offset, n_samples, col, self.mark_table.ptr) block = (self.RESHUFFLE_THREADS_PER_BLOCK, 1, 1) self.scan_reshuffle_tex.prepared_call( (self.n_features, 1), block, si_gpu_in.ptr + indices_offset, si_gpu_out.ptr + indices_offset, n_samples, col) self.__shuffle_feature_indices() self.left_children[nid] = self.n_nodes self.__dfs_construct(depth + 1, min_left, start_idx, start_idx + col + 1, si_gpu_out, si_gpu_in) self.right_children[nid] = self.n_nodes self.__dfs_construct(depth + 1, min_right, start_idx + col + 1, stop_idx, si_gpu_out, si_gpu_in)
def __init__(self,instance,algorithm,verbose=False): """Initializes a direct parallel worker :param instance: an instance of class from_scatterers_direct(\ cctbx.xray.structure_factors.manager.managed_calculation_base) :type instance: cctbx.xray.structure_factors.from_scatterers_direct :param algorithm: an instance of class direct_summation_cuda_platform(\ direct_summation_simple) with algorithm set to "simple" or "pycuda" :type algorithm: cctbx.xray.structure_factors.direct_summation_cuda_platform """ self.scatterers = instance._xray_structure.scatterers() self.registry = instance._xray_structure.scattering_type_registry() self.miller_indices = instance._miller_set.indices() self.unit_cell = instance._miller_set.unit_cell() self.space_group = instance._miller_set.space_group().make_tidy() if verbose: self.print_diagnostics() # some diagnostics used for development if hasattr(algorithm,"simple"): instance._results = ext.structure_factors_simple( self.unit_cell, instance._miller_set.space_group(), self.miller_indices, self.scatterers, self.registry); return if hasattr(algorithm,"pycuda"): import pycuda.driver as cuda from pycuda.compiler import SourceModule self.validate_the_inputs(instance,cuda,algorithm) self.prepare_miller_arrays_for_cuda(algorithm) self.prepare_scattering_sites_for_cuda(algorithm) self.prepare_gaussians_symmetries_cell(algorithm) assert cuda.Device.count() >= 1 device = cuda.Device(0) WARPSIZE=device.get_attribute(cuda.device_attribute.WARP_SIZE) # 32 MULTIPROCESSOR_COUNT=device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT) sort_mod = SourceModule((mod_fhkl_sorted%(self.gaussians.shape[0], self.symmetry.shape[0], self.sym_stride, self.g_stride, int(self.use_debye_waller), self.order_z,self.order_p)).replace("floating_point_t",algorithm.float_t) ) r_m_m_address = sort_mod.get_global("reciprocal_metrical_matrix")[0] cuda.memcpy_htod(r_m_m_address, self.reciprocal_metrical_matrix) gaussian_address = sort_mod.get_global("gaussians")[0] cuda.memcpy_htod(gaussian_address, self.gaussians) symmetry_address = sort_mod.get_global("symmetry")[0] cuda.memcpy_htod(symmetry_address, self.symmetry) CUDA_fhkl = sort_mod.get_function("CUDA_fhkl") intermediate_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) intermediate_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) for x in range(self.scatterers.number_of_types): fhkl_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) fhkl_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) CUDA_fhkl(cuda.InOut(fhkl_real), cuda.InOut(fhkl_imag), cuda.In(self.flat_sites), cuda.In(self.weights), cuda.In(self.u_iso), algorithm.numpy.uint32(self.scatterers.increasing_order[x]), algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][0]), algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][1]), cuda.In(self.flat_mix), block=(FHKL_BLOCKSIZE,1,1), grid=((self.n_flat_hkl//FHKL_BLOCKSIZE,1))) intermediate_real += fhkl_real intermediate_imag += fhkl_imag flex_fhkl_real = flex.double(intermediate_real[0:len(self.miller_indices)].astype(algorithm.numpy.float64)) flex_fhkl_imag = flex.double(intermediate_imag[0:len(self.miller_indices)].astype(algorithm.numpy.float64)) instance._results = fcalc_container(flex.complex_double(flex_fhkl_real,flex_fhkl_imag)) return
def __bfs(self): block_per_split = int(math.ceil(float(self.MAX_BLOCK_BFS) / self.queue_size)) if block_per_split > self.max_features: n_blocks = self.max_features else: n_blocks = block_per_split idx_array_gpu = gpuarray.to_gpu( self.idx_array[0 : self.queue_size * 2]) si_idx_array_gpu = gpuarray.to_gpu( self.si_idx_array[0 : self.queue_size]) self.label_total = gpuarray.empty(self.queue_size * self.n_labels, dtype = self.dtype_counts) threshold_value = gpuarray.empty(self.queue_size, dtype = np.float32) impurity_gpu = gpuarray.empty(self.queue_size * 2, dtype = np.float32) self.min_split = gpuarray.empty(self.queue_size, dtype = self.dtype_indices) min_feature_idx_gpu = gpuarray.empty(self.queue_size, dtype = np.uint16) impurity_gpu_2d = gpuarray.empty(self.queue_size * 2 * n_blocks, dtype = np.float32) min_split_2d = gpuarray.empty(self.queue_size * n_blocks, dtype = self.dtype_indices) min_feature_idx_gpu_2d = gpuarray.empty(self.queue_size * n_blocks, dtype = np.uint16) cuda.memcpy_htod(self.features_array_gpu.ptr, self.features_array) self.scan_total_bfs.prepared_call( (self.queue_size, 1), (self.BFS_THREADS, 1, 1), self.labels_gpu.ptr, self.label_total.ptr, si_idx_array_gpu.ptr, idx_array_gpu.ptr) self.comput_bfs_2d.prepared_call( (self.queue_size, n_blocks), (self.BFS_THREADS, 1, 1), self.samples_gpu.ptr, self.labels_gpu.ptr, idx_array_gpu.ptr, si_idx_array_gpu.ptr, self.label_total.ptr, self.features_array_gpu.ptr, impurity_gpu_2d.ptr, min_split_2d.ptr, min_feature_idx_gpu_2d.ptr) self.reduce_bfs_2d.prepared_call( (self.queue_size, 1), (1, 1, 1), impurity_gpu_2d.ptr, min_split_2d.ptr, min_feature_idx_gpu_2d.ptr, impurity_gpu.ptr, self.min_split.ptr, min_feature_idx_gpu.ptr, n_blocks) self.fill_bfs.prepared_call( (self.queue_size, 1), (self.BFS_THREADS, 1, 1), si_idx_array_gpu.ptr, min_feature_idx_gpu.ptr, idx_array_gpu.ptr, self.min_split.ptr, self.mark_table.ptr) if block_per_split > self.n_features: n_blocks = self.n_features else: n_blocks = block_per_split self.reshuffle_bfs.prepared_call( (self.queue_size, n_blocks), (self.BFS_THREADS, 1, 1), si_idx_array_gpu.ptr, idx_array_gpu.ptr, self.min_split.ptr) self.__shuffle_feature_indices() self.get_thresholds.prepared_call( (self.queue_size, 1), (1, 1, 1), si_idx_array_gpu.ptr, self.samples_gpu.ptr, threshold_value.ptr, min_feature_idx_gpu.ptr, self.min_split.ptr) new_idx_array = np.empty(self.queue_size * 2 * 2, dtype = np.uint32) idx_array = self.idx_array new_si_idx_array = np.empty(self.queue_size * 2, dtype = np.uint8) new_nid_array = np.empty(self.queue_size * 2, dtype = np.uint32) left_children = self.left_children right_children = self.right_children feature_idx_array = self.feature_idx_array feature_threshold_array = self.feature_threshold_array nid_array = self.nid_array imp_min = np.empty(self.queue_size * 2, np.float32) min_split = np.empty(self.queue_size, self.dtype_indices) feature_idx = np.empty(self.queue_size, np.uint16) threshold = np.empty(self.queue_size, np.float32) cuda.memcpy_dtoh(imp_min, impurity_gpu.ptr) cuda.memcpy_dtoh(min_split, self.min_split.ptr) cuda.memcpy_dtoh(feature_idx, min_feature_idx_gpu.ptr) cuda.memcpy_dtoh(threshold, threshold_value.ptr) si_idx_array = self.si_idx_array self.n_nodes, self.queue_size, self.idx_array, self.si_idx_array, self.nid_array =\ bfs_loop(self.queue_size, self.n_nodes, self.max_features, new_idx_array, idx_array, new_si_idx_array, new_nid_array, left_children, right_children, feature_idx_array, feature_threshold_array, nid_array, imp_min, min_split, feature_idx, si_idx_array, threshold, self.min_samples_split, self.values_idx_array, self.values_si_idx_array) self.n_nodes = int(self.n_nodes) self.queue_size = int(self.queue_size)
def __init__(self, eta, beta, L, no_angles, no_pulses, order=5): self.mod_K = SourceModule( RADON_KERNEL.format(order, no_angles, no_pulses)) self.K_gpu = self.mod_K.get_function("K_l") self.mod_reduction = SourceModule(REDUCTION_KERNEL) self.reduction_gpu = self.mod_reduction.get_function("reduction") self.eta = eta self.gamma = gamma(eta) self.beta = beta self.L = L self.h = calc_h(L, beta, eta) drv.memcpy_htod( self.mod_K.get_global("rsq4pi")[0], scipy.array([1. / sqrt(4. * pi)], dtype=scipy.float32)) drv.memcpy_htod( self.mod_K.get_global("sqeta")[0], scipy.array([sqrt(self.eta)], dtype=scipy.float32)) drv.memcpy_htod( self.mod_K.get_global("h")[0], scipy.array([self.h], dtype=scipy.float32)) drv.memcpy_htod( self.mod_K.get_global("four_pi_gamma")[0], scipy.array([4. * pi * self.gamma], dtype=scipy.float32)) y = sqrt(self.gamma) / self.h drv.memcpy_htod( self.mod_K.get_global("y")[0], scipy.array([y], dtype=scipy.float32)) n = scipy.arange(1, order + 1, dtype=scipy.float32) n2 = n**2 ex = exp(-n2 / 4.) pre_s2 = ex * cosh(n * y) pre_s3 = ex * n * sinh(n * y) drv.memcpy_htod(self.mod_K.get_global("n2")[0], n2) drv.memcpy_htod(self.mod_K.get_global("pre_s1")[0], ex) drv.memcpy_htod(self.mod_K.get_global("pre_s2")[0], pre_s2) drv.memcpy_htod(self.mod_K.get_global("pre_s3")[0], pre_s3)