def _create_halo_arrays(self): # Allocate space for the halos: two per face, # one for sending and one for receiving. nz, ny, nx = self.local_dims sw = self.stencil_width # create two halo regions for each face, one holding # the halo values to send, and the other holding # the halo values to receive. self.left_recv_halo = gpuarray.empty([nz,ny,sw], dtype=np.float64) self.left_send_halo = self.left_recv_halo.copy() self.right_recv_halo = self.left_recv_halo.copy() self.right_send_halo = self.left_recv_halo.copy() self.bottom_recv_halo = gpuarray.empty([nz,sw,nx], dtype=np.float64) self.bottom_send_halo = self.bottom_recv_halo.copy() self.top_recv_halo = self.bottom_recv_halo.copy() self.top_send_halo = self.bottom_recv_halo.copy() self.back_recv_halo = gpuarray.empty([sw,ny,nx], dtype=np.float64) self.back_send_halo = self.back_recv_halo.copy() self.front_recv_halo = self.back_recv_halo.copy() self.front_send_halo = self.back_recv_halo.copy()
def generate_shifts_2d(width, height, n_samples, with_hot=False): x_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (width - 0.01) x_shifts = x_shifts.astype(np.uint32) y_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (height - 0.01) y_shifts = y_shifts.astype(np.uint32) if with_hot: shifts_hot = gp.empty((width * height, n_samples), np.float32) threads_per_block = 32 n_blocks = int(math.ceil(n_samples / threads_per_block)) gpu_shift_to_hot_2d(x_shifts, y_shifts, shifts_hot, np.uint32(shifts_hot.strides[0]/4), np.uint32(shifts_hot.strides[1]/4), np.uint32(width), np.uint32(height), np.uint32(n_samples), block=(threads_per_block, 1, 1), grid=(n_blocks, 1)) return x_shifts, y_shifts, shifts_hot else: shifts = gp.empty((2, n_samples), np.float32) threads_per_block = 32 n_blocks = int(math.ceil(n_samples / threads_per_block)) gpu_vstack(y_shifts, x_shifts, shifts, np.uint32(shifts.strides[0]/4), np.uint32(shifts.strides[1]/4), np.uint32(n_samples), block=(threads_per_block, 1, 1), grid=(n_blocks, 1)) return x_shifts, y_shifts, shifts
def _minmax_impl(a_gpu, axis, min_or_max, stream=None): ''' Returns both max and argmax (min/argmin) along an axis.''' assert len(a_gpu.shape) < 3 if iscomplextype(a_gpu.dtype): raise ValueError("Cannot compute min/max of complex values") if axis is None: ## Note: PyCUDA doesn't have an overall argmax/argmin! if min_or_max == 'max': return gpuarray.max(a_gpu).get() else: return gpuarray.min(a_gpu).get() else: if axis < 0: axis += 2 assert axis in (0, 1) global _global_cublas_allocator alloc = _global_cublas_allocator n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0]) col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max) if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous): target = gpuarray.empty(m, dtype=a_gpu.dtype, allocator=alloc) idx = gpuarray.empty(m, dtype=np.uint32, allocator=alloc) col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(m, 1, 1), stream=stream) else: target = gpuarray.empty(n, dtype=a_gpu, allocator=alloc) idx = gpuarray.empty(n, dtype=np.uint32, allocator=alloc) row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(n, 1, 1), stream=stream) return target, idx
def get(self, V_gpu, xcl_gpu, xcr_gpu, W_gpu, x_gpu, stream=None): """ """ if stream is None: stream = cuda.Stream() # Temporary variables z_gpu = gpuarray.empty((self.params['V_d'], self.params['V_w']), self.params['dtype']) xc2_gpu = gpuarray.empty(2*self.params['w_d'], self.params['dtype']) blockDim_x = self.params['V_w'] self._func[0](xcl_gpu, xcr_gpu, xc2_gpu, block=(blockDim_x, 1, 1), stream=stream) gridDim_z = self.params['V_d'] blockDim_y = self.params['V_w'] self._func[1](V_gpu, xc2_gpu, z_gpu, block=(1, blockDim_y, 1), grid=(1, 1, gridDim_z), stream=stream) blockDim_y = self.params['W_h'] self._func[2](W_gpu, xc2_gpu, z_gpu, x_gpu, block=(1, blockDim_y, 1), grid=(1, 1, 1), stream=stream)
def sample_dropout_mask(x, dropout_probability=.5, columns=None, stream=None, target=None, dropout_mask=None, dropout_prob_array=None): """ Samples a dropout mask and applies it in place""" assert x.flags.c_contiguous if columns is not None: assert len(columns) == 2 x_tmp = x x = extract_columns(x, columns[0], columns[1]) shape = x.shape if dropout_prob_array is None: dropout_prob_array = gpuarray.empty(shape, x.dtype) sampler.fill_uniform(dropout_prob_array, stream) if dropout_mask is None: dropout_mask = gpuarray.empty(shape, np.int8) if target is None: target = x all_kernels['sample_dropout_mask']( x, target, dropout_mask, dropout_prob_array, np.float32(dropout_probability)) if columns is not None: insert_columns(x, x_tmp, columns[0]) return dropout_mask
def __init__(self, res=(640, 480)): mod = cuda.SourceModule(file("cpp/trace.cu").read(), keep=True, options=['-I../cpp'], no_extern_c=True) self.InitEyeRays = mod.get_function("InitEyeRays") self.InitFishEyeRays = mod.get_function("InitFishEyeRays") self.Trace = mod.get_function("Trace") self.ShadeSimple = mod.get_function("ShadeSimple") self.mod = mod self.block = (16, 32, 1) # 15: 32, 18: 28, 19: 24 self.grid = ( res[0]/self.block[0], res[1]/self.block[1] ) self.resx, self.resy = (self.grid[0]*self.block[0], self.grid[1]*self.block[1]) self.smallblock = (16, 16, 1) self.smallgrid = ( res[0]/self.smallblock[0], res[1]/self.smallblock[1] ) self.d_img = ga.empty( (self.resy, self.resx, 4), uint8 ) ''' struct RayData { float3 dir; float t; VoxNodeId endNode; int endNodeChild; float endNodeSize; }; ''' raySize = struct.calcsize("3f f i i f") self.d_rays = ga.empty( (self.resy, self.resx, raySize), uint8 ) self.setLightPos((0.5, 0.5, 1)) self.detailCoef = 10.0
def enable3d(self): self.point1 = self.point-(self.mesh_diagonal_norm/60)*self.axis2 self.point2 = self.point+(self.mesh_diagonal_norm/60)*self.axis2 self.viewing_angle = 0.0 pos1, dir1 = from_film(self.point1, axis1=self.axis1, axis2=self.axis2, size=self.size, width=self.film_width) pos2, dir2 = from_film(self.point2, axis1=self.axis1, axis2=self.axis2, size=self.size, width=self.film_width) self.rays1 = gpu.GPURays(pos1, dir1, max_alpha_depth=self.max_alpha_depth) self.rays2 = gpu.GPURays(pos2, dir2, max_alpha_depth=self.max_alpha_depth) scope_size = (self.size[0]//4, self.size[0]//4) scope_pos, scope_dir = from_film(self.point, axis1=self.axis1, axis2=self.axis2, size=scope_size, width=self.film_width/4.0) self.scope_rays = gpu.GPURays(scope_pos, scope_dir) self.scope_pixels_gpu = ga.empty(self.scope_rays.pos.size, dtype=np.uint32) self.pixels1_gpu = ga.empty(self.width*self.height, dtype=np.uint32) self.pixels2_gpu = ga.empty(self.width*self.height, dtype=np.uint32) self.distances_gpu = ga.empty(self.scope_rays.pos.size, dtype=np.float32) self.display3d = True
def test_cublasSgetriBatched(self): l,m = 11,7 np.random.seed(1) A = np.random.rand(l,m, m).astype(np.float32) a_gpu = gpuarray.to_gpu(A) a_arr = bptrs(a_gpu) c_gpu = gpuarray.empty((l,m,m), np.float32) c_arr = bptrs(c_gpu) p_gpu = gpuarray.empty((l,m), np.int32) i_gpu = gpuarray.zeros(l, np.int32) cublas.cublasSgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m, p_gpu.gpudata, i_gpu.gpudata, l) cublas.cublasSgetriBatched(self.cublas_handle, m, a_arr.gpudata, m, p_gpu.gpudata, c_arr.gpudata,m, i_gpu.gpudata, l) X = np.array(map(np.linalg.inv,A)) X_ = c_gpu.get() assert np.allclose(X,X_,6)
def test_cublas_bug(): ''' The SGEMM call would cause all calls after it to fail for some unknown reason. Likely this is caused swaprows causing memory corruption. NOTE: this was confirmed by nvidia to be a bug within CUDA, and should be fixed in CUDA 6.5 ''' from pycuda.driver import Stream from skcuda.cublas import cublasSgemm from skcuda.misc import _global_cublas_handle as handle n = 131 s = slice(128, n) X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32)) a = gpuarray.empty((X.shape[1], 3), dtype=np.float32) c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32) b = gpuarray.empty_like(X) m, n = a.shape[0], b[s].shape[1] k = a.shape[1] lda = m ldb = k ldc = m #cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc) cublasSgemm(handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc) #print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc #gpuarray.dot(d, Xoutd[s]) #op.sgemm(a, b[s], c) stream = Stream() stream.synchronize()
def gradient_gpu(y_gpu, mode='valid'): shape = np.array(y_gpu.shape).astype(np.uint32) dtype = y_gpu.dtype block_size = (16,16,1) grid_size = (int(np.ceil(float(shape[1])/block_size[0])), int(np.ceil(float(shape[0])/block_size[1]))) shared_size = int((1+block_size[0])*(1+block_size[1])*dtype.itemsize) preproc = _generate_preproc(dtype, shape) mod = SourceModule(preproc + kernel_code, keep=True) if mode == 'valid': gradient_gpu = mod.get_function("gradient_valid") gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]-1), y_gpu.dtype) grady_gpu = cua.empty((y_gpu.shape[0]-1, y_gpu.shape[1]), y_gpu.dtype) if mode == 'same': gradient_gpu = mod.get_function("gradient_same") gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype) grady_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype) gradient_gpu(gradx_gpu.gpudata, grady_gpu.gpudata, y_gpu.gpudata, block=block_size, grid=grid_size, shared=shared_size) return (gradx_gpu, grady_gpu)
def get(self, V_gpu, d_gpu, xcl_gpu, xcr_gpu, W_gpu, dpl_gpu, dpr_gpu, stream=None): S_gpu = gpuarray.empty(self.params['V_w'], self.params['dtype']) VVT_gpu = gpuarray.to_gpu(np.ndarray(shape=self.params['V_shape'], dtype=self.params['dtype'], strides=self.params['V_strides'])) tmp_gpu = gpuarray.empty((self.params['V_w'], self.params['w_d']), self.params['dtype']) dp2_gpu = gpuarray.empty(self.params['W_w'], self.params['dtype']) if stream is None: stream = cuda.Stream() gridDim_x = 2 gridDim_y = 2 gridDim_z = self.params['V_d'] blockDim_x = self.params['V_w']//gridDim_x blockDim_y = self.params['V_h']//gridDim_y self._func[0](V_gpu, VVT_gpu, block=(blockDim_x, blockDim_y, 1), grid=(gridDim_x, gridDim_y, gridDim_z), stream=stream) gridDim_x = 1 gridDim_y = 2 blockDim_x = self.params['V_d'] blockDim_y = self.params['V_h']//gridDim_y self._func[1](VVT_gpu, d_gpu, xcl_gpu, xcr_gpu, tmp_gpu, block=(blockDim_x, blockDim_y, 1), grid=(gridDim_x, gridDim_y, 1), stream=stream) gridDim_x = 1 gridDim_y = 2 blockDim_x = self.params['V_d'] blockDim_y = self.params['V_h']//gridDim_y self._func[2](tmp_gpu, S_gpu, block=(blockDim_x, blockDim_y, 1), grid=(gridDim_x, gridDim_y, 1), stream=stream) gridDim_x = 2 gridDim_y = 1 blockDim_x = self.params['V_w']//gridDim_x blockDim_y = self.params['V_d'] self._func[3](W_gpu, d_gpu, S_gpu, xcl_gpu, xcr_gpu, dp2_gpu, block=(blockDim_x, blockDim_y, 1), grid=(gridDim_x, gridDim_y, 1), stream=stream) blockDim_x = self.params['V_w'] self._func[4](dp2_gpu, dpl_gpu, dpr_gpu, block=(blockDim_x, 1, 1), grid=(1, 1, 1), stream=stream)
def initializeGpuMemory(self): K = self.modelParams["proc_id_model","K"] # Sufficient statistics for the parameters of G kernels self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32) self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32) self.gpuPtrs["impulse_model","GS"] = gpuarray.empty_like(self.base.dSS["dS"])
def _init_comm_bufs(self): """ Buffers for sending/receiving data from other modules. Notes ----- Must be executed after `_init_port_dicts()`. """ # Buffers (and their interfaces and MPI types) for receiving data # transmitted from source modules: self._in_buf = {} self._in_buf['gpot'] = {} self._in_buf['spike'] = {} self._in_buf_int = {} self._in_buf_int['gpot'] = {} self._in_buf_int['spike'] = {} self._in_buf_mtype = {} self._in_buf_mtype['gpot'] = {} self._in_buf_mtype['spike'] = {} for in_id in self._in_ids: self._in_buf['gpot'][in_id] = \ gpuarray.empty(len(self._in_port_dict_ids['gpot'][in_id]), self.pm['gpot'].dtype) self._in_buf_int['gpot'][in_id] = bufint(self._in_buf['gpot'][in_id]) self._in_buf_mtype['gpot'][in_id] = \ dtype_to_mpi(self._in_buf['gpot'][in_id].dtype) self._in_buf['spike'][in_id] = \ gpuarray.empty(len(self._in_port_dict_ids['spike'][in_id]), self.pm['spike'].dtype) self._in_buf_int['spike'][in_id] = bufint(self._in_buf['spike'][in_id]) self._in_buf_mtype['spike'][in_id] = \ dtype_to_mpi(self._in_buf['spike'][in_id].dtype) # Buffers (and their interfaces and MPI types) for transmitting data to # destination modules: self._out_buf = {} self._out_buf['gpot'] = {} self._out_buf['spike'] = {} self._out_buf_int = {} self._out_buf_int['gpot'] = {} self._out_buf_int['spike'] = {} self._out_buf_mtype = {} self._out_buf_mtype['gpot'] = {} self._out_buf_mtype['spike'] = {} for out_id in self._out_ids: self._out_buf['gpot'][out_id] = \ gpuarray.empty(len(self._out_port_dict_ids['gpot'][out_id]), self.pm['gpot'].dtype) self._out_buf_int['gpot'][out_id] = bufint(self._out_buf['gpot'][out_id]) self._out_buf_mtype['gpot'][out_id] = \ dtype_to_mpi(self._out_buf['gpot'][out_id].dtype) self._out_buf['spike'][out_id] = \ gpuarray.empty(len(self._out_port_dict_ids['spike'][out_id]), self.pm['spike'].dtype) self._out_buf_int['spike'][out_id] = bufint(self._out_buf['spike'][out_id]) self._out_buf_mtype['spike'][out_id] = \ dtype_to_mpi(self._out_buf['spike'][out_id].dtype)
def getFields(self,x,y): outX = gpuarray.empty((self.Nfields,x,y),np.float32) outY = gpuarray.empty((self.Nfields,x,y),np.float32) grid = (int(ceil(x/32)),int(ceil(y/32))) block = (int(ceil(x/grid[0])),int(ceil(y/grid[1])),1) for i in range(self.Nfields): self.resampleF[i].prepared_call(grid,block,outX[i,:,:].gpudata,outY[i,:,:].gpudata,np.int32(x),np.int32(y)) return outX,outY
def show_values(matrix_size, threads_per_block): a_cpu = np.random.randn(matrix_size, matrix_size).astype(np.float32) # transfer host (CPU) memory to device (GPU) memory a_gpu = gpuarray.to_gpu(a_cpu) id_groups_x = gpuarray.empty((matrix_size, matrix_size), np.float32) id_groups_y = gpuarray.empty((matrix_size, matrix_size), np.float32) id_threads_x = gpuarray.empty((matrix_size, matrix_size), np.float32) id_threads_y = gpuarray.empty((matrix_size, matrix_size), np.float32) id_cell = gpuarray.empty((matrix_size, matrix_size), np.float32) blocks = (threads_per_block, 1, 1) blocks_per_side = int(matrix_size / threads_per_block) if (blocks_per_side * threads_per_block) < matrix_size: blocks_per_side = blocks_per_side + 1 grid = (blocks_per_side, matrix_size, 1) print("Blocks: ", blocks) print("Grid: ", grid) kernel_code = kernel_source_code % {'MATRIX_SIZE': matrix_size, 'BLOCK_SIZE': threads_per_block} compiled_kernel = compiler.SourceModule(kernel_code) kernel_binary = compiled_kernel.get_function("markThreadID") kernel_binary( # inputs a_gpu, # outputs id_groups_x, id_groups_y, id_threads_x, id_threads_y, id_cell, block=blocks, grid=grid ) id_blocks_x_cpu = id_groups_x.get() id_blocks_y_cpu = id_groups_y.get() id_threads_x_cpu = id_threads_x.get() id_threads_y_cpu = id_threads_y.get() id_cell_cpu = id_cell.get() print("id_blocks_x_cpu") print(id_blocks_x_cpu) print("id_blocks_y_cpu") print(id_blocks_y_cpu) print("id_threads_x_cpu") print(id_threads_x_cpu) print("id_threads_y_cpu") print(id_threads_y_cpu) print("id_cell_cpu") print(id_cell_cpu)
def initializeGpuMemory(self): """ Allocate GPU memory for the base model parameters """ N = self.base.data.N K = self.base.data.K self.gpuPtrs["proc_id_model","C"] = gpuarray.empty((N,), dtype=np.int32) self.gpuPtrs["proc_id_model","Ns"] = gpuarray.empty((K,), dtype=np.int32)
def removeProcessEventHandler(self, procId): """ Remove process procID from the set of processes and update data structures accordingly. We can assume that the base model has updated K. """ K = self.modelParams["proc_id_model","K"] self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32) self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32)
def test_work_area(self): x = np.asarray(np.random.rand(self.N), np.float32) xf = np.fft.rfftn(x) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64) plan = fft.Plan(x.shape, np.float32, np.complex64, auto_allocate=False) work_area = gpuarray.empty((plan.worksize,), np.uint8) plan.set_work_area(work_area) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
def initializeGpuMemory(self): K = self.modelParams["proc_id_model","K"] N = self.base.data.N self.gpuPtrs["graph_model","A"] = gpuarray.empty((K,K), dtype=np.bool) self.gpuPtrs["graph_model","WGS"] = gpuarray.empty((K,N), dtype=np.float32) qratio_width = int(np.ceil(np.float32(self.base.data.N)/self.params["blockSz"])) self.gpuPtrs["graph_model","qratio"] = gpuarray.empty((qratio_width,), dtype=np.float64) self.gpuPtrs["graph_model","lkhd_ratio"] = gpuarray.empty((1,), dtype=np.float32)
def get_workspace(self, n): from pyfft.cuda import Plan as pycufftplan import pycuda.gpuarray as gpuarray ws = self.get(n) if ws: return ws return self.setdefault(n, (pycufftplan(int(n), stream=self.stream, normalize=False), gpuarray.empty(n, dtype=complex64(0.).dtype), gpuarray.empty(n, dtype=complex64(0.).dtype)))
def benchmark(self): discr = self.discr given = self.plan.given from hedge.backends.cuda.tools import int_ceiling block_count = int_ceiling( len(discr.mesh.elements)/self.plan.elements_per_block()) all_fluxes_on_faces = [gpuarray.empty( (block_count * self.plan.microblocks_per_block() * given.aligned_face_dofs_per_microblock(),), dtype=given.float_type, allocator=discr.pool.allocate) for i in range(len(self.fluxes))] field = gpuarray.empty( (self.plan.input_dofs_per_block() * block_count,), dtype=given.float_type, allocator=discr.pool.allocate) fdata = self.fake_flux_face_data_block(block_count) ilist_data = self.fake_index_list_data() block, gather, texref_map = self.get_kernel(fdata, ilist_data, for_benchmark=True) for dep_expr in self.all_deps: field.bind_to_texref_ext(texref_map[dep_expr], allow_double_hack=True) if "cuda_fastbench" in discr.debug: count = 1 else: count = 20 start = cuda.Event() start.record() for i in range(count): if block_count >= 2**16: return None try: gather.prepared_call( (block_count, 1), block, 0, fdata.device_memory, *tuple(fof.gpudata for fof in all_fluxes_on_faces) ) except cuda.LaunchError: return None stop = cuda.Event() stop.record() stop.synchronize() return 1e-3/count * stop.time_since(start)
def __init__(self, A1, A2, left, use_batch=False): """Creates a new LinearOperator interface to the superoperator E. This is a wrapper to be used with SciPy's sparse linear algebra routines. Parameters ---------- A1 : ndarray Ket parameter tensor. A2 : ndarray Bra parameter tensor. left : bool Whether to multiply with a vector to the left (or to the right). """ self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1] self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2] self.tmp = list(map(garr.empty_like, self.A1G[0])) self.tmp2 = list(map(garr.empty_like, self.A1G[0])) self.use_batch = use_batch self.left = left self.D = A1[0].shape[1] self.shape = (self.D**2, self.D**2) self.dtype = sp.dtype(A1[0][0].dtype) self.calls = 0 self.out = garr.empty((self.D, self.D), dtype=self.dtype) self.xG = garr.empty((self.D, self.D), dtype=self.dtype) if use_batch: self.A1G_p = list(map(get_batch_ptrs, self.A1G)) self.A2G_p = list(map(get_batch_ptrs, self.A2G)) self.tmp_p = get_batch_ptrs(self.tmp) self.tmp2_p = get_batch_ptrs(self.tmp2) self.xG_p = get_batch_ptrs([self.xG] * len(A1[0])) self.out_p = get_batch_ptrs([self.out] * len(A1[0])) else: self.A1G_p = None self.A2G_p = None self.tmp_p = None self.tmp2_p = None self.xG_p = None self.out_p = None self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))] self.ones = [one.fill(1) for one in self.ones] self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))] self.streams = [] for s in range(A1[0].shape[0]): self.streams.append(cd.Stream()) self.hdl = cb.cublasCreate()
def genCks( allValidSpikVec, MATRIX_SIZE, TILE_WIDTH, configVec_str, spikTransMatFile) : #using all generated valid spiking vector files, 'feed' the files to the CUDA C kernels to evaluate (1) for spikVec in allValidSpikVec[ 0 ] : # string concatenation of the configVec, Ck-1, from configVec = [ '2', '2', '1', '0', '0', ...] # to configVec = 211 <string> Ck_1_str = configVec_str #write into total list of Ckspri #allGenCk = addTotalCk( allGenCk, Ck_1_str ) #print spikVec #form the filenames of the Cks and the Sks Ck = 'c_' + Ck_1_str + '_' + spikVec Ck_1 = 'c_' + Ck_1_str Sk = 's_' + spikVec #print ' Ck, Ck_1, Sk: ', Ck, Ck_1, Sk #import the vectors/Matrix as numpy ND arrays Ck_1 = toNumpyArr( Ck_1, MATRIX_SIZE ) Sk = toNumpyArr( Sk, MATRIX_SIZE ) M = toNumpyArr( spikTransMatFile, MATRIX_SIZE ) #allocate memory in the GPU Ck_1gpu = gpuarray.to_gpu( Ck_1 ) Skgpu = gpuarray.to_gpu( Sk ) Mgpu = gpuarray.to_gpu( M ) SkMgpu = gpuarray.empty( ( MATRIX_SIZE, MATRIX_SIZE), np.int32 ) Ckgpu = gpuarray.empty( ( MATRIX_SIZE, MATRIX_SIZE), np.int32 ) #get kernel code from template by specifying the constant MATRIX_SIZE #matmul_kernel = matmul_kernel_temp % { 'MATRIX_SIZE': MATRIX_SIZE} matmul_kernel = matmul_kernel_temp %{'MATRIX_SIZE': MATRIX_SIZE, 'TILE_WIDTH':TILE_WIDTH} #matadd_kernel = matadd_kernel_temp % { 'MATRIX_SIZE': MATRIX_SIZE} matadd_kernel = matadd_kernel_temp %{'MATRIX_SIZE': MATRIX_SIZE, 'TILE_WIDTH':TILE_WIDTH} # compile the kernel code mulmod = compiler.SourceModule(matmul_kernel) addmod = compiler.SourceModule(matadd_kernel) matrixmul = mulmod.get_function( "MatrixMulKernel" ) matrixadd = addmod.get_function( "MatrixAddKernel" ) #call kernel functions #matrixmul( Skgpu, Mgpu, SkMgpu, block = ( MATRIX_SIZE, MATRIX_SIZE, 1 ), ) #print ' BEFORE DEVICE CALLS. Time is ' #print str(datetime.now()) #create PyCUDA events to record time of kernel execution startTime = driver.Event() endTime = driver.Event() startTime.record( ) #start the timer matrixmul( Skgpu, Mgpu, SkMgpu, block = ( TILE_WIDTH, TILE_WIDTH, 1 ), grid = ( MATRIX_SIZE / TILE_WIDTH, MATRIX_SIZE / TILE_WIDTH ) ) #matrixadd( Ck_1gpu, SkMgpu, Ckgpu, block = ( MATRIX_SIZE, MATRIX_SIZE, 1 ), ) matrixadd( Ck_1gpu, SkMgpu, Ckgpu, block = ( TILE_WIDTH, TILE_WIDTH, 1 ), grid = ( MATRIX_SIZE / TILE_WIDTH, MATRIX_SIZE / TILE_WIDTH ) ) endTime.record( ) #start the end time timer. endTime.synchronize( ) # synchronize end of threads simTime = startTime.time_till( endTime ) * 1e-3 print " Kernel call exec time is ", simTime #print ' AFTER DEVICE CALLS. Time is ' #print str(datetime.now()) #print Ck_1gpu.get()[ 4 ] #this is a numpy ND array #write ND array into a file NDarrToFile( Ck, Ckgpu )
def initializeGpuMemory(self): """ Allocate GPU memory for the base model parameters """ N = self.base.data.N K = self.modelParams["proc_id_model","K"] self.gpuPtrs["parent_model","Z"] = gpuarray.empty((N,), dtype=np.int32) self.gpuPtrs["parent_model","WGS"] = gpuarray.empty((K,N), dtype=np.float32) self.gpuPtrs["parent_model","urand_Z_gpu"] = gpuarray.empty((N,), dtype=np.float32) self.gpuPtrs["parent_model","Zi_temp_gpu"] = gpuarray.empty((N,), dtype=np.int32)
def initializeGpuMemory(self): K = self.params["K"] N = self.base.data.N D = self.base.data.D gridx = int(np.ceil(np.float32(N)/self.params["blockSz"])) self.gpuPtrs["proc_id_model","C"] = gpuarray.empty((N,), dtype=np.int32) self.gpuPtrs["proc_id_model","Ns"] = gpuarray.empty((K,), dtype=np.int32) self.gpuPtrs["proc_id_model","Xstats"] = gpuarray.empty((K,gridx), dtype=np.float32) self.gpuPtrs["proc_id_model","Xmean"] = gpuarray.empty((K,D), dtype=np.float32) self.gpuPtrs["proc_id_model","Xprec"] = gpuarray.empty((K,D,D), dtype=np.float32)
def todense(self, out=None, allocator=mem_alloc, stream=None): if out is None: out = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C") if self.nnz == 0: # weird but happens out.fill(0.0, stream=stream) return out # we need to out-of-place transpose if we want rowmajor outputs # thus we need a temporary to store our results if out.flags.c_contiguous: tmp = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C") else: tmp = out if stream is not None: cusparse.cusparseSetStream(cusparse_handle, stream.handle) cublas.cublasSetStream(cublas_handle, stream.handle) cusparse.cusparseScsr2dense( cusparse_handle, self.shape[0], self.shape[1], self.descr, self.data.gpudata, self.indptr.gpudata, self.indices.gpudata, tmp.gpudata, tmp.shape[0], ) if out.flags.c_contiguous: cublas.cublasSgeam( cublas_handle, 1, 1, tmp.shape[1], tmp.shape[0], 1.0, tmp.gpudata, tmp.shape[0], 0.0, 0, tmp.shape[0], out.gpudata, out.shape[1], ) if stream is not None: cusparse.cusparseSetStream(cusparse_handle, 0) cublas.cublasSetStream(cublas_handle, 0) return out
def _init_comm_bufs(self): """ Buffers for receiving data from other modules. Notes ----- Must be executed after `_init_port_dicts()`. """ # Buffer interface to and MPI type of this module's port data array: self._data_int = {} self._data_int['gpot'] = bufint(self.data['gpot']) self._data_int['spike'] = bufint(self.data['spike']) self._data_mtype = {} self._data_mtype['gpot'] = dtype_to_mpi(self.data['gpot'].dtype) self._data_mtype['spike'] = dtype_to_mpi(self.data['spike'].dtype) # Buffers (and their interfaces and MPI types) for receiving data # transmitted from source modules: self._in_buf = {} self._in_buf['gpot'] = {} self._in_buf['spike'] = {} self._in_buf_int = {} self._in_buf_int['gpot'] = {} self._in_buf_int['spike'] = {} self._in_buf_mtype = {} self._in_buf_mtype['gpot'] = {} self._in_buf_mtype['spike'] = {} for in_id in self._in_ids: # Get interfaces of pattern connecting the current module to # source module `in_id`; `int_1` is connected to the current # module, `int_0` is connected to the other module: pat = self.routing_table[in_id, self.id]['pattern'] int_0 = self.routing_table[in_id, self.id]['int_0'] int_1 = self.routing_table[in_id, self.id]['int_1'] # The buffers must be the same size as the port data arrays of the # modules from which they received data: self._in_buf['gpot'][in_id] = \ gpuarray.empty(len(self.pm_all['gpot'][in_id]), self.pm['gpot'].dtype) self._in_buf_int['gpot'][in_id] = bufint(self._in_buf['gpot'][in_id]) self._in_buf_mtype['gpot'][in_id] = \ dtype_to_mpi(self._in_buf['gpot'][in_id]) self._in_buf['spike'][in_id] = \ gpuarray.empty(len(self.pm_all['spike'][in_id]), self.pm['spike'].dtype) self._in_buf_int['spike'][in_id] = bufint(self._in_buf['spike'][in_id]) self._in_buf_mtype['spike'][in_id] = \ dtype_to_mpi(self._in_buf['spike'][in_id])
def addNewProcessEventHandler(self, newProcParams): """ If a new process is added the parameters will be in the given dict. We need to update all our data structures that depend on K. We can assume that the base model has updated K """ del self.gpuPtrs["impulse_model","nnz_Z"] del self.gpuPtrs["impulse_model","g_suff_stats"] K = self.modelParams["proc_id_model","K"] self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32) self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32)
def DT_GPU(self, X, c): DIM = X.size floatSize = X.dtype.itemsize q = gpuarray.empty(DIM / 2, X.dtype) p = gpuarray.empty(DIM / 2, X.dtype) XNext = gpuarray.empty(DIM, X.dtype) cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM / 2) cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM / 2, floatSize * DIM / 2) qNext = q + c * self.dt * self.dTdp(p) pNext = p cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM / 2) cuda.memcpy_dtod(XNext.ptr + floatSize * DIM / 2, pNext.ptr, floatSize * DIM / 2) return XNext
def DV_GPU(self, X, d): DIM = X.size floatSize = X.dtype.itemsize q = gpuarray.empty(DIM / 2, X.dtype) p = gpuarray.empty(DIM / 2, X.dtype) XNext = gpuarray.empty(DIM, X.dtype) cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM / 2) cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM / 2, floatSize * DIM / 2) qNext = q pNext = p - d * self.dt * self.dVdq(q) cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM / 2) cuda.memcpy_dtod(XNext.ptr + floatSize * DIM / 2, pNext.ptr, floatSize * DIM / 2) return XNext
def concatenate_layers(layers): nthreads_per_block = 1024 context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() #print context queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True) else: raise RuntimeError('API neither CUDA nor OpenCL?!') bvh_funcs = GPUFuncs(bvh_module) # Put 0 at beginning of list layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0) # allocate memory if gpuapi.is_gpu_api_cuda(): nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4) elif gpuapi.is_gpu_api_opencl(): totsize = 0 layer_pos = [] print layer_bounds[-1] for n, layer in enumerate(layers): layer_pos.append(totsize) print "LAYER ", n, " size=", len(layer), "start=", totsize totsize += len(layer) print "totsize: ", totsize nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4) nodes_iter_gpu = ga.to_device(queue, nodes_iter_np) nodeset_np = [] else: raise RuntimeError('API neither CUDA nor OpenCL?!') ilayer = 0 for layer_start, layer_end, layer in zip(layer_bounds[:-1], layer_bounds[1:], layers): if layer_end == layer_bounds[-1]: # leaf nodes need no offset child_offset = 0 else: child_offset = layer_end #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset nmax_blocks = 10000 if gpuapi.is_gpu_api_opencl(): nthreads_per_block = 256 nmax_blocks = 1 for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks): #print " ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start if gpuapi.is_gpu_api_cuda(): bvh_funcs.copy_and_offset(np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(child_offset), cuda.In(layer), nodes[layer_start:], block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) elif gpuapi.is_gpu_api_opencl(): layer_gpu = ga.to_device(queue, layer) bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1), (1, 1, 1), np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(child_offset), np.uint32(layer_start), layer_gpu.data, nodes_iter_gpu.data, g_times_l=True).wait() else: raise RuntimeError('API neither CUDA nor OpenCL?!') ilayer += 1 if gpuapi.is_gpu_api_cuda(): return nodes.get(), layer_bounds elif gpuapi.is_gpu_api_opencl(): return nodes_iter_gpu.get(), layer_bounds
def register_multiple_images_subpix_cuda(stack, template): import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.cumath as cumath import skcuda.fft as cu_fft import skcuda.linalg as lin import skcuda.cublas as cub from numpy import pi, newaxis, floor import cmath from pycuda.elementwise import ElementwiseKernel from pycuda.compiler import SourceModule from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix from numpy.fft import fftshift, ifftshift fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1, use_numpy_fft=False) mod = SourceModule(""" #include <pycuda-complex.hpp>" __global__ void load_convert(unsigned short *a, float *b,int f, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = f * imlen; if (idx <imlen) { b[idx] = (float)a[offset+idx]; } } __global__ void convert_export(float *a, unsigned short *b,int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ; } } __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx] = x[idx] * y[idx]; } } __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = x[idx]._M_re; y[idx]._M_im = -x[idx]._M_im; } } __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = 0; y[idx]._M_im = x[idx] * sx; } } __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen, int nlargeh, int nh) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = imlenl*3/4; if (idx<imlen) { int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl; x[target_ind] = y[idx]; } } __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx]._M_re = 0; z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg; } } __global__ void sub_float(float *x, float *y, float sv, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { x[idx] = y[idx]-sv; } } """) load_convert_kernel = mod.get_function('load_convert') convert_export_kernel = mod.get_function('convert_export') convert_multiply_kernel = mod.get_function('convert_multiply') multiply_float_kernel = mod.get_function('multiply_comp_float') transfer_array_kernel = mod.get_function('transfer_array') calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix') conj_kernel = mod.get_function('calc_conj') sub_float_kernel = mod.get_function('sub_float') Z = stack.shape[0] M = stack.shape[1] N = stack.shape[2] max_memsize = 4200000000 imlen = M * N half_imlen = M * (N // 2 + 1) grid_dim = (64, int(imlen / (512 * 64)) + 1, 1) block_dim = (512, 1, 1) #512 threads per block stack_bin = int(max_memsize / (M * N * stack.itemsize)) stack_ite = int(Z / stack_bin) + 1 usfac = 100 ## needs to be bigger than 10 if not template.shape == stack.shape[1:]: raise ValueError("Images must have same shape.") if np.any(np.isnan(template)): template = template.copy() template[template != template] = 0 if np.any(np.isnan(stack)): stack = stack.copy() stack[stack != stack] = 0 mlarge = M * 2 nlarge = N * 2 t = time.time() plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64) plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32) plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32) cub_h = cub.cublasCreate() template_gpu = gpuarray.to_gpu(template.astype('float32')) source_gpu = gpuarray.empty((M, N), np.float32) ifft_gpu = gpuarray.empty((M, N), np.float32) result_gpu = gpuarray.empty((M, N), np.uint16) templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64) shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64) cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True) templatef_gpu = templatef_gpu.conj() move_list = np.zeros((Z, 2)) largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64) largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32) imlenl = mlarge * (nlarge // 2 + 1) zoom_factor = 1.5 dftshiftG = trunc(ceil(usfac * zoom_factor) / 2) #% Center of output array at dftshift+1 upsample_dim = int(ceil(usfac * zoom_factor)) term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)). T[:, newaxis]) / N # fftfreq # output points term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :] term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis] term2r = (ifftshift(np.arange(M, dtype='float')) - floor(M / 2))[newaxis, :] # fftfreq term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) + 1), :].astype('float32')) term2c_gpu = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu = gpuarray.to_gpu(term1r.astype('float32')) term2r_gpu = gpuarray.to_gpu(term2r.astype('float32')) term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32')) kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32) kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32) kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64) kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64) Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M)) Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N)) [Nc, Nr] = np.meshgrid(Nc, Nr) Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32')) Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32')) upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64) upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64) source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype) copy = drv.Memcpy3D() copy.set_src_host(stack.data) copy.set_dst_device(source_stack.gpudata) copy.width_in_bytes = copy.src_pitch = stack.strides[1] copy.src_height = copy.height = M for zb in range(stack_ite): zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z)) copy.depth = len(zrange) copy.src_z = int(zrange[0]) copy() for i in range(len(zrange)): t = zb * stack_bin + i load_convert_kernel(source_stack, source_gpu.gpudata, np.int32(i), np.int32(imlen), block=block_dim, grid=grid_dim) cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True) multiply_float_kernel(sourcef_gpu, templatef_gpu, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) transfer_array_kernel(largearray1_gpu, prod_gpu1, np.int32(imlenl), np.int32(half_imlen), np.int32(nlarge // 2 + 1), np.int32(N // 2 + 1), block=block_dim, grid=grid_dim) cu_fft.ifft(largearray1_gpu, largearray2_gpu, plan_inverse_big, scale=True) peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size, largearray2_gpu.gpudata, 1) rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape) md2 = trunc(mlarge / 2) nd2 = trunc(nlarge / 2) if rloc > md2: row_shift2 = rloc - mlarge else: row_shift2 = rloc if cloc > nd2: col_shift2 = cloc - nlarge else: col_shift2 = cloc row_shiftG = row_shift2 / 2. col_shiftG = col_shift2 / 2. # Initial shift estimate in upsampled grid row_shiftG0 = round(row_shiftG * usfac) / usfac col_shiftG0 = round(col_shiftG * usfac) / usfac # Matrix multiply DFT around the current shift estimate roffG = dftshiftG - row_shiftG0 * usfac coffG = dftshiftG - col_shiftG0 * usfac sub_float_kernel(term2c_gpu, term2c_gpu_ori, np.float32(coffG / usfac), np.int32(term2c_gpu.size), block=block_dim, grid=grid_dim) sub_float_kernel(term1r_gpu, term1r_gpu_ori, np.float32(roffG), np.int32(term1r_gpu.size), block=block_dim, grid=grid_dim) lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu) lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu) convert_multiply_kernel(kernc_gpu, kernc_gpuc, np.float32(-2 * pi), np.int32(kernc_gpu.size), block=block_dim, grid=grid_dim) convert_multiply_kernel(kernr_gpu, kernr_gpuc, np.float32(-2 * pi / (M * usfac)), np.int32(kernr_gpu.size), block=block_dim, grid=grid_dim) cumath.exp(kernc_gpuc, out=kernc_gpuc) cumath.exp(kernr_gpuc, out=kernr_gpuc) conj_kernel(prod_gpu1, prod_gpu2, np.int32(half_imlen), block=block_dim, grid=grid_dim) lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1) lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2) CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2) rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape) CCGmax = CCG[rlocG, clocG] rlocG = rlocG - dftshiftG #+ 1 # +1 # questionable/failed hack + 1; clocG = clocG - dftshiftG #+ 1 # -1 # questionable/failed hack - 1; row_shiftG = row_shiftG0 + rlocG / usfac col_shiftG = col_shiftG0 + clocG / usfac diffphaseG = arctan2(imag(CCGmax), real(CCGmax)) # Compute registered version of source stack calc_shiftmatrix_kernel(Nr_gpu, Nc_gpu, shiftmatrix, np.float32(row_shiftG * 2 * np.pi), np.float32(col_shiftG * 2 * np.pi), np.float32(diffphaseG), np.int32(half_imlen), block=block_dim, grid=grid_dim) cumath.exp(shiftmatrix, out=shiftmatrix) multiply_float_kernel(sourcef_gpu, shiftmatrix, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse) convert_export_kernel(ifft_gpu, result_gpu, np.int32(imlen), block=block_dim, grid=grid_dim) move_list[t, :] = (row_shiftG, col_shiftG) stack[t, :, :] = result_gpu.get() cub.cublasDestroy(cub_h) return (stack, move_list)
def test_curand_wrappers(self): from pycuda.curandom import get_curand_version if get_curand_version() is None: from pytest import skip skip("curand not installed") generator_types = [] if get_curand_version() >= (3, 2, 0): from pycuda.curandom import ( XORWOWRandomNumberGenerator, Sobol32RandomNumberGenerator, ) generator_types.extend( [XORWOWRandomNumberGenerator, Sobol32RandomNumberGenerator]) if get_curand_version() >= (4, 0, 0): from pycuda.curandom import ( ScrambledSobol32RandomNumberGenerator, Sobol64RandomNumberGenerator, ScrambledSobol64RandomNumberGenerator, ) generator_types.extend([ ScrambledSobol32RandomNumberGenerator, Sobol64RandomNumberGenerator, ScrambledSobol64RandomNumberGenerator, ]) if get_curand_version() >= (4, 1, 0): from pycuda.curandom import MRG32k3aRandomNumberGenerator generator_types.extend([MRG32k3aRandomNumberGenerator]) if has_double_support(): dtypes = [np.float32, np.float64] else: dtypes = [np.float32] for gen_type in generator_types: gen = gen_type() for dtype in dtypes: gen.gen_normal(10000, dtype) # test non-Box-Muller version, if available gen.gen_normal(10001, dtype) if get_curand_version() >= (4, 0, 0): gen.gen_log_normal(10000, dtype, 10.0, 3.0) # test non-Box-Muller version, if available gen.gen_log_normal(10001, dtype, 10.0, 3.0) x = gen.gen_uniform(10000, dtype) x_host = x.get() assert (-1 <= x_host).all() assert (x_host <= 1).all() gen.gen_uniform(10000, np.uint32) if get_curand_version() >= (5, 0, 0): gen.gen_poisson(10000, np.uint32, 13.0) for dtype in dtypes + [np.uint32]: a = gpuarray.empty(1000000, dtype=dtype) v = 10 a.fill(v) gen.fill_poisson(a) tmp = (a.get() == (v - 1)).sum() / a.size # noqa: F841
import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np import skcuda.fft as cu_fft print('Testing fft/ifft..') N = 4096 * 16 batch_size = 16 x = np.asarray(np.random.rand(batch_size, N), np.float32) xf = np.fft.fft(x) y = np.real(np.fft.ifft(xf)) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64) plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size) cu_fft.fft(x_gpu, xf_gpu, plan_forward) y_gpu = gpuarray.empty_like(x_gpu) plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size) cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True) print('Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6)) print('Testing in-place fft..') x = np.asarray( np.random.rand(batch_size, N) + 1j * np.random.rand(batch_size, N), np.complex64) x_gpu = gpuarray.to_gpu(x)
def setup_cuda_fft_resample(n_jobs, W, new_len): """Set up CUDA FFT resampling Parameters ---------- n_jobs : int | str If n_jobs == 'cuda', the function will attempt to set up for CUDA FFT resampling. W : array The filtering function to be used during resampling. If n_jobs='cuda', this function will be shortened (since CUDA assumes FFTs of real signals are half the length of the signal) and turned into a gpuarray. new_len : int The size of the array following resampling. Returns ------- n_jobs : int Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise original n_jobs is passed. cuda_dict : dict Dictionary with the following CUDA-related variables: use_cuda : bool Whether CUDA should be used. fft_plan : instance of FFTPlan FFT plan to use in calculating the FFT. ifft_plan : instance of FFTPlan FFT plan to use in calculating the IFFT. x_fft : instance of gpuarray Empty allocated GPU space for storing the result of the frequency-domain multiplication. x : instance of gpuarray Empty allocated GPU space for the data to resample. W : array | instance of gpuarray This will either be a gpuarray (if CUDA enabled) or np.ndarray. If CUDA is enabled, W will be modified appropriately for use with filter.fft_multiply(). Notes ----- This function is designed to be used with fft_resample(). """ cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None, x_fft=None, x=None, y_fft=None, y=None) n_fft_x, n_fft_y = len(W), new_len cuda_fft_len_x = int((n_fft_x - (n_fft_x % 2)) // 2 + 1) cuda_fft_len_y = int((n_fft_y - (n_fft_y % 2)) // 2 + 1) if n_jobs == 'cuda': n_jobs = 1 if cuda_capable: # try setting up for float64 try: # do the IFFT normalization now so we don't have to later W = gpuarray.to_gpu(W[:cuda_fft_len_x].astype('complex_') / n_fft_y) cuda_dict.update( use_cuda=True, fft_plan=cudafft.Plan(n_fft_x, np.float64, np.complex128), ifft_plan=cudafft.Plan(n_fft_y, np.complex128, np.float64), x_fft=gpuarray.zeros(max(cuda_fft_len_x, cuda_fft_len_y), np.complex128), x=gpuarray.empty(max(int(n_fft_x), int(n_fft_y)), np.float64)) logger.info('Using CUDA for FFT resampling') except Exception: logger.info('CUDA not used, could not instantiate memory ' '(arrays may be too large), falling back to ' 'n_jobs=1') else: logger.info('CUDA not used, CUDA has not been initialized, ' 'falling back to n_jobs=1') return n_jobs, cuda_dict, W
import numpy as np import pycuda.autoinit import pycuda.gpuarray as gpuarray import skcuda.cusolver as solver handle = solver.cusolverDnCreate() x = np.random.randn(1024, 1024) + 1j * np.random.rand(1024, 1024) x = x + x.conj().T # Need to reverse dimensions because CUSOLVER expects column-major matrices: n, m = x.shape x_gpu = gpuarray.to_gpu(x.T.copy()) # Set up output buffers: w = gpuarray.empty(n, dtype=np.double) # Set up parameters params = solver.cusolverDnCreateSyevjInfo() solver.cusolverDnXsyevjSetTolerance(params, 1e-7) solver.cusolverDnXsyevjSetMaxSweeps(params, 15) # Set up work buffers: lwork = solver.cusolverDnZheevj_bufferSize(handle, 'CUSOLVER_EIG_MODE_VECTOR', 'u', n, x_gpu.gpudata, m, w.gpudata, params) workspace_gpu = gpuarray.zeros(lwork, dtype=x.dtype) info = gpuarray.zeros(1, dtype=np.int32) # Compute:
transpose_naive_multi = mod.get_function("transpose_naive_multi") transpose_share = mod.get_function("transpose_share") transpose_share_multi = mod.get_function("transpose_share_multi") transpose_share_multi_conflict = mod.get_function("transpose_share_multi_conflict") TEST = False if TEST: block_size = 32 i = 2 size = i * 32 matrix = np.random.random(size = (size, size)).astype(np.float32) matrix_out_cpu = np.empty_like(matrix).astype(np.float32) matrix_in = gpuarray.to_gpu(matrix) matrix_out_naive = gpuarray.empty((size, size), np.float32) matrix_out_naive_multi = gpuarray.empty((size, size), np.float32) matrix_out_share = gpuarray.empty((size, size), np.float32) matrix_out_share_multi = gpuarray.empty((size, size), np.float32) matrix_out_share_multi_conflict = gpuarray.empty((size, size), np.float32) matrix_out_cpu = np.transpose(matrix) transpose_naive(matrix_out_naive, matrix_in, np.uint32(size), block = (32, 32, 1), grid = (size / block_size, size / block_size, 1 )) transpose_naive_multi(matrix_out_naive_multi, matrix_in, np.uint32(size), block = (32, 8, 1), grid = (size / block_size, size / block_size, 1 )) transpose_share(matrix_out_share, matrix_in, np.uint32(size), block = (32, 32, 1), grid = (size / block_size, size / block_size, 1 )) transpose_share_multi(matrix_out_share_multi, matrix_in, np.uint32(size), block = (32, 8, 1), grid = (size / block_size, size / block_size, 1 ))
def transpose(src): w, h = src.shape result = gpuarray.empty((h, w), dtype=src.dtype) _transpose(result, src) return result
GPU = True NANT = 128 #NANT = 1024 START_JD = 2458000 END_JD = 2458001 INT_TIME = 21600 NSIDE = 512 #NSIDE = 8 times = np.arange(START_JD, END_JD, INT_TIME / aipy.const.s_per_day) a_cpu = np.zeros(shape=(NANT, 12 * NSIDE**2), dtype=np.complex64) a_cpu[:, :] = 1. + 1j aa_cpu = np.empty(shape=(NANT, NANT), dtype=a_cpu.dtype) a_gpu = gpuarray.empty(a_cpu.shape, a_cpu.dtype) aa_gpu = gpuarray.empty((a_cpu.shape[0], a_cpu.shape[0]), a_cpu.dtype) h = skcuda.cublas.cublasCreate() print '# Antennas:', NANT print 'NSIDE:', NSIDE print 'Starting', time.time() for ti, jd in enumerate(times): print ti, '/', len(times) t1 = time.time() if GPU: a_gpu.set(a_cpu) t2 = time.time() #skcuda.cublas.cublasSgemm(h, 'n', 't', a_gpu.shape[0], a_gpu.shape[0], a_gpu.shape[1], 1., a_gpu.gpudata, a_gpu.shape[0], a_gpu.gpudata, a_gpu.shape[0], 0., aa_gpu.gpudata, aa_gpu.shape[0]) skcuda.cublas.cublasCgemm(h, 'n', 't', a_gpu.shape[0], a_gpu.shape[0], a_gpu.shape[1], 1., a_gpu.gpudata,
def select(self, target_flag, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None): '''Return a new GPUPhoton object containing only photons that have a particular bit set in their history word.''' cuda.Context.get_current().synchronize() index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32) cuda.Context.get_current().synchronize() if start_photon is None: start_photon = 0 if nphotons is None: nphotons = self.pos.size - start_photon # First count how much space we need for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.count_photons(np.int32(start_photon + first_photon), np.int32(photons_this_round), np.uint32(target_flag), index_counter_gpu, self.flags, block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) cuda.Context.get_current().synchronize() reduced_nphotons = int(index_counter_gpu.get()[0]) # Then allocate new storage space pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32) t = ga.empty(shape=reduced_nphotons, dtype=np.float32) last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32) flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32) weights = ga.empty(shape=reduced_nphotons, dtype=np.float32) evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32) # And finaly copy photons, if there are any if reduced_nphotons > 0: index_counter_gpu.fill(0) for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.copy_photons(np.int32(start_photon + first_photon), np.int32(photons_this_round), np.uint32(target_flag), index_counter_gpu, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, self.evidx, pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights, evidx, block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) assert index_counter_gpu.get()[0] == reduced_nphotons return GPUPhotonsSlice(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights, evidx)
def __init__(self, photons, ncopies=1, copy_flags=True, copy_triangles=True, copy_weights=True): """Load ``photons`` onto the GPU, replicating as requested. Args: - photons: chroma.Event.Photons Photon state information to load onto GPU - ncopies: int, *optional* Number of times to replicate the photons on the GPU. This is used if you want to propagate the same event many times, for example in a likelihood calculation. The amount of GPU storage will be proportionally larger if ncopies > 1, so be careful. """ nphotons = len(photons) self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3) self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3) self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3) self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32) self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32) self.last_hit_triangles = ga.empty(shape=nphotons * ncopies, dtype=np.int32) if not copy_triangles: self.last_hit_triangles.fill(-1) if not copy_flags: self.flags = ga.zeros(shape=nphotons * ncopies, dtype=np.uint32) else: self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32) if not copy_weights: self.weights = ga.ones_like(self.last_hit_triangles, dtype=np.float32) else: self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32) self.evidx = ga.empty(shape=nphotons, dtype=np.uint32) # Assign the provided photons to the beginning (possibly # the entire array if ncopies is 1 self.pos[:nphotons].set(to_float3(photons.pos)) self.dir[:nphotons].set(to_float3(photons.dir)) self.pol[:nphotons].set(to_float3(photons.pol)) self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32)) self.t[:nphotons].set(photons.t.astype(np.float32)) if copy_triangles: self.last_hit_triangles[:nphotons].set( photons.last_hit_triangles.astype(np.int32)) if copy_flags: self.flags[:nphotons].set(photons.flags.astype(np.uint32)) if copy_weights: self.weights[:nphotons].set(photons.weights.astype(np.float32)) self.evidx[:nphotons].set(photons.evidx.astype(np.uint32)) module = get_cu_module('propagate.cu', options=cuda_options) self.gpu_funcs = GPUFuncs(module) # Replicate the photons to the rest of the slots if needed if ncopies > 1: max_blocks = 1024 nthreads_per_block = 64 for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round), self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, self.evidx, np.int32(ncopies - 1), np.int32(nphotons), block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) # Save the duplication information for the iterate_copies() method self.true_nphotons = nphotons self.ncopies = ncopies
def get_flat_hits(self, gpu_detector, target_flag=(0x1 << 2), nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, no_map=False): '''GPUPhoton objects containing only photons that have a particular bit set in their history word and were detected by a channel.''' cuda.Context.get_current().synchronize() index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32) cuda.Context.get_current().synchronize() if start_photon is None: start_photon = 0 if nphotons is None: nphotons = self.pos.size - start_photon # First count how much space we need for first_photon, photons_this_round, blocks in chunk_iterator( nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.count_photon_hits(np.int32(start_photon + first_photon), np.int32(photons_this_round), np.uint32(target_flag), self.flags, gpu_detector.solid_id_map, self.last_hit_triangles, gpu_detector.detector_gpu, index_counter_gpu, block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) cuda.Context.get_current().synchronize() reduced_nphotons = int(index_counter_gpu.get()[0]) # Then allocate new storage space pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32) t = ga.empty(shape=reduced_nphotons, dtype=np.float32) last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32) flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32) weights = ga.empty(shape=reduced_nphotons, dtype=np.float32) evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32) channels = ga.empty(shape=reduced_nphotons, dtype=np.int32) # And finaly copy hits, if there are any if reduced_nphotons > 0: index_counter_gpu.fill(0) for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.copy_photon_hits( np.int32(start_photon + first_photon), np.int32(photons_this_round), np.uint32(target_flag), gpu_detector.solid_id_map, gpu_detector.detector_gpu, index_counter_gpu, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, self.evidx, pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights, evidx, channels, block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) assert index_counter_gpu.get()[0] == reduced_nphotons pos = pos.get().view(np.float32).reshape((len(pos), 3)) dir = dir.get().view(np.float32).reshape((len(dir), 3)) pol = pol.get().view(np.float32).reshape((len(pol), 3)) wavelengths = wavelengths.get() t = t.get() last_hit_triangles = last_hit_triangles.get() flags = flags.get() weights = weights.get() evidx = evidx.get() channels = channels.get() hitmap = {} return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights, evidx, channels)
def optimize_layer(orig_nodes): bvh_module = get_cu_module('bvh.cu', options=cuda_options, include_source_directory=True) bvh_funcs = GPUFuncs(bvh_module) nodes = ga.to_gpu(orig_nodes) n = len(nodes) areas = ga.empty(shape=n / 2, dtype=np.uint64) nthreads_per_block = 128 min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))), dtype=np.uint64) min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32) update = 10000 skip_size = 1 flag = cutools.mapped_empty(shape=skip_size, dtype=np.uint32) i = 0 skips = 0 swaps = 0 while i < n / 2 - 1: # How are we doing? if i % update == 0: for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): bvh_funcs.pair_area(np.uint32(first_index), np.uint32(elements_this_iter), nodes, areas, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) areas_host = areas.get() #print nodes.get(), areas_host.astype(float) print 'Area of parent layer so far (%d): %1.12e' % ( i * 2, areas_host.astype(float).sum()) print 'Skips: %d, Swaps: %d' % (skips, swaps) test_index = i * 2 blocks = 0 look_forward = min(8192 * 50, n - test_index - 2) skip_this_round = min(skip_size, n - test_index - 1) flag[:] = 0 for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000): bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2), np.uint32(elements_this_iter), np.uint32(test_index), nodes, np.uint32(blocks), min_areas, min_index, cutools.Mapped(flag), block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, skip_this_round)) blocks += nblocks_this_iter #print i, first_index, nblocks_this_iter, look_forward cuda.Context.get_current().synchronize() if flag[0] == 0: flag_nonzero = flag.nonzero()[0] if len(flag_nonzero) == 0: no_swap_required = skip_size else: no_swap_required = flag_nonzero[0] i += no_swap_required skips += no_swap_required continue min_areas_host = min_areas[:blocks].get() min_index_host = min_index[:blocks].get() best_block = min_areas_host.argmin() better_i = min_index_host[best_block] swaps += 1 #print 'swap', test_index+1, better_i assert 0 < better_i < len(nodes) assert 0 < test_index + 1 < len(nodes) bvh_funcs.swap(np.uint32(test_index + 1), np.uint32(better_i), nodes, block=(1, 1, 1), grid=(1, 1)) cuda.Context.get_current().synchronize() i += 1 for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): bvh_funcs.pair_area(np.uint32(first_index), np.uint32(elements_this_iter), nodes, areas, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) areas_host = areas.get() print 'Final area of parent layer: %1.12e' % areas_host.sum() print 'Skips: %d, Swaps: %d' % (skips, swaps) return nodes.get()
gpu_time1 = time.time() # transfer host (CPU) memory to device (GPU) memory a_gpu = gpuarray.to_gpu(a_cpu) d_gpu = gpuarray.to_gpu(d_cpu) kr1_gpu = gpuarray.to_gpu(kr1_cpu) kr2_gpu = gpuarray.to_gpu(kr2_cpu) threshold_bl_gpu = gpuarray.to_gpu(threshold_bl) threshold_donut_gpu = gpuarray.to_gpu(threshold_donut) threshold_h_gpu = gpuarray.to_gpu(threshold_h) threshold_v_gpu = gpuarray.to_gpu(threshold_v) bound1array_gpu = gpuarray.to_gpu(bound1array) bound3array_gpu = gpuarray.to_gpu(bound3array) # create empty gpu array for the result expected_bl_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) expected_donut_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) expected_h_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) expected_v_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) observed_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) bin_bl_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) bin_donut_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) bin_h_gpu = gpuarray.empty( (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32) bin_v_gpu = gpuarray.empty(
def gpuarray_factory(shape, dtype): import pycuda.gpuarray as gpuarray return gpuarray.empty(shape=shape, dtype=dtype)
def test_ndarray_shape(self): gpuarray.empty(np.array(3), np.float32) gpuarray.empty(np.array([3]), np.float32) gpuarray.empty(np.array([2, 3]), np.float32)
b = np.asarray(np.random.rand(n, n), t) start = time.time() c = np.dot(a, b) time_cpu.append(time.time() - start) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) start = time.time() c_gpu = culinalg.dot(a_gpu, b_gpu) time_linalg.append(time.time() - start) a_gpu2 = gpuarray.to_gpu(a) b_gpu2 = gpuarray.to_gpu(b) c_gpu2 = gpuarray.empty((n, n), np.float32) h = cublasCreate() start = time.time() #cublasSgemm(h, 'n', 'n', np.int32(n), np.int32(n), np.int32(n), np.float32(1.0), a_gpu2, np.int32(n), b_gpu2, np.int32(n), np.float32(1.0), c_gpu2, np.int32(n)) cublasSgemm(h, 'n', 'n', a.shape[0], a.shape[0], a.shape[0], 1.0, a_gpu2.gpudata, a.shape[0], b_gpu2.gpudata, a.shape[0], 1.0, c_gpu2.gpudata, a.shape[0]) time_cula.append(time.time() - start) cublasDestroy(h) #cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) MAKE_PLOT = True
def setup_cuda_fft_multiply_repeated(n_jobs, h, n_fft): """Set up repeated CUDA FFT multiplication with a given filter. Parameters ---------- n_jobs : int | str If n_jobs == 'cuda', the function will attempt to set up for CUDA FFT multiplication. h : array The filtering function that will be used repeatedly. n_fft : int The number of points in the FFT. Returns ------- n_jobs : int Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise original n_jobs is passed. cuda_dict : dict Dictionary with the following CUDA-related variables: use_cuda : bool Whether CUDA should be used. fft_plan : instance of FFTPlan FFT plan to use in calculating the FFT. ifft_plan : instance of FFTPlan FFT plan to use in calculating the IFFT. x_fft : instance of gpuarray Empty allocated GPU space for storing the result of the frequency-domain multiplication. x : instance of gpuarray Empty allocated GPU space for the data to filter. h_fft : array | instance of gpuarray This will either be a gpuarray (if CUDA enabled) or ndarray. Notes ----- This function is designed to be used with fft_multiply_repeated(). """ cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None, x_fft=None, x=None) h_fft = rfft(h, n=n_fft) if n_jobs == 'cuda': n_jobs = 1 init_cuda() if _cuda_capable: from pycuda import gpuarray cudafft = _get_cudafft() # set up all arrays necessary for CUDA # try setting up for float64 try: # do the IFFT normalization now so we don't have to later h_fft = gpuarray.to_gpu(h_fft.astype('complex_') / n_fft) cuda_dict.update( use_cuda=True, fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128), ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64), x_fft=gpuarray.empty(len(h_fft), np.complex128), x=gpuarray.empty(n_fft, np.float64)) logger.info('Using CUDA for FFT FIR filtering') except Exception as exp: logger.info('CUDA not used, could not instantiate memory ' '(arrays may be too large: "%s"), falling back to ' 'n_jobs=1' % str(exp)) else: logger.info('CUDA not used, CUDA could not be initialized, ' 'falling back to n_jobs=1') return n_jobs, cuda_dict, h_fft
# for more information on how to get this number for your device MATRIX_SIZE = 2 # create two random square matrices a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32) b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32) # compute reference on the CPU to verify GPU computation c_cpu = np.dot(a_cpu, b_cpu) # transfer host (CPU) memory to device (GPU) memory a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) # create empty gpu array for the result (C = A * B) c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32) # get the kernel code from the template # by specifying the constant MATRIX_SIZE kernel_code = kernel_code_template % {'MATRIX_SIZE': MATRIX_SIZE} # compile the kernel code mod = compiler.SourceModule(kernel_code) # get the kernel function from the compiled module matrixmul = mod.get_function("MatrixMulKernel") # call the kernel on the card matrixmul( # inputs a_gpu,
def initialize_kernel(self): self.kernel_code = """ // Update each cell of the grid // Any live cell with less than two live neighbors dies // Any live cell with two or three live neighbors lives // Any live cell with four or more live neighbors dies // Any dead cell with three neighbors becomes a live cell __global__ void life_step(float *board, float *board2) {{ // Matrix size unsigned int m_size = {}; unsigned int num_cells = {}; // Column index of the element unsigned int x = threadIdx.x + blockIdx.x * blockDim.x; // Row index of the element unsigned int y = threadIdx.y + blockIdx.y * blockDim.y; // Thread ID in the board array unsigned int thread_id = y * m_size + x; // Game of life classically takes place on an infinite grid // I've used a toroidal geometry for the problem // The matrix wraps from top to bottom and from left to right unsigned int above = (thread_id - m_size) % num_cells; unsigned int below = (thread_id + m_size) % num_cells; unsigned int left; if (thread_id % m_size == 0) {{ left = thread_id + m_size - 1; }} else {{ left = thread_id - 1; }} unsigned int right; if (thread_id % m_size == m_size - 1) {{ right = thread_id - m_size + 1; }} else {{ right = thread_id + 1; }} unsigned int above_left; if (thread_id % m_size == 0) {{ above_left = (thread_id - 1) % num_cells; }} else {{ above_left = (thread_id - m_size - 1) % num_cells; }} unsigned int above_right; if (thread_id % m_size == m_size - 1) {{ above_right = (thread_id - blockDim.x * m_size + 1) % num_cells; }} else {{ above_right = (thread_id - m_size + 1) % num_cells; }} unsigned int below_left; if (thread_id % m_size == 0) {{ below_left = (thread_id + blockDim.x * m_size - 1) % num_cells; }} else {{ below_left = (thread_id + m_size - 1) % num_cells; }} unsigned int below_right; if (thread_id % m_size == m_size - 1) {{ below_right = (thread_id + 1) % num_cells; }} else {{ below_right = (thread_id + m_size + 1) % num_cells; }} unsigned int num_neighbors = board[above] + board[below] + board[left] + board[right] + board[above_left] + board[above_right] + board[below_left] + board[below_right]; unsigned int live_and2 = board[thread_id] && (num_neighbors == 2); // Live cell with 2 neighbors unsigned int live_and3 = board[thread_id] && (num_neighbors == 3); // Live cell with 3 neighbors unsigned int dead_and3 = !(board[thread_id]) && (num_neighbors == 3); // Dead cell with 3 neighbors board2[thread_id] = live_and2 || live_and3 || dead_and3; }} """ # Transfer CPU memory to GPU memory self.board_gpu = gpuarray.to_gpu(self.board) self.next_board = gpuarray.empty((self.size, self.size), np.float32) self.kernel = self.kernel_code.format(self.size, self.size * self.size) # Compile kernel code self.mod = SourceModule(self.kernel) # Get kernel function from compiled module self.game = self.mod.get_function('life_step')
def setup_cuda_fft_multiply_repeated(n_jobs, h_fft): """Set up repeated CUDA FFT multiplication with a given filter Parameters ---------- n_jobs : int | str If n_jobs == 'cuda', the function will attempt to set up for CUDA FFT multiplication. h_fft : array The filtering function that will be used repeatedly. If n_jobs='cuda', this function will be shortened (since CUDA assumes FFTs of real signals are half the length of the signal) and turned into a gpuarray. Returns ------- n_jobs : int Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise original n_jobs is passed. cuda_dict : dict Dictionary with the following CUDA-related variables: use_cuda : bool Whether CUDA should be used. fft_plan : instance of FFTPlan FFT plan to use in calculating the FFT. ifft_plan : instance of FFTPlan FFT plan to use in calculating the IFFT. x_fft : instance of gpuarray Empty allocated GPU space for storing the result of the frequency-domain multiplication. x : instance of gpuarray Empty allocated GPU space for the data to filter. h_fft : array | instance of gpuarray This will either be a gpuarray (if CUDA enabled) or np.ndarray. If CUDA is enabled, h_fft will be modified appropriately for use with filter.fft_multiply(). Notes ----- This function is designed to be used with fft_multiply_repeated(). """ cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None, x_fft=None, x=None) n_fft = len(h_fft) cuda_fft_len = int((n_fft - (n_fft % 2)) / 2 + 1) if n_jobs == 'cuda': n_jobs = 1 if cuda_capable: # set up all arrays necessary for CUDA # try setting up for float64 try: # do the IFFT normalization now so we don't have to later h_fft = gpuarray.to_gpu( h_fft[:cuda_fft_len].astype('complex_') / len(h_fft)) cuda_dict.update( use_cuda=True, fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128), ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64), x_fft=gpuarray.empty(cuda_fft_len, np.complex128), x=gpuarray.empty(int(n_fft), np.float64)) logger.info('Using CUDA for FFT FIR filtering') except Exception: logger.info('CUDA not used, could not instantiate memory ' '(arrays may be too large), falling back to ' 'n_jobs=1') else: logger.info('CUDA not used, CUDA has not been initialized, ' 'falling back to n_jobs=1') return n_jobs, cuda_dict, h_fft
def merge_nodes_detailed(nodes, first_child, nchild): '''Merges nodes into len(first_child) parent nodes, using the provided arrays to determine the index of the first child of each parent, and how many children there are.''' nthreads_per_block = 256 context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() #print context queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', context, options=api_options, include_source_directory=True) else: raise RuntimeError('API is neither CUDA nor OpenCL?!') bvh_funcs = GPUFuncs(bvh_module) # Load Memory if gpuapi.is_gpu_api_cuda(): gpu_nodes = ga.to_gpu(nodes) gpu_first_child = ga.to_gpu(first_child.astype(np.int32)) gpu_nchild = ga.to_gpu(nchild.astype(np.int32)) nparent = len(first_child) gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4) elif gpuapi.is_gpu_api_opencl(): gpu_nodes = ga.to_device(queue, nodes) gpu_first_child = ga.to_device(queue, first_child.astype(np.int32)) gpu_nchild = ga.to_device(queue, nchild.astype(np.int32)) nparent = len(first_child) parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4) gpu_parent_nodes = ga.to_device(queue, parent_nodes_np) else: raise RuntimeError('API is neither CUDA nor OpenCL?!') # Run Kernel for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(nparent, nthreads_per_block, max_blocks=10000): if gpuapi.is_gpu_api_cuda(): bvh_funcs.make_parents_detailed(np.uint32(first_index), np.uint32(elements_this_iter), gpu_nodes, gpu_parent_nodes, gpu_first_child, gpu_nchild, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) elif gpuapi.is_gpu_api_opencl(): bvh_funcs.make_parents_detailed(queue, (elements_this_iter, 1, 1), None, np.uint32(first_index), np.uint32(elements_this_iter), gpu_nodes.data, gpu_parent_nodes.data, gpu_first_child.data, gpu_nchild.data).wait() else: raise RuntimeError('API is neither CUDA nor OpenCL?!') return gpu_parent_nodes.get()
import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as cuda import numpy free_bytes, total_bytes = cuda.mem_get_info() exp = 10 while True: fill_floats = free_bytes / 4 - (1 << exp) if fill_floats < 0: raise RuntimeError("couldn't find allocatable size") try: print "alloc", fill_floats ary = gpuarray.empty((fill_floats, ), dtype=numpy.float32) break except: pass exp += 1 ary.fill(float("nan")) print "filled %d out of %d bytes with NaNs" % (fill_floats * 4, free_bytes)
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1, nthreads_per_block=32, max_blocks=16): '''Compute the leaf nodes surrounding a triangle mesh. ``mesh``: chroma.geometry.Mesh Triangles to box ``morton_bits``: int Number of bits to use per dimension when computing Morton code. ``round_to_multiple``: int Round the number of nodes created up to multiple of this number Extra nodes will be all zero. Returns (world_coords, nodes, morton_codes), where ``world_coords``: chroma.bvh.WorldCoords Defines the fixed point coordinate system ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4) List of leaf nodes. Child IDs will be set to triangle offsets. ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64) Morton codes for each triangle, using ``morton_bits`` per axis. Must be <= 16 bits. ''' # it would be nice not to duplicate code, make functions transparent... context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() #print context queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True) bvh_funcs = GPUFuncs(bvh_module) # compute world coordinates world_origin_np = mesh.vertices.min(axis=0) world_scale = np.max( (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2) world_coords = WorldCoords(world_origin=world_origin_np, world_scale=world_scale) # Put triangles and vertices into host and device memory # unfortunately, opencl and cuda has different methods for managing memory here # we have to write divergent code if gpuapi.is_gpu_api_cuda(): # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another. # no explicit requests for transfers here triangles = cutools.mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3, write_combined=True) triangles[:] = to_uint3(mesh.triangles) vertices = cutools.mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3, write_combined=True) vertices[:] = to_float3(mesh.vertices) #print triangles[0:10] #print vertices[0:10] # Call GPU to compute nodes nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4) morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64) # Convert world coords to GPU-friendly types world_origin = ga.vec.make_float3(*world_origin_np) world_scale = np.float32(world_scale) # generate morton codes on GPU for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(len(triangles), nthreads_per_block, max_blocks=30000): bvh_funcs.make_leaves(np.uint32(first_index), np.uint32(elements_this_iter), cutools.Mapped(triangles), cutools.Mapped(vertices), world_origin, world_scale, nodes, morton_codes, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) morton_codes_host = morton_codes.get() >> (16 - morton_bits) elif gpuapi.is_gpu_api_opencl(): # here we need to allocate a buffer on the host and on the device triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3) copy_to_uint3(mesh.triangles, triangles) vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3) copy_to_float3(mesh.vertices, vertices) # now create a buffer object on the device and push data to it triangles_dev = ga.to_device(queue, triangles) vertices_dev = ga.to_device(queue, vertices) # Call GPU to compute nodes nodes = ga.zeros(queue, shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4) morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64) # Convert world coords to GPU-friendly types #world_origin = np.array(world_origin_np,dtype=np.float32) world_origin = np.empty(1, dtype=ga.vec.float3) world_origin['x'] = world_origin_np[0] world_origin['y'] = world_origin_np[1] world_origin['z'] = world_origin_np[2] world_scale = np.float32(world_scale) #print world_origin, world_scale # generate morton codes on GPU for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(len(triangles), nthreads_per_block, max_blocks): print first_index, elements_this_iter, nblocks_this_iter bvh_funcs.make_leaves( queue, (nblocks_this_iter, 1, 1), (nthreads_per_block, 1, 1), #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None, np.uint32(first_index), np.uint32(elements_this_iter), triangles_dev.data, vertices_dev.data, world_origin, world_scale, nodes.data, morton_codes.data, g_times_l=True).wait() morton_codes_host = morton_codes.get() >> (16 - morton_bits) return world_coords, nodes.get(), morton_codes_host
def test_numpy_integer_shape(self): gpuarray.empty(np.int32(17), np.float32) gpuarray.empty((np.int32(17), np.int32(17)), np.float32)
def _diffusion_child(comm, bm=None): rank = comm.Get_rank() ngpus = comm.Get_size() nodename = socket.gethostname() name = '%s %s' % (nodename, rank) print(name) if rank == 0: # split indices on GPUs indices_split = _split_indices(bm.indices, ngpus) print('Indices:', indices_split) # send data to GPUs for k in range(1, ngpus): sendToChild(comm, bm.indices, indices_split[k], k, bm.data, bm.labels, bm.label.nbrw, bm.label.sorw, bm.label.allaxis) # init cuda device cuda.init() dev = cuda.Device(rank) ctx = dev.make_context() # select the desired script if bm.label.allaxis: from pycuda_small_allx import walk else: from pycuda_small import walk # run random walks tic = time.time() walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0], bm.label.nbrw, bm.label.sorw, name) tac = time.time() print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds') # gather data zsh_tmp = bm.argmax_z - bm.argmin_z ysh_tmp = bm.argmax_y - bm.argmin_y xsh_tmp = bm.argmax_x - bm.argmin_x if ngpus > 1: final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) for k in range(bm.nol): sendbuf = np.copy(walkmap[k]) recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) comm.Barrier() comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT], root=0, op=MPI.SUM) final_zero[k] = recvbuf else: final_zero = walkmap # block and grid size block = (32, 32, 1) x_grid = (xsh_tmp // 32) + 1 y_grid = (ysh_tmp // 32) + 1 grid = (int(x_grid), int(y_grid), int(zsh_tmp)) xsh_gpu = np.int32(xsh_tmp) ysh_gpu = np.int32(ysh_tmp) # smooth if bm.label.smooth: try: update_gpu = _build_update_gpu() curvature_gpu = _build_curvature_gpu() a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) except Exception as e: print( 'Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.' ) bm.label.smooth = 0 if bm.label.smooth: final_smooth = np.copy(final_zero) for k in range(bm.nol): a_gpu = gpuarray.to_gpu(final_smooth[k]) for l in range(bm.label.smooth): curvature_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) update_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) final_smooth[k] = a_gpu.get() final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8) final_smooth = get_labels(final_smooth, bm.allLabels) final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_smooth final = final[1:-1, 1:-1, 1:-1] save_data(bm.path_to_smooth, final, bm.header, bm.final_image_type, bm.label.compression) # uncertainty if bm.label.uncertainty: try: max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) kernel_uncertainty = _build_kernel_uncertainty() kernel_max = _build_kernel_max() for k in range(bm.nol): a_gpu = gpuarray.to_gpu(final_zero[k]) kernel_max(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) kernel_uncertainty(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) uq = a_gpu.get() uq *= 255 uq = uq.astype(np.uint8) final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = uq final = final[1:-1, 1:-1, 1:-1] save_data(bm.path_to_uq, final, compress=bm.label.compression) except Exception as e: print( 'Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.' ) bm.label.uncertainty = False # free device ctx.pop() del ctx # argmax final_zero = np.argmax(final_zero, axis=0).astype(np.uint8) # save finals final_zero = get_labels(final_zero, bm.allLabels) final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_zero final = final[1:-1, 1:-1, 1:-1] save_data(bm.path_to_final, final, bm.header, bm.final_image_type, bm.label.compression) # computation time t = int(time.time() - bm.TIC) if t < 60: time_str = str(t) + ' sec' elif 60 <= t < 3600: time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec' elif 3600 < t: time_str = str(t // 3600) + ' h ' + str( (t % 3600) // 60) + ' min ' + str(t % 60) + ' sec' print('Computation time:', time_str) else: data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0) data = np.empty((data_z, data_y, data_x), dtype=data_dtype) if data_dtype == 'uint8': comm.Recv([data, MPI.BYTE], source=0, tag=1) else: comm.Recv([data, MPI.FLOAT], source=0, tag=1) allx, nbrw, sorw = comm.recv(source=0, tag=2) if allx: labels = [] for k in range(3): labels_z, labels_y, labels_x = comm.recv(source=0, tag=k + 3) labels_tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32) comm.Recv([labels_tmp, MPI.INT], source=0, tag=k + 6) labels.append(labels_tmp) else: labels_z, labels_y, labels_x = comm.recv(source=0, tag=3) labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32) comm.Recv([labels, MPI.INT], source=0, tag=6) indices = comm.recv(source=0, tag=9) indices_child = comm.recv(source=0, tag=10) # init cuda device cuda.init() dev = cuda.Device(rank % cuda.Device.count()) ctx = dev.make_context() # select the desired script if allx: from pycuda_small_allx import walk else: from pycuda_small import walk # run random walks tic = time.time() walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name) tac = time.time() print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds') # free device ctx.pop() del ctx # send data for k in range(walkmap.shape[0]): datatemporaer = np.copy(walkmap[k]) comm.Barrier() comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)
A = np.empty((n * batchSize, n), dtype=np.complex64) B = np.empty((n * batchSize, n), dtype=A.dtype) for i in range(batchSize): x = np.random.randn(n, n) + 1j * np.random.randn(n, n) x = x + x.conj().T x = x.astype(np.complex64) A[i * n:(i + 1) * n, :] = x # Need to reverse dimensions because CUSOLVER expects column-major matrices: B[i * n:(i + 1) * n, :] = x.T.copy() x_gpu = gpuarray.to_gpu(B) # Set up output buffers: w_gpu = gpuarray.empty((batchSize, n), dtype=np.float32) # Set up parameters params = solver.cusolverDnCreateSyevjInfo() solver.cusolverDnXsyevjSetTolerance(params, 1e-7) solver.cusolverDnXsyevjSetMaxSweeps(params, 15) # Set up work buffers: lwork = solver.cusolverDnCheevjBatched_bufferSize(handle, 'CUSOLVER_EIG_MODE_VECTOR', 'u', n, x_gpu.gpudata, n, w_gpu.gpudata, params, batchSize) workspace_gpu = gpuarray.zeros(lwork, dtype=A.dtype) info = gpuarray.zeros(batchSize, dtype=np.int32)
def _sub_kmeans_gpu(X, k): import skcuda import skcuda.linalg as LA import pycuda.driver as cuda import pycuda.autoinit import pycuda.gpuarray as gpuarray LA.init() n, d = X.shape X = X.astype(np.float32) V_gpu = random_V(d, mode='gpu') m = d / 2 X_gpu = gpuarray.to_gpu(X) mu_D_gpu = skcuda.misc.mean(X_gpu, axis=0, keepdims=True) sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu) S_D_gpu = LA.dot(sub_gpu, sub_gpu, transa='T') mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)]) itr = 1 assignment_unchanged = 0 C_gpu = None MAX_ITER = 100 while itr < MAX_ITER: Pc_gpu = projection_matrix(d, m, mode='gpu') PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T') PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32) for i in range(k): PcVmu_is_gpu[i] = LA.dot(PcV_gpu, mu_is_gpu[i][:, None]).ravel() global_temp = LA.dot(X_gpu, PcV_gpu, transb='T') if itr % 2 == 0: C_old = C_gpu.get() X_transformed_gpu = gpuarray.empty( (n, k, m), dtype=np.float32) for i in xrange(n): temp = global_temp[i] X_transformed_gpu[i] = skcuda.misc.subtract( PcVmu_is_gpu, temp) X_transformed_squared_gpu = LA.multiply( X_transformed_gpu, X_transformed_gpu) X_transformed_squared_gpu = X_transformed_squared_gpu.reshape( (n * k, m)) X_transformed_sum_gpu = skcuda.misc.sum( X_transformed_squared_gpu, axis=-1, keepdims=True) X_transformed_sum_gpu = X_transformed_sum_gpu.reshape((n, k)) C_gpu = skcuda.misc.argmin( X_transformed_sum_gpu, axis=1) if itr % 2 == 0: Cnew = C_gpu.get() points_changed = np.sum(1 - np.equal(C_old, Cnew).astype(np.uint8)) if points_changed == 0: assignment_unchanged += 1 if assignment_unchanged >= 2: break print('[i] Itr %d: %d points changed' % (itr, points_changed)) C = C_gpu.get() counts = {i: 0 for i in range(k)} mu_is = np.zeros((k, d)).astype(np.float32) for i in range(n): C_id = np.int(C[i]) mu_is[C_id] += X[i] counts[C_id] += 1 mu_is = np.array([mu_is[i] / counts[i] for i in range(k)]) mu_is_gpu = gpuarray.to_gpu(mu_is) S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32) maxv = np.max(counts.values()) storage = np.empty((k, np.int(maxv), d)).astype(np.float32) counter = np.zeros(k, dtype=np.uint32) for i in range(n): C_id = np.int(C[i]) X_minus_mu_isi = (X[i] - mu_is[C_id])[:, None] storage[C_id, np.int(counter[C_id]), :] = X_minus_mu_isi.ravel() counter[C_id] += 1 storage_gpu = gpuarray.to_gpu(storage) for i in range(k): curr_cluster_points = storage_gpu[i, :np.int(counter[i]), :] S_is_gpu[i] = LA.dot(curr_cluster_points, curr_cluster_points, transa='T') S_is_sum_gpu = S_is_gpu.reshape((k, d * d)) S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True) S_is_sum_gpu = S_is_sum_gpu.reshape((d, d)) S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu) w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu') maxVal = min(w) m = np.sum([1 for i in w if i / maxVal > 1e-3]) m = max(1, m) itr += 1 return C_gpu.get(), V_gpu.get(), m
def _allocate_gpu_vectors(self, p0, tile_names, tile_names_map, matches, matches_num): """ Allocates anbd initializes the arrays on the gpu that will be used for the optimization process """ self._matches_num = matches_num self._params_num = p0.shape[0] self._tiles_num = p0.shape[0] // 3 # Allocate the parameters and gradients arrays, and copy the initial parameters self._cur_params_gpu = gpuarray.to_gpu(p0.astype(np.float32)) self._next_params_gpu = gpuarray.empty(p0.shape, np.float32, order='C') self._gradients_gpu = gpuarray.zeros(p0.shape, np.float32, order='C') self._diff_params_gpu = gpuarray.empty(p0.shape, np.float32, order='C') # Allocate and copy matches and indexes mappers - TODO - should be async self._src_matches_gpu = cuda.mem_alloc( int(np.dtype(np.float32).itemsize * 2 * matches_num)) assert (self._src_matches_gpu is not None) self._dst_matches_gpu = cuda.mem_alloc( int(np.dtype(np.float32).itemsize * 2 * matches_num)) assert (self._dst_matches_gpu is not None) self._src_idx_to_tile_idx_gpu = cuda.mem_alloc( int(np.dtype(int).itemsize * matches_num)) assert (self._src_idx_to_tile_idx_gpu is not None) self._dst_idx_to_tile_idx_gpu = cuda.mem_alloc( int(np.dtype(int).itemsize * matches_num)) assert (self._dst_idx_to_tile_idx_gpu is not None) # counter = 0 # for pair_name, pair_matches in matches.items(): # pair_matches_len = len(pair_matches[0]) # cuda.py_memcpy_htoa(self._src_matches_gpu, counter, pair_matches[0].astype(np.float32, order='C')) # cuda.py_memcpy_htoa(self._dst_matches_gpu, counter, pair_matches[1].astype(np.float32, order='C')) # # copy the mapping to tile idx to the gpu TODO - note that the numpy array is reused, so should be careful in async mode # tile_idx = np.empty((pair_matches_len, ), dtype=np.int32) # tile_idx.fill(tile_names_map[pair_name[0]]) # fill with src tile idx # cuda.py_memcpy_htoa(self._src_idx_to_tile_idx_gpu, counter, tile_idx) # tile_idx.fill(tile_names_map[pair_name[1]]) # fill with dst tile idx # cuda.py_memcpy_htoa(self._dst_idx_to_tile_idx_gpu, counter, tile_idx) # counter += pair_matches_len counter = 0 src_matches_all = np.empty((matches_num, 2), dtype=np.float32, order='C') dst_matches_all = np.empty((matches_num, 2), dtype=np.float32, order='C') src_tiles_idxs_all = np.empty((matches_num, ), dtype=np.int32, order='C') dst_tiles_idxs_all = np.empty((matches_num, ), dtype=np.int32, order='C') for pair_name, pair_matches in matches.items(): pair_matches_len = len(pair_matches[0]) src_matches_all[counter:counter + pair_matches_len] = pair_matches[0].astype( np.float32) dst_matches_all[counter:counter + pair_matches_len] = pair_matches[1].astype( np.float32) src_tiles_idxs_all[counter:counter + pair_matches_len] = tile_names_map[pair_name[0]] dst_tiles_idxs_all[counter:counter + pair_matches_len] = tile_names_map[pair_name[1]] counter += pair_matches_len cuda.memcpy_htod(self._src_matches_gpu, src_matches_all) cuda.memcpy_htod(self._dst_matches_gpu, dst_matches_all) cuda.memcpy_htod(self._src_idx_to_tile_idx_gpu, src_tiles_idxs_all) cuda.memcpy_htod(self._dst_idx_to_tile_idx_gpu, dst_tiles_idxs_all) # Allocate memory for the residuals self._residuals_gpu = gpuarray.empty((matches_num, ), np.float32, order='C')
def _sub_kmeans_gpu_custom(X, k): import skcuda import skcuda.linalg as LA import pycuda.driver as cuda import pycuda.autoinit import pycuda.gpuarray as gpuarray import custom_kernels as CC LA.init() CC.init() n, d = X.shape X = X.astype(np.float32) V_gpu = random_V(d, mode='gpu') m = d / 2 X_gpu = gpuarray.to_gpu(X) mu_D_gpu = CC.column_mean(X_gpu) sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu) sub_gpu_T = LA.transpose(sub_gpu) S_D_gpu = CC.matmul(sub_gpu_T, sub_gpu) mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)]) itr = 1 assignment_unchanged = 0 C_gpu = None MAX_ITER = 100 while itr < MAX_ITER: Pc_gpu = projection_matrix(d, m, mode='gpu') PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T') PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32) for i in range(k): PcVmu_is_gpu[i] = LA.dot(PcV_gpu, mu_is_gpu[i][:, None]).ravel() global_temp = LA.dot(X_gpu, PcV_gpu, transb='T') if itr % 2 == 0: C_old = C_gpu.get() C_gpu = CC.argmin_mu_diff(global_temp, PcVmu_is_gpu) if itr % 2 == 0: Cnew = C_gpu.get() points_changed = np.sum(1 - np.equal(C_old, Cnew).astype(np.uint8)) if points_changed == 0: assignment_unchanged += 1 if assignment_unchanged >= 2: break print('[i] Itr %d: %d points changed' % (itr, points_changed)) C = C_gpu.get() counts = {i: 0 for i in range(k)} for i in xrange(n): C_id = np.int(C[i]) counts[C_id] += 1 maxv = np.max(counts.values()) storage = np.zeros((k, np.int(maxv), d)).astype(np.float32) counter = np.zeros(k, dtype=np.uint32) # k for i in range(n): C_id = np.int(C[i]) storage[C_id, np.int(counter[C_id]), :] = X[i].ravel() counter[C_id] += 1 storage_gpu = gpuarray.to_gpu(storage) mu_is_gpu = CC.sum_axis2(storage_gpu) counter_gpu = gpuarray.to_gpu(counter)[:, None] mu_is_gpu = skcuda.misc.divide( mu_is_gpu, counter_gpu.astype(np.float32)) S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32) # k,d,d for i in range(k): storage_gpu[i] = skcuda.misc.subtract(storage_gpu[i], mu_is_gpu[i]) curr_cluster_points = storage_gpu[i, :np.int(counter[i]), :] # |k|,d S_is_gpu[i] = LA.dot(curr_cluster_points, curr_cluster_points, transa='T') S_is_sum_gpu = S_is_gpu.reshape((k, d * d)) S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True) S_is_sum_gpu = S_is_sum_gpu.reshape((d, d)) S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu) w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu') maxVal = min(w) m = np.sum([1 for i in w if i / maxVal > 1e-3]) m = max(1, m) itr += 1 return C_gpu.get(), V_gpu.get(), m