def f_second(i,pi,attribute_values,df): #-----------------CUDA #Cheetah Variables template = Template(f_source) template.DF = np.array(df[pi], dtype = np.int32) template.nrow = df[pi].shape[0] nfeat = df[pi].shape[1] template.ncol = nfeat template.ll = np.array(df[i], dtype=np.int32) phi_i_ = [attribute_values[item] for item in pi] combinations = np.array(list(itertools.product(*phi_i_)), dtype=np.int32) combinations = np.array(combinations.ravel().tolist(), dtype=np.int32) height = combinations.shape[0] template.H = height f_kernel = nvcc_compile(template, "my_f") ##Threads block_x = np.int(height/nfeat) blocksize = (block_x,1,1) gridsize = (1,1) ##Kernel h_res = np.zeros(height/nfeat,dtype=np.float32) d_combinations = gpu.to_gpu(combinations) d_res = gpu.to_gpu(h_res) f_kernel(d_combinations,d_res,block=blocksize,grid=gridsize) ress = d_res.get() ress = list(ress) return sum(ress)
def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1): Vr = V[:,0:r].copy() Wr = W[0:r,:].copy() X_gpu = gpuarray.to_gpu(X) V_gpu = gpuarray.to_gpu(Vr) W_gpu = gpuarray.to_gpu(Wr) #Frobinius norm at previous step B_gpu = linalg.dot(V_gpu, W_gpu) L = linalg.norm(X_gpu-B_gpu)**2 iteration = 0 while 1: #update V V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu)) V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu)) B_gpu = linalg.dot(V_gpu, W_gpu) #update W W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu) W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu) B_gpu = linalg.dot(V_gpu, W_gpu) Lnew = linalg.norm(X_gpu-B_gpu)**2 if abs(Lnew-L) <= tol*(L+1): break else: L = Lnew iteration += 1 if(verbose and iteration%50==0): print "At iteration %i, the loss is %.2f" %(iteration, L) return V_gpu,W_gpu,iteration
def inline_linear_interp(amps, phases, freqs, output, df, flow, imin, start_index): # Note that imin and start_index are ignored in the GPU code; they are only # needed for CPU. if output.precision == 'double': raise NotImplementedError("Double precision linear interpolation not currently supported on CUDA scheme") flow = numpy.float32(flow) texlen = numpy.int32(len(freqs)) fmax = numpy.float32(freqs[texlen-1]) hlen = numpy.int32(len(output)) (fn1, fn2, ftex, atex, ptex, nt, nb) = get_dckernel(hlen) freqs_gpu = gpuarray.to_gpu(freqs) freqs_gpu.bind_to_texref_ext(ftex, allow_offset=False) amps_gpu = gpuarray.to_gpu(amps) amps_gpu.bind_to_texref_ext(atex, allow_offset=False) phases_gpu = gpuarray.to_gpu(phases) phases_gpu.bind_to_texref_ext(ptex, allow_offset=False) fn1 = fn1.prepared_call fn2 = fn2.prepared_call df = numpy.float32(df) g_out = output.data.gpudata lower = zeros(nb, dtype=numpy.int32).data.gpudata upper = zeros(nb, dtype=numpy.int32).data.gpudata fn1((1, 1), (nb, 1, 1), lower, upper, texlen, df, flow, fmax) fn2((nb, 1), (nt, 1, 1), g_out, df, hlen, flow, fmax, texlen, lower, upper) pycbc.scheme.mgr.state.context.synchronize() return output
def main(dtype): from pycuda.elementwise import get_linear_combination_kernel lc_kernel, lc_texrefs = get_linear_combination_kernel(( (True, dtype, dtype), (True, dtype, dtype) ), dtype) for size_exp in range(10, 26): size = 1 << size_exp from pycuda.curandom import rand a = gpuarray.to_gpu(numpy.array(5, dtype=dtype)) x = rand(size, dtype=dtype) b = gpuarray.to_gpu(numpy.array(7, dtype=dtype)) y = rand(size, dtype=dtype) z = gpuarray.empty_like(x) start = drv.Event() stop = drv.Event() start.record() for i in range(20): a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True) b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True) lc_kernel.prepared_call(x._grid, x._block, x.gpudata, y.gpudata, z.gpudata, x.mem_size) stop.record() stop.synchronize() print size, size_exp, stop.time_since(start)
def test_cublasDgemmBatched(self): l, m, k, n = 11, 7, 5, 3 A = np.random.rand(l, m, k).astype(np.float64) B = np.random.rand(l, k, n).astype(np.float64) C_res = np.einsum('nij,njk->nik',A,B) a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.to_gpu(B) c_gpu = gpuarray.empty((l, m, n), np.float64) alpha = np.float64(1.0) beta = np.float64(0.0) a_arr = bptrs(a_gpu) b_arr = bptrs(b_gpu) c_arr = bptrs(c_gpu) cublas.cublasDgemmBatched(self.cublas_handle, 'n','n', n, m, k, alpha, b_arr.gpudata, n, a_arr.gpudata, k, beta, c_arr.gpudata, n, l) assert np.allclose(C_res, c_gpu.get())
def compute_v_without_derivs(self, Xs, Yinvs, Ts): #Turn the parts of omega into gpuarrays Xs = np.require(Xs, dtype = np.double, requirements=['A', 'W', 'O', 'C']) Yinvs = np.require(Yinvs, dtype = np.double, requirements=['A', 'W', 'O', 'C']) Ts = np.require(Ts, dtype = np.double, requirements=['A', 'W', 'O', 'C']) Xs_d = gpuarray.to_gpu(Xs) Yinvs_d = gpuarray.to_gpu(Yinvs) Ts_d = gpuarray.to_gpu(Ts) #Determine N = the number of integer points to sum over # K = the number of different omegas to compute the function at N = self.Sd.size/self.g K = Xs.size/(self.g**2) #Create room on the gpu for the real and imaginary finite sum calculations fsum_reald = gpuarray.zeros(N*K, dtype=np.double) fsum_imagd = gpuarray.zeros(N*K, dtype=np.double) #Turn all scalars into numpy data types Nd = np.int32(N) Kd = np.int32(K) gd = np.int32(self.g) blocksize = (self.tilewidth, self.tileheight, 1) gridsize = (N//self.tilewidth + 1, K//self.tileheight + 1, 1) self.finite_sum_without_derivs(fsum_reald, fsum_imagd, Xs_d, Yinvs_d, Ts_d, self.Sd, gd, Nd, Kd, block = blocksize, grid = gridsize) cuda.Context.synchronize() fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd) fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd) return fsums_real + 1.0j*fsums_imag
def calculate_circuit_graph_vertex_data_device(d_D, d_C, length): logger = logging.getLogger('eulercuda.pyeulertour.calculate_circuit_graph_vertex_data_device') logger.info("started.") mod = SourceModule(""" __global__ void calculateCircuitGraphVertexData( unsigned int * D,unsigned int * C,unsigned int ecount){ unsigned int tid=(blockDim.x*blockDim.y * gridDim.x*blockIdx.y) + (blockDim.x*blockDim.y*blockIdx.x)+(blockDim.x*threadIdx.y)+threadIdx.x; if( tid <ecount) { unsigned int c=D[tid]; atomicExch(C+c,1); } } """) calculate_circuit_graph_vertex_data = mod.get_function('calculateCircuitGraphVertexData') block_dim, grid_dim = getOptimalLaunchConfiguration(length, 512) logger.info('block_dim = %s, grid_dim = %s' % (block_dim, grid_dim)) np_d_D = gpuarray.to_gpu(d_D) np_d_C = gpuarray.to_gpu(d_C) calculate_circuit_graph_vertex_data( np_d_D, np_d_C, np.uintc(length), block=block_dim, grid=grid_dim ) np_d_D.get(d_D) np_d_C.get(d_C) # devdata = pycuda.tools.DeviceData() # orec = pycuda.tools.OccupancyRecord(devdata, block_dim[0] * grid_dim[1]) # logger.info("Occupancy = %s" % (orec.occupancy * 100)) logger.info("Finished. Leaving.") return d_D, d_C
def compare_sampling(self, hist, reps=10): nbins = hist.GetNbinsX(); xaxis = hist.GetXaxis() intg = hist.GetIntegral() cdf_y = np.empty(nbins+1, dtype=float) cdf_x = np.empty_like(cdf_y) cdf_x[0] = xaxis.GetBinLowEdge(1) cdf_y[0] = 0.0 for i in xrange(1,len(cdf_x)): cdf_y[i] = intg[i] cdf_x[i] = xaxis.GetBinUpEdge(i) cdf_x_gpu = gpuarray.to_gpu(cdf_x.astype(np.float32)) cdf_y_gpu = gpuarray.to_gpu(cdf_y.astype(np.float32)) block =(128,1,1) grid = (128, 1) out_gpu = gpuarray.empty(shape=int(block[0]*grid[0]), dtype=np.float32) out_h = ROOT.TH1D('out_h', '', hist.GetNbinsX(), xaxis.GetXmin(), xaxis.GetXmax()) out_h.SetLineColor(ROOT.kGreen) for i in xrange(reps): self.test_sample_cdf(np.int32(i), np.int32(len(cdf_x_gpu)), cdf_x_gpu, cdf_y_gpu, out_gpu, block=block, grid=grid) out = out_gpu.get() for v in out: out_h.Fill(v) prob = out_h.KolmogorovTest(hist) return prob, out_h
def cache_z(self, z): x = np.require(z.real, dtype = np.double, requirements = ['A','W','O','C']) y = np.require(z.imag, dtype = np.double, requirements = ['A','W','O','C']) xd = gpuarray.to_gpu(x) yd = gpuarray.to_gpu(y) cuda.memcpy_dtod(self.xd, xd.ptr, xd.nbytes) cuda.memcpy_dtod(self.yd, yd.ptr, yd.nbytes)
def gpu_apply_row_max(X): """ max(X) = y across the rows returns the gpuarray, y """ if type(X)==GPUArray: gX = X else: gX = to_gpu(np.asarray(X, dtype=np.float32)) dims = np.asarray(X.shape, dtype=np.int32) gy = to_gpu(np.zeros(dims[0], dtype=np.float32)) giy = to_gpu(np.zeros(dims[0], dtype=np.int32)) if devinfo.max_block_threads >= 1024: blocksize = 32 else: blocksize = 16 gridsize = int(dims[0] / blocksize) + 1 if gX.flags.c_contiguous: func = CUDA_Kernels.get_function("apply_rows_max") shared = 4*blocksize*(blocksize+2) # pad for bank conflicts else: func = CUDA_Kernels.get_function("apply_rows_max_cm") shared = 4*blocksize*(blocksize+1) func(gX, gy, giy, dims[0], dims[1], block=(blocksize, blocksize,1), grid = (gridsize,1), shared = shared) return gy, giy
def gpu_sweep_col_mult(X, y): """ X * y = X across the columns """ if type(X)==GPUArray: gX = X else: gX = to_gpu(np.asarray(X, dtype=np.float32)) if type(y)==GPUArray: gy = y else: gy = to_gpu(np.asarray(y, dtype=np.float32)) dims = np.asarray(X.shape, dtype=np.int32) if devinfo.max_block_threads >= 1024: blocksize = 32 else: blocksize = 16 gridsize = int(dims[0] / blocksize) + 1 shared = 4*blocksize if gX.flags.c_contiguous: func = CUDA_Kernels.get_function("sweep_columns_mult") else: func = CUDA_Kernels.get_function("sweep_columns_mult_cm") func(gX, gy, dims[0], dims[1], block=(blocksize, blocksize,1), grid = (gridsize,1), shared = shared) if type(y)!=GPUArray: X = gX.get()
def __init__(self, name, type, epsW, epsB, initW, initB, momW, momB, wc, weight, bias, weightIncr , biasIncr, disableBprop = False): Layer.__init__(self, name, type, disableBprop) self.epsW = F(epsW) self.epsB = F(epsB) self.initW = initW self.initB = initB self.momW = F(momW) self.momB = F(momB) self.wc = F(wc) if weight is not None: self.weight = gpuarray.to_gpu(weight)#.astype(np.float32) else: self.weight = None if bias is not None: self.bias = gpuarray.to_gpu(bias).astype(np.float32) else: self.bias = None if self.momW > 0.0: if weightIncr is not None: self.weightIncr = gpuarray.to_gpu(weightIncr) else: self.weightIncr = None if biasIncr is not None: self.biasIncr = gpuarray.to_gpu(biasIncr) else: self.biasIncr = None
def gpu_sweep_row_div(X, y): """ X / y = X down the rows """ if type(X)==GPUArray: gX = X else: gX = to_gpu(np.asarray(X, dtype=np.float32)) if type(y)==GPUArray: gy = y else: gy = to_gpu(np.asarray(y, dtype=np.float32)) dims = np.asarray(X.shape, dtype=np.int32) if devinfo.max_block_threads >= 1024: blocksize = 32 else: blocksize = 16 gridsize = int(dims[0] / blocksize) + 1 shared = int(4*dims[1]) if gX.flags.c_contiguous: func = CUDA_Kernels.get_function("sweep_rows_div") else: func = CUDA_Kernels.get_functions("sweep_rows_div_cm") func(gX, gy, dims[0], dims[1], block=(blocksize, blocksize,1), grid = (gridsize,1), shared = shared) if type(y)!=GPUArray: X = gX.get()
def _pre_run(self): assert(self.LPU_obj) assert(all([var in self.memory_manager.variables for var in self.variables.keys()])) for var, d in self.variables.items(): v_dict = self.memory_manager.variables[var] if not d['uids']: uids = v_dict['uids'].keys() inds = v_dict['uids'].values() o = np.argsort(inds) d['uids'] = [uids[i] for i in o] self.src_inds[var] = garray.to_gpu(np.arange(len(d['uids']))) else: uids = [] inds = [] for uid in d['uids']: try: inds.append(v_dict['uids'][uid]) uids.append(uid) except: pass inds = np.array(inds,np.int32) o = np.argsort(inds) self.src_inds[var] = garray.to_gpu(inds[o]) d['uids'] = [uids[i] for i in o] self._d_output[var] = garray.empty(len(d['uids']), v_dict['buffer'].dtype) d['output']=np.zeros(len(d['uids']), v_dict['buffer'].dtype) self.pre_run()
def _init_weights(self, weight_shape, bias_shape): if self.weight is None: if self.name == 'noise': assert(weight_shape[0] == weight_shape[1]) self.weight = gpuarray.to_gpu(np.eye(weight_shape[0], dtype = np.float32)) else: self.weight = gpuarray.to_gpu(randn(weight_shape, np.float32) * self.initW) if self.bias is None: if self.initB > 0.0: self.bias = gpuarray.to_gpu((np.ones(bias_shape, dtype=np.float32) * self.initB)) else: self.bias = gpuarray.zeros(bias_shape, dtype=np.float32) Assert.eq(self.weight.shape, weight_shape) Assert.eq(self.bias.shape, bias_shape) self.weightGrad = gpuarray.zeros_like(self.weight) self.biasGrad = gpuarray.zeros_like(self.bias) if self.momW > 0.0: if self.weightIncr is None: self.weightIncr = gpuarray.zeros_like(self.weight) if self.biasIncr is None: self.biasIncr = gpuarray.zeros_like(self.bias) Assert.eq(self.weightIncr.shape, weight_shape) Assert.eq(self.biasIncr.shape, bias_shape)
def compute_v_without_derivs(self, Z): #Turn the numpy set Z into gpuarrays x = Z.real y = Z.imag x = np.require(x, dtype = np.double, requirements=['A','W','O','C']) y = np.require(y, dtype = np.double, requirements=['A','W','O','C']) xd = gpuarray.to_gpu(x) yd = gpuarray.to_gpu(y) self.yd = yd #Detemine N = the number of integer points to sum over and # K = the number of values to compute the function at N = self.Sd.size/self.g K = Z.size/self.g #Create room on the gpu for the real and imaginary finite sum calculations fsum_reald = gpuarray.zeros(N*K, dtype=np.double) fsum_imagd = gpuarray.zeros(N*K, dtype=np.double) #Make all scalars into numpy data types Nd = np.int32(N) Kd = np.int32(K) gd = np.int32(self.g) blocksize = (self.tilewidth, self.tileheight, 1) gridsize = (N//self.tilewidth + 1, K//self.tileheight + 1, 1) self.finite_sum_without_derivs(fsum_reald, fsum_imagd, xd, yd, self.Sd, gd, Nd, Kd, block = blocksize, grid = gridsize) cuda.Context.synchronize() fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd) fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd) return fsums_real + 1.0j*fsums_imag
def test_neural_net_regression(self): for _ in range(20): N = 10000 # Number of data points D = 100 # Dimensionality of exogenous data P = 50 # Dimensionality of endogenous data W_true = 10 * np.random.rand(D, P) - 5 b_true = 100 * np.random.rand(P) - 50 X = np.random.randn(N, D) Y = np.dot(X, W_true) + b_true[np.newaxis, :] + np.random.randn(N, P) W_lstsq = np.linalg.lstsq(np.c_[np.ones((N, 1)), X], Y)[0] b_lstsq = W_lstsq[0] W_lstsq = W_lstsq[1:] data_provider = BatchDataProvider(gpuarray.to_gpu(X.astype(np.float32), allocator=memory_pool.allocate), gpuarray.to_gpu(Y.astype(np.float32), allocator=memory_pool.allocate)) model = NeuralNetRegression([], n_in=D, n_out=P) optimizer = SGD(model, SimpleSGDUpdate, data_provider, data_provider, learning_rate_schedule=constant_scheduler(10.), early_stopping=True) optimizer.run(100) self.assertLess(np.abs(W_lstsq - model.top_layer.W.get()).max(), 1e-5)
def project(self, fs_gpu, lamda=1e2, eta=0., alpha = 2/3, sparsity=None, maxfun=20, mode=None, show=False): """ Estimates weighting vector (w s.t. w>=0) by minimizing min |B*w - f|^2 + lamda * |w|^0.66 + eta * |grad(w)|^2 """ if fs_gpu.__class__ != np.ndarray: fs = fs_gpu.get() else: fs = fs_gpu # Naive projection wproj = self.basis_host.dot(fs.reshape((fs.size,1))).astype(np.float32) #wproj /= float(self._intern_shape[1]) wproj = np.reshape(wproj, self.shape) if sparsity: wproj = imagetools.sparsify(wproj, sparsity) if mode == None: return cua.to_gpu(wproj) elif mode == 'HPQ_L1': if fs_gpu.__class__ == np.ndarray: self.data_gpu = cua.to_gpu(fs_gpu) else: self.data_gpu = fs_gpu
def _init(self, pn, qn, wf_qn): super()._init(pn, qn, wf_qn) self._wf_q_grid_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf_q_grid)) self._wf_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf)) mod = SourceModule(""" __global__ void transform(double *ps, double *qs, double *wf_q_grid, double *wf, double *out_real, double *out_imag) {{ int idx_x = threadIdx.x + blockIdx.x * blockDim.x; int idx_y = threadIdx.y + blockIdx.y * blockDim.y; int idx = idx_x + idx_y * {qn}; double qdiff, prefactor, s, c; if (idx_x >= {qn} || idx_y >= {pn}) return; for (int j = 0; j < {wf_qn}; j++) {{ qdiff = wf_q_grid[j] - qs[idx]; prefactor = exp({g} * qdiff * qdiff) * wf[j]; sincos({h} * ps[idx] * qdiff, &s, &c); out_real[idx] += prefactor * c; out_imag[idx] += prefactor * s; }} }} """.format(g=-0.5*self._gamma, h=1./HBAR, pn=pn, qn=qn, wf_qn=wf_qn)) self._kernel = mod.get_function('transform') self._kernel.prepare('PPPPPP') self._gpu_grid, self._gpu_block = carve_array(qn, pn)
def add_batch(self, X, T, wc=None): """Add a batch of training data to an iterative solution, weighted if neeed. The batch is processed as a whole, the training data is splitted in `ELM.add_data()` method. With parameters HH_out, HT_out, the output will be put into these matrices instead of model. Args: X (matrix): input data matrix size (N * `inputs`) T (matrix): output data matrix size (N * `outputs`) wc (vector): vector of weights for data samples, one weight per sample, size (N * 1) HH_out, HT_out (matrix, optional): output matrices to add batch result into, always given together """ devH = self._project(X, dev=True) T = np.array(T, order="C", dtype=self.precision) devT = gpuarray.to_gpu(T) if wc is not None: # apply weights if given w = np.array(wc**0.5, dtype=self.precision)[:, None] # re-shape to column matrix devWC = gpuarray.to_gpu(w) misc.mult_matvec(devH, devWC, axis=0, out=devH) misc.mult_matvec(devT, devWC, axis=0, out=devT) if self.HH is None: # initialize space for self.HH, self.HT self.HT = misc.zeros((self.L, self.outputs), dtype=self.precision) self.HH = linalg.eye(self.L, dtype=self.precision) self.HH *= self.norm linalg.add_dot(devH, devT, self.HT, transa='T') if self.precision is np.float64: linalg.add_dot(devH, devH, self.HH, transa='T') else: cublas.cublasSsyrk(self.handle, 'L', 'N', self.L, X.shape[0], 1, devH.ptr, self.L, 1, self.HH.ptr, self.L)
def parameters(self, value): """Update the parameters. ``value`` must have the shape ``(weights, biases)``""" self.W = value[0] if isinstance(value[0], GPUArray) else \ gpuarray.to_gpu(value[0]) self.b = value[1] if isinstance(value[0], GPUArray) else \ gpuarray.to_gpu(value[1])
def generate(self, width, height, real_axis_range, imag_axis_range, tasks): if not is_gpu_accelerated(): self._logger.error( 'No GPU acceleration is available, please use CPU.') return iterations = np.empty(width * height, np.int32) iterations_gpu = gpuarray.to_gpu(iterations) z_values = np.empty(width * height, np.float32) z_values_gpu = gpuarray.to_gpu(z_values) cmin = complex(real_axis_range[0], imag_axis_range[0]) cmax = complex(real_axis_range[1], imag_axis_range[1]) dc = cmax - cmin dx, mx = divmod(width, self._block_size[0]) dy, my = divmod(height, self._block_size[1]) grid_size = ((dx + (mx > 0)), (dy + (my > 0))) self._get_pixel_iterations( iterations_gpu, z_values_gpu, np.int32(width), np.int32(height), np.complex64(cmin), np.complex64(dc), block=self._block_size, grid=grid_size) return (iterations_gpu, z_values_gpu, abs(dc))
def cuda_dot3(A, b): print("cuda_dot3", A.shape, b.shape) # send b to GPU b_gpu = gpuarray.to_gpu(b) # transpose b on GPU bt_gpu = linalg.transpose(b_gpu) #remove b for now b_gpu.gpudata.free() del(b_gpu) # send A to GPU A_gpu = gpuarray.to_gpu(A) temp_gpu = linalg.dot(bt_gpu, A_gpu) bt_gpu.gpudata.free() del(bt_gpu) A_gpu.gpudata.free() del(A_gpu) # send b to GPU b_gpu = gpuarray.to_gpu(b) c_gpu = linalg.dot(temp_gpu, b_gpu) temp_gpu.gpudata.free() del(temp_gpu) b_gpu.gpudata.free() del(b_gpu) #theoretically possible to move into RAM, force cleanup on GPU and then return from RAM #but most likely not necessary return c_gpu.get()
def set_by_inds(self, inds, data): """ Set mapped data by integer indices. Parameters ---------- inds : sequence of int Integer indices of data elements to update. data : numpy.ndarray Data to assign. """ assert len(np.shape(inds)) == 1 assert issubclass(inds.dtype.type, numbers.Integral) N = len(inds) assert N == len(data) if not isinstance(inds, gpuarray.GPUArray): inds = gpuarray.to_gpu(inds) if not isinstance(data, gpuarray.GPUArray): data = gpuarray.to_gpu(data) # Allocate data array if it doesn't exist: if not self.data: self.data = gpuarray.empty(N, data.dtype) else: assert self.data.dtype == data.dtype try: func = self.set_by_inds.cache[inds.dtype] except KeyError: inds_ctype = tools.dtype_to_ctype(inds.dtype) v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(data_ctype=self.data_ctype, inds_ctype=inds_ctype) func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]") self.set_by_inds.cache[inds.dtype] = func func(self.data, inds, data, range=slice(0, N, 1))
def main(): import numpy as np import pycuda.autoinit from pycuda import gpuarray from skdata import toy from hebel import memory_pool from hebel.data_providers import BatchDataProvider from hebel.models import NeuralNetRegression from hebel.optimizers import SGD from hebel.parameter_updaters import SimpleSGDUpdate from hebel.monitors import SimpleProgressMonitor from hebel.schedulers import exponential_scheduler # Get data data_cpu, targets_cpu = toy.Boston().regression_task() data = gpuarray.to_gpu(data_cpu.astype(np.float32), allocator=memory_pool.allocate) targets = gpuarray.to_gpu(targets_cpu.astype(np.float32), allocator=memory_pool.allocate) data_provider = BatchDataProvider(data, targets) # Create model object model = NeuralNetRegression(n_in=data_cpu.shape[1], n_out=targets_cpu.shape[1], layers=[100], activation_function='relu') # Create optimizer object optimizer = SGD(model, SimpleSGDUpdate, data_provider, data_provider, learning_rate_schedule=exponential_scheduler(.1, .9999), early_stopping=True) optimizer.run(3000)
def _initialize_gpu_ds(self): """ Setup GPU arrays. """ self.synapse_state = garray.zeros(int(self.total_synapses) + \ len(self.input_neuron_list), np.float64) if self.my_num_gpot_neurons>0: self.V = garray.zeros(int(self.my_num_gpot_neurons), np.float64) else: self.V = None if self.my_num_spike_neurons>0: self.spike_state = garray.zeros(int(self.my_num_spike_neurons), np.int32) if len(self.public_gpot_list)>0: self.public_gpot_list_g = garray.to_gpu(self.public_gpot_list) self.projection_gpot = garray.zeros(len(self.public_gpot_list), np.double) self._extract_gpot = self._extract_projection_gpot_func() if len(self.public_spike_list)>0: self.public_spike_list_g = garray.to_gpu( \ (self.public_spike_list-self.spike_shift).astype(np.int32)) self.projection_spike = garray.zeros(len(self.public_spike_list), np.int32) self._extract_spike = self._extract_projection_spike_func()
def cuda_ageSols(sols): """ makes solutions to age """ #get num sols num_sols = len(sols); #convert to form of numpy arrays sols_arr = numpy.array(sols, numpy.float32); ones_arr = numpy.zeros_like(sols,numpy.float32); ones_arr[:,constants.AGE_GENE] = 1; #copy each to gpu sols_gpu = gpuarray.to_gpu(sols_arr); mask_gpu = gpuarray.to_gpu(ones_arr); #debug if debug == True: print mask_gpu.view(); #apply mask aged_sols_gpu = sols_gpu + mask_gpu; sols = aged_sols_gpu.get().tolist();
def test_set_by_inds_from_inds(self): dest_gpu = gpuarray.to_gpu(np.zeros(5, dtype=np.float32)) ind_dest = gpuarray.to_gpu(np.array([0, 2, 4])) src_gpu = gpuarray.to_gpu(np.arange(5, 10, dtype=np.float32)) ind_src = gpuarray.to_gpu(np.array([2, 3, 4])) gpu.set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src) assert np.allclose(dest_gpu.get(), np.array([7, 0, 8, 0, 9], dtype=np.float32))
def main_no_tex(dtype): lc_kernel = get_lin_comb_kernel_no_tex(( (True, dtype, dtype), (True, dtype, dtype) ), dtype) for size_exp in range(10,26): size = 1 << size_exp from pycuda.curandom import rand a = gpuarray.to_gpu(numpy.array(5, dtype=dtype)) x = rand(size, dtype=dtype) b = gpuarray.to_gpu(numpy.array(7, dtype=dtype)) y = rand(size, dtype=dtype) z = gpuarray.empty_like(x) start = drv.Event() stop = drv.Event() start.record() for i in range(20): lc_kernel.prepared_call(x._grid, x._block, a.gpudata, x.gpudata, b.gpudata, y.gpudata, z.gpudata, x.mem_size) stop.record() stop.synchronize() print size, size_exp, stop.time_since(start)
def add_neurons(self, number, func, W, B): """Add prepared neurons to the SLFN, merge with existing ones. Adds a number of specific neurons to SLFN network. Weights and biases must be provided for that function. If neurons of such type already exist, they are merged together. Args: number (int): the number of new neurons to add func (str): transformation function of hidden layer. Linear function creates a linear model. W (matrix): a 2-D matrix of neuron weights, size (`inputs` * `number`) B (vector): a 1-D vector of neuron biases, size (`number` * 1) """ ntypes = [nr[1] for nr in self.neurons] # existing types of neurons if func in ntypes: # add to an existing neuron type i = ntypes.index(func) nn0, _, devW, devB = self.neurons[i] number = nn0 + number devW = gpuarray.to_gpu(np.hstack((devW.get(), W))) devB = gpuarray.to_gpu(np.hstack((devB.get(), B))) self.neurons[i] = (number, func, devW, devB) else: # create a new neuron type devW = gpuarray.to_gpu(W) devB = gpuarray.to_gpu(B) self.neurons.append((number, func, devW, devB)) self.reset() self.B = None
def copy_to_GPU(self): self.data_on_GPU = gpuarray.to_gpu(self.data.astype(np.float32))
def dpLSA(lamda, theta, p, X, denominators, shape_image, N, M, K, iters, r1, r2): lamda_gpu = to_gpu(lamda) theta_gpu = to_gpu(theta) p_gpu = to_gpu(p) X_gpu = to_gpu(X) den_gpu = to_gpu(denominators) steps = int(ceil(N / 1024.0)) stepsP = int(ceil(K / 50.0)) EStep = kernels.get_function("EStepDPLSA") LamdaComputing = kernels.get_function("Lamda") ThetaComputing = kernels.get_function("Theta1") CalculeDiv = kernels.get_function("Theta2") ThetaDivision = kernels.get_function("Theta3") pycuda.driver.Context.synchronize() inicio = time.time() for i in range(iters): EStep(p_gpu, theta_gpu, lamda_gpu, np.uint32(stepsP), np.uint32(K), block=(1, 1, 50), grid=(N, M, 1)) ThetaComputing(theta_gpu, X_gpu, p_gpu, np.float32(r1), np.uint32(steps), np.uint32(N), np.uint32(M), den_gpu, block=(1024, 1, 1), grid=(1, M, K)) CalculeDiv(theta_gpu, den_gpu, block=(1, M, 1), grid=(1, 1, K)) ThetaDivision(theta_gpu, den_gpu, block=(1, 1, 1), grid=(1, M, K)) LamdaComputing(lamda_gpu, X_gpu, p_gpu, np.uint32(K), np.float32(r2), block=(1, M, 1), grid=(N, 1, K)) pycuda.driver.Context.synchronize() endTime = time.time() - inicio abundances = lamda_gpu.get().reshape(shape_image[0], shape_image[1], K) endmembers = theta_gpu.get().T lamda_gpu.gpudata.free() theta_gpu.gpudata.free() X_gpu.gpudata.free() p_gpu.gpudata.free() den_gpu.gpudata.free() return (abundances, endmembers, endTime)
def create_from_numpy(self, arr): return gpuarray.to_gpu(arr.astype(self.dtype))
def get_seeds(n): return gpuarray.to_gpu(np.ones(n, np.int32) * seed)
from time import time from pycuda.compiler import SourceModule #%% def set_module(): return SourceModule( "__global__ void scalar_multiply_kernel( int *outvect, int scalar,int *vec){int j = threadIdx.x;outvect[j] = scalar*vec[j];}" ) #%% ker = set_module() smg = ker.get_function("scalar_multiply_kernel") t_vec = np.random.randint(low=0, high=3, size=1500).astype(np.int32) t_vec_gpu = gpuarray.to_gpu(t_vec) o_vec = gpuarray.empty_like(t_vec_gpu) smg(o_vec, np.int32(2), t_vec_gpu, block=(150, 1, 1), grid=(int(1500 / 150), 1, 1)) print(np.allclose(o_vec.get(), 2 * t_vec)) #%% print(o_vec.get()) print(2 * t_vec) #%%
h_a = h_a.astype(np.float32) h_b = h_b.astype(np.float32) h_c = np.empty_like(h_a) mod = SourceModule(""" __global__ void deviceElementwise(float * __restrict__ d_c, const float * __restrict__ d_a, const float * __restrict__ d_b, const int N) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= N) return; d_c[tid] = sqrt(fabs(d_a[tid])) + exp(d_b[tid]); } """) d_a = gpuarray.to_gpu(h_a) d_b = gpuarray.to_gpu(h_b) d_c = gpuarray.zeros_like(d_a) # --- Define a reference to the __global__ function and call it deviceElementwise = mod.get_function("deviceElementwise") blockDim = (BLOCKSIZE, 1, 1) gridDim = (iDivUp(N, BLOCKSIZE), 1, 1) start.record() deviceElementwise(d_c, d_a, d_b, np.int32(N), block=blockDim, grid=gridDim) end.record() end.synchronize() secs = start.time_till(end) * 1e-3 print("Processing time = %fs" % (secs)) h_c = d_c.get()
bestcost = c; viable = via1; } } V[di] = bestcost; } """ % (STEER_LIMIT_K, NANGLES, NSPEEDS, GRID_RES, xsize, ysize, VMIN, VMAX, AxMAX, AyMAX, TIMESTEP, homex, homey)) valueiter = mod.get_function("valueiter") V0_gpu = cuda.mem_alloc(NSPEEDS * NANGLES * xsize * ysize * 4) V = np.zeros((NSPEEDS, NANGLES, ysize, xsize), np.float32) + 1000. cuda.memcpy_htod(V0_gpu, V) ye_in = gpuarray.to_gpu(ye) del ye s = trange(20) v0 = np.sum(V, dtype=np.float64) for j in s: for i in range(20): valueiter(V0_gpu, V0_gpu, ye_in, block=(16, 8, 1), grid=(xsize // 16, ysize // 8, NANGLES * NSPEEDS)) cuda.memcpy_dtoh(V, V0_gpu) v1 = np.sum(V, dtype=np.float64) dv = v1 - v0 v0 = v1
def dwt2d(self, input_array, depth=1, gpu_output=False, gpu_allocator=cuda.mem_alloc): """ dwt2d Perform a 2D forward discrete wavelet transform. If an array has more than 2 dimensions, this function will perform a batched transform. Parameters ---------- input_array: Numpy array Input array array of data to be transformed. If the array is 2D, this function will perform a 2D forward DWT on the data. If the array has more than 2 dimensions, this function will perform a batched 2D forward DWT for the last 2 dimensions of the array. depth: int, optional Depth level of transform; must be greater than or equal to 0 (1 is the default) gpu_output: bool, optional If True, then return the coefficients as a list of GPUArray objects. (False is the default) gpu_allocator: callable, optional Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default) Returns ------- list of Numpy arrays or GPUArray objects A list of numpy arrays of the DWT coefficients either in float32 or float64, depending on whether use_float32 was set to True or False for PycudaWaveletTransform. The coefficients are organized as [c_lo_lo, (c_lo_hi_depth, c_hi_lo_depth, c_hi_hi_depth), ... , (c_lo_hi_1, c_hi_lo_1, c_hi_hi_1)] """ if not isinstance(input_array, numpy.ndarray) and not isinstance( input_array, gpuarray.GPUArray): raise TypeError('invalid input array type') if depth < 0: raise RuntimeError('invalid depth value') elif depth == 0: return [input_array] in_shape = input_array.shape if len(in_shape) < 2: raise RuntimeError('input array must have 2 or more dimensions') num_cols = int(in_shape[-1]) num_rows = int(in_shape[-2]) num_slices = int(1) if len(in_shape) > 2: for i in in_shape[:-2]: num_slices *= i # Get size of output arrays out_cols = [ int((num_cols + 1) / 2 if num_cols % 2 else (num_cols / 2)) ] out_rows = [ int((num_rows + 1) / 2 if num_rows % 2 else (num_rows / 2)) ] for i in range(depth - 1): pc = out_cols[-1] nc = int((pc + 1) / 2 if pc % 2 else (pc / 2)) out_cols.append(int(nc)) pr = out_rows[-1] nr = int((pr + 1) / 2 if pr % 2 else (pr / 2)) out_rows.append(int(nr)) # If input is in Numpy array, then copy it to a GPUArray object. if isinstance(input_array, numpy.ndarray): cont_input_array = numpy.ascontiguousarray(input_array, dtype=self._dtype) input_device_array = gpuarray.to_gpu(cont_input_array) else: if input_array.dtype != self._dtype: input_device_array = input_array.astype(self._dtype) else: input_device_array = input_array if not input_device_array.flags.c_contiguous: input_device_array = input_device_array.reshape(in_shape, order='C') # Allocate device arrays row_approx_device_array = gpuarray.GPUArray( [num_slices, num_rows, out_cols[0]], dtype=self._dtype, allocator=gpu_allocator) row_detail_device_array = gpuarray.GPUArray( [num_slices, num_rows, out_cols[0]], dtype=self._dtype, allocator=gpu_allocator) output_device_arrays = [] for r, c in zip(out_rows, out_cols): output_device_arrays.append( dict(ll=gpuarray.GPUArray([num_slices, r, c], dtype=self._dtype, allocator=gpu_allocator), hl=gpuarray.GPUArray([num_slices, r, c], dtype=self._dtype, allocator=gpu_allocator), lh=gpuarray.GPUArray([num_slices, r, c], dtype=self._dtype, allocator=gpu_allocator), hh=gpuarray.GPUArray([num_slices, r, c], dtype=self._dtype, allocator=gpu_allocator))) # Transform row_block = (256, 1, 1) row_grid_x = int(out_cols[0] / row_block[0]) + (1 if out_cols[0] % row_block[0] else 0) row_grid_y = int( num_rows / row_block[1]) + (1 if num_rows % row_block[1] else 0) row_grid_z = int(num_slices / row_block[2]) + (1 if num_slices % row_block[2] else 0) row_grid = (row_grid_x, row_grid_y, row_grid_z) shared_mem_size = (row_block[2] * row_block[1] * (self._dec_length + 2 * (row_block[0] - 1)) * self._dtype().itemsize) self._dwt_row(input_device_array, row_approx_device_array, row_detail_device_array, numpy.int32(num_rows), numpy.int32(num_cols), numpy.int32(num_slices), block=row_block, grid=row_grid, shared=shared_mem_size) col_block = (256, 1, 1) col_grid_x = int(out_cols[0] / col_block[0]) + (1 if out_cols[0] % col_block[0] else 0) col_grid_y = int(out_rows[0] / col_block[1]) + (1 if out_rows[0] % col_block[1] else 0) col_grid_z = int(num_slices / col_block[2]) + (1 if num_slices % col_block[2] else 0) col_grid = (col_grid_x, col_grid_y, col_grid_z) self._dwt_col(row_approx_device_array, row_detail_device_array, output_device_arrays[0]['ll'], output_device_arrays[0]['lh'], output_device_arrays[0]['hl'], output_device_arrays[0]['hh'], numpy.int32(num_rows), numpy.int32(out_cols[0]), numpy.int32(num_slices), block=col_block, grid=col_grid, shared=0) for d in range(1, depth): row_grid_x = int(out_cols[d] / row_block[0]) + ( 1 if out_cols[d] % row_block[0] else 0) row_grid_y = int(out_rows[d - 1] / row_block[1]) + ( 1 if out_rows[d - 1] % row_block[1] else 0) row_grid = (row_grid_x, row_grid_y, row_grid_z) self._dwt_row(output_device_arrays[d - 1]['ll'], row_approx_device_array, row_detail_device_array, numpy.int32(out_rows[d - 1]), numpy.int32(out_cols[d - 1]), numpy.int32(num_slices), block=row_block, grid=row_grid, shared=shared_mem_size) col_grid_x = int(out_cols[d] / col_block[0]) + ( 1 if out_cols[d] % col_block[0] else 0) col_grid_y = int(out_rows[d] / col_block[1]) + ( 1 if out_rows[d] % col_block[1] else 0) col_grid = (col_grid_x, col_grid_y, col_grid_z) self._dwt_col(row_approx_device_array, row_detail_device_array, output_device_arrays[d]['ll'], output_device_arrays[d]['lh'], output_device_arrays[d]['hl'], output_device_arrays[d]['hh'], numpy.int32(out_rows[d - 1]), numpy.int32(out_cols[d]), numpy.int32(num_slices), block=col_block, grid=col_grid, shared=0) # Get results from device approx_array = output_device_arrays[-1][ 'll'] if gpu_output else output_device_arrays[-1]['ll'].get() if len(in_shape) > 2: new_shape = list(in_shape[:-2]) new_shape.append(out_rows[-1]) new_shape.append(out_cols[-1]) approx_array = approx_array.reshape(new_shape) results = [approx_array] for d, r, c in zip(output_device_arrays[::-1], out_rows[::-1], out_cols[::-1]): detail_lh_array = d['lh'] if gpu_output else d['lh'].get() detail_hl_array = d['hl'] if gpu_output else d['hl'].get() detail_hh_array = d['hh'] if gpu_output else d['hh'].get() if len(in_shape) > 2: new_shape = list(in_shape[:-2]) new_shape.append(r) new_shape.append(c) detail_lh_array = detail_lh_array.reshape(new_shape) detail_hl_array = detail_hl_array.reshape(new_shape) detail_hh_array = detail_hh_array.reshape(new_shape) results.append((detail_lh_array, detail_hl_array, detail_hh_array)) return results
def dwt1d(self, input_array, depth=1, gpu_output=False, gpu_allocator=cuda.mem_alloc): """ dwt1d Perform a 1D forward discrete wavelet transform. If an array has multiple rows, this function will perform a batched transform. Parameters ---------- input_array: Numpy array or GPUArray object Input array array of data to be transformed. If the array is 1D, this function will perform a 1D forward DWT on the data. If the array is multidimensional, this function will perform a batched 1D forward DWT for all rows of the array. The rows being the last dimension of the array. depth: int, optional Depth level of transform; must be greater than or equal to 0 (1 is the default) gpu_output: bool, optional If True, then return the coefficients as a list of GPUArray objects. (False is the default) gpu_allocator: callable, optional Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default) Returns ------- list of Numpy arrays or GPUArray objects A list of numpy arrays of the DWT coefficients either in float32 or float64, depending on whether use_float32 was set to True or False for PycudaWaveletTransform. The coefficients are organized as [c_lo, c_hi_depth, c_hi_depth-1, ... , c_hi_2, c_hi_1] """ if not isinstance(input_array, numpy.ndarray) and not isinstance( input_array, gpuarray.GPUArray): raise TypeError('invalid input array type') if depth < 0: raise RuntimeError('invalid depth value') elif depth == 0: return [input_array] in_shape = input_array.shape row_size = int(in_shape[-1]) num_rows = int(1) if len(in_shape) > 1: for i in in_shape[:-1]: num_rows *= i # If input is in Numpy array, then copy it to a GPUArray object. if isinstance(input_array, numpy.ndarray): cont_input_array = numpy.ascontiguousarray(input_array, dtype=self._dtype) input_device_array = gpuarray.to_gpu(cont_input_array) else: if input_array.dtype != self._dtype: input_device_array = input_array.astype(self._dtype) else: input_device_array = input_array if not input_device_array.flags.c_contiguous: input_device_array = input_device_array.reshape(in_shape, order='C') # Get size of output arrays out_sizes = [ int((row_size + 1) / 2 if row_size % 2 else (row_size / 2)) ] for i in range(depth - 1): ps = out_sizes[-1] ns = int((ps + 1) / 2 if ps % 2 else (ps / 2)) out_sizes.append(int(ns)) # Allocate device arrays detail_device_arrays = [] approx_device_arrays = [] for s in out_sizes: detail_device_arrays.append( gpuarray.GPUArray([num_rows, s], dtype=self._dtype, allocator=gpu_allocator)) approx_device_arrays.append( gpuarray.GPUArray([num_rows, s], dtype=self._dtype, allocator=gpu_allocator)) # Transform block = (256, 1, 1) grid_x = int( out_sizes[0] / block[0]) + (1 if out_sizes[0] % block[0] else 0) grid_y = int(num_rows / block[1]) + (1 if num_rows % block[1] else 0) grid = (grid_x, grid_y, 1) shared_mem_size = block[2] * block[1] * ( self._dec_length + 2 * (block[0] - 1)) * self._dtype().itemsize self._dwt_row(input_device_array, approx_device_arrays[0], detail_device_arrays[0], numpy.int32(num_rows), numpy.int32(row_size), numpy.int32(1), block=block, grid=grid, shared=shared_mem_size) for d in range(1, depth): grid_x = int(out_sizes[d] / block[0]) + (1 if out_sizes[d] % block[0] else 0) grid = (grid_x, grid_y, 1) self._dwt_row(approx_device_arrays[d - 1], approx_device_arrays[d], detail_device_arrays[d], numpy.int32(num_rows), numpy.int32(out_sizes[d - 1]), numpy.int32(1), block=block, grid=grid, shared=shared_mem_size) # Get results from device approx_array = approx_device_arrays[ -1] if gpu_output else approx_device_arrays[-1].get() if len(in_shape) > 1: new_shape = list(in_shape[:-1]) new_shape.append(out_sizes[-1]) approx_array = approx_array.reshape(new_shape) results = [approx_array] for i, (d, s) in enumerate( zip(detail_device_arrays[::-1], out_sizes[::-1])): detail_array = d if gpu_output else d.get() if len(in_shape) > 1: new_shape = list(in_shape[:-1]) new_shape.append(s) detail_array = detail_array.reshape(new_shape) results.append(detail_array) return results
def idwt2d(self, input_list, gpu_output=False, gpu_allocator=cuda.mem_alloc): """ idwt2d Perform a 2D inverse discrete wavelet transform. If the arrays have more than 2 dimensions, this function will perform a batched transform. Parameters ---------- input_list: list of Numpy arrays A list of numpy arrays of the DWT coefficients to be reconstructed. The coefficients are organized as [c_lo_lo, (c_lo_hi_depth, c_hi_lo_depth, c_hi_hi_depth), ... , (c_lo_hi_1, c_hi_lo_1, c_hi_hi_1)]. The detail array can have dimensions that are one less than that of the approximation array for each level. If the arrays have more than 2 dimensions, this function will perform a batched 2D inverse DWT for the last 2 dimensions of the arrays. All arrays must have the same dimensions except for the last 2 dimensions. gpu_output: bool, optional If True, then return the reconstructed signal as a GPUArray object. (False is the default) gpu_allocator: callable, optional Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default) Returns ------- Numpy array or GPUArray object A Numpy array of the reconstructed signal either in float32 or float64, depending on whether use_float32 was set to True or False for PycudaWaveletTransform. """ if not isinstance(input_list, list): raise TypeError('invalid input_list type') if not isinstance(input_list[0], numpy.ndarray) and not isinstance( input_list[0], gpuarray.GPUArray): raise TypeError('invalid input_list type') if len(input_list) > 1: approx_shape = list(input_list[0].shape) for dl in input_list[1:]: if not isinstance(dl, list) and not isinstance(dl, tuple): raise TypeError('invalid input_list type') elif len(dl) != 3: raise TypeError('invalid input_list type') if any([ not isinstance(x, numpy.ndarray) and not isinstance(x, gpuarray.GPUArray) for x in dl ]): raise TypeError('invalid input_list type') detail_shape = list(dl[0].shape) if list(dl[1].shape) != detail_shape or list( dl[2].shape) != detail_shape: raise RuntimeError('arrays have incompatible shapes') if detail_shape[-1] < approx_shape[-1] - 1 or approx_shape[ -1] < detail_shape[-1]: raise RuntimeError('arrays have incompatible shapes') if detail_shape[-2] < approx_shape[-2] - 1 or approx_shape[ -2] < detail_shape[-2]: raise RuntimeError('arrays have incompatible shapes') if len(detail_shape) > 2: if approx_shape[:-2] != detail_shape[:-2]: raise RuntimeError('arrays have incompatible shapes') approx_shape[-1] = 2 * detail_shape[-1] approx_shape[-2] = 2 * detail_shape[-2] depth = len(input_list) - 1 if depth == 0: return input_list[0] in_shape = input_list[0].shape num_cols = int(in_shape[-1]) num_rows = int(in_shape[-2]) num_slices = int(1) if len(in_shape) > 2: for i in in_shape[:-2]: num_slices *= i # Get size of input_list arrays in_rows = [] in_cols = [] out_rows = [] out_cols = [] for i in range(depth): c = input_list[i + 1][0].shape[-1] r = input_list[i + 1][0].shape[-2] in_rows.append(int(r)) in_cols.append(int(c)) out_rows.append(int(2 * r)) out_cols.append(int(2 * c)) # Allocate device arrays approx_device_array = gpuarray.GPUArray( [num_slices, out_rows[-1], out_cols[-1]], dtype=self._dtype, allocator=gpu_allocator) row_approx_device_array = gpuarray.GPUArray( [num_slices, out_rows[-1], in_cols[-1]], dtype=self._dtype, allocator=gpu_allocator) row_detail_device_array = gpuarray.GPUArray( [num_slices, out_rows[-1], in_cols[-1]], dtype=self._dtype, allocator=gpu_allocator) # If input is in Numpy arrays, then copy it to GPUArray objects. input_device_list = [] for input_index, input_array in enumerate(input_list): if input_index == 0: if isinstance(input_array, numpy.ndarray): cont_input_array = numpy.ascontiguousarray( input_array, dtype=self._dtype) input_device_array = gpuarray.to_gpu(cont_input_array) else: if input_array.dtype != self._dtype: input_device_array = input_array.astype(self._dtype) else: input_device_array = input_array if not input_device_array.flags.c_contiguous: input_device_array = input_device_array.reshape( input_array.shape, order='C') input_device_list.append(input_device_array) else: detail_list = [] for detail_array in input_array: if isinstance(detail_array, numpy.ndarray): cont_input_array = numpy.ascontiguousarray( detail_array, dtype=self._dtype) input_device_array = gpuarray.to_gpu(cont_input_array) else: if detail_array.dtype != self._dtype: input_device_array = detail_array.astype( self._dtype) else: input_device_array = detail_array if not input_device_array.flags.c_contiguous: input_device_array = input_device_array.reshape( detail_array.shape, order='C') detail_list.append(input_device_array) input_device_list.append(detail_list) # Transform col_block = (256, 1, 1) col_grid_x = int(in_cols[0] / col_block[0]) + (1 if in_cols[0] % col_block[0] else 0) col_grid_y = int(in_rows[0] / col_block[1]) + (1 if in_rows[0] % col_block[1] else 0) col_grid_z = int(num_slices / col_block[2]) + (1 if num_slices % col_block[2] else 0) col_grid = (col_grid_x, col_grid_y, col_grid_z) self._idwt_col(input_device_list[0], input_device_list[1][0], input_device_list[1][1], input_device_list[1][2], row_approx_device_array, row_detail_device_array, numpy.int32(num_rows), numpy.int32(num_cols), numpy.int32(in_rows[0]), numpy.int32(in_cols[0]), numpy.int32(num_slices), block=col_block, grid=col_grid, shared=0) row_block = (256, 1, 1) row_grid_x = int(in_cols[0] / row_block[0]) + (1 if in_cols[0] % row_block[0] else 0) row_grid_y = int(out_rows[0] / row_block[1]) + (1 if out_rows[0] % row_block[1] else 0) row_grid_z = int(num_slices / row_block[2]) + (1 if num_slices % row_block[2] else 0) row_grid = (row_grid_x, row_grid_y, row_grid_z) shared_mem_size = (row_block[2] * row_block[1] * (2 * (int(self._dec_length / 2) + row_block[0] - 1)) * self._dtype().itemsize) self._idwt_row(row_approx_device_array, row_detail_device_array, approx_device_array, numpy.int32(out_rows[0]), numpy.int32(in_cols[0]), numpy.int32(in_cols[0]), numpy.int32(num_slices), block=row_block, grid=row_grid, shared=shared_mem_size) for d in range(1, depth): col_grid_x = int(in_cols[d] / col_block[0]) + (1 if in_cols[d] % col_block[0] else 0) col_grid_y = int(in_rows[d] / col_block[1]) + (1 if in_rows[d] % col_block[1] else 0) col_grid = (col_grid_x, col_grid_y, col_grid_z) self._idwt_col(approx_device_array, input_device_list[d + 1][0], input_device_list[d + 1][1], input_device_list[d + 1][2], row_approx_device_array, row_detail_device_array, numpy.int32(out_rows[d - 1]), numpy.int32(out_cols[d - 1]), numpy.int32(in_rows[d]), numpy.int32(in_cols[d]), numpy.int32(num_slices), block=col_block, grid=col_grid, shared=0) row_grid_x = int(in_cols[d] / row_block[0]) + (1 if in_cols[d] % row_block[0] else 0) row_grid_y = int(out_rows[d] / row_block[1]) + ( 1 if out_rows[d] % row_block[1] else 0) row_grid = (row_grid_x, row_grid_y, row_grid_z) self._idwt_row(row_approx_device_array, row_detail_device_array, approx_device_array, numpy.int32(out_rows[d]), numpy.int32(in_cols[d]), numpy.int32(in_cols[d]), numpy.int32(num_slices), block=row_block, grid=row_grid, shared=shared_mem_size) # Get results from device approx_array = approx_device_array if gpu_output else approx_device_array.get( ) if len(in_shape) > 2: new_shape = list(in_shape[:-2]) new_shape.append(approx_array.shape[-2]) new_shape.append(approx_array.shape[-1]) approx_array = approx_array.reshape(new_shape) return approx_array
def _process_j(self): """ Use PyCuda """ nElements = self.width * self.height nBlocks = nElements / BLOCK_SIZE + 1 #print 'No. elements:', nElements #print 'No. blocks:', nBlocks grid_dimensions = (nBlocks, 1) block_dimensions = (BLOCK_SIZE, 1, 1) tilde_im_mapping = self.pycuda_y_tilde_pbo.map() tilde_fx_mapping = self.pycuda_y_fx_tilde_pbo.map() tilde_fy_mapping = self.pycuda_y_fy_tilde_pbo.map() tilde_m_mapping = self.pycuda_y_m_tilde_pbo.map() p_tilde_mapping = self.pycuda_yp_tilde_pbo.map() p_tilde_fx_mapping = self.pycuda_yp_fx_tilde_pbo.map() p_tilde_fy_mapping = self.pycuda_yp_fy_tilde_pbo.map() p_tilde_m_mapping = self.pycuda_yp_m_tilde_pbo.map() pp_tilde_mapping = self.pycuda_ypp_tilde_pbo.map() pp_tilde_fx_mapping = self.pycuda_ypp_fx_tilde_pbo.map() pp_tilde_fy_mapping = self.pycuda_ypp_fy_tilde_pbo.map() pp_tilde_m_mapping = self.pycuda_ypp_m_tilde_pbo.map() partialsum = np.zeros((nBlocks, 1), dtype=np.float32) partialsum_gpu = gpuarray.to_gpu(partialsum) partialsum_fx = np.zeros((nBlocks, 1), dtype=np.float32) partialsum_fx_gpu = gpuarray.to_gpu(partialsum_fx) partialsum_fy = np.zeros((nBlocks, 1), dtype=np.float32) partialsum_fy_gpu = gpuarray.to_gpu(partialsum_fy) partialsum_m = np.zeros((nBlocks, 1), dtype=np.float32) partialsum_m_gpu = gpuarray.to_gpu(partialsum_m) #CUDA definition #__global__ void j(unsigned char *y_im_t, float *y_fx_t, float *y_fy_t, unsigned char *y_m_t, # unsigned char *yp_im_t, float *yp_fx_t, float *yp_fy_t, unsigned char *yp_m_t, # unsigned char *ypp_im_t, float *ypp_fx_t, float *ypp_fy_t, unsigned char *ypp_m_t, # float *output, float *output_fx, float *output_fy, float *output_m, # int len) #Make the call... self.cuda_j.prepared_call(grid_dimensions, block_dimensions,\ tilde_im_mapping.device_ptr(),tilde_fx_mapping.device_ptr(),\ tilde_fy_mapping.device_ptr(),tilde_m_mapping.device_ptr(),\ p_tilde_mapping.device_ptr(),p_tilde_fx_mapping.device_ptr(),\ p_tilde_fy_mapping.device_ptr(),p_tilde_m_mapping.device_ptr(),\ pp_tilde_mapping.device_ptr(),pp_tilde_fx_mapping.device_ptr(),\ pp_tilde_fy_mapping.device_ptr(),pp_tilde_m_mapping.device_ptr(),\ partialsum_gpu.gpudata, partialsum_fx_gpu.gpudata,\ partialsum_fy_gpu.gpudata, partialsum_m_gpu.gpudata,\ np.uint32(nElements)) cuda_driver.Context.synchronize() tilde_im_mapping.unmap() tilde_fx_mapping.unmap() tilde_fy_mapping.unmap() tilde_m_mapping.unmap() p_tilde_mapping.unmap() p_tilde_fx_mapping.unmap() p_tilde_fy_mapping.unmap() p_tilde_m_mapping.unmap() pp_tilde_mapping.unmap() pp_tilde_fx_mapping.unmap() pp_tilde_fy_mapping.unmap() pp_tilde_m_mapping.unmap() #Read out the answer... partialsum = partialsum_gpu.get() partialsum_fx = partialsum_fx_gpu.get() partialsum_fy = partialsum_fy_gpu.get() partialsum_m = partialsum_m_gpu.get() sum_gpu = np.sum(partialsum[0:np.ceil(nBlocks / 2.)]) sum_fx_gpu = np.sum(partialsum_fx[0:np.ceil(nBlocks / 2.)]) sum_fy_gpu = np.sum(partialsum_fy[0:np.ceil(nBlocks / 2.)]) sum_m_gpu = np.sum(partialsum_m[0:np.ceil(nBlocks / 2.)]) scale = 1 #print 'j (GPU) components' #print sum_gpu, sum_fx_gpu/scale/scale, sum_fy_gpu/scale/scale return sum_gpu + sum_fx_gpu / scale / scale + sum_fy_gpu / scale / scale + sum_m_gpu
def idwt1d(self, input_list, gpu_output=False, gpu_allocator=cuda.mem_alloc): """ idwt1d Perform a 1D inverse discrete wavelet transform. If an array has multiple rows, this function will perform a batched transform. Parameters ---------- input_list: list of Numpy arrays or GPUArray objects A list of numpy arrays of the DWT coefficients to be reconstructed. The coefficients are organized as [c_lo, c_hi_depth, c_hi_depth-1, ... , c_hi_2, c_hi_1]. The detail row size can be one less than the approximation row size for each level. If the arrays are multidimensional, this function will perform a batched 1D inverse DWT for all rows of the arrays. All arrays must have the same dimensions except for the last dimension. The rows being the last dimension of the array. gpu_output: bool, optional If True, then return the reconstructed signal as a GPUArray object. (False is the default) gpu_allocator: callable, optional Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default) Returns ------- Numpy array or GPUArray object An array of the reconstructed signal either in float32 or float64, depending on whether use_float32 was set to True or False for PycudaWaveletTransform. """ if not isinstance(input_list, list): raise TypeError('invalid input_list type') if any([ not isinstance(x, numpy.ndarray) and not isinstance(x, gpuarray.GPUArray) for x in input_list ]): raise TypeError('invalid input_list type') if len(input_list) > 1: approx_shape = list(input_list[0].shape) for d in input_list[1:]: detail_shape = list(d.shape) if detail_shape[-1] < approx_shape[-1] - 1 or approx_shape[ -1] < detail_shape[-1]: raise RuntimeError('arrays have incompatible shapes') if len(detail_shape) > 1: if approx_shape[:-1] != detail_shape[:-1]: raise RuntimeError('arrays have incompatible shapes') approx_shape[-1] = 2 * detail_shape[-1] depth = len(input_list) - 1 if depth == 0: return input_list[0] in_shape = input_list[0].shape num_rows = int(1) if len(in_shape) > 1: for i in in_shape[:-1]: num_rows *= i # If input is in Numpy arrays, then copy it to GPUArray objects. input_device_list = [] for input_array in input_list: if isinstance(input_array, numpy.ndarray): cont_input_array = numpy.ascontiguousarray(input_array, dtype=self._dtype) input_device_array = gpuarray.to_gpu(cont_input_array) else: if input_array.dtype != self._dtype: input_device_array = input_array.astype(self._dtype) else: input_device_array = input_array if not input_device_array.flags.c_contiguous: input_device_array = input_device_array.reshape( input_array.shape, order='C') input_device_list.append(input_device_array) # Get size of input_list arrays in_sizes = [] out_sizes = [] for i in range(depth): s = input_list[i + 1].shape[-1] in_sizes.append(int(s)) out_sizes.append(int(2 * s)) # Allocate device arrays approx_device_arrays = [] for s in out_sizes: approx_device_arrays.append( gpuarray.GPUArray([num_rows, s], dtype=self._dtype, allocator=gpu_allocator)) # Transform block = (256, 1, 1) grid_x = int( in_sizes[0] / block[0]) + (1 if in_sizes[0] % block[0] else 0) grid_y = int(num_rows / block[1]) + (1 if num_rows % block[1] else 0) grid = (grid_x, grid_y, 1) shared_mem_size = block[2] * block[1] * (2 * ( int(self._dec_length / 2) + block[0] - 1)) * self._dtype().itemsize self._idwt_row(input_device_list[0], input_device_list[1], approx_device_arrays[0], numpy.int32(num_rows), numpy.int32(in_shape[-1]), numpy.int32(in_sizes[0]), numpy.int32(1), block=block, grid=grid, shared=shared_mem_size) for d in range(1, depth): grid_x = int( in_sizes[d] / block[0]) + (1 if in_sizes[d] % block[0] else 0) grid = (grid_x, grid_y, 1) self._idwt_row(approx_device_arrays[d - 1], input_device_list[d + 1], approx_device_arrays[d], numpy.int32(num_rows), numpy.int32(out_sizes[d - 1]), numpy.int32(in_sizes[d]), numpy.int32(1), block=block, grid=grid, shared=shared_mem_size) # Get results from device result = approx_device_arrays[ -1] if gpu_output else approx_device_arrays[-1].get() if len(in_shape) > 1: new_shape = list(in_shape[:-1]) new_shape.append(result.shape[-1]) result = result.reshape(new_shape) return result
def initialize_delays(self, spikedelays): self.spikedelay_arr = gpuarray.to_gpu( array(rint(spikedelays / self.dt), dtype=int32))
""" Demonstrates how to extract the lower triangle of a matrix. """ import pycuda.autoinit import pycuda.driver as drv import numpy as np import pycuda.gpuarray as gpuarray import scikits.cuda.linalg as culinalg import scikits.cuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string demo_types = [np.float32, np.complex64] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing lower triangle extraction for type ' + str(np.dtype(t)) N = 10 if np.iscomplexobj(t()): a = np.asarray(np.random.rand(N, N), t) else: a = np.asarray(np.random.rand(N, N)+1j*np.random.rand(N, N), t) a_gpu = gpuarray.to_gpu(a) b_gpu = culinalg.tril(a_gpu, False) print 'Success status: ', np.allclose(b_gpu.get(), np.tril(a))
def initialize_spikes(self, spiketimes, spiketimes_indices): self.spiketimes = gpuarray.to_gpu( array(rint(spiketimes / self.dt), dtype=int32)) self.spiketime_indices = gpuarray.to_gpu( array(spiketimes_indices, dtype=int32))
def trace_gpu(self, ray_array): """ Core method to do ray tracing change CPU recursive version -> GPU iterative version for every depth, call gpu functions Then do stack process to get final result """ stack = [] ray_array_cpu = np.array(ray_array, np.float32) scene_cpu = np.array(self.__scene_gpu, np.float32) print(ray_array_cpu.shape) print(scene_cpu.shape) scene_size = np.int32(scene_cpu.shape[0]) # loop for every depth for i in range(self.__max_recursion_depth): num_ray = np.int32(ray_array_cpu.shape[0]) num_thread = 256 num_block = int(np.ceil(float(num_ray) / float(num_thread))) ray_array_gpu = gpuarray.to_gpu(ray_array_cpu) scene_gpu = gpuarray.to_gpu(scene_cpu) output_intersect_gpu = gpuarray.empty((num_ray, 6), np.float32) output_obj_index_gpu = gpuarray.empty((num_ray, 1), np.int32) output_color = gpuarray.empty((num_ray, 3), np.float32) # call parallel intersect function self.kernel_fun_intersect(ray_array_gpu, scene_gpu, num_ray, scene_size, output_intersect_gpu, output_obj_index_gpu, block=(num_thread, 1, 1), grid=(num_block, 1, 1)) intersect_cpu = output_intersect_gpu.get() obj_index_cpu = output_obj_index_gpu.get() intersect_gpu = gpuarray.to_gpu(intersect_cpu) obj_index_gpu = gpuarray.to_gpu(obj_index_cpu) # the final depth will treat every objects as diffuse one # flag here is used to control this if i == self.__max_recursion_depth - 1: flag = np.int32(1) else: flag = np.int32(0) # trace diffuse self.kernel_fun_trace_diffuse(intersect_gpu, obj_index_gpu, num_ray, scene_gpu, scene_size, flag, output_color, block=(num_thread, 1, 1), grid=(num_block, 1, 1)) color_cpu = output_color.get() if flag == 1: stack.append([color_cpu]) break intersect_gpu = gpuarray.to_gpu(intersect_cpu) obj_index_gpu = gpuarray.to_gpu(obj_index_cpu) output_reflect_ray_gpu = gpuarray.empty((num_ray, 7), np.float32) output_refract_ray_gpu = gpuarray.empty((num_ray, 7), np.float32) output_fresnel_gpu = gpuarray.empty((num_ray, 1), np.float32) # trace non diffuse self.kernel_fun_trace_non_diffuse(ray_array_gpu, intersect_gpu, obj_index_gpu, num_ray, scene_gpu, scene_size, flag, output_reflect_ray_gpu, output_refract_ray_gpu, output_fresnel_gpu, block=(num_thread, 1, 1), grid=(num_block, 1, 1)) reflect_ray_cpu = output_reflect_ray_gpu.get() refract_ray_cpu = output_refract_ray_gpu.get() fresnel_cpu = output_fresnel_gpu.get() # prepare for next depth interation ray_array_cpu_next, to_stack = self.ray_batch_filter( reflect_ray_cpu, refract_ray_cpu, fresnel_cpu, obj_index_cpu) to_stack.append(color_cpu) stack.append(to_stack) ray_array_cpu = np.array(ray_array_cpu_next, np.float32) color = self.process_stack(stack, scene_gpu) return color
def to_gpu(array): return gpuarray.to_gpu(array)
def initialize_traces(self, traces, traces_offset): self.traces = gpuarray.to_gpu(array(traces, dtype=double)) self.traces_offset = gpuarray.to_gpu(array(traces_offset, dtype=int32))
import skcuda.fft as cu_fft print 'Testing fft/ifft..' N = 256 batch_size = 16 x = np.empty((batch_size, N, N), np.float32) xf = np.empty((batch_size, N, N), np.complex64) y = np.empty((batch_size, N, N), np.float32) for i in xrange(batch_size): x[i, :, :] = np.asarray(np.random.rand(N, N), np.float32) xf[i, :, :] = np.fft.fft2(x[i, :, :]) y[i, :, :] = np.real(np.fft.ifft2(xf[i, :, :])) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((batch_size, N, N / 2 + 1), np.complex64) plan_forward = cu_fft.Plan((N, N), np.float32, np.complex64, batch_size) cu_fft.fft(x_gpu, xf_gpu, plan_forward) y_gpu = gpuarray.empty_like(x_gpu) plan_inverse = cu_fft.Plan((N, N), np.complex64, np.float32, batch_size) cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True) print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6) print 'Testing in-place fft..' x = np.empty((batch_size, N, N), np.complex64) x_gpu = gpuarray.to_gpu(x) plan = cu_fft.Plan((N, N), np.complex64, np.complex64, batch_size)
import numpy as np import pycuda.autoinit from pycuda import gpuarray host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32) device_data = gpuarray.to_gpu(host_data) device_data_x2 = 2 * device_data host_data_x2 = device_data_x2.get() print(host_data_x2)
def cuda_gridvis(sub_array, f, settings, plan, chan): """ Grid the visibilities parallelized by pixel. References: - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy" by Thompson, Moran, & Swenson - Daniel Brigg's PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/ """ print "Gridding the visibilities" t_start = time.time() if sub_array==1: Antennas = 40 else: Antennas = 60 # unpack parameters vfile = settings['vfile'] briggs = settings['briggs'] imsize = settings['imsize'] cell = settings['cell'] nx = np.int32(2 * imsize) noff = np.int32((nx - imsize) / 2) ## constants arc2rad = np.float32(np.pi / 180. / 3600.) du = np.float32(1. / (arc2rad * cell * nx)) # determin the file type (uvfits or fitsidi) h_u = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64') h_v = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64') h_re = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32') h_im = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32') #Get Visibility Data and values of UVW if settings['vfile'].find('.uvfits') != -1: freq = 3.45E11 #np.float32(f[0].header['CRVAL4']) light_speed = 299792458. good = np.where(f[0].data.data[:, 0, 0, chan, 0, 0] != 0) h_u = np.float32(light_speed * f[0].data.par('uu')[good]) print "h_u", h_u.shape h_v = np.float32(light_speed * f[0].data.par('vv')[good]) gcount = np.int32(np.size(h_u)) ## assume data is unpolarized h_re = np.float32(f[0].data.data[good, 0, 0, chan, 0, 0]) h_im = np.float32(f[0].data.data[good, 0, 0, chan, 0, 1]) freq = 1702500000. light_speed = 299792458. # Speed of light ## assume data is unpolarized #print chan print 'GCOUNT', gcount # h_ : host, d_ : device h_grd = np.zeros((nx, nx), dtype=np.complex64) h_cnt = np.zeros((nx, nx), dtype=np.int32) d_u = gpu.to_gpu(np.array(h_u,dtype='float32')) d_v = gpu.to_gpu(np.array(h_v,dtype='float32')) d_re = gpu.to_gpu(np.array(h_re,dtype='float32')) d_im = gpu.to_gpu(np.array(h_im,dtype='float32')) d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32) d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64) d_ngrd = gpu.zeros_like(d_grd) d_bm = gpu.zeros_like(d_grd) d_nbm = gpu.zeros_like(d_grd) d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32) ## define kernel parameters if imsize == 1024: blocksize2D = (8, 16, 1) gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1]))) blocksizeF2D = (16, 16, 1) gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1]))) blocksize1D = (256, 1, 1) else: blocksize2D = (16, 32, 1) gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1]))) blocksizeF2D = (32, 32, 1) gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1]))) blocksize1D = (512, 1, 1) gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1) # ------------------------ # make gridding kernels # ------------------------ ## make spheroidal convolution kernel (don't mess with these!) width = 6. ngcf = 24. h_cgf = gcf(ngcf, width) ## make grid correction h_corr = corrfun(nx, width) d_cgf = module.get_global('cgf')[0] d_corr = gpu.to_gpu(h_corr) cu.memcpy_htod(d_cgf, h_cgf) # ------------------------ # grid it up # ------------------------ d_umax = gpu.max(cumath.fabs(d_u)) d_vmax = gpu.max(cumath.fabs(d_v)) umax = np.int32(np.ceil(d_umax.get() / du)) vmax = np.int32(np.ceil(d_vmax.get() / du)) ## grid ($$) # This should be improvable via: # - shared memory solution? I tried... # - better coalesced memory access? I tried... # - reorganzing and indexing UV data beforehand? # (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf) # - storing V(u,v) in texture memory? # Each pixel in the uv plane goes through the data and check to see whether the pixel is included in the convolution. # This kernel also calculates the point spread function and the local sampling # from the data (for applying the weights later). gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \ block=blocksize2D, grid=gridsize2D) ## apply weights wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) hfac = np.int32(1) dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D) shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## normalize wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) ## Reflect grid about v axis hfac = np.int32(-1) dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D) ## Shift both shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) # ------------------------ # Make the beam # ------------------------ ## Transform to image plane fft.fft(d_nbm, d_bm, plan) ## Shift shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D) # Trim trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize d_bmax = gpu.max(d_fim) bmax = d_bmax.get() bmax = np.float32(1. / bmax) nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Pull onto CPU dpsf = d_fim.get() # ------------------------ # Make the map # ------------------------ ## Transform to image plane fft.fft(d_ngrd, d_grd, plan) ## Shift shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D) ## Trim trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize (Jy/beam) nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Finish timers t_end = time.time() t_full = t_end - t_start print "Gridding execution time %0.5f" % t_full + ' s' print "\t%0.5f" % (t_full / gcount) + ' s per visibility' ## Return dirty psf (CPU) and dirty image (GPU) return dpsf, d_fim
def bench(self, data, plot=True, get=False): """ Function to test drift rate search note this bench mark contains compilation times and multiple kernel calls. larger input would be faster. """ start = cu.Event() copy_htod = cu.Event() compute = cu.Event() stop = cu.Event() start.record() self.spectr_d = gpuarray.to_gpu(data) copy_htod.record() copy_htod.synchronize() self.sweep_kernel(self.spectr_d, self.output_d, self.delay_table_d, self.nfreqs, self.ntimes, self.ndelays, block=self.block_size, grid=self.grid_size) compute.record() compute.synchronize() if get: out = self.output_d.get() else: operand = self.output_d[self.ndelays // 2] mean = gpuarray.sum(operand / np.float32(self.nfreqs)).get() var = gpuarray.sum( (operand - mean) * (operand - mean) / np.float32(self.nfreqs)) std = np.sqrt(var.get()) self.threshold_kernel(self.output_d, self.mask_d, np.float32(3 * std + mean), self.nfreqs, self.ndelays, block=self.block_size, grid=self.grid_size) out = (self.output_d * self.mask_d).get() stop.record() stop.synchronize() print "{} seconds to load data".format( start.time_till(copy_htod) * 1.e-3) print "{} seconds to compute {} channels, with {} delays".format( copy_htod.time_till(compute) * 1.e-3, self.nfreqs, self.ndelays) print copy_htod.time_till(compute) / ( self.nfreqs * self.ndelays) * 1.e6, "nanoseconds per channel, per delay" if plot: import pylab as plt f, axes = plt.subplots(2, 1) axes[0].imshow( data, aspect='auto', extent=[0, self.nfreqs, 0, self.ntimes * self.delta_t], origin='lower') axes[0].set_xlabel('Freq [Hz]') axes[0].set_ylabel('Time [s]') axes[1].imshow(out, aspect='auto', extent=[0, self.nfreqs, -0.1, 0.1], origin='lower') axes[1].set_xlabel('Freq [Hz]') axes[1].set_ylabel('Drift [Hz/s]') plt.tight_layout() plt.show() return out
def optimize_layer(orig_nodes): bvh_module = get_cu_module('bvh.cu', options=cuda_options, include_source_directory=True) bvh_funcs = GPUFuncs(bvh_module) nodes = ga.to_gpu(orig_nodes) n = len(nodes) areas = ga.empty(shape=n / 2, dtype=np.uint64) nthreads_per_block = 128 min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))), dtype=np.uint64) min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32) update = 10000 skip_size = 1 flag = mapped_empty(shape=skip_size, dtype=np.uint32) i = 0 skips = 0 swaps = 0 while i < n / 2 - 1: # How are we doing? if i % update == 0: for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): bvh_funcs.pair_area(np.uint32(first_index), np.uint32(elements_this_iter), nodes, areas, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) areas_host = areas.get() #print nodes.get(), areas_host.astype(float) print 'Area of parent layer so far (%d): %1.12e' % ( i * 2, areas_host.astype(float).sum()) print 'Skips: %d, Swaps: %d' % (skips, swaps) test_index = i * 2 blocks = 0 look_forward = min(8192 * 50, n - test_index - 2) skip_this_round = min(skip_size, n - test_index - 1) flag[:] = 0 for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000): bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2), np.uint32(elements_this_iter), np.uint32(test_index), nodes, np.uint32(blocks), min_areas, min_index, Mapped(flag), block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, skip_this_round)) blocks += nblocks_this_iter #print i, first_index, nblocks_this_iter, look_forward cuda.Context.get_current().synchronize() if flag[0] == 0: flag_nonzero = flag.nonzero()[0] if len(flag_nonzero) == 0: no_swap_required = skip_size else: no_swap_required = flag_nonzero[0] i += no_swap_required skips += no_swap_required continue min_areas_host = min_areas[:blocks].get() min_index_host = min_index[:blocks].get() best_block = min_areas_host.argmin() better_i = min_index_host[best_block] swaps += 1 #print 'swap', test_index+1, better_i assert 0 < better_i < len(nodes) assert 0 < test_index + 1 < len(nodes) bvh_funcs.swap(np.uint32(test_index + 1), np.uint32(better_i), nodes, block=(1, 1, 1), grid=(1, 1)) cuda.Context.get_current().synchronize() i += 1 for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(n/2, nthreads_per_block, max_blocks=10000): bvh_funcs.pair_area(np.uint32(first_index), np.uint32(elements_this_iter), nodes, areas, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) areas_host = areas.get() print 'Final area of parent layer: %1.12e' % areas_host.sum() print 'Skips: %d, Swaps: %d' % (skips, swaps) return nodes.get()
def run(xHfile=None, IO_DIR=None): if IO_DIR is None: IO_DIR = parent_folder if not os.path.exists(IO_DIR + "/Outfiles"): os.makedirs(IO_DIR + "/Outfiles") if xHfile is None: xHfile = find_files(IO_DIR + "/Boxes/", pattern="xH*{0:06.2f}_{1:i}_{2:.0f}*".format( Z, HII_DIM, BOX_LEN))[0] if xHfile.endswith('.npy'): xH = np.load(xHfile) p_dict = boxio.parse_filename(os.path.splitext(xHfile)[0]) else: b = boxio.readbox(xHfile) xH = b.box_data p_dict = b.param_dict Z = p_dict['z'] #growth_factor = pb.fgrowth(Z, COSMO['omega_M_0'], unnormed=True) #overwrite global variables HII_DIM = p_dict['dim'] BOX_LEN = np.float32(p_dict['BoxSize']) DELTA_K = np.float32(2 * np.pi / BOX_LEN) VOLUME = (BOX_LEN * BOX_LEN * BOX_LEN) HII_TOT_NUM_PIXELS = HII_DIM**3 try: deltax = np.load( IO_DIR + "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc.npy". format(Z, HII_DIM, BOX_LEN)) except: #deltax = boxio.readbox(IO_DIR+"/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(Z, HII_DIM, BOX_LEN)).box_data deltax = boxio.readbox( IO_DIR + "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc".format( Z, HII_DIM * 2, BOX_LEN)).box_data[:HII_DIM, :HII_DIM, :HII_DIM] kernel_source = open(cmd_folder + "/delta_T.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'VOLUME': VOLUME, 'NUM_BINS': NUM_BINS } main_module = nvcc.SourceModule(kernel_code) pbox_kernel = main_module.get_function("pbox_kernel") #pixel_deltax_d = gpuarray.to_gpu(deltax) #pixel_xH_d = gpuarray.to_gpu(xH) _const_factor = np.float32( 27 * (COSMO['omega_b_0'] * COSMO['h'] * COSMO['h'] / 0.023) * np.sqrt( (0.15 / COSMO['omega_M_0'] / COSMO['h'] / COSMO['h']) * (1 + Z) / 10.0)) delta_T = np.float32(_const_factor * xH * (1.0 + deltax)) #in mK ave = np.mean(delta_T) np.save( IO_DIR + "/Boxes/delta_T_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc.npy" .format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN, ave, HII_DIM, int(BOX_LEN)), delta_T) deldel_T = (delta_T / ave - 1) * VOLUME / HII_TOT_NUM_PIXELS if DIMENSIONAL_T_POWER_SPEC: deldel_T *= ave plan = Plan(HII_shape, dtype=np.complex64) deldel_T_d = gpuarray.to_gpu(deldel_T.astype(np.complex64)) plan.execute(deldel_T_d) K = np.float32( np.logspace(np.log10(DELTA_K), np.log10(DELTA_K * np.sqrt(3.) * DIM), NUM_BINS)) K_d = gpuarray.to_gpu(K) k_ave_d = gpuarray.zeros_like(K_d) in_bin_ct_d = gpuarray.zeros_like(K_d) ps_d = gpuarray.zeros_like(K_d) pbox_kernel(deldel_T_d, DIM, ps_d, k_ave_d, in_bin_ct_d, K_d, block=block_size, grid=HII_grid_size) ps = ps_d.get() in_bin_ct = in_bin_ct_d.get() k_ave = k_ave_d.get() k_ave = np.where(in_bin_ct > 0, k_ave / in_bin_ct, 0.) ps_ave = np.where(in_bin_ct > 0, ps / in_bin_ct, 0.) #ps_fname = "/ps_nov_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc".format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN, ave, HII_DIM, np.int32(BOX_LEN)) #np.savez(IO_DIR+ps_fname, k_ave=k_ave, ps_ave=ps_ave) return K, k_ave, ps_ave
__syncthreads(); } """ # Initialize kernel smoothing_kernel = nvcc.SourceModule(kernel_smooth_source).get_function("smoothingFilter") normalize_kernel = nvcc.SourceModule(kernel_norm_source).get_function("normalizeFilter") out_kernel = nvcc.SourceModule(kernel_out_source).get_function("outFilter") # Allocate memory and constants smem_size = int(TPBx*TPBy*4) Copy arrays to device once IMG_device = gpuarray.to_gpu(IMG) BOX_device = gpuarray.to_gpu(BOX) NORM_device = gpuarray.to_gpu(NORM) OUT_device = gpuarray.to_gpu(OUT) setup_stop_time = time.time() smth_kernel_start_time = cu.Event() smth_kernel_stop_time = cu.Event() norm_kernel_start_time = cu.Event() norm_kernel_stop_time = cu.Event() out_kernel_start_time = cu.Event() out_kernel_stop_time = cu.Event() ########## # The kernel will convolve the image with a gaussian weighted sum # determine the BOX size that allows the sum to reach either the maxRad or
imsize = settings['imsize'] # nx - 2 imsize, it means 2048 when imsize=1024 nx = np.int32(2 * imsize) # create fft plan nx*nx plan = fft.Plan((np.int(nx), np.int(nx)), np.complex64, np.complex64) f = pyfits.open(settings['vfile']) channel = f[0].data.data.shape[3] # Read channel from uvfits file for chan in range(0, channel): #dpsf, gpu_im = cuda_gridvis(f, settings, plan, chan) dpsf, gpu_im = cuda_gridvis(sub_array, f, settings, plan, chan) gpu_dpsf = gpu.to_gpu(dpsf) print "**********************************************" if PLOTME: dirty = np.roll(np.fliplr(gpu_im.get()), 1, axis=1) ## Clean the PSF if imsize >= 1024: cpsf = serial_clean_beam(dpsf, imsize / 50.) elif imsize >= 512: cpsf = serial_clean_beam(dpsf, imsize / 25.) elif imsize >= 256: cpsf = serial_clean_beam(dpsf, imsize / 12.) gpu_cpsf = gpu.to_gpu(cpsf)
def ifft(self, ff_fft): xf_gpu = gpuarray.to_gpu(ff_fft) x_gpu = gpuarray.empty(self.shapeX, np.float32) skfft.ifft(xf_gpu, x_gpu, self.ifftplan, False) return x_gpu.get()
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N); } ''') matrix_ker = ker.get_function('matrix_mult_ker') test_a = np.float32([xrange(1, 5)] * 4) test_b = np.float32([xrange(14, 10, -1)] * 4) output_mat = np.matmul(test_a, test_b) test_a_gpu = gpuarray.to_gpu(test_a) test_b_gpu = gpuarray.to_gpu(test_b) output_mat_gpu = gpuarray.empty_like(test_a_gpu) matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2, 2, 1), grid=(2, 2, 1)) assert (np.allclose(output_mat_gpu.get(), output_mat))
def fft(self, ff): x_gpu = gpuarray.to_gpu(ff) xf_gpu = gpuarray.empty(self.shapeK, np.complex64) skfft.fft(x_gpu, xf_gpu, self.fftplan, False) return xf_gpu.get()
count_facs = SourceModule(""" __global__ void count_facs(int x, unsigned int *y, int *z) { if (x % y[threadIdx.x]) { z[0] += 1; } } """).get_function("count_facs") ''' limit = 5000 # limit = int(input('Limit: ')) start_time = time() with open('primes1.txt') as f: primes = np.fromiter(map(int, f.read().strip().split(',')), dtype=np.uint32) p_gpu = gpuarray.to_gpu(primes) antiprimes = [] most = 0 for x in gpuarray.arange(2, limit + 1, dtype=np.uint32): x_local = x.get() fac = [[], []] x_temp = x.copy() for prime in p_gpu: more = cumath.fmod(x_temp, prime).get() == np.uintc(0) more_app = more if more: power = 0 while more: power += 1 x_temp = x_temp / prime