def allocation(self):
    super(DGModalGpu, self).allocation()

    self.ul_gpu = cuda.to_device(self.ul)
    self.ul_prev_gpu = cuda.to_device(self.ul)
    self.ul_tmp_gpu = cuda.to_device(self.ul)
    self.kl_gpu = cuda.to_device(self.ul)
    self.el_sum_gpu = cuda.to_device(np.zeros(self.ne))
def test_compare_order():
    '''
    compare_order between C (row-major) and F (column-major)
    '''
    compare_order = mod_cu.get_function('compare_order')

    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx, ny), order='C')
    f_2d_F = f_1d.reshape((nx, ny), order='F')

    print ''
    print 'f_1d_C\n\n', f_1d
    print 'f_2d_C\n', f_2d_C
    print 'f_2d_F\n', f_2d_F

    print ''
    print 'after cuda'
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny, 1, 1), grid=(1, 1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_C\n', ret_f_1d

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny, 1, 1), grid=(1, 1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_F\n', ret_f_1d
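For reference, a minimal sketch of the 'compare_order' kernel this test assumes: a flat element-by-element copy, so the printed 1-D result exposes the raw memory layout of the C- and F-ordered inputs. The actual kernel lives in mod_cu, whose source is not shown in this excerpt; the body below is an assumption.

mod_cu = SourceModule("""
__global__ void compare_order(double *f_2d, double *f_1d)
{
    int idx = threadIdx.x;
    f_1d[idx] = f_2d[idx];   // flat copy reveals row- vs column-major layout
}
""")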
def train_gpu(self, num_iter, model_file_path):
    if self.batch == 0:
        # Prepare to send the numpy array to gpu
        self.syn1_gpu = cuda.to_device(self.syn1)
        # Create word idx and related data-structure.
        self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
        word_rep_ptr = int(self.base_word_rep)
        self.word_reps = {}
        for w_idx, word in sorted(self.dictionary.items()):
            word_code = 1 - 2*self.words_rep[word][0].astype(dtype=np.int32)
            word_point = self.words_rep[word][1].astype(dtype=np.int32)
            self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
            word_rep_ptr += WordRep.memsize
        print "GPU transfers done."

    self.sent_reps_gpu = cuda.to_device(self.sent_reps)
    # Prepare sentences for GPU transfer.
    idx_sentences = [[self.dictionary.token2id[word] for word in sentence
                      if word in self.dictionary]
                     for sentence in self.sentences]

    # Prepare the kernel function
    kernel = self.kernel_str.get_function("train_sg")
    words = np.empty(self.num_sents, dtype=np.int32)
    # sent_reps = np.copy(self.sent_reps)
    for iter in range(num_iter):
        # Sample words for each sentence and transfer to GPU
        for s_idx in range(self.num_sents):
            words[s_idx] = random.choice(idx_sentences[s_idx])
        words_gpu = cuda.to_device(words)

        kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu,
               self.base_word_rep, self.syn1_gpu,
               block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
        # autoinit.context.synchronize()

    self.sent_reps = cuda.from_device(self.sent_reps_gpu,
                                      self.sent_reps.shape,
                                      self.sent_reps.dtype)
    pickle_dump(self.sent_reps, model_file_path)
def test_simple_kernel_2(self):
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = np.random.randn(400).astype(np.float32)
    b = np.random.randn(400).astype(np.float32)
    a_gpu = drv.to_device(a)
    b_gpu = drv.to_device(b)

    dest = np.zeros_like(a)
    multiply_them(drv.Out(dest), a_gpu, b_gpu, block=(400, 1, 1))
    assert la.norm(dest - a*b) == 0

    drv.Context.synchronize()
    # now try with offsets
    dest = np.zeros_like(a)
    multiply_them(drv.Out(dest), np.intp(a_gpu) + a.itemsize, b_gpu,
                  block=(399, 1, 1))
    assert la.norm(dest[:-1] - a[1:]*b[:-1]) == 0
def multiply_csr(matrix, vector, block_size, repeat=1):
    '''
    Multiplies a matrix by a vector using the CUDA module for CSR.
    The calculation is executed on an nVidia GPU.

    Parameters
    ==========
    matrix : Scipy matrix or numpy array
        Matrix to multiply.
    vector : numpy array
        Vector to multiply. Its length must equal the number of columns
        of the matrix.
    block_size : int (recommended 128 or 256)
        Size of the CUDA block.
    repeat : int > 0
        Number of repetitions of the multiplication. It has no effect on
        the result. Specifies the length of the returned list of
        execution times.

    Returns
    =======
    Tuple of the multiplication result and a list of execution times.
    '''
    if len(vector) != matrix.shape[1]:
        raise ArithmeticError('Length of the vector is not equal to the '
                              'number of columns of the matrix.')
    matrix = mf.convert_to_scipy_csr(matrix)
    data = numpy.array(matrix.data, dtype=numpy.float32)
    indices = numpy.array(matrix.indices, dtype=numpy.int32)
    indptr = numpy.array(matrix.indptr, dtype=numpy.int32)
    data = cuda.to_device(data)
    indices = cuda.to_device(indices)
    indptr = cuda.to_device(indptr)
    num_rows = matrix.shape[0]
    result = numpy.zeros(num_rows, dtype=numpy.float32)
    time_list = []

    grid_size = int(numpy.ceil((num_rows + 0.0) / block_size))
    block = (block_size, 1, 1)
    grid = (grid_size, 1)
    g_vector = cuda.to_device(vector)
    num_rows = numpy.int32(num_rows)
    kernel, texref = cudacodes.get_cuda_csr(block_size=block_size)
    texref.set_address(g_vector, vector.nbytes)
    tex = [texref]

    for _ in range(repeat):
        start.record()
        kernel(data,
               indices,
               indptr,
               cuda.Out(result),
               num_rows,
               block=block,
               grid=grid,
               texrefs=tex)
        end.record()
        end.synchronize()
        time_list.append(start.time_till(end))

    return (result, time_list)
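A minimal usage sketch for the routine above (hypothetical shapes; it assumes the module-level `start`/`end` CUDA events and the `mf`/`cudacodes` helpers it references are already initialized):

import scipy.sparse
import numpy

m = scipy.sparse.rand(1024, 1024, density=0.01, format='csr', dtype=numpy.float32)
v = numpy.ones(1024, dtype=numpy.float32)
result, times = multiply_csr(m, v, block_size=128, repeat=10)
print 'mean kernel time: %f ms' % (sum(times) / len(times))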
def get_phir_gpu (XK, XV, surface, field, par_reac, kernel):

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    MV = numpy.zeros(len(XK))
    L = numpy.sqrt(2*surface.Area)  # Representative length
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        X_V[i] = XV[i/K]*w[i%K]*surface.Area[i/K]
        X_Kx[i] = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,0]
        X_Ky[i] = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,1]
        X_Kz[i] = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,2]
        X_Kc[i] = XK[i/K]
        X_Vc[i] = XV[i/K]

    toc = time.time()
    time_set = toc - tic

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu,
             m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu,
             surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev,
             surface.kDev, surface.vertexDev, numpy.int32(len(surface.xj)),
             numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev,
             REAL(par_reac.threshold), AI_int_gpu,
             numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev,
             block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
def __init__(self, code, point, struct_ptr):
    self.code = cuda.to_device(code)
    self.point = cuda.to_device(point)
    self.code_shape, self.code_dtype = code.shape, code.dtype
    self.point_shape, self.point_dtype = point.shape, point.dtype
    cuda.memcpy_htod(int(struct_ptr), np.int32(code.size))
    cuda.memcpy_htod(int(struct_ptr) + 8, np.intp(int(self.code)))
    cuda.memcpy_htod(int(struct_ptr) + 8 + np.intp(0).nbytes,
                     np.intp(int(self.point)))
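The byte offsets used above (0, 8, 8 + sizeof(intp)) imply a device-side layout like the following sketch, assuming 64-bit pointers so the int at offset 0 is padded to 8 bytes. The actual struct is defined in the kernel source, which is not shown in this excerpt.

# struct WordRep {
#     int   size;    // code.size, padded to 8 bytes before the first pointer
#     int  *code;    // device pointer to the code array
#     int  *point;   // device pointer to the point array
# };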
def sync_to_device(self):
    self.object_array = np.array([f.as_array() for f in self.object_list])
    self.d_object_array = cuda.to_device(self.object_array)
    self.d_object_count = cuda.to_device(
        np.array([self.object_count], dtype=np.int32))
    # Pack the two device pointers into an intp array so the kernel can
    # dereference them; int() extracts the raw address from each allocation.
    self.device_ptr = cuda.to_device(
        np.array([int(self.d_object_array), int(self.d_object_count)],
                 dtype=np.intp))
    return self.device_ptr
def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.

    @param n: number of outputs which should be above the threshold
    @type n: int

    @return list of thresholds, in order of individuals, which delimit the
        top n output values
    """
    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    passSizes = []
    while n > 0:
        nextSize = min(self.maxHeapFloats, n)
        passSizes.append(nextSize)
        n -= nextSize
    log.debug("pass sizes: %r", passSizes)

    thresholdsMat = np.ones(shape=(self.popSize,), dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)
    uintBytes = np.dtype(np.uint32).itemsize
    thresholdCounts = np.zeros(shape=(self.popSize,), dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
        log.debug("begin pass size %d", passSize)
        self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                          self.outputs,
                                          self.trainSet.size,
                                          self.popSize,
                                          passSize,
                                          self.thresholds,
                                          self.thresholdCounts)
        driver.Context.synchronize()

        if log.isEnabledFor(logging.DEBUG):
            thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
            log.debug("thresholds: %s", str(thresholdsMat))

            thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
            log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu,
              surf, LorY, w, param, timing, kernel):

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    tic.record()
    REAL = param.REAL
    mDev = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT))  # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf])       # Pointer to first element in lists arrays

    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu,
                 surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev,
                 surfTar.sizeTarDev, surfSrc.kDev,
                 surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev,
                 mDev, mKtcDev,
                 surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
                 surfSrc.AreaDev, surfSrc.vertexDev,
                 numpy.int32(ptr_offset), numpy.int32(ptr_list),
                 numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                 numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT),
                 AI_int, surfSrc.XskDev, surfSrc.WskDev,
                 block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3

    tic.record()
    AI_aux = numpy.zeros(param.Nround, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
def K(self, Q, P, angles, quadratures):
    drv.memcpy_htod(self.mod_K.get_global("cos_phi")[0],
                    cos(angles).astype(scipy.float32))
    drv.memcpy_htod(self.mod_K.get_global("sin_phi")[0],
                    sin(angles).astype(scipy.float32))
    Nx = Q.shape[0]
    Ny = int(floor(quadratures.size / 1024.))
    K = scipy.empty((Nx,), dtype=scipy.float32)
    Kb = drv.mem_alloc(4*Ny*Nx)
    Q_gpu = drv.to_device(Q)
    P_gpu = drv.to_device(P)
    self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, Kb,
               block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4)
    self.reduction_gpu(Kb, drv.Out(K),
                       block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4)
    return K / self.L
def __init__(self):
    self.stream = cuda.Stream()
    self.pool = pycuda.tools.PageLockedMemoryPool()
    self._clear()

    # These resources rely on the slots/ringbuffer mechanism for sharing,
    # and so can be shared across any number of launches, genomes, and
    # render kernels. Notably, seeds are self-synchronizing, so they're not
    # attached to either stream object.
    self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
    seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
    self.d_seeds = cuda.to_device(seeds)
    self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
    self.d_points = cuda.mem_alloc(self._len_d_points)
def M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, surf, ind0, param, LorY,
            timing, kernel):

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL

    tic.record()
    M2P_size = surfTar.offsetMlt[surf,len(surfTar.twig)]
    MSort = numpy.zeros(param.Nm*M2P_size)
    MdSort = numpy.zeros(param.Nm*M2P_size)

    i = -1
    for C in surfTar.M2P_list[surf,0:M2P_size]:
        i += 1
        MSort[i*param.Nm:i*param.Nm+param.Nm] = surfSrc.tree[C].M
        MdSort[i*param.Nm:i*param.Nm+param.Nm] = surfSrc.tree[C].Md

    # (free, total) = cuda.mem_get_info()
    # print 'Global memory occupancy: %f%% free'%(free*100/total)

    MDev = cuda.to_device(MSort.astype(REAL))
    MdDev = cuda.to_device(MdSort.astype(REAL))

    # (free, total) = cuda.mem_get_info()
    # print 'Global memory occupancy: %f%% free'%(free*100/total)

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf])       # Pointer to first element in lists arrays

    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT))  # CUDA grid size
    multipole_gpu = kernel.get_function("M2P")

    multipole_gpu(K_gpu, V_gpu, surfTar.offMltDev, surfTar.sizeTarDev,
                  surfTar.xcDev, surfTar.ycDev, surfTar.zcDev,
                  MDev, MdDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
                  ind0.indexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list),
                  REAL(param.kappa), numpy.int32(param.BlocksPerTwig),
                  numpy.int32(param.NCRIT), numpy.int32(LorY),
                  block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_M2P += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
def test_multichannel_linear_texture(self):
    mod = SourceModule("""
    #define CHANNELS 4
    texture<float4, 1, cudaReadModeElementType> mtx_tex;

    __global__ void copy_texture(float *dest)
    {
      int i = threadIdx.x+blockDim.x*threadIdx.y;
      float4 texval = tex1Dfetch(mtx_tex, i);
      dest[i*CHANNELS + 0] = texval.x;
      dest[i*CHANNELS + 1] = texval.y;
      dest[i*CHANNELS + 2] = texval.z;
      dest[i*CHANNELS + 3] = texval.w;
    }
    """)

    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")

    shape = (16, 16)
    channels = 4
    a = np.random.randn(*(shape + (channels,))).astype(np.float32)
    a_gpu = drv.to_device(a)
    mtx_tex.set_address(a_gpu, a.nbytes)
    mtx_tex.set_format(drv.array_format.FLOAT, 4)

    dest = np.zeros(shape + (channels,), dtype=np.float32)
    copy_texture(drv.Out(dest),
                 block=shape + (1,),
                 texrefs=[mtx_tex])

    #print a
    #print dest
    assert la.norm(dest - a) == 0
def index_list_backend(self, ilists):
    from pytools import single_valued
    ilist_length = single_valued(len(il) for il in ilists)
    assert ilist_length == self.plan.dofs_per_face

    from cgen import Typedef, POD
    from pytools import flatten

    flat_ilists_uncast = numpy.array(list(flatten(ilists)))

    if numpy.max(flat_ilists_uncast) >= 256:
        tp = numpy.uint16
    else:
        tp = numpy.uint8

    flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
    assert (flat_ilists == flat_ilists_uncast).all()

    return GPUIndexLists(
            type=tp,
            code=[Typedef(POD(tp, "index_list_entry_t"))],
            device_memory=cuda.to_device(flat_ilists),
            bytes=flat_ilists.size * flat_ilists.itemsize,
            )
def batch_indexing(self, planes, data_points):
    data_size = data_points.shape[0] / 128

    self.benchmark_begin('preparing')

    gpu_alloc_objs = []

    # for data points
    #addresses = []
    #for point in data_points:
    #    point_addr = drv.to_device(point)
    #    gpu_alloc_objs.append(point_addr)
    #    addresses.append(int(point_addr))

    #np_addresses = numpy.array(addresses).astype(numpy.uint64)
    ## 64 bit addressing space. each point costs 8 bytes
    #arrays_gpu = drv.mem_alloc(np_addresses.shape[0] * 8)
    #drv.memcpy_htod(arrays_gpu, np_addresses)

    # for planes
    planes_addresses = []
    for plane in planes:
        plane_addr = drv.to_device(plane)
        gpu_alloc_objs.append(plane_addr)
        planes_addresses.append(int(plane_addr))

    planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64)
    # 64 bit addressing space. each point costs 8 bytes
    planes_arrays_gpu = drv.mem_alloc(planes_np_addresses.shape[0] * 8)
    drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses)

    # projections
    projections = numpy.zeros(data_size).astype(numpy.uint64)
    length = numpy.array([data_size]).astype(numpy.uint64)

    print "total: " + str(data_size) + " data points to indexing."

    self.benchmark_end('preparing')
    self.benchmark_begin('cudaing')

    self.indexing_kernel(
        planes_arrays_gpu, drv.In(data_points), drv.Out(projections),
        drv.In(length),
        block=self.block, grid=self.grid)

    self.benchmark_end('cudaing')

    #count = 0
    #for pro in projections:
    #    print "count: " + str(count) + " " + str(pro)
    #    count += 1
    #print projections.shape

    return projections
def go_sort_old(count, stream=None):
    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
    ddata = cuda.to_device(data)
    print 'Done seeding'

    grids = count / 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    #dsplit = cuda.to_device(pfxs)
    #launch('crappy_split', dpfxs, dsplit,
            #block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)

    #if not stream:
        #split = cuda.from_device_like(dsplit, pfxs)
        #split_ = cuda.from_device_like(dsplit_, pfxs)
        #print np.all(split == split_)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            dshortseg_pfxs, dshortseg_sums,
            block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
            block=(32, 32, 1), grid=(grids, 1), stream=stream)

    if not stream:
        sorted = cuda.from_device(dsorted, (count,), np.int32)
        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
        sort_stat = f(rle(sorted))
        with open('dev.txt', 'w') as fp:
            fp.write(sort_stat)

        sorted_np = np.sort(data)
        np_stat = f(rle(sorted_np))
        with open('cpu.txt', 'w') as fp:
            fp.write(np_stat)

        print 'is_sorted?', np.all(sorted == sorted_np)
def cls_init(self, kernel_nr, y_cls, cls1, cls2, cls1_n, cls2_n):
    """
    Prepares the cuda kernel call for kernel_nr: copies the data for a
    particular binary classifier between class 1 and class 2.

    Parameters
    ------------
    kernel_nr : int
        concurrent kernel number
    y_cls : array-like
        binary class labels (1, -1)
    cls1 : int
        first class number
    cls2 : int
        second class number
    cls1_n : int
        number of elements of class 1
    cls2_n : int
        number of elements of class 2
    kernel_out : array-like
        array for gpu kernel result, size=2*len(y_cls)
    """
    warp = 32
    align_cls1_n = cls1_n + (warp - cls1_n % warp) % warp
    align_cls2_n = cls2_n + (warp - cls2_n % warp) % warp

    self.cls1_N_aligned = align_cls1_n

    sum_cls = align_cls1_n + align_cls2_n
    self.sum_cls[kernel_nr] = sum_cls

    self.cls_count[kernel_nr] = np.array([cls1_n, cls2_n], dtype=np.int32)
    self.cls[kernel_nr] = np.array([cls1, cls2], dtype=np.int32)

    self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr])
    self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr])

    self.bpg[kernel_nr] = int(np.ceil((self.threadsPerRow*sum_cls + 0.0)/self.tpb))

    self.g_y[kernel_nr] = cuda.to_device(y_cls)

    self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0], dtype=np.float32)
    ker_out = self.kernel_out[kernel_nr]
    self.g_out[kernel_nr] = cuda.to_device(ker_out)  # cuda.mem_alloc_like(ker_out)
def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields={}):
    from hedge.backends.cuda.tools import pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
def __init__(self, fields, str_f, pt0, pt1):
    """
    """

    common.check_type('fields', fields, Fields)
    common.check_type('str_f', str_f, (str, list, tuple), str)
    common.check_type('pt0', pt0, (list, tuple), (int, float))
    common.check_type('pt1', pt1, (list, tuple), (int, float))

    pt0 = list(common.convert_indices(fields.ns, pt0))
    pt1 = list(common.convert_indices(fields.ns, pt1))

    # local variables
    str_fs = common.convert_to_tuple(str_f)
    dtype_str_list = fields.dtype_str_list

    for strf in str_fs:
        strf_list = ['ex', 'ey', 'ez', 'hx', 'hy', 'hz']
        common.check_value('str_f', strf, strf_list)

    for axis, n, p0, p1 in zip(['x', 'y', 'z'], fields.ns, pt0, pt1):
        common.check_value('pt0 %s' % axis, p0, range(n))
        common.check_value('pt1 %s' % axis, p1, range(n))

    # program
    macros = ['NMAX', 'XID', 'YID', 'ZID', \
              'ARGS', \
              'TARGET', 'SOURCE', 'OVERWRITE', \
              'DTYPE']

    values = common_gpu.macro_replace_list(pt0, pt1) + \
             ['DTYPE *source', \
              'target[sub_idx]', 'source[idx]', '='] + \
             dtype_str_list

    ksrc = common.replace_template_code( \
            open(common_gpu.src_path + 'copy.cu').read(), macros, values)
    program = SourceModule(ksrc)
    kernel_copy = program.get_function('copy')

    # allocation
    source_bufs = [fields.get_buf(str_f) for str_f in str_fs]
    shape = common.shape_two_points(pt0, pt1, len(str_fs))

    host_array = np.zeros(shape, fields.dtype)
    split_host_array = np.split(host_array, len(str_fs))
    split_host_array_dict = dict(zip(str_fs, split_host_array))
    target_buf = cuda.to_device(host_array)

    # global variables
    self.mainf = fields
    self.kernel_copy = kernel_copy
    self.source_bufs = source_bufs
    self.target_buf = target_buf
    self.host_array = host_array
    self.split_host_array_dict = split_host_array_dict
def set_refsmiles(self, refsmilesmat, refcountsmat, reflengths, refmags=None):  #{{{
    """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix
    *refcountsmat*, and length vector *reflengths*. If *refmags* is provided, it will
    be used as the magnitude vector; else, the magnitude vector will be computed (on
    the GPU) from the count matrix.

    Because of hardware limitations, the reference matrices (*refsmilesmat* and
    *refcountsmat*) must have no more than 32,768 rows (molecules) and 65,536 columns
    (Lingos). Larger computations must be performed in tiles.
    """

    # Set up lingo and count matrices on device #{{{
    if self.usePycudaArray:
        # Set up using PyCUDA CUDAArray support
        self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat, order='C')
        self.gpu.rcounts = cuda.matrix_to_array(refcountsmat, order='C')
        self.gpu.tex2lr.set_array(self.gpu.rsmiles)
        self.gpu.tex2cr.set_array(self.gpu.rcounts)
    else:
        # Manually handle setup
        temprlmat = self._padded_array(refsmilesmat)
        if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
            raise ValueError("Error: reference matrix is not allowed to have more "
                             "than 64K columns (LINGOs) or 32K rows (molecules) "
                             "(both padded to multiple of 16). "
                             "Dimensions = (%d,%d)." % temprlmat.shape)
        self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rsmiles, temprlmat, stream=self.gpu.stream)

        temprcmat = self._padded_array(refcountsmat)
        self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rcounts, temprcmat, stream=self.gpu.stream)

        descriptor = cuda.ArrayDescriptor()
        descriptor.width = temprcmat.shape[1]
        descriptor.height = temprcmat.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles, descriptor,
                                       temprlmat.strides[0])
        self.gpu.tex2cr.set_address_2d(self.gpu.rcounts, descriptor,
                                       temprcmat.strides[0])
        self.gpu.stream.synchronize()
        del temprlmat
        del temprcmat
    #}}}

    self.rlengths = reflengths
    self.rshape = refsmilesmat.shape
    self.nref = refsmilesmat.shape[0]

    # Copy reference lengths to GPU
    self.gpu.rl_gpu = cuda.to_device(reflengths)

    # Allocate buffers for query set magnitudes
    self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
    if refmags is not None:
        cuda.memcpy_htod(self.gpu.rmag_gpu, refmags)
    else:
        # Calculate query set magnitudes on GPU
        magthreads = 256
        self.gpu.refMagKernel(self.gpu.rmag_gpu, self.gpu.rl_gpu,
                              numpy.int32(self.nref),
                              block=(magthreads, 1, 1), grid=(30, 1),
                              shared=magthreads*4, texrefs=[self.gpu.tex2cr])
    return
def get_heartbeat(d_lead, length, sampling_rate):
    # Kernel Parameters
    threads_per_block = 200
    num_blocks = length / threads_per_block

    # Get RR
    reduce_by = 32
    edge_signal = cuda.mem_alloc(4 * length)
    edge_detect(edge_signal, d_lead,
                grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    indices = numpy.zeros(length / reduce_by).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(length / reduce_by).astype(numpy.int32))
    d_index = cuda.to_device(indices)
    index_of_peak(d_index, masks, edge_signal,
                  grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    cd_index, c_length = compact_sparse_with_mask(d_index, masks,
                                                  length / reduce_by)

    # Allocate output
    # full_rr_signal = numpy.zeros(c_length).astype(numpy.int32)
    dev_rr = cuda.mem_alloc(c_length * 4)
    num_blocks = (c_length / threads_per_block) + 1
    get_compact_rr(dev_rr, cd_index,
                   numpy.int32(sampling_rate), numpy.int32(c_length),
                   grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    clean_result(dev_rr,
                 numpy.int32(120), numpy.int32(40), numpy.int32(1),
                 numpy.int32(c_length),
                 grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length,), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length,), numpy.int32)
    index[0] = index[1]
    return rr, index / float(sampling_rate * 3600)
def __init__(self, array, struct_arr_ptr):
    print "copying data to device"

    self.data = cuda.to_device(array)
    self.shape, self.dtype = array.shape, array.dtype

    cuda.memcpy_htod(int(struct_arr_ptr),
                     numpy.getbuffer(numpy.int32(len(array[0]))))
    cuda.memcpy_htod(int(struct_arr_ptr) + 8,
                     numpy.getbuffer(numpy.intp(int(self.data))))
def P2P_gpu(surfSrc, surfTar, m, mx, my, mz, mKc, mVc,
            K_gpu, V_gpu, surf, LorY, K_diag, IorE, L, w,
            param, timing, kernel):

    tic = cuda.Event()
    toc = cuda.Event()

    tic.record()
    REAL = param.REAL
    mDev = cuda.to_device(m.astype(REAL))
    mxDev = cuda.to_device(mx.astype(REAL))
    myDev = cuda.to_device(my.astype(REAL))
    mzDev = cuda.to_device(mz.astype(REAL))
    mKcDev = cuda.to_device(mKc.astype(REAL))
    mVcDev = cuda.to_device(mVc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    tic.record()
    GSZ = int(ceil(float(param.Nround)/param.NCRIT))  # CUDA grid size
    direct_gpu = kernel.get_function("P2P")
    AI_int = cuda.to_device(zeros(param.Nround, dtype=int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf])       # Pointer to first element in lists arrays

    # Check if internal or external to send correct singular integral
    if IorE == 1:
        sglInt = surfSrc.sglInt_intDev
    else:
        sglInt = surfSrc.sglInt_extDev

    direct_gpu(K_gpu, V_gpu, surfSrc.offSrcDev, surfTar.offTwgDev,
               surfTar.P2P_lstDev, surfTar.sizeTarDev, surfSrc.kDev,
               surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev,
               mDev, mxDev, myDev, mzDev, mKcDev, mVcDev,
               surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
               surfSrc.AreaDev, sglInt, surfSrc.vertexDev,
               int32(ptr_offset), int32(ptr_list), int32(LorY),
               REAL(param.kappa), REAL(param.threshold),
               int32(param.BlocksPerTwig), int32(param.NCRIT),
               REAL(K_diag), AI_int,
               surfSrc.XskDev, surfSrc.WskDev,
               block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3

    tic.record()
    AI_aux = zeros(param.Nround, dtype=int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
def allocate_gpu(s):
    a = np.zeros((6, 5), 'f')
    if s.num_device == 0:
        a[-2,:] = 1.5
    if s.num_device == 1:
        a[1,:] = 2.0
        a[-2,:] = 2.5
    if s.num_device == 2:
        a[1,:] = 3.0
    s.arr_gpu = cuda.to_device(a)
    s.nx, s.ny = s.shape = a.shape
    s.dtype = a.dtype
def transfer_leads(*h_leads):
    length = len(h_leads[0])
    result = []
    grid = ((length / 1024) + 1, 1)
    block = (1024, 1, 1)
    for h_lead in h_leads:
        d_lead16 = cuda.to_device(h_lead)
        d_lead32 = cuda.mem_alloc(h_lead.nbytes * 2)
        to_float(d_lead32, d_lead16, numpy.int32(length), grid=grid, block=block)
        result.append(d_lead32)
    return tuple(result) + (length,)
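The `to_float` kernel is not defined in this excerpt. A minimal sketch consistent with the allocation above (2-byte integer samples widened into a buffer twice as large) might look like the following; the name, signature, and sample type are assumptions:

from pycuda.compiler import SourceModule

to_float_mod = SourceModule("""
__global__ void to_float(float *out, const short *in, int length)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < length)
        out[i] = (float) in[i];   // widen 16-bit samples to 32-bit floats
}
""")
to_float = to_float_mod.get_function("to_float")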
def __init__(self, array, struct_arr_ptr):
    self.data = cuda.to_device(array)
    self.shape, self.dtype = array.shape, array.dtype
    """
    numpy.getbuffer() needed due to lack of new-style buffer interface for
    scalar numpy arrays as of numpy version 1.9.1

    see: https://github.com/inducer/pycuda/pull/60
    """
    cuda.memcpy_htod(int(struct_arr_ptr),
                     numpy.getbuffer(numpy.int32(array.size)))
    cuda.memcpy_htod(int(struct_arr_ptr) + 8,
                     numpy.getbuffer(numpy.uintp(int(self.data))))
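For reference, a device-side struct consistent with the 0 / +8 offsets written above would look like this sketch (assuming 64-bit device pointers); the actual struct is declared in kernel source not shown in this file.

# struct DoubleOperation {
#     int     datalen;   // array.size, padded to 8 bytes
#     float  *ptr;       // device pointer to the data
# };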
def local_to_device(self, early_free=True):
    """
    Copies locally set array data to device memory.

    Parameters
    ----------
    early_free: boolean
        When True, the existing device allocation is freed before the new
        one is made, avoiding a brief double allocation.
    """
    if self.device_ptr is not None and early_free:
        self.device_ptr.free()
    self.device_ptr = drv.to_device(self.local_array)
def make_blocks(devdata, data):
    from hedge.backends.cuda.tools import pad_and_join

    blocks = ["".join(b) for b in data]
    block_size = devdata.align(max(len(b) for b in blocks))

    class BlockedDataStructure(Record):
        pass

    return BlockedDataStructure(
            blocks=cuda.to_device(pad_and_join(blocks, block_size)),
            max_per_block=max(len(b) for b in data),
            block_size=block_size,
            )
def alloc_eh_fields(s):
    f = np.zeros((s.nx, s.ny, s.nz), 'f')
    s.ex_gpu = cuda.to_device(f)
    s.ey_gpu = cuda.to_device(f)
    s.ez_gpu = cuda.to_device(f)
    s.hx_gpu = cuda.to_device(f)
    s.hy_gpu = cuda.to_device(f)
    s.hz_gpu = cuda.to_device(f)
    s.eh_fields = [s.ex_gpu, s.ey_gpu, s.ez_gpu, s.hx_gpu, s.hy_gpu, s.hz_gpu]
tpb = 256
bpg = (nx * ny * nz) / tpb

print 'dim (%d, %d, %d)' % (nx, ny, nz)
total_bytes = nx * ny * nz * 4 * 9
if total_bytes / (1024**3) == 0:
    print 'mem %d MB' % (total_bytes / (1024**2))
else:
    print 'mem %1.2f GB' % (float(total_bytes) / (1024**3))

# memory allocate
f = np.random.randn(nx * ny * nz).astype(np.float32).reshape((nx, ny, nz))
#f = np.zeros((nx,ny,nz), 'f')
ex_gpu = cuda.to_device(f)
ey_gpu = cuda.to_device(f)
ez_gpu = cuda.to_device(f)
hx_gpu = cuda.to_device(f)
hy_gpu = cuda.to_device(f)
hz_gpu = cuda.to_device(f)

cex = set_c(f, 'yz')
cey = set_c(f, 'zx')
cez = set_c(f, 'xy')
cex_gpu = cuda.to_device(cex)
cey_gpu = cuda.to_device(cey)
cez_gpu = cuda.to_device(cez)

descr = cuda.ArrayDescriptor3D()
def make_frame(frame_dir, frame, image_dir, frames, global_vars,
               find_total_max=False, save_density=False,
               save_entanglement=False, save_expectation_value=False,
               exp_range=[0, 0], **kwargs):
    global yMax, rhoMax, rhoMin
    frame_number = (frame.split("_")[1]).split(".")[0]
    print "Plotting", frame_dir
    QuantumState = np.load(frame_dir)
    # for i in xrange(len(QuantumState)):
    #     # if QuantumState[i,0,0,0]*QuantumState[i,0,0,0].conjugate()>0.:
    #     print i, QuantumState[i,0,0,0]
    # print "Info prob:", np.sum(QuantumState*QuantumState.conjugate())
    # QuantumState += 1.

    fig = plt.figure(figsize=(15, 12))
    oldQuantumState = None
    oldQuantumState = QuantumState.copy()

    xSize = global_vars["Qx"] / 2
    ySize = global_vars["Qy"] / 2
    zSize = 1

    blockX = 256
    x_nSize = xSize * ySize * (2 * xSize * ySize - 1)
    while x_nSize % blockX != 0:
        blockX /= 2
    gridX = x_nSize / blockX
    blockY = 1
    blockZ = 1
    gridY = 1
    gridZ = 1

    blockX_r1_r2 = 16
    blockY_r1_r2 = 16
    blockZ_r1_r2 = 1
    while xSize % blockX_r1_r2 != 0:
        blockX_r1_r2 /= 2
        blockY_r1_r2 /= 2
    gridX_r1_r2 = xSize * xSize / blockX_r1_r2
    gridY_r1_r2 = ySize * ySize / blockY_r1_r2
    gridZ_r1_r2 = 1

    QuantumField_r1_r2_real = np.zeros((xSize, ySize, xSize, ySize), dtype=np.float64)
    QuantumField_r1_r2_imag = np.zeros((xSize, ySize, xSize, ySize), dtype=np.float64)
    Rho_projected_real = np.zeros((xSize, ySize), dtype=np.float64)
    # AnalyticFieldReal = np.zeros((xSize, xSize), dtype = np.float64)
    # AnalyticFieldImag = np.zeros((xSize, xSize), dtype = np.float64)

    Lattice = np.zeros(3, dtype=np.int_)
    Lattice[0], Lattice[1], Lattice[2] = xSize, ySize, zSize
    Time = np.zeros(1, dtype=np.float64)
    Time[0] = int(frame_number)
    # purity = np.zeros(1, dtype = np.float64)
    # gpuPurity = drv.to_device(purity)

    gpuLattice = drv.to_device(Lattice)
    gpuTime = drv.to_device(Time)
    gpuQField = drv.to_device(QuantumState)
    gpuQF_r1_r2_real = drv.to_device(QuantumField_r1_r2_real)
    gpuQF_r1_r2_imag = drv.to_device(QuantumField_r1_r2_imag)
    gpuRho_projected_real = drv.to_device(Rho_projected_real)
    # gpuAnalyticFieldReal = drv.to_device(AnalyticFieldReal)
    # gpuAnalyticFieldImag = drv.to_device(AnalyticFieldImag)

    get_QS(gpuQField, gpuQF_r1_r2_real, gpuQF_r1_r2_imag, gpuLattice,
           block=(blockX, blockY, blockZ), grid=(gridX, gridY, gridZ))
    get_Rho_projected(gpuQF_r1_r2_real, gpuQF_r1_r2_imag,
                      gpuRho_projected_real, gpuLattice,
                      block=(blockX_r1_r2, blockY_r1_r2, blockZ_r1_r2),
                      grid=(gridX_r1_r2, gridY_r1_r2, gridZ_r1_r2))
    # if save_entanglement:
    #     calcPurity(gpuQF_r1_r2_real, gpuQF_r1_r2_imag, gpuPurity, gpuLattice,
    #                block=(blockX_r1_r2,blockY_r1_r2,blockZ_r1_r2),
    #                grid=(gridX_r1_r2,gridY_r1_r2,gridZ_r1_r2))
    # get_Analytic(gpuAnalyticFieldReal, gpuAnalyticFieldImag, gpuAn, gpuBn,
    #              gpuAn2, gpuBn2, gpuLattice, gpuTime,
    #              block=(blockX,blockY,blockZ), grid=(gridX,gridY,gridZ))

    QuantumField_r1_r2_real = drv.from_device(gpuQF_r1_r2_real,
                                              QuantumField_r1_r2_real.shape,
                                              np.float64)
    QuantumField_r1_r2_imag = drv.from_device(gpuQF_r1_r2_imag,
                                              QuantumField_r1_r2_imag.shape,
                                              np.float64)
    Rho_projected_real = drv.from_device(gpuRho_projected_real,
                                         Rho_projected_real.shape, np.float64)
    # AnalyticFieldReal = drv.from_device(gpuAnalyticFieldReal, AnalyticFieldReal.shape, np.float64)
    # AnalyticFieldImag = drv.from_device(gpuAnalyticFieldImag, AnalyticFieldImag.shape, np.float64)

    RhoFieldProjected = np.zeros((xSize, ySize), dtype=DTYPE)
    # RhoFieldProjectedAnalytic = np.zeros((xSize), dtype = DTYPE)
    rho_total = np.sum(Rho_projected_real)
    # print rho_total
    for x in xrange(xSize):
        for y in xrange(ySize):
            RhoFieldProjected[x, y] = Rho_projected_real[x, y] / rho_total

    # Get Rho and Phase from QField
    # RhoField = (QuantumField_r1_r2_real+1.j*QuantumField_r1_r2_imag)*(QuantumField_r1_r2_real+1.j*QuantumField_r1_r2_imag).conjugate()
    # PhaseField = np.arctan2(QuantumField_r1_r2_imag,QuantumField_r1_r2_real)+np.pi

    if int(frame_number) == 0:
        # rhoMin, rhoMax = np.amin(RhoField).real, np.amax(RhoField).real
        yMax = np.amax(RhoFieldProjected.real)

    time = int(frame_number)
    # purity = drv.from_device(gpuPurity, purity.shape, np.float64)
    # entanglement = 1-2.*purity[0]/(rho_total*rho_total)

    xAxis = setXaxis(xSize)
    yAxis = setXaxis(ySize)

    time_text = plt.suptitle(r'$\tau = $' + str(time), fontsize=14,
                             horizontalalignment='center',
                             verticalalignment='top')
    # if save_entanglement:
    #     time_text = plt.suptitle(r'$\tau = $' + str(time) + ' ' + r'$\mathcal{E} = $' + str(entanglement),
    #                              fontsize=14, horizontalalignment='center', verticalalignment='top')
    # full_text = plt.suptitle(r'$\tau = $' + str(time) + ' ' + r'$P_{f} = $' + str('{:1.15f}'.format(prob)) +
    #                          ' ' + r'$P_{p} = $' + str('{:1.15f}'.format(probP)),
    #                          fontsize=14, horizontalalignment='center', verticalalignment='top')
    # gs = gridspec.GridSpec(1,1)
    # ax = fig.add_subplot(gs[0,0], xlim=(0,xSize), xlabel=r'$x(\ell)$',
    #                      ylim=(-0.0000001, 1.1*yMax), ylabel=r'${| \Psi |}^{2}$')

    ############################### PLOTTING ####################################
    # print "Rho: ", RhoFieldProjected
    if yMax == 0.:
        yMax += 1.

    # Density
    plt.subplot(111)
    plt.imshow(((RhoFieldProjected.real).T),
               extent=(np.amin(xAxis), np.amax(xAxis),
                       np.amin(yAxis), np.amax(yAxis)),
               origin='lower', cmap=colorMapRho,
               norm=colors.SymLogNorm(linthresh=linThresh * yMax,
                                      linscale=linScale, vmin=0., vmax=yMax))
    putLabels(r'$x\ \ (\ell)$', r'$y\ \ (\ell)$', r'$\rho \ \ (\frac{1}{\ell^2})$')
    ax = plt.gca()
    ax.set_aspect('equal')

    # #Phase
    # plt.subplot(122)
    # plt.imshow((PhaseField.real), extent=(np.amin(xAxis), np.amax(xAxis), np.amin(xAxis), np.amax(xAxis)),
    #            origin='lower', cmap=colorMapPhase, norm=colors.Normalize(vmin=0., vmax=2.*np.pi))
    # putLabels(r'$x_{1}\ \ (\ell)$', r'$x_{2}\ \ (\ell)$', r'$\theta \ \ (Radians)$')

    # # Any value whose absolute value is > .0001 will have zero transparency
    # alphas = Normalize(0, rhoMax, clip=True)((rhoMax-RhoField.real))
    # # alphas = colors.SymLogNorm(linthresh=linThresh*rhoMax, linscale=linScale, vmin=0., vmax=10., clip=True)((rhoMax-RhoField.real).T)
    # alphas = np.clip(alphas**4, 0.0, 1)  # alpha value clipped at the bottom at .4
    # # Normalize the colors b/w 0 and 1, we'll then pass an MxNx4 array to imshow
    # cmap = plt.cm.gist_gray
    # colorsMap = colors.SymLogNorm(linthresh=linThresh*rhoMax, linscale=linScale, vmin=0., vmax=rhoMax)((0.*RhoField.real))
    # colorsMap = cmap(colorsMap)
    # # Now set the alpha channel to the one we created above
    # colorsMap[..., -1] = alphas
    # plt.imshow(colorsMap, extent=(np.amin(xAxis), np.amax(xAxis), np.amin(xAxis), np.amax(xAxis)), origin='lower')
    # ax = plt.gca()
    # ax.set_aspect('equal')

    # Screen density
    fig.tight_layout(pad=0.4, w_pad=5.0, h_pad=1.0, rect=[.05, .05, .95, .95])

    if save_density:
        rho_dir = frame_dir.split("Data")[0] + "Density"
        if not os.path.exists(rho_dir + '/'):
            os.makedirs(rho_dir + '/')
        print "Saving density to", rho_dir + '/Frame_' + frame_number + '.npy'
        np.save(rho_dir + '/Frame_' + frame_number, RhoFieldProjected.real)

    # if save_entanglement:
    #     ent_dir = frame_dir.split("Data")[0] + "Entanglement"
    #     if not os.path.exists(ent_dir + '/'):
    #         os.makedirs(ent_dir + '/')
    #     print "Saving entanglement to", ent_dir + '/entanglement.npy'
    #     if time == 0:
    #         entanglementArray = np.asarray([[entanglement, time]])
    #     else:
    #         entArrayOld = np.load(ent_dir + '/entanglement.npy')
    #         entArrayNew = np.asarray([[entanglement, time]])
    #         entanglementArray = np.append(entArrayOld, entArrayNew, axis=0)
    #     np.save(ent_dir + '/entanglement', entanglementArray)

    # if save_expectation_value:
    #     exp_dir = frame_dir.split("Data")[0] + "expectation"
    #     if not os.path.exists(exp_dir + '/'):
    #         os.makedirs(exp_dir + '/')
    #     print "Saving expectation to", exp_dir + '/expectation.npy'
    #     exp_val = 0.
    #     if exp_range == [0, 0]:
    #         for x in xrange(xSize):
    #             exp_val += x*RhoFieldProjected[x].real
    #     else:
    #         for x in xrange(int(exp_range[0]*xSize), int(exp_range[1]*xSize), 1):
    #             exp_val += x*RhoFieldProjected[x].real
    #     if time == 0:
    #         expectationArray = np.asarray([exp_val])
    #     else:
    #         expArrayOld = np.load(exp_dir + '/expectation.npy')
    #         expArrayNew = np.asarray([exp_val])
    #         expectationArray = np.append(expArrayOld, expArrayNew, axis=0)
    #     np.save(exp_dir + '/expectation', expectationArray)

    # Free memory
    gpuLattice.free()
    gpuQField.free()
    gpuQF_r1_r2_real.free()
    gpuQF_r1_r2_imag.free()
    gpuRho_projected_real.free()
    gpuTime.free()
    # gpuPurity.free()
    # gpuAnalyticFieldReal.free()
    # gpuAnalyticFieldImag.free()

    if not os.path.exists(image_dir + '/'):
        os.makedirs(image_dir + '/')
    fig.savefig(image_dir + '/Frame_' + frame_number + ".png")
    plt.close(fig)
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule
from six.moves import range

thread_strides = 16
block_size = 256
macroblock_count = 33

total_size = thread_strides * block_size * macroblock_count
dtype = numpy.float32

a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(a.nbytes)

from cgen import FunctionBody, \
        FunctionDeclaration, POD, Value, \
        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(FunctionDeclaration(
            Value("void", "add"),
            arg_decls=[Pointer(POD(dtype, name))
                       for name in ["tgt", "op1", "op2"]
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """
    It computes the reaction potential on the GPU and brings the data back
    to the cpu.

    Arguments
    ----------
    XK      : array, input for the double layer potential.
    XV      : array, input for the single layer potential.
    surface : class, surface where we are computing the reaction potential.
    field   : class, information about the different regions in the molecule.
    par_reac: class, fine parameters related to the surface.

    Returns
    --------
    phir_cpu: array, reaction potential brought from the GPU to the cpu.
    AI_int  : int, counter of the amount of near singular integrals solved.
    """
    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w = getWeights(K)
    X_V = numpy.zeros(N * K)
    X_Kx = numpy.zeros(N * K)
    X_Ky = numpy.zeros(N * K)
    X_Kz = numpy.zeros(N * K)
    X_Kc = numpy.zeros(N * K)
    X_Vc = numpy.zeros(N * K)

    for i in range(N * K):
        X_V[i] = XV[i // K] * w[i % K] * surface.area[i // K]
        X_Kx[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 0]
        X_Ky[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 1]
        X_Kz[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 2]
        X_Kc[i] = XK[i // K]
        X_Vc[i] = XV[i // K]

    toc = time.time()

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq) / par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu,
             my_gpu, mz_gpu, mKc_gpu, mVc_gpu, surface.xjDev, surface.yjDev,
             surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq),
             numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev,
             surface.WskDev,
             block=(par_reac.BSZ, 1, 1),
             grid=(GSZ, 1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
    if( idx < nx ) f[ijk] += sin(0.1*tn);
}
"""

if __name__ == '__main__':
    nx, ny, nz = 320, 320, 320

    print 'dim (%d, %d, %d)' % (nx, ny, nz)
    print 'mem %1.2f GB' % ( nx*ny*nz*4*12./(1024**3) )

    # memory allocate
    f = np.zeros((nx,ny,nz), 'f', order='F')
    cf = np.ones_like(f)*0.5

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    cex_gpu = cuda.to_device(cf)
    cey_gpu = cuda.to_device(cf)
    cez_gpu = cuda.to_device(cf)
    chx_gpu = cuda.to_device(cf)
    chy_gpu = cuda.to_device(cf)
    chz_gpu = cuda.to_device(cf)

    # prepare kernels
    from pycuda.compiler import SourceModule
    sys.exit()
if (nx*ny) % Dy != 0:
    print "Error: nx*ny is not multiple of %d" % (Dy)
    sys.exit()

Bx, By = nz/Dx, nx*ny/Dy  # number of block
MBy = MAX_BLOCK/Bx
bpg_list = [(Bx, MBy) for i in range(By/MBy)]
if By % MBy != 0:
    bpg_list.append((Bx, By % MBy))
#print bpg_list

# memory allocate
f = np.zeros((nx,ny,nz), 'f')
cf = np.ones_like(f)*0.5

ex_gpu = cuda.to_device(f)
ey_gpu = cuda.to_device(f)
ez_gpu = cuda.to_device(f)
hx_gpu = cuda.to_device(f)
hy_gpu = cuda.to_device(f)
hz_gpu = cuda.to_device(f)

cex = set_c(f,'yz')
cey = set_c(f,'zx')
cez = set_c(f,'xy')

descr = cuda.ArrayDescriptor3D()
descr.width = nz
descr.height = ny
descr.depth = nx
descr.format = cuda.dtype_to_array_format(f.dtype)
plt.imshow(J)
plt.show()

cu.init()
d = cu.Device(1)
ctx = d.make_context()

kernel_size = 3
block_size = (16, 16)
#grid_size = calculate_grid_size((height, width), block_size)
grid_size = (32, 32)

#print(I.shape)
#print(grid_size)

I_gpu = cu.to_device(I.astype('float32'))
J_gpu = cu.mem_alloc(J.nbytes)

source = cu.module_from_file("sobel.cubin")
kernel_naive = source.get_function("sobel_filter")
kernel_naive.prepare(['P', 'P', 'Q', 'Q', 'Q', 'Q'])

time = kernel_naive.prepared_timed_call(grid_size, block_size,
                                        I_gpu, J_gpu, height, width,
                                        kernel_size, 4)
J1 = cu.from_device(J_gpu, shape=J.shape, dtype="float32")

print("Time spent in kernel1: {}s".format(time() * 1e-3))
print("L1 norm: {}".format(np.sum(np.sum(np.abs(J - J1)))))

plt.imshow(J1)
plt.show()
total_bytes = nx*ny*nz*8*9
if total_bytes/(1024**3) == 0:
    print '%d MB' % ( total_bytes/(1024**2) )
else:
    print '%1.2f GB' % ( float(total_bytes)/(1024**3) )

if nz % 32 != 0:
    print "Error: nz is not multiple of 32"
    sys.exit()

# memory allocate
f = np.zeros((nx,ny,nz), dtype=np.float64)
cf = np.ones_like(f)*0.5

eh_gpus = ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu = \
        [cuda.to_device(f) for i in range(6)]
ce_gpus = cex_gpu, cey_gpu, cez_gpu = [cuda.to_device(cf) for i in range(3)]

# prepare kernels
tpb = 256
for bpg in xrange(65535, 0, -1):
    if (nx * ny * nz / tpb) % bpg == 0:
        break
print 'tpb = %d, bpg = %g' % (tpb, bpg)

from pycuda.compiler import SourceModule
mod = SourceModule(
        kernels.replace('Dx', str(tpb))
               .replace('nxyz', str(nx*ny*nz))
               .replace('nyz', str(ny*nz))
               .replace('nx', str(nx))
               .replace('ny', str(ny))
               .replace('nz', str(nz)),
        options=['-m 64'])
update_h = mod.get_function("update_h")
update_e = mod.get_function("update_e")
update_src = mod.get_function("update_src")
def test_memory(self):
    z = np.random.randn(400).astype(np.float32)
    new_z = drv.from_device_like(drv.to_device(z), z)
    assert la.norm(new_z - z) == 0
""" kernels = """""" import numpy as np import sys import pycuda.driver as cuda import pycuda.autoinit if __name__ == '__main__': size = 100 # MBtye nx = size * (1024**2) / 4 nloop = 50 # memory allocate fw_gpu = cuda.to_device(np.zeros(nx, 'f')) fr_gpu = cuda.to_device(np.random.randn(nx).astype(np.float32)) # prepare kernels from pycuda.compiler import SourceModule body = 'fr[idx]' for i in range(nloop): if (i > 0): body += ' + fr[idx]' kernels += kernel_template.replace('NAME', 'func%.2d' % i).replace( 'BODY', body) print kernels mod = SourceModule(kernels) kern_list = [] for i in range(nloop): kern_list.append(mod.get_function("func%.2d" % i))
    if pt[2] != None:
        cf[:, :, pt[2]] = 0

    return cf

if __name__ == '__main__':
    nx, ny, nz = 320, 320, 320

    print 'dim (%d, %d, %d)' % (nx, ny, nz)
    print 'mem %1.2f GB' % (nx * ny * nz * 4 * 12. / (1024**3))

    # memory allocate
    f = np.zeros((nx, ny, nz), 'f')
    cf = np.zeros_like(f)

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1)))
    cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1)))
    cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None)))
    chx_gpu = cuda.to_device(set_c(cf, (None, 0, 0)))
    chy_gpu = cuda.to_device(set_c(cf, (0, None, 0)))
    chz_gpu = cuda.to_device(set_c(cf, (0, 0, None)))

    # prepare kernels
    from pycuda.compiler import SourceModule
if __name__ == '__main__':
    digit_level = 25  ############ tunable parameter (maximum 25) ############
    digitN = 1 << digit_level
    gsz, lsz, gsz2, lsz2 = Creategszlsz()  # compute GPU kernel grid/block sizes
    print("number of elements in A, B =", digitN)

    FMT_P0 = FMTClass(MODP=469762049,
                      MODP_WnSqrt=CreateWnSqrt(60733, 469762049, 26))
    FMT_P1 = FMTClass(MODP=1811939329,
                      MODP_WnSqrt=CreateWnSqrt(59189, 1811939329, 26))
    FMT_P2 = FMTClass(MODP=2013265921,
                      MODP_WnSqrt=CreateWnSqrt(52278, 2013265921, 27))

    host_E = np.zeros(digitN * 2).astype(np.uint32)  # holds the result
    E = drv.to_device(host_E)  # allocate GPU memory; must be zero-initialized

    print("generating initial values")
    host_A, host_B = InitializeAB()
    A0 = drv.to_device(host_A)  # allocate GPU memory and transfer
    B0 = drv.to_device(host_B)  # allocate GPU memory and transfer
    A1 = drv.to_device(host_A)  # allocate GPU memory and transfer
    B1 = drv.to_device(host_B)  # allocate GPU memory and transfer
    A2 = drv.to_device(host_A)  # allocate GPU memory and transfer
    B2 = drv.to_device(host_B)  # allocate GPU memory and transfer

    print("starting GPU computation")
    stime = time.time()
    E0 = FMT_P0.Convolution(A0, B0)
    E1 = FMT_P1.Convolution(A1, B1)
    E2 = FMT_P2.Convolution(A2, B2)
def go_sort(count, stream=None):
    grids = count / 8192

    keys = np.fromstring(np.random.bytes(count * 2), dtype=np.uint16)
    #keys = np.arange(count, dtype=np.uint16)
    #np.random.shuffle(keys)
    mkeys = np.reshape(keys, (grids, 8192))
    vals = np.arange(count, dtype=np.uint32)
    dkeys = cuda.to_device(keys)
    dvals = cuda.to_device(vals)
    print 'Done seeding'

    dpfxs = cuda.mem_alloc(grids * 256 * 4)
    doffsets = cuda.mem_alloc(count * 2)
    launch('prefix_scan_8_0', doffsets, dpfxs, dkeys,
            block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)

    # This stage will be rejiggered along with the split
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)

    launch('convert_offsets', doffsets, dsplit, dkeys, i32(0),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream)

    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = py_radix_sort_maybe(mkeys, offsets, pfxs, split, 0)
        #print frle(tkeys & 0xff)

    d_skeys = cuda.mem_alloc(count * 2)
    d_svals = cuda.mem_alloc(count * 4)
    if not stream:
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('radix_sort_maybe', d_skeys, d_svals,
            dkeys, dvals, doffsets, dpfxs, dsplit, i32(0),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        # Test integrity of sort (keys and values kept together):
        #   skeys[i] = keys[svals[i]] for all i
        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

    dkeys, d_skeys = d_skeys, dkeys
    dvals, d_svals = d_svals, dvals

    if not stream:
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('prefix_scan_8_8', doffsets, dpfxs, dkeys,
            block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)

    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)

    if not stream:
        pre_offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)

    launch('convert_offsets', doffsets, dsplit, dkeys, i32(8),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream)

    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)

        tkeys = np.reshape(tkeys, (grids, 8192))
        new_offs = py_convert_offsets(pre_offsets, split, tkeys, 8)
        print np.nonzero(new_offs != offsets)
        fkeys = py_radix_sort_maybe(tkeys, new_offs, pfxs, split, 8)
        #print frle(fkeys)

    launch('radix_sort_maybe', d_skeys, d_svals,
            dkeys, dvals, doffsets, dpfxs, dsplit, i32(8),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        #print cuda.from_device(doffsets, (4, 8192), np.uint16)
        #print cuda.from_device(dkeys, (4, 8192), np.uint16)
        #print cuda.from_device(d_skeys, (4, 8192), np.uint16)

        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

        sorted_keys = np.sort(keys)
        # Test that ordering is correct. (Note that we don't need 100%
        # correctness, so this test should be made "soft".)
        print 'Order: ', 'pass' if np.all(skeys == sorted_keys) else 'FAIL'
if __name__ == '__main__':
    nx, ny, nz = 240, 256, 256
    tpb = 256
    bpg = (nx * ny * nz) / tpb

    print 'dim (%d, %d, %d)' % (nx, ny, nz)
    total_bytes = nx * ny * nz * 4 * 9
    if total_bytes / (1024**3) == 0:
        print 'mem %d MB' % (total_bytes / (1024**2))
    else:
        print 'mem %1.2f GB' % (float(total_bytes) / (1024**3))

    # memory allocate
    f = np.zeros((nx, ny, nz), 'f', order='F')

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    cex_gpu = cuda.to_device(set_c(f, 'yz'))
    cey_gpu = cuda.to_device(set_c(f, 'zx'))
    cez_gpu = cuda.to_device(set_c(f, 'xy'))

    # prepare kernels
    from pycuda.compiler import SourceModule
    mod = SourceModule(
        kernels.replace('TPB', str(tpb)).replace('nxy', str(nx * ny)).replace(
            'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz)))
def test_two_directions(self):
    IMAGE_DIR = "Backpack-perfect"
    im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png"))
    im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png"))
    stereo = SemiGlobalMatching(im1, im2,
                                os.path.join("../data", IMAGE_DIR, "calib.txt"),
                                window_size=3, resize=(640, 480))

    params = {"p1": 5, "p2": 90000, "census_kernel_size": 7, "reversed": True}
    stereo.set_params(params)
    stereo.params['ndisp'] = 50
    t1 = time()

    assert stereo.p1 is not None, "parameters have not been set"
    t1 = time()
    cim1 = stereo.census_transform(stereo.im1)
    cim2 = stereo.census_transform(stereo.im2)
    #print(f"census transform time {time() - t1}")

    if not stereo.reversed:
        D = range(int(stereo.params['ndisp']))
    else:
        D = reversed(range(int(-stereo.params['ndisp']), 1))
    cost_images = stereo.compute_disparity_img(cim1, cim2, D)
    cost_images = np.float32(cost_images)
    m, n, D = cost_images.shape

    # direction == (1, 0)
    stereo.directions = [(1, 0), (-1, 0), (1, 1), (-1, 1), (1, -1), (-1, -1)]
    #stereo.directions = [(0, 1)]
    t1 = time()
    L = stereo.aggregate_cost(cost_images)
    print("python aggregate cost %f" % (time() - t1))

    L = L.transpose((2, 0, 1))
    cost_images = cost_images.transpose((2, 0, 1))
    cost_images = np.ascontiguousarray(cost_images, dtype=np.float32)
    d, rows, cols = cost_images.shape
    d_step = 1
    rows = np.int32(rows)
    cols = np.int32(cols)

    compiler_constants = {
        'D_STEP': d_step,
        'D': d,
        'ARR_SIZE': math.floor(d / d_step),
        'P1': 5,
        'P2': 90000,
        'SHMEM_SIZE': 64
    }
    build_options = [format_compiler_constants(compiler_constants)]
    mod = SourceModule(open("../lib/sgbm_helper.cu").read(),
                       options=build_options)

    shmem_size = 16
    vertical_blocks = int(math.ceil(rows / shmem_size))

    #r_aggregate = mod.get_function('r_aggregate')
    vertical_aggregate_down = mod.get_function('vertical_aggregate_down')
    vertical_aggregate_up = mod.get_function('vertical_aggregate_up')
    diagonal_br_tl_aggregate = mod.get_function('diagonal_br_tl_aggregate')
    diagonal_tl_br_aggregate = mod.get_function('diagonal_tl_br_aggregate')
    diagonal_tr_bl_aggregate = mod.get_function('diagonal_tr_bl_aggregate')
    diagonal_bl_tr_aggregate = mod.get_function('diagonal_bl_tr_aggregate')
    #l_aggregate = mod.get_function('l_aggregate')

    t1 = time()
    cost_images_ptr = drv.to_device(cost_images)
    dp_ptr = drv.mem_alloc(cost_images.nbytes)

    vertical_aggregate_down(dp_ptr, cost_images_ptr, rows, cols,
                            block=(256, 1, 1), grid=(1, 1))
    vertical_aggregate_up(dp_ptr, cost_images_ptr, rows, cols,
                          block=(256, 1, 1), grid=(1, 1))
    #r_aggregate(dp_ptr, cost_images_ptr, rows, cols,
    #            block=(shmem_size, shmem_size, 1), grid=(1, vertical_blocks))
    #l_aggregate(dp_ptr, cost_images_ptr, rows, cols,
    #            block=(shmem_size, shmem_size, 1), grid=(1, vertical_blocks))
    diagonal_tl_br_aggregate(dp_ptr, cost_images_ptr, rows, cols,
                             block=(256, 1, 1), grid=(1, 1))
    diagonal_bl_tr_aggregate(dp_ptr, cost_images_ptr, rows, cols,
                             block=(256, 1, 1), grid=(1, 1))
    diagonal_tr_bl_aggregate(dp_ptr, cost_images_ptr, rows, cols,
                             block=(256, 1, 1), grid=(1, 1))
    diagonal_br_tl_aggregate(dp_ptr, cost_images_ptr, rows, cols,
                             block=(256, 1, 1), grid=(1, 1))
    print("cuda aggregate cost %f" % (time() - t1))

    drv.stop_profiler()
    agg_image = drv.from_device(dp_ptr, cost_images.shape, dtype=np.float32)
    s1 = np.sum(np.float64(L))
    s2 = np.sum(np.float64(agg_image))
    print("L sum: %f" % s1)
    print("out sum: %f" % s2)
    self.assertTrue(np.all(np.isclose(agg_image, L)))
""" kernels = """""" import numpy as np import sys import pycuda.driver as cuda import pycuda.autoinit if __name__ == '__main__': size = 100 # MBtye nx = size * (1024**2) / 4 nloop = 30 # memory allocate fw_gpu = cuda.to_device(np.zeros(nx, 'f')) f = np.random.randn(nx).astype(np.float32) fr_gpu_list = [] for i in range(nloop): fr_gpu_list.append(cuda.to_device(f)) # prepare kernels from pycuda.compiler import SourceModule args = 'float *fr00' body = 'f = 0.543*fr00[idx];\n\t__syncthreads();\n' for i in range(nloop): if (i > 0): args += ', float *fr%.2d' % i body += '\tf += %1.3f*fr%.2d[idx];\n\t__syncthreads();\n' % ( np.random.ranf(), i) kernels += kernel_template.replace('NAME', 'func%.2d' % i).replace(
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from magma_wrapper import magma_spotrf_gpu_wrap, magma_get_spotrf_nb_wrap

n = 1000

# Create matrix to be factored
A = np.eye(n, dtype='float32') + np.ones((n, n))*.1
A_gpu = cuda.to_device(A)

# Allocate pagelocked work array, sized by MAGMA's block size for spotrf
nb = magma_get_spotrf_nb_wrap(n)
work_gpu = cuda.pagelocked_empty((nb, nb), dtype='float32')

# Do Cholesky factorization
info = magma_spotrf_gpu_wrap('U', n, A_gpu, n, work_gpu)

# Copy back the Cholesky factor and check for correctness
L = cuda.from_device(A_gpu, (n, n), 'float32')
print np.abs(np.dot(L, L.T) - A).max()
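# As a CPU cross-check (a sketch, not part of the MAGMA example above):
# numpy's cholesky returns the lower-triangular factor, so the same
# residual test applies.
L_np = np.linalg.cholesky(np.float64(A))
print np.abs(np.dot(L_np, L_np.T) - np.float64(A)).max()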
gsz = 1 << (digit_level - 1)  # gpu global_work_size
lsz = min(gsz, 256)           # gpu local_work_size
gsz2 = digitN
lsz2 = min(gsz2, 256)
#print("number of elements of A and B =", digitN)

FMT_P0 = FMTClass(MODP=469762049, MODP_WnSqrt=CreateWnSqrt(60733, 469762049, 26))
FMT_P1 = FMTClass(MODP=1811939329, MODP_WnSqrt=CreateWnSqrt(59189, 1811939329, 26))
FMT_P2 = FMTClass(MODP=2013265921, MODP_WnSqrt=CreateWnSqrt(52278, 2013265921, 27))

host_E = np.zeros(digitN * 2).astype(np.uint32)  # holds the result
E = drv.to_device(host_E)  # allocate GPU memory; it must be zero-initialized

#print("generating initial values")
host_A, host_B = InitializeAB()
A0 = drv.to_device(host_A % np.uint32(FMT_P0.MODP))  # allocate GPU memory and transfer (likewise below)
B0 = drv.to_device(host_B % np.uint32(FMT_P0.MODP))
A1 = drv.to_device(host_A % np.uint32(FMT_P1.MODP))
B1 = drv.to_device(host_B % np.uint32(FMT_P1.MODP))
A2 = drv.to_device(host_A % np.uint32(FMT_P2.MODP))
B2 = drv.to_device(host_B % np.uint32(FMT_P2.MODP))

#print("starting GPU computation")
stime = time.time()
E0 = FMT_P0.Convolution(A0, B0)
E1 = FMT_P1.Convolution(A1, B1)
E2 = FMT_P2.Convolution(A2, B2)
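# E0, E1 and E2 hold the convolution modulo three NTT-friendly primes; the
# full coefficients are recovered with the Chinese Remainder Theorem. A
# sketch under assumptions (Garner's algorithm, operating on Python ints;
# pow(x, -1, m) needs Python >= 3.8):
def crt3(e0, e1, e2, p0=469762049, p1=1811939329, p2=2013265921):
    m01 = p0 * p1
    x = ((e1 - e0) * pow(p0, -1, p1)) % p1 * p0 + e0   # matches e0 mod p0, e1 mod p1
    x = ((e2 - x) * pow(m01, -1, p2)) % p2 * m01 + x   # matches e2 mod p2 as well
    return x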
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag, self, param, ind0,
               timing, kernel):
    """
    It computes the adjoint double layer potential.

    Arguments
    ----------
    XKt    : array, input for the adjoint double layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation
             points.
    Kt_diag: array, diagonal elements of the adjoint double layer integral
             operator.
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode
             computation.
    timing : class, it contains timing information for different parts of
             the code.
    kernel : pycuda source module.

    Returns
    --------
    Kt_lyr : array, adjoint double layer potential.
    """

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)

    tic.record()
    K = param.K
    w = getWeights(K)
    X_Kt = numpy.zeros(Ns * K)
    X_Ktc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_Kt[:] = XKt[NsK // K] * w[NsK % K] * surfSrc.area[NsK // K]
    X_Ktc[:] = XKt[NsK // K]
    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    X_aux = numpy.zeros(Ns * K)
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj,
                 X_Kt, X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_Kt = X_Kt[surfSrc.sortSource]
    X_Ktc = X_Ktc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    Ktx_aux = numpy.zeros(param.Nround)
    Kty_aux = numpy.zeros(param.Nround)
    Ktz_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(surfSrc, surfTar, Ktx_aux,
                                                   Kty_aux, Ktz_aux, self,
                                                   ind0.index_large, param,
                                                   LorY, timing)

        Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(surfSrc, surfTar, X_Kt, X_Ktc,
                                               Ktx_aux, Kty_aux, Ktz_aux,
                                               self, LorY, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL))
        Kty_gpu = cuda.to_device(Kty_aux.astype(REAL))
        Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(surfSrc, surfTar, Ktx_gpu,
                                                  Kty_gpu, Ktz_gpu, self,
                                                  ind0, param, LorY, timing,
                                                  kernel)

        Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(surfSrc, surfTar, X_Kt, X_Ktc,
                                              Ktx_gpu, Kty_gpu, Ktz_gpu,
                                              self, LorY, w, param, timing,
                                              kernel)

        tic.record()
        Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL)
        Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL)
        Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    Kt_lyr = (Ktx_aux[surfTar.unsort] * surfTar.normal[:, 0] +
              Kty_aux[surfTar.unsort] * surfTar.normal[:, 1] +
              Ktz_aux[surfTar.unsort] * surfTar.normal[:, 2])

    if abs(Kt_diag) > 1e-12:  # if same surface
        Kt_lyr += Kt_diag * XKt

    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return Kt_lyr
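# The Event-based timing pattern used throughout these projection routines,
# reduced to a minimal standalone sketch (the timed body here is just a
# placeholder host-to-device copy):
tic, toc = cuda.Event(), cuda.Event()
tic.record()
buf_gpu = cuda.to_device(numpy.zeros(1 << 20))
toc.record()
toc.synchronize()
print('H2D copy took %.3f ms' % tic.time_till(toc))  # time_till returns milliseconds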
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self,
            param, ind0, timing, kernel):
    """
    It computes the single and double layer potentials.

    Arguments
    ----------
    XK     : array, input for the double layer potential.
    XV     : array, input for the single layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation
             points.
    K_diag : array, diagonal elements of the double layer integral operator.
    V_diag : array, diagonal elements of the single layer integral operator.
    IorE   : int, internal (1) or external (2).
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode
             computation.
    timing : class, it contains timing information for different parts of
             the code.
    kernel : pycuda source module.

    Returns
    --------
    K_lyr  : array, double layer potential.
    V_lyr  : array, single layer potential.
    """

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    L = numpy.sqrt(2 * surfSrc.area)  # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_V = numpy.zeros(Ns * K)
    X_Kx = numpy.zeros(Ns * K)
    X_Ky = numpy.zeros(Ns * K)
    X_Kz = numpy.zeros(Ns * K)
    X_Kc = numpy.zeros(Ns * K)
    X_Vc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_V[:] = XV[NsK // K] * w[NsK % K] * surfSrc.area[NsK // K]
    X_Kx[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 0]
    X_Ky[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 1]
    X_Kz[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 2]
    X_Kc[:] = XK[NsK // K]
    X_Vc[:] = XV[NsK // K]
    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj,
                 X_V, X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    K_aux = numpy.zeros(param.Nround)
    V_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self,
                                ind0.index_large, param, LorY, timing)
        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                                X_Kc, X_Vc, K_aux, V_aux, self, LorY, K_diag,
                                V_diag, IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self,
                                   ind0, param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                               X_Kc, X_Vc, K_gpu, V_gpu, self, LorY, K_diag,
                               IorE, L, w, param, timing, kernel)

        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return K_lyr, V_lyr