def execute(self):
    """Wait for the ready flag, copy the staged buffer in, then clear the flag.

    The one-byte device flag ``self.sender_ready`` is polled until it reads
    non-zero.  ``self.sender_buf`` is then copied device-to-device into
    ``self.tensor.tensor.gpudata`` and the flag is reset to zero.
    """
    # Busy-wait: every pass re-reads the flag byte from the device.
    while True:
        flag = drv.from_device(self.sender_ready, (1, ), np.int8)
        if not (flag == 0):
            break

    target = self.tensor.tensor
    drv.memcpy_dtod(target.gpudata, self.sender_buf,
                    target.size * self.op.dtype.itemsize)
    drv.memset_d8(self.sender_ready, 0, 1)
def setTotalDensity(data_dir, mf, global_vars):
    """Load a saved state and fill the global density/phase fields on the GPU.

    ``data_dir`` is passed straight to ``np.load``.  The first three
    dimensions of the loaded array plus ``mf`` are written into slots 0-3 of
    the module-level ``Lattice``, the ``getTotalDensity`` kernel is launched,
    and its two outputs are stored in the globals ``RhoField`` and
    ``PhaseField``.  The global ``xSize`` is updated as well.

    ``global_vars`` must provide ``blockX/Y/Z``, ``gridX/Y/Z`` and
    ``num_GPUs``; the grid's x extent is ``gridX * num_GPUs``.
    """
    global RhoField, PhaseField, xSize

    block_shape = (global_vars["blockX"], global_vars["blockY"],
                   global_vars["blockZ"])
    gridX = global_vars["gridX"] * global_vars["num_GPUs"]
    gridY = global_vars["gridY"]
    gridZ = global_vars["gridZ"]  # looked up like the others; not used by the launch

    QuantumState = np.load(data_dir)
    state_shape = QuantumState.shape
    xSize = state_shape[0]
    ySize = state_shape[1]
    zSize = state_shape[2]
    for slot, value in enumerate((xSize, ySize, zSize, mf)):
        Lattice[slot] = value

    plane = (xSize, ySize)
    RhoField = np.zeros(plane, dtype=DTYPE)
    PhaseField = np.zeros(plane, dtype=np.float64)

    gpuQuantumState = drv.to_device(QuantumState)
    gpuPhaseField = drv.to_device(PhaseField)
    gpuRhoField = drv.to_device(RhoField)
    gpuLattice = drv.to_device(Lattice)

    getTotalDensity(gpuQuantumState, gpuRhoField, gpuPhaseField, gpuLattice,
                    block=block_shape, grid=(gridX, gridY))

    RhoField = drv.from_device(gpuRhoField, RhoField.shape, DTYPE)
    PhaseField = drv.from_device(gpuPhaseField, PhaseField.shape, np.float64)

    # Release the device buffers explicitly, in the original order.
    for handle in (gpuQuantumState, gpuPhaseField, gpuRhoField, gpuLattice):
        handle.free()
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """Launch the ``get_phir`` CUDA kernel and return its result on the host.

    ``XK`` and ``XV`` hold one value per surface panel.  Each panel is
    expanded into ``par_reac.K`` consecutive entries, weighted by
    ``getWeights(K)`` and the panel area (and, for the ``XK`` terms, by the
    components of the panel normal), reordered with ``surface.sortSource``
    and uploaded as ``par_reac.REAL`` arrays.

    Returns
    -------
    phir_cpu : array of ``par_reac.REAL`` with ``len(field.xq)`` entries,
               copied back from the kernel's output buffer.
    AI_int   : sum of the per-point int32 counters written by the kernel.
    """
    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    K = par_reac.K

    XK = numpy.asarray(XK)
    XV = numpy.asarray(XV)
    w = numpy.asarray(getWeights(K))

    # Entry i belongs to panel i // K and uses weight i % K.  Floor division
    # keeps the indices integral regardless of the division semantics in use.
    flat = numpy.arange(N * K)
    panel = flat // K
    gauss = flat % K

    # Same left-to-right products as a per-element loop, done array-wise.
    weighted_K = XK[panel] * w[gauss] * surface.Area[panel]

    X_V = numpy.zeros(N * K)
    X_Kx = numpy.zeros(N * K)
    X_Ky = numpy.zeros(N * K)
    X_Kz = numpy.zeros(N * K)
    X_Kc = numpy.zeros(N * K)
    X_Vc = numpy.zeros(N * K)
    X_V[:] = XV[panel] * w[gauss] * surface.Area[panel]
    X_Kx[:] = weighted_K * surface.normal[panel, 0]
    X_Ky[:] = weighted_K * surface.normal[panel, 1]
    X_Kz[:] = weighted_K * surface.normal[panel, 2]
    X_Kc[:] = XK[panel]
    X_Vc[:] = XV[panel]

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    # One thread per evaluation point, rounded up to whole blocks.
    GSZ = int(numpy.ceil(float(Nq) / par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu,
             m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu,
             surface.xjDev, surface.yjDev, surface.zjDev,
             surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq),
             numpy.int32(par_reac.K), xkDev, wkDev,
             REAL(par_reac.threshold), AI_int_gpu,
             numpy.int32(len(surface.xk)),
             surface.XskDev, surface.WskDev,
             block=(par_reac.BSZ, 1, 1), grid=(GSZ, 1))

    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
def make_tracks(fig, file_name):
    """Scatter-plot the vortex centres found in a saved quantum state.

    The state stored in ``file_name`` is uploaded to the GPU together with
    zero-initialised work buffers, the ``getPlotDetailsVorticity`` kernel is
    launched, and the vorticity field it produces is passed to
    ``find_dark_vortex_from_vorticity``.  Columns 0 and 1 of that result are
    plotted as x/y; column 2 selects the colour through ``color_code``.

    Grid sizes, launch geometry and ``DTYPE`` are read from module globals.
    ``fig`` gets ``tight_layout`` applied and is returned.
    """
    quantum_state = np.load(file_name)

    plane = (xSize, ySize)
    per_spin = (xSize, ySize, spinComps)
    VortField = np.zeros(per_spin, dtype=DTYPE)
    VxField = np.zeros(plane, dtype=DTYPE)
    VyField = np.zeros(plane, dtype=DTYPE)
    PhaseField = np.zeros(plane, dtype=np.float64)
    VFieldAverage = np.zeros(plane, dtype=DTYPE)
    RhoField = np.zeros(plane, dtype=DTYPE)
    boson_field = np.zeros(per_spin, dtype=DTYPE)

    Lattice = np.zeros(4, dtype=np.int_)
    Lattice[:3] = (xSize, ySize, zSize)

    gpuQField = drv.to_device(quantum_state)
    # Round trip kept from the original; the copy is not used afterwards.
    quantum_state = drv.from_device(gpuQField, quantum_state.shape, DTYPE)
    gpuVField = drv.to_device(VortField)
    gpuVxField = drv.to_device(VxField)
    gpuVyField = drv.to_device(VyField)
    gpuPhaseField = drv.to_device(PhaseField)
    gpuVFieldAverage = drv.to_device(VFieldAverage)
    gpuRhoField = drv.to_device(RhoField)
    gpuBosonField = drv.to_device(boson_field)
    gpuLattice = drv.to_device(Lattice)

    getPlotDetailsVorticity(gpuQField, gpuVField, gpuRhoField, gpuBosonField,
                            gpuPhaseField, gpuVxField, gpuVyField, gpuLattice,
                            block=(blockX, blockY, blockZ),
                            grid=(gridX, gridY))

    VortField = drv.from_device(gpuVField, VortField.shape, DTYPE)
    boson_field = drv.from_device(gpuBosonField, boson_field.shape, DTYPE)

    vortex_centers = find_dark_vortex_from_vorticity(VortField)
    xs = vortex_centers[:, 0]
    ys = vortex_centers[:, 1]
    colors = [color_code[kind] for kind in vortex_centers[:, 2]]

    plt.subplot(111)
    plt.scatter(xs, ys, c=colors, alpha=.25, s=1)
    ax = plt.gca()
    ax.set_aspect('equal')
    ax.set_ylim(0, ySize)
    ax.set_xlim(0, xSize)

    fig.tight_layout(pad=0.4, w_pad=5.0, h_pad=1.0, rect=[.05, .05, .95, .95])

    # Free GPU memory, in allocation order.
    for handle in (gpuQField, gpuVField, gpuVxField, gpuVyField,
                   gpuPhaseField, gpuVFieldAverage, gpuRhoField,
                   gpuBosonField, gpuLattice):
        handle.free()

    return fig
def execute(self):
    """Wait on, then clear, one ready flag per entry of ``self.op.from_id``.

    Flags are handled strictly in index order: flag ``i`` is polled until it
    reads non-zero and is reset to zero before flag ``i + 1`` is looked at.
    """
    for idx in range(len(self.op.from_id)):
        # Busy-wait: every pass re-reads the flag byte from the device.
        while True:
            flag = drv.from_device(self.sender_ready[idx], (1, ), np.int8)
            if not (flag == 0):
                break
        drv.memset_d8(self.sender_ready[idx], 0, 1)
def exchange(nx, ny, a_gpu, b_gpu):
    """Exchange one ``ny``-element float32 row between two device buffers.

    Row ``nx - 2`` of ``a_gpu`` is copied to the start of ``b_gpu``; the row
    starting ``ny`` elements into ``b_gpu`` is copied to row ``nx - 1`` of
    ``a_gpu``.  Both copies are staged through a host array.
    (Presumably both buffers are float32 arrays with rows of ``ny``
    elements -- confirm against the caller.)

    Both copies are synchronous.  The staging array is an ordinary pageable
    temporary, which is not valid input for ``memcpy_htod_async`` and could
    be released while an asynchronous copy is still in flight.
    """
    nof = np.dtype(np.float32).itemsize  # bytes per float32
    row_bytes = ny * nof
    a_base = int(a_gpu)
    b_base = int(b_gpu)

    # a[nx - 2] -> b[0]
    staged = cuda.from_device(a_base + (nx - 2) * row_bytes, (ny, ), np.float32)
    cuda.memcpy_htod(b_base, staged)

    # b[1] -> a[nx - 1]
    staged = cuda.from_device(b_base + row_bytes, (ny, ), np.float32)
    cuda.memcpy_htod(a_base + (nx - 1) * row_bytes, staged)
def send(target, nx, ny, nz, fx_gpu, fy_gpu):
    """Send one ``(nx, ny)`` float32 slab of each field to rank ``target``.

    When ``target`` is lower than the module-level ``myrank`` the slab at
    the start of each buffer is sent; otherwise the slab starting
    ``nx * ny * (nz - 1)`` elements in.  ``fx_gpu`` goes out with tag 0 and
    ``fy_gpu`` with tag 1.
    """
    if target < myrank:
        shift = 0
    else:
        shift = nx * ny * (nz - 1) * nof
    offsets = (int(fx_gpu) + shift, int(fy_gpu) + shift)

    for tag, offset in enumerate(offsets):
        slab = cuda.from_device(offset, (nx, ny), np.float32)
        mpi.world.send(target, tag, slab)
def get_heartbeat(d_lead, length, sampling_rate):
    """Run the RR-extraction kernel pipeline on a device signal.

    ``d_lead`` is the device buffer handed to ``edge_detect``; ``length`` is
    its element count.  The pipeline is: ``edge_detect`` ->
    ``index_of_peak`` -> ``compact_sparse_with_mask`` -> ``get_compact_rr``
    -> ``clean_result`` -> ``moving_average_filter``.

    Returns
    -------
    rr    : int32 host array copied from the filtered device result.
    index : the compacted indices divided by ``sampling_rate * 3600``
            (presumably sample positions converted to hours -- confirm).

    NOTE(review): the first two launches use ``length // 200`` blocks, so
    samples past the last full block of 200 are not covered by the grid.
    """
    # Kernel parameters
    threads_per_block = 200
    reduce_by = 32

    # Floor division keeps grid dimensions and array sizes integral under
    # both Python 2 and Python 3 division rules.
    num_blocks = length // threads_per_block
    reduced_length = length // reduce_by

    # Get RR
    edge_signal = cuda.mem_alloc(4 * length)
    edge_detect(edge_signal, d_lead,
                grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    indices = numpy.zeros(reduced_length).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(reduced_length).astype(numpy.int32))
    d_index = cuda.to_device(indices)
    index_of_peak(d_index, masks, edge_signal,
                  grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    cd_index, c_length = compact_sparse_with_mask(d_index, masks,
                                                  reduced_length)

    # Allocate output
    dev_rr = cuda.mem_alloc(c_length * 4)
    num_blocks = (c_length // threads_per_block) + 1

    get_compact_rr(dev_rr, cd_index,
                   numpy.int32(sampling_rate), numpy.int32(c_length),
                   grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    clean_result(dev_rr,
                 numpy.int32(120), numpy.int32(40), numpy.int32(1),
                 numpy.int32(c_length),
                 grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length, ), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length, ), numpy.int32)

    # The first index is overwritten with the second; skip that when there
    # is no second element instead of raising IndexError.
    if len(index) > 1:
        index[0] = index[1]

    return rr, index / float(sampling_rate * 3600)
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
    """Exchange one ``ny``-element float32 row between buffers on two devices.

    Row ``nx - 2`` of ``a_gpu`` (device ``dev1``) is copied to the start of
    ``b_gpu`` (device ``dev2``); the row starting ``ny`` elements into
    ``b_gpu`` is copied to row ``nx - 1`` of ``a_gpu``.  Data is staged
    through host arrays, and a context is created and pushed for each phase.
    The element size comes from the module-level ``nof``.

    The final host-to-device copy is synchronous.  The staging array is
    pageable memory and the context is popped right after the copy, neither
    of which is safe with ``memcpy_htod_async``.  Each context is popped in
    a ``finally`` so a failed copy cannot leave it on the context stack.

    NOTE(review): the contexts created here are only popped, never
    detached -- confirm that this is intended.
    """
    row_bytes = ny * nof

    ctx1 = cuda.Device(dev1).make_context()
    try:
        a = cuda.from_device(int(a_gpu) + (nx - 2) * row_bytes, (ny,),
                             np.float32)
    finally:
        ctx1.pop()

    ctx2 = cuda.Device(dev2).make_context()
    try:
        cuda.memcpy_htod(int(b_gpu), a)
        b = cuda.from_device(int(b_gpu) + row_bytes, (ny,), np.float32)
    finally:
        ctx2.pop()

    ctx1 = cuda.Device(dev1).make_context()
    try:
        cuda.memcpy_htod(int(a_gpu) + (nx - 1) * row_bytes, b)
    finally:
        ctx1.pop()
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
    """Exchange one ``ny``-element float32 row between buffers on two devices.

    Row ``nx - 2`` of ``a_gpu`` (device ``dev1``) is copied to the start of
    ``b_gpu`` (device ``dev2``); the row starting ``ny`` elements into
    ``b_gpu`` is copied to row ``nx - 1`` of ``a_gpu``.  Data is staged
    through host arrays, and a context is created and pushed for each phase.
    The element size comes from the module-level ``nof``.

    The final host-to-device copy is synchronous.  The staging array is
    pageable memory and the context is popped right after the copy, neither
    of which is safe with ``memcpy_htod_async``.  Each context is popped in
    a ``finally`` so a failed copy cannot leave it on the context stack.

    NOTE(review): the contexts created here are only popped, never
    detached -- confirm that this is intended.
    """
    row_bytes = ny * nof

    ctx1 = cuda.Device(dev1).make_context()
    try:
        a = cuda.from_device(int(a_gpu) + (nx - 2) * row_bytes, (ny, ),
                             np.float32)
    finally:
        ctx1.pop()

    ctx2 = cuda.Device(dev2).make_context()
    try:
        cuda.memcpy_htod(int(b_gpu), a)
        b = cuda.from_device(int(b_gpu) + row_bytes, (ny, ), np.float32)
    finally:
        ctx2.pop()

    ctx1 = cuda.Device(dev1).make_context()
    try:
        cuda.memcpy_htod(int(a_gpu) + (nx - 1) * row_bytes, b)
    finally:
        ctx1.pop()
def send(s, rank, tag_mark, direction):
    """Send one ``s.ny``-element row of ``s.arr_gpu`` to ``rank`` over MPI.

    ``direction`` selects the row: ``'f'`` sends the row starting ``s.ny``
    elements into the buffer, ``'b'`` the row starting ``(s.nx - 2) * s.ny``
    elements in.  The element size is the module-level ``nof``.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``'f'`` nor ``'b'``.
    """
    if direction == 'f':
        row = 1
    elif direction == 'b':
        row = s.nx - 2
    else:
        # Without this branch the offset below would simply be unbound.
        raise ValueError("direction must be 'f' or 'b', got %r" % (direction,))

    offset_gpu = int(s.arr_gpu) + row * s.ny * nof
    print(type(offset_gpu))  # debugging output kept from the original
    mpi.world.send(rank, tag_mark,
                   cuda.from_device(offset_gpu, (s.ny, ), s.dtype))
def evaluate(self, params, returnOutputs=False):
    """Evaluate several networks (with given params) on the training set.

    The parameter structures are uploaded, the output buffer is zeroed, the
    evaluation kernel is run to completion, and its result is copied back
    into ``self.outputsMat`` with shape ``(popSize, trainSet.size)``.

    @param params: network params, exactly ``self.popSize`` of them
    @type params: list of Parameters
    @param returnOutputs: return network output values (debug)
    @type returnOutputs: bool, default False
    @return output matrix if returnOutputs=True, else None
    @raise ValueError: if ``len(params)`` differs from ``self.popSize``
    """
    n_params = len(params)
    if self.popSize != n_params:
        raise ValueError("Need %d Parameter structures (provided %d)" % (
            self.popSize, n_params))

    # Pack the structures into one contiguous ctypes array for the upload.
    packed = (Parameters * n_params)(*params)
    driver.memcpy_htod(self.params, packed)

    n_samples = self.trainSet.size

    # TODO: remove
    driver.memset_d8(self.outputs, 0, self.popSize * n_samples * 4)

    self.evaluateKernel.prepared_call(self.evaluateGridDim,
                                      self.trainSetDev,
                                      n_samples,
                                      self.params,
                                      self.popSize,
                                      self.outputs)
    driver.Context.synchronize()

    self.outputsMat = driver.from_device(self.outputs,
                                         shape=(self.popSize, n_samples),
                                         dtype=np.float32)

    return self.outputsMat if returnOutputs else None
def pullVort(self):
    """Copy ``self.QField`` from this object's CUDA context to the host.

    The buffer is read as an ``np.int_`` array of shape
    ``(xSize, ySize, zSize, 10)``.  ``self.context`` is pushed for the copy
    and is always popped again, even when the copy raises.

    NOTE(review): despite the name, the buffer read is ``self.QField`` --
    confirm this is the intended source.
    """
    self.context.push()
    try:
        return drv.from_device(self.QField,
                               (self.xSize, self.ySize, self.zSize, 10),
                               dtype=np.int_)
    finally:
        self.context.pop()
def test_stub(shift, trials=10, rounds=1):
    """Time ``sorter.multisort`` and optionally check its output.

    One warm-up call is made so initialisation is not timed, then
    ``trials`` calls are bracketed by CUDA events.  One line is printed with
    the duration per round and two throughput figures derived from it.
    When ``shift == 0`` and the module-level ``correctness`` flag is set,
    the last result buffer is compared against ``np.sort(keys)``.
    Mismatching positions are printed before the assertion fails.

    Buffers, ``count``, ``keys`` and ``stream`` come from module globals.
    """
    # Run once so that evt_a doesn't include initialization time
    sorter.multisort(dout_a, dout_b, dkeys, count, shift, rounds,
                     stream=stream)

    evt_a = cuda.Event().record(stream)
    for _ in range(trials):
        buf = sorter.multisort(dout_a, dout_b, dkeys, count, shift, rounds,
                               stream=stream)
    evt_b = cuda.Event().record(stream)
    evt_b.synchronize()

    dur = evt_b.time_since(evt_a) / (rounds * trials)
    report = (dur, count / (dur * 1000),
              count * sorter.radix_bits / (dur * 32 * 1000))
    print('%6.1f,\t%4.0f,\t%4.0f' % report)

    if not (shift == 0 and correctness):
        return

    print('\nTesting correctness')
    out = cuda.from_device(buf, (count,), np.uint32)
    sort = np.sort(keys)
    if np.all(out == sort):
        print('Correct')
        return

    nz = np.nonzero(out != sort)[0]
    print(sorted(set(nz >> 13)))
    for pos in nz:
        # Show the neighbourhood of every mismatch in both arrays.
        print('%s %s %s' % (pos, out[pos-1:pos+2], sort[pos-1:pos+2]))
    assert False, 'Oh no'
def get_from_device(self, index_list=None):
    '''
    Copy array data from the GPU device and wrap it in numpy arrays.

    If index_list is None, return list of numpy arrays (one/array).
    If index_list is a single integer, return single numpy array.
    If index_list is an iterable, list of numpy arrays (one/selected array).

    A value counts as "a single integer" when ``int(index_list)`` does not
    raise TypeError.

    Raises ValueError (after printing the traceback and the current stack)
    when a copy fails with ``cuda.LaunchError``.
    '''
    single = False
    if index_list is None:
        index_list = range(len(self.data))
    else:
        try:
            int(index_list)
        except TypeError:
            # Not int-convertible: treat it as an iterable of indices.
            pass
        else:
            index_list = [index_list]
            single = True

    results = []
    try:
        for i in index_list:
            results.append(cuda.from_device(self.data[i],
                                            self.shapes[i],
                                            self.dtypes[i]))
    except cuda.LaunchError:
        import traceback
        traceback.print_exc()
        traceback.print_stack()
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError('Invalid device pointer: %d' % i)

    if single:
        return results[0]
    return results
def send(s, rank, tag_mark, direction):
    """Send one ``s.ny``-element row of ``s.arr_gpu`` to ``rank`` via ``comm``.

    ``direction`` selects the row: ``"f"`` sends the row starting ``s.ny``
    elements into the buffer, ``"b"`` the row starting ``(s.nx - 2) * s.ny``
    elements in.  The element size is the module-level ``nof``.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``"f"`` nor ``"b"``.
    """
    if direction == "f":
        row = 1
    elif direction == "b":
        row = s.nx - 2
    else:
        # Without this branch the offset below would simply be unbound.
        raise ValueError("direction must be 'f' or 'b', got %r" % (direction,))

    offset_gpu = int(s.arr_gpu) + row * s.ny * nof
    print(type(offset_gpu))  # debugging output kept from the original
    comm.send(rank, tag_mark, cuda.from_device(offset_gpu, (s.ny,), s.dtype))
def train_gpu(self, num_iter, model_file_path):
    """Run ``num_iter`` launches of the ``train_sg`` kernel and save the result.

    On the first batch (``self.batch == 0``) ``self.syn1`` is uploaded and
    one ``WordRep`` per dictionary entry is built at consecutive
    ``WordRep.memsize`` offsets of a freshly allocated device buffer.
    Every iteration then picks one random in-dictionary word per sentence,
    uploads the picks and launches the kernel with one block per sentence
    and ``self.size`` threads per block.  Finally the sentence
    representations are copied back into ``self.sent_reps`` and written to
    ``model_file_path`` with ``pickle_dump``.
    """
    if self.batch == 0:
        # Prepare to send the numpy array to gpu
        self.syn1_gpu = cuda.to_device(self.syn1)

        # Create word idx and related data-structure.
        self.base_word_rep = cuda.mem_alloc(
            len(self.dictionary) * WordRep.memsize)
        next_slot = int(self.base_word_rep)
        self.word_reps = {}
        for w_idx, word in sorted(self.dictionary.items()):
            rep = self.words_rep[word]
            word_code = 1 - 2 * rep[0].astype(dtype=np.int32)
            word_point = rep[1].astype(dtype=np.int32)
            self.word_reps[w_idx] = WordRep(word_code, word_point, next_slot)
            next_slot += WordRep.memsize
        print("GPU transfers done.")

    self.sent_reps_gpu = cuda.to_device(self.sent_reps)

    # Prepare sentences for GPU transfer: token ids of in-dictionary words.
    token2id = self.dictionary.token2id
    idx_sentences = []
    for sentence in self.sentences:
        idx_sentences.append(
            [token2id[word] for word in sentence if word in self.dictionary])

    # Prepare the kernel function
    kernel = self.kernel_str.get_function("train_sg")
    words = np.empty(self.num_sents, dtype=np.int32)
    launch_shape = dict(block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))

    for _ in range(num_iter):
        # Sample words for each sentence and transfer to GPU
        for s_idx in range(self.num_sents):
            words[s_idx] = random.choice(idx_sentences[s_idx])
        words_gpu = cuda.to_device(words)
        kernel(self.sent_reps_gpu,
               np.float32(self.alpha),
               words_gpu,
               self.base_word_rep,
               self.syn1_gpu,
               **launch_shape)

    self.sent_reps = cuda.from_device(self.sent_reps_gpu,
                                      self.sent_reps.shape,
                                      self.sent_reps.dtype)
    pickle_dump(self.sent_reps, model_file_path)
def print_arr_gpus(s):
    """Gather every rank's array on rank 0 and print it in three column groups.

    Every rank calls ``s.send_result()``.  Rank 0 then copies its own array
    from the device, appends what it receives (tag 10) from ranks
    ``1 .. ngpu - 1`` and, for each of the ``s.ny`` columns, prints rows
    ``[0, nx)``, ``[nx, 2*nx)`` and ``[2*nx, end)`` separated by tabs.
    """
    s.send_result()
    if mpi.rank != 0:
        return

    result = cuda.from_device(s.arr_gpu, s.shape, s.dtype)
    for src in range(1, ngpu):
        result = np.concatenate((result, mpi.world.recv(src, 10)))

    nx = s.nx
    for col in range(s.ny):
        parts = (result[:nx, col], '\t',
                 result[nx:2 * nx, col], '\t',
                 result[2 * nx:, col])
        # Spacing matches a comma-separated print statement: one space
        # before each tab, none after it.
        print('%s %s%s %s%s' % parts)
def P2P_gpu(surfSrc, surfTar, m, mx, my, mz, mKc, mVc, K_gpu, V_gpu, surf, LorY, K_diag, IorE, L, w, param, timing, kernel):
    """Launch the ``P2P`` CUDA kernel, accumulating into ``K_gpu``/``V_gpu``.

    The six source arrays (``m`` .. ``mVc``) are cast to ``param.REAL`` and
    uploaded, the kernel is launched with
    ``ceil(param.Nround / param.NCRIT)`` blocks of ``param.BSZ`` threads,
    and the per-target int32 counters it fills are copied back and summed
    (in ``surfTar.unsort`` order) into ``timing.AI_int``.

    ``IorE == 1`` selects ``surfSrc.sglInt_intDev`` as the singular-integral
    buffer; any other value selects ``surfSrc.sglInt_extDev``.

    Event-measured durations are added, in seconds, to
    ``timing.time_trans`` (upload and download) and ``timing.time_P2P``
    (kernel).  ``L`` and ``w`` are accepted but not used here.

    Returns ``(K_gpu, V_gpu)``, the same device handles that were passed in.
    """
    tic = cuda.Event()
    toc = cuda.Event()

    def seconds_since_tic():
        # Close the current timing section and return its length in seconds.
        toc.record()
        toc.synchronize()
        return tic.time_till(toc)*1e-3

    # --- host -> device transfer of the source terms
    tic.record()
    REAL = param.REAL
    mDev = cuda.to_device(m.astype(REAL))
    mxDev = cuda.to_device(mx.astype(REAL))
    myDev = cuda.to_device(my.astype(REAL))
    mzDev = cuda.to_device(mz.astype(REAL))
    mKcDev = cuda.to_device(mKc.astype(REAL))
    mVcDev = cuda.to_device(mVc.astype(REAL))
    timing.time_trans += seconds_since_tic()

    # --- kernel launch
    tic.record()
    GSZ = int(ceil(float(param.Nround)/param.NCRIT))  # CUDA grid size
    direct_gpu = kernel.get_function("P2P")
    AI_int = cuda.to_device(zeros(param.Nround, dtype=int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])  # first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf])       # first element in lists arrays

    # Internal or external singular integral
    sglInt = surfSrc.sglInt_intDev if IorE == 1 else surfSrc.sglInt_extDev

    direct_gpu(K_gpu, V_gpu,
               surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev,
               surfTar.sizeTarDev, surfSrc.kDev,
               surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev,
               mDev, mxDev, myDev, mzDev, mKcDev, mVcDev,
               surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
               surfSrc.AreaDev, sglInt, surfSrc.vertexDev,
               int32(ptr_offset), int32(ptr_list), int32(LorY),
               REAL(param.kappa), REAL(param.threshold),
               int32(param.BlocksPerTwig), int32(param.NCRIT),
               REAL(K_diag), AI_int,
               surfSrc.XskDev, surfSrc.WskDev,
               block=(param.BSZ, 1, 1), grid=(GSZ, 1))
    timing.time_P2P += seconds_since_tic()

    # --- device -> host transfer of the counters
    tic.record()
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    timing.time_trans += seconds_since_tic()

    return K_gpu, V_gpu
def print_arr_gpus(ngpu, nx, ny, a_gpu):
    """Gather every rank's array on rank 0 and print it in three column groups.

    Every rank calls ``send_result(nx, ny, a_gpu)``.  Rank 0 then copies its
    own ``(nx, ny)`` float32 array from the device, prints ``ngpu``, appends
    what it receives (tag 10) from ranks ``1 .. ngpu - 1`` and, for each of
    the ``ny`` columns, prints rows ``[0, nx)``, ``[nx, 2*nx)`` and
    ``[2*nx, end)`` separated by tabs.
    """
    send_result(nx, ny, a_gpu)
    if mpi.rank != 0:
        return

    result = cuda.from_device(a_gpu, (nx,ny), 'float32')
    print(ngpu)
    for src in range(1,ngpu):
        result = np.concatenate((result, mpi.world.recv(src,10)))

    for col in range(ny):
        parts = (result[:nx,col], '\t',
                 result[nx:2*nx,col], '\t',
                 result[2*nx:,col])
        # Spacing matches a comma-separated print statement: one space
        # before each tab, none after it.
        print('%s %s%s %s%s' % parts)
def print_arr_gpus(ngpu, nx, ny, a_gpu):
    """Gather every rank's array on rank 0 and print it in three column groups.

    Every rank calls ``send_result(nx, ny, a_gpu)``.  Rank 0 then copies its
    own ``(nx, ny)`` float32 array from the device, prints ``ngpu``, appends
    what it receives (tag 10) from ranks ``1 .. ngpu - 1`` and, for each of
    the ``ny`` columns, prints rows ``[0, nx)``, ``[nx, 2*nx)`` and
    ``[2*nx, end)`` separated by tabs.
    """
    send_result(nx, ny, a_gpu)
    if mpi.rank != 0:
        return

    result = cuda.from_device(a_gpu, (nx, ny), 'float32')
    print(ngpu)
    for src in range(1, ngpu):
        result = np.concatenate((result, mpi.world.recv(src, 10)))

    for col in range(ny):
        parts = (result[:nx, col], '\t',
                 result[nx:2 * nx, col], '\t',
                 result[2 * nx:, col])
        # Spacing matches a comma-separated print statement: one space
        # before each tab, none after it.
        print('%s %s%s %s%s' % parts)
def saveVorticityGPU():
    """Collect the vorticity arrays of all GPUs and save them as one ``.npy``.

    GPU 0's ``gpuVortField`` is copied directly; every further GPU
    contributes the result of its ``pullVort()``.  The pieces are
    concatenated along the first axis and written to
    ``<base_directory_name>/Extra/vorticity.npy``.  The directory is created
    if it does not exist.
    """
    num_GPUs = global_vars["num_GPUs"]
    directory_name = global_vars["base_directory_name"]

    pieces = [drv.from_device(gpu[0].gpuVortField,
                              gpu[0].vortField.shape,
                              dtype=np.int_)]
    for i in range(1, num_GPUs):
        pieces.append(gpu[i].pullVort())
    vorticity = np.concatenate(pieces)

    direct = directory_name + "/Extra/"
    if not os.path.exists(direct):
        os.makedirs(direct)
    np.save(direct + "vorticity.npy", vorticity)
def get_heartbeat(d_lead, length, sampling_rate):
    """Run the RR-extraction kernel pipeline on a device signal.

    ``d_lead`` is the device buffer handed to ``edge_detect``; ``length`` is
    its element count.  The pipeline is: ``edge_detect`` ->
    ``index_of_peak`` -> ``compact_sparse_with_mask`` -> ``get_compact_rr``
    -> ``clean_result`` -> ``moving_average_filter``.

    Returns
    -------
    rr    : int32 host array copied from the filtered device result.
    index : the compacted indices divided by ``sampling_rate * 3600``
            (presumably sample positions converted to hours -- confirm).

    NOTE(review): the first two launches use ``length // 200`` blocks, so
    samples past the last full block of 200 are not covered by the grid.
    """
    # Kernel parameters
    threads_per_block = 200
    reduce_by = 32

    # Floor division keeps grid dimensions and array sizes integral under
    # both Python 2 and Python 3 division rules.
    num_blocks = length // threads_per_block
    reduced_length = length // reduce_by

    # Get RR
    edge_signal = cuda.mem_alloc(4 * length)
    edge_detect(edge_signal, d_lead,
                grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    indices = numpy.zeros(reduced_length).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(reduced_length).astype(numpy.int32))
    d_index = cuda.to_device(indices)
    index_of_peak(d_index, masks, edge_signal,
                  grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    cd_index, c_length = compact_sparse_with_mask(d_index, masks,
                                                  reduced_length)

    # Allocate output
    dev_rr = cuda.mem_alloc(c_length * 4)
    num_blocks = (c_length // threads_per_block) + 1

    get_compact_rr(dev_rr, cd_index,
                   numpy.int32(sampling_rate), numpy.int32(c_length),
                   grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    clean_result(dev_rr,
                 numpy.int32(120), numpy.int32(40), numpy.int32(1),
                 numpy.int32(c_length),
                 grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length,), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length,), numpy.int32)

    # The first index is overwritten with the second; skip that when there
    # is no second element instead of raising IndexError.
    if len(index) > 1:
        index[0] = index[1]

    return rr, index / float(sampling_rate * 3600)
def lift(self, n):
    """Returns (positive rate within n largest) / (overall positive rate)
    for each individual.

    Runs the count kernel, sums its per-thread counts for every individual
    and divides by ``n`` to get the positive rate among the n largest
    outputs.  That rate is divided by the overall positive rate of the
    training set, ``len(positives) / size``.

    Side effects: ``self.countSums`` and ``self.nlargestPositiveRate`` are
    updated.

    @param n: number of largest outputs considered per individual
    @return array of lifts, in order of individuals
    """
    n_positives = len(self.trainSet.positives)

    self.countKernel.prepared_call(self.countGridDim,
                                   self.outputs,
                                   self.trainSet.size,
                                   n_positives,
                                   self.popSize,
                                   self.thresholds,
                                   self.counts)
    driver.Context.synchronize()

    # One row per individual, one column per thread of the count block.
    countsMat = driver.from_device(self.counts,
                                   shape=(self.popSize, self.countBlockDim[0]),
                                   dtype=np.uint32)

    self.countSums = countsMat.sum(axis=1)
    log.debug("count sum over threads: %s", str(self.countSums))

    self.nlargestPositiveRate = np.float32(self.countSums) / n
    log.debug("positive rate (n largest outputs): %s",
              str(self.nlargestPositiveRate))

    overallPositiveRate = float(n_positives) / float(self.trainSet.size)
    log.debug("positive rate (overall): %.04f", overallPositiveRate)

    return self.nlargestPositiveRate / overallPositiveRate
def get(self):
    """
    Copy the simplex buffer back from the device.

    The result is also stored in ``self.result``.

    Returns
    -------
    numpy.array
        nx5 of d=1 simplices containing:
        [index1, index2, dist, sigma1, sigma2]
        with sigma 1 < sigma 2
    """
    simplices = drv.from_device(self.k_simplices_ptr,
                                self.k_simplices.shape,
                                np.float32)
    self.result = simplices
    return simplices
def task1(grid, grid_width, grid_height):
    """Launch the ``task1`` kernel with one block per grid cell.

    ``grid`` is flattened with ``grid2list``.  Two zero-filled float32
    device buffers, one entry per cell, receive the kernel's output.  The
    kernel comes from the module-level ``mod`` and runs with 32 threads per
    block.

    Returns ``(xsum_ptr, ysum_ptr, res_xsum, res_ysum)``: the two device
    buffers (not freed here) and their contents copied to the host.
    """
    list_ptr = grid2list(grid, grid_width, grid_height)
    grid_size = grid_width * grid_height
    shp = (grid_size, )
    typ = np.float32

    xsum_ptr = cuda.to_device(np.zeros(shp, dtype=typ))
    ysum_ptr = cuda.to_device(np.zeros(shp, dtype=typ))
    for label, ptr in (("xsum initialized to ", xsum_ptr),
                       ("ysum initialized to ", ysum_ptr)):
        print("%s %s" % (label, cuda.from_device(ptr, shp, typ)))

    func = mod.get_function("task1")
    func(list_ptr, xsum_ptr, ysum_ptr,
         grid=(grid_size, 1, 1), block=(32, 1, 1))
    pycuda.autoinit.context.synchronize()

    res_xsum = cuda.from_device(xsum_ptr, shp, typ)
    res_ysum = cuda.from_device(ysum_ptr, shp, typ)
    return xsum_ptr, ysum_ptr, res_xsum, res_ysum
def nlargest_cpu(ann, n):
    """CPU implementation of nlargest.

    Copies the ``(popSize, trainSize)`` float32 output matrix from the
    device and returns, for every row, the element at position ``n`` of
    that row sorted in descending order.
    """
    outputs = driver.from_device(ann.outputs,
                                 shape=(ann.popSize, ann.trainSize),
                                 dtype=np.float32)
    return [sorted(row, reverse=True)[n] for row in outputs]
def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu, surf, LorY, w, param, timing, kernel):
    """Launch the ``P2PKt`` CUDA kernel, accumulating into the three Kt buffers.

    ``m`` and ``mKtc`` are cast to ``param.REAL`` and uploaded, the kernel
    is launched with ``ceil(param.Nround / param.NCRIT)`` blocks of
    ``param.BSZ`` threads, and the per-target int32 counters it fills are
    copied back and summed (in ``surfTar.unsort`` order) into
    ``timing.AI_int``.

    Durations are added, in seconds, to ``timing.time_trans`` (upload and
    download) and ``timing.time_P2P`` (kernel).  CUDA events are used when
    ``param.GPU == 1``, otherwise the module-level ``Event`` class.
    ``w`` is accepted but not used here.

    Returns ``(Ktx_gpu, Kty_gpu, Ktz_gpu)``, the handles that were passed in.
    """
    event_cls = cuda.Event if param.GPU == 1 else Event
    tic = event_cls()
    toc = event_cls()

    def seconds_since_tic():
        # Close the current timing section and return its length in seconds.
        toc.record()
        toc.synchronize()
        return tic.time_till(toc)*1e-3

    # --- host -> device transfer of the source terms
    tic.record()
    REAL = param.REAL
    mDev = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    timing.time_trans += seconds_since_tic()

    # --- kernel launch
    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT))  # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])  # first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf])       # first element in lists arrays

    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu,
                 surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev,
                 surfTar.sizeTarDev, surfSrc.kDev,
                 surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev,
                 mDev, mKtcDev,
                 surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
                 surfSrc.AreaDev, surfSrc.vertexDev,
                 numpy.int32(ptr_offset), numpy.int32(ptr_list),
                 numpy.int32(LorY),
                 REAL(param.kappa), REAL(param.threshold),
                 numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT),
                 AI_int,
                 surfSrc.XskDev, surfSrc.WskDev,
                 block=(param.BSZ, 1, 1), grid=(GSZ, 1))
    timing.time_P2P += seconds_since_tic()

    # --- device -> host transfer of the counters
    tic.record()
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    timing.time_trans += seconds_since_tic()

    return Ktx_gpu, Kty_gpu, Ktz_gpu
def task2(grid, grid_width, grid_height, scaling):
    """Run ``task1`` then the ``task2`` kernel, and update every car's velocity.

    A module is compiled from ``sourcestr`` formatted with the grid
    dimensions.  The device buffers returned by ``task1`` are the inputs of
    the ``task2`` kernel (one block per cell, 1024 threads per block), which
    writes one x and one y value per cell.  Each car stored in
    ``grid[(x, y)]`` then receives ``scale((x_value, y_value), scaling)``
    through ``add_velocity``.
    """
    mod = SourceModule(sourcestr.format(grid_width, grid_height))

    xsum_ptr, ysum_ptr, res_xsum, res_ysum = task1(grid, grid_width,
                                                   grid_height)
    print("%s %s" % ("task 1 xsum", res_xsum))
    print("%s %s" % ("task 1 ysum", res_ysum))

    grid_size = grid_width * grid_height
    shp = (grid_size,)
    typ = np.float32
    xsum_out_ptr = cuda.to_device(np.zeros(shp, dtype=typ))
    ysum_out_ptr = cuda.to_device(np.zeros(shp, dtype=typ))

    func = mod.get_function("task2")
    func(xsum_ptr, ysum_ptr, xsum_out_ptr, ysum_out_ptr,
         grid=(grid_size, 1, 1), block=(1024, 1, 1))
    pycuda.autoinit.context.synchronize()

    res_xsum = cuda.from_device(xsum_out_ptr, shp, typ)
    res_ysum = cuda.from_device(ysum_out_ptr, shp, typ)

    for y in range(grid_height):
        for x in range(grid_width):
            cell = x + y * grid_width  # cells are stored row by row
            cur_avgx = res_xsum[cell]
            cur_avgy = res_ysum[cell]
            for car in grid[(x, y)]:
                car.add_velocity(scale((cur_avgx, cur_avgy), scaling))
def go_sort_old(count, stream=None):
    """Run the legacy 8-bit sort kernel pipeline on ``count`` random bytes.

    Kernels are launched through the module-level ``launch`` helper in this
    order: ``prefix_scan_8_0_shmem_shortseg``, ``better_split``,
    ``prefix_sum``, ``sort_8``, ``sort_8_a``.  The grid has one block per
    8192 input bytes.

    When no ``stream`` is given, the device result is read back as int32.
    Run-length summaries (via ``rle``) of it and of ``np.sort`` of the input
    are written to ``dev.txt`` and ``cpu.txt``, and the outcome of the
    element-wise comparison is printed.
    """
    # frombuffer + copy gives the same writable uint8 array that the
    # deprecated binary-mode np.fromstring produced.
    data = np.frombuffer(np.random.bytes(count), dtype=np.uint8).copy()
    ddata = cuda.to_device(data)
    print('Done seeding')

    # Floor division: grid dimensions must stay integral.
    grids = count // 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
           block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
           block=(32, 1, 1), grid=(grids // 32, 1), stream=stream)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
           dshortseg_pfxs, dshortseg_sums,
           block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
           block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)
    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
           block=(32, 32, 1), grid=(grids, 1), stream=stream)

    if stream:
        return

    def format_runs(runs):
        # One indented line per run tuple.
        return ''.join(['\n\t%3d %4d %4d' % v for v in runs])

    sorted_dev = cuda.from_device(dsorted, (count,), np.int32)
    with open('dev.txt', 'w') as fp:
        fp.write(format_runs(rle(sorted_dev)))

    sorted_np = np.sort(data)
    with open('cpu.txt', 'w') as fp:
        fp.write(format_runs(rle(sorted_np)))

    print('%s %s' % ('is_sorted?', np.all(sorted_dev == sorted_np)))
def train_gpu(self, num_iter, model_file_path):
    """Run ``num_iter`` launches of the ``train_sg`` kernel and save the result.

    On the first batch (``self.batch == 0``) ``self.syn1`` is uploaded and
    one ``WordRep`` per dictionary entry is built at consecutive
    ``WordRep.memsize`` offsets of a freshly allocated device buffer.
    Every iteration then picks one random in-dictionary word per sentence,
    uploads the picks and launches the kernel with one block per sentence
    and ``self.size`` threads per block.  Finally the sentence
    representations are copied back into ``self.sent_reps`` and written to
    ``model_file_path`` with ``pickle_dump``.
    """
    if self.batch == 0:
        # Prepare to send the numpy array to gpu
        self.syn1_gpu = cuda.to_device(self.syn1)

        # Create word idx and related data-structure.
        self.base_word_rep = cuda.mem_alloc(
            len(self.dictionary) * WordRep.memsize)
        next_slot = int(self.base_word_rep)
        self.word_reps = {}
        for w_idx, word in sorted(self.dictionary.items()):
            rep = self.words_rep[word]
            word_code = 1 - 2 * rep[0].astype(dtype=np.int32)
            word_point = rep[1].astype(dtype=np.int32)
            self.word_reps[w_idx] = WordRep(word_code, word_point, next_slot)
            next_slot += WordRep.memsize
        print("GPU transfers done.")

    self.sent_reps_gpu = cuda.to_device(self.sent_reps)

    # Prepare sentences for GPU transfer: token ids of in-dictionary words.
    token2id = self.dictionary.token2id
    idx_sentences = []
    for sentence in self.sentences:
        idx_sentences.append(
            [token2id[word] for word in sentence if word in self.dictionary])

    # Prepare the kernel function
    kernel = self.kernel_str.get_function("train_sg")
    words = np.empty(self.num_sents, dtype=np.int32)
    launch_shape = dict(block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))

    for _ in range(num_iter):
        # Sample words for each sentence and transfer to GPU
        for s_idx in range(self.num_sents):
            words[s_idx] = random.choice(idx_sentences[s_idx])
        words_gpu = cuda.to_device(words)
        kernel(self.sent_reps_gpu,
               np.float32(self.alpha),
               words_gpu,
               self.base_word_rep,
               self.syn1_gpu,
               **launch_shape)

    self.sent_reps = cuda.from_device(self.sent_reps_gpu,
                                      self.sent_reps.shape,
                                      self.sent_reps.dtype)
    pickle_dump(self.sent_reps, model_file_path)
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    """Compare a NumPy multiply-with-carry generator against the device one.

    ``make_seeds`` provides one (multiplier, state, carry) triple per thread
    as columns 0, 1 and 2 of ``seeds``.  Two trials are run.  In each, the
    CPU advances every generator ``rounds`` times and sums the states; the
    ``test_mwc`` kernel then runs and its per-thread 64-bit sums are
    compared with the CPU sums.  Timings are printed for both, and a
    mismatch prints both sum arrays.

    Note: ``states`` and ``carries`` are views into ``seeds``, so the CPU
    loop advances ``seeds`` in place between trials.
    """
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)
    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        sums = np.zeros(nthreads, dtype=np.uint64)
        ctime = time.time()
        mults = seeds[:, 0].astype(np.uint64)
        states = seeds[:, 1]
        carries = seeds[:, 2]
        for _ in range(rounds):
            # The 64-bit product is reinterpreted as two 32-bit words per
            # thread: word 0 is the new state, word 1 the new carry.
            step = np.frombuffer((mults * states + carries).data,
                                 dtype=np.uint32).reshape((nthreads, 2))
            states[:] = step[:, 0]
            carries[:] = step[:, 1]
            sums += states
        ctime = time.time() - ctime
        # Label and timing go out as one line, with the same text that the
        # trailing-comma print statements produced.
        print("%s %s" % ("Trial %d, on CPU: " % trial,
                         "Took %g seconds." % ctime))

        dsums = cuda.mem_alloc(8 * nthreads)
        fun = mod.get_function("test_mwc")
        dtime = fun(dseeds, dsums, np.float32(rounds),
                    block=(blockwidth, 1, 1), grid=(nblocks, 1),
                    time_kernel=True)
        print("%s %s" % ("Trial %d, on device: " % trial,
                         "Took %g seconds." % dtime))

        device_sums = cuda.from_device(dsums, nthreads, np.uint64)
        if not np.all(np.equal(sums, device_sums)):
            print("Sum discrepancy!")
            print(sums)
            print(device_sums)
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    """Compare a NumPy multiply-with-carry generator against the device one.

    ``make_seeds`` provides the multipliers, states and carries of all
    threads as rows 0, 1 and 2 of ``seeds``.  Two trials are run.  In each,
    the CPU advances every generator ``rounds`` times and sums the states;
    the ``test_mwc`` kernel then runs and its per-thread 64-bit sums are
    compared with the CPU sums.  Timings are printed for both, and a
    mismatch prints both sum arrays.

    Note: ``states`` and ``carries`` are views into ``seeds``, so the CPU
    loop advances ``seeds`` in place between trials.
    """
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)
    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        sums = np.zeros(nthreads, dtype=np.uint64)
        ctime = time.time()
        mults = seeds[0].astype(np.uint64)
        states = seeds[1]
        carries = seeds[2]
        for _ in range(rounds):
            # The 64-bit product is reinterpreted as two 32-bit words per
            # thread; with order='F', row 0 holds the new states and row 1
            # the new carries.
            step = np.frombuffer((mults * states + carries).data,
                                 dtype=np.uint32).reshape((2, nthreads),
                                                          order='F')
            states[:] = step[0]
            carries[:] = step[1]
            sums += states
        ctime = time.time() - ctime
        # Label and timing go out as one line, with the same text that the
        # trailing-comma print statements produced.
        print("%s %s" % ("Trial %d, on CPU: " % trial,
                         "Took %g seconds." % ctime))

        dsums = cuda.mem_alloc(8*nthreads)
        fun = mod.get_function("test_mwc")
        dtime = fun(dseeds, dsums, np.float32(rounds),
                    block=(blockwidth,1,1), grid=(nblocks,1),
                    time_kernel=True)
        print("%s %s" % ("Trial %d, on device: " % trial,
                         "Took %g seconds." % dtime))

        device_sums = cuda.from_device(dsums, nthreads, np.uint64)
        if not np.all(np.equal(sums, device_sums)):
            print("Sum discrepancy!")
            print(sums)
            print(device_sums)
def test_stub(shift, trials=10, rounds=1):
    """Time ``sorter.multisort`` and optionally check its output.

    One warm-up call is made so initialisation is not timed, then
    ``trials`` calls are bracketed by CUDA events.  One line is printed with
    the duration per round and two throughput figures derived from it.
    When ``shift == 0`` and the module-level ``correctness`` flag is set,
    the last result buffer is compared against ``np.sort(keys)``.
    Mismatching positions are printed before the assertion fails.

    Buffers, ``count``, ``keys`` and ``stream`` come from module globals.
    """
    # Run once so that evt_a doesn't include initialization time
    sorter.multisort(dout_a, dout_b, dkeys, count, shift, rounds,
                     stream=stream)

    evt_a = cuda.Event().record(stream)
    for _ in range(trials):
        buf = sorter.multisort(dout_a, dout_b, dkeys, count, shift, rounds,
                               stream=stream)
    evt_b = cuda.Event().record(stream)
    evt_b.synchronize()

    dur = evt_b.time_since(evt_a) / (rounds * trials)
    report = (dur, count / (dur * 1000),
              count * sorter.radix_bits / (dur * 32 * 1000))
    print('%6.1f,\t%4.0f,\t%4.0f' % report)

    if not (shift == 0 and correctness):
        return

    print('\nTesting correctness')
    out = cuda.from_device(buf, (count, ), np.uint32)
    sort = np.sort(keys)
    if np.all(out == sort):
        print('Correct')
        return

    nz = np.nonzero(out != sort)[0]
    print(sorted(set(nz >> 13)))
    for pos in nz:
        # Show the neighbourhood of every mismatch in both arrays.
        print('%s %s %s' % (pos, out[pos - 1:pos + 2], sort[pos - 1:pos + 2]))
    assert False, 'Oh no'
def _print_interp_knots(self, rdr, tsidx=5):
    """Print row ``tsidx`` of the device parameter table, one value per line.

    The first ``tsidx + 1`` rows of ``self.info_a.d_params`` are copied to
    the host, each row having ``len(rdr.packer)`` ``f32`` values.  Every
    value of the last row copied is printed next to the matching entry of
    ``rdr.packer.packed``, whose parts are joined with underscores.
    """
    shape = (tsidx + 1, len(rdr.packer))
    infos = cuda.from_device(self.info_a.d_params, shape, f32)
    for value, name_parts in zip(infos[-1], rdr.packer.packed):
        print('%60s %g' % ('_'.join(name_parts), value))
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag, self, param, ind0, timing,
               kernel):
    """
    Evaluate the adjoint double layer potential of surfSrc on surfTar.

    Arguments
    ----------
    XKt    : array, input for the adjoint double layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation
                    points.
    Kt_diag: array, diagonal elements of the adjoint double layer integral
                    operator.
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode
                    computation.
    timing : class, it contains timing information for different parts of
                    the code.
    kernel : pycuda source module.

    Returns
    --------
    Kt_lyr: array, adjoint double layer potential.
    """
    # Only the event class that matches the run mode is looked up.
    make_event = cuda.Event if param.GPU == 1 else Event
    tic = make_event()
    toc = make_event()

    def lap():
        # Close the interval opened by the last tic.record() and return its
        # length multiplied by 1e-3.
        toc.record()
        toc.synchronize()
        return tic.time_till(toc) * 1e-3

    REAL = param.REAL
    Ns = len(surfSrc.triangle)

    # Expand the per-triangle input to K weighted entries per triangle.
    tic.record()
    K = param.K
    w = getWeights(K)
    flat_idx = numpy.arange(Ns * K)
    tri_idx = flat_idx // K
    X_Kt = numpy.zeros(Ns * K)
    X_Ktc = numpy.zeros(Ns * K)
    X_Kt[:] = XKt[tri_idx] * w[flat_idx % K] * surfSrc.area[tri_idx]
    X_Ktc[:] = XKt[tri_idx]
    timing.time_mass += lap()

    # Multipole evaluation; the same zero array fills the three unused slots.
    tic.record()
    C = 0
    X_aux = numpy.zeros(Ns * K)
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_Kt,
                 X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT)
    timing.time_P2M += lap()

    # Upward sweep over every tree cell except cell 0, highest index first.
    tic.record()
    for C in range(len(surfSrc.tree) - 1, 0, -1):
        upwardSweep(surfSrc.tree, C, surfSrc.tree[C].parent, param.P,
                    ind0.II, ind0.JJ, ind0.KK, ind0.index, ind0.combII,
                    ind0.combJJ, ind0.combKK, ind0.IImii, ind0.JJmjj,
                    ind0.KKmkk, ind0.index_small, ind0.index_ptr)
    timing.time_M2M += lap()

    tic.record()
    source_order = surfSrc.sortSource
    X_Kt = X_Kt[source_order]
    X_Ktc = X_Ktc[source_order]
    timing.time_sort += lap()

    param.Nround = len(surfTar.twig) * param.NCRIT
    Ktx_aux = numpy.zeros(param.Nround)
    Kty_aux = numpy.zeros(param.Nround)
    Ktz_aux = numpy.zeros(param.Nround)

    if param.GPU == 0:
        ### CPU code
        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(
                surfSrc, surfTar, Ktx_aux, Kty_aux, Ktz_aux, self,
                ind0.index_large, param, LorY, timing)

        Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(
            surfSrc, surfTar, X_Kt, X_Ktc, Ktx_aux, Kty_aux, Ktz_aux, self,
            LorY, w, param, timing)

    elif param.GPU == 1:
        ### GPU code
        Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL))
        Kty_gpu = cuda.to_device(Kty_aux.astype(REAL))
        Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(
                surfSrc, surfTar, Ktx_gpu, Kty_gpu, Ktz_gpu, self, ind0,
                param, LorY, timing, kernel)

        Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(
            surfSrc, surfTar, X_Kt, X_Ktc, Ktx_gpu, Kty_gpu, Ktz_gpu, self,
            LorY, w, param, timing, kernel)

        # Copy the three accumulated components back to the host.
        tic.record()
        Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL)
        Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL)
        Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL)
        timing.time_trans += lap()

    # Undo the target ordering and combine the components with the normals.
    tic.record()
    unsort = surfTar.unsort
    normal = surfTar.normal
    Kt_lyr = (Ktx_aux[unsort] * normal[:, 0]
              + Kty_aux[unsort] * normal[:, 1]
              + Ktz_aux[unsort] * normal[:, 2])

    if abs(Kt_diag) > 1e-12:  # if same surface
        Kt_lyr += Kt_diag * XKt
    timing.time_sort += lap()

    return Kt_lyr
def send_result(s):
    """Copy ``s.arr_gpu`` to the host and send it to rank 0 with tag 10."""
    host_array = cuda.from_device(s.arr_gpu, s.shape, s.dtype)
    mpi.world.send(0, 10, host_array)
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self, param,
            ind0, timing, kernel):
    """
    Compute the single and double layer potentials of surfSrc on surfTar.

    Arguments
    ----------
    XK     : array, input for the double layer potential.
    XV     : array, input for the single layer potential.
    LorY   : passed through to the M2P/P2P routines.
    surfSrc: source surface.
    surfTar: target surface.
    K_diag : passed through to the P2P routines.
    V_diag : passed through to the CPU P2P routine only.
    IorE   : passed through to the P2P routines.
    self   : index used to look up surfTar.offsetMlt and passed through to
             the M2P/P2P routines.
    param  : parameter object (REAL, K, P, NCRIT, GPU); Nround is set here.
    ind0   : index tables for the tree routines.
    timing : object whose time_* counters are incremented here.
    kernel : passed through to the GPU routines.

    Returns
    --------
    K_lyr  : array, K_aux reordered by surfTar.unsort.
    V_lyr  : array, V_aux reordered by surfTar.unsort.
    """
    tic = cuda.Event()
    toc = cuda.Event()

    def lap():
        # Close the interval opened by the last tic.record() and return its
        # length multiplied by 1e-3.
        toc.record()
        toc.synchronize()
        return tic.time_till(toc)*1e-3

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    L = numpy.sqrt(2*surfSrc.Area)  # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_V = numpy.zeros(Ns*K)
    X_Kx = numpy.zeros(Ns*K)
    X_Ky = numpy.zeros(Ns*K)
    X_Kz = numpy.zeros(Ns*K)
    X_Kc = numpy.zeros(Ns*K)
    X_Vc = numpy.zeros(Ns*K)

    NsK = numpy.arange(Ns*K)
    # Floor division keeps the index array integral; true division would
    # produce floats, which cannot be used to index an array.
    tri = NsK//K
    pt = NsK % K
    X_V[:] = XV[tri]*w[pt]*surfSrc.Area[tri]
    X_Kx[:] = XK[tri]*w[pt]*surfSrc.Area[tri]*surfSrc.normal[tri, 0]
    X_Ky[:] = XK[tri]*w[pt]*surfSrc.Area[tri]*surfSrc.normal[tri, 1]
    X_Kz[:] = XK[tri]*w[pt]*surfSrc.Area[tri]*surfSrc.normal[tri, 2]
    X_Kc[:] = XK[tri]
    X_Vc[:] = XV[tri]
    timing.time_mass += lap()

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj,
                 X_V, X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    timing.time_P2M += lap()

    # Upward sweep over every tree cell except cell 0, highest index first.
    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    timing.time_M2M += lap()

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    timing.time_sort += lap()

    param.Nround = len(surfTar.twig)*param.NCRIT
    K_aux = numpy.zeros(param.Nround)
    V_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self,
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                                X_Kc, X_Vc, K_aux, V_aux, self, LorY, K_diag,
                                V_diag, IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self,
                                   ind0, param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                               X_Kc, X_Vc, K_gpu, V_gpu, self, LorY, K_diag,
                               IorE, L, w, param, timing, kernel)

        # Copy both result arrays back to the host.
        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        timing.time_trans += lap()

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    timing.time_sort += lap()

    return K_lyr, V_lyr
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """
    Launch the ``get_phir`` kernel and bring its results back to the host.

    Arguments
    ----------
    XK      : array, input for the double layer potential.
    XV      : array, input for the single layer potential.
    surface : surface object providing Area, normal, sortSource, xk, wk and
              the device arrays handed to the kernel.
    field   : object providing xq and the xq_gpu/yq_gpu/zq_gpu arrays.
    par_reac: parameter object (REAL, K, BSZ, threshold).
    kernel  : object whose get_function("get_phir") returns the kernel.

    Returns
    --------
    phir_cpu: array of length len(field.xq) copied from the device.
    AI_int  : sum of the per-point int32 counters written by the kernel.
    """
    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)

    # Setup vector
    K = par_reac.K
    w = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        # divmod keeps both indices integral; i/K would be a float under
        # true division and cannot be used as an index.
        src, pt = divmod(i, K)
        k_weighted = XK[src]*w[pt]*surface.Area[src]
        X_V[i] = XV[src]*w[pt]*surface.Area[src]
        X_Kx[i] = k_weighted*surface.normal[src, 0]
        X_Ky[i] = k_weighted*surface.normal[src, 1]
        X_Kz[i] = k_weighted*surface.normal[src, 2]
        X_Kc[i] = XK[src]
        X_Vc[i] = XV[src]

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    # Enough blocks of BSZ threads to cover all Nq points.
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu,
             my_gpu, mz_gpu, mKc_gpu, mVc_gpu, surface.xjDev, surface.yjDev,
             surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             int32(len(surface.xj)), int32(Nq), int32(par_reac.K), xkDev,
             wkDev, REAL(par_reac.threshold), AI_int_gpu,
             int32(len(surface.xk)), surface.XskDev, surface.WskDev,
             block=(par_reac.BSZ, 1, 1), grid=(GSZ, 1))

    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag, self, param, ind0, timing,
               kernel):
    """
    Compute the adjoint double layer potential of surfSrc on surfTar.

    Arguments
    ----------
    XKt    : array, input for the adjoint double layer potential.
    LorY   : passed through to the M2P/P2P routines.
    surfSrc: source surface.
    surfTar: target surface.
    Kt_diag: scalar; when its magnitude exceeds 1e-12, Kt_diag * XKt is added
             to the result.
    self   : index used to look up surfTar.offsetMlt and passed through to
             the M2P/P2P routines.
    param  : parameter object (REAL, K, P, NCRIT, GPU); Nround is set here.
    ind0   : index tables for the tree routines.
    timing : object whose time_* counters are incremented here.
    kernel : passed through to the GPU routines.

    Returns
    --------
    Kt_lyr : array, the three components combined with surfTar.normal.
    """
    tic = cuda.Event()
    toc = cuda.Event()

    def lap():
        # Close the interval opened by the last tic.record() and return its
        # length multiplied by 1e-3.
        toc.record()
        toc.synchronize()
        return tic.time_till(toc)*1e-3

    REAL = param.REAL
    Ns = len(surfSrc.triangle)

    tic.record()
    K = param.K
    w = getWeights(K)
    X_Kt = numpy.zeros(Ns*K)
    X_Ktc = numpy.zeros(Ns*K)

    NsK = numpy.arange(Ns*K)
    # Floor division keeps the index array integral; true division would
    # produce floats, which cannot be used to index an array.
    tri = NsK//K
    X_Kt[:] = XKt[tri]*w[NsK % K]*surfSrc.Area[tri]
    X_Ktc[:] = XKt[tri]
    timing.time_mass += lap()

    # The same zero array fills the three unused getMultipole inputs.
    tic.record()
    C = 0
    X_aux = numpy.zeros(Ns*K)
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj,
                 X_Kt, X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT)
    timing.time_P2M += lap()

    # Upward sweep over every tree cell except cell 0, highest index first.
    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    timing.time_M2M += lap()

    tic.record()
    X_Kt = X_Kt[surfSrc.sortSource]
    X_Ktc = X_Ktc[surfSrc.sortSource]
    timing.time_sort += lap()

    param.Nround = len(surfTar.twig)*param.NCRIT
    Ktx_aux = numpy.zeros(param.Nround)
    Kty_aux = numpy.zeros(param.Nround)
    Ktz_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(
                surfSrc, surfTar, Ktx_aux, Kty_aux, Ktz_aux, self,
                ind0.index_large, param, LorY, timing)

        Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(
            surfSrc, surfTar, X_Kt, X_Ktc, Ktx_aux, Kty_aux, Ktz_aux, self,
            LorY, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL))
        Kty_gpu = cuda.to_device(Kty_aux.astype(REAL))
        Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(
                surfSrc, surfTar, Ktx_gpu, Kty_gpu, Ktz_gpu, self, ind0,
                param, LorY, timing, kernel)

        Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(
            surfSrc, surfTar, X_Kt, X_Ktc, Ktx_gpu, Kty_gpu, Ktz_gpu, self,
            LorY, w, param, timing, kernel)

        # Copy the three accumulated components back to the host.
        tic.record()
        Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL)
        Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL)
        Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL)
        timing.time_trans += lap()

    tic.record()
    Kt_lyr = Ktx_aux[surfTar.unsort]*surfTar.normal[:, 0] \
        + Kty_aux[surfTar.unsort]*surfTar.normal[:, 1] \
        + Ktz_aux[surfTar.unsort]*surfTar.normal[:, 2]

    if abs(Kt_diag) > 1e-12:  # if same surface
        Kt_lyr += Kt_diag * XKt
    timing.time_sort += lap()

    return Kt_lyr
def from_array(cls, a):
    """Build an instance from two device pointers held in ``a``.

    One uint32 is read from ``a[0]`` and ten int32 values from ``a[1]``; the
    scalar and the ten-element array are passed to ``cls`` in that order.
    """
    first_word = cuda.from_device(a[0], 1, dtype=np.uint32)[0]
    payload = cuda.from_device(a[1], 10, dtype=np.int32)
    return cls(first_word, payload)
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self, param,
            ind0, timing, kernel):
    """
    Evaluate the single and double layer potentials of surfSrc on surfTar.

    Arguments
    ----------
    XK     : array, input for the double layer potential.
    XV     : array, input for the single layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation
                    points.
    K_diag : array, diagonal elements of the double layer integral operator.
    V_diag : array, diagonal elements of the single layer integral operator.
    IorE   : int, internal (1) or external (2).
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode
                    computation.
    timing : class, it contains timing information for different parts of
                    the code.
    kernel : pycuda source module.

    Returns
    --------
    K_lyr  : array, double layer potential.
    V_lyr  : array, single layer potential.
    """
    # Only the event class that matches the run mode is looked up.
    make_event = cuda.Event if param.GPU == 1 else Event
    tic = make_event()
    toc = make_event()

    def lap():
        # Close the interval opened by the last tic.record() and return its
        # length multiplied by 1e-3.
        toc.record()
        toc.synchronize()
        return tic.time_till(toc) * 1e-3

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    L = numpy.sqrt(2 * surfSrc.area)  # Representative length

    # Expand the per-triangle inputs to K weighted entries per triangle.
    tic.record()
    K = param.K
    w = getWeights(K)
    flat_idx = numpy.arange(Ns * K)
    tri_idx = flat_idx // K
    pt_idx = flat_idx % K

    X_V = numpy.zeros(Ns * K)
    X_Kx = numpy.zeros(Ns * K)
    X_Ky = numpy.zeros(Ns * K)
    X_Kz = numpy.zeros(Ns * K)
    X_Kc = numpy.zeros(Ns * K)
    X_Vc = numpy.zeros(Ns * K)

    X_V[:] = XV[tri_idx] * w[pt_idx] * surfSrc.area[tri_idx]
    X_Kx[:] = (XK[tri_idx] * w[pt_idx] * surfSrc.area[tri_idx]
               * surfSrc.normal[tri_idx, 0])
    X_Ky[:] = (XK[tri_idx] * w[pt_idx] * surfSrc.area[tri_idx]
               * surfSrc.normal[tri_idx, 1])
    X_Kz[:] = (XK[tri_idx] * w[pt_idx] * surfSrc.area[tri_idx]
               * surfSrc.normal[tri_idx, 2])
    X_Kc[:] = XK[tri_idx]
    X_Vc[:] = XV[tri_idx]
    timing.time_mass += lap()

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_V,
                 X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    timing.time_P2M += lap()

    # Upward sweep over every tree cell except cell 0, highest index first.
    tic.record()
    for C in range(len(surfSrc.tree) - 1, 0, -1):
        upwardSweep(surfSrc.tree, C, surfSrc.tree[C].parent, param.P,
                    ind0.II, ind0.JJ, ind0.KK, ind0.index, ind0.combII,
                    ind0.combJJ, ind0.combKK, ind0.IImii, ind0.JJmjj,
                    ind0.KKmkk, ind0.index_small, ind0.index_ptr)
    timing.time_M2M += lap()

    tic.record()
    source_order = surfSrc.sortSource
    X_V = X_V[source_order]
    X_Kx = X_Kx[source_order]
    X_Ky = X_Ky[source_order]
    X_Kz = X_Kz[source_order]
    X_Kc = X_Kc[source_order]
    X_Vc = X_Vc[source_order]
    timing.time_sort += lap()

    param.Nround = len(surfTar.twig) * param.NCRIT
    K_aux = numpy.zeros(param.Nround)
    V_aux = numpy.zeros(param.Nround)

    if param.GPU == 0:
        ### CPU code
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self,
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                                X_Kc, X_Vc, K_aux, V_aux, self, LorY, K_diag,
                                V_diag, IorE, L, w, param, timing)

    elif param.GPU == 1:
        ### GPU code
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self,
                                   ind0, param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                               X_Kc, X_Vc, K_gpu, V_gpu, self, LorY, K_diag,
                               IorE, L, w, param, timing, kernel)

        # Copy both result arrays back to the host.
        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        timing.time_trans += lap()

    tic.record()
    unsort = surfTar.unsort
    K_lyr = K_aux[unsort]
    V_lyr = V_aux[unsort]
    timing.time_sort += lap()

    return K_lyr, V_lyr
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self, param,
            ind0, timing, kernel):
    """
    Compute the single and double layer potentials of surfSrc on surfTar.

    Arguments
    ----------
    XK     : array, input for the double layer potential.
    XV     : array, input for the single layer potential.
    LorY   : passed through to the M2P/P2P routines.
    surfSrc: source surface.
    surfTar: target surface.
    K_diag : passed through to the P2P routines.
    V_diag : passed through to the P2P routines.
    IorE   : passed through to the P2P routines.
    self   : index used to look up surfTar.offsetMlt and passed through to
             the M2P/P2P routines.
    param  : parameter object (REAL, K, P, NCRIT, GPU); Nround is set here.
    ind0   : index tables for the tree routines.
    timing : object whose time_* counters are incremented here.
    kernel : passed through to the GPU routines.

    Returns
    --------
    K_lyr  : array, K_aux reordered by surfTar.unsort.
    V_lyr  : array, V_aux reordered by surfTar.unsort.
    """
    tic = cuda.Event()
    toc = cuda.Event()

    def lap():
        # Close the interval opened by the last tic.record() and return its
        # length multiplied by 1e-3.
        toc.record()
        toc.synchronize()
        return tic.time_till(toc)*1e-3

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    L = sqrt(2*surfSrc.Area)  # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_V = zeros(Ns*K)
    X_Kx = zeros(Ns*K)
    X_Ky = zeros(Ns*K)
    X_Kz = zeros(Ns*K)
    X_Kc = zeros(Ns*K)
    X_Vc = zeros(Ns*K)

    NsK = arange(Ns*K)
    # Floor division keeps the index array integral; true division would
    # produce floats, which cannot be used to index an array.
    tri = NsK//K
    pt = NsK % K
    X_V[:] = XV[tri]*w[pt]*surfSrc.Area[tri]
    X_Kx[:] = XK[tri]*w[pt]*surfSrc.Area[tri]*surfSrc.normal[tri, 0]
    X_Ky[:] = XK[tri]*w[pt]*surfSrc.Area[tri]*surfSrc.normal[tri, 1]
    X_Kz[:] = XK[tri]*w[pt]*surfSrc.Area[tri]*surfSrc.normal[tri, 2]
    X_Kc[:] = XK[tri]
    X_Vc[:] = XV[tri]
    timing.time_mass += lap()

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj,
                 X_V, X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    timing.time_P2M += lap()

    # Upward sweep over every tree cell except cell 0, highest index first.
    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    timing.time_M2M += lap()

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    timing.time_sort += lap()

    param.Nround = len(surfTar.twig)*param.NCRIT
    K_aux = zeros(param.Nround)
    V_aux = zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self,
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                                X_Kc, X_Vc, K_aux, V_aux, self, LorY, K_diag,
                                V_diag, IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self,
                                   ind0, param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz,
                               X_Kc, X_Vc, K_gpu, V_gpu, self, LorY, K_diag,
                               V_diag, IorE, L, w, param, timing, kernel)

        # Copy both result arrays back to the host.
        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        timing.time_trans += lap()

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    timing.time_sort += lap()

    return K_lyr, V_lyr
def send(s, rank, tag_mark):
    """Read ``s.arr_gpu`` back as a (6, 5) float32 array and send it.

    The host copy is sent to ``rank`` with tag ``tag_mark``.
    """
    mpi.world.send(rank, tag_mark,
                   cuda.from_device(s.arr_gpu, (6, 5), np.float32))
def send(s, rank, tag_mark):
    """Send a host copy of ``s.arr_gpu`` to ``rank`` with tag ``tag_mark``.

    The device buffer is read back as a (6, 5) array of float32.
    """
    shape = (6, 5)
    host_copy = cuda.from_device(s.arr_gpu, shape, np.float32)
    mpi.world.send(rank, tag_mark, host_copy)
# Upload both operands and reserve device memory the size of c for the result.
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(c.nbytes)

# use the normal kernel
# The kernel source is compiled with the placeholder "NN" replaced by nx * ny.
from pycuda.compiler import SourceModule
mod = SourceModule(kernels.replace("NN", str(nx * ny)))
cumul = mod.get_function("cumul")
cumul(a_gpu, b_gpu, c_gpu, block=(256, 1, 1), grid=(2, 1))
# Copy the result back and compare it with the host product a * b.
cuda.memcpy_dtoh(c, c_gpu)
assert np.linalg.norm(c - a * b) < 1e-6

# use the gpuarray
# Same check, with the product computed by gpuarray multiplication.
a_ga = gpuarray.to_gpu(a)
b_ga = gpuarray.to_gpu(b)
assert np.linalg.norm((a_ga * b_ga).get() - a * b) < 1e-6

# arguments
# The gpuarray objects are passed straight to the kernel as its inputs.
cumul(a_ga, b_ga, c_gpu, block=(256, 1, 1), grid=(2, 1))
cuda.memcpy_dtoh(c, c_gpu)
assert np.linalg.norm(c - a * b) < 1e-6

# sub-area memcpy from gpuarray
# Read 100 complex64 values starting 900 elements past the start of a_ga by
# adding a byte offset to the integer value of its device pointer.
a_sub = np.zeros(100, dtype=np.complex64)
a_sub[:] = cuda.from_device(int(a_ga.gpudata) + 900 * np.nbytes["complex64"], (100,), "complex64")
print a_sub

# Same offset read on a zero-filled (10, 13) gpuarray: 30 values starting
# 100 elements in.
d_g = gpuarray.zeros((10, 13), dtype=np.complex64)
print cuda.from_device(int(d_g.gpudata) + 100 * np.nbytes["complex64"], (30,), "complex64")
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """
    It computes the reaction potential on the GPU and it brings the data
    to the cpu.

    Arguments
    ----------
    XK      : array, input for the double layer potential.
    XV      : array, input for the single layer potential.
    surface : class, surface where we are computing the reaction potential.
    field   : class, information about the different regions in the molecule.
    par_reac: class, fine parameters related to the surface.
    kernel  : object whose get_function("get_phir") returns the kernel.

    Returns
    --------
    phir_cpu: array, reaction potential brought from the GPU to the cpu.
    AI_int  : int, counter of the amount of near singular integrals solved.
    """
    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)

    # Setup vector: expand the per-element inputs to K weighted entries each,
    # using the same index-array form as project().
    K = par_reac.K
    w = getWeights(K)
    flat_idx = numpy.arange(N * K)
    src = flat_idx // K
    pt = flat_idx % K

    X_V = numpy.zeros(N * K)
    X_Kx = numpy.zeros(N * K)
    X_Ky = numpy.zeros(N * K)
    X_Kz = numpy.zeros(N * K)
    X_Kc = numpy.zeros(N * K)
    X_Vc = numpy.zeros(N * K)

    k_weighted = XK[src] * w[pt] * surface.area[src]
    X_V[:] = XV[src] * w[pt] * surface.area[src]
    X_Kx[:] = k_weighted * surface.normal[src, 0]
    X_Ky[:] = k_weighted * surface.normal[src, 1]
    X_Kz[:] = k_weighted * surface.normal[src, 2]
    X_Kc[:] = XK[src]
    X_Vc[:] = XV[src]

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    # Enough blocks of BSZ threads to cover all Nq points.
    GSZ = int(numpy.ceil(float(Nq) / par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu,
             my_gpu, mz_gpu, mKc_gpu, mVc_gpu, surface.xjDev, surface.yjDev,
             surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq),
             numpy.int32(par_reac.K), xkDev, wkDev,
             REAL(par_reac.threshold), AI_int_gpu,
             numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev,
             block=(par_reac.BSZ, 1, 1), grid=(GSZ, 1))

    # Per-point counters written by the kernel, summed on the host.
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
def send(s, rank, tag_mark, arr_gpu):
    """Send ``s.ny`` elements of ``arr_gpu`` to a neighbouring rank.

    The starting address depends on which side ``rank`` lies relative to
    ``mpi.rank``: an offset of ``s.ny`` elements when this rank is the higher
    one, ``(s.nx - 2) * s.ny`` elements when it is the lower one. Offsets are
    scaled by the module-level ``nbof`` (presumably the element size in
    bytes; confirm against its definition).

    Raises
    ------
    ValueError
        If ``rank`` equals ``mpi.rank``; no offset is defined for that case.
    """
    base = int(arr_gpu)
    if mpi.rank > rank:
        offset_gpu = base + s.ny * nbof
    elif mpi.rank < rank:
        offset_gpu = base + (s.nx - 2) * s.ny * nbof
    else:
        # Previously this fell through and failed on an unassigned local.
        raise ValueError("cannot send to own rank %r" % (rank,))
    mpi.world.send(rank, tag_mark,
                   cuda.from_device(offset_gpu, (s.ny,), s.f.dtype))
def show(s, a):
    """Print ``a``, then the transpose of the host copy of ``s.arr_gpu``.

    The device buffer is read back as a (6, 5) array of float32.
    """
    print(a)
    host_copy = cuda.from_device(s.arr_gpu, (6, 5), np.float32)
    print(host_copy.T)
def print_arr_gpu(s):
    """Print the host copy of ``s.arr_gpu`` read with ``s.shape``/``s.dtype``."""
    host_copy = cuda.from_device(s.arr_gpu, s.shape, s.dtype)
    print(host_copy)
def show(s, a):
    """Print ``a`` followed by the transposed contents of ``s.arr_gpu``.

    The device buffer is copied to the host as a (6, 5) float32 array before
    it is transposed and printed.
    """
    print(a)
    shape = (6, 5)
    transposed = cuda.from_device(s.arr_gpu, shape, np.float32).T
    print(transposed)
def __repr__(self):
    """Return the host copies of the x and y buffers, one above the other."""
    x_host = cuda.from_device(self.x_ptr, self.shape, self.dtype)
    y_host = cuda.from_device(self.y_ptr, self.shape, self.dtype)
    return "{}\n{}".format(x_host, y_host)
block=(256, 1, 1), grid=(1, 1)) diagonal_tr_bl_aggregate(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) diagonal_br_tl_aggregate(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) agg_image = drv.from_device(dp_ptr, cost_images.shape, dtype=np.float32) print(f"aggregation time {time() - t1}") #agg_image = agg_image.transpose((2,1,0)) min_cost_im = np.argmin(agg_image, axis=0) gt = np.argmin(L, axis=0) agg_image = agg_image.transpose((2, 1, 0)) min_cost_im += 1 gt += 1 im = stereo.compute_depth(np.int32(min_cost_im)) gt_im = stereo.compute_depth(np.int32(gt)) ##im = stereo.normalize(im, 0.1) ##gt_im = stereo.normalize(gt_im, 0.1) np.save("../out_images/testim.npy", im) np.save("../out_images/gtim.npy", im)
def __str__(self):
    """Return the string form of the host copy of ``self.data``."""
    host_copy = cuda.from_device(self.data, self.shape, self.dtype)
    return str(host_copy)
def send(rank, tag_mark, nx, ny, arr_gpu):
    """Send ``ny - 2`` elements of ``arr_gpu`` to a neighbouring rank.

    The starting address depends on which side ``rank`` lies relative to
    ``mpi.rank``: an offset of ``ny + 1`` elements when this rank is the
    higher one, ``(nx - 2) * ny + 1`` elements when it is the lower one.
    Offsets are scaled by the module-level ``nbof`` and the data is read with
    the module-level dtype ``dtof`` (presumably the element size in bytes and
    the matching float type; confirm against their definitions).

    Raises
    ------
    ValueError
        If ``rank`` equals ``mpi.rank``; no offset is defined for that case.
    """
    base = int(arr_gpu)
    if mpi.rank > rank:
        offset_gpu = base + (ny + 1) * nbof
    elif mpi.rank < rank:
        offset_gpu = base + ((nx - 2) * ny + 1) * nbof
    else:
        # Previously this fell through and failed on an unassigned local.
        raise ValueError("cannot send to own rank %r" % (rank,))
    mpi.world.send(rank, tag_mark,
                   cuda.from_device(offset_gpu, (ny - 2,), dtof))
def print_gpu(s):
    """Print the host copy of ``s.arr_gpu`` read as a (6, 5) float32 array."""
    # s.show()
    host_copy = cuda.from_device(s.arr_gpu, (6, 5), np.float32)
    print(host_copy)