def corr(input, axes=(-1, -2), norm=False, returngpu=False, **kwargs):
    """
    Simple autocorrelation of input along axes (default: last two) using the GPU.

    input: array-like, copied to the GPU with ``_cp.array``
    axes: axes to correlate along, defaults to the last two
    norm: normalise by the mean over the non-correlation axes and by the
        pair count of each lag; lags whose pair count is below 0.9 are set
        to NaN
    returngpu: return a cupy array instead of a numpy array
    **kwargs: accepted for call compatibility; not used by this function

    Returns the autocorrelation, cropped to ``2 * input.shape[ax]`` samples
    along each correlation axis.

    NOTE(review): the crop and the pair-count normalisation index the trailing
    dimensions, so this appears to assume ``axes`` are the last axes of
    ``input`` -- confirm before calling with other axes.
    """
    axes = sorted([input.ndim + a if a < 0 else a for a in axes])
    fftshape = [_fastlen(2 * input.shape[ax]) for ax in axes]
    dinput = _cp.array(input)
    if norm:
        other_axes = tuple(i for i in range(input.ndim) if i not in axes)
        dinput *= 1 / dinput.mean(axis=other_axes or None)
    # Forward and inverse transform must act on the same axes.
    ret = _cp.fft.rfftn(dinput, s=fftshape, axes=axes)
    ret = _cp.abs(ret) ** 2
    # Pass the padded shape explicitly: without it the inverse real transform
    # assumes an even last-axis length and returns one sample too few when
    # the padded length is odd.
    ret = _cp.fft.irfftn(ret, s=fftshape, axes=axes)
    crop = tuple(
        slice(ps // 2 - input.shape[ax], ps // 2 + input.shape[ax])
        for ax, ps in zip(axes, fftshape)
    )
    ret = _cp.fft.fftshift(ret, axes=axes)[(Ellipsis, *crop)]
    if norm:
        # Autocorrelation of an all-ones window gives the number of
        # contributing pairs for every lag.
        n = corr(_cp.ones(tuple(input.shape[ax] for ax in axes)), returngpu=True)
        ret /= n
        ret[(...,) + (n < 0.9).nonzero()] = _np.nan
    if not returngpu:
        ret = _cp.asnumpy(ret)
        _cp.get_default_memory_pool().free_all_blocks()
    return ret
def test_empty_int_huge_size(self):
    """Fill a 2**31-element 'b' array with 123 and check every element."""
    n_elements = 2**31
    buf = cupy.empty(n_elements, dtype='b')
    buf.fill(123)
    self.assertTrue((buf == 123).all())
    # Release the huge allocation so later slow tests have room.
    del buf
    pool = cupy.get_default_memory_pool()
    pool.free_all_blocks()
def test_empty_int_huge_size_fill0(self):
    """Fill a 2**31-element 'b' array with 0 and check every element."""
    n_elements = 2 ** 31
    buf = cupy.empty(n_elements, dtype='b')
    buf.fill(0)
    assert (buf == 0).all()
    # Release the huge allocation so later slow tests have room.
    del buf
    pool = cupy.get_default_memory_pool()
    pool.free_all_blocks()
def test_empty_huge_size(self):
    """Fill a (1024, 2048, 1024) 'b' array with 123 and check every element."""
    huge_shape = (1024, 2048, 1024)
    buf = cupy.empty(huge_shape, dtype='b')
    buf.fill(123)
    assert (buf == 123).all()
    # Release the huge allocation so later slow tests have room.
    del buf
    pool = cupy.get_default_memory_pool()
    pool.free_all_blocks()
def __init__(
    self,
    model: Any,
    config=None,
    optimizer: Any = None,
    mixed_precision: bool = False,
    grad_scaler: Optional[PyTorchGradScaler] = None,
):
    """Set up the shim, its gradient scaler and the GPU allocator.

    Raises ValueError when mixed precision is requested but ``has_torch_amp``
    is false. When the current ops are CupyOps and no "pytorch" pool is
    registered in ``context_pools``, the GPU allocator is switched to
    "pytorch" and the unused blocks of CuPy's default pool are released.
    """
    if mixed_precision and not has_torch_amp:
        raise ValueError(
            "Mixed-precision training is not supported, requires capable GPU and torch>=1.9.0"
        )

    super().__init__(model, config, optimizer)

    # Build a default scaler only when the caller did not supply one.
    self._grad_scaler = (
        PyTorchGradScaler(mixed_precision) if grad_scaler is None else grad_scaler
    )
    self._mixed_precision = mixed_precision

    if CupyOps.xp is None or not isinstance(get_current_ops(), CupyOps):
        return

    if "pytorch" in context_pools.get():
        return

    from cupy import get_default_memory_pool

    set_gpu_allocator("pytorch")
    get_default_memory_pool().free_all_blocks()
def test_empty_huge_size_fill0(self):
    """Fill a (1024, 2048, 1024) 'b' array with 0 and check every element."""
    huge_shape = (1024, 2048, 1024)
    buf = cupy.empty(huge_shape, dtype='b')
    buf.fill(0)
    self.assertTrue((buf == 0).all())
    # Release the huge allocation so later slow tests have room.
    del buf
    pool = cupy.get_default_memory_pool()
    pool.free_all_blocks()
def test(self):
    """Sum an all-ones array of ``self.size`` elements and compare to the size."""
    # Elementwise kernel: create the ones.
    ones = cupy.ones(self.size, dtype='b')
    # Reduction kernel: add them up.
    total = ones.sum()
    self.assertEqual(self.size, total)
    # Release the huge allocation so later slow tests have room.
    del ones
    pool = cupy.get_default_memory_pool()
    pool.free_all_blocks()
def test_concatenate_32bit_boundary(self):
    """Concatenate two 2**30-element int8 arrays (result has 2**31 elements)."""
    half = (2**30, )
    a = cupy.zeros(half, dtype=cupy.int8)
    b = cupy.zeros(half, dtype=cupy.int8)
    ret = cupy.concatenate([a, b])
    del a, b, ret
    # Release the huge allocations so later slow tests have room.
    pool = cupy.get_default_memory_pool()
    pool.free_all_blocks()
def prepare_eval_data(self):
    """Build the batched evaluation users, items and duplicate mask.

    Negative samples are generated on the GPU with ``generate_negatives`` and
    copied to the host, then placed in front of the positive sample of each
    row. The flattened results are split into per-batch chunks and stored in
    ``self.eval_users``, ``self.eval_items`` and ``self.dup_mask``. Unused
    blocks of the CuPy memory pool are released at the end.
    """
    pos_eval_users = cp.array(self._pos_eval_users)
    # NOTE(review): pos_eval_items is uploaded but not used below.
    pos_eval_items = cp.array(self._pos_eval_items)
    neg_mat = cp.array(self._neg_mat)
    # One entry per (positive user, negative sample) pair.
    neg_eval_users_base = cp.repeat(pos_eval_users, self._eval_negative_samples)

    # Generate negative samples
    test_u_neg, test_i_neg = generate_negatives(
        neg_users=neg_eval_users_base, true_mat=neg_mat,
        item_range=self.num_items, sort=True, use_trick=False)

    # Reshape to one row per positive sample and copy back to the host.
    test_u_neg = test_u_neg.reshape(
        (-1, self._eval_negative_samples)).get()
    test_i_neg = test_i_neg.reshape(
        (-1, self._eval_negative_samples)).get()
    test_users = self._pos_eval_users.reshape((-1, 1))
    test_items = self._pos_eval_items.reshape((-1, 1))

    # Combine positive and negative samples (positive is the last column)
    test_users = np.concatenate((test_u_neg, test_users), axis=1)
    test_items = np.concatenate((test_i_neg, test_items), axis=1)

    # Generate duplicate mask
    ## Stable sort indices by incrementing all values with fractional position
    indices = np.arange(test_users.shape[1]).reshape(
        (1, -1)).repeat(test_users.shape[0], axis=0)
    summed_items = np.add(test_items, indices / test_users.shape[1])
    sorted_indices = np.argsort(summed_items, axis=1)
    # Inverse permutation: position of each original column in sorted order.
    sorted_order = np.argsort(sorted_indices, axis=1)
    sorted_items = np.sort(test_items, axis=1)
    ## Generate duplicate mask
    # An entry is flagged when the next entry in sorted order is equal to it;
    # the last sorted column has no successor and gets a zero.
    dup_mask = np.equal(sorted_items[:, 0:-1], sorted_items[:, 1:])
    dup_mask = np.concatenate((dup_mask, np.zeros(
        (test_users.shape[0], 1))), axis=1)
    r_indices = np.arange(test_users.shape[0]).reshape(
        (-1, 1)).repeat(test_users.shape[1], axis=1)
    # Map the mask from sorted order back to the original column order.
    dup_mask = dup_mask[r_indices, sorted_order].astype(np.float32)

    # Reshape all to (-1) and split into chunks
    batch_size = self.eval_users_per_batch * test_users.shape[1]
    split_indices = np.arange(batch_size,
                              test_users.shape[0] * test_users.shape[1],
                              batch_size)
    self.eval_users = np.split(test_users.reshape(-1), split_indices)
    self.eval_items = np.split(test_items.reshape(-1), split_indices)
    self.dup_mask = np.split(dup_mask.reshape(-1), split_indices)

    # Free GPU memory to make space for Tensorflow
    cp.get_default_memory_pool().free_all_blocks()
def print_proc_metadata():
    """Print the CPU core count and CuPy default memory pool statistics."""
    num_cores = mp.cpu_count()
    mempool = cp.get_default_memory_pool()
    rule = '--------------------------------------------------------'
    rows = (
        ('| num_cpu_cores: {:<37} |', num_cores),
        ('| mempool used bytes: {:<32} |', mempool.used_bytes()),
        ('| mempool total bytes: {:<31} |', mempool.total_bytes()),
        ('| mempool limit bytes: {:<31} |', mempool.get_limit()),
    )
    print(rule)
    for template, value in rows:
        print(template.format(value))
    print(rule)
def test_cumprod_huge_array(self):
    """Cumulative product of 2**32 ones must be all ones."""
    pool = cupy.get_default_memory_pool()
    size = 2**32
    # Start from an empty pool: this test needs a lot of memory.
    pool.free_all_blocks()
    ones = cupy.ones(size, 'b')
    result = cupy.cumprod(ones, dtype='b')
    del ones
    assert (result == 1).all()
    # Release the huge allocation so later slow tests have room.
    del result
    pool.free_all_blocks()
def test_memory():
    """Check the default pool reports zero used bytes before and after use."""
    pool = cp.get_default_memory_pool()
    assert pool.used_bytes() == 0

    a = Test_Custom_Cupy.test_create_real_cupy_from_c()
    b = a * 2
    assert cp.array_equal(b.sum(), a.sum() * 2)

    # Drop both references so their device memory is returned to the pool.
    a = b = None
    assert pool.used_bytes() == 0
def _cupy_convolve_fft(self, image1, image2, mode=None):
    """Convolve two arrays on the GPU via real FFTs.

    The full linear convolution (shape ``image1.shape + image2.shape - 1``)
    is computed and then cropped to the central region of ``image1.shape``.

    image1, image2: arrays of equal dimensionality
    mode: currently unused; the result is always cropped to ``image1.shape``

    Returns the cropped convolution; ``image1 * image2`` for scalar inputs;
    an empty array when either input is empty.
    Raises ValueError when the inputs differ in dimensionality.
    """
    import cupy
    import numpy

    # TODO: review if this is needed
    cupy.cuda.set_allocator(None)

    self._debug_allocation("before FFT")

    is_planning_on = cupy.fft.config.enable_nd_planning
    cupy.fft.config.enable_nd_planning = False
    try:
        if image1.ndim == image2.ndim == 0:  # scalar inputs
            return image1 * image2
        elif not image1.ndim == image2.ndim:
            raise ValueError("Dimensions do not match.")
        elif image1.size == 0 or image2.size == 0:  # empty arrays
            return cupy.array([])

        s1 = numpy.asarray(image1.shape)
        s2 = numpy.asarray(image2.shape)
        fsize = tuple(int(sz) for sz in s1 + s2 - 1)

        image1_fft = cupy.fft.rfftn(image1, fsize)
        image2_fft = cupy.fft.rfftn(image2, fsize)
        # The output shape must be given explicitly: the inverse real
        # transform otherwise assumes an even last-axis length and returns
        # one sample too few whenever fsize[-1] is odd.
        ret = cupy.fft.irfftn(image1_fft * image2_fft, fsize)
        del image1_fft
        del image2_fft

        # Crop the centre of the full convolution to image1's shape. The
        # indices are computed on the host so that slicing needs no device
        # round-trips.
        starts = (numpy.asarray(ret.shape) - s1) // 2
        ret = ret[tuple(
            slice(int(begin), int(begin) + int(length))
            for begin, length in zip(starts, s1)
        )]
    finally:
        # Restore the global planning flag on every exit path, including the
        # early returns and the ValueError above.
        cupy.fft.config.enable_nd_planning = is_planning_on

    cupy.get_default_memory_pool().free_all_blocks()

    self._debug_allocation("after fft")

    return ret
def start(self, rand_seed=None):
    """Run the simulation kernel on the GPU and collect the results.

    rand_seed: seed passed to the kernel; drawn with
        ``np.random.randint(1e5)`` when None.

    Uploads the state arrays and the model tables, launches ``func``, copies
    ``add``, ``p``, ``v`` and ``w`` back to the host, releases the GPU
    memory, then runs the post-processing steps. Returns ``self``.
    """
    if rand_seed is None:
        rand_seed = np.random.randint(1e5)

    self.nPh = int(self.nPh)
    self._reset_results()
    self._generate_initial_coodinate(self.nPh)

    model = self.model
    M = np.int32(model.voxel_model.shape[1])
    L = np.int32(model.voxel_model.shape[2])

    print("")
    print("###### Start (Random seed: %s) ######" % rand_seed)
    print("")
    start_ = time.time()

    device_pool = cp.get_default_memory_pool()
    pinned_pool = cp.get_default_pinned_memory_pool()
    device_pool.free_all_blocks()
    pinned_pool.free_all_blocks()

    # Per-photon state.
    d_add = cp.asarray(self.add.astype(np.int32), dtype=np.int32)
    d_p = cp.asarray(self.p.astype(np.float32), dtype=np.float32)
    d_v = cp.asarray(self.v.astype(np.float32), dtype=np.float32)
    d_w = cp.asarray(self.w.astype(np.float32), dtype=np.float32)
    # Model tables.
    d_ma = cp.asarray(model.ma.astype(np.float32))
    d_ms = cp.asarray(model.ms.astype(np.float32))
    d_n = cp.asarray(model.n.astype(np.float32))
    d_g = cp.asarray(model.g.astype(np.float32))
    d_voxels = cp.asarray(model.voxel_model.astype(np.int8), dtype=np.int8)
    # Scalars.
    voxel_space = cp.float32(model.voxel_space)
    n_photons = cp.int32(self.nPh)
    end_point = cp.int8(model.end_point)

    # Enough blocks of ``threadnum`` threads to cover every photon.
    grid = (int((self.nPh + self.threadnum - 1) / self.threadnum), 1)
    block = (self.threadnum, 1)
    kernel_args = (d_add, d_p, d_v, d_w, d_ma, d_ms, d_n, d_g,
                   d_voxels, voxel_space, M, L, n_photons, end_point,
                   np.int32(rand_seed))
    func(grid, block, kernel_args)

    self.add = cp.asnumpy(d_add)
    self.p = cp.asnumpy(d_p)
    self.v = cp.asnumpy(d_v)
    self.w = cp.asnumpy(d_w)

    # Drop every reference to the device buffers before emptying the pools.
    del kernel_args
    del d_add, d_p, d_v, d_w, d_ma, d_ms, d_n, d_g
    del d_voxels, voxel_space, M, L, n_photons, end_point, rand_seed
    device_pool.free_all_blocks()
    pinned_pool.free_all_blocks()
    gc.collect()

    self._end_process()
    print("###### End ######")
    self.getRdTtRate()
    calTime(time.time(), start_)
    return self
def test_fft_allocate(self):
    """An FFT must not raise CuFFTError while enough GPU memory is available."""
    # Check CuFFTError is not raised when the GPU memory is enough.
    # See https://github.com/cupy/cupy/issues/1063
    # TODO(mizuno): Simplify "a" after memory compaction is implemented.
    a = [cupy.empty(100000000) for _ in six.moves.range(10)]
    del a
    signal = cupy.empty(100000007, dtype=cupy.float32)
    cupy.fft.fft(signal)
    # Release the huge allocation so later slow tests have room.
    del signal
    pool = cupy.get_default_memory_pool()
    pool.free_all_blocks()
def test_with_over_size_array(self):
    """Round-trip a very large array through the CUDA array interface.

    Running out of device memory is tolerated; the pool is emptied either way.
    """
    # real example from #3009
    n_elements = 5 * 10**8
    pool = cupy.get_default_memory_pool()
    try:
        source = testing.shaped_random((n_elements, ), cupy, cupy.float64)
        wrapped = cupy.asarray(
            DummyObjectWithCudaArrayInterface(source, 2, None))
        testing.assert_array_equal(source, wrapped)
    except cupy.cuda.memory.OutOfMemoryError:
        pass
    else:
        del wrapped, source
    finally:
        pool.free_all_blocks()
def saveELM(svd_file, original_file, final_file, point_file, weight_file, dim):
    """Project every distance matrix through ``mat`` and save the result.

    svd_file: HDF5 file providing the ``distances`` dataset
    original_file: HDF5 file that is opened (and must therefore exist) but
        from which nothing is read
    final_file: output HDF5 file; receives the result as dataset ``data``
    point_file: HDF5 file providing the ``mat`` dataset
    weight_file: unused
    dim: size of the last axis of the output

    For each instance ``i`` the output row is
    ``(mat @ distances[i].T).T``, computed on the GPU.
    """
    # Context managers close the files even if reading fails; inputs are
    # opened read-only explicitly.
    with h5py.File(svd_file, 'r') as svd_h5, h5py.File(original_file, 'r'):
        distances = svd_h5['distances'][:]
    with h5py.File(point_file, 'r') as point_h5:
        mat = point_h5['mat'][:]

    data_dim, surf_size = distances.shape[0], distances.shape[1]
    pinned_memory_pool = cupy.get_default_pinned_memory_pool()

    tmp = numpy.zeros((data_dim, surf_size, dim))
    pinvmat = cupy.asarray(mat)
    for inst in range(data_dim):
        if inst % 200 == 0:
            print(inst)  # progress indicator
        dt = cupy.asarray(distances[inst])
        res = cupy.matmul(pinvmat, dt.transpose())
        tmp[inst] = cupy.asnumpy(res.transpose())
        # Release the per-instance buffers before the next iteration.
        del dt
        del res
        pinned_memory_pool.free_all_blocks()

    with h5py.File(final_file, 'w') as saveh5:
        saveh5.create_dataset('data', data=tmp)
def process(self, inputs):
    """FFT-convolve ``inputs['in1']`` with ``inputs['in2']``.

    Reads ``mode`` (default 'full'), ``axes`` (default []) and ``use_cpu``
    (default False) from ``self.conf``. An empty ``axes`` becomes None and a
    single-element ``axes`` is unwrapped. Returns ``{'fftconvolve': result}``.
    """
    conf = self.conf
    mode = conf.get('mode', 'full')
    axes = conf.get('axes', [])
    use_cpu = conf.get('use_cpu', False)

    in1 = inputs['in1']
    in2 = inputs['in2']

    n_axes = len(axes)
    if n_axes == 0:
        axes = None
    elif n_axes == 1:
        axes = axes[0]

    if use_cpu:
        return {'fftconvolve': sifftconv(in1, in2, mode=mode, axes=axes)}

    # GPU path: clear the FFT plan cache and unused pool blocks first.
    plan_cache = cp.fft.config.get_plan_cache()
    plan_cache.clear()
    cp.get_default_memory_pool().free_all_blocks()
    if plan_cache.get_size() > 0:
        plan_cache.set_size(0)

    return {'fftconvolve': cufftconv(in1, in2, mode=mode, axes=axes)}
def log_memory_usage(self, header=""):
    """Log used/total bytes of the default memory pool; no-op without a GPU."""
    if USE_GPU:
        mempool = xp.get_default_memory_pool()
        used = sizeof_fmt(mempool.used_bytes())
        total = sizeof_fmt(mempool.total_bytes())
        logger.info(f"{header} GPU memory used/Total: {used}/{total}")
def main():
    """Run the solver repeatedly over a range of sample sizes.

    For every sample size the solver is built and evaluated ``repeat`` times,
    releasing unused memory-pool blocks before each run and synchronising the
    device afterwards; the sample size is printed once its runs are done.
    """
    mempool = cp.get_default_memory_pool()
    mempool.get_limit()

    opti_vector = [0.1] * 7
    mean = [7.5, 5.1, 17.5, 5.1, 5.05, 12.5, 5.1]
    variable_costs = [1, 9, 5, 15, 2, 11, 18]
    distributions = [1, 0, 1, 0, 1, 1, 0]
    repeat = 25

    for _ in range(1):
        # Six small sizes, then 10M to 90M in steps of 5M.
        sample_size = [10000, 50000, 100000, 500000, 1000000, 5000000]
        sample_size += list(range(10000000, 90000001, 5000000))
        for samplesize in sample_size:
            for _run in range(repeat):
                mempool.free_all_blocks()
                c = solver(
                    sample_size=samplesize,
                    mean=mean,
                    fixed_costs=0,
                    variable_costs=variable_costs,
                    distributions=distributions,
                    usl=0.1,
                    float_type=floattype,
                )
                tolerances = c.tolerances(opti_vector=opti_vector)
                c.closing_dimension(tolerances)
                dev.synchronize()
            print(samplesize)
def run_hpl(n, nr, tol=16):
    """
    Run the High-performance LINPACK test on a matrix of size n x n, nr number
    of times, and ensure that the maximum of the three residuals is strictly
    less than the prescribed tolerance (defaults to 16).

    n: matrix dimension
    nr: number of repetitions handed to ``iterate_func``
    tol: upper bound (exclusive) for the largest residual

    Returns the tuple ``(performance, umem)``: the performance in GFlops/sec
    and the memory figure derived from the pool's used bytes.
    Raises ValueError if ``args.type`` is neither 'fp32' nor 'fp64', and
    RuntimeError if the solution does not meet the tolerance.
    """
    mempool = cn.get_default_memory_pool()
    if args.type == 'fp32':
        accuracy = cn.float32
    elif args.type == 'fp64':
        accuracy = cn.float64
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("Unsupported type {!r}; expected 'fp32' or 'fp64'".format(args.type))

    a = cn.random.rand(n, n).astype(accuracy)
    b = cn.random.rand(n, 1).astype(accuracy)
    x, t = iterate_func(nr, cn.linalg.solve, a, b, n, mempool)

    # Residual of the solve and its two scaled variants.
    eps = cn.finfo(accuracy).eps
    r = cn.dot(a, x) - b
    r0 = cn.linalg.norm(r, cn.inf)
    r1 = r0 / (eps * cn.linalg.norm(a, 1) * n)
    r2 = r0 / (eps * cn.linalg.norm(a, cn.inf) * cn.linalg.norm(x, cn.inf) * n)

    performance = (1e-9 * (2.0 / 3.0 * n * n * n + 3.0 / 2.0 * n * n) * nr / t)
    # Compare against the caller's tolerance rather than a hard-coded 16.
    verified = np.max((r0.get(), r1.get(), r2.get())) < tol
    umem = 4 * mempool.used_bytes() // (1024 * 1024)

    msg = 'performance={} umem={} verified={} r0={} r1={} r2={}'.format(performance, umem, verified, r0, r1, r2)
    logging.info(msg)
    if not verified:
        err = "Solution did not meet the prescribed tolerance {}".format(tol)
        raise RuntimeError(err)
    return performance, umem
def use_default_mempool_in_cupy():
    """Make CuPy allocate from its own default memory pool.

    Also clears the module-level ``_using_torch_mempool`` flag.
    """
    global _using_torch_mempool
    _ensure_cupy()
    default_pool = cupy.get_default_memory_pool()
    cupy.cuda.set_allocator(default_pool.malloc)
    _using_torch_mempool = False
def modeling(self, path, save_dicom=False):
    """Generate the model on the GPU, post-process it on the host, return it.

    path: location passed to ``_save_dicom`` and ``_save_info``
    save_dicom: when true, the raw result is written with ``_save_dicom``

    Returns the binarised model, tiled by ``tile_num_xz`` / ``tile_num_y``
    when ``tile_num_xz`` is non-zero.
    """
    self.save_dicom = save_dicom
    mempool = cp.get_default_memory_pool()
    pinned_mempool = cp.get_default_pinned_memory_pool()
    mempool.free_all_blocks()
    pinned_mempool.free_all_blocks()

    self._calc_kukv()
    u, v = self._get_inital_vector()
    for _ in tqdm(range(self.repetition)):
        u, v = self._calc_onestep(u, v)

    self.model_shape = u.shape
    # Report the size of the array data; sys.getsizeof would only measure
    # the Python wrapper object, not the buffer it refers to.
    print("Model Size: %s Mb" % (u.nbytes / 1e6))

    U = cp.asnumpy(u)
    # Drop the GPU-side state before emptying the pools.
    del self.ku, self.kv, u, v
    gc.collect()
    mempool.free_all_blocks()
    pinned_mempool.free_all_blocks()

    if save_dicom:
        self._save_dicom(U, path)
    U = self._adjust_vbtv(U)
    self._calc_microarchitecture(U)
    self._save_info(path)
    U = self._model_binarization(U)
    if self.tile_num_xz != 0:
        U = np.tile(U, (self.tile_num_xz, self.tile_num_y, self.tile_num_xz))
    return U
def _cufftn(data, overwrite_input=False, **kwargs):
    """
    Compute the N-dimensional FFT of ``data`` with an explicit plan, then
    release unused pool memory.

    Real input (float32/float64) uses ``cufft.rfftn`` with an 'R2C' plan;
    complex input (complex64/complex128) uses ``cufft.fftn`` with a 'C2C'
    plan. Any other dtype raises ValueError.
    """
    dtype = data.dtype
    if dtype in (cp.float32, cp.float64):
        value_type, transform = 'R2C', cufft.rfftn
    elif dtype in (cp.complex64, cp.complex128):
        value_type, transform = 'C2C', cufft.fftn
    else:
        raise ValueError(f"{data.dtype} is unrecognized data type.")

    # Build the plan, then run the transform under it.
    plan = cufft.get_fft_plan(data, value_type=value_type)
    with plan:
        fft = transform(data, overwrite_x=overwrite_input, **kwargs)

    # Drop the plan, then hand unused blocks back to the driver.
    del plan
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()

    return fft
def setUp(self):
    """Install a managed-memory pool as allocator when ``memory == 'managed'``.

    The previous default pool is remembered in ``self.old_pool``. The test is
    skipped on HIP.
    """
    if self.memory != 'managed':
        return
    if cuda.runtime.is_hip:
        pytest.skip('HIP does not support managed memory')
    self.old_pool = cupy.get_default_memory_pool()
    managed_pool = cuda.MemoryPool(cuda.malloc_managed)
    self.new_pool = managed_pool
    cuda.set_allocator(managed_pool.malloc)
def ACE_cp(img, ratio=4, radius=300, gpu_id=0):
    """Conventional (direct) ACE implementation on the GPU.

    img: 2-D array (unpacked as ``height, width``)
    ratio: slope applied to the pixel differences before clipping to [-1, 1]
    radius: half-width of the window; the window is ``2 * radius + 1`` wide
    gpu_id: CUDA device the computation runs on

    Returns an array of ``img.shape`` holding, for every pixel, the sum over
    all window offsets of ``para[h][w] * clip((img - shifted) * ratio, -1, 1)``,
    where ``para`` comes from ``getPara`` and ``shifted`` is the zero-padded
    image at that offset.
    """
    with cp.cuda.Device(gpu_id):
        mempool = cp.get_default_memory_pool()
        pinned_mempool = cp.get_default_pinned_memory_pool()

        para = getPara(radius, gpu_id=gpu_id)
        height, width = img.shape
        size = 2 * radius + 1

        # Zero-padded copy of the image. Explicit end indices are used
        # because ``radius:-radius`` is an empty slice when radius == 0.
        Z = cp.zeros((height + 2 * radius, width + 2 * radius))
        Z[radius:radius + height, radius:radius + width] = img

        res = cp.zeros(img.shape)
        para = cp.asarray(para)
        for h in range(size):
            for w in range(size):
                # Offsets with zero weight contribute nothing.
                if para[h][w] == 0:
                    continue
                res += (para[h][w] * cp.clip(
                    (img - Z[h:h + height, w:w + width]) * ratio, -1, 1))

        del Z, para
        gc.collect()
        mempool.free_all_blocks()
        pinned_mempool.free_all_blocks()
        return res
def _compute_bispectrum(kind, kn, kcoords, nsamples, sample_thresh, ndim, dim,
                        shape, double, progress, exclude, blocksize,
                        compute_point, *ffts):
    """Accumulate the bispectrum over all bin pairs ``(i, j)`` with ``j <= i``.

    For each pair, ``compute_point`` is launched over either every index
    combination or a random sample of them; the per-point buffers are summed
    and written symmetrically into the result matrices.

    Returns ``(bispec, binorm, omega, counts)`` as host arrays of shape
    ``(dim, dim)``. Pairs skipped by ``exclude`` keep their initial values
    (NaN in ``bispec``/``binorm``, 0 in ``omega``/``counts``).

    NOTE(review): ``ndim`` is not used in this function.
    """
    knyq = max(shape) // 2
    shape = [cp.int16(Ni) for Ni in shape]
    # Precision selection; these names shadow the builtins inside this
    # function only.
    if double:
        float, complex = cp.float64, cp.complex128
    else:
        float, complex = cp.float32, cp.complex64
    mempool = cp.get_default_memory_pool()
    pinned_mempool = cp.get_default_pinned_memory_pool()
    bispec = cp.full((dim, dim), cp.nan + 1.j * cp.nan, dtype=complex)
    binorm = cp.full((dim, dim), cp.nan, dtype=float)
    # omega is kept on the host; the other results live on the GPU.
    omega = np.zeros((dim, dim), dtype=np.int64)
    counts = cp.zeros((dim, dim), dtype=cp.int64)
    for i in range(dim):
        k1 = kn[i]
        k1ind = kind[i]
        nk1 = k1ind.size
        # Only the lower triangle is computed; results are mirrored below.
        for j in range(i + 1):
            k2 = kn[j]
            if exclude and k1 + k2 > knyq:
                continue
            k2ind = kind[j]
            nk2 = k2ind.size
            # An np.int64 entry is used as an absolute sample count; any
            # other entry is scaled by the number of combinations (min 1).
            nsamp = nsamples[i, j]
            nsamp = int(nsamp) if type(nsamp) is np.int64 \
                else max(int(nsamp*nk1*nk2), 1)
            # Draw random combinations when sampling is requested or the
            # count exceeds the threshold; otherwise enumerate all of them.
            if nsamp < nk1 * nk2 or nsamp > sample_thresh:
                samp = cp.random.randint(0, nk1 * nk2,
                                         size=nsamp, dtype=cp.int64)
                count = nsamp
            else:
                samp = cp.arange(nk1 * nk2, dtype=cp.int64)
                count = nk1 * nk2
            # Launch geometry: enough blocks of ``blocksize`` threads to
            # cover ``count`` points.
            tpb = blocksize
            bpg = (count + (tpb - 1)) // tpb
            bispecbuf = cp.zeros(count, dtype=complex)
            binormbuf = cp.zeros(count, dtype=float)
            countbuf = cp.zeros(count, dtype=cp.int16)
            compute_point(
                (bpg, ), (tpb, ),
                (k1ind, k2ind, *kcoords, cp.int64(nk1), cp.int64(nk2),
                 *shape, samp, cp.int64(count),
                 bispecbuf, binormbuf, countbuf, *ffts))
            # Reduce the per-point buffers.
            N = countbuf.sum()
            value = bispecbuf.sum()
            norm = binormbuf.sum()
            # Store symmetrically.
            bispec[i, j], bispec[j, i] = value, value
            binorm[i, j], binorm[j, i] = norm, norm
            omega[i, j], omega[j, i] = nk1 * nk2, nk1 * nk2
            counts[i, j], counts[j, i] = N, N
            # Release the per-pair buffers before the next iteration.
            del bispecbuf, binormbuf, countbuf, samp
            mempool.free_all_blocks()
            pinned_mempool.free_all_blocks()
        if progress:
            _printProgressBar(i, dim - 1)
    return bispec.get(), binorm.get(), omega, counts.get()
def cleanup(self):
    """Drop the cached eigenvalue arrays and, on CuPy, empty the pools."""
    self.eigs = None
    self.m_eigs = None
    if self.xp is not cupy:
        return
    device_pool = cupy.get_default_memory_pool()
    host_pool = cupy.get_default_pinned_memory_pool()
    for pool in (device_pool, host_pool):
        pool.free_all_blocks()
def cleanup(self):
    """Clean up and remove ``gtoep``, clear ``diag`` and, on CuPy, empty the pools."""
    self.gtoep.cleanup()
    del self.gtoep
    self.diag = None
    if self.xp is not cupy:
        return
    device_pool = cupy.get_default_memory_pool()
    host_pool = cupy.get_default_pinned_memory_pool()
    for pool in (device_pool, host_pool):
        pool.free_all_blocks()
def get_dataset(self, raw_data, shape, batch_size):
    """Build a ConcatDataset from ``raw_data`` in slices of ``batch_size``.

    Each slice is divided by 256 and converted to a float tensor. In "train"
    mode the weights from ``cal_weight`` are attached as a second tensor.
    Unused CuPy pool blocks are released after every slice.
    """
    n_rows = shape[0]
    pool = cp.get_default_memory_pool()
    pieces = []
    for start in range(0, n_rows, batch_size):
        print(start)
        stop = min(n_rows, start + batch_size)
        batch = raw_data[start:stop]
        if self.mode == "train":
            gpu_weight = self.cal_weight(batch, batch.shape)
            weight = cp.asnumpy(gpu_weight)
            scaled = torch.from_numpy(batch / 256).float()
            piece = Data.TensorDataset(scaled, torch.from_numpy(weight).float())
            pieces.append(piece)
            del gpu_weight
        else:
            scaled = torch.from_numpy(batch / 256).float()
            pieces.append(Data.TensorDataset(scaled))
        pool.free_all_blocks()
    return Data.ConcatDataset(pieces)
def __eq__(self, other): return isinstance(other, DummyDeviceType) def __ne__(self, other): return not (self == other) DummyDevice = DummyDeviceType() # ------------------------------------------------------------------------------ # Global states # ------------------------------------------------------------------------------ if available: # This is for backward compatibility memory_pool = cupy.get_default_memory_pool() pinned_memory_pool = cupy.get_default_pinned_memory_pool() _integer_types = six.integer_types + (numpy.integer,) # ------------------------------------------------------------------------------ # Device # ------------------------------------------------------------------------------ class GpuDevice(_backend.Device): def __init__(self, device): check_cuda_available() assert isinstance(device, Device)