def main(model_dir, train_dir, dev_dir,
         is_runtime=False,
         nr_hidden=64, max_length=100,  # Shape
         dropout=0.5, learn_rate=0.001,  # General NN config
         nb_epoch=5, batch_size=32, nr_examples=-1):  # Training params
    """Evaluate a stored model or train a new one, depending on ``is_runtime``.

    Args:
        model_dir: Model location; handed to ``evaluate`` in runtime mode.
            It is not used by the training branch.
        train_dir: Directory read with ``read_data`` for training examples.
        dev_dir: Directory read with ``read_data`` for development examples.
        is_runtime: When true, only evaluate on the dev data and print the
            value returned by ``evaluate``.
        nr_hidden: Forwarded to ``train`` as ``nr_hidden``.
        max_length: Forwarded to ``train`` and ``evaluate`` as ``max_length``.
        dropout: Forwarded to ``train`` as the ``dropout`` setting.
        learn_rate: Forwarded to ``train`` as ``lr``.
        nb_epoch: Forwarded to ``train``.
        batch_size: Forwarded to ``train``.
        nr_examples: Passed to ``read_data`` as ``limit`` when training.
    """
    model_dir = pathlib.Path(model_dir)
    train_dir = pathlib.Path(train_dir)
    dev_dir = pathlib.Path(dev_dir)

    if is_runtime:
        dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
        return

    print("Read data")
    train_texts, train_labels = read_data(train_dir, limit=nr_examples)
    dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
    print("Using GPU 0")
    # NOTE: explicit device selection is currently disabled:
    # chainer.cuda.get_device(0).use()
    train_labels = xp.asarray(train_labels, dtype='i')
    dev_labels = xp.asarray(dev_labels, dtype='i')

    shape_settings = {
        'nr_hidden': nr_hidden,
        'max_length': max_length,
        'nr_class': 2,
        'nr_vector': 2000,
        'nr_dim': 32,
    }
    # Honour the caller's ``dropout`` instead of a hard-coded 0.5.
    nn_settings = {'dropout': dropout, 'lr': learn_rate}
    train(train_texts, train_labels, dev_texts, dev_labels,
          shape_settings, nn_settings, {},
          nb_epoch=nb_epoch, batch_size=batch_size)
def _convert_array(xs, array_module):
    """Return ``xs`` as NumPy data, CuPy data, or a random mix of both.

    Args:
        xs: Input arrays. Iterated element-wise in the mixed case.
        array_module: ``'all_numpy'`` returns ``xs`` untouched,
            ``'all_cupy'`` converts ``xs`` with ``cupy.asarray``; any other
            value converts each element independently with probability 1/2.

    Returns:
        ``xs`` itself, a ``cupy.ndarray``, or a list mixing original and
        converted elements.
    """
    if array_module == 'all_numpy':
        return xs
    if array_module == 'all_cupy':
        return cupy.asarray(xs)
    # ``randint(0, 2)`` draws from {0, 1}; it replaces the deprecated
    # ``random_integers(0, 1)`` which had an inclusive upper bound.
    return [cupy.asarray(x) if numpy.random.randint(0, 2) else x
            for x in xs]
def _array_to_gpu(array, device, stream):
    """Move one array-like onto ``device``, optionally under ``stream``.

    Args:
        array: ``None``, a ChainerX array, a NumPy scalar/array, an
            ``intel64.mdarray`` or an ``ndarray`` (device array).
        device: Target device; used as a context manager and compared with
            ``array.device``.
        stream: Optional stream, entered as a context manager around the
            transfer when it is not ``None``.

    Returns:
        ``None`` for a ``None`` input, ``array`` itself when it is already a
        device array on ``device``, otherwise the result of
        ``cupy.asarray`` (host input) or ``cupy.array(..., copy=True)``
        (device input).

    Raises:
        TypeError: If the input is none of the supported kinds.
    """
    if array is None:
        return None

    if isinstance(array, chainerx.ndarray):
        # TODO(niboshi): Update this logic once both CuPy and ChainerX support
        # the array interface.
        if array.device.backend.name != 'cuda':
            array = chainerx.to_numpy(array)
        else:
            # Wrap the ChainerX buffer as a cupy.ndarray on the same device
            # as the source array.
            unowned = cupy.cuda.UnownedMemory(
                array.data_ptr + array.offset,
                array.data_size,
                array,
                array.device.index)
            array = cupy.ndarray(
                array.shape,
                array.dtype,
                cupy.cuda.MemoryPointer(unowned, 0),
                strides=array.strides)
    elif isinstance(array, (numpy.number, numpy.bool_)):
        array = numpy.asarray(array)
    elif isinstance(array, intel64.mdarray):
        array = numpy.asarray(array)

    if isinstance(array, ndarray):
        if array.device == device:
            return array
        from_host = False
    elif isinstance(array, numpy.ndarray):
        from_host = True
    else:
        raise TypeError(
            'The array sent to gpu must be an array or a NumPy scalar.'
            '\nActual type: {0}.'.format(type(array)))

    def transfer():
        if from_host:
            return cupy.asarray(array)
        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)

    with device:
        if stream is None:
            return transfer()
        with stream:
            return transfer()
def _fftconv(a, b, axes=(0, 1)):
    """Patched version of :func:`sporco.linalg.fftconv`.

    Multiplies the transforms of ``a`` and ``b`` taken over ``axes`` and
    transforms the product back. Real transforms are used when both inputs
    are real, complex transforms otherwise.

    Args:
        a: First input array.
        b: Second input array.
        axes: Axes over which the transforms are taken.

    Returns:
        The inverse transform of the product; its size along each entry of
        ``axes`` is the larger of the two input sizes on that axis.
    """
    if cp.isrealobj(a) and cp.isrealobj(b):
        fft, ifft = cp.fft.rfftn, cp.fft.irfftn
    else:
        fft, ifft = cp.fft.fftn, cp.fft.ifftn
    # Shapes are plain Python ints, so the per-axis maximum is taken on the
    # host rather than by building a device array and reading it back.
    dims = [max(a.shape[i], b.shape[i]) for i in axes]
    af = fft(a, dims, axes)
    bf = fft(b, dims, axes)
    return ifft(af * bf, dims, axes)
def to_gpu(array, device=None, stream=None):
    """Send ``array`` to the GPU selected by ``device``.

    Args:
        array: Array to be sent to GPU.
        device: Device specifier, resolved through ``get_device``.
        stream (cupy.cuda.Stream): CUDA stream. Only ``None`` is accepted;
            any other value trips the assertion below.

    Returns:
        cupy.ndarray: Array on GPU. An input whose device id is neither
        ``-1`` nor the current device id is duplicated with
        ``cupy.array(..., copy=True)``; every other input is passed through
        ``cupy.asarray``.

    """
    check_cuda_available()
    assert stream is None  # TODO(beam2d): FIX IT
    with get_device(device):
        source_id = int(get_device(array))
        no_cross_device_copy = (
            source_id == -1
            or source_id == cupy.cuda.device.get_device_id())
        if no_cross_device_copy:
            return cupy.asarray(array)
        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)
def setup_method(self, method):
    """Build the fixture arrays ``U``, ``V`` and ``D`` on the instance.

    ``U`` is a 32x32x32 array of ones whose first half along axis 1 is set
    to -1, ``V`` is seeded Gaussian noise scaled by 0.1, and ``D`` is their
    sum.

    Args:
        method: Unused; present for the test-framework hook signature.
    """
    np.random.seed(12345)
    N = 32
    self.U = cp.ones((N, N, N))
    # N is an int, so plain floor division replaces the ``old_div`` shim.
    self.U[:, 0:(N // 2), :] = -1
    self.V = 1e-1 * cp.asarray(np.random.randn(N, N, N))
    self.D = self.U + self.V
def _list2array(lst):
    """Convert a list to a single ``cp`` array.

    A non-empty list whose first element is a ``cp.ndarray`` is joined with
    ``cp.hstack``; anything else is handed to ``cp.asarray``.
    """
    holds_arrays = bool(lst) and isinstance(lst[0], cp.ndarray)
    return cp.hstack(lst) if holds_arrays else cp.asarray(lst)
def check(self, func, n, gen, *args):
    """Compare ``func`` with its ``cupy.fuse``-wrapped version.

    The comparison is done twice: once on the generated inputs as they
    are, and once on those inputs converted with ``cupy.asarray``.

    Args:
        func: Function under test.
        n: Number of inputs; also passed to ``cupy.fuse`` as ``input_num``.
        gen: Either one generator callable (called ``n`` times with
            ``*args``) or a tuple of callables, each paired with the
            matching entry of ``args`` as its argument tuple.
        *args: Generator arguments, see ``gen``.
    """

    @cupy.fuse(input_num=n)
    def f(*x):
        return func(*x)

    def as_list(result):
        # Only an exact tuple is unpacked; any other result is wrapped.
        return list(result) if type(result) == tuple else [result]

    if type(gen) == tuple:
        host_inputs = [g(*a) for _, g, a in zip(range(n), list(gen), args)]
    else:
        host_inputs = [gen(*args) for _ in range(n)]

    plain_host = as_list(func(*host_inputs))
    fused_host = as_list(f(*host_inputs))
    for expected, fused in zip(plain_host, fused_host):
        numpy.testing.assert_array_almost_equal(expected, fused)

    device_inputs = [cupy.asarray(x) for x in host_inputs]
    plain_device = as_list(func(*device_inputs))
    fused_device = as_list(f(*device_inputs))
    for expected, plain, fused in zip(plain_host, plain_device, fused_device):
        numpy.testing.assert_array_almost_equal(expected, plain.get())
        numpy.testing.assert_array_almost_equal(expected, fused.get())
def to_cupy(array):  # pragma: no cover
    """Convert a ``np.ndarray`` with ``cupy.asarray``; pass anything else through.

    CuPy is imported on every call, before the type check.
    """
    import cupy
    return cupy.asarray(array) if isinstance(array, np.ndarray) else array
def _get_labelled_sentences(self, docs, doc_labels):
    """Flatten documents into sentences, repeating each document's label.

    Args:
        docs: Iterable of objects exposing a ``sents`` iterable.
        doc_labels: Iterable of labels, paired with ``docs`` by position.

    Returns:
        tuple: ``(sentences, labels)`` where ``sentences`` is a list of all
        sentences in document order and ``labels`` is an ``xp`` array of
        dtype ``'i'`` holding one label per sentence.
    """
    sentences = []
    labels = []
    for doc, label in izip(docs, doc_labels):
        doc_sents = list(doc.sents)
        sentences.extend(doc_sents)
        labels.extend([label] * len(doc_sents))
    return sentences, xp.asarray(labels, dtype='i')
def to_gpu(array, device=None, stream=None):
    """Copies the given array to the specified device.

    Args:
        array: Array to be sent to GPU.
        device: Device specifier, resolved through ``_get_device``.
        stream (cupy.cuda.Stream): Deprecated. Any non-``None`` value emits
            a ``DeprecationWarning``; when its ``ptr`` is non-zero the copy
            is issued on that stream.

    Returns:
        cupy.ndarray: ``array`` itself when its device id equals the
        current device id inside the device context; otherwise a new array
        holding the same data.

    """
    check_cuda_available()
    with _get_device(device):
        source_dev = get_device_from_array(array)
        if source_dev.id == cupy.cuda.device.get_device_id():
            return array

        if stream is None:
            on_stream = False
        else:
            warnings.warn(
                'The stream option is deprecated in chainer.cuda.to_gpu. '
                'Please remove it.', DeprecationWarning)
            on_stream = stream.ptr != 0

        if not on_stream:
            if source_dev.id == -1:
                return cupy.asarray(array)
            # Need to make a copy when an array is copied to another device
            return cupy.array(array, copy=True)

        dst = cupy.empty_like(array)
        if source_dev.id == -1:
            # cpu to gpu: stage the data in pinned host memory first
            pinned = cupy.cuda.alloc_pinned_memory(array.nbytes)
            staged = numpy.frombuffer(
                pinned, array.dtype, array.size).reshape(array.shape)
            staged[...] = array
            dst.set(staged, stream)
            cupy.cuda.pinned_memory._add_to_watch_list(
                stream.record(), pinned)
        else:
            # gpu to gpu
            with source_dev:
                staged = array.copy()
                copied = Stream.null.record()
            stream.wait_event(copied)
            dst.data.copy_from_device_async(
                staged.data, staged.nbytes, stream)

            # to hold a reference until the end of the asynchronous
            # memcpy
            stream.add_callback(lambda *x: None, (staged, dst))
        return dst
def asanyarray(a, dtype=None):
    """Converts an object to array.

    Delegates directly to ``cupy.asarray`` with the same arguments.

    .. seealso:: :func:`cupy.asarray`, :func:`numpy.asanyarray`

    """
    return cupy.asarray(a, dtype=dtype)
def check_usv(self, array, dtype):
    """Check that ``cupy.linalg.svd`` matches ``numpy.linalg.svd``.

    Both decompositions use ``self.full_matrices``. They must return the
    same number of factors, and corresponding factors must agree in
    absolute value within ``atol=1e-4``.

    Args:
        array: Input data, converted with ``asarray`` on each backend.
        dtype: dtype used for both conversions.
    """
    full = self.full_matrices
    factors_cpu = numpy.linalg.svd(
        numpy.asarray(array, dtype=dtype), full_matrices=full)
    factors_gpu = cupy.linalg.svd(
        cupy.asarray(array, dtype=dtype), full_matrices=full)

    self.assertEqual(len(factors_cpu), len(factors_gpu))
    for factor_cpu, factor_gpu in zip(factors_cpu, factors_gpu):
        # Use abs to support an inverse vector
        cupy.testing.assert_allclose(
            numpy.abs(factor_cpu), cupy.abs(factor_gpu), atol=1e-4)
def check_reduce(self, func, n, reduce_f, gen, *args):
    """Check a fused reduction gives the same result on both input kinds.

    ``func`` is wrapped with ``cupy.fuse(input_num=n, reduce=reduce_f)``
    and evaluated on ``n`` generated inputs and on the same inputs
    converted with ``cupy.asarray``.

    Args:
        func: Function to fuse.
        n: Number of inputs.
        reduce_f: Passed to ``cupy.fuse`` as ``reduce``.
        gen: Generator callable, invoked ``n`` times with ``*args``.
        *args: Arguments for ``gen``.
    """

    @cupy.fuse(input_num=n, reduce=reduce_f)
    def f(*x):
        return func(*x)

    host_inputs = [gen(*args) for _ in range(n)]
    host_result = f(*host_inputs)

    device_inputs = [cupy.asarray(x) for x in host_inputs]
    device_result = f(*device_inputs)

    numpy.testing.assert_array_almost_equal(host_result, device_result.get())
def check_mode(self, array, mode, dtype):
    """Check that ``cupy.linalg.qr`` matches ``numpy.linalg.qr`` for ``mode``.

    Every returned factor must have the same dtype on both backends and
    agree within ``atol=1e-4``.

    Args:
        array: Input data, converted with ``asarray`` on each backend.
        mode: QR mode forwarded to both implementations.
        dtype: dtype used for both conversions.
    """
    expected = numpy.linalg.qr(numpy.asarray(array, dtype=dtype), mode=mode)
    actual = cupy.linalg.qr(cupy.asarray(array, dtype=dtype), mode=mode)

    if not isinstance(expected, tuple):
        self.assertEqual(expected.dtype, actual.dtype)
        cupy.testing.assert_allclose(expected, actual, atol=1e-4)
        return

    for part_cpu, part_gpu in six.moves.zip(expected, actual):
        self.assertEqual(part_cpu.dtype, part_gpu.dtype)
        cupy.testing.assert_allclose(part_cpu, part_gpu, atol=1e-4)
def asanyarray(a, dtype=None):
    """Converts an object to array.

    Currently the same as :func:`~cupy.asarray`: CuPy defines no subclass
    of ndarray, so there is nothing extra to pass through. By contrast, the
    original :func:`numpy.asanyarray` returns its input unchanged when that
    input is an instance of a subtype of numpy.ndarray.

    .. seealso:: :func:`cupy.asarray`, :func:`numpy.asanyarray`

    """
    converted = cupy.asarray(a, dtype)
    return converted
def _array_to_gpu(array, device, stream):
    """Copy one array onto the current device, optionally on ``stream``.

    Args:
        array: ``None``, a NumPy scalar, an ``intel64.mdarray``, a
            ``numpy.ndarray`` or a ``cupy.ndarray``.
        device: Must be ``DummyDevice`` or a ``Device`` instance; it is only
            checked by the leading assertion.
        stream: Optional CUDA stream. The stream path is taken only when it
            is not ``None`` and its ``ptr`` is non-zero.

    Returns:
        ``None`` for a ``None`` input, ``array`` itself when its device id
        equals the current device id, otherwise a new ``cupy.ndarray``.

    Raises:
        TypeError: If the input is not an array or a NumPy scalar.
    """
    assert device is DummyDevice or isinstance(device, Device)
    if array is None:
        return None

    # Scalars and iDeep arrays are first normalised to numpy.ndarray.
    if isinstance(array, (numpy.number, numpy.bool_)):
        array = numpy.asarray(array)
    elif isinstance(array, intel64.mdarray):
        array = numpy.asarray(array)

    if not isinstance(array, (cupy.ndarray, numpy.ndarray)):
        raise TypeError(
            'The array sent to gpu must be an array or a NumPy scalar.'
            '\nActual type: {0}.'.format(type(array)))

    source_dev = get_device_from_array(array)
    if source_dev.id == cupy.cuda.device.get_device_id():
        return array

    on_stream = stream is not None and stream.ptr != 0
    if not on_stream:
        if source_dev.id == -1:
            return cupy.asarray(array)
        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)

    dst = cupy.empty_like(array)
    if source_dev.id == -1:
        # cpu to gpu: stage the data in pinned host memory first
        pinned = cupy.cuda.alloc_pinned_memory(array.nbytes)
        staged = numpy.frombuffer(
            pinned, array.dtype, array.size).reshape(array.shape)
        staged[...] = array
        dst.set(staged, stream)
        cupy.cuda.pinned_memory._add_to_watch_list(
            stream.record(), pinned)
    else:
        # gpu to gpu
        with source_dev:
            staged = array.copy()
            copied = Stream.null.record()
        stream.wait_event(copied)
        dst.data.copy_from_device_async(
            staged.data, staged.nbytes, stream)

        # to hold a reference until the end of the asynchronous
        # memcpy
        stream.add_callback(lambda *x: None, (staged, dst))
    return dst
def ix_(*args):
    """Construct an open mesh from multiple sequences.

    Given N 1-D sequences, returns N arrays of N dimensions each. The k-th
    output has length 1 on every axis except axis k, where it holds the
    k-th sequence, so together the outputs index the cross product of the
    inputs.

    ``a[cupy.ix_([1,3],[2,5])]`` returns the array
    ``[[a[1,2] a[1,5]], [a[3,2] a[3,5]]]``.

    Args:
        *args: 1-D sequences. Boolean sequences are replaced by the indices
            of their non-zero entries; empty sequences are cast to
            ``numpy.intp``.

    Returns:
        tuple of ndarrays:
        N arrays with N dimensions each, with N the number of input
        sequences. Together these arrays form an open mesh.

    Raises:
        ValueError: If any input is not one-dimensional.

    Examples
    --------
    >>> a = cupy.arange(10).reshape(2, 5)
    >>> a
    array([[0, 1, 2, 3, 4],
           [5, 6, 7, 8, 9]])
    >>> ixgrid = cupy.ix_([0,1], [2,4])
    >>> ixgrid
    (array([[0],
           [1]]), array([[2, 4]]))

    .. seealso:: :func:`numpy.ix_`

    """
    ndim_out = len(args)
    mesh = []
    for axis, seq in enumerate(args):
        idx = cupy.asarray(seq)
        if idx.ndim != 1:
            raise ValueError("Cross index must be 1 dimensional")
        if idx.size == 0:
            # Explicitly type empty arrays to avoid float default
            idx = idx.astype(numpy.intp)
        if cupy.issubdtype(idx.dtype, cupy.bool_):
            idx, = idx.nonzero()
        # Length-1 on every axis except this sequence's own axis.
        shape = [1] * ndim_out
        shape[axis] = idx.size
        mesh.append(idx.reshape(tuple(shape)))
    return tuple(mesh)
def to_gpu(array, device=None, stream=None):
    """Copies the given array to the specified device.

    Args:
        array: Array to be sent to GPU.
        device: Device specifier, resolved through ``get_device``.
        stream (cupy.cuda.Stream): CUDA stream. If not ``None``, the copy
            is issued on this stream.

    Returns:
        cupy.ndarray: ``array`` itself when its device id equals the
        current device id inside the device context; otherwise a new array
        holding the same data.

    """
    check_cuda_available()
    with get_device(device):
        source_dev = get_device(array)
        if source_dev.id == cupy.cuda.device.get_device_id():
            return array

        if stream is None:
            if source_dev.id == -1:
                return cupy.asarray(array)
            # Need to make a copy when an array is copied to another device
            return cupy.array(array, copy=True)

        dst = cupy.empty_like(array)
        if source_dev.id == -1:
            # cpu to gpu
            staged = array.copy(order='C')
            dst.set(staged, stream)
        else:
            # gpu to gpu
            with source_dev:
                staged = array.copy()
            dst.data.copy_from_device_async(
                staged.data, staged.nbytes, stream)

        # to hold a reference until the end of the asynchronous memcpy
        stream.add_callback(lambda *x: None, (staged, dst))
        return dst
def to_gpu(array, device=None, stream=None):
    """Copies the given CPU array to specified device.

    Args:
        array: Array to be sent to GPU.
        device: Device specifier, resolved through ``get_device``.
        stream (cupy.cuda.Stream): CUDA stream. Only ``None`` is accepted;
            any other value trips the assertion below.

    Returns:
        cupy.ndarray: The result of ``cupy.asarray(array)`` evaluated inside
        the device context.

    """
    _check_cuda_available()
    assert stream is None  # TODO(beam2d): FIX IT
    with get_device(device):
        gpu_array = cupy.asarray(array)
    return gpu_array
def affine(volume: np.ndarray,
           transform_m: np.ndarray,
           interpolation: str = 'linear',
           reshape: bool = False,
           profile: bool = False,
           output=None,
           device: str = 'cpu'):
    """Apply an affine transform to ``volume`` on the CPU or a GPU.

    Args:
        volume: Input volume.
        transform_m: Transformation matrix.
        interpolation: ``'linear'`` selects spline order 1 on the CPU path,
            anything else order 3. Names starting with ``'filt_bspline'``
            enable prefiltering on both paths.
        reshape: When true, the output dimensions come from
            ``utils.compute_post_transform_dimensions`` and the transform is
            adjusted by the returned ``pad_before`` offset.
        profile: When true, print the elapsed time in milliseconds.
        output: Optional destination. On the CPU path it is forwarded to
            ``affine_transform``; on the GPU path it replaces the working
            array handed to the kernel.
        device: ``'cpu'`` or a name starting with ``'gpu'``; must be listed
            in ``AVAILABLE_DEVICES``.

    Returns:
        CPU path: ``output`` if it was given, otherwise the transformed
        volume. GPU path: ``volume.get()`` when ``output`` is ``None``,
        otherwise ``None``.

    Raises:
        ValueError: If ``device`` is unknown or has no implementation.
    """
    if device not in AVAILABLE_DEVICES:
        raise ValueError(
            f'Unknown device ({device}), must be one of {AVAILABLE_DEVICES}')

    on_cpu = device == 'cpu'
    if not on_cpu and not device.startswith('gpu'):
        raise ValueError(f'No instructions for {device}.')

    if on_cpu:
        if profile:
            t_start = time.time()

        # set parameters for scipy affine transform
        order = 1 if interpolation == 'linear' else 3
        prefilter = interpolation.startswith('filt_bspline')

        if reshape:
            pad_before, pad_after, output_shape = \
                utils.compute_post_transform_dimensions(
                    volume.shape, transform_m)
            # scipy will take care of padding in this case, but the
            # pad_before offset must be folded into transform_m to get the
            # full volume
            transform_m = np.dot(
                transform_m,
                translation_matrix(pad_before, transform_m.dtype))
        else:
            output_shape = volume.shape

        # run affine transformation
        output_vol = affine_transform(volume,
                                      transform_m,
                                      output_shape=output_shape,
                                      output=output,
                                      order=order,
                                      prefilter=prefilter)

        if profile:
            t_end = time.time()
            time_took = (t_end - t_start) * 1000
            print(f'transform finished in {time_took:.3f}ms')

        return output_vol if output is None else output

    # ---- GPU path ----
    utils.switch_to_device(device)

    if profile:
        stream = cp.cuda.Stream.null
        t_start = stream.record()

    if reshape:
        pad_before, pad_after, output_shape = \
            utils.compute_post_transform_dimensions(
                volume.shape, transform_m)
        # manually pad volume
        volume = np.pad(volume,
                        list(zip(pad_before, pad_after)),
                        mode='constant')
        # include pad_before offset: first apply offset, then apply
        # negative offset
        transform_m = (translation_matrix(-1 * pad_before)
                       @ transform_m
                       @ translation_matrix(pad_before))

    volume = cp.asarray(volume)
    volume_shape = volume.shape

    # texture setup
    runtime = cp.cuda.runtime
    texture = cp.cuda.texture
    ch = texture.ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    # CUDAArray: last dimension=fastest changing dimension
    arr = texture.CUDAarray(ch, *volume_shape[::-1])
    res = texture.ResourceDescriptor(runtime.cudaResourceTypeArray,
                                     cuArr=arr)
    tex = texture.TextureDescriptor(
        (runtime.cudaAddressModeBorder,
         runtime.cudaAddressModeBorder,
         runtime.cudaAddressModeBorder),
        runtime.cudaFilterModeLinear,
        runtime.cudaReadModeElementType)
    texobj = texture.TextureObject(res, tex)

    # prefilter if required and upload to texture
    if interpolation.startswith('filt_bspline'):
        volume = _bspline_prefilter(volume)
    arr.copy_from(volume)

    # kernel setup
    kernel = _get_transform_kernel(interpolation)
    dims = cp.asarray(volume_shape, dtype=cp.uint32)
    xform = cp.asarray(transform_m)
    dim_grid, dim_blocks = utils.compute_elementwise_launch_dims(
        volume_shape)

    if output is None:
        volume.fill(0.0)  # reuse input array
    else:
        volume = output

    kernel(dim_grid, dim_blocks, (volume, texobj, xform, dims))

    if profile:
        t_end = stream.record()
        t_end.synchronize()
        time_took = cp.cuda.get_elapsed_time(t_start, t_end)
        print(f'transform finished in {time_took:.3f}ms')

    del texobj, xform, dims
    return volume.get() if output is None else None
def test_compare_xp_gpu(self):
    """Compare ``xp_ex2d_patch`` on device arrays against ``ex2d_patch``.

    Both the ``'signal'`` and ``'noise'`` decorrelation methods are run;
    flux, inverse variance, the diagonal of R and the flux pull must agree
    within tolerances expressed in multiples of float64 machine epsilon.
    """
    eps_double = np.finfo(np.float64).eps
    flux_rtol = 1e5 * eps_double
    ivar_rtol = 1e3 * eps_double
    rdiag_rtol = 1e2 * eps_double

    gpu_inputs = (cp.asarray(self.noisyimg),
                  cp.asarray(self.imgivar),
                  cp.asarray(self.A4))

    def run_both(decorrelate):
        # Returns ((flux, ivar, R) on CPU, (flux, ivar, R) copied back).
        cpu = ex2d_patch(self.noisyimg, self.imgivar, self.A4,
                         decorrelate=decorrelate)
        flux_gpu, ivar_gpu, R_gpu = xp_ex2d_patch(*gpu_inputs,
                                                  decorrelate=decorrelate)
        return cpu, (cp.asnumpy(flux_gpu),
                     cp.asnumpy(ivar_gpu),
                     cp.asnumpy(R_gpu))

    def pull(flux_a, ivar_a, flux_b, ivar_b):
        return np.abs(flux_a - flux_b) / np.sqrt(1. / ivar_a + 1. / ivar_b)

    # Compare the "signal" decorrelation method
    (flux0, ivar0, R0), (flux1, ivar1, R1) = run_both('signal')

    where = np.where(~np.isclose(flux0, flux1, rtol=flux_rtol, atol=0))
    np.testing.assert_allclose(flux0, flux1, rtol=flux_rtol, atol=0,
                               err_msg=f"where: {where}")
    self.assertTrue(np.allclose(ivar0, ivar1, rtol=ivar_rtol, atol=0))
    self.assertTrue(np.allclose(np.diag(R0), np.diag(R1),
                                rtol=rdiag_rtol, atol=1e3 * eps_double))
    self.assertTrue(np.allclose(pull(flux0, ivar0, flux1, ivar1),
                                np.zeros_like(flux0)))

    # Compare the "noise" decorrelation method
    (flux0, ivar0, R0), (flux1, ivar1, R1) = run_both('noise')

    self.assertTrue(np.allclose(flux0, flux1, rtol=flux_rtol, atol=0))
    self.assertTrue(np.allclose(ivar0, ivar1, rtol=ivar_rtol, atol=0))
    self.assertTrue(np.allclose(np.diag(R0), np.diag(R1),
                                rtol=rdiag_rtol, atol=0))
    self.assertTrue(np.allclose(pull(flux0, ivar0, flux1, ivar1),
                                np.zeros_like(flux0)))
def evaluate(self):
    """Return the Monte-Carlo reweighted sum of ``self.predictions``.

    The fiducial samples are reweighted by the ratio of the population
    mass, spin and redshift densities to the stored fiducial densities.
    The weighted predictions are summed and divided by ``self.N_inj``.
    ``self.f`` is closed before returning.

    Returns:
        float: The reweighted estimate.
    """
    self.load_from_file()

    # Perform MC reweighting
    if _GPU_ENABLED:
        import cupy as xp
    else:
        import numpy as xp

    binaries = self.fiducial_binaries
    mass_keys = ("mass_1_source", "mass_2_source")
    spin_keys = ("spin_1x", "spin_1y", "spin_1z",
                 "spin_2x", "spin_2y", "spin_2z")

    # Move data to GPU if needed
    mass_samples = {key: xp.asarray(binaries[key]) for key in mass_keys}
    spin_samples = {key: xp.asarray(binaries[key]) for key in spin_keys}
    pdf_mass_fiducial = xp.asarray(self.pdf_mass_fiducial)
    pdf_spin_fiducial = xp.asarray(self.pdf_spin_fiducial)

    pdf_mass_pop = self.mass_src_pop_model.prob(mass_samples, axis=0)
    weights_mass = pdf_mass_pop / pdf_mass_fiducial

    pdf_spin_pop = self.spin_src_pop_model.prob(spin_samples)
    weights_spin = pdf_spin_pop / pdf_spin_fiducial

    weights_source = weights_mass * weights_spin

    redshift_dist = NotLensedSourceRedshiftProbDist(
        merger_rate_density=self.merger_rate_density_src_pop_model,
        optical_depth=self.optical_depth)
    pdf_z_fiducial = self.pdf_z_fiducial
    pdf_z_pop = redshift_dist.prob(self.fiducial_z)
    # NOTE p_z still uses CPU-only code
    weights_z = xp.asarray(pdf_z_pop / pdf_z_fiducial)

    predictions = xp.asarray(self.predictions)
    weighted_total = xp.sum(predictions * weights_source * weights_z)
    alpha = weighted_total.astype(float) / (float(self.N_inj))

    self.f.close()

    # NOTE If using numpy, alpha is a scalar but if using cupy, alpha is a
    # 0-d array
    return float(alpha)
def svd_wrapper(matrix, mode, ncomp, verbose, full_output=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : numpy ndarray, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used.

        ``lapack``: uses the LAPACK linear algebra library through Numpy
        and it is the most conventional way of computing the SVD
        (deterministic result computed on CPU).

        ``arpack``: uses the ARPACK Fortran libraries accessible through
        Scipy (computation on CPU).

        ``eigen``: computes the singular vectors through the
        eigendecomposition of the covariance M.M' (computation on CPU).

        ``randsvd``: uses the randomized_svd algorithm implemented in
        Sklearn (computation on CPU).

        ``cupy``: uses the Cupy library for GPU computation of the SVD as in
        the LAPACK version.

        ``eigencupy``: offers the same method as with the ``eigen`` option
        but on GPU (through Cupy).

        ``randcupy``: is an adaptation of the randomized_svd algorithm,
        where all the computations are done on a GPU (through Cupy).

        ``pytorch``: uses the Pytorch library for GPU computation of the SVD.

        ``eigenpytorch``: offers the same method as with the ``eigen``
        option but on GPU (through Pytorch).

        ``randpytorch``: is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU
        (through Pytorch).

    ncomp : int
        Number of singular vectors to be obtained. In the cases when the full
        SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of singular
        vectors is truncated.
    verbose: bool
        If True intermediate information is printed out.
    full_output : bool optional
        If True the 3 terms of the SVD factorization are returned. If ``mode``
        is eigen then only S and V are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number generator.
        If RandomState instance, random_state is the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : numpy ndarray
        The right singular vectors of the input matrix. If ``full_output`` is
        True it returns the left and right singular vectors and the singular
        values of the input matrix. If ``mode`` is set to eigen then only S and
        V are returned.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2d.
    RuntimeError
        If ``ncomp`` exceeds the smaller matrix dimension, or the library
        required by ``mode`` is not installed.
    ValueError
        If ``mode`` is not recognized.

    References
    ----------
    * For ``lapack`` SVD mode see:
     https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
     http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
     https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
     https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
     http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
     https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
     Finding structure with randomness: Stochastic algorithms for constructing
     approximate matrix decompositions
     Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
     https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
     https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
     http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
     http://pytorch.org/docs/master/torch.html#torch.eig

    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)    # covariance matrix
        e, EV = linalg.eigh(C)          # EVals and EVs
        pc = np.dot(EV.T, matrix)       # PCs using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))          # SVals = sqrt(EVals)
        S = S[::-1]                     # reverse since EVals go in increasing order
        # scaling EVs by the square root of EVals: one broadcast division
        # of every row by its singular value instead of a per-column loop
        V /= S[:, np.newaxis]
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        V = V[:ncomp]       # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        # cupy.array already places the data on the current device, so no
        # second conversion is needed
        a_gpu = cupy.array(matrix)
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if full_output:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        # cupy.array already places the data on the current device
        a_gpu = cupy.array(matrix)
        C = cupy.dot(a_gpu, a_gpu.T)    # covariance matrix
        e, EV = cupy.linalg.eigh(C)     # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)      # using a compact trick when cov is MM'
        V = pc[::-1]                    # reverse to get last eigenvectors
        S = cupy.sqrt(e)[::-1]          # reverse since EVals go in increasing order
        # scaling by the square root of eigvals, as one broadcast division
        # instead of one kernel launch per column
        V /= S[:, cupy.newaxis]
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        # same row-wise scaling as the per-column loop, done in one call
        V /= S.unsqueeze(1)
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD `mode` is not recognized')

    if full_output:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        elif mode in ('eigen', 'eigencupy', 'eigenpytorch'):
            return S, V
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
def preprocess(ctx):
    """Write the preprocessed recording to ``ctx.intermediate.proc_path``.

    Port of ``rez = preprocessDataSub(ops)``. The raw data is processed in
    overlapping batches and the result is written to a new binary file.
    The following steps are applied:

    1) conversion to float32
    2) common median subtraction
    3) bandpass filtering
    4) channel whitening
    5) scaling to int16 values

    Steps 2 and 3 are delegated to ``gpufilter``; whitening is the product
    with ``ctx.intermediate.Wrot``.

    Args:
        ctx: Context exposing ``params``, ``probe``, ``raw_data`` and
            ``intermediate``.
    """
    params = ctx.params
    probe = ctx.probe
    raw_data = ctx.raw_data
    ir = ctx.intermediate

    fs, fshigh, fslow = params.fs, params.fshigh, params.fslow
    Nbatch = ir.Nbatch
    NT = params.NT
    NTbuff = params.NTbuff

    Wrot = cp.asarray(ir.Wrot)

    logger.info("Loading raw data and applying filters.")

    with open(ir.proc_path, 'wb') as fw:  # open for writing processed data
        for ibatch in tqdm(range(Nbatch), desc="Preprocessing"):
            # The output file consists of batches of NT samples which
            # overlap consecutively on params.ntbuff samples. Another
            # params.ntbuff samples are read before and after each batch to
            # serve as buffers for filtering.

            # number of samples to start reading at.
            start = max(0, (NT - params.ntbuff) * ibatch - 2 * params.ntbuff)
            # The very first batch has no pre-buffer, and has to be treated
            # separately
            ioffset = 0 if ibatch == 0 else params.ntbuff

            buff = raw_data[:, start:start + NTbuff]
            if buff.size == 0:
                logger.error("Loaded buffer has an empty size!")
                # this shouldn't really happen, unless we counted data
                # batches wrong
                break

            # how many time samples the current batch has
            nsampcurr = buff.shape[1]
            if nsampcurr < NTbuff:
                # Short batch: append copies of its last sample.
                last_sample = buff[:, nsampcurr - 1][:, np.newaxis]
                padding = np.tile(last_sample, (1, NTbuff))
                buff = np.concatenate((buff, padding), axis=1)

            # apply filters and median subtraction
            buff = cp.asarray(buff, dtype=np.float32)
            filtered = gpufilter(buff, chanMap=probe.chanMap, fs=fs,
                                 fshigh=fshigh, fslow=fslow)

            # remove timepoints used as buffers
            filtered = filtered[ioffset:ioffset + NT, :]
            # whiten the data and scale by 200 for int16 range
            whitened = cp.dot(filtered, Wrot)

            # convert to int16, and gather on the CPU side
            datcpu = cp.asnumpy(whitened).astype(np.int16)

            # write this batch to binary file
            fw.write(datcpu.tobytes('F'))
def to_gpu(x):
    """Return ``x`` unchanged if it is exactly a ``cupy.ndarray``, else convert it.

    CuPy is imported on every call; conversion uses ``cupy.asarray``.
    """
    import cupy
    return x if type(x) == cupy.ndarray else cupy.asarray(x)
y = np.arange(0, int(col), 1) z = np.arange(0, int(sta), 1) X, Y, Z = np.meshgrid(x, y, z) OSS_alpha = float(row) OSS_alpha_step = (float(row) - 1.0 / float(row)) / ( float(iteration) / float(OSS_interval) - 1.0) #重心計算関係 cx = cp.arange(0, int(row), 1) cy = cp.arange(0, int(col), 1) cz = cp.arange(0, int(sta), 1) #numpy配列 ⇒ cupy配列に変換 cp_diff_amp = cp.asarray(np_diff_amp, dtype="float32") cp_sup = cp.asarray(np_sup, dtype="float32") cp_initial_dens = cp.asarray(np_initial_dens, dtype="float32") cp_dens = cp.asarray(np_dens) print("iteration scale_factor Rfactor OS_ratio gamma") with open(log_path, mode='a') as log: log.write("iteration scale_factor Rfactor OS_ratio gamma") for i in range(int(iteration) + int(additional_iteration)): cp_structure_factor = cp.fft.fftn(cp_dens, axes=(0, 1, 2), norm="ortho") #【フーリエ変換】 cp_structure_factor = cp.fft.fftshift( cp_structure_factor) #fftshiftを使ってシフト cp_amp = cp.absolute(cp_structure_factor) #絶対値をとる
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False,
                random_state=None, to_numpy=True):
    """ Wrapper for different SVD libraries (CPU and GPU).

    Parameters
    ----------
    matrix : array_like, 2d
        2d input matrix.
    mode : {'lapack', 'arpack', 'eigen', 'randsvd', 'cupy', 'eigencupy',
            'randcupy', 'pytorch', 'eigenpytorch', 'randpytorch'}, str optional
        Switch for the SVD method/library to be used. ``lapack`` uses the
        LAPACK linear algebra library through Numpy and it is the most
        conventional way of computing the SVD (deterministic result computed
        on CPU). ``arpack`` uses the ARPACK Fortran libraries accessible
        through Scipy (computation on CPU). ``eigen`` computes the singular
        vectors through the eigendecomposition of the covariance M.M'
        (computation on CPU). ``randsvd`` uses the randomized_svd algorithm
        implemented in Sklearn (computation on CPU). ``cupy`` uses the Cupy
        library for GPU computation of the SVD as in the LAPACK version.
        ``eigencupy`` offers the same method as with the ``eigen`` option but
        on GPU (through Cupy). ``randcupy`` is an adaptation of the
        randomized_svd algorithm, where all the computations are done on a
        GPU (through Cupy). ``pytorch`` uses the Pytorch library for GPU
        computation of the SVD. ``eigenpytorch`` offers the same method as
        with the ``eigen`` option but on GPU (through Pytorch).
        ``randpytorch`` is an adaptation of the randomized_svd algorithm,
        where all the linear algebra computations are done on a GPU (through
        Pytorch).
    ncomp : int
        Number of singular vectors to be obtained. In the cases when the
        full SVD is computed (LAPACK, ARPACK, EIGEN, CUPY), the matrix of
        singular vectors is truncated.
    debug : bool
        If True the explained variance ratio is computed and displayed.
    verbose: bool
        If True intermediate information is printed out.
    usv : bool optional
        If True the 3 terms of the SVD factorization are returned.
    random_state : int, RandomState instance or None, optional
        If int, random_state is the seed used by the random number
        generator. If RandomState instance, random_state is the random
        number generator. If None, the random number generator is the
        RandomState instance used by np.random. Used for ``randsvd`` mode.
    to_numpy : bool, optional
        If True (by default) the arrays computed in GPU are transferred from
        VRAM and converted to numpy ndarrays.

    Returns
    -------
    V : array_like
        The right singular vectors of the input matrix. If ``usv`` is True
        it returns the left and right singular vectors and the singular
        values of the input matrix.

    Raises
    ------
    TypeError
        If ``matrix`` is not 2d.
    ValueError
        If ``usv`` is requested for a mode that does not support it, or if
        ``mode`` is not one of the listed options.
    RuntimeError
        If ``ncomp`` exceeds the smallest dimension of ``matrix``, or if a
        Cupy/Pytorch mode is requested while the module-level ``no_cupy`` /
        ``no_torch`` flag is set.

    References
    ----------
    * For ``lapack`` SVD mode see:
      https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
      http://www.netlib.org/lapack/
    * For ``eigen`` mode see:
      https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.eigh.html
    * For ``arpack`` SVD mode see:
      https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html
      http://www.caam.rice.edu/software/ARPACK/
    * For ``randsvd`` SVD mode see:
      https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py
      Finding structure with randomness: Stochastic algorithms for
      constructing approximate matrix decompositions
      Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * For ``cupy`` SVD mode see:
      https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.linalg.svd.html
    * For ``eigencupy`` mode see:
      https://docs-cupy.chainer.org/en/master/reference/generated/cupy.linalg.eigh.html
    * For ``pytorch`` SVD mode see:
      http://pytorch.org/docs/master/torch.html#torch.svd
    * For ``eigenpytorch`` mode see:
      http://pytorch.org/docs/master/torch.html#torch.eig

    """
    def reconstruction(ncomp, U, S, V, var=1):
        # Debug helper: prints reconstruction errors (where U and V are
        # available) and plots the individual / cumulative explained
        # variance ratio. It reads ``mode`` and ``matrix`` from the
        # enclosing scope.
        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            # The lapack branch decomposes matrix.T, hence the transpose back
            rec_matrix = rec_matrix.T
            print(' Matrix reconstruction with {} PCs:'.format(ncomp))
            print(' Mean Absolute Error =', MAE(matrix, rec_matrix))
            print(' Mean Squared Error =', MSE(matrix, rec_matrix))

            # see https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode == 'eigen':
            # Only S is used here; the eigen branch passes U=None, V=None
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.sum(exp_var)
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            # NOTE(review): 'eigencupy' and 'eigenpytorch' call this helper
            # with U=None and V=None and end up in this branch, where
            # np.dot(U, ...) is evaluated -- looks like debug=True cannot
            # work for those modes; confirm.
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print(' Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
            exp_var = (S ** 2) / (S.shape[0] - 1)
            full_var = np.var(matrix, axis=0).sum()
            explained_variance_ratio = exp_var / full_var  # % of variance explained by each PC
            if var == 1:
                pass
            else:
                # The arpack branch passes var=-1 to get the ratios reversed
                explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = ' This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        # Left panel: all components; right panel: first ``trunc`` components
        lw = 2; alpha = 0.4
        fig = plt.figure(figsize=vip_figsize)
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(range(explained_variance_ratio.shape[0]),
                 explained_variance_ratio, alpha=alpha, where='mid',
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha,
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0] + 10)
        ax1.set_ylim(0, 1)

        trunc = 20
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        # plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(range(trunc), explained_variance_ratio[:trunc], alpha=alpha,
                 where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = ' Cumulative explained variance ratio for {} PCs = {:.5f}'
        # plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))

    # --------------------------------------------------------------------------

    # Input validation
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')

    if usv:
        # The eigen-decomposition modes only produce V, so USV is rejected
        if mode not in ('lapack', 'arpack', 'randsvd', 'cupy', 'randcupy',
                        'pytorch', 'randpytorch'):
            msg = "Returning USV is supported with modes lapack, arpack, "
            msg += "randsvd, cupy, randcupy, pytorch or randpytorch"
            raise ValueError(msg)

    if ncomp > min(matrix.shape[0], matrix.shape[1]):
        msg = '{} PCs cannot be obtained from a matrix with size [{},{}].'
        msg += ' Increase the size of the patches or request less PCs'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))

    if mode == 'eigen':
        # building C as np.dot(matrix.T,matrix) is slower and takes more memory
        C = np.dot(matrix, matrix.T)  # covariance matrix
        e, EV = linalg.eigh(C)  # EVals and EVs
        pc = np.dot(EV.T, matrix)  # PCs using a compact trick when cov is MM'
        V = pc[::-1]  # reverse since we need the last EVs
        S = np.sqrt(np.abs(e))  # SVals = sqrt(EVals)
        S = S[::-1]  # reverse since EVals go in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S  # scaling EVs by the square root of EVals
        V = V[:ncomp]
        if verbose:
            print('Done PCA with numpy linalg eigh functions')

    elif mode == 'lapack':
        # n_frames is usually smaller than n_pixels. In this setting taking
        # the SVD of M' and keeping the left (transposed) SVs is faster than
        # taking the SVD of M (right SVs)
        U, S, V = linalg.svd(matrix.T, full_matrices=False)
        if debug:
            reconstruction(ncomp, U, S, V)
        V = V[:ncomp]  # we cut projection matrix according to the # of PCs
        U = U[:, :ncomp]
        S = S[:ncomp]
        if verbose:
            print('Done SVD/PCA with numpy SVD (LAPACK)')

    elif mode == 'arpack':
        U, S, V = svds(matrix, k=ncomp)
        if debug:
            reconstruction(ncomp, U, S, V, -1)
        if verbose:
            print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode == 'randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2,
                                 transpose='auto', random_state=random_state)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done SVD/PCA with randomized SVD')

    elif mode == 'cupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        # NOTE(review): cupy.array already yields a device array, so the
        # following cupy.asarray looks redundant -- confirm before removing.
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        u_gpu, s_gpu, vh_gpu = cupy.linalg.svd(a_gpu, full_matrices=True,
                                               compute_uv=True)
        V = vh_gpu[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        # S and U are only truncated/transferred when they will be returned
        if usv:
            S = s_gpu[:ncomp]
            if to_numpy:
                S = cupy.asnumpy(S)
            U = u_gpu[:, :ncomp]
            if to_numpy:
                U = cupy.asnumpy(U)
        if verbose:
            print('Done SVD/PCA with cupy (GPU)')

    elif mode == 'randcupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='cupy')
        if to_numpy:
            V = cupy.asnumpy(V)
            S = cupy.asnumpy(S)
            U = cupy.asnumpy(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with cupy (GPU)')

    elif mode == 'eigencupy':
        if no_cupy:
            raise RuntimeError('Cupy is not installed')
        a_gpu = cupy.array(matrix)
        a_gpu = cupy.asarray(a_gpu)  # move the data to the current device
        C = cupy.dot(a_gpu, a_gpu.T)  # covariance matrix
        e, EV = cupy.linalg.eigh(C)  # eigenvalues and eigenvectors
        pc = cupy.dot(EV.T, a_gpu)  # PCs using a compact trick when cov is MM'
        V = pc[::-1]  # reverse since last eigenvectors are the ones we want
        S = cupy.sqrt(e)[::-1]  # reverse since eigenvalues are in increasing order
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S  # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if to_numpy:
            V = cupy.asnumpy(V)
        if verbose:
            print('Done PCA with cupy eigh function (GPU)')

    elif mode == 'pytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        # As in the lapack branch, the transposed matrix is decomposed
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32').T))
        u_gpu, s_gpu, vh_gpu = torch.svd(a_gpu)
        V = vh_gpu[:ncomp]
        S = s_gpu[:ncomp]
        U = torch.transpose(u_gpu, 0, 1)[:ncomp]
        # NOTE(review): np.array is applied to tensors created on the GPU;
        # confirm this conversion works without an explicit host transfer.
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if verbose:
            print('Done SVD/PCA with pytorch (GPU)')

    elif mode == 'eigenpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        a_gpu = torch.Tensor.cuda(torch.from_numpy(matrix.astype('float32')))
        C = torch.mm(a_gpu, torch.transpose(a_gpu, 0, 1))
        e, EV = torch.eig(C, eigenvectors=True)
        V = torch.mm(torch.transpose(EV, 0, 1), a_gpu)
        S = torch.sqrt(e[:, 0])
        if debug:
            reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:, i] /= S
        V = V[:ncomp]
        if to_numpy:
            V = np.array(V)
        if verbose:
            print('Done PCA with pytorch eig function')

    elif mode == 'randpytorch':
        if no_torch:
            raise RuntimeError('Pytorch is not installed')
        U, S, V = randomized_svd_gpu(matrix, ncomp, n_iter=2, lib='pytorch')
        if to_numpy:
            V = np.array(V)
            S = np.array(S)
            U = np.array(U)
        if debug:
            reconstruction(ncomp, U, S, V)
        if verbose:
            print('Done randomized SVD/PCA with randomized pytorch (GPU)')

    else:
        raise ValueError('The SVD mode is not available')

    # 'lapack' and 'pytorch' decomposed the transposed matrix, so their
    # factors are swapped/transposed on the way out.
    if usv:
        if mode == 'lapack':
            return V.T, S, U.T
        elif mode == 'pytorch':
            if to_numpy:
                return V.T, S, U.T
            else:
                return torch.transpose(V, 0, 1), S, torch.transpose(U, 0, 1)
        else:
            return U, S, V
    else:
        if mode == 'lapack':
            return U.T
        elif mode == 'pytorch':
            return U
        else:
            return V
def unique(ar, return_index=False, return_inverse=False,
           return_counts=False, axis=None, *, equal_nan=True):
    """Find the unique elements of an array.

    Returns the sorted unique elements of an array. Up to three extra
    outputs can be requested:

    * the indices of the input array that give the unique values
    * the indices of the unique array that reconstruct the input array
    * the number of times each unique value comes up in the input array

    Args:
        ar(array_like): Input array. It is flattened before processing.
        return_index(bool, optional): If True, also return the indices of
            the flattened `ar` that result in the unique array.
        return_inverse(bool, optional): If True, also return the indices of
            the unique array that can be used to reconstruct the flattened
            `ar`.
        return_counts(bool, optional): If True, also return the number of
            times each unique item appears in `ar`.
        axis(int or None, optional): Not supported yet.
        equal_nan(bool, optional): If True, collapse multiple NaN values in
            the return array into one.

    Returns:
        cupy.ndarray or tuple:
            Without optional outputs, the :class:`cupy.ndarray` of sorted
            unique values. Otherwise a tuple starting with the sorted unique
            values, followed (in this order) by whichever of these were
            requested:

            * The indices of the first occurrences of the unique values in
              the original array (`return_index`).
            * The indices to reconstruct the original array from the unique
              array (`return_inverse`).
            * The number of times each of the unique values comes up in the
              original array (`return_counts`).

    Raises:
        NotImplementedError: If `axis` is not None.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.unique`
    """
    if axis is not None:
        raise NotImplementedError('axis option is not supported yet.')

    flat = cupy.asarray(ar).flatten()

    # The sorting permutation is only needed for the index-based outputs;
    # otherwise the flattened copy is sorted in place.
    if return_index or return_inverse:
        perm = flat.argsort()
        aux = flat[perm]
    else:
        flat.sort()
        aux = flat

    # mask[k] is True where aux[k] starts a new run of equal values.
    mask = cupy.empty(aux.shape, dtype=cupy.bool_)
    mask[:1] = True
    mask[1:] = aux[1:] != aux[:-1]
    if equal_nan:
        _unique_update_mask_equal_nan(mask[1:], aux[:-1])

    values = aux[mask]
    if not (return_index or return_inverse or return_counts):
        return values

    outputs = [values]
    if return_index:
        outputs.append(perm[mask])
    if return_inverse:
        run_id = cupy.cumsum(mask) - 1
        inv_idx = cupy.empty(mask.shape, dtype=cupy.intp)
        inv_idx[perm] = run_id
        outputs.append(inv_idx)
    if return_counts:
        starts = cupy.nonzero(mask)[0]  # may synchronize
        bounds = cupy.empty((starts.size + 1,), starts.dtype)
        bounds[:-1] = starts
        bounds[-1] = mask.size
        outputs.append(bounds[1:] - bounds[:-1])
    return tuple(outputs)
out_h, out_w = h - kh + 1 + ph * 2, w - kw + 1 + pw * 2 # TODO elif mode == 'valid': ph, pw = 0, 0 out_h, out_w = h - kh + 1, w - kw + 1 # TODO else: raise NotImplementedError y = cp.empty((n, out_c, out_h, out_w), dtype=in1.dtype) col = im2col_gpu(in1, kh, kw, 1, 1, ph, pw) y = cp.tensordot(col, in2, ((1, 2, 3), (1, 2, 3))).astype(in1.dtype, copy=False) y = cp.rollaxis(y, 3, 1) return y.transpose(2, 3, 0, 1) if __name__ == '__main__': import cupy as cp import numpy as np from scipy.signal import convolve a = np.random.randn(5, 5, 5, 1) + 1j * np.random.randn(5, 5, 5, 1) b = np.random.randn(3, 3, 1, 1) + 1j * np.random.randn(3, 3, 1, 1) y_cpu = convolve(a, b, 'valid') x = cp.asarray(a) w = cp.asarray(b) y_gpu = convolve2d(x, w, 'valid') np.allclose(y_gpu.get().squeeze(), y_cpu.squeeze(), atol=1e-6)
volume = np.repeat(volume, args.slice, axis=2) w, h, z = volume.shape # convert to a tensor b = c = 1 x = volume.reshape(b, c, w, h, z).astype(np.float32) # reshape the tensor: [b(=1),c(=1),w,h,z] -> [b*z(=z),c(=1),w,h] x = x.transpose(0, 4, 1, 2, 3) x = x.reshape(b * z, c, w, h) # to gpu if args.gpu >= 0: import cupy as xp x = xp.asarray(x) x = chainer.Variable(x) print(x.shape) # do radon = Radon(theta=np.linspace(0, 180, args.angle)) if args.gpu >= 0: chainer.backends.cuda.get_device_from_id(args.gpu).use() radon.to_gpu() import tqdm for _ in tqdm.tqdm(range(args.trial)): ret = radon(x) print(ret.shape)
def histogramdd(sample, bins=10, range=None, weights=None, density=False):
    """
    Compute the multidimensional histogram of some data.

    Parameters
    ----------
    sample : (N, D) array, or (D, N) array_like
        The data to be histogrammed.

        Note the unusual interpretation of sample when an array_like:

        * When an array, each row is a coordinate in a D-dimensional space -
          such as ``histogramdd(cupy.array([p1, p2, p3]))``.
        * When an array_like, each element is the list of values for single
          coordinate - such as ``histogramdd((X, Y, Z))``.

        The first form should be preferred.

    bins : sequence or int, optional
        The bin specification:

        * A sequence of arrays describing the monotonically increasing bin
          edges along each dimension.
        * The number of bins for each dimension (nx, ny, ... =bins)
        * The number of bins for all dimensions (nx=ny=...=bins).

    range : sequence, optional
        A sequence of length D, each an optional (lower, upper) tuple giving
        the outer bin edges to be used if the edges are not given explicitly
        in `bins`.
        An entry of None in the sequence results in the minimum and maximum
        values being used for the corresponding dimension.
        The default, None, is equivalent to passing a tuple of D None values.
    weights : (N,) array_like, optional
        An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
        The values of the returned histogram are equal to the sum of the
        weights belonging to the samples falling into each bin.
    density : bool, optional
        If False, the default, returns the number of samples in each bin.
        If True, returns the probability *density* function at the bin,
        ``bin_count / sample_count / bin_volume``.

    Returns
    -------
    H : ndarray
        The multidimensional histogram of sample x. See `density` and
        `weights` for the different possible semantics.
    edges : list
        A list of D arrays describing the bin edges for each dimension.

    Raises
    ------
    ValueError
        If `bins` is a sequence whose length differs from D, if `range` does
        not have one entry per dimension, if an integer bin count is below
        1, if an edge array is not monotonically increasing, or if a
        `bins[i]` entry is neither a scalar nor 1d.
    RuntimeError
        If the final histogram shape does not match the bin counts.

    See Also
    --------
    histogram: 1-D histogram
    histogram2d: 2-D histogram

    Examples
    --------
    >>> r = cupy.random.randn(100,3)
    >>> H, edges = cupy.histogramdd(r, bins = (5, 8, 4))
    >>> H.shape, edges[0].size, edges[1].size, edges[2].size
    ((5, 8, 4), 6, 9, 5)

    """
    if isinstance(sample, cupy.ndarray):
        # Sample is an ND-array.
        if sample.ndim == 1:
            sample = sample[:, cupy.newaxis]
        nsamples, ndim = sample.shape
    else:
        # A sequence of per-dimension arrays: stack them as columns.
        sample = cupy.stack(sample, axis=-1)
        nsamples, ndim = sample.shape

    # nbin lives on the host (numpy); edges/dedges hold one array per axis.
    nbin = numpy.empty(ndim, int)
    edges = ndim * [None]
    dedges = ndim * [None]
    if weights is not None:
        weights = cupy.asarray(weights)

    try:
        nbins = len(bins)
        if nbins != ndim:
            raise ValueError(
                "The dimension of bins must be equal to the dimension of the "
                " sample x."
            )
    except TypeError:
        # bins is an integer
        bins = ndim * [bins]

    # normalize the range argument
    if range is None:
        range = (None,) * ndim
    elif len(range) != ndim:
        raise ValueError("range argument must have one entry per dimension")

    # Create edge arrays
    for i in _range(ndim):
        if cnp.ndim(bins[i]) == 0:
            # Scalar entry: an equal-width bin count for this dimension.
            if bins[i] < 1:
                raise ValueError(
                    "`bins[{}]` must be positive, when an integer".format(i)
                )
            smin, smax = _get_outer_edges(sample[:, i], range[i])
            num = int(bins[i] + 1)  # synchronize!
            edges[i] = cupy.linspace(smin, smax, num)
        elif cnp.ndim(bins[i]) == 1:
            # 1d entry: explicit bin edges for this dimension.
            edges[i] = cupy.asarray(bins[i])
            if (edges[i][:-1] > edges[i][1:]).any():
                raise ValueError(
                    "`bins[{}]` must be monotonically increasing, when an array".format(
                        i
                    )
                )
        else:
            raise ValueError(
                "`bins[{}]` must be a scalar or 1d array".format(i)
            )

        nbin[i] = len(edges[i]) + 1  # includes an outlier on each end
        dedges[i] = cupy.diff(edges[i])

    # Compute the bin number each sample falls into.
    ncount = tuple(
        # avoid cupy.digitize to work around gh-11022
        cupy.searchsorted(edges[i], sample[:, i], side="right")
        for i in _range(ndim)
    )

    # Using digitize, values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right edge to be
    # counted in the last bin, and not as an outlier.
    for i in _range(ndim):
        # Find which points are on the rightmost edge.
        on_edge = sample[:, i] == edges[i][-1]
        # Shift these points one bin to the left.
        ncount[i][on_edge] -= 1

    # Compute the sample indices in the flattened histogram matrix.
    # This raises an error if the array is too large.
    xy = cnp.ravel_multi_index(ncount, nbin)

    # Compute the number of repetitions in xy and assign it to the
    # flattened histmat.
    hist = cupy.bincount(xy, weights, minlength=numpy.prod(nbin))

    # Shape into a proper matrix
    hist = hist.reshape(nbin)

    # This preserves the (bad) behavior observed in gh-7845, for now.
    hist = hist.astype(float)  # Note: NumPy uses casting='safe' here too

    # Remove outliers (indices 0 and -1 for each dimension).
    core = ndim * (slice(1, -1),)
    hist = hist[core]

    if density:
        # calculate the probability density function
        s = hist.sum()
        for i in _range(ndim):
            # Broadcast the bin widths of dimension i along that axis only.
            shape = [1] * ndim
            shape[i] = nbin[i] - 2
            hist = hist / dedges[i].reshape(shape)
        hist /= s

    # Sanity check: after trimming, each axis must have nbin[i] - 2 bins.
    if any(hist.shape != numpy.asarray(nbin) - 2):
        raise RuntimeError("Internal Shape Error")
    return hist, edges
def _get_bin_edges(a, bins, range): """ Computes the bins used internally by `histogram`. Args: a (ndarray): Ravelled data array bins (int or ndarray): Forwarded argument from `histogram`. range (None or tuple): Forwarded argument from `histogram`. Returns: bin_edges (ndarray): Array of bin edges uniform_bins (Number, Number, int): The upper bound, lowerbound, and number of bins, used in the implementation of `histogram` that works on uniform bins. """ # parse the overloaded bins argument n_equal_bins = None bin_edges = None # if isinstance(bins, cupy.ndarray) and bins.ndim == 0: # # allow uint8 array, etc # if bins.dtype not in 'bui': # raise TypeError( # "`bins` must be an integer, a string, or an array") # bins = int(bins) # synchronize if isinstance(bins, int): # will not allow 0-dimensional cupy array # if cupy.ndim(bins) == 0: try: n_equal_bins = operator.index(bins) except TypeError: raise TypeError("`bins` must be an integer, a string, or an array") if n_equal_bins < 1: raise ValueError("`bins` must be positive, when an integer") first_edge, last_edge = _get_outer_edges(a, range) elif isinstance(bins, cupy.ndarray): if bins.ndim == 1: # cupy.ndim(bins) == 0: bin_edges = cupy.asarray(bins) if (bin_edges[:-1] > bin_edges[1:]).any(): # synchronize! raise ValueError( "`bins` must increase monotonically, when an array" ) elif isinstance(bins, str): raise NotImplementedError("only integer and array bins are implemented") if n_equal_bins is not None: # numpy's gh-10322 means that type resolution rules are dependent on # array shapes. To avoid this causing problems, we pick a type now and # stick with it throughout. 
bin_type = cupy.result_type(first_edge, last_edge, a) if cupy.issubdtype(bin_type, cupy.integer): bin_type = cupy.result_type(bin_type, float) # bin edges must be computed bin_edges = cupy.linspace( first_edge, last_edge, n_equal_bins + 1, endpoint=True, dtype=bin_type, ) return bin_edges, (first_edge, last_edge, n_equal_bins) else: return bin_edges, None
def evolve(model):
    """Train ``model`` with early stopping, plot accuracy curves and save it.

    This is Python 2 code (``print`` statements, ``xrange``) and is driven
    almost entirely by module-level names: it reads ``N_train``,
    ``N_validate``, ``N_test``, ``batchsize``, ``n_epoch``, ``x_train``,
    ``y_train``, ``x_test``, ``y_test``, ``optimizer``, ``version`` and
    ``starttime``, and appends to ``version_loss``, ``version_acc``,
    ``all_train_acc`` and ``all_test_acc``.

    Side effects: writes several ``graph_*`` PNG files, moves the model to
    the CPU, and saves the model and optimizer with
    ``serializers.save_hdf5``.

    Returns the mean test accuracy of the last evaluated epoch
    (``sum_test_accuracy / N_test``).
    """
    # Plot init
    plt.style.use('ggplot')

    # Per-epoch history for this call
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    # presumably both operands are ints, so this is Python 2 floor division
    # -- TODO confirm
    n_train_batches = N_train / batchsize

    # early stopping
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = np.inf
    test_score = 0
    done_looping = False

    # Learning loop
    epoch = 0
    while (epoch < n_epoch) and (not done_looping):
        epoch = epoch + 1
        print 'epoch {}'.format(epoch)

        # training
        perm = np.random.permutation(N_train)
        sum_train_accuracy = 0
        sum_train_loss = 0
        for i in xrange(0, N_train, batchsize):
            x = chainer.Variable(cp.asarray(x_train[perm[i:i + batchsize]]))
            t = chainer.Variable(cp.asarray(y_train[perm[i:i + batchsize]]))

            # Pass the loss function (Classifier defines it) and its arguments
            optimizer.update(model, x, t)

            # Weight by the batch length so the epoch mean is per-sample
            sum_train_loss += float(model.loss.data) * len(t.data)
            sum_train_accuracy += float(model.accuracy.data) * len(t.data)

            # generate network graph
            # if epoch == 1 and i == 0:
            #     with open('netgraph.dot', 'w') as o:
            #         g = computational_graph.build_computational_graph(
            #             (model.loss, ), remove_split=True)
            #         o.write(g.dump())
            #     print 'net graph generated'

            # validation
            batch_index = (i / batchsize)
            # NOTE(review): ``iter`` shadows the builtin of the same name
            iter = (epoch - 1) * n_train_batches + batch_index
            if (iter + 1) % validation_frequency == 0:
                sum_validate_accuracy = 0
                sum_validate_loss = 0
                # NOTE(review): this loop reuses ``i`` and slices
                # x_test/y_test while bounded by N_validate -- confirm a
                # separate validation set is not intended.
                for i in xrange(0, N_validate, batchsize):
                    x = chainer.Variable(cp.asarray(x_test[i:i + batchsize]), volatile='on')
                    t = chainer.Variable(cp.asarray(y_test[i:i + batchsize]), volatile='on')
                    loss = model(x, t)
                    sum_validate_loss += float(loss.data) * len(t.data)
                    sum_validate_accuracy += float(model.accuracy.data) * len(t.data)

                this_validate_loss = sum_validate_loss / N_validate
                this_validate_accuracy = sum_validate_accuracy / N_validate

                print 'validation epoch{}, minibatch{}/{}'.format(epoch, batch_index + 1, n_train_batches)
                print ' mean loss={}, accuracy={}'.format(
                    this_validate_loss, sum_validate_accuracy / N_validate)

                if this_validate_loss < best_validation_loss:
                    # Extend patience only when the improvement beats the
                    # relative threshold
                    if this_validate_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                        print " iter {} / patience {}".format(iter+1, patience)
                    best_validation_loss = this_validate_loss

            # Stop training once the iteration count reaches the patience
            if patience <= iter:
                done_looping = True
                break

        train_loss.append(sum_train_loss / N_train)
        train_acc.append(sum_train_accuracy / N_train)

        print 'train mean loss={}, accuracy={}'.format(
            sum_train_loss / N_train, sum_train_accuracy / N_train)

        # evaluation
        sum_test_accuracy = 0
        sum_test_loss = 0
        for i in xrange(0, N_test, batchsize):
            x = chainer.Variable(cp.asarray(x_test[i:i + batchsize]), volatile='on')
            t = chainer.Variable(cp.asarray(y_test[i:i + batchsize]), volatile='on')
            loss = model(x, t)
            sum_test_loss += float(loss.data) * len(t.data)
            sum_test_accuracy += float(model.accuracy.data) * len(t.data)

        test_loss.append(sum_test_loss / N_test)
        test_acc.append(sum_test_accuracy / N_test)
        print 'test mean loss={}, accuracy={}'.format(
            sum_test_loss / N_test, sum_test_accuracy / N_test)

    print 'train finish'
    print 'draw graph'
    # draw graph
    # NOTE(review): "%5d" pads the version number with spaces, so the file
    # names below can contain blanks -- confirm "%05d" was not intended.

    # Accuracy progression within this version
    plt.figure(figsize=(8,6))
    # plt.xlim([0, epoch])
    # plt.ylim([0.95, 1.0])
    plt.plot(xrange(1,len(train_acc)+1), train_acc)
    plt.plot(xrange(1,len(test_acc)+1), test_acc)
    plt.legend(["train_acc","test_acc"],loc=4)
    plt.title("Accuracy of digit recognition.")
    plt.plot()
    plt.savefig("graph_v%5d.png" % (version))
    plt.close()

    # Accuracy progression within this version, y-range [0.95, 1.0]
    plt.figure(figsize=(8,6))
    plt.xlim([0, epoch])
    plt.ylim([0.95, 1.0])
    plt.plot(xrange(1,len(train_acc)+1), train_acc)
    plt.plot(xrange(1,len(test_acc)+1), test_acc)
    plt.legend(["train_acc","test_acc"],loc=4)
    plt.title("Accuracy of digit recognition. range [0.95, 1.0]")
    plt.plot()
    plt.savefig("graph_095_v%5d.png" % (version))
    plt.close()

    # Change in accuracy across versions
    version_loss.append(best_validation_loss)
    version_acc.append(test_acc[-1])
    plt.figure(figsize=(8,6))
    # plt.ylim([0.50, 1.0])
    plt.plot(xrange(1,len(version_acc)+1), version_acc)
    plt.legend(["version_acc"],loc=4)
    plt.title("Accuracy of digit recognition. (version)")
    plt.plot()
    plt.savefig("graph_version_v%5d.png" % (version))
    plt.close()

    # Entire history so far (x-axis: epoch)
    all_train_acc.extend(train_acc)
    all_test_acc.extend(test_acc)
    plt.figure(figsize=(8,6))
    # plt.ylim([0.95, 1.0])
    plt.plot(xrange(1,len(all_train_acc)+1), all_train_acc)
    plt.plot(xrange(1,len(all_test_acc)+1), all_test_acc)
    plt.legend(["all_train_acc","all_test_acc"],loc=4)
    plt.title("Accuracy of digit recognition.")
    plt.plot()
    plt.savefig("graph_allepoch_v%5d.png" % (version))
    plt.close()

    # Entire history so far, y-range [0.95, 1.0]
    plt.figure(figsize=(8,6))
    plt.ylim([0.95, 1.0])
    plt.plot(xrange(1,len(all_train_acc)+1), all_train_acc)
    plt.plot(xrange(1,len(all_test_acc)+1), all_test_acc)
    plt.legend(["all_train_acc","all_test_acc"],loc=4)
    plt.title("Accuracy of digit recognition. range [0.95, 1.0]")
    plt.plot()
    plt.savefig("graph_allepoch095_v%5d.png" % (version))
    plt.close()
    plt.close('all')

    # Save the model and the optimizer
    print 'save the model'
    model.to_cpu()
    serializers.save_hdf5("v%5d.model" % (version), model)
    print 'save the optimizer'
    serializers.save_hdf5('v%5d.state' % (version), optimizer)

    finishtime = time.time()
    print 'execute time = {}'.format(finishtime - starttime)
    # plt.show()
    return sum_test_accuracy / N_test
def Allreduce_mean(self, x, **kwargs):
    """Multi-process multi-GPU based mean.

    ``x`` and ``kwargs`` are forwarded to ``self.pool.reduce_mean``; the
    result is converted to a NumPy array, passed through
    ``self.mpi.Allreduce``, divided by ``self.mpi.size`` and returned as a
    CuPy array.
    """
    local_result = self.pool.reduce_mean(x, **kwargs)
    # The MPI call is given a host (NumPy) array.
    host_result = cp.asnumpy(local_result)
    reduced = self.mpi.Allreduce(host_result)
    return cp.asarray(reduced / self.mpi.size)
def walsh_transform(self,keys=None):
    """Compute and store per-block products for the requested operators.

    For every key that is not already stored in the HDF5 file
    ``self.fname``, each of the four blocks of the ``'walsh_matrix'`` group
    is passed through the operator registered for that key, and the
    accumulated product ``B @ B.T`` of the result (built up in column
    chunks on the GPU) is written to ``<key>/<block>`` in the file.

    Args:
        keys: Iterable of operator names. Defaults to ``'kernel'`` plus all
            keys of ``self.constraints``.

    Side effects:
        Calls ``self._gen_walsh_matrix()``, prints progress, and rewrites
        the groups for the processed keys in ``self.fname``.
    """
    if keys is None:
        keys = ['kernel'] + list(self.constraints.keys())
    else:
        keys = keys
    # is_stored[key] is True when the file already holds results for key
    is_stored = dict()
    for key in keys:
        is_stored[key] = False
    if os.path.exists(self.fname):
        with h5py.File(self.fname,mode='r') as f:
            for key in keys:
                try:
                    # Presence of the last block ('3') marks a stored result
                    if '3' in f[key].keys():
                        is_stored[key] = True
                    if key == 'depth':
                        # Recompute 'depth' when the stored constraint
                        # differs from the current one by more than 1e-3
                        # in relative norm
                        res = f['depth']['constraint'][:] - self.constraints['depth']
                        res = np.linalg.norm(res)/np.linalg.norm(self.constraints['depth'])
                        if res > 1.0e-3:
                            is_stored[key] = False
                except KeyError:
                    continue
    self._gen_walsh_matrix()
    logn = int(np.ceil(np.log2(self._nx*self._ny*self._nz)))
    norm_walsh = 1./(np.sqrt(2)**logn)
    blocks = ['0','1','2','3']
    # Operator applied to each walsh block, selected by key
    matvec_op = {'kernel':self.kernel_op.gtoep.matvec,
                 'dx': lambda x: self._dxyzvec(x,key='dx'),
                 'dy': lambda x: self._dxyzvec(x,key='dy'),
                 'dz': lambda x: self._dxyzvec(x,key='dz'),
                 'refer': lambda x: self._diagvec(x,diag=self.constraints['refer']),
                 'depth': lambda x: self._diagvec(x,diag=np.sqrt(self.constraints['depth']))
                 }
    # 'refer' is always marked as stored, so it is never recomputed below
    is_stored['refer'] = True
    for key in keys:
        if is_stored[key]:
            print('walsh transformation of {} already exists.'.format(key))
            continue
        print('performing walsh transformation on {}.'.format(key))
        step = self.nx*self.ny*self.nz // 4
        if key == 'depth':
            step = self._nz
        with h5py.File(self.fname,mode='a') as f:
            # Drop any stale group for this key before recreating it
            try:
                del f[key]
            except KeyError:
                pass
            dxyz_group = f.create_group(key)
            walsh_group = f['walsh_matrix']
            for i in range(4):
                print("\t progress {}/4".format(i))
                part_walsh = walsh_group[blocks[i]][:]
                if key == 'depth':
                    part_walsh = walsh_group[blocks[i]][:self._nz]
                part_walsh = matvec_op[key](part_walsh)

                # NOTE(review): the GPU device id is hard-coded to 2 --
                # confirm this matches the deployment hardware.
                with cp.cuda.Device(2):
                    res = cp.zeros((step,step))
                    j = 0
                    # Accumulate B @ B.T over column chunks of width `step`
                    while j*step < part_walsh.shape[1]:
                        tmp_block_gpu = cp.asarray(part_walsh[:,j*step:(j+1)*step])
                        res += tmp_block_gpu @ tmp_block_gpu.T
                        j += 1
                    res = cp.asnumpy(res)
                    if key in self._smooth_components:
                        # Zero out entries below 0.1 * norm_walsh
                        res[np.abs(res)<1.0e-1*norm_walsh] = 0.
                    # Drop the last chunk reference and release pooled
                    # device and pinned memory
                    tmp_block_gpu = None
                    mempool = cp.get_default_memory_pool()
                    pinned_mempool = cp.get_default_pinned_memory_pool()
                    mempool.free_all_blocks()
                    pinned_mempool.free_all_blocks()
                dxyz_group.create_dataset(blocks[i],data=res)
    if ('depth' in keys) and (not is_stored['depth']):
        with h5py.File(self.fname,mode='a') as f:
            try:
                del f['depth_constraint']
            except KeyError:
                pass
            # Store the constraint used, for the staleness check above
            dxyz_group = f['depth']
            dxyz_group.create_dataset('constraint',data=self.constraints['depth'])
import cupy as cp import numpy as np from cupy import testing from skimage import data from cupyimg.skimage import color from cupyimg.skimage.util import img_as_bool from cupyimg.skimage.morphology import binary, grey, selem from cupyimg.scipy import ndimage as ndi import pytest img = color.rgb2gray(cp.asarray(data.astronaut())) bw_img = img > 100 / 255.0 def test_non_square_image(): strel = selem.square(3) binary_res = binary.binary_erosion(bw_img[:100, :200], strel) grey_res = img_as_bool(grey.erosion(bw_img[:100, :200], strel)) testing.assert_array_equal(binary_res, grey_res) def test_binary_erosion(): strel = selem.square(3) binary_res = binary.binary_erosion(bw_img, strel) grey_res = img_as_bool(grey.erosion(bw_img, strel)) testing.assert_array_equal(binary_res, grey_res) def test_binary_dilation():
def to_gpu(self, arr):
    """Return ``arr`` converted with ``cupy.asarray``."""
    gpu_arr = cupy.asarray(arr)
    return gpu_arr
def to_gpu(*args):
    """Upload numpy arrays to GPU and return them.

    Args:
        *args: One or more arrays (anything ``cp.asarray`` accepts).

    Returns:
        A single CuPy array when exactly one argument is given; otherwise a
        tuple with one CuPy array per argument, in the same order.

    Raises:
        IndexError: If called without any argument.
    """
    if len(args) > 1:
        # A tuple instead of a generator: the conversions run at call time
        # and the result can be indexed, measured and iterated repeatedly,
        # while ``a, b = to_gpu(x, y)`` keeps working as before.
        return tuple(cp.asarray(x) for x in args)
    return cp.asarray(args[0])
def corner_peaks(
    image,
    min_distance=1,
    threshold_abs=None,
    threshold_rel=None,
    exclude_border=True,
    indices=True,
    num_peaks=np.inf,
    footprint=None,
    labels=None,
    *,
    num_peaks_per_label=np.inf,
    p_norm=np.inf,
):
    """Find peaks in corner measure response image.

    This differs from `skimage.feature.peak_local_max` in that it suppresses
    multiple connected peaks with the same accumulator value.

    Parameters
    ----------
    image : ndarray
        Input image.
    min_distance : int, optional
        The minimal allowed distance separating peaks.
    * : *
        See :py:meth:`skimage.feature.peak_local_max`.
    p_norm : float
        Which Minkowski p-norm to use. Should be in the range [1, inf].
        A finite large p may cause a ValueError if overflow can occur.
        ``inf`` corresponds to the Chebyshev distance and 2 to the
        Euclidean distance.

    Returns
    -------
    output : ndarray or ndarray of bools

        * If `indices = True`  : (row, column, ...) coordinates of peaks.
        * If `indices = False` : Boolean array shaped like `image`, with peaks
          represented by True values.

    See also
    --------
    skimage.feature.peak_local_max

    Notes
    -----
    .. versionchanged:: 0.18
        The default value of `threshold_rel` has changed to None, which
        corresponds to letting `skimage.feature.peak_local_max` decide on the
        default. This is equivalent to `threshold_rel=0`.

    `peak_local_max` is called without a peak limit; the `num_peaks` limit is
    applied to the coordinates that remain after suppression of connected
    peaks.

    The suppression step runs on the host: coordinates are copied to NumPy,
    filtered with a SciPy KD-tree and copied back to the GPU.

    Examples
    --------
    >>> from cupyimg.skimage.feature import peak_local_max
    >>> response = cp.zeros((5, 5))
    >>> response[2:4, 2:4] = 1
    >>> response
    array([[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0., 0., 1., 1., 0.],
           [0., 0., 1., 1., 0.],
           [0., 0., 0., 0., 0.]])
    >>> peak_local_max(response)
    array([[2, 2],
           [2, 3],
           [3, 2],
           [3, 3]])
    >>> corner_peaks(response)
    array([[2, 2]])

    """
    # `num_peaks` is a host-side scalar, so test it with NumPy instead of
    # launching a device operation just to branch on the result.
    if np.isinf(num_peaks):
        num_peaks = None

    # Get the coordinates of the detected peaks
    coords = peak_local_max(
        image,
        min_distance=min_distance,
        threshold_abs=threshold_abs,
        threshold_rel=threshold_rel,
        exclude_border=exclude_border,
        num_peaks=np.inf,
        footprint=footprint,
        labels=labels,
        num_peaks_per_label=num_peaks_per_label,
    )

    if len(coords):
        # TODO: modify to do KDTree on the GPU (cuSpatial?)
        coords = cp.asnumpy(coords)

        # Use KDtree to find the peaks that are too close to each other
        tree = spatial.cKDTree(coords)

        # Visit peaks in the order returned by peak_local_max; every
        # not-yet-rejected peak rejects its neighbours within min_distance.
        rejected_peaks_indices = set()
        for idx, point in enumerate(coords):
            if idx not in rejected_peaks_indices:
                candidates = tree.query_ball_point(
                    point, r=min_distance, p=p_norm
                )
                # The query always contains the point itself.
                candidates.remove(idx)
                rejected_peaks_indices.update(candidates)

        # Remove the peaks that are too close to each other, then apply the
        # num_peaks limit to the survivors.
        coords = np.delete(coords, tuple(rejected_peaks_indices), axis=0)[
            :num_peaks
        ]
        coords = cp.asarray(coords)

    if indices:
        return coords

    peaks = cp.zeros_like(image, dtype=bool)
    peaks[tuple(coords.T)] = True
    return peaks
def deformation(self, prm):
    """Deform ``self.refl`` with a sum of 2D Gaussians plus a plane, on the GPU.

    A per-position shift is built from the Gaussian terms (``self.b``,
    ``self.c``, ``self.d``, ``self.sigma``, offset ``self.a``) and the planar
    term (``self.e``, ``self.f``, ``self.g``) evaluated at ``prm.xy``. Each of
    the first ``prm.nxy_tr`` rows of the reflectivity is then multiplied by a
    sinc matrix built from ``prm.z`` and that row's shift.

    Returns a NumPy array reshaped to ``[prm.nxy_tr, prm.nz_tr]``.
    """
    import cupy as cp

    # Move every input onto the device.
    coords = cp.asarray(prm.xy)
    offset = cp.asarray(self.a)
    amp = cp.asarray(self.b)
    centre_x = cp.asarray(self.c)
    centre_y = cp.asarray(self.d)
    width = cp.asarray(self.sigma)
    plane_e = cp.asarray(self.e)
    plane_f = cp.asarray(self.f)
    plane_g = cp.asarray(self.g)
    depth = cp.asarray(prm.z)

    xs = coords[:, 0]
    ys = coords[:, 1]

    func_gauss2d = cp.ElementwiseKernel(
        in_params='T x, T y, T b, T c, T d, T sigma',
        out_params='T z',
        operation=''' z = b*expf(-(powf(x-c,2) + powf(y-d,2))/(2*powf(sigma,2))); ''',
        name='func_gauss2d',
    )
    func_planar = cp.ElementwiseKernel(
        in_params='T x, T y, T e, T f, T g',
        out_params='T z',
        operation=''' z = e + f*x + g*y; ''',
        name='func_planar',
    )

    # Sum the contribution of every Gaussian component.
    gauss_sum = cp.zeros_like(xs)
    for k in range(len(self.b)):
        gauss_sum += func_gauss2d(xs, ys, amp[k], centre_x[k], centre_y[k], width[k])

    gauss_shift = offset + (1.5 / depth) * cp.outer(cp.transpose(gauss_sum), depth)
    planar_shift = func_planar(xs, ys, plane_e, plane_f, plane_g)

    refl = cp.asarray(self.refl)
    n_depth = len(depth)
    for row in range(prm.nxy_tr):
        shifted = gauss_shift[row, :] + planar_shift[row] + depth
        depth_grid = cp.tile(depth, (len(shifted), 1))
        shift_grid = cp.tile(cp.expand_dims(shifted, 1), (1, n_depth))
        refl[row, :] = cp.dot(refl[row, :], cp.sinc(depth_grid - shift_grid))

    host_refl = cp.asnumpy(refl)
    return np.reshape(host_refl, [prm.nxy_tr, prm.nz_tr])
def get_good_channels(raw_data=None, probe=None, params=None):
    """Flag channels whose threshold-crossing rate reaches a minimum value.

    A subset of batches (every ``ceil(Nbatch / 100)``-th one) is filtered on
    the GPU with ``gpufilter`` and standardised per column; local minima below
    ``params.spkTh`` are counted per channel and divided by the total duration
    of the sampled data.

    Parameters
    ----------
    raw_data : array-like
        Indexed as ``raw_data[:, i:i + NT]``; each slice must satisfy
        ``np.isfortran``.
    probe : object
        Only ``probe.chanMap`` is read.
    params : object
        Reads ``fs``, ``fshigh``, ``fslow``, ``NT``, ``spkTh``, ``nt0`` and
        ``minfr_goodchannels``.

    Returns
    -------
    igood : numpy.ndarray of bool
        One entry per entry of ``probe.chanMap``; True where the channel's
        crossing rate is at least ``params.minfr_goodchannels``.
    """
    fs = params.fs
    fshigh = params.fshigh
    fslow = params.fslow
    Nbatch = get_Nbatch(raw_data, params)
    NT = params.NT
    spkTh = params.spkTh
    nt0 = params.nt0
    minfr_goodchannels = params.minfr_goodchannels

    chanMap = probe.chanMap
    NchanTOT = len(chanMap)

    ich = []  # per-batch arrays of channel indices of detected crossings
    k = 0  # total number of crossings
    ttime = 0  # total duration of the processed data

    # Step through the batches so that roughly 100 of them are sampled.
    # NOTE(review): the step is 0 when Nbatch == 0, which makes range() raise
    # -- presumably callers never pass empty data; confirm.
    for ibatch in tqdm(range(0, Nbatch, int(ceil(Nbatch / 100))), desc="Finding good channels"):
        i = NT * ibatch
        buff = raw_data[:, i:i + NT]
        assert np.isfortran(buff)
        if buff.size == 0:
            break

        # Put on GPU.
        buff = cp.asarray(buff, dtype=np.float32)
        assert cp.isfortran(buff)
        datr = gpufilter(buff, chanMap=chanMap, fs=fs, fshigh=fshigh, fslow=fslow)

        # very basic threshold crossings calculation
        s = cp.std(datr, axis=0)
        datr = datr / s  # standardize each channel ( but don't whiten)
        mdat = my_min(datr, 30, 0)  # get local minima as min value in +/- 30-sample range

        # take local minima that cross the negative threshold
        xi, xj = cp.nonzero((datr < mdat + 1e-3) & (datr < spkTh))

        # filtering may create transients at beginning or end. Remove those.
        xj = xj[(xi >= nt0) & (xi <= NT - nt0)]

        # collect the channel identities for the detected spikes
        ich.append(xj)
        k += xj.size

        # keep track of total time where we took spikes from
        ttime += datr.shape[0] / fs

    ich = cp.concatenate(ich)

    # count how many spikes each channel got
    nc, _ = cp.histogram(ich, cp.arange(NchanTOT + 1))

    # divide by total time to get firing rate
    nc = nc / ttime

    # keep only those channels above the preset mean firing rate
    igood = cp.asnumpy(nc >= minfr_goodchannels)

    logger.info('Found %d threshold crossings in %2.2f seconds of data.' % (k, ttime))
    logger.info('Found %d/%d bad channels.' % (np.sum(~igood), len(igood)))

    return igood
def test_with_strides(self, dtype):
    """A transposed source wrapped in the CUDA array interface keeps its strides and size."""
    source = testing.shaped_arange((2, 3, 4), cupy, dtype).T
    wrapped = DummyObjectWithCudaArrayInterface(
        source, self.ver, self.strides)
    converted = cupy.asarray(wrapped)
    assert source.strides == converted.strides
    assert source.nbytes == converted.data.mem.size
def get_whitening_matrix(raw_data=None, probe=None, params=None):
    """Compute a channel whitening matrix from a subset of the data.

    Every ``params.nSkipCov``-th batch is temporally filtered on the GPU with
    ``gpufilter`` and its channel-by-channel product ``datr.T @ datr / NT`` is
    accumulated. The averaged matrix is turned into a whitening matrix by
    ``whiteningLocal`` (finite ``params.whiteningRange``) or
    ``whiteningFromCovariance`` and scaled by ``params.scaleproc``.

    Parameters
    ----------
    raw_data : array-like
        Indexed as ``raw_data[:, start:stop]``.
    probe : object
        Reads ``xc``, ``yc``, ``chanMap`` and ``Nchan``.
    params : object
        Reads ``ntbuff``, ``NTbuff``, ``whiteningRange``, ``scaleproc``,
        ``NT``, ``fs``, ``fshigh`` and ``nSkipCov``.

    Returns
    -------
    Wrot
        The whitening matrix multiplied by ``params.scaleproc``.
    """
    Nbatch = get_Nbatch(raw_data, params)
    ntbuff = params.ntbuff
    NTbuff = params.NTbuff
    whiteningRange = params.whiteningRange
    scaleproc = params.scaleproc
    NT = params.NT
    fs = params.fs
    fshigh = params.fshigh
    nSkipCov = params.nSkipCov

    xc = probe.xc
    yc = probe.yc
    chanMap = probe.chanMap
    Nchan = probe.Nchan
    # Duplicate of the assignment two lines above; harmless.
    chanMap = probe.chanMap

    # Nchan is obtained after the bad channels have been removed
    CC = cp.zeros((Nchan, Nchan))

    for ibatch in tqdm(range(0, Nbatch, nSkipCov), desc="Computing the whitening matrix"):
        # WARNING: we use Fortran order, so raw_data is NchanTOT x nsamples
        i = max(0, (NT - ntbuff) * ibatch - 2 * ntbuff)
        buff = raw_data[:, i:i + NT - ntbuff]

        nsampcurr = buff.shape[1]
        if nsampcurr < NTbuff:
            # Pad by repeating the last sample.
            # NOTE(review): this appends NTbuff columns, not
            # NTbuff - nsampcurr, so the padded buffer is longer than NTbuff
            # -- confirm this is intended.
            buff = np.concatenate(
                (buff, np.tile(buff[:, nsampcurr - 1][:, np.newaxis], (1, NTbuff))), axis=1)

        buff_g = cp.asarray(buff, dtype=np.float32)

        # apply filters and median subtraction
        datr = gpufilter(buff_g, fs=fs, fshigh=fshigh, chanMap=chanMap)

        CC = CC + cp.dot(datr.T, datr) / NT  # sample covariance

    # Average over batches.
    # NOTE(review): the divisor is 0 when Nbatch == 1 -- confirm callers
    # always supply more than one batch.
    CC = CC / ceil((Nbatch - 1) / nSkipCov)

    if whiteningRange < np.inf:
        # if there are too many channels, a finite whiteningRange is more robust to noise
        # in the estimation of the covariance
        whiteningRange = min(whiteningRange, Nchan)
        # this function performs the same matrix inversions as below, just on subsets of
        # channels around each channel
        Wrot = whiteningLocal(CC, yc, xc, whiteningRange)
    else:
        Wrot = whiteningFromCovariance(CC)

    Wrot = Wrot * scaleproc

    logger.info("Computed the whitening matrix.")

    return Wrot
def test_not_copied(self, dtype):
    """Zero-filling the source must be visible through the converted array."""
    source = testing.shaped_arange((2, 3, 4), cupy, dtype)
    wrapped = DummyObjectWithCudaArrayInterface(
        source, self.ver, self.strides)
    converted = cupy.asarray(wrapped)
    # Mutate the source after conversion; the comparison only holds if the
    # converted array reflects that change.
    source.fill(0)
    testing.assert_array_equal(source, converted)
def evaluate(self):
    """Estimate ``beta`` by averaging a reweighted sum over sampled source redshifts.

    Fiducial samples are loaded with ``self.load_from_file()`` and moved to
    CuPy (when ``_GPU_ENABLED``) or NumPy arrays. For each of ``self.N_z``
    redshifts drawn from ``LensedSourceRedshiftProbDist``, the fiducial
    samples are reweighted by population/fiducial density ratios for mass,
    spin and per-image luminosity distance, multiplied by the per-image
    ``self.predictions``, summed and divided by ``self.N_inj``. The mean of
    those values is returned and ``self.f`` is closed.

    Side effects: ``self.predictions``, ``self.pdf_dLs_fiducial`` and
    ``self.apparent_dLs`` entries are replaced by array versions.
    """
    self.load_from_file()

    binaries = self.fiducial_binaries
    raw_m1 = binaries["mass_1"]
    raw_m2 = binaries["mass_2"]
    # Note that spins are redshift independent
    spin_keys = (
        "spin_1x", "spin_1y", "spin_1z",
        "spin_2x", "spin_2y", "spin_2z",
    )
    raw_spins = {key: binaries[key] for key in spin_keys}

    if _GPU_ENABLED:
        import cupy as xp
    else:
        import numpy as xp

    m1 = xp.asarray(raw_m1)
    m2 = xp.asarray(raw_m2)
    spins = {key: xp.asarray(value) for key, value in raw_spins.items()}
    pdf_spin_fiducial = xp.asarray(self.pdf_spin_fiducial)
    pdf_mass_fiducial = xp.asarray(self.pdf_mass_fiducial)

    weights_spin = self.spin_src_pop_model.prob(spins) / pdf_spin_fiducial

    # Convert the per-image quantities once, up front.
    for img in range(self.N_img):
        self.predictions[img] = xp.asarray(self.predictions[img])
        self.pdf_dLs_fiducial[img] = xp.asarray(self.pdf_dLs_fiducial[img])
        self.apparent_dLs[img] = xp.asarray(self.apparent_dLs[img])

    def epsilon(z_src):
        # Reweighted sum over the fiducial samples at a single source redshift.
        mass_dist = DetectorFrameComponentMassesFromSourceFrame(
            self.mass_src_pop_model, z_src)
        weights_mass = (
            mass_dist.prob({"mass_1": m1, "mass_2": m2}) / pdf_mass_fiducial
        )
        integrand = weights_mass * weights_spin
        for img in range(self.N_img):
            dL_dist = LuminosityDistancePriorFromAbsoluteMagnificationRedshift(
                self.abs_magn_dist[img], z_src)
            weights_dL = (
                dL_dist.prob(self.apparent_dLs[img])
                / self.pdf_dLs_fiducial[img]
            )
            integrand *= self.predictions[img] * weights_dL
        return float(xp.sum(integrand) / float(self.N_inj))

    logger = logging.getLogger(__prog__)
    logger.info("Integrating over source redshift")

    z_dist = LensedSourceRedshiftProbDist(
        merger_rate_density=self.merger_rate_density_src_pop_model,
        optical_depth=self.optical_depth)
    zs = z_dist.sample(size=self.N_z)
    if _GPU_ENABLED:
        import cupy as cp
        # Bring the samples to the host before the Python-level loop.
        zs = cp.asnumpy(zs)

    epsilons = [epsilon(z) for z in tqdm.tqdm(zs)]
    beta = np.sum(epsilons).astype(float) / self.N_z

    self.f.close()
    return beta
def calc_log_prior_total_det(self):
    """Compute log-determinants of the prior and total (prior + data) matrices.

    For each of the four blocks ``'0'``..``'3'`` a ``step x step`` prior
    matrix is assembled from the cached datasets in ``self.fname`` and the
    entries of ``self._weights``; its eigenvalues are computed on the GPU and
    the sum of their logs is accumulated. The ``'kernel'`` block scaled by
    ``self._weights['obs']`` is then added and the same is done for the total
    matrix.

    Returns
    -------
    tuple
        ``(self.log_prior_det_val, self.log_total_det_val)``.

    Side effects
    ------------
    Sets ``self.log_prior_det_val``, ``self.log_total_det_val`` and
    ``self.eigs`` (dict with ``'prior'`` and ``'total'`` eigenvalue arrays).

    Notes
    -----
    The CUDA device index ``2`` is hard-coded and eigenvalues are computed in
    float32.
    """
    self.log_prior_det_val = 0
    self.log_total_det_val = 0
    blocks = ['0','1','2','3']
    prior_eigs = np.zeros(self._nx*self._ny*self._nz)
    total_eigs = np.zeros(self._nx*self._ny*self._nz)
    # Each block covers a quarter of the model cells.
    step = self._nx*self._ny*self._nz//4
    # Without a depth weight the depth factor defaults to 1.
    try:
        depth_weight = self._weights['depth']
    except KeyError:
        depth_weight = 1.
    with h5py.File(self.fname,mode='r') as f:
        # NOTE(review): depth_walsh is only bound when 'depth' is a weight
        # key, but it is used below whenever a smooth component or 'refer'
        # is present -- confirm 'depth' is always configured in those cases.
        if 'depth' in self._weights.keys():
            depth_walsh = f['depth']['0'][:]
        for i_b,block in enumerate(blocks):
            tmp_block = np.zeros((step,step))
            for dxyz_name in self._smooth_components:
                try:
                    # View the block as (x, z, y, z) so the depth matrix can
                    # be applied along both z axes.
                    dxyz_walsh = f[dxyz_name][block][:].reshape(step//self._nz,
                                                                self._nz,
                                                                step//self._nz,
                                                                self._nz)
                    ein_path = np.einsum_path('mi,xiyj,jn->xmyn',
                                              depth_walsh.T,
                                              dxyz_walsh,
                                              depth_walsh,
                                              optimize='optimal')[0]
                    tmp_multi = np.einsum('mi,xiyj,jn->xmyn',
                                          depth_walsh.T,
                                          dxyz_walsh,
                                          depth_walsh,
                                          optimize=ein_path)
                    tmp_block += depth_weight*self._weights[dxyz_name]*tmp_multi.reshape(step,step)
                except KeyError:
                    # Component missing from the file or from the weights: skip it.
                    pass
            if 'refer' in self._weights.keys():
                # The reference term adds the same small matrix to every
                # nz-by-nz diagonal block.
                tmp_multi_small = depth_walsh.T@depth_walsh
                for i in range(step//self._nz):
                    tmp_block[i*self._nz:(i+1)*self._nz,
                              i*self._nz:(i+1)*self._nz] += depth_weight*self._weights['refer']*tmp_multi_small
            # Eigenvalues of the prior block.
            with cp.cuda.Device(2):
                tmp_block_gpu = cp.asarray(tmp_block,dtype=np.float32)
                eigs = cp.linalg.eigvalsh(tmp_block_gpu)
                prior_eigs[i_b*step:(i_b+1)*step] = cp.asnumpy(eigs)
                self.log_prior_det_val += cp.asnumpy(cp.sum(cp.log(eigs)))
                # Drop GPU references before asking for memory to be freed.
                tmp_block_gpu = None
                eigs = None
                free_gpu()
            # Add the data term to obtain the total block.
            tmp_block += self._weights['obs']*f['kernel'][block][:]
            with cp.cuda.Device(2):
                tmp_block_gpu = cp.asarray(tmp_block,dtype=np.float32)
                eigs = cp.linalg.eigvalsh(tmp_block_gpu)
                total_eigs[i_b*step:(i_b+1)*step] = cp.asnumpy(eigs)
                self.log_total_det_val += cp.asnumpy(cp.sum(cp.log(eigs)))
                tmp_block_gpu = None
                eigs = None
                free_gpu()
    self.log_prior_det_val = cp.asnumpy(self.log_prior_det_val)
    self.log_total_det_val = cp.asnumpy(self.log_total_det_val)
    self.eigs = {'prior':prior_eigs,'total':total_eigs}
    return self.log_prior_det_val,self.log_total_det_val
def to_sp_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy.
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.

    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix

    Raises
    ------
    ValueError
        If a DataFrame input has more than one dtype, or if the input type
        is not one of the supported types.
    """
    client = default_client() if client is None else client

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when object from the cuML package needs serialization.
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    shape = cudf_or_array.shape

    if isinstance(cudf_or_array,
                  (dask.dataframe.DataFrame, cudf.DataFrame)):
        dtypes = np.unique(cudf_or_array.dtypes)

        if len(dtypes) > 1:
            raise ValueError("DataFrame should contain only a single dtype")

        dtype = dtypes[0]
    else:
        dtype = cudf_or_array.dtype

    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    if isinstance(cudf_or_array, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.
        parts = client.sync(_extract_partitions, cudf_or_array)
        cudf_or_array = [
            client.submit(_conv_np_to_df, part, workers=[w])
            for w, part in parts
        ]

        cudf_or_array = to_dask_cudf(cudf_or_array)

    if isinstance(cudf_or_array, dask.dataframe.DataFrame):
        # Dask.Dataframe needs special attention since it has multiple
        # dtypes. Just use the first (and assume all the rest are the same).
        cudf_or_array = cudf_or_array.map_partitions(
            _conv_df_to_sp, meta=dask.array.from_array(meta))

        # This will also handle the input of dask.array.Array
        return cudf_or_array

    if scipy.sparse.isspmatrix(cudf_or_array):
        cudf_or_array = \
            cupyx.scipy.sparse.csr_matrix(cudf_or_array.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(cudf_or_array):
        pass
    elif isinstance(cudf_or_array, cudf.DataFrame):
        cupy_ary = cp.asarray(cudf_or_array.as_gpu_matrix(), dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(cudf_or_array, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray,
                                cudf_or_array,
                                dtype=cudf_or_array.dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    # Use the public ``cp.ndarray`` name; the private ``cp.core.core`` path
    # is not available in newer CuPy releases.
    elif isinstance(cudf_or_array, cp.ndarray):
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cudf_or_array)
    else:
        raise ValueError("Unexpected input type %s" % type(cudf_or_array))

    # Push to worker
    cudf_or_array = client.scatter(cudf_or_array)

    return dask.array.from_delayed(cudf_or_array, shape=shape,
                                   meta=meta)
test_x /= 255 cuda.check_cuda_available() all_results = None for j in range(NUM_MODELS): model = MLP() model.to_gpu() optimizer = optimizers.Adam() optimizer.setup(model) for epoch in range(1, NUM_EPOCH + 1): perm = np.random.permutation(NUM_TRAIN) train_accuracy, train_loss = 0, 0 for i in range(0, NUM_TRAIN, BATCH_SIZE): x = chainer.Variable(cp.asarray(train_x[perm[i:i + BATCH_SIZE]]), volatile='off') t = chainer.Variable(cp.asarray(train_y[perm[i:i + BATCH_SIZE]]), volatile='off') optimizer.update(model, x, t) train_loss += float(model.loss.data) * len(t.data) train_accuracy += float(model.accuracy.data) * len(t.data) epoch_result = None test_accuracy, test_loss = 0, 0 for i in range(0, NUM_TEST, BATCH_SIZE): x = chainer.Variable(cp.asarray(test_x[i:i + BATCH_SIZE]), volatile='on') t = chainer.Variable(cp.asarray(test_y[i:i + BATCH_SIZE]), volatile='on') batch_result = model(x, t, False) if epoch == NUM_EPOCH: if i == 0:
def _array_to_gpu(array, device, stream):
    """Return ``array`` as a CuPy array associated with ``device``.

    Args:
        array: ``None``, a ``chainerx.ndarray``, a NumPy scalar, an
            ``intel64.mdarray``, a ``numpy.ndarray`` or an ``ndarray`` (the
            module-level name, presumably ``cupy.ndarray`` -- confirm).
        device: Device object; used as a context manager and compared with
            ``array.device``.
        stream: Stream used for an asynchronous copy. ``None`` or a stream
            whose ``ptr`` is 0 selects the synchronous path.

    Returns:
        ``None`` when ``array`` is ``None``; the input itself when it already
        lives on ``device``; otherwise a new CuPy array.

    Raises:
        TypeError: If ``array`` is none of the supported types.
    """
    if array is None:
        return None

    if isinstance(array, chainerx.ndarray):
        # TODO(niboshi): Update this logic once both CuPy and ChainerX support
        # the array interface.
        if array.device.backend.name == 'cuda':
            # Convert to cupy.ndarray on the same device as source array
            # The UnownedMemory wrapper is given `array` as its owner
            # argument and points at the ChainerX buffer, so no data is
            # copied here.
            array = cupy.ndarray(
                array.shape,
                array.dtype,
                cupy.cuda.MemoryPointer(
                    cupy.cuda.UnownedMemory(
                        array.data_ptr + array.offset,
                        array.data_size,
                        array,
                        array.device.index),
                    0),
                strides=array.strides)
        else:
            array = chainerx.to_numpy(array)
    elif isinstance(array, (numpy.number, numpy.bool_)):
        # NumPy scalars become 0-dim arrays.
        array = numpy.asarray(array)
    elif isinstance(array, intel64.mdarray):
        array = numpy.asarray(array)

    if isinstance(array, ndarray):
        # Already on the requested device: nothing to do.
        if array.device == device:
            return array
        is_numpy = False
    elif isinstance(array, numpy.ndarray):
        is_numpy = True
    else:
        raise TypeError(
            'The array sent to gpu must be an array or a NumPy scalar.'
            '\nActual type: {0}.'.format(type(array)))

    if stream is not None and stream.ptr != 0:
        # Asynchronous path on a non-null stream.
        # NOTE(review): `ret` is allocated outside any `with device:` block
        # -- presumably the caller has already made `device` current; confirm.
        ret = cupy.empty_like(array)
        if is_numpy:
            # cpu to gpu
            # Stage the data in pinned host memory for the asynchronous set.
            mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
            src = numpy.frombuffer(
                mem, array.dtype, array.size).reshape(array.shape)
            src[...] = array
            ret.set(src, stream)
            # Hand the pinned buffer to the watch list together with an
            # event recorded on the stream.
            cupy.cuda.pinned_memory._add_to_watch_list(
                stream.record(), mem)
        else:
            # gpu to gpu
            with array.device:
                src = array.copy()
                event = Stream.null.record()
            # Make `stream` wait for the event recorded on the null stream
            # after the copy above.
            stream.wait_event(event)
            ret.data.copy_from_device_async(
                src.data, src.nbytes, stream)

            # to hold a reference until the end of the asynchronous
            # memcpy
            stream.add_callback(lambda *x: None, (src, ret))
        return ret

    with device:
        if is_numpy:
            return cupy.asarray(array)
        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)
def _calc_array(self, cpu_c_flat: np.ndarray) -> np.ndarray:
    """Run ``self.server.compute_flat_array`` on a device copy of the input.

    The input is moved to the GPU with ``cp.asarray``, passed to the server,
    and the result is copied back to the host with ``cp.asnumpy``.
    """
    device_result = self.server.compute_flat_array(cp.asarray(cpu_c_flat))
    return cp.asnumpy(device_result)
def FC_to_FCWB(source, target, tftype='affine'):
    """Rescale ``source`` points and register them onto ``target`` with CPD.

    Each source point is rescaled per axis, both point sets are moved to the
    GPU as float32, and the transform parameters returned first by
    ``cpd.registration_cpd`` are returned. Reads the module-level
    ``use_cuda``.
    """
    # NOTE(review): the first axis is multiplied by 475 (`x * +475`) while
    # the other two axes subtract an offset -- possibly a typo for
    # `x + 475` or `x - 475`; confirm before relying on this mapping.
    rescaled = []
    for xyz in source:
        rescaled.append([
            (xyz[0] * 475) / 1.7,
            (xyz[1] - 185) / -2,
            (xyz[2] - 125) / -2.5,
        ])

    source_gpu = cp.asarray(rescaled, dtype=cp.float32)
    target_gpu = cp.asarray(target, dtype=cp.float32)
    tf_param, _, _ = cpd.registration_cpd(
        source_gpu, target_gpu, tf_type_name=tftype, use_cuda=use_cuda)
    return tf_param
def FC_to_FCWB_transform(object, tf_param):
    """Rescale points, apply ``tf_param.transform`` on the GPU, return via ``to_cpu``."""
    # NOTE(review): the first axis is multiplied by 475 (`x * +475`) while
    # the other two axes subtract an offset -- possibly a typo; confirm.
    rescaled = []
    for xyz in list(object):
        rescaled.append([
            (xyz[0] * 475) / 1.7,
            (xyz[1] - 185) / -2,
            (xyz[2] - 125) / -2.5,
        ])

    points_gpu = cp.asarray(rescaled, dtype=cp.float32)
    transformed = tf_param.transform(points_gpu)
    return to_cpu(transformed)
def __call__(self, loc, score, anchor, img_size, scale=1.):
    """Propose RoIs from anchor offsets and foreground scores.

    Inputs should be ndarrays. :obj:`loc`, :obj:`score` and :obj:`anchor`
    describe the same anchor at the same index; :math:`R` below is the total
    number of anchors.

    Args:
        loc (array): Predicted offsets and scaling to anchors, shape
            :math:`(R, 4)`.
        score (array): Predicted foreground probability for anchors, shape
            :math:`(R,)`.
        anchor (array): Coordinates of anchors, shape :math:`(R, 4)`.
        img_size (tuple of ints): :obj:`height, width` of the image after
            scaling.
        scale (float): Scaling factor applied to the image after reading it
            from a file; it scales the minimum box size.

    Returns:
        array: Coordinates of the proposal boxes, shape :math:`(S, 4)`.
        When the post-NMS limit for the current mode is positive,
        :math:`S` is at most that limit.
    """
    # NOTE: for testing, call faster_rcnn.eval() first so that
    # parent_model.training is False.
    in_training = self.parent_model.training
    n_pre_nms = self.n_train_pre_nms if in_training else self.n_test_pre_nms
    n_post_nms = (
        self.n_train_post_nms if in_training else self.n_test_post_nms)

    # Convert anchors into proposals via bbox transformations.
    roi = loc2bbox(anchor, loc)

    # Clip predicted boxes to the image: columns 0 and 2 against
    # img_size[0], columns 1 and 3 against img_size[1].
    roi[:, 0:4:2] = np.clip(roi[:, 0:4:2], 0, img_size[0])
    roi[:, 1:4:2] = np.clip(roi[:, 1:4:2], 0, img_size[1])

    # Remove predicted boxes with either height or width < threshold.
    min_size = self.min_size * scale
    heights = roi[:, 2] - roi[:, 0]
    widths = roi[:, 3] - roi[:, 1]
    large_enough = (heights >= min_size) & (widths >= min_size)
    keep = np.where(large_enough)[0]
    roi = roi[keep, :]
    score = score[keep]

    # Sort all (proposal, score) pairs by score from highest to lowest and
    # keep the top n_pre_nms (e.g. 6000).
    order = score.ravel().argsort()[::-1]
    if n_pre_nms > 0:
        order = order[:n_pre_nms]
    roi = roi[order, :]

    # Apply NMS (e.g. threshold = 0.7), then keep the top n_post_nms
    # (e.g. 300).
    # unNOTE: something is wrong here!
    # TODO: remove cuda.to_gpu
    roi_gpu = cp.ascontiguousarray(cp.asarray(roi))
    keep = non_maximum_suppression(roi_gpu, thresh=self.nms_thresh)
    if n_post_nms > 0:
        keep = keep[:n_post_nms]
    return roi[keep]
def FCWB_to_FC_transform(object, tf_param):
    """Apply ``tf_param.transform`` to the points on the GPU and return via ``to_cpu``."""
    points_gpu = cp.asarray(object, dtype=cp.float32)
    transformed = tf_param.transform(points_gpu)
    return to_cpu(transformed)
def randomized_svd_gpu(M, n_components, n_oversamples=10, n_iter='auto',
                       transpose='auto', random_state=0, lib='cupy'):
    """Computes a truncated randomized SVD on GPU. Adapted from Sklearn.

    Parameters
    ----------
    M : ndarray or sparse matrix
        Matrix to decompose
    n_components : int
        Number of singular values and vectors to extract.
    n_oversamples : int (default is 10)
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. Smaller
        number can improve speed but can negatively impact the quality of
        approximation of singular vectors and singular values.
    n_iter : int or 'auto' (default is 'auto')
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(X.shape)) `n_iter` in which case is set to 7.
        This improves precision with few components.
    transpose : True, False or 'auto' (default)
        Whether the algorithm should be applied to M.T instead of M. The
        result should approximately be the same. The 'auto' mode will
        trigger the transposition if M.shape[1] > M.shape[0] since this
        implementation of randomized SVD tend to be a little faster in that
        case.
    random_state : int, RandomState instance or None, optional (default=0)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`. Only the 'cupy' path draws from it; the
        'pytorch' path samples with ``torch.cuda.FloatTensor(...).normal_()``.
    lib : {'cupy', 'pytorch'}, str optional
        Chooses the GPU library to be used.

    Returns
    -------
    (left, s, right) : tuple of GPU arrays/tensors
        Both libraries follow the same convention. Without transposition:
        ``U[:, :n_components]``, ``s[:n_components]`` and the first
        ``n_components`` rows of ``V^T``. With transposition the two outer
        factors are swapped and each is transposed.

    Raises
    ------
    ValueError
        If `lib` is neither 'cupy' nor 'pytorch'.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    singular value decomposition using randomization to speed up the
    computations. It is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision).

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for constructing
      approximate matrix decompositions
      Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * A randomized algorithm for the decomposition of matrices
      Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
    * An implementation of a randomized algorithm for principal component
      analysis
      A. Szlam et al. 2014
    """
    # Fail early and explicitly instead of silently returning None.
    if lib not in ('cupy', 'pytorch'):
        raise ValueError(
            "Unknown lib {!r}; expected 'cupy' or 'pytorch'.".format(lib))

    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples
    n_samples, n_features = M.shape

    if n_iter == 'auto':
        # Checks if the number of iterations is explicitly specified
        n_iter = 7 if n_components < .1 * min(M.shape) else 4

    if transpose == 'auto':
        transpose = n_samples < n_features
    if transpose:
        M = M.T  # this implementation is a bit faster with smaller shape[1]

    if lib == 'cupy':
        # asarray moves host data to the device and leaves device arrays
        # untouched; M is only read below, so no defensive copy is needed.
        M = cupy.asarray(M)

        # Generating normal random vectors with shape: (M.shape[1], n_random)
        Q = cupy.asarray(random_state.normal(size=(M.shape[1], n_random)))

        # Perform power iterations with Q to further 'imprint' the top
        # singular vectors of M in Q
        for _ in range(n_iter):
            Q = cupy.dot(M, Q)
            Q = cupy.dot(M.T, Q)

        # Sample the range of M using by linear projection of Q.
        # Extract an orthonormal basis
        Q, _ = cupy.linalg.qr(cupy.dot(M, Q), mode='reduced')

        # project M to the (k + p) dimensional space using the basis vectors
        B = cupy.dot(Q.T, M)

        # compute the SVD on the thin matrix: (k + p) wide.
        # cupy.linalg.svd returns V already transposed (V^T).
        Uhat, s, V = cupy.linalg.svd(B, full_matrices=False, compute_uv=True)
        del B
        U = cupy.dot(Q, Uhat)

        if transpose:
            # transpose back the results according to the input convention
            return (V[:n_components, :].T, s[:n_components],
                    U[:, :n_components].T)
        return U[:, :n_components], s[:n_components], V[:n_components, :]

    # lib == 'pytorch'
    M_gpu = torch.Tensor.cuda(torch.from_numpy(M.astype('float32')))

    # Generating normal random vectors with shape: (M.shape[1], n_random)
    Q = torch.cuda.FloatTensor(M_gpu.shape[1], n_random).normal_()

    # Perform power iterations with Q to further 'imprint' the top
    # singular vectors of M in Q
    for _ in range(n_iter):
        Q = torch.mm(M_gpu, Q)
        Q = torch.mm(torch.transpose(M_gpu, 0, 1), Q)

    # Sample the range of M using by linear projection of Q.
    # Extract an orthonormal basis
    Q, _ = torch.qr(torch.mm(M_gpu, Q))

    # project M to the (k + p) dimensional space using the basis vectors
    B = torch.mm(torch.transpose(Q, 0, 1), M_gpu)

    # compute the SVD on the thin matrix: (k + p) wide
    Uhat, s, V = torch.svd(B)
    del B
    U = torch.mm(Q, Uhat)

    # torch.svd returns V with B = Uhat @ diag(s) @ V^T, unlike
    # cupy.linalg.svd which returns V^T. Transpose so that rows index the
    # components, matching the 'cupy' path.
    Vt = torch.transpose(V, 0, 1)

    if transpose:
        # transpose back the results according to the input convention
        return (torch.transpose(Vt[:n_components, :], 0, 1),
                s[:n_components],
                torch.transpose(U[:, :n_components], 0, 1))
    return U[:, :n_components], s[:n_components], Vt[:n_components, :]
def to_cupy(self, copy=False):
    """Return this object as a CuPy array after calling ``host_to_device``.

    ``copy=True`` converts with ``cp.array``; otherwise ``cp.asarray`` is
    used.
    """
    self.host_to_device()
    convert = cp.array if copy else cp.asarray
    return convert(self)