def test_torch_function_with_multiple_output_on_local_var(self):
    x = Var(torch.FloatTensor([[1, 2], [2, 3], [5, 6]]))
    t, s = torch.max(x, 1)
    assert (t == Var(torch.FloatTensor([2, 3, 6]))).all()
    assert (s == Var(torch.LongTensor([1, 1, 1]))).all()

    x = Var(torch.FloatTensor([[0, 0], [0, 0]]))
    y, z = torch.eig(x, True)
    assert (y == Var(torch.FloatTensor([[0, 0], [0, 0]]))).all()
    assert (z == Var(torch.FloatTensor([[1, 0.], [0, 1]]))).all()

    x = Var(torch.FloatTensor([[0, 0], [1, 0]]))
    y, z = torch.qr(x)
    assert (y == Var(torch.FloatTensor([[0, -1], [-1, 0]]))).all()
    assert (z == Var(torch.FloatTensor([[-1, 0], [0, 0]]))).all()

    x = Var(torch.arange(1, 6))
    y, z = torch.kthvalue(x, 4)
    assert (y == Var(torch.FloatTensor([4]))).all()
    assert (z == Var(torch.LongTensor([3]))).all()

    x = Var(torch.zeros(3, 3))
    w, y, z = torch.svd(x)
    assert (w == Var(torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))).all()
    assert (y == Var(torch.FloatTensor([0, 0, 0]))).all()
    assert (z == Var(torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))).all()
def test_local_tensor_multi_var_methods(self):
    x = torch.FloatTensor([[1, 2], [2, 3], [5, 6]])
    t, s = torch.max(x, 1)
    assert (t == torch.FloatTensor([2, 3, 6])).float().sum() == 3
    assert (s == torch.LongTensor([1, 1, 1])).float().sum() == 3

    x = torch.FloatTensor([[0, 0], [1, 1]])
    y, z = torch.eig(x, True)
    assert (y == torch.FloatTensor([[1, 0], [0, 0]])).all()
    assert (torch.equal(z == torch.FloatTensor([[0, 0], [1, 0]]), torch.ByteTensor([[1, 0], [1, 0]])))

    x = torch.FloatTensor([[0, 0], [1, 0]])
    y, z = torch.qr(x)
    assert (y == torch.FloatTensor([[0, -1], [-1, 0]])).all()
    assert (z == torch.FloatTensor([[-1, 0], [0, 0]])).all()

    x = torch.arange(1, 6)
    y, z = torch.kthvalue(x, 4)
    assert (y == torch.FloatTensor([4])).all()
    assert (z == torch.LongTensor([3])).all()

    x = torch.zeros(3, 3)
    w, y, z = torch.svd(x)
    assert (w == torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]])).all()
    assert (y == torch.FloatTensor([0, 0, 0])).all()
    assert (z == torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]])).all()
def test_remote_tensor_multi_var_methods(self):
    hook = TorchHook(verbose=False)
    local = hook.local_worker
    remote = VirtualWorker(hook, 1)
    local.add_worker(remote)

    x = torch.FloatTensor([[1, 2], [4, 3], [5, 6]])
    x.send(remote)
    y, z = torch.max(x, 1)
    assert torch.equal(y.get(), torch.FloatTensor([2, 4, 6]))
    assert torch.equal(z.get(), torch.LongTensor([1, 0, 1]))

    x = torch.FloatTensor([[0, 0], [1, 0]]).send(remote)
    y, z = torch.qr(x)
    assert (y.get() == torch.FloatTensor([[0, -1], [-1, 0]])).all()
    assert (z.get() == torch.FloatTensor([[-1, 0], [0, 0]])).all()

    x = torch.arange(1, 6).send(remote)
    y, z = torch.kthvalue(x, 4)
    assert (y.get() == torch.FloatTensor([4])).all()
    assert (z.get() == torch.LongTensor([3])).all()

    x = torch.FloatTensor([[0, 0], [1, 1]]).send(remote)
    y, z = torch.eig(x, True)
    assert (y.get() == torch.FloatTensor([[1, 0], [0, 0]])).all()
    assert ((z.get() == torch.FloatTensor([[0, 0], [1, 0]])) == torch.ByteTensor([[1, 0], [1, 0]])).all()

    x = torch.zeros(3, 3).send(remote)
    w, y, z = torch.svd(x)
    assert (w.get() == torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]])).all()
    assert (y.get() == torch.FloatTensor([0, 0, 0])).all()
    assert (z.get() == torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]])).all()
def test_torch_function_with_multiple_output_on_remote_var(self):
    hook = TorchHook(verbose=False)
    me = hook.local_worker
    remote = VirtualWorker(id=2, hook=hook)
    me.add_worker(remote)

    x = Var(torch.FloatTensor([[1, 2], [4, 3], [5, 6]]))
    x.send(remote)
    y, z = torch.max(x, 1)
    y.get()
    assert torch.equal(y, Var(torch.FloatTensor([2, 4, 6])))

    x = Var(torch.FloatTensor([[0, 0], [1, 0]])).send(remote)
    y, z = torch.qr(x)
    assert (y.get() == Var(torch.FloatTensor([[0, -1], [-1, 0]]))).all()
    assert (z.get() == Var(torch.FloatTensor([[-1, 0], [0, 0]]))).all()

    x = Var(torch.arange(1, 6)).send(remote)
    y, z = torch.kthvalue(x, 4)
    assert (y.get() == Var(torch.FloatTensor([4]))).all()
    assert (z.get() == Var(torch.LongTensor([3]))).all()

    x = Var(torch.FloatTensor([[0, 0], [0, 0]]))
    x.send(remote)
    y, z = torch.eig(x, True)
    assert (y.get() == Var(torch.FloatTensor([[0, 0], [0, 0]]))).all()
    assert (z.get() == Var(torch.FloatTensor([[1, 0.], [0, 1]]))).all()

    x = Var(torch.zeros(3, 3)).send(remote)
    w, y, z = torch.svd(x)
    assert (w.get() == Var(torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))).all()
    assert (y.get() == Var(torch.FloatTensor([0, 0, 0]))).all()
    assert (z.get() == Var(torch.FloatTensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))).all()
def random_orthogonal(size):
    """Returns a random orthogonal matrix as a 2-dim tensor of shape [size, size]."""
    # Use the QR decomposition of a random Gaussian matrix.
    x = torch.randn(size, size)
    q, _ = torch.qr(x)
    return q
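# A minimal usage sketch for random_orthogonal above (illustrative, not from the
# original source): it only assumes `torch` and checks that the returned Q is
# orthogonal up to floating-point error.
import torch

def _check_random_orthogonal(size=4):
    q = random_orthogonal(size)
    # Q^T Q should be (numerically) the identity for an orthogonal matrix.
    assert torch.allclose(q.t() @ q, torch.eye(size), atol=1e-5)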
def make_linear_dataset(n, d, seed=0):
    """Create a dataset for training a deep linear network with n datapoints of dimension d."""
    torch.manual_seed(seed)
    X = (torch.qr(torch.randn(n, d))[0] * sqrt(n)).cuda()
    A = torch.randn(d, d).cuda()
    Y = X.mm(A.t())
    return TensorDataset(X, Y), TensorDataset(X, Y)
def qr(A: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:  # pragma: no cover
    """Like torch.linalg.qr."""
    if hasattr(torch, "linalg") and hasattr(torch.linalg, "qr"):
        # PyTorch version >= 1.9
        return torch.linalg.qr(A)
    return torch.qr(A)
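# Illustrative sketch (assumed usage, not part of the original snippet): both the
# torch.linalg.qr branch and the legacy torch.qr branch return a reduced (Q, R)
# pair by default, so Q @ R should reconstruct the input either way.
import torch

def _check_qr_wrapper():
    A = torch.randn(5, 3)
    Q, R = qr(A)
    assert Q.shape == (5, 3) and R.shape == (3, 3)
    assert torch.allclose(Q @ R, A, atol=1e-5)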
def __init__(self, in_channel, out_channel=None):
    super().__init__()
    if out_channel is None:
        out_channel = in_channel
    weight = torch.randn(in_channel, out_channel)
    q, _ = torch.qr(weight)
    weight = q.unsqueeze(2).unsqueeze(3)
    self.weight = nn.Parameter(weight)
def _init_cache_for_non_constant_diag(self, eye, batch_shape, n):
    # With non-constant diagonals, we can't factor out the noise as easily
    self._q_cache, self._r_cache = torch.qr(
        torch.cat((self._piv_chol_self / self._noise.sqrt(), eye), dim=-2)
    )
    self._q_cache = self._q_cache[..., :n, :] / self._noise.sqrt()

    # Use the matrix determinant lemma for the logdet, using the fact that R'R = L_k'L_k + s*I
    logdet = self._r_cache.diagonal(dim1=-1, dim2=-2).abs().log().sum(-1).mul(2)
    logdet -= (1.0 / self._noise).log().sum([-1, -2])
    self._precond_logdet_cache = logdet.view(*batch_shape) if len(batch_shape) else logdet.squeeze()
def randomly_rotate(X):
    """Randomly rotate d,n data matrix X"""
    d, n = X.shape
    z = torch.randn((d, d), dtype=X.dtype)
    q, r = torch.qr(z)
    d = torch.diag(r)
    ph = d / abs(d)
    rot_mat = q * ph
    return rot_mat @ X
def orthogonal_square():
    """
    Create orthogonal square matrix using Gram-Schmidt

    Return:
        orthogonal random features
    """
    q, _ = torch.qr(self.iid_gaussian(d, d))
    return q.T
def fit_A(self, data, sample='gaussian', weight=None, ndata_A=None, MSWD_p=2, MSWD_max_iter=200, pool=None, verbose=True):
    # fit the directions A to apply 1D transform
    if verbose:
        tstart = start_timing(self.A.device)

    if ndata_A is None or ndata_A > len(data):
        ndata_A = len(data)

    if sample != 'gaussian':
        if ndata_A > len(sample):
            ndata_A = len(sample)
        if ndata_A == len(sample):
            sample = sample.to(self.A.device)
        else:
            sample = sample[torch.randperm(len(sample), device=sample.device)[:ndata_A]].to(self.A.device)

    if ndata_A == len(data):
        data = data.to(self.A.device)
        if weight is not None:
            weight = weight.to(self.A.device)
    else:
        order = torch.randperm(len(data), device=data.device)[:ndata_A]
        data = data[order].to(self.A.device)
        if weight is not None:
            weight = weight[order].to(self.A.device)

    if weight is not None:
        weight = weight / torch.sum(weight)
        select = weight > 0
        data = data[select]
        weight = weight[select]

    A, SWD = maxKSWDdirection(data, x2=sample, weight=weight, K=self.K, maxiter=MSWD_max_iter, p=MSWD_p)
    with torch.no_grad():
        SWD, indices = torch.sort(SWD, descending=True)
        A = A[:, indices]
        self.A[:] = torch.qr(A)[0]

    if verbose:
        t = end_timing(tstart, self.A.device)
        print('Fit A:', 'Time:', t, 'Wasserstein Distance:', SWD.tolist())

    return self
def orthogonal_init(layers, mean=0.0, std=0.01):
    k = len(layers)
    ou_f = layers[0].out_features
    in_f = layers[0].in_features
    random = torch.randn((ou_f, in_f, k)) * std + mean
    q, r = torch.qr(random, some=True)
    for detector, init in zip(layers, q.permute(2, 0, 1)):
        detector.weight.data.copy_(init)
        nn.init.zeros_(detector.bias)
def hutchpp(linear_operator, dimension, n_queries):
    A = linear_operator
    d = dimension
    m = n_queries
    S = torch.randn(d, m // 3, device=A.device)
    G = torch.randn(d, m // 3, device=A.device)
    Q, _ = torch.qr(A.matvec(S))
    proj = G - Q @ (Q.T @ G)
    return torch.trace(Q.T @ A.matvec(Q)) + (3.0 / m) * torch.trace(proj.T @ A.matvec(proj))
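# Hypothetical usage sketch for hutchpp above: the `DenseOperator` wrapper and the
# exact-trace comparison are illustrative assumptions, not part of the original
# code. The operator only needs a `.matvec` that accepts a matrix of probe vectors
# and a `.device` attribute, which is all hutchpp touches.
import torch

class DenseOperator:
    def __init__(self, mat):
        self.mat = mat
        self.device = mat.device

    def matvec(self, x):
        # Applied column-wise when x is a matrix of probes.
        return self.mat @ x

def _check_hutchpp(d=50, m=30):
    mat = torch.randn(d, d)
    mat = mat @ mat.t()  # symmetric PSD, so the trace is well separated from noise
    est = hutchpp(DenseOperator(mat), d, m)
    exact = torch.trace(mat)
    # The estimator is stochastic; expect it near the exact trace, not equal to it.
    assert abs(est - exact) / abs(exact) < 0.5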
def cca_by_qr(x: Tensor, y: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
    """CCA using QR and SVD.

    For more details, check Press 2011, "Canonical Correlation Clarified by
    Singular Value Decomposition".

    Args:
        x: input tensor of shape DxH
        y: input tensor of shape DxW

    Returns:
        x-side coefficients, y-side coefficients, diagonal
    """
    q_1, r_1 = torch.qr(x)
    q_2, r_2 = torch.qr(y)
    qq = q_1.t() @ q_2
    u, diag, v = torch.svd(qq)
    a = torch.inverse(r_1) @ u
    b = torch.inverse(r_2) @ v
    return a, b, diag
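# A small usage sketch for cca_by_qr (illustrative; the shapes below are assumptions).
# With D observations of an H-dimensional and a W-dimensional view, the returned
# diagonal holds min(H, W) canonical correlations, which should all lie in [0, 1].
import torch

def _check_cca_by_qr():
    torch.manual_seed(0)
    x = torch.randn(100, 10)  # D x H
    y = torch.randn(100, 8)   # D x W
    a, b, diag = cca_by_qr(x, y)
    assert diag.numel() == 8
    assert bool(((diag >= 0) & (diag <= 1 + 1e-5)).all())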
def get_logdet(jacobian):
    """
    Use QR factorisation to compute log absolute determinant of the jacobian matrix.
    NOTE: prefer `torch.slogdet` from Pytorch >= 0.4
    :param jacobian: (M, M)
    :return: log-determinant jacobian
    """
    Q, R = torch.qr(jacobian)
    log_det = torch.log(torch.diag(R).abs()).sum()
    return log_det
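# Sanity-check sketch (an assumed example, not from the original code): the
# QR-based value should match the `logabsdet` returned by torch.slogdet, which
# the docstring itself recommends on newer PyTorch.
import torch

def _check_get_logdet():
    torch.manual_seed(0)
    jacobian = torch.randn(6, 6)
    qr_logdet = get_logdet(jacobian)
    _, logabsdet = torch.slogdet(jacobian)
    assert torch.allclose(qr_logdet, logabsdet, atol=1e-4)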
def qr_retraction(tan_vec):  # tan_vec, p-by-n, p <= n
    [p, n] = tan_vec.size()
    tan_vec.t_()
    q, r = torch.qr(tan_vec)
    d = torch.diag(r, 0)
    ph = d.sign()
    q *= ph.expand_as(q)
    q.t_()
    return q
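# Illustrative check for qr_retraction (assumed shapes): for a p-by-n input with
# p <= n, the result should have orthonormal rows. Note the function transposes
# its argument in place via t_(), so pass a tensor you do not need afterwards.
import torch

def _check_qr_retraction(p=3, n=7):
    q = qr_retraction(torch.randn(p, n))
    assert torch.allclose(q @ q.t(), torch.eye(p), atol=1e-5)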
def QR_features(x, degree=6):
    # x: [time, h, w, 1]
    time, h, w, _ = x.shape
    poly_x = poly_features(x, degree + 1).permute(1, 2, 0, -1)  # [h, w, time, features]
    ans = t.zeros((h, w, time, degree))
    # Note: the loop below assumes h = w = 256.
    for i in range(256 * 256):
        row = i // 256
        col = i % 256
        ans[row, col] = t.qr(poly_x[row, col])[0][:, 1:]
    return ans.permute(2, 0, 1, -1)
def OrthogonalRandomFeature(self):
    n = self.n_features // self.depth
    remainder = self.n_features % self.depth
    orthogonal_features = []
    for _ in range(n):
        normal_feature = torch.rand(self.depth, self.depth)
        orthogonal_feature, _ = torch.qr(normal_feature)
        orthogonal_features.append(orthogonal_feature)
    if remainder > 0:
        normal_feature = torch.rand(self.depth, self.depth)
        orthogonal_feature, _ = torch.qr(normal_feature)
        orthogonal_features.append(orthogonal_feature[0:remainder])
    orthogonal_features = torch.cat(orthogonal_features)
    multiplier = torch.randn(self.n_features, self.depth).norm(dim=1)
    final_features = torch.matmul(torch.diag(multiplier), orthogonal_features)
    return final_features
def _init_cache_for_constant_diag(self, eye, batch_shape, n, k):
    # We can factor out the noise for both QR and solves.
    self._noise = self._noise.narrow(-2, 0, 1)
    self._q_cache, self._r_cache = torch.qr(
        torch.cat((self._piv_chol_self, self._noise.sqrt() * eye), dim=-2)
    )
    self._q_cache = self._q_cache[..., :n, :]

    # Use the matrix determinant lemma for the logdet, using the fact that R'R = L_k'L_k + s*I
    logdet = self._r_cache.diagonal(dim1=-1, dim2=-2).abs().log().sum(-1).mul(2)
    logdet = logdet + (n - k) * self._noise.squeeze(-2).squeeze(-1).log()
    self._precond_logdet_cache = logdet.view(*batch_shape) if len(batch_shape) else logdet.squeeze()
def orthgonal_init(fc1, fc2):
    out_dim, in_dim = fc1.weight.data.shape
    device = fc1.weight.data.device
    for i in np.arange(out_dim):
        weight_random = torch.randn([in_dim, 2]).to(device)
        Q, R = torch.qr(weight_random)
        fc1.weight.data[i, :] = Q[:, 0]
        fc2.weight.data[i, :] = Q[:, 1]
    fc1.bias.data.zero_()
    fc2.bias.data.zero_()
def orthogonolize(self, last_core=True):
    for k in range(self.d):
        tmp = torch_utils.reshape_torch(self.cores[k].data, [self.r[k] * self.n[k], -1], use_batch=False)
        if k > 0:
            tmp = r.mm(tmp)
        if (k == self.d - 1) and (not last_core):
            self.cores[k].data = torch_utils.reshape_torch(tmp, [self.r[k], self.n[k], -1], use_batch=False)
            continue
        q, r = torch.qr(tmp)
        self.cores[k].data = torch_utils.reshape_torch(q, [self.r[k], self.n[k], -1], use_batch=False)
def oneStepVarQR(J, Q):
    Z = torch.matmul(torch.transpose(J.float(), 1, 2), Q)  # Linear extrapolation of the network in many directions
    q, r = torch.qr(Z, some=True)  # QR decomposition of new directions
    s = torch.diag_embed(torch.sign(torch.diagonal(r, dim1=1, dim2=2)))  # extract sign of each leading r value
    return torch.matmul(q, s), torch.diagonal(torch.matmul(s, r), dim1=1, dim2=2)  # return positive r values and corresponding vectors
def _orthogonalize_tt_cores_right_to_left(tt):
    """Orthogonalize TT-cores of a TT-object in the right to left order.

    Args:
        tt: TensorTrain or a TensorTrainBatch.

    Returns:
        The same type as the input `tt` (TensorTrain or a TensorTrainBatch).
    """
    # Right to left orthogonalization.
    ndims = tt.ndims
    raw_shape = tt.get_raw_shape()
    tt_ranks = tt.get_tt_ranks()
    prev_rank = tt_ranks[ndims]
    # Copy cores reference so we can change the cores.
    tt_cores = list(tt.tt_cores)
    for core_idx in range(ndims - 1, 0, -1):
        curr_core = tt_cores[core_idx]
        # TT-ranks could have changed on the previous iteration, so `tt_ranks` can
        # be outdated for the current TT-rank, but should be valid for the next
        # TT-rank.
        curr_rank = prev_rank
        prev_rank = tt_ranks[core_idx]
        if tt.is_tt_matrix():
            curr_mode_left = raw_shape[0][core_idx]
            curr_mode_right = raw_shape[1][core_idx]
            curr_mode = curr_mode_left * curr_mode_right
        else:
            curr_mode = raw_shape[0][core_idx]

        qr_shape = (prev_rank, curr_mode * curr_rank)
        curr_core = curr_core.reshape(qr_shape)
        curr_core, triang = torch.qr(curr_core.t())
        curr_core = curr_core.t()
        triang_shape = triang.shape

        # The TT-rank could have changed: if qr_shape is e.g. 4 x 10, then q would
        # be of size 4 x 4 and r would be 4 x 10, which means that the next rank
        # should be changed to 4.
        prev_rank = triang_shape[1]
        if tt.is_tt_matrix():
            new_core_shape = (prev_rank, curr_mode_left, curr_mode_right, curr_rank)
        else:
            new_core_shape = (prev_rank, curr_mode, curr_rank)
        tt_cores[core_idx] = curr_core.reshape(new_core_shape)

        prev_core = tt_cores[core_idx - 1].reshape(-1, triang_shape[0])
        tt_cores[core_idx - 1] = torch.mm(prev_core, triang)

    if tt.is_tt_matrix():
        first_core_shape = (1, raw_shape[0][0], raw_shape[1][0], prev_rank)
    else:
        first_core_shape = (1, raw_shape[0][0], prev_rank)
    tt_cores[0] = tt_cores[0].reshape(first_core_shape)
    # TODO: infer the tt_ranks
    return TensorTrain(tt_cores, tt.get_raw_shape())
def gpu_tsvd(A: torch.Tensor, k: int, n_iter: int = 2, n_oversamples: int = 8) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """GPU Truncated SVD. Based on fbpca's version.

    Parameters
    ----------
    A : (M, N) torch.Tensor
    k : int
    n_iter : int
    n_oversamples : int

    Returns
    -------
    u : (M, k) torch.Tensor
    s : (k,) torch.Tensor
    vt : (k, N) torch.Tensor
    """
    m, n = A.shape
    Q = torch.rand(n, k + n_oversamples)
    Q = A @ Q
    Q, _ = torch.qr(Q)

    # Power iterations
    for _ in range(n_iter):
        Q = (Q.t() @ A).t()
        Q, _ = torch.qr(Q)
        Q = A @ Q
        Q, _ = torch.qr(Q)

    QA = Q.t() @ A
    # Transpose QA to make it tall-skinny as MAGMA has optimisations for this
    # (USVt)t = VStUt
    Va, s, R = torch.svd(QA.t(), some=True)
    U = Q @ R.t()

    return U[:, :k], s[:k], Va.t()[:k, :]
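# Usage sketch for gpu_tsvd (illustrative assumptions: a CPU tensor and a
# comparison against torch.svd). The routine does not place its random test
# matrix on A's device, so this sketch keeps everything on the CPU; for an
# exactly rank-k input the leading singular values should be recovered closely.
import torch

def _check_gpu_tsvd(m=120, n=60, k=5):
    torch.manual_seed(0)
    A = torch.randn(m, k) @ torch.randn(k, n)  # exactly rank k
    U, s, Vt = gpu_tsvd(A, k)
    _, s_exact, _ = torch.svd(A)
    assert U.shape == (m, k) and s.shape == (k,) and Vt.shape == (k, n)
    assert torch.allclose(s, s_exact[:k], rtol=1e-2, atol=1e-2)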
def random_svd(self, A, k):
    if self._bsize == 0:
        bsize = k
    else:
        # fall back to the configured block size
        bsize = self._bsize
    u = A.new_zeros((1, A.shape[1]))
    l = A.new_zeros((A.shape[0], 1))
    if A.shape[0] < A.shape[1]:
        n = A.shape[0]
        ind = 0
    else:
        n = A.shape[1]
        ind = 1
    tpose = False
    if ind == 0:
        tpose = True
        l = torch.t(u)
        u = A.new_ones((1, A.shape[0]))
        A = torch.t(A)

    K = A.new_zeros((A.shape[1], bsize * self._q))
    block = torch.randn(A.shape[1], bsize).to(A.device)
    block, _ = torch.qr(block)
    T = A.new_zeros((A.shape[1], bsize))
    for i in range(self._q):
        T = torch.matmul(A, block) - torch.matmul(l, torch.matmul(u, block))
        block = torch.matmul(torch.t(A), T) - torch.matmul(torch.t(u), torch.matmul(torch.t(l), T))
        block, _ = torch.qr(block)
        K[:, int(i * bsize):int((i + 1) * bsize)] = block.clone().detach()
    Q, _ = torch.qr(K)
    T = torch.matmul(A, Q) - torch.matmul(l, torch.matmul(u, Q))
    Ut, St, Vt = torch.svd(T)
    S = St[0:k]
    if tpose:
        V = Ut[:, 0:k]
        U = torch.matmul(Q, Vt[:, 0:k])
    else:
        U = Ut[:, 0:k]
        V = torch.matmul(Q, Vt[:, 0:k])
    return U, S, V
def projected(self):
    a = self.active
    L = self.K[a][:, a].cholesky()
    sigma = self.sigma
    A = torch.cat([self.K[:, a] / sigma.view(-1, 1), L.t()])
    O = torch.zeros(L.size(0)).type(L.type())
    Y = torch.cat([self.y / sigma, O])
    Q, R = torch.qr(A)
    mu = R.inverse() @ Q.t() @ Y
    delta = ((self.K[:, a] @ mu).view(-1) - self.y).abs()
    return a, mu, delta.mean(), delta.max()
def variable_with_orth_weight_decay(self, shape):
    s1 = torch.tensor(shape[1], dtype=torch.int32).to('cuda')
    s2 = torch.tensor(shape[1] / 2, dtype=torch.int32).to('cuda')
    w0_init, _ = torch.qr(torch.normal(0, 1, size=(s1, s2)))
    w0 = torch.nn.Parameter(w0_init).to('cuda')
    tmp1 = w0.view(1, s1, s2)
    tmp2 = w0.transpose(0, 1).view(1, s2, s1)
    tmp1 = self.tile(tmp1, 0, shape[0])
    tmp2 = self.tile(tmp2, 0, shape[0])
    return tmp1, tmp2
def to_orthogonal_matrix(self, seed):
    """
    Return an orthogonal matrix given an arbitrary square matrix seed.
    Random matrices are uniformly distributed according to the Haar measure,
    as explained here: https://arxiv.org/pdf/math-ph/0609050.pdf
    """
    q, r = torch.qr(seed)
    d = r.diag()
    ph = d / d.abs()
    output = q @ ph.diag() @ q
    return output
def __init__(self, dim, scramble=False, hetero=True, hidden=False, child=True, ones=True, noise_identity=True):
    self.hetero = hetero
    self.hidden = hidden
    self.dim = dim // 2
    print("ones" + str(ones))
    if ones:
        self.wxy = torch.eye(self.dim)
        if child:
            print("child " + str(child))
            self.wyz = torch.eye(self.dim)
        else:
            self.wyz = torch.zeros(self.dim, self.dim)
    else:
        self.wxy = torch.randn(self.dim, self.dim) / dim
        if child:
            self.wyz = torch.randn(self.dim, self.dim) / dim
        else:
            self.wyz = torch.zeros(self.dim, self.dim)

    if scramble:
        self.scramble, _ = torch.qr(torch.randn(dim, dim))
    else:
        self.scramble = torch.eye(dim)

    if hidden:
        if noise_identity == 0:
            print("noise_identity " + str(noise_identity))
            self.whx = torch.randn(self.dim, self.dim) / dim
            self.why = torch.randn(self.dim, self.dim) / dim
            self.whz = torch.randn(self.dim, self.dim) / dim
        else:
            if noise_identity == 1:
                print("noise_identity " + str(noise_identity))
                self.whx = torch.eye(self.dim, self.dim)
                self.why = torch.eye(self.dim, self.dim)
                self.whz = torch.eye(self.dim, self.dim)
            else:
                if noise_identity == 2:
                    print("noise_identity " + str(noise_identity))
                    self.whx = torch.rand(self.dim, self.dim) / dim
                    self.why = torch.rand(self.dim, self.dim) / dim
                    self.whz = torch.rand(self.dim, self.dim) / dim
    else:
        self.whx = torch.eye(self.dim, self.dim)
        self.why = torch.zeros(self.dim, self.dim)
        self.whz = torch.zeros(self.dim, self.dim)
def __init__(self):
    super(Invertible_1x1_Conv, self).__init__()
    assert hp.Decoder.Num_Split % 2 == 0

    weight = torch.qr(torch.FloatTensor(hp.Decoder.Num_Split, hp.Decoder.Num_Split).normal_())[0]
    if torch.det(weight) < 0:
        weight[:, 0] = -weight[:, 0]
    self.weight = torch.nn.Parameter(weight)
def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs):
    super().__init__()
    assert (n_split % 2 == 0)
    self.channels = channels
    self.n_split = n_split
    self.no_jacobian = no_jacobian

    w_init = torch.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0]
    if torch.det(w_init) < 0:
        w_init[:, 0] = -1 * w_init[:, 0]
    self.weight = nn.Parameter(w_init)
def forward(ctx, input):
    [bs, r, d] = input.size()
    Q = torch.zeros(bs, r, d, dtype=torch.float64).double().cuda()
    R = torch.zeros(bs, d, d, dtype=torch.float64).double().cuda()
    for id in range(bs):
        Q[id, :, :], R[id, :, :] = torch.qr(input[id, :, :].squeeze())
    ctx.save_for_backward(input, Q, R)
    return Q, R
def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
    unstructured_block = torch.randn((cols, cols), device=device)
    q, r = torch.qr(unstructured_block.cpu(), some=True)
    q, r = map(lambda t: t.to(device), (q, r))

    # proposed by @Parskatt
    # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf
    if qr_uniform_q:
        d = torch.diag(r, 0)
        q *= d.sign()
    return q.t()
def from_eigen(eigen):
    '''Construct a random matrix with the given eigenvalues.

    To construct such a matrix from the eigenvalue decomposition
    (i.e. U * Sigma * U.t()), we need to find a unitary matrix U, where Sigma
    is the diagonal matrix of the eigenvalues `eigen`. The matrix U can be the
    unitary matrix Q from the QR-decomposition of a randomly generated matrix.

    Args:
        eigen: A vector of size (Batch, Size).

    Returns:
        A random matrix of size (Batch, Size, Size).
    '''
    size = eigen.size(-1)
    Q, _ = torch.qr(torch.randn((size, size), dtype=eigen.dtype, device=eigen.device))
    return mul_diag(Q, eigen).matmul(Q.t())
def orthogonal(tensor, gain=1):
    """Fills the input Tensor or Variable with a (semi) orthogonal matrix, as
    described in "Exact solutions to the nonlinear dynamics of learning in deep
    linear neural networks" - Saxe, A. et al. (2013). The input tensor must have
    at least 2 dimensions, and for tensors with more than 2 dimensions the
    trailing dimensions are flattened.

    Args:
        tensor: an n-dimensional torch.Tensor or autograd.Variable, where n >= 2
        gain: optional scaling factor

    Examples:
        >>> w = torch.Tensor(3, 5)
        >>> nn.init.orthogonal(w)
    """
    if isinstance(tensor, Variable):
        orthogonal(tensor.data, gain=gain)
        return tensor

    if tensor.ndimension() < 2:
        raise ValueError("Only tensors with 2 or more dimensions are supported")

    rows = tensor.size(0)
    cols = tensor[0].numel()
    flattened = torch.Tensor(rows, cols).normal_(0, 1)

    # Compute the qr factorization
    q, r = torch.qr(flattened)
    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
    d = torch.diag(r, 0)
    ph = d.sign()
    q *= ph.expand_as(q)
    # Pad zeros to Q (if rows smaller than cols)
    if rows < cols:
        padding = torch.zeros(rows, cols - rows)
        if q.is_cuda:
            q = torch.cat([q, padding.cuda()], 1)
        else:
            q = torch.cat([q, padding], 1)

    tensor.view_as(q).copy_(q)
    tensor.mul_(gain)
    return tensor
def orthogonal_(tensor, gain=1):
    r"""Fills the input `Tensor` with a (semi) orthogonal matrix, as
    described in "Exact solutions to the nonlinear dynamics of learning in deep
    linear neural networks" - Saxe, A. et al. (2013). The input tensor must have
    at least 2 dimensions, and for tensors with more than 2 dimensions the
    trailing dimensions are flattened.

    Args:
        tensor: an n-dimensional `torch.Tensor`, where :math:`n \geq 2`
        gain: optional scaling factor

    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.orthogonal_(w)
    """
    if tensor.ndimension() < 2:
        raise ValueError("Only tensors with 2 or more dimensions are supported")

    rows = tensor.size(0)
    cols = tensor[0].numel()
    flattened = tensor.new(rows, cols).normal_(0, 1)

    if rows < cols:
        flattened.t_()

    # Compute the qr factorization
    q, r = torch.qr(flattened)
    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
    d = torch.diag(r, 0)
    ph = d.sign()
    q *= ph

    if rows < cols:
        q.t_()

    with torch.no_grad():
        tensor.view_as(q).copy_(q)
        tensor.mul_(gain)
    return tensor
def randomized_svd_gpu(M, n_components, n_oversamples=10, n_iter='auto',
                       transpose='auto', random_state=0, lib='cupy'):
    """Computes a truncated randomized SVD on GPU. Adapted from Sklearn.

    Parameters
    ----------
    M : ndarray or sparse matrix
        Matrix to decompose
    n_components : int
        Number of singular values and vectors to extract.
    n_oversamples : int (default is 10)
        Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. Smaller
        number can improve speed but can negatively impact the quality of
        approximation of singular vectors and singular values.
    n_iter : int or 'auto' (default is 'auto')
        Number of power iterations. It can be used to deal with very noisy
        problems. When 'auto', it is set to 4, unless `n_components` is small
        (< .1 * min(X.shape)), in which case `n_iter` is set to 7.
        This improves precision with few components.
    transpose : True, False or 'auto' (default)
        Whether the algorithm should be applied to M.T instead of M. The
        result should approximately be the same. The 'auto' mode will
        trigger the transposition if M.shape[1] > M.shape[0] since this
        implementation of randomized SVD tends to be a little faster in that
        case.
    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`.
    lib : {'cupy', 'pytorch'}, str optional
        Chooses the GPU library to be used.

    Notes
    -----
    This algorithm finds a (usually very good) approximate truncated
    singular value decomposition using randomization to speed up the
    computations. It is particularly fast on large matrices on which
    you wish to extract only a small number of components. In order to
    obtain further speed up, `n_iter` can be set <=2 (at the cost of
    loss of precision).

    References
    ----------
    * Finding structure with randomness: Stochastic algorithms for constructing
      approximate matrix decompositions
      Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061
    * A randomized algorithm for the decomposition of matrices
      Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
    * An implementation of a randomized algorithm for principal component
      analysis
      A. Szlam et al. 2014
    """
    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples
    n_samples, n_features = M.shape

    if n_iter == 'auto':
        # Checks if the number of iterations is explicitly specified
        n_iter = 7 if n_components < .1 * min(M.shape) else 4

    if transpose == 'auto':
        transpose = n_samples < n_features
    if transpose:
        M = M.T  # this implementation is a bit faster with smaller shape[1]

    if lib == 'cupy':
        M = cupy.array(M)
        M = cupy.asarray(M)

        # Generating normal random vectors with shape: (M.shape[1], n_random)
        Q = random_state.normal(size=(M.shape[1], n_random))
        Q = cupy.array(Q)
        Q = cupy.asarray(Q)

        # Perform power iterations with Q to further 'imprint' the top
        # singular vectors of M in Q
        for i in range(n_iter):
            Q = cupy.dot(M, Q)
            Q = cupy.dot(M.T, Q)

        # Sample the range of M by linear projection of Q.
        # Extract an orthonormal basis
        Q, _ = cupy.linalg.qr(cupy.dot(M, Q), mode='reduced')

        # project M to the (k + p) dimensional space using the basis vectors
        B = cupy.dot(Q.T, M)

        B = cupy.array(B)
        Q = cupy.array(Q)
        # compute the SVD on the thin matrix: (k + p) wide
        Uhat, s, V = cupy.linalg.svd(B, full_matrices=False, compute_uv=True)
        del B
        U = cupy.dot(Q, Uhat)

        if transpose:
            # transpose back the results according to the input convention
            return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
        else:
            return U[:, :n_components], s[:n_components], V[:n_components, :]

    elif lib == 'pytorch':
        M_gpu = torch.Tensor.cuda(torch.from_numpy(M.astype('float32')))

        # Generating normal random vectors with shape: (M.shape[1], n_random)
        Q = torch.cuda.FloatTensor(M_gpu.shape[1], n_random).normal_()

        # Perform power iterations with Q to further 'imprint' the top
        # singular vectors of M in Q
        for i in range(n_iter):
            Q = torch.mm(M_gpu, Q)
            Q = torch.mm(torch.transpose(M_gpu, 0, 1), Q)

        # Sample the range of M by linear projection of Q.
        # Extract an orthonormal basis
        Q, _ = torch.qr(torch.mm(M_gpu, Q))

        # project M to the (k + p) dimensional space using the basis vectors
        B = torch.mm(torch.transpose(Q, 0, 1), M_gpu)

        # compute the SVD on the thin matrix: (k + p) wide
        Uhat, s, V = torch.svd(B)
        del B
        U = torch.mm(Q, Uhat)

        if transpose:
            # transpose back the results according to the input convention
            return (torch.transpose(V[:n_components, :], 0, 1),
                    s[:n_components],
                    torch.transpose(U[:, :n_components], 0, 1))
        else:
            return U[:, :n_components], s[:n_components], V[:n_components, :]