def power_iteration(
    operator: Operator,
    steps: int = 20,
    error_threshold: float = 1e-4,
    momentum: float = 0.0,
    use_gpu: bool = True,
    fp16: bool = False,
    init_vec: torch.Tensor = None,
) -> Tuple[float, torch.Tensor]:
    """
    Compute the dominant eigenvalue/eigenvector of a matrix.
    operator: linear Operator giving us matrix-vector product access
    steps: number of update steps to take
    returns: (principal eigenvalue, principal eigenvector) pair
    """
    vector_size = operator.size  # input dimension of operator
    if init_vec is None:
        vec = torch.rand(vector_size)
    else:
        vec = init_vec
    vec = utils.maybe_fp16(vec, fp16)

    if use_gpu:
        vec = vec.cuda()

    prev_lambda = 0.0
    prev_vec = utils.maybe_fp16(torch.randn_like(vec), fp16)
    for i in range(steps):
        prev_vec = vec / (torch.norm(vec) + 1e-6)
        new_vec = utils.maybe_fp16(operator.apply(vec), fp16) - momentum * prev_vec
        # need to handle the case where we end up in the nullspace of the
        # operator. in this case, we are done.
        if torch.norm(new_vec).item() == 0.0:
            return 0.0, new_vec
        lambda_estimate = vec.dot(new_vec).item()
        diff = lambda_estimate - prev_lambda
        vec = new_vec.detach() / torch.norm(new_vec)
        if lambda_estimate == 0.0:  # for low-rank operators
            error = 1.0
        else:
            error = np.abs(diff / lambda_estimate)
        utils.progress_bar(i, steps, "power iter error: %.4f" % error)
        if error < error_threshold:
            break
        prev_lambda = lambda_estimate
    return lambda_estimate, vec
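# Usage sketch (illustrative, not part of the library): power_iteration only
# needs an object exposing `size` and `apply`, the two attributes used above,
# so a dense symmetric matrix can be wrapped in a small duck-typed operator.
import torch


class MatrixOperator:
    """Toy operator wrapping an explicit symmetric matrix."""

    def __init__(self, mat: torch.Tensor):
        self.mat = mat
        self.size = mat.shape[0]

    def apply(self, vec: torch.Tensor) -> torch.Tensor:
        return self.mat @ vec


# The dominant eigenvalue of diag(5, 2, 1) is 5; the recovered eigenvector
# should align (up to sign) with the first standard basis vector.
A = torch.diag(torch.tensor([5.0, 2.0, 1.0]))
eigenval, eigenvec = power_iteration(MatrixOperator(A), steps=100, use_gpu=False)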
def _prepare_grad(self):
    """
    Compute gradient w.r.t loss over all parameters and vectorize
    """
    try:
        all_inputs, all_targets = next(self.dataloader_iter)
    except StopIteration:
        self.dataloader_iter = iter(self.dataloader)
        all_inputs, all_targets = next(self.dataloader_iter)

    num_chunks = max(1, len(all_inputs) // self.max_samples)
    grad_vec = None

    input_chunks = all_inputs.chunk(num_chunks)
    target_chunks = all_targets.chunk(num_chunks)
    for input, target in zip(input_chunks, target_chunks):
        if self.use_gpu:
            input = input.cuda()
            target = target.cuda()
        output = self.model(input)
        loss = self.criterion(output, target)
        grad_dict = torch.autograd.grad(
            loss, self.model.parameters(), create_graph=True
        )
        if grad_vec is not None:
            grad_vec += torch.cat([g.contiguous().view(-1) for g in grad_dict])
        else:
            grad_vec = torch.cat([g.contiguous().view(-1) for g in grad_dict])
        grad_vec = utils.maybe_fp16(grad_vec, self.fp16)
    grad_vec /= num_chunks
    self.grad_vec = grad_vec
    return self.grad_vec
def _apply_batch(self, vec):
    # compute original gradient, tracking computation graph
    self.zero_grad()
    grad_vec = self._prepare_grad()
    self.zero_grad()
    # take the second gradient
    grad_grad = torch.autograd.grad(
        grad_vec, self.model.parameters(), grad_outputs=vec, only_inputs=True
    )
    # concatenate the results over the different components of the network
    hessian_vec_prod = torch.cat([g.contiguous().view(-1) for g in grad_grad])
    hessian_vec_prod = utils.maybe_fp16(hessian_vec_prod, self.fp16)
    return hessian_vec_prod
def _apply_batch(self, vec: torch.Tensor) -> torch.Tensor:
    """
    Computes the Hessian-vector product for a mini-batch from the dataset.
    """
    # compute original gradient, tracking computation graph
    self._zero_grad()
    grad_vec = self._prepare_grad()
    self._zero_grad()
    # take the second gradient
    # this is the derivative of <grad_vec, v> where <,> is an inner product.
    hessian_vec_prod_dict = torch.autograd.grad(
        grad_vec, self.model.parameters(), grad_outputs=vec, only_inputs=True
    )
    # concatenate the results over the different components of the network
    hessian_vec_prod = torch.cat(
        [g.contiguous().view(-1) for g in hessian_vec_prod_dict]
    )
    hessian_vec_prod = utils.maybe_fp16(hessian_vec_prod, self.fp16)
    return hessian_vec_prod
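# The two backward passes above are the standard double-backward Hessian-vector
# product: H v = d/dtheta <grad(loss), v>. A self-contained sketch of the same
# trick outside the operator class (toy model and data, for illustration only):
import torch


def toy_hvp() -> torch.Tensor:
    model = torch.nn.Linear(4, 1)
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)

    params = list(model.parameters())
    # first backward pass, keeping the graph so we can differentiate again
    grads = torch.autograd.grad(loss, params, create_graph=True)
    flat_grad = torch.cat([g.contiguous().view(-1) for g in grads])

    # second backward pass: differentiate <flat_grad, v> w.r.t. the parameters
    v = torch.randn_like(flat_grad)
    hvp = torch.autograd.grad(flat_grad, params, grad_outputs=v)
    return torch.cat([h.contiguous().view(-1) for h in hvp])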
def _prepare_grad(self) -> torch.Tensor:
    """
    Compute gradient w.r.t loss over all parameters and vectorize
    """
    try:
        all_inputs, all_targets = next(self.dataloader_iter)
    except StopIteration:
        self.dataloader_iter = iter(self.dataloader)
        all_inputs, all_targets = next(self.dataloader_iter)

    num_chunks = max(1, len(all_inputs) // self.max_possible_gpu_samples)
    grad_vec = None

    # This will do the "gradient chunking trick" to create micro-batches
    # when the batch size is larger than what will fit in memory.
    # WARNING: this may interact poorly with batch normalization.
    input_microbatches = all_inputs.chunk(num_chunks)
    target_microbatches = all_targets.chunk(num_chunks)
    for input, target in zip(input_microbatches, target_microbatches):
        if self.use_gpu:
            input = input.cuda()
            target = target.cuda()
        output = self.model(input)
        loss = self.criterion(output, target)
        grad_dict = torch.autograd.grad(
            loss, self.model.parameters(), create_graph=True
        )
        if grad_vec is not None:
            grad_vec += torch.cat([g.contiguous().view(-1) for g in grad_dict])
        else:
            grad_vec = torch.cat([g.contiguous().view(-1) for g in grad_dict])
        grad_vec = utils.maybe_fp16(grad_vec, self.fp16)
    grad_vec /= num_chunks
    self.grad_vec = grad_vec
    return self.grad_vec
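# Sanity-check sketch (illustrative): averaging per-chunk gradients of a
# mean-reduced loss reproduces the full-batch gradient when the chunks have
# equal size, which is what the division by num_chunks above relies on.
import torch

model = torch.nn.Linear(3, 1)
x, y = torch.randn(16, 3), torch.randn(16, 1)
loss_fn = torch.nn.functional.mse_loss

full = torch.autograd.grad(loss_fn(model(x), y), model.parameters())
full_flat = torch.cat([g.view(-1) for g in full])

chunked = None
for xc, yc in zip(x.chunk(4), y.chunk(4)):
    g = torch.autograd.grad(loss_fn(model(xc), yc), model.parameters())
    flat = torch.cat([gi.view(-1) for gi in g])
    chunked = flat if chunked is None else chunked + flat
chunked /= 4

assert torch.allclose(full_flat, chunked, atol=1e-5)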
def lanczos(
    operator,
    num_eigenthings=10,
    which="LM",
    max_steps=20,
    tol=1e-6,
    num_lanczos_vectors=None,
    init_vec=None,
    use_gpu=False,
    fp16=False,
):
    """
    Use the scipy.sparse.linalg.eigsh hook to the ARPACK Lanczos algorithm
    to find the top k eigenvalues/eigenvectors.

    Parameters
    -------------
    operator: power_iter.Operator
        linear operator to solve.
    num_eigenthings : int
        number of eigenvalue/eigenvector pairs to compute
    which : str ['LM', 'SM', 'LA', 'SA']
        L, S = largest, smallest. M, A = in magnitude, algebraic.
        SM = smallest in magnitude. LA = largest algebraic.
    max_steps : int
        maximum number of Arnoldi updates
    tol : float
        relative accuracy of eigenvalues / stopping criterion
    num_lanczos_vectors : int
        number of Lanczos vectors to compute. if None, > 2*num_eigenthings
    init_vec: [torch.Tensor, torch.cuda.Tensor]
        if None, use a random tensor. this is the init vec for Arnoldi updates.
    use_gpu: bool
        if true, use cuda tensors.
    fp16: bool
        if true, keep operator input/output in fp16 instead of fp32.

    Returns
    ----------------
    eigenvalues : np.ndarray
        array containing `num_eigenthings` eigenvalues of the operator
    eigenvectors : np.ndarray
        array containing `num_eigenthings` eigenvectors of the operator
    """
    if isinstance(operator.size, int):
        size = operator.size
    else:
        size = operator.size[0]
    shape = (size, size)

    if num_lanczos_vectors is None:
        num_lanczos_vectors = min(2 * num_eigenthings, size - 1)
    if num_lanczos_vectors < 2 * num_eigenthings:
        warn(
            "[lanczos] number of lanczos vectors should usually be > 2*num_eigenthings"
        )

    def _scipy_apply(x):
        x = torch.from_numpy(x)
        x = utils.maybe_fp16(x, fp16)
        if use_gpu:
            x = x.cuda()
        out = operator.apply(x)
        out = utils.maybe_fp16(out, fp16)
        out = out.cpu().numpy()
        return out

    scipy_op = ScipyLinearOperator(shape, _scipy_apply)
    if init_vec is None:
        init_vec = np.random.rand(size)
    elif isinstance(init_vec, torch.Tensor):
        init_vec = init_vec.cpu().numpy()
    init_vec = utils.maybe_fp16(init_vec, fp16)

    eigenvals, eigenvecs = eigsh(
        A=scipy_op,
        k=num_eigenthings,
        which=which,
        maxiter=max_steps,
        tol=tol,
        ncv=num_lanczos_vectors,
        v0=init_vec,  # forward the starting vector to ARPACK
        return_eigenvectors=True,
    )
    return eigenvals, eigenvecs.T
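# Usage sketch (illustrative): lanczos accepts the same duck-typed operator
# interface as power_iteration. Wrapping a dense symmetric matrix gives a quick
# way to sanity-check the returned eigenvalues against torch.linalg.eigh.
import torch


class DenseOperator:
    def __init__(self, mat):
        self.mat = mat
        self.size = mat.shape[0]

    def apply(self, vec):
        return self.mat @ vec


# use float64 so the numpy vectors handed over by _scipy_apply match dtypes
M = torch.randn(50, 50, dtype=torch.float64)
M = (M + M.T) / 2  # eigsh assumes a symmetric (Hermitian) operator
vals, vecs = lanczos(DenseOperator(M), num_eigenthings=5, max_steps=100, use_gpu=False)
# vals has 5 entries; vecs has shape (5, 50), one eigenvector per row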
# deflate the current operator: subtract the contribution of the eigenpair
# already found so that subsequent power iterations converge to the next one.
# the default arguments freeze op/val/vec inside the closure at definition time.
def _new_op_fn(x, op=current_op, val=eigenval, vec=eigenvec):
    return utils.maybe_fp16(op.apply(x), fp16) - _deflate(x, val, vec)
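# Context sketch (assumed from the closure arguments, not the exact enclosing
# code): with a deflation of the form _deflate(x, val, vec) = val * <vec, x> * vec,
# the new operator is A' = A - lambda * v v^T, whose dominant eigenpair is the
# next-largest eigenpair of A. The names `_deflate`, `LambdaOperator`, and the
# loop below are illustrative assumptions.
def _deflate(x, val, vec):
    # project x onto vec and scale by the eigenvalue already found
    return val * vec.dot(x) * vec


def deflated_top_k(operator, num_eigenthings=10, use_gpu=False, fp16=False):
    eigenvals, eigenvecs = [], []
    current_op = operator
    for _ in range(num_eigenthings):
        eigenval, eigenvec = power_iteration(current_op, use_gpu=use_gpu, fp16=fp16)
        eigenvals.append(eigenval)
        eigenvecs.append(eigenvec)

        # default arguments freeze the current operator and eigenpair in the
        # closure, exactly as in _new_op_fn above
        def _new_op_fn(x, op=current_op, val=eigenval, vec=eigenvec):
            return utils.maybe_fp16(op.apply(x), fp16) - _deflate(x, val, vec)

        current_op = LambdaOperator(_new_op_fn, operator.size)
    return eigenvals, eigenvecs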