def solve(self, trust_radius): """Solve quadratic subproblem""" # Compute the Newton point. # This is the optimum for the quadratic model function. # If it is inside the trust radius then return this point. p_best = self.newton_point() if norm(p_best) < trust_radius: hits_boundary = False return p_best, hits_boundary # Compute the Cauchy point. # This is the predicted optimum along the direction of steepest descent. p_u = self.cauchy_point() # If the Cauchy point is outside the trust region, # then return the point where the path intersects the boundary. p_u_norm = norm(p_u) if p_u_norm >= trust_radius: p_boundary = p_u * (trust_radius / p_u_norm) hits_boundary = True return p_boundary, hits_boundary # Compute the intersection of the trust region boundary # and the line segment connecting the Cauchy and Newton points. # This requires solving a quadratic equation. # ||p_u + t*(p_best - p_u)||**2 == trust_radius**2 # Solve this for positive time t using the quadratic formula. _, tb = self.get_boundaries_intersections(p_u, p_best - p_u, trust_radius) p_boundary = p_u + tb * (p_best - p_u) hits_boundary = True return p_boundary, hits_boundary
def identify_bias_between_word_sets(self, social_group_word_sets,
                                    n_components=10, freq_spaces=None, k=2):
    embeddings = find_embedding_layer(self.model)
    matrix = []
    for word_set in social_group_word_sets:
        word_ids = self.tokenizer.convert_tokens_to_ids(word_set)
        target_embeddings = embeddings(torch.tensor(word_ids))
        if freq_spaces:
            freq_subspaces = torch.load(freq_spaces)
            freq_subspaces = torch.from_numpy(freq_subspaces).float()[:k]
            freq_norms = LA.norm(freq_subspaces, dim=-1).view(-1, 1)
            embed_norms = LA.norm(target_embeddings, dim=-1)
            freq_subspaces = (((target_embeddings.mm(freq_subspaces.T))
                               * freq_subspaces / freq_norms).T
                              * embed_norms).T
            target_embeddings -= freq_subspaces
        center = target_embeddings.mean(dim=0)
        matrix.extend((target_embeddings - center).detach())
    matrix = torch.stack(matrix)
    return self.__do_pca(matrix, n_components)
def create_cp(dims, rank, sparsity=None, method='rand', weights=False,
              return_tensor=False, noise=None, sparse_noise=True):
    # TODO: investigate performance impact of setting backend here
    tl.set_backend('pytorch')
    if method == 'rand':
        randfunc = torch.rand
    elif method == 'randn':
        randfunc = torch.randn
    else:
        raise NotImplementedError(f'Unknown random method: {method}')

    n_dims = len(dims)
    factors = [randfunc((dim, rank)) for dim in dims]

    if sparsity is not None:
        if isinstance(sparsity, float):
            sparsity = [sparsity for _ in range(n_dims)]
        elif not isinstance(sparsity, list) and not isinstance(
                sparsity, tuple):
            raise ValueError(
                'Sparsity parameter should either be a float or tuple/list.')

        # Sparsify factors
        for dim in range(n_dims):
            n_el = dims[dim] * rank
            to_del = round(sparsity[dim] * n_el)
            if to_del == 0:
                continue
            idxs = torch.tensor(random.sample(range(n_el), to_del))
            factors[dim].view(-1)[idxs] = 0
            # torch.randperm(n_el, device=device)[:n_select]

    ten = None
    # Add noise
    if noise is not None:
        ten = tl.cp_to_tensor((torch.ones(rank), factors))
        if (sparsity is None or not sparse_noise):
            nten = torch.randn(ten.size())
            ten += noise * (norm(ten) / norm(nten)) * nten
        else:
            flat = ten.view(-1)
            nzs = torch.nonzero(flat, as_tuple=True)[0]
            nvec = torch.randn(nzs.size(0))
            flat[nzs] += noise * (norm(ten) / norm(nvec)) * nvec

    if return_tensor:
        if ten is None:
            return tl.cp_to_tensor((torch.ones(rank), factors))
        return ten
    if weights:
        return torch.ones(rank), factors
    return factors
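# Usage sketch (not in the original source): build rank-3 CP factors with
# 50% sparse factors, then a dense tensor with 10% relative noise. Assumes
# `tensorly` and the module-level imports (`torch`, `random`, `norm`) used
# by `create_cp` are available.
import torch
import tensorly as tl

factors = create_cp((20, 30, 40), rank=3, sparsity=0.5)
full = tl.cp_to_tensor((torch.ones(3), factors))  # dense reconstruction
noisy = create_cp((20, 30, 40), rank=3, noise=0.1, return_tensor=True)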
def forward(self, input: Tensor, target: Tensor) -> Tensor:
    dist_euc = LA.norm(input - target, dim=1)
    norm_input = LA.norm(input, dim=1)
    norm_target = LA.norm(target, dim=1)
    dist_hype = torch.acosh(1 + 2 * dist_euc**2 /
                            ((1 - norm_input**2) * (1 - norm_target**2)))
    loss = torch.mean(dist_hype)
    return loss
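# Worked check (standalone; it mirrors the forward above rather than
# calling the module). The Poincare distance requires both points strictly
# inside the unit ball, otherwise the acosh argument is undefined.
import torch
from torch import linalg as LA

x = torch.tensor([[0.1, 0.2]])
y = torch.tensor([[0.3, -0.4]])
d = LA.norm(x - y, dim=1)
dist = torch.acosh(1 + 2 * d**2 / ((1 - LA.norm(x, dim=1)**2) *
                                   (1 - LA.norm(y, dim=1)**2)))
print(dist)  # ~1.39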
def NMELoss(predicted_landmark, target_landmark):
    # landmark is a numpy array which has shape [5, 2]
    num_face_landmark = 5
    leye_nose_vec = torch.from_numpy(target_landmark[0] - target_landmark[2])
    reye_nose_vec = torch.from_numpy(target_landmark[1] - target_landmark[2])
    inter_ocular_distance = LA.norm(leye_nose_vec) + LA.norm(reye_nose_vec)
    loss = nn.MSELoss(reduction="sum")
    preloss = loss(torch.from_numpy(predicted_landmark),
                   torch.from_numpy(target_landmark))
    nme_loss = torch.sqrt(preloss) / (inter_ocular_distance *
                                      num_face_landmark)
    return nme_loss
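# Usage sketch (assumed landmark order: left eye, right eye, nose, left and
# right mouth corners; the original only states the [5, 2] shape).
import numpy as np

target = np.array([[30., 30.], [70., 30.], [50., 50.],
                   [35., 75.], [65., 75.]], dtype=np.float32)
predicted = target + 0.5 * np.random.randn(5, 2).astype(np.float32)
print(NMELoss(predicted, target))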
def norm_squared(vi, vj):
    if ML_ENGINE == "PyTorch":
        return LA.norm(vi - vj).item()**2
    else:
        fvi = np.concatenate([x.ravel() for x in vi])
        fvj = np.concatenate([x.ravel() for x in vj])
        return np.linalg.norm(fvi - fvj)**2
def generate_change_tensor(
        self, preprocessed_image: torch.Tensor) -> torch.Tensor:
    """
    Generates a change tensor by iteratively stepping towards the
    linearized minimal distance to the hyperplane that approximates the
    decision boundary.

    Arguments:
     - preprocessed_image (torch.Tensor): normalized and preprocessed
       image with shape [channels, height, width]

    Returns:
        torch.Tensor: tensor to be added to the image to change prediction
    """
    self.model.classifier.eval()
    with torch.no_grad():
        original_prediction = self.model.classifier(
            preprocessed_image.unsqueeze(0))[0]
        original_prediction_class = torch.argmax(original_prediction)
    perturbated_img = preprocessed_image.clone().detach()
    perturbation = torch.zeros_like(perturbated_img)
    for _ in range(self.max_iter):
        with torch.no_grad():
            perturbated_img = clipped_renormalize(perturbated_img)
            predicted = self.model.classifier(
                perturbated_img.unsqueeze(0))[0]
            predicted_class = torch.argmax(predicted)
        if predicted_class != original_prediction_class:
            return perturbation
        jacobian = agf.jacobian(
            lambda x: self.model.classifier(x.unsqueeze(0))[0],
            perturbated_img)
        with torch.no_grad():
            # Differences of gradients and scores w.r.t. the current class.
            w = torch.cat([
                jacobian[:predicted_class],
                jacobian[(predicted_class + 1):]
            ]) - jacobian[predicted_class]
            f = torch.cat([
                predicted[:predicted_class],
                predicted[(predicted_class + 1):]
            ]) - predicted[predicted_class]
            # Pick the closest hyperplane and step slightly past it.
            l = torch.argmin(
                torch.abs(f) /
                la.norm(torch.flatten(w, start_dim=1), dim=1))
            r = (torch.abs(f[l]) / la.norm(torch.flatten(w[l]))**2) * w[l]
            perturbation = perturbation + 1.1 * r
            perturbated_img = perturbated_img + 1.1 * r
    return perturbation
def __init__(self, x, fun, k_easy=0.1, k_hard=0.2):
    super().__init__(x, fun)

    # When the trust-region shrinks in two consecutive
    # calculations (``tr_radius < previous_tr_radius``)
    # the lower bound ``lambda_lb`` may be reused,
    # facilitating convergence. To indicate that no
    # previous value is known at first, ``previous_tr_radius``
    # is set to -1 and ``lambda_lb`` to None.
    self.previous_tr_radius = -1
    self.lambda_lb = None

    self.niter = 0
    self.EPS = torch.finfo(x.dtype).eps

    # ``k_easy`` and ``k_hard`` are parameters used
    # to determine the stop criteria of the iterative
    # subproblem solver. Take a look at pp. 194-197
    # from reference [1]_ for a more detailed description.
    self.k_easy = k_easy
    self.k_hard = k_hard

    # Get LAPACK function for Cholesky decomposition.
    try:
        # cholesky_ex only available in
        # pytorch >= 1.9.0.dev20210504
        func = torch.linalg.cholesky_ex
        self.torch_cholesky = True
    except AttributeError:
        # if we don't have torch cholesky, use potrf from scipy
        self.cholesky, = get_lapack_funcs(('potrf', ),
                                          (self.hess.cpu().numpy(), ))
        self.torch_cholesky = False

    # Get info about Hessian
    self.dimension = len(self.hess)
    self.hess_gershgorin_lb, self.hess_gershgorin_ub = gershgorin_bounds(
        self.hess)
    self.hess_inf = norm(self.hess, float('inf'))
    self.hess_fro = norm(self.hess, 'fro')

    # A constant such that for vectors smaller than that
    # backward substitution is not reliable. It was established
    # based on Golub, G. H., Van Loan, C. F. (2013).
    # "Matrix computations". Fourth Edition. JHU press., p.165.
    self.CLOSE_TO_ZERO = self.dimension * self.EPS * self.hess_inf
def forward(self, x, eps=1e-8):
    desc = []
    for b in self.blocks:
        x = b(x)
        b_desc = x.amax(dim=(-2, -1))  # global max-pool per channel
        desc.append(b_desc)
    desc = torch.cat(desc, dim=1)
    # L2-normalize the concatenated descriptor.
    return desc / tla.norm(desc, dim=1, keepdim=True).clamp(min=eps)
def als_loss(self, *args):
    z = self(*args)
    self.update_covariances(*z)
    covariance_inv = [
        compute_matrix_power(cov, -0.5, self.eps) for cov in self.covs
    ]
    # Whitened predictions, one per view (detached as ALS targets).
    preds = [
        matmul(z_i, covariance_inv[i]).detach() for i, z_i in enumerate(z)
    ]
    # Pair each view with the prediction from the opposite end:
    # with `start=1`, view 0 is matched against preds[-1], and so on.
    losses = [
        mean(norm(z_i - preds[-i], dim=0))
        for i, z_i in enumerate(z, start=1)
    ]
    obj = self.objective.loss(*z)
    return losses, obj
def estimate_smallest_singular_value(U) -> Tuple[Tensor, Tensor]:
    """Given upper triangular matrix ``U``, estimate the smallest singular
    value and the corresponding right singular vector in O(n**2) operations.

    A vector `e` with components selected from {+1, -1}
    is selected so that the solution `w` to the system
    `U.T w = e` is as large as possible. Implementation
    based on algorithm 3.5.1, p. 142, from reference [1]_
    adapted for lower triangular matrix.

    References
    ----------
    .. [1] G.H. Golub, C.F. Van Loan. "Matrix computations".
           Fourth Edition. JHU press. pp. 140-142.
    """
    U = torch.atleast_2d(U)
    UT = U.T

    m, n = U.shape
    if m != n:
        raise ValueError("A square triangular matrix should be provided.")

    p = torch.zeros(n, dtype=U.dtype, device=U.device)
    w = torch.empty(n, dtype=U.dtype, device=U.device)

    for k in range(n):
        wp = (1 - p[k]) / UT[k, k]
        wm = (-1 - p[k]) / UT[k, k]
        pp = p[k + 1:] + UT[k + 1:, k] * wp
        pm = p[k + 1:] + UT[k + 1:, k] * wm

        if wp.abs() + norm(pp, 1) >= wm.abs() + norm(pm, 1):
            w[k] = wp
            p[k + 1:] = pp
        else:
            w[k] = wm
            p[k + 1:] = pm

    # The system `U v = w` is solved using backward substitution.
    v = torch.triangular_solve(w.view(-1, 1), U)[0].view(-1)

    v_norm = norm(v)
    s_min = norm(w) / v_norm  # Smallest singular value
    z_min = v / v_norm  # Associated vector

    return s_min, z_min
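# Sanity-check sketch (not part of the original): the O(n**2) estimate is
# an upper bound on the true smallest singular value, so it should land
# close to (and above) the value from a full SVD.
import torch

A = torch.randn(50, 50, dtype=torch.float64)
U_test = torch.linalg.qr(A)[1]  # an upper triangular test matrix
s_min, z_min = estimate_smallest_singular_value(U_test)
print(float(s_min), float(torch.linalg.svdvals(U_test)[-1]))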
def cosine_loss(X, mu_tilde, pi_tilde, alpha):
    """
    Computes the Cosine loss.

    Arguments:
        X: array-like, shape=(batch_size, n_features)
            Input batch matrix.
        mu_tilde: array-like, shape=(batch_size, n_features)
            Matrix in which each row represents the assigned mean vector.

    Returns:
        loss: scalar tensor
            Per-sample losses summed over the batch.
    """
    # Normalize rows so the inner product below is the cosine similarity.
    X_norm = X / LA.norm(X, 2, dim=1, keepdim=True)
    mu_tilde_norm = mu_tilde / LA.norm(mu_tilde, 2, dim=1, keepdim=True)
    return torch.sum((1 - torch.sum(X_norm * mu_tilde_norm, dim=1)) -
                     torch.log(pi_tilde) / alpha)
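# Usage sketch (assumptions: `pi_tilde` holds positive assignment
# probabilities per sample and `alpha` is a scalar weight; neither is
# documented in the original docstring).
import torch

X = torch.randn(16, 8)
mu_tilde = torch.randn(16, 8)
pi_tilde = torch.full((16,), 0.25)
print(cosine_loss(X, mu_tilde, pi_tilde, alpha=1.0))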
def pair_norm(labels, features):
    norm = 0
    count = 0
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            if labels[i] == labels[j]:
                count += 1
                norm += la.norm(features[i] - features[j],
                                ord=2, dim=0).sum()
    return norm / count
def forward(self, x, eps=1e-8):
    # Mean and STD as expected by pretrained Torch models
    # (from https://pytorch.org/docs/stable/torchvision/models.html)
    # but scaled to match the -1 to 1 scale
    x = ttf.normalize(x, [0.485 * 2 - 1, 0.456 * 2 - 1, 0.406 * 2 - 1],
                      [0.229 * 2, 0.224 * 2, 0.225 * 2])
    x = self.block1(x)
    desc_1 = x.amax(dim=(-2, -1))
    x = self.block2(x)
    desc_2 = x.amax(dim=(-2, -1))
    desc = torch.cat((desc_1, desc_2), dim=1)
    return desc / tla.norm(desc, dim=1, keepdim=True).clamp(min=eps)
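# Usage sketch (the module name `DescriptorNet` is hypothetical; only the
# forward above is from the original). Since descriptors are L2-normalized,
# a plain dot product yields cosine similarity between images.
import torch

net = DescriptorNet().eval()
with torch.no_grad():
    a = net(torch.rand(1, 3, 224, 224) * 2 - 1)  # inputs scaled to [-1, 1]
    b = net(torch.rand(1, 3, 224, 224) * 2 - 1)
similarity = (a * b).sum(dim=1)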
def check_convergence(self, x):
    z = (x - self._old_repr)[:, :self._converged_unit + 1].reshape(-1)
    difference = linalg.norm(z) / len(z)
    if difference <= self._tol:
        self._sequence += 1
        if self._sequence == self._sequence_bound:
            self._sequence = 0
            self._converged_unit += 1
            if self._converged_unit == self._dropout_dim:
                self._has_converged = True
    else:
        self._sequence = 0
def interpolate(self, num_samples, z1=None, z2=None):
    if z1 is None and z2 is None:
        #z1 = torch.randn(1, *self.shape, device=self.buffer.device, dtype=self.buffer.dtype) * 0.01
        #z2 = (torch.round(torch.rand(1, *self.shape, device=self.buffer.device, dtype=self.buffer.dtype)) * 2.0) - 1.0
        # select z2 as a point on the tail with ~5% probability
        z2 = torch.randn(1, *self.shape, device=self.buffer.device,
                         dtype=self.buffer.dtype)
        z2 = 1.4 * z2 / norm(z2)
        z1 = z2 * -1  # opposite tail
    elif z2 is None:
        # find the unit vector through z1, and scale it to a point with ~5% probability
        z2 = 1.4 * z1 / norm(z1)
        z1 = z2 * -1  # opposite tail
    else:
        assert z1.shape == z2.shape
    return torch.cat(
        [w * z2 + (1.0 - w) * z1 for w in np.linspace(0, 1, num_samples)],
        dim=0)
def _get_weights(self, class_idx, scores=None):
    """Computes the weight coefficients of the hooked activation maps"""

    # Normalize the activation
    upsampled_a = self.hook_a  # self._normalize(self.hook_a)
    upsampled_a = (upsampled_a - upsampled_a.min()) / (upsampled_a.max() -
                                                       upsampled_a.min())

    # Upsample it to input_size
    # 1 * O * M * N
    # upsampled_a = F.interpolate(upsampled_a, self._input.shape[-2:], mode='bilinear', align_corners=False)

    # Use it as a mask
    # O * I * H * W
    # Initialize weights
    # weights = torch.zeros(upsampled_a.shape[0], dtype=upsampled_a.dtype).to(device=upsampled_a.device)
    import torch.linalg as LA
    norm = LA.norm(upsampled_a.view(*upsampled_a.shape[:-2], -1, 1),
                   1, dim=2)
    norm = (norm - norm.min()) / (norm.max() - norm.min())
    max_tensor = upsampled_a.view(1, upsampled_a.shape[1], -1, 1).max(dim=2)
    print(max_tensor[0])
    max_sum = max_tensor[0].sum(dim=1)
    weights = max_tensor[0] / max_sum
    weights = weights.squeeze(0).squeeze(-1)
    norm = norm.squeeze(0).squeeze(-1)
    weights = (weights - weights.min()) / (weights.max() - weights.min())
    weights *= (1 - norm)
    weights = weights ** self.pow
    # print(weights, max_tensor[0])

    # Disable hook updates
    self._hooks_enabled = False

    # Process by chunk (GPU RAM limitation)
    '''
    for idx in range(math.ceil(weights.shape[0] / self.bs)):
        selection_slice = slice(idx * self.bs,
                                min((idx + 1) * self.bs, weights.shape[0]))
        with torch.no_grad():
            # Get the softmax probabilities of the target class
            weights[selection_slice] = F.softmax(
                self.model(masked_input[selection_slice]),
                dim=1)[:, class_idx]
    '''

    # Re-enable hook updates
    self._hooks_enabled = True

    return weights
def check_unit_convergence(autoencoder, batch: torch.Tensor,
                           old_repr: torch.Tensor, unit: int,
                           succession: list, eps: float,
                           bound: int) -> bool:
    new_repr = autoencoder.encode(batch)
    difference = linalg.norm(
        (new_repr - old_repr)[:, :unit + 1]) / (len(batch) * (unit + 1))
    if difference <= eps:
        succession[0] += 1
    else:
        succession[0] = 0
    if succession[0] == bound:
        succession[0] = 0
        return True
    return False
def _matrix_normalize(input: Tensor, dim: int) -> Tensor:
    """
    Center and normalize according to the Frobenius norm of the centered data.

    Note:
        - this does not create standardized random variables in a random vector.
    ref:
        - https://stats.stackexchange.com/questions/544812/how-should-one-normalize-activations-of-batches-before-passing-them-through-a-si
    :param input:
    :param dim:
    :return:
    """
    from torch.linalg import norm
    X_centered: Tensor = _zero_mean(input, dim=dim)
    X_star: Tensor = X_centered / norm(X_centered, "fro")
    return X_star
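# Quick check sketch (assumes `_zero_mean` subtracts the mean along `dim`,
# consistent with its use above): the output is centered and has unit
# Frobenius norm.
import torch

X = torch.randn(32, 10)
X_star = _matrix_normalize(X, dim=0)
print(X_star.mean(dim=0).abs().max())    # ~0
print(torch.linalg.norm(X_star, "fro"))  # ~1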
def forward(self, x_p, x_np, y, edge_index_p, edge_index_np):
    #h_p = x_p
    #h_np = x_np
    h_p = self.wrnn(x_p, edge_index_p)
    h_np = self.wrnn(x_np, edge_index_np)
    scale_factor = LA.norm(h_p, dim=0)
    scale_factor = scale_factor[0]
    h_p = h_p / scale_factor
    h_np = h_np / scale_factor

    h_p = self.wrnn(h_p, edge_index_p)
    h_np = self.wrnn(h_np, edge_index_np)
    h_p = h_p / scale_factor
    h_np = h_np / scale_factor

    batch_size = y.size(0)
    p_list = torch.zeros(batch_size, self.walk_len)
    np_list = torch.zeros(1, self.walk_len)
    for i in range(0, self.walk_len):
        h_p = self.wrnn(h_p, edge_index_p)
        h_np = self.wrnn(h_np, edge_index_np)
        h_p = h_p / scale_factor
        h_np = h_np / scale_factor
        #h_p = h_p.relu()
        #h_np = h_np.relu()
        val = torch.trace(h_np)
        #np_list[0,i] = torch.sign(val)*torch.log(torch.abs(val))
        np_list[0, i] = val
        for j in range(batch_size):
            # `n` (nodes per graph) and `device` come from enclosing scope.
            val = torch.trace(h_p[j * n:j * n + n, :])
            #p_list[j,i] = torch.sign(val)*torch.log(torch.abs(val))
            p_list[j, i] = val

    np_list = np_list.repeat(batch_size, 1)
    p_list = p_list - np_list
    p_list = p_list.to(device)
    p_list = p_list * 100
    for i in range(0, batch_size):
        p_list[i, :] = torch.mul(p_list[i, :], (y[i, 0] - 0.5) * 2)

    mu = torch.mean(p_list, dim=0, keepdim=False)
    std = torch.std(p_list, dim=0, keepdim=False)
    p_list = (p_list - mu) / std
    #print('P_list,shape', p_list.shape)
    return p_list
def forward(self, inputs: torch.Tensor,
            mask: torch.BoolTensor) -> torch.Tensor:
    # (n_batch, d_hyper)
    direction = self.dir_encoder(inputs, mask)
    # (n_batch, 1)
    dir_norm = LA.norm(direction, dim=1, keepdim=True)
    # (n_batch, d_hyper) unit vectors
    direction = direction / dir_norm
    # (n_batch, d_norm)
    norm = self.norm_encoder(inputs, mask)
    # (n_batch, 1)
    norm = self.fc(norm)
    norm = self.sigmoid(norm)
    # (n_batch, d_hyper)
    embed_hyper = direction * norm
    return embed_hyper
def mdd_loss(features, labels, left_weight=1, right_weight=1):
    softmax_out = F.softmax(features, dim=1)
    batch_size = features.size(0)
    if batch_size % 2 != 0:
        raise Exception("Incorrect batch size provided")

    batch_left = softmax_out[:int(0.5 * batch_size)]
    batch_right = softmax_out[int(0.5 * batch_size):]
    loss = la.norm(batch_left - batch_right,
                   ord=2, dim=1).sum() / float(batch_size)

    labels_left = labels[:int(0.5 * batch_size)]
    batch_left_loss = pair_norm(labels_left, batch_left)
    labels_right = labels[int(0.5 * batch_size):]
    batch_right_loss = pair_norm(labels_right, batch_right)
    return (loss + left_weight * batch_left_loss +
            right_weight * batch_right_loss)
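# Usage sketch (assumptions: logits with an even batch size, and integer
# labels chosen so each half contains same-label pairs, since `pair_norm`
# divides by the pair count).
import torch

logits = torch.randn(8, 4)
labels = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
print(mdd_loss(logits, labels, left_weight=1, right_weight=1))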
def compute_distance(emb, prototypes, l2_norm=False):
    if l2_norm:
        emb = emb / norm(emb, ord=2, dim=1, keepdim=True)  # 1 x 32 x h x w
        # prototypes = prototypes / norm(prototypes, ord=2, dim=-1, keepdim=True)

    n_classes = prototypes.shape[0]
    h, w = emb.shape[2:]
    grid = torch.zeros((n_classes, h, w), dtype=torch.float32)

    # prototypes: n_classes x 1 x 32
    for i, p in enumerate(prototypes):
        # p: 1 x 32
        p = p.unsqueeze(dim=2).unsqueeze(dim=3)  # 1 x 32 x 1 x 1
        p = p.repeat(1, 1, h, w)  # 1 x 32 x h x w
        sim = F.cosine_similarity(p, emb, dim=1)  # 1 x h x w
        # dist = ((emb - p) ** 2).sqrt()  # .sum(dim=1).sqrt()
        # dist = torch.exp(-dist).mean(dim=1)  # 1 x h x w
        # grid[i] = dist.squeeze()  # h x w
        grid[i] = sim.squeeze()  # h x w
    return grid  # n_classes x h x w
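# Usage sketch (shapes follow the inline comments above: a 1 x 32 x h x w
# embedding map and n_classes x 1 x 32 prototypes).
import torch
import torch.nn.functional as F

emb = torch.randn(1, 32, 16, 16)
prototypes = torch.randn(5, 1, 32)
sim_grid = compute_distance(emb, prototypes, l2_norm=False)  # 5 x 16 x 16
pred = sim_grid.argmax(dim=0)  # per-pixel class map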
def gradient_penalty_loss(discriminator, from_real, from_fake):
    """Computes the gradient penalty in the WGAN-GP loss.

    Since MSG-GAN computes the loss using different sized versions of the
    same image, the gradient penalty is computed separately for each size
    and the return value is their average.
    """
    epsilon = rand(
        size=(len(from_real.keys()),
              from_real[0].shape[0],
              *np.ones(len(from_real[0].shape) - 1).astype(int)),
        device=from_real[0].device,
        requires_grad=True)
    x_hat = OrderedDict()
    for layer in range(discriminator.num_blocks):
        x_hat[layer] = (epsilon[layer] * from_real[layer] +
                        (1 - epsilon[layer]) * from_fake[layer]
                        ).requires_grad_(True)
    dis_out = discriminator(x_hat).sum()
    grads = grad(dis_out, [x_hat[i] for i in x_hat.keys()],
                 create_graph=True, retain_graph=True)
    output = cat(
        [((norm(i.reshape(from_real[0].shape[0], -1), ord=2, dim=1) -
           ones(from_real[0].shape[0], requires_grad=True,
                device=from_real[0].device)) ** 2.).unsqueeze(1)
         for i in grads], 1)
    output = output.sum(dim=0).mean()
    return output
def interpolate(self, num_samples, z1=None, z2=None):
    if z1 is None and z2 is None:
        #z1 = torch.randn(1, *self.shape, device=self.loc.device, dtype=self.loc.dtype) * 0.01 + self.loc
        #eps = (torch.round(torch.rand(1, *self.shape, device=self.loc.device, dtype=self.loc.dtype)) * 2.0) - 1.0
        # select z2 as a point on the tail with ~5% probability
        eps = torch.randn(1, *self.shape, device=self.loc.device,
                          dtype=self.loc.dtype)
        eps = 1.4 * eps / norm(eps)
        z2 = self.loc + self.log_scale.exp() * eps
        z1 = z2 * -1  # opposite tail
    elif z2 is None:
        # rename points so that z1 still represents the point near the origin
        z2 = z1
        z1 = z2 * -1.0  # opposite tail
    else:
        assert z1.shape == z2.shape
    return torch.cat(
        [w * z2 + (1.0 - w) * z1 for w in np.linspace(0, 1, num_samples)],
        dim=0)
def solve(self, trust_radius): """Solve the subproblem using a conjugate gradient method. Parameters ---------- trust_radius : float We are allowed to wander only this far away from the origin. Returns ------- p : Tensor The proposed step. hits_boundary : bool True if the proposed step is on the boundary of the trust region. """ # get the norm of jacobian and define the origin p_origin = torch.zeros_like(self.jac) # define a default tolerance tolerance = self.jac_mag * self.jac_mag.sqrt().clamp(max=0.5) # Stop the method if the search direction # is a direction of nonpositive curvature. if self.jac_mag < tolerance: hits_boundary = False return p_origin, hits_boundary # init the state for the first iteration z = p_origin r = self.jac d = -r # Search for the min of the approximation of the objective function. while True: # do an iteration Bd = self.hessp(d) dBd = d.dot(Bd) if dBd <= 0: # Look at the two boundary points. # Find both values of t to get the boundary points such that # ||z + t d|| == trust_radius # and then choose the one with the predicted min value. ta, tb = self.get_boundaries_intersections(z, d, trust_radius) pa = z + ta * d pb = z + tb * d p_boundary = torch.where(self(pa).lt(self(pb)), pa, pb) hits_boundary = True return p_boundary, hits_boundary r_squared = r.dot(r) alpha = r_squared / dBd z_next = z + alpha * d if norm(z_next) >= trust_radius: # Find t >= 0 to get the boundary point such that # ||z + t d|| == trust_radius ta, tb = self.get_boundaries_intersections(z, d, trust_radius) p_boundary = z + tb * d hits_boundary = True return p_boundary, hits_boundary r_next = r + alpha * Bd r_next_squared = r_next.dot(r_next) if r_next_squared.sqrt() < tolerance: hits_boundary = False return z_next, hits_boundary beta_next = r_next_squared / r_squared d_next = -r_next + beta_next * d # update the state for the next iteration z = z_next r = r_next d = d_next
def jac_mag(self):
    """Magnitude of jacobian of objective function at current iteration."""
    if self._g_mag is None:
        self._g_mag = norm(self.jac)
    return self._g_mag
def step(self, closure=None):
    loss = None
    if closure is not None and callable(closure):
        with torch.enable_grad():
            loss = closure()

    param_size = 0
    variance_ma_sum = 0.0
    weight_norm = 0

    # phase 1 - accumulate all of the variance_ma_sum to use in stable weight decay
    for i, group in enumerate(self.param_groups):
        for j, p in enumerate(group["params"]):
            if p.grad is None:
                continue

            if not self.param_size:
                param_size += p.numel()

            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError("sparse matrix not supported atm")

            state = self.state[p]

            current_weight_norm = LA.norm(p.data)
            #print(f"running norm = {current_weight_norm}")
            weight_norm += current_weight_norm.item()

            # State initialization
            if len(state) == 0:
                # print("init state")
                state["step"] = 0
                # Exponential moving average of gradient values
                state["grad_ma"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state["variance_ma"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)

            # centralize gradients
            if self.use_gc:
                grad = centralize_gradient(
                    grad,
                    gc_conv_only=self.gc_conv_only,
                )
            # else:
            #     grad = uncentralized_grad

            state["step"] += 1

            beta1, beta2 = group["betas"]
            grad_ma = state["grad_ma"]
            variance_ma = state["variance_ma"]
            bias_correction2 = 1 - beta2 ** state["step"]

            # update the exp averages
            grad_ma.mul_(beta1).add_(grad, alpha=1 - beta1)
            variance_ma.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            variance_ma_debiased = variance_ma / bias_correction2
            variance_ma_sum += variance_ma_debiased.sum()
            # print(f"variance hat sum = {exp_avg_sq_hat_sum}")
            # Calculate the sqrt of the mean of all elements in exp_avg_sq_hat

    # we will run this first epoch only and then memoize
    if not self.param_size:
        self.param_size = param_size
        print("params size saved")
        print(f"total param groups = {i+1}")
        print(f"total params in groups = {j+1}")

    if not self.param_size:
        raise ValueError("failed to set param size")

    # debugging
    self.variance_sum_tracking.append(variance_ma_sum.item())
    variance_normalized = math.sqrt(variance_ma_sum / self.param_size)
    # print(f"variance mean sqrt = {variance_normalized}")

    # phase 2 - apply weight decay and step
    for group in self.param_groups:
        for p in group["params"]:
            if p.grad is None:
                continue

            state = self.state[p]
            step = state["step"]

            # Perform stable weight decay
            decay = group["weight_decay"]
            eps = group["eps"]
            #lr = group["lr"]
            lr = self.current_lr

            if self.use_warmup:
                lr = self.warmup_dampening(lr, step)
                # if step < 10:
                #     print(f"warmup dampening at step {step} = {lr} vs {group['lr']}")

            if decay:
                p.data.mul_(1 - decay * lr / variance_normalized)

            beta1, beta2 = group["betas"]
            grad_exp_avg = state["grad_ma"]
            variance_ma = state["variance_ma"]

            bias_correction1 = 1 - beta1 ** step
            bias_correction2 = 1 - beta2 ** step

            variance_biased_ma = variance_ma / bias_correction2
            denom = variance_biased_ma.sqrt().add(eps)
            weight_mod = grad_exp_avg / denom
            step_size = lr / bias_correction1

            # update weights
            #p.data.add_(weight_mod, alpha=-step_size)
            p.addcdiv_(grad_exp_avg, denom, value=-step_size)

    # abel step
    abel_result = self.abel_update(None, weight_norm, self.current_lr)
    if abel_result is not None:
        self.current_lr = abel_result

    return loss
        samples_ind.append(
            functional.one_hot(sample_ind,
                               num_classes=variable_num).float())
        # with num_classes works ok
    return variables, samples_ind


if __name__ == '__main__':
    print('--> starting sample generation')
    variables, samples_ind = generate_training_data(
        100000, 10, 8, True)  # takes some time with a large number of samples
    print('--> starting training')

    model = Estimate(10)
    print('--> model created')

    mse_loss = MSELoss()
    optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)

    for epoch in range(10):
        print(f'--> training epoch {epoch}')
        for sample in samples_ind:
            optimizer.zero_grad()
            output = model(sample)
            loss = mse_loss(output, tensor([8]).float())
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
        print(loss.item())

    print('Actual Variables:', variables,
          'Estimated Variables:', exp(model.variables))
    # check the norm to estimate the quality of the model
    print('Norm:', linalg.norm(variables - exp(model.variables)))
def solve(self, tr_radius):
    """Solve quadratic subproblem"""
    lambda_current, lambda_lb, lambda_ub = self._initial_values(tr_radius)
    n = self.dimension
    hits_boundary = True
    already_factorized = False
    self.niter = 0

    while True:
        # Compute Cholesky factorization
        if already_factorized:
            already_factorized = False
        else:
            H = self.hess.clone()
            H.diagonal().add_(lambda_current)
            if self.torch_cholesky:
                U, info = torch.linalg.cholesky_ex(H)
                U = U.t().contiguous()
            else:
                U, info = self.cholesky(H.cpu().numpy(),
                                        lower=False,
                                        overwrite_a=False,
                                        clean=True)
                U = H.new_tensor(U)

        self.niter += 1

        # Check if factorization succeeded
        if info == 0 and self.jac_mag > self.CLOSE_TO_ZERO:
            # Successful factorization

            # Solve `U.T U p = s`
            p = solve_cholesky(U, -self.jac, upper=True)
            p_norm = norm(p)

            # Check for interior convergence
            if p_norm <= tr_radius and lambda_current == 0:
                hits_boundary = False
                break

            # Solve `U.T w = p`
            w = solve_triangular(U, p, transpose=True)
            w_norm = norm(w)

            # Compute Newton step according to
            # formula (4.44) p.87 from ref [2]_.
            delta_lambda = ((p_norm / w_norm)**2 *
                            (p_norm - tr_radius) / tr_radius)
            lambda_new = lambda_current + delta_lambda

            if p_norm < tr_radius:  # Inside boundary
                s_min, z_min = estimate_smallest_singular_value(U)

                ta, tb = self.get_boundaries_intersections(
                    p, z_min, tr_radius)

                # Choose `step_len` with the smallest magnitude.
                # The reason for this choice is explained at
                # ref [3]_, p. 6 (immediately before the formula
                # for `tau`).
                step_len = torch.min(ta.abs(), tb.abs())

                # Compute the quadratic term (p.T * H * p)
                quadratic_term = p.dot(H.mv(p))

                # Check stop criteria
                relative_error = (
                    (step_len**2 * s_min**2) /
                    (quadratic_term + lambda_current * tr_radius**2))
                if relative_error <= self.k_hard:
                    p.add_(step_len * z_min)
                    break

                # Update uncertainty bounds
                lambda_ub = lambda_current
                lambda_lb = torch.max(lambda_lb, lambda_current - s_min**2)

                # Compute Cholesky factorization
                H = self.hess.clone()
                H.diagonal().add_(lambda_new)
                if self.torch_cholesky:
                    _, info = torch.linalg.cholesky_ex(H)
                else:
                    _, info = self.cholesky(H.cpu().numpy(),
                                            lower=False,
                                            overwrite_a=False,
                                            clean=True)

                if info == 0:
                    lambda_current = lambda_new
                    already_factorized = True
                else:
                    lambda_lb = torch.max(lambda_lb, lambda_new)
                    lambda_current = torch.max(
                        torch.sqrt(lambda_lb * lambda_ub),
                        lambda_lb + self.UPDATE_COEFF *
                        (lambda_ub - lambda_lb))

            else:  # Outside boundary
                # Check stop criteria
                relative_error = torch.abs(p_norm - tr_radius) / tr_radius
                if relative_error <= self.k_easy:
                    break

                # Update uncertainty bounds
                lambda_lb = lambda_current

                # Update damping factor
                lambda_current = lambda_new

        elif info == 0 and self.jac_mag <= self.CLOSE_TO_ZERO:
            # jac_mag very close to zero

            # Check for interior convergence
            if lambda_current == 0:
                p = self.jac.new_zeros(n)
                hits_boundary = False
                break

            s_min, z_min = estimate_smallest_singular_value(U)
            step_len = tr_radius

            # Check stop criteria
            if (step_len**2 * s_min**2 <=
                    self.k_hard * lambda_current * tr_radius**2):
                p = step_len * z_min
                break

            # Update uncertainty bounds and damping factor
            lambda_ub = lambda_current
            lambda_lb = torch.max(lambda_lb, lambda_current - s_min**2)
            lambda_current = torch.max(
                torch.sqrt(lambda_lb * lambda_ub),
                lambda_lb + self.UPDATE_COEFF * (lambda_ub - lambda_lb))

        else:
            # Unsuccessful factorization
            delta, v = singular_leading_submatrix(H, U, info)
            v_norm = norm(v)

            lambda_lb = torch.max(lambda_lb,
                                  lambda_current + delta / v_norm**2)

            # Update damping factor
            lambda_current = torch.max(
                torch.sqrt(lambda_lb * lambda_ub),
                lambda_lb + self.UPDATE_COEFF * (lambda_ub - lambda_lb))

    self.lambda_lb = lambda_lb
    self.lambda_current = lambda_current
    self.previous_tr_radius = tr_radius

    return p, hits_boundary