Example #1
0
    def solve(self, trust_radius):
        """Solve the quadratic subproblem along the dogleg path.

        Parameters
        ----------
        trust_radius : float
            Radius of the trust region around the current iterate.

        Returns
        -------
        tuple
            ``(step, hits_boundary)`` where ``hits_boundary`` is True when
            the returned step was placed on the trust-region boundary.
        """
        # The Newton point is the optimum of the quadratic model; accept it
        # as-is when it lies strictly inside the trust region.
        newton_step = self.newton_point()
        if norm(newton_step) < trust_radius:
            return newton_step, False

        # The Cauchy point is the predicted optimum along steepest descent.
        # When it is on or outside the boundary, rescale it onto the boundary.
        cauchy_step = self.cauchy_point()
        cauchy_len = norm(cauchy_step)
        if cauchy_len >= trust_radius:
            return cauchy_step * (trust_radius / cauchy_len), True

        # Otherwise walk from the Cauchy point towards the Newton point and
        # stop where the segment crosses the boundary, i.e. solve
        #   ||cauchy_step + t * leg||**2 == trust_radius**2
        # and keep the second root returned by the helper.
        leg = newton_step - cauchy_step
        _, t_exit = self.get_boundaries_intersections(cauchy_step, leg,
                                                      trust_radius)
        return cauchy_step + t_exit * leg, True
Example #2
0
    def identify_bias_between_word_sets(self,
                                        social_group_word_sets,
                                        n_components=10,
                                        freq_spaces=None,
                                        k=2):
        """Run PCA over mean-centred embeddings of several word sets.

        Each word set is converted to token ids with ``self.tokenizer``,
        embedded with the layer returned by ``find_embedding_layer``,
        optionally corrected with the first ``k`` rows loaded from
        ``freq_spaces``, and centred on its own mean. The centred rows of all
        sets are stacked and handed to ``self.__do_pca``.

        :param social_group_word_sets: iterable of token collections, one per
            group (presumably lists of strings; verify against caller).
        :param n_components: forwarded unchanged to ``self.__do_pca``.
        :param freq_spaces: optional argument for ``torch.load``; when falsy
            the frequency correction is skipped.
        :param k: number of leading rows of the loaded array that are used.
        :return: whatever ``self.__do_pca(matrix, n_components)`` returns.
        """
        embeddings = find_embedding_layer(self.model)

        matrix = []
        for word_set in social_group_word_sets:
            word_ids = self.tokenizer.convert_tokens_to_ids(word_set)
            target_embeddings = embeddings(torch.tensor(word_ids))
            if freq_spaces:
                # NOTE(review): the file is re-loaded on every iteration even
                # though `freq_spaces` does not change inside the loop.
                freq_subspaces = torch.load(freq_spaces)
                # `from_numpy` implies the loaded object is a NumPy array.
                freq_subspaces = torch.from_numpy(freq_subspaces).float()[:k]
                freq_norms = LA.norm(freq_subspaces, dim=-1).view(-1, 1)
                embed_norms = LA.norm(target_embeddings, dim=-1)

                # NOTE(review): the `(n, k) * (k, d)` product below is
                # element-wise and relies on broadcasting; confirm the shapes
                # this is expected to be called with.
                freq_subspaces = (((target_embeddings.mm(freq_subspaces.T)) *
                                   freq_subspaces / freq_norms).T *
                                  embed_norms).T
                # In-place subtraction on the embedding layer's output.
                target_embeddings -= freq_subspaces
            # Centre this word set on its own mean embedding.
            center = target_embeddings.mean(dim=0)
            # `extend` appends one detached row tensor per word.
            matrix.extend((target_embeddings - center).detach())

        matrix = torch.stack(matrix)
        return self.__do_pca(matrix, n_components)
Example #3
0
def create_cp(dims,
              rank,
              sparsity=None,
              method='rand',
              weights=False,
              return_tensor=False,
              noise=None,
              sparse_noise=True):
    """Build random CP factor matrices, optionally sparsified and noised.

    Args:
        dims: sequence with the size of each tensor mode.
        rank: number of columns of every factor matrix.
        sparsity: ``None``, a single float applied to every mode, or a
            list/tuple with one fraction per mode. The fraction is the share
            of entries of that factor set to zero.
        method: ``'rand'`` (``torch.rand``) or ``'randn'`` (``torch.randn``).
        weights: when True (and ``return_tensor`` is False) return
            ``(torch.ones(rank), factors)`` instead of only the factors.
        return_tensor: when True return the reconstructed tensor.
        noise: ``None`` or a scale applied to Gaussian noise relative to the
            norm of the reconstructed tensor. The noise is added to the
            reconstructed tensor only, never to the factors.
        sparse_noise: when True and ``sparsity`` is given, noise is added only
            to the non-zero entries of the reconstructed tensor.

    Returns:
        The factors, the ``(weights, factors)`` pair, or the dense tensor,
        depending on ``weights`` and ``return_tensor``.

    Raises:
        NotImplementedError: for an unknown ``method``.
        ValueError: when ``sparsity`` is neither a float nor a list/tuple.
    """
    # TODO: investigate performance impact of setting backend here
    tl.set_backend('pytorch')

    if method == 'rand':
        randfunc = torch.rand
    elif method == 'randn':
        randfunc = torch.randn
    else:
        raise NotImplementedError(f'Unknown random method: {method}')

    n_dims = len(dims)
    factors = [randfunc((size, rank)) for size in dims]

    if sparsity is not None:
        if isinstance(sparsity, float):
            sparsity = [sparsity] * n_dims
        elif not isinstance(sparsity, (list, tuple)):
            raise ValueError(
                'Sparsity parameter should either be a float or tuple/list.')

        # Zero out a random subset of entries in every factor matrix.
        for axis, factor in enumerate(factors):
            total = dims[axis] * rank
            n_zero = round(sparsity[axis] * total)
            if n_zero == 0:
                continue
            picked = torch.tensor(random.sample(range(total), n_zero))
            factor.view(-1)[picked] = 0

    def reconstruct():
        """Dense tensor for the current factors with unit weights."""
        return tl.cp_to_tensor((torch.ones(rank), factors))

    ten = None
    if noise is not None:
        ten = reconstruct()
        if sparsity is not None and sparse_noise:
            # Perturb only the non-zero entries; `flat` is a view of `ten`.
            flat = ten.view(-1)
            nonzero_idx = torch.nonzero(flat, as_tuple=True)[0]
            perturbation = torch.randn(nonzero_idx.size(0))
            flat[nonzero_idx] += noise * (norm(ten) /
                                          norm(perturbation)) * perturbation
        else:
            perturbation = torch.randn(ten.size())
            ten += noise * (norm(ten) / norm(perturbation)) * perturbation

    if return_tensor:
        return ten if ten is not None else reconstruct()
    if weights:
        return torch.ones(rank), factors
    return factors
Example #4
0
 def forward(self, input: Tensor, target: Tensor) -> Tensor:
     """Return the batch mean of ``acosh(1 + 2*|x-y|^2 / ((1-|x|^2)(1-|y|^2)))``.

     Norms are taken along ``dim=1`` of ``input`` and ``target``.
     """
     squared_gap = LA.norm(input - target, dim=1)**2
     input_margin = 1 - LA.norm(input, dim=1)**2
     target_margin = 1 - LA.norm(target, dim=1)**2
     per_sample = torch.acosh(1 + 2 * squared_gap /
                              (input_margin * target_margin))
     return torch.mean(per_sample)
Example #5
0
def NMELoss(predicted_landmark, target_landmark):
    """Normalised error between predicted and target landmarks.

    Both arguments are NumPy arrays (the original comment states shape
    ``[5, 2]``). The root of the summed squared error is divided by
    ``num_face_landmark`` times the normalising distance, which is the sum of
    the distances from target rows 0 and 1 to target row 2.
    """
    num_face_landmark = 5

    anchor = target_landmark[2]
    first_gap = LA.norm(torch.from_numpy(target_landmark[0] - anchor))
    second_gap = LA.norm(torch.from_numpy(target_landmark[1] - anchor))
    normaliser = (first_gap + second_gap) * num_face_landmark

    summed_squared_error = nn.MSELoss(reduction="sum")(
        torch.from_numpy(predicted_landmark),
        torch.from_numpy(target_landmark))
    return torch.sqrt(summed_squared_error) / normaliser
Example #6
0
def norm_squared(vi, vj):
    """Squared Euclidean distance between ``vi`` and ``vj``.

    With ``ML_ENGINE == "PyTorch"`` the inputs are subtracted directly and the
    result is a Python float. Otherwise both inputs are treated as iterables
    of arrays that are flattened and concatenated before NumPy computes the
    norm.
    """
    if ML_ENGINE != "PyTorch":
        flat_i = np.concatenate([part.ravel() for part in vi])
        flat_j = np.concatenate([part.ravel() for part in vj])
        return np.linalg.norm(flat_i - flat_j)**2
    return LA.norm(vi - vj).item()**2
    def generate_change_tensor(
            self, preprocessed_image: torch.Tensor) -> torch.Tensor:
        """
        Build a perturbation that changes the classifier's predicted class.

        Each iteration linearises the classifier around the current image,
        picks the other class whose linearised boundary is closest, and moves
        1.1 times the minimal step towards it. The loop ends as soon as the
        predicted class differs from the original one or after
        ``self.max_iter`` iterations.

        Arguments:
        - preprocessed_image (torch.Tensor): normalized and preprocessed
            image with shape [channels, height, width]

        Returns:
        torch.Tensor: tensor to be added to the image to change prediction
        """
        classifier = self.model.classifier
        classifier.eval()

        def relative_to(values, cls):
            # All rows of `values` except row `cls`, each minus row `cls`.
            return torch.cat([values[:cls], values[(cls + 1):]]) - values[cls]

        with torch.no_grad():
            clean_output = classifier(preprocessed_image.unsqueeze(0))[0]
            clean_class = torch.argmax(clean_output)
            adv_img = preprocessed_image.clone().detach()
            total_shift = torch.zeros_like(adv_img)

        for _ in range(self.max_iter):
            with torch.no_grad():
                adv_img = clipped_renormalize(adv_img)
                output = classifier(adv_img.unsqueeze(0))[0]
                current_class = torch.argmax(output)
                if current_class != clean_class:
                    return total_shift

            # The Jacobian needs autograd, so it is computed outside no_grad.
            jac = agf.jacobian(lambda x: classifier(x.unsqueeze(0))[0],
                               adv_img)

            with torch.no_grad():
                grad_gaps = relative_to(jac, current_class)
                output_gaps = relative_to(output, current_class)

                # Distance to each linearised boundary; take the nearest one.
                distances = torch.abs(output_gaps) / la.norm(
                    torch.flatten(grad_gaps, start_dim=1), dim=1)
                nearest = torch.argmin(distances)

                direction = grad_gaps[nearest]
                minimal_step = (torch.abs(output_gaps[nearest]) /
                                la.norm(torch.flatten(direction))**2) * direction

                # Overshoot by 10% so the boundary is actually crossed.
                overshoot = 1.1 * minimal_step
                total_shift = total_shift + overshoot
                adv_img = adv_img + overshoot
        return total_shift
Example #8
0
    def __init__(self, x, fun, k_easy=0.1, k_hard=0.2):
        """Initialise solver state and cache Hessian-derived quantities.

        ``k_easy`` and ``k_hard`` are the parameters that determine the stop
        criteria of the iterative subproblem solver (see pp. 194-197 of
        reference _[1] for a detailed description).
        """
        super().__init__(x, fun)

        # When the trust region shrinks in two consecutive calculations
        # (``tr_radius < previous_tr_radius``) the lower bound ``lambda_lb``
        # may be reused, which helps convergence. ``-1`` / ``None`` mean
        # that no previous value is known yet.
        self.previous_tr_radius = -1
        self.lambda_lb = None

        self.niter = 0
        self.EPS = torch.finfo(x.dtype).eps

        self.k_easy = k_easy
        self.k_hard = k_hard

        # ``torch.linalg.cholesky_ex`` is only available in
        # pytorch >= 1.9.0.dev20210504; without it fall back to LAPACK's
        # ``potrf`` obtained through scipy.
        has_cholesky_ex = hasattr(getattr(torch, 'linalg', None),
                                  'cholesky_ex')
        if not has_cholesky_ex:
            self.cholesky, = get_lapack_funcs(('potrf', ),
                                              (self.hess.cpu().numpy(), ))
        self.torch_cholesky = has_cholesky_ex

        # Cache information about the Hessian.
        self.dimension = len(self.hess)
        gershgorin_lb, gershgorin_ub = gershgorin_bounds(self.hess)
        self.hess_gershgorin_lb = gershgorin_lb
        self.hess_gershgorin_ub = gershgorin_ub
        self.hess_inf = norm(self.hess, float('inf'))
        self.hess_fro = norm(self.hess, 'fro')

        # Threshold below which backward substitution is not considered
        # reliable. Established based on Golub, G. H., Van Loan, C. F. (2013),
        # "Matrix computations", Fourth Edition, JHU press, p. 165.
        self.CLOSE_TO_ZERO = self.dimension * self.EPS * self.hess_inf
Example #9
0
 def forward(self, x, eps=1e-8):
     """Concatenate per-block spatial maxima and L2-normalise each row.

     ``x`` is passed through ``self.blocks`` in sequence; after every block
     the maximum over the last two dimensions is recorded. ``eps`` is the
     lower clamp applied to the row norm before dividing.
     """
     pooled = []
     features = x
     for block in self.blocks:
         features = block(features)
         pooled.append(features.amax(dim=(-2, -1)))
     descriptor = torch.cat(pooled, dim=1)
     scale = tla.norm(descriptor, dim=1, keepdim=True).clamp(min=eps)
     return descriptor / scale
Example #10
0
 def als_loss(self, *args):
     """Return ``(losses, obj)`` for one forward pass.

     The model outputs are used to update the stored covariances. Each output
     is then multiplied by its covariance raised to the power -0.5 and
     detached. The i-th loss (1-based) is the mean column norm of the gap
     between output i and ``preds[-i]``. ``obj`` is ``self.objective.loss``
     evaluated on the outputs.
     """
     views = self(*args)
     self.update_covariances(*views)
     inv_sqrt_covs = [
         compute_matrix_power(cov, -0.5, self.eps) for cov in self.covs
     ]
     preds = [
         matmul(view, inv_sqrt_covs[idx]).detach()
         for idx, view in enumerate(views)
     ]
     # Negative indexing pairs view 1 with the last prediction, view 2 with
     # the second to last, and so on.
     losses = [
         mean(norm(view - preds[-position], dim=0))
         for position, view in enumerate(views, start=1)
     ]
     obj = self.objective.loss(*views)
     return losses, obj
Example #11
0
def estimate_smallest_singular_value(U) -> Tuple[Tensor, Tensor]:
    """Estimate the smallest singular value of an upper triangular matrix.

    Given upper triangular ``U``, the smallest singular value and the
    corresponding right singular vector are estimated in O(n**2) operations.

    A vector `e` with components chosen from {+1, -1} is selected so that
    the solution `w` of `U.T w = e` is as large as possible. The
    implementation follows algorithm 3.5.1, p. 142, of reference [1]_,
    adapted for lower triangular matrices.

    Returns
    -------
    s_min : Tensor
        Estimated smallest singular value.
    z_min : Tensor
        Associated unit vector.

    Raises
    ------
    ValueError
        If ``U`` is not square.

    References
    ----------
    .. [1] G.H. Golub, C.F. Van Loan. "Matrix computations".
           Fourth Edition. JHU press. pp. 140-142.
    """
    U = torch.atleast_2d(U)
    lower = U.T
    n_rows, n = U.shape
    if n_rows != n:
        raise ValueError("A square triangular matrix should be provided.")

    p = torch.zeros(n, dtype=U.dtype, device=U.device)
    w = torch.empty(n, dtype=U.dtype, device=U.device)

    for k in range(n):
        pivot = lower[k, k]
        column_below = lower[k + 1:, k]

        # Candidate solutions for e[k] = +1 and e[k] = -1.
        w_plus = (1 - p[k]) / pivot
        w_minus = (-1 - p[k]) / pivot
        p_plus = p[k + 1:] + column_below * w_plus
        p_minus = p[k + 1:] + column_below * w_minus

        # Keep whichever sign makes the running solution larger.
        plus_score = w_plus.abs() + norm(p_plus, 1)
        minus_score = w_minus.abs() + norm(p_minus, 1)
        if plus_score >= minus_score:
            w[k], p[k + 1:] = w_plus, p_plus
        else:
            w[k], p[k + 1:] = w_minus, p_minus

    # The system `U v = w` is solved using backward substitution.
    v = torch.triangular_solve(w.view(-1, 1), U)[0].view(-1)
    v_norm = norm(v)

    return norm(w) / v_norm, v / v_norm
Example #12
0
def cosine_loss(X, mu_tilde, pi_tilde, alpha):
    """
    Computes the Cosine loss, summed over the batch.

    For every sample the loss is ``1 - cos(X_i, mu_tilde_i)`` minus
    ``log(pi_tilde_i) / alpha``.

    The previous implementation multiplied the row *norms* of ``X`` and
    ``mu_tilde`` (1-D tensors) and then summed along ``axis=1``, which raises
    for the documented 2-D inputs. The rows are now L2-normalised before the
    dot product, which is what a cosine similarity requires.

    Arguments:
        X: array-like, shape=(batch_size, n_features)
            Input batch matrix.
        mu_tilde: array-like, shape=(batch_size, n_features)
            Matrix in which each row represents the assigned mean vector.
        pi_tilde: tensor broadcastable against shape (batch_size,)
            Values whose logarithm is subtracted (presumably the mixing
            weight assigned to each sample; verify against caller).
        alpha: scalar
            Divisor applied to ``log(pi_tilde)``.
    Returns:
        loss: scalar tensor
            Sum of the per-sample losses.
    """
    # Clamp the norms so all-zero rows do not produce NaNs.
    tiny = 1e-12
    X_unit = X / LA.norm(X, 2, dim=1, keepdim=True).clamp(min=tiny)
    mu_unit = mu_tilde / LA.norm(mu_tilde, 2, dim=1,
                                 keepdim=True).clamp(min=tiny)
    cosine = torch.sum(X_unit * mu_unit, dim=1)
    return torch.sum((1 - cosine) - torch.log(pi_tilde) / alpha)
Example #13
0
def pair_norm(labels, features):
    """Mean L2 distance between the features of all same-label pairs.

    Every unordered pair ``(i, j)`` with ``labels[i] == labels[j]``
    contributes the L2 norm of ``features[i] - features[j]``.

    Args:
        labels: sequence of labels, one per sample.
        features: tensor whose first dimension indexes the samples.

    Returns:
        Scalar tensor with the average pair distance. When no two samples
        share a label a zero tensor is returned; the previous implementation
        raised ``ZeroDivisionError`` in that case.
    """
    total = 0
    count = 0
    n_samples = len(labels)
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            if labels[i] == labels[j]:
                count += 1
                total += la.norm(features[i] - features[j], ord=2,
                                 dim=0).sum()
    if count == 0:
        # No same-label pair: there is nothing to average.
        return features.new_zeros(())
    return total / count
Example #14
0
    def forward(self, x, eps=1e-8):
        """Return an L2-normalised descriptor from two feature blocks.

        The input is normalised with the mean/std expected by pretrained
        Torch models (from
        https://pytorch.org/docs/stable/torchvision/models.html ), rescaled
        to match the -1 to 1 scale. The spatial maximum after ``block1``
        and after ``block2`` are concatenated and divided by their row norm,
        clamped from below by ``eps``.
        """
        channel_mean = [0.485 * 2 - 1, 0.456 * 2 - 1, 0.406 * 2 - 1]
        channel_std = [0.229 * 2, 0.224 * 2, 0.225 * 2]
        features = ttf.normalize(x, channel_mean, channel_std)

        # Pool each stage right after it runs, before the next block is
        # applied to the same tensor.
        pooled = []
        for block in (self.block1, self.block2):
            features = block(features)
            pooled.append(features.amax(dim=(-2, -1)))

        descriptor = torch.cat(tuple(pooled), dim=1)
        scale = tla.norm(descriptor, dim=1, keepdim=True).clamp(min=eps)
        return descriptor / scale
Example #15
0
    def check_convergence(self, x):
        """Update the convergence bookkeeping from a new representation.

        The change relative to ``self._old_repr`` is measured on the first
        ``self._converged_unit + 1`` columns as the norm of the flattened
        difference divided by its number of elements. After
        ``self._sequence_bound`` consecutive checks within ``self._tol`` the
        next unit is marked as converged. Once ``self._converged_unit``
        reaches ``self._dropout_dim`` the ``_has_converged`` flag is set.
        """
        active_columns = self._converged_unit + 1
        delta = (x - self._old_repr)[:, :active_columns].reshape(-1)
        mean_change = linalg.norm(delta) / len(delta)

        if not (mean_change <= self._tol):
            # Streak broken: start counting again.
            self._sequence = 0
            return

        self._sequence += 1
        if self._sequence != self._sequence_bound:
            return

        self._sequence = 0
        self._converged_unit += 1
        if self._converged_unit == self._dropout_dim:
            self._has_converged = True
Example #16
0
    def interpolate(self, num_samples, z1=None, z2=None):
        """Linearly interpolate ``num_samples`` points between two endpoints.

        * Neither endpoint given: a random direction is drawn, scaled to norm
          1.4 (a point on the tail with ~5% probability) and mirrored to
          obtain the opposite endpoint.
        * Only ``z1`` given: ``z1`` is rescaled to norm 1.4 and mirrored.
        * ``z2`` given: the endpoints are used as passed and must have equal
          shapes.

        Returns the interpolated points concatenated along dimension 0,
        starting at ``z1`` and ending at ``z2``.
        """
        if z2 is None:
            if z1 is None:
                direction = torch.randn(1,
                                        *self.shape,
                                        device=self.buffer.device,
                                        dtype=self.buffer.dtype)
            else:
                direction = z1
            # Scale the direction onto the tail, then mirror it.
            z2 = 1.4 * direction / norm(direction)
            z1 = z2 * -1  # opposite tail
        else:
            assert z1.shape == z2.shape

        points = []
        for w in np.linspace(0, 1, num_samples):
            points.append(w * z2 + (1.0 - w) * z1)
        return torch.cat(points, dim=0)
Example #17
0
    def _get_weights(self, class_idx, scores=None):
        """Computes the weight coefficients of the hooked activation maps"""

        # Normalize the activation
        upsampled_a = self.hook_a  # self._normalize(self.hook_a)
        upsampled_a = (upsampled_a - upsampled_a.min()) / (upsampled_a.max() - upsampled_a.min())
        #  Upsample it to input_size
        # 1 * O * M * N
        # upsampled_a = F.interpolate(upsampled_a, self._input.shape[-2:], mode='bilinear', align_corners=False)

        # Use it as a mask
        # O * I * H * W

        # Initialize weights
        # weights = torch.zeros(upsampled_a.shape[0], dtype=upsampled_a.dtype).to(device=upsampled_a.device)

        import torch.linalg as LA

        norm = LA.norm(upsampled_a.view(*upsampled_a.shape[:-2], -1, 1), 1, dim=2)
        norm = (norm - norm.min()) / (norm.max() - norm.min())

        max_tensor = upsampled_a.view(1, upsampled_a.shape[1], -1, 1).max(dim=2)
        print(max_tensor[0])

        max_sum = max_tensor[0].sum(dim=1)

        weights = max_tensor[0] / max_sum
        weights = weights.squeeze(0).squeeze(-1)
        norm = norm.squeeze(0).squeeze(-1)

        weights = (weights - weights.min()) / (weights.max() - weights.min())
        weights *= (1 - norm)
        weights = weights ** self.pow
        # print(weights,max_tensor[0])
        # Disable hook updates
        self._hooks_enabled = False
        #  Process by chunk (GPU RAM limitation)

        '''
        for idx in range(math.ceil(weights.shape[0] / self.bs)):

            selection_slice = slice(idx * self.bs, min((idx + 1) * self.bs, weights.shape[0]))
            with torch.no_grad():
                # Get the softmax probabilities of the target class
                weights[selection_slice] = F.softmax(self.model(masked_input[selection_slice]), dim=1)[:, class_idx]
        '''
        # Reenable hook updates
        self._hooks_enabled = True

        return weights
def check_unit_convergence(autoencoder, batch: torch.Tensor,
                           old_repr: torch.Tensor, unit: int, succession: list,
                           eps: float, bound: int) -> bool:
    """Report whether unit ``unit`` has been stable for ``bound`` checks.

    The batch is re-encoded and compared with ``old_repr`` on the first
    ``unit + 1`` columns. The norm of the difference is divided by
    ``len(batch) * (unit + 1)``. ``succession[0]`` counts consecutive checks
    within ``eps`` and is updated in place. It is reset to zero when the
    streak breaks or when ``bound`` is reached.

    Returns True exactly when the counter reaches ``bound``.
    """
    encoded = autoencoder.encode(batch)
    active = unit + 1
    change = linalg.norm((encoded - old_repr)[:, :active]) / (len(batch) *
                                                               active)

    succession[0] = succession[0] + 1 if change <= eps else 0

    if succession[0] != bound:
        return False
    succession[0] = 0
    return True
Example #19
0
def _matrix_normalize(input: Tensor, dim: int) -> Tensor:
    """
    Center the data and divide it by the Frobenius norm of the centered data.

    Note:
        - this does not create standardized random variables in a random
          vector.
    ref:
        - https://stats.stackexchange.com/questions/544812/how-should-one-normalize-activations-of-batches-before-passing-them-through-a-si
    :param input: tensor to normalize.
    :param dim: dimension forwarded to ``_zero_mean`` for centering.
    :return: the centered tensor scaled to unit Frobenius norm.
    """
    from torch.linalg import norm

    centered: Tensor = _zero_mean(input, dim=dim)
    frobenius = norm(centered, "fro")
    return centered / frobenius
Example #20
0
    def forward(self, x_p, x_np, y, edge_index_p, edge_index_np):
        """Build standardised per-sample trace features over repeated walks.

        ``self.wrnn`` is applied repeatedly to the ``_p`` and ``_np`` inputs,
        dividing by a fixed scale after every application. For each of the
        ``self.walk_len`` steps the trace of the ``_np`` state and the trace
        of each per-sample slice of the ``_p`` state are recorded. The result
        is the difference of those traces, scaled by 100, sign-flipped
        according to ``y[:, 0]`` and standardised column-wise.

        NOTE(review): ``n`` and ``device`` are not defined in this method;
        they are presumably module-level globals (``n`` looks like the number
        of rows per sample in ``h_p``) - confirm.

        Returns a tensor of shape ``(batch_size, self.walk_len)``.
        """
        #h_p = x_p
        #h_np = x_np
        h_p = self.wrnn(x_p, edge_index_p)
        h_np = self.wrnn(x_np, edge_index_np)
        # Column norms of h_p; only the first column's norm is used as the
        # shared scale for both branches and for every later step.
        scale_factor = LA.norm(h_p, dim=0)
        scale_factor = scale_factor[0]
        h_p = h_p / scale_factor
        h_np = h_np / scale_factor
        h_p = self.wrnn(h_p, edge_index_p)
        h_np = self.wrnn(h_np, edge_index_np)
        h_p = h_p / scale_factor
        h_np = h_np / scale_factor
        batch_size = y.size(0)

        # Trace buffers: one row per sample for the `_p` branch, a single
        # shared row for the `_np` branch.
        p_list = torch.zeros(batch_size, self.walk_len)
        np_list = torch.zeros(1, self.walk_len)

        for i in range(0, self.walk_len):
            h_p = self.wrnn(h_p, edge_index_p)
            h_np = self.wrnn(h_np, edge_index_np)
            h_p = h_p / scale_factor
            h_np = h_np / scale_factor
            #h_p = h_p.relu()
            #h_np = h_np.relu()
            val = torch.trace(h_np)
            #np_list[0,i] =  torch.sign(val)*torch.log(torch.abs(val))
            np_list[0, i] = val
            for j in range(batch_size):
                # Trace of the j-th block of `n` consecutive rows of h_p.
                val = torch.trace(h_p[j * n:j * n + n, :])
                #p_list[j,i] = torch.sign(val)*torch.log(torch.abs(val))
                #p_list[j,i] = torch.sign(val)*torch.log(torch.abs(val))
                p_list[j, i] = val
        # Subtract the shared `_np` traces from every sample's traces.
        np_list = np_list.repeat(batch_size, 1)
        p_list = p_list - np_list
        p_list = p_list.to(device)
        p_list = p_list * 100
        for i in range(0, batch_size):
            # (y - 0.5) * 2 maps a 0/1 value in y[i, 0] to -1/+1.
            p_list[i, :] = torch.mul(p_list[i, :], (y[i, 0] - 0.5) * 2)
        # Standardise each walk step across the batch.
        mu = torch.mean(p_list, dim=0, keepdim=False)
        std = torch.std(p_list, dim=0, keepdim=False)
        p_list = (p_list - mu) / std
        #print('P_list,shape',p_list.shape)
        return p_list
Example #21
0
    def forward(self, inputs: torch.Tensor,
                mask: torch.BoolTensor) -> torch.Tensor:
        """Combine a unit direction with a sigmoid-bounded magnitude.

        The direction encoder output (n_batch, d_hyper) is scaled to unit
        length per row. The norm encoder output is projected by ``self.fc``
        to (n_batch, 1) and squashed with ``self.sigmoid``. Their product,
        shape (n_batch, d_hyper), is returned.
        """
        # (n_batch, d_hyper) -> unit vectors
        raw_direction = self.dir_encoder(inputs, mask)
        unit_direction = raw_direction / LA.norm(raw_direction,
                                                 dim=1,
                                                 keepdim=True)

        # (n_batch, d_norm) -> (n_batch, 1)
        magnitude = self.sigmoid(self.fc(self.norm_encoder(inputs, mask)))

        return unit_direction * magnitude
Example #22
0
def mdd_loss(features, labels, left_weight=1, right_weight=1):
    """Loss built from the two halves of a softmaxed batch.

    The batch is split in the middle. The result is the sum of three terms:
    the row-wise L2 distance between matching rows of the two halves (summed
    and divided by the full batch size), plus ``left_weight`` and
    ``right_weight`` times the ``pair_norm`` of each half.

    Raises:
        Exception: when the batch size is odd.
    """
    probabilities = F.softmax(features, dim=1)
    batch_size = features.size(0)
    if float(batch_size) % 2 != 0:
        raise Exception("Incorrect batch size provided")

    half = int(0.5 * batch_size)
    left_probs, right_probs = probabilities[:half], probabilities[half:]

    cross_term = la.norm(left_probs - right_probs, ord=2,
                         dim=1).sum() / float(batch_size)

    left_term = pair_norm(labels[:half], left_probs)
    right_term = pair_norm(labels[half:], right_probs)

    return cross_term + left_weight * left_term + right_weight * right_term
Example #23
0
def compute_distance(emb, prototypes, l2_norm=False):
    """Cosine similarity between every prototype and every embedding pixel.

    Args:
        emb: embedding map, 1 x C x h x w.
        prototypes: n_classes x 1 x C.
        l2_norm: when True, ``emb`` is first divided by its L2 norm along the
            channel dimension.

    Returns:
        float32 tensor of shape n_classes x h x w holding the similarities.
    """
    if l2_norm:
        emb = emb / norm(emb, ord=2, dim=1, keepdim=True)  # 1 x C x h x w

    n_classes = prototypes.shape[0]
    h, w = emb.shape[2:]
    grid = torch.zeros((n_classes, h, w), dtype=torch.float32)

    for class_idx in range(n_classes):
        # 1 x C -> 1 x C x 1 x 1 -> tiled to 1 x C x h x w
        tiled = (prototypes[class_idx].unsqueeze(dim=2).unsqueeze(
            dim=3).repeat(1, 1, h, w))
        similarity = F.cosine_similarity(tiled, emb, dim=1)  # 1 x h x w
        grid[class_idx] = similarity.squeeze()  # h x w
    return grid  # n_classes x h x w
Example #24
0
def gradient_penalty_loss(discriminator, from_real, from_fake):
    """Computes the gradient penalty in the WGAN-GP loss.

    Since MSG-GAN computes the loss using different sized versions of the
    same image, the penalty is computed separately for each size. For every
    size, ``(||grad||_2 - 1) ** 2`` is evaluated per sample on a random
    interpolation between real and fake inputs. The per-sample penalties are
    summed over the batch and the sums are averaged over the sizes.

    ``from_real`` and ``from_fake`` are mappings indexed by layer number
    (``0 .. discriminator.num_blocks - 1``).
    """
    reference = from_real[0]
    batch = reference.shape[0]
    device = reference.device

    # One mixing coefficient per (layer, sample), broadcast over the rest.
    trailing_ones = (1, ) * (len(reference.shape) - 1)
    epsilon = rand(size=(len(from_real.keys()), batch, *trailing_ones),
                   device=device,
                   requires_grad=True)

    x_hat = OrderedDict()
    for layer in range(discriminator.num_blocks):
        mixed = (epsilon[layer] * from_real[layer] +
                 (1 - epsilon[layer]) * from_fake[layer])
        x_hat[layer] = mixed.requires_grad_(True)

    dis_out = discriminator(x_hat).sum()
    grads = grad(dis_out,
                 list(x_hat.values()),
                 create_graph=True,
                 retain_graph=True)

    penalties = []
    for layer_grad in grads:
        grad_norm = norm(layer_grad.reshape(batch, -1), ord=2, dim=1)
        target = ones(batch, requires_grad=True, device=device)
        penalties.append(((grad_norm - target)**2.).unsqueeze(1))

    return cat(penalties, 1).sum(dim=0).mean()
Example #25
0
    def interpolate(self, num_samples, z1=None, z2=None):
        """Linearly interpolate ``num_samples`` points between two endpoints.

        * Only ``z1`` given: ``z1`` becomes the far endpoint and its negation
          the near one. Previously this branch was unreachable because the
          first test used ``or``, so a supplied ``z1`` was silently replaced
          by a random sample.
        * ``z1`` missing (with or without ``z2``): a random direction is
          drawn, scaled to norm 1.4 (a point on the tail with ~5%
          probability), mapped through ``self.loc`` and ``self.log_scale``
          and mirrored. Previously the scaled direction was computed and
          immediately overwritten, so the unscaled sample was used. A ``z2``
          passed without ``z1`` is ignored, as before.
        * Both given: they are used as passed and must have equal shapes.

        Returns the interpolated points concatenated along dimension 0,
        starting at ``z1`` and ending at ``z2``.
        """
        if z1 is not None and z2 is None:
            # Use the given point as one endpoint and mirror it.
            z2 = z1
            z1 = z2 * -1.0  # opposite tail
        elif z1 is None or z2 is None:
            eps = torch.randn(1,
                              *self.shape,
                              device=self.loc.device,
                              dtype=self.loc.dtype)
            # Put the direction on the tail before applying loc and scale.
            eps = 1.4 * eps / norm(eps)
            z2 = self.loc + self.log_scale.exp() * eps
            z1 = z2 * -1  # opposite tail
        else:
            assert z1.shape == z2.shape

        return torch.cat(
            [w * z2 + (1.0 - w) * z1 for w in np.linspace(0, 1, num_samples)],
            dim=0)
Example #26
0
    def solve(self, trust_radius):
        """Solve the subproblem using a conjugate gradient method.

        Parameters
        ----------
        trust_radius : float
            We are allowed to wander only this far away from the origin.

        Returns
        -------
        p : Tensor
            The proposed step.
        hits_boundary : bool
            True if the proposed step is on the boundary of the trust region.

        """

        # get the norm of jacobian and define the origin
        p_origin = torch.zeros_like(self.jac)

        # define a default tolerance
        tolerance = self.jac_mag * self.jac_mag.sqrt().clamp(max=0.5)

        # Stop the method if the search direction
        # is a direction of nonpositive curvature.
        if self.jac_mag < tolerance:
            hits_boundary = False
            return p_origin, hits_boundary

        # init the state for the first iteration
        z = p_origin
        r = self.jac
        d = -r

        # Search for the min of the approximation of the objective function.
        while True:

            # do an iteration
            Bd = self.hessp(d)
            dBd = d.dot(Bd)
            if dBd <= 0:
                # Look at the two boundary points.
                # Find both values of t to get the boundary points such that
                # ||z + t d|| == trust_radius
                # and then choose the one with the predicted min value.
                ta, tb = self.get_boundaries_intersections(z, d, trust_radius)
                pa = z + ta * d
                pb = z + tb * d
                p_boundary = torch.where(self(pa).lt(self(pb)), pa, pb)
                hits_boundary = True
                return p_boundary, hits_boundary
            r_squared = r.dot(r)
            alpha = r_squared / dBd
            z_next = z + alpha * d
            if norm(z_next) >= trust_radius:
                # Find t >= 0 to get the boundary point such that
                # ||z + t d|| == trust_radius
                ta, tb = self.get_boundaries_intersections(z, d, trust_radius)
                p_boundary = z + tb * d
                hits_boundary = True
                return p_boundary, hits_boundary
            r_next = r + alpha * Bd
            r_next_squared = r_next.dot(r_next)
            if r_next_squared.sqrt() < tolerance:
                hits_boundary = False
                return z_next, hits_boundary
            beta_next = r_next_squared / r_squared
            d_next = -r_next + beta_next * d

            # update the state for the next iteration
            z = z_next
            r = r_next
            d = d_next
Example #27
0
 def jac_mag(self):
     """Magnitude of jacobian of objective function at current iteration.

     The norm is computed on first use and cached in ``self._g_mag``.
     """
     cached = self._g_mag
     if cached is None:
         cached = self._g_mag = norm(self.jac)
     return cached
Example #28
0
    def step(self, closure=None):
        """Perform a single optimization step.

        Phase 1 walks every parameter that has a gradient, updates its moving
        averages of the gradient and squared gradient, and accumulates the
        sum of the debiased squared-gradient averages together with the sum
        of parameter norms. Phase 2 applies the variance-normalised weight
        decay and the Adam-style update, then asks ``self.abel_update`` for
        a new learning rate.

        Fixes relative to the previous version: the closure check used
        ``collections.Callable`` (removed in Python 3.10) and the closure was
        run under ``torch.grad()`` (which does not exist), so any call with a
        closure crashed. The built-in ``callable`` and ``torch.enable_grad``
        are used instead. An unused ``weight_mod`` tensor was also dropped.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss. It is run with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when no callable
            closure was given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
            ValueError: if the total parameter count could not be determined.
        """
        loss = None
        if closure is not None and callable(closure):
            with torch.enable_grad():
                loss = closure()

        param_size = 0
        variance_ma_sum = 0.0
        weight_norm = 0

        # phase 1 - accumulate all of the variance_ma_sum to use in stable weight decay

        for i, group in enumerate(self.param_groups):
            for j, p in enumerate(group["params"]):
                if p.grad is None:
                    continue

                # The parameter count is only gathered until it is memoized.
                if not self.param_size:
                    param_size += p.numel()

                grad = p.grad

                if grad.is_sparse:
                    raise RuntimeError("sparse matrix not supported atm")

                state = self.state[p]

                current_weight_norm = LA.norm(p.data)
                weight_norm += current_weight_norm.item()

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["grad_ma"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state["variance_ma"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                # centralize gradients
                if self.use_gc:
                    grad = centralize_gradient(
                        grad,
                        gc_conv_only=self.gc_conv_only,
                    )

                state["step"] += 1

                beta1, beta2 = group["betas"]
                grad_ma = state["grad_ma"]
                variance_ma = state["variance_ma"]

                bias_correction2 = 1 - beta2 ** state["step"]

                # update the exp averages
                grad_ma.mul_(beta1).add_(grad, alpha=1 - beta1)

                variance_ma.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                variance_ma_debiased = variance_ma / bias_correction2

                variance_ma_sum += variance_ma_debiased.sum()

            # NOTE(review): this section sits inside the per-group loop, so
            # it runs once per parameter group with the running totals.

            # we will run this first epoch only and then memoize
            if not self.param_size:
                self.param_size = param_size
                print("params size saved")
                print(f"total param groups = {i+1}")
                print(f"total params in groups = {j+1}")

            if not self.param_size:
                raise ValueError("failed to set param size")

            # debugging
            self.variance_sum_tracking.append(variance_ma_sum.item())

            # Square root of the mean debiased variance over all elements.
            variance_normalized = math.sqrt(variance_ma_sum / self.param_size)

        # phase 2 - apply weight decay and step
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                state = self.state[p]

                step = state["step"]

                # Perform stable weight decay
                decay = group["weight_decay"]
                eps = group["eps"]
                # The scheduler-controlled rate is used, not group["lr"].
                lr = self.current_lr

                if self.use_warmup:
                    lr = self.warmup_dampening(lr, step)

                if decay:
                    p.data.mul_(1 - decay * lr / variance_normalized)

                beta1, beta2 = group["betas"]
                grad_exp_avg = state["grad_ma"]
                variance_ma = state["variance_ma"]

                bias_correction1 = 1 - beta1 ** step
                bias_correction2 = 1 - beta2 ** step

                variance_biased_ma = variance_ma / bias_correction2

                denom = variance_biased_ma.sqrt().add(eps)

                step_size = lr / bias_correction1

                # update weights
                p.addcdiv_(grad_exp_avg, denom, value=-step_size)

            # abel step
            # NOTE(review): runs once per parameter group, like the section
            # above.
            abel_result = self.abel_update(None, weight_norm, self.current_lr)
            if abel_result is not None:
                self.current_lr = abel_result

        return loss
        samples_ind.append(
            functional.one_hot(
                sample_ind,
                num_classes=variable_num).float())  # with num_classes works ok
    return variables, samples_ind


if __name__ == '__main__':
    # Script entry point: generate training samples, fit an ``Estimate``
    # model with Adam and an MSE loss, then print how close the learned
    # variables are to the generated ones.
    print('--> starting sample generation')
    # takes some time with large number of samples
    variables, samples_ind = generate_training_data(100000, 10, 8, True)
    print('--> starting training')
    model = Estimate(10)
    print('--> model created')
    mse_loss = MSELoss()
    optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)

    # The regression target is identical for every sample, so build it once
    # instead of allocating a fresh tensor on every optimisation step.
    target = tensor([8]).float()

    for epoch in range(10):
        print(f'--> training epoch {epoch}')
        for sample in samples_ind:
            optimizer.zero_grad()
            output = model(sample)
            loss = mse_loss(output, target)
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
        # Loss of the last sample processed in this epoch.
        print(loss.item())

    estimated = exp(model.variables)
    print('Actual Variables:', variables, 'Estimated Variables:', estimated)
    # check the norm to estimate the quality of the model
    print('Norm:', linalg.norm(variables - estimated))
Example #30
0
    def solve(self, tr_radius):
        """Solve the quadratic subproblem for the given trust-region radius.

        Iteratively adjusts the damping factor ``lambda_current`` (kept
        between ``lambda_lb`` and ``lambda_ub``) and factorizes
        ``hess + lambda_current * I`` until one of the stopping criteria
        (interior convergence, ``k_easy`` or ``k_hard``) is met.

        Args:
            tr_radius: Radius of the trust region.

        Returns:
            Tuple ``(p, hits_boundary)``: the proposed step and a flag that
            is ``False`` only when the loop stopped with
            ``lambda_current == 0`` (interior solution).

        Side effects:
            Sets ``self.niter`` (number of loop iterations) and stores
            ``self.lambda_lb``, ``self.lambda_current`` and
            ``self.previous_tr_radius`` before returning.
        """

        def factorize(mat):
            # Return ``(U, info)`` for ``mat``: ``U`` is the upper-triangular
            # factor as a tensor and ``info == 0`` signals success.
            if self.torch_cholesky:
                chol_lower, status = torch.linalg.cholesky_ex(mat)
                return chol_lower.t().contiguous(), status
            chol_upper, status = self.cholesky(mat.cpu().numpy(),
                                               lower=False,
                                               overwrite_a=False,
                                               clean=True)
            return mat.new_tensor(chol_upper), status

        lambda_current, lambda_lb, lambda_ub = self._initial_values(tr_radius)
        n = self.dimension
        hits_boundary = True
        already_factorized = False
        self.niter = 0

        while True:
            # Compute Cholesky factorization (unless the previous iteration
            # already produced the factor for the current damping value).
            if already_factorized:
                already_factorized = False
            else:
                H = self.hess.clone()
                H.diagonal().add_(lambda_current)
                U, info = factorize(H)

            self.niter += 1

            # Check if factorization succeeded
            if info == 0 and self.jac_mag > self.CLOSE_TO_ZERO:
                # Successful factorization

                # Solve `U.T U p = s`
                p = solve_cholesky(U, -self.jac, upper=True)
                p_norm = norm(p)

                # Check for interior convergence
                if p_norm <= tr_radius and lambda_current == 0:
                    hits_boundary = False
                    break

                # Solve `U.T w = p`
                w = solve_triangular(U, p, transpose=True)
                w_norm = norm(w)

                # Compute Newton step accordingly to
                # formula (4.44) p.87 from ref [2]_.
                delta_lambda = (p_norm / w_norm)**2 * (p_norm -
                                                       tr_radius) / tr_radius
                lambda_new = lambda_current + delta_lambda

                if p_norm < tr_radius:  # Inside boundary
                    s_min, z_min = estimate_smallest_singular_value(U)

                    ta, tb = self.get_boundaries_intersections(
                        p, z_min, tr_radius)

                    # Choose `step_len` with the smallest magnitude.
                    # The reason for this choice is explained at
                    # ref [3]_, p. 6 (Immediately before the formula
                    # for `tau`).
                    # Keep the sign of the selected root: moving by the
                    # absolute value of a negative root would not land on
                    # the trust-region boundary.
                    step_len = torch.where(tb.abs() < ta.abs(), tb, ta)

                    # Compute the quadratic term  (p.T*H*p)
                    quadratic_term = p.dot(H.mv(p))

                    # Check stop criteria
                    relative_error = (
                        (step_len**2 * s_min**2) /
                        (quadratic_term + lambda_current * tr_radius**2))
                    if relative_error <= self.k_hard:
                        p.add_(step_len * z_min)
                        break

                    # Update uncertainty bounds
                    lambda_ub = lambda_current
                    lambda_lb = torch.max(lambda_lb, lambda_current - s_min**2)

                    # Try the factorization at the candidate damping value
                    H = self.hess.clone()
                    H.diagonal().add_(lambda_new)
                    U_trial, info = factorize(H)

                    if info == 0:
                        # Accept the candidate and keep its factor so that
                        # the next iteration (which skips factorization)
                        # works with a `U` that matches `H`.
                        lambda_current = lambda_new
                        U = U_trial
                        already_factorized = True
                    else:
                        lambda_lb = torch.max(lambda_lb, lambda_new)
                        lambda_current = torch.max(
                            torch.sqrt(lambda_lb * lambda_ub), lambda_lb +
                            self.UPDATE_COEFF * (lambda_ub - lambda_lb))

                else:  # Outside boundary
                    # Check stop criteria
                    relative_error = torch.abs(p_norm - tr_radius) / tr_radius
                    if relative_error <= self.k_easy:
                        break

                    # Update uncertainty bounds
                    lambda_lb = lambda_current

                    # Update damping factor
                    lambda_current = lambda_new

            elif info == 0 and self.jac_mag <= self.CLOSE_TO_ZERO:
                # jac_mag very close to zero

                # Check for interior convergence
                if lambda_current == 0:
                    p = self.jac.new_zeros(n)
                    hits_boundary = False
                    break

                s_min, z_min = estimate_smallest_singular_value(U)
                step_len = tr_radius

                # Check stop criteria
                if step_len**2 * s_min**2 <= self.k_hard * lambda_current * tr_radius**2:
                    p = step_len * z_min
                    break

                # Update uncertainty bounds and dampening factor
                lambda_ub = lambda_current
                lambda_lb = torch.max(lambda_lb, lambda_current - s_min**2)
                lambda_current = torch.max(
                    torch.sqrt(lambda_lb * lambda_ub),
                    lambda_lb + self.UPDATE_COEFF * (lambda_ub - lambda_lb))

            else:
                # Unsuccessful factorization

                delta, v = singular_leading_submatrix(H, U, info)
                v_norm = norm(v)

                lambda_lb = torch.max(lambda_lb,
                                      lambda_current + delta / v_norm**2)

                # Update damping factor
                lambda_current = torch.max(
                    torch.sqrt(lambda_lb * lambda_ub),
                    lambda_lb + self.UPDATE_COEFF * (lambda_ub - lambda_lb))

        # Remember the final state of the search on the instance.
        self.lambda_lb = lambda_lb
        self.lambda_current = lambda_current
        self.previous_tr_radius = tr_radius

        return p, hits_boundary