def verify_loss(self, f):
    p = 0
    n_pair = f.size(1)
    for i in range(n_pair):
        for t in range(n_pair):
            # normalize by the anchor's self-similarity (assumed; torch.dot needs two 1-D inputs)
            p += torch.dot(f[:, i], f[:, t]) / torch.dot(f[:, i], f[:, i])
    return p.mean()
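Since self is unused, the method above can be exercised directly; a minimal smoke test, assuming f is laid out as [feature_dim, n_pair] (the snippet never states the shape):

import torch

f = torch.randn(16, 4)        # assumed layout: [feature_dim, n_pair]
loss = verify_loss(None, f)   # self is unused, so None suffices
print(loss)                   # 0-dim tensor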
Example #2
 def test_local_var_binary_methods(self):
     ''' Unit tests for methods mentioned on issue 1385
         https://github.com/OpenMined/PySyft/issues/1385'''
     x = torch.FloatTensor([1, 2, 3, 4])
     y = torch.FloatTensor([[1, 2, 3, 4]])
     z = torch.matmul(x, y.t())
     assert (torch.equal(z, torch.FloatTensor([30])))
     z = torch.add(x, y)
     assert (torch.equal(z, torch.FloatTensor([[2, 4, 6, 8]])))
     x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
     y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
     z = torch.cross(x, y, dim=1)
     assert (torch.equal(z, torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]])))
     x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
     y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
     z = torch.dist(x, y)
     t = torch.FloatTensor([z])
     assert (torch.equal(t, torch.FloatTensor([0.])))
     x = torch.FloatTensor([1, 2, 3])
     y = torch.FloatTensor([1, 2, 3])
     z = torch.dot(x, y)
     t = torch.FloatTensor([z])
     assert torch.equal(t, torch.FloatTensor([14]))
     z = torch.eq(x, y)
     assert (torch.equal(z, torch.ByteTensor([1, 1, 1])))
     z = torch.ge(x, y)
     assert (torch.equal(z, torch.ByteTensor([1, 1, 1])))
Example #3
    def test_remote_var_binary_methods(self):
        ''' Unit tests for methods mentioned on issue 1385
            https://github.com/OpenMined/PySyft/issues/1385'''
        hook = TorchHook(verbose=False)
        local = hook.local_worker
        remote = VirtualWorker(hook, 1)
        local.add_worker(remote)

        x = Var(torch.FloatTensor([1, 2, 3, 4])).send(remote)
        y = Var(torch.FloatTensor([[1, 2, 3, 4]])).send(remote)
        z = torch.matmul(x, y.t())
        assert (torch.equal(z.get(), Var(torch.FloatTensor([30]))))
        z = torch.add(x, y)
        assert (torch.equal(z.get(), Var(torch.FloatTensor([[2, 4, 6, 8]]))))
        x = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
        y = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
        z = torch.cross(x, y, dim=1)
        assert (torch.equal(z.get(), Var(torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]]))))
        x = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
        y = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
        z = torch.dist(x, y)
        assert (torch.equal(z.get(), Var(torch.FloatTensor([0.]))))
        x = Var(torch.FloatTensor([1, 2, 3])).send(remote)
        y = Var(torch.FloatTensor([1, 2, 3])).send(remote)
        z = torch.dot(x, y)
        assert torch.equal(z.get(), Var(torch.FloatTensor([14])))
        z = torch.eq(x, y)
        assert (torch.equal(z.get(), Var(torch.ByteTensor([1, 1, 1]))))
        z = torch.ge(x, y)
        assert (torch.equal(z.get(), Var(torch.ByteTensor([1, 1, 1]))))
Example #4
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                if state['step'] > 1:
                    prev_bias_correction1 = 1 - beta1 ** (state['step'] - 1)
                    prev_bias_correction2 = 1 - beta2 ** (state['step'] - 1)
                    # Hypergradient for Adam:
                    h = torch.dot(grad.view(-1), torch.div(exp_avg, exp_avg_sq.sqrt().add_(group['eps'])).view(-1)) * math.sqrt(prev_bias_correction2) / prev_bias_correction1
                    # Hypergradient descent of the learning rate:
                    tmp = group['hypergrad_lr'] * h
                    group['lr'] += tmp.double().cpu()

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

        return loss
Example #5
def cal_angle(vec1, vec2):
    """ Calculate the cosine similarity between two torch tensors or two numpy ndarrays
        Args:
            vec1, vec2: two tensors or numpy ndarrays
    """
    if isinstance(vec1, torch.Tensor) and isinstance(vec2, torch.Tensor):
        return (torch.dot(vec1, vec2) / (vec1.norm() * vec2.norm())).item()
    elif isinstance(vec1, np.ndarray) and isinstance(vec2, np.ndarray):
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
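A quick usage sketch for both branches; orthogonal vectors give 0, parallel vectors give 1:

import numpy as np
import torch

print(cal_angle(torch.tensor([1., 0.]), torch.tensor([0., 1.])))   # 0.0
print(cal_angle(np.array([1., 2.]), np.array([1., 2.])))           # 1.0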
Example #6
    def updateOutput(self, input, target):
        # - log(input) * target - log(1 - input) * (1 - target)
        if input.nelement() != target.nelement():
            raise RuntimeError("input and target size mismatch")

        if self.buffer is None:
            self.buffer = input.new()

        buffer = self.buffer
        weights = self.weights

        buffer.resize_as_(input)

        if weights is not None and target.dim() != 1:
            weights = self.weights.view(1, target.size(1)).expand_as(target)

        # log(input) * target
        torch.add(input, self.eps, out=buffer).log_()
        if weights is not None:
            buffer.mul_(weights)

        target_1d = target.contiguous().view(-1)
        # don't save a 1-d view of buffer: it should already be contiguous, and it's
        # used as non-1d tensor later.
        output = torch.dot(target_1d, buffer.contiguous().view(-1))

        # log(1 - input) * (1 - target)
        torch.mul(input, -1, out=buffer).add_(1 + self.eps).log_()
        if weights is not None:
            buffer.mul_(weights)

        output = output + torch.sum(buffer)
        output = output - torch.dot(target_1d, buffer.contiguous().view(-1))

        if self.sizeAverage:
            output = output / input.nelement()

        self.output = - output.item()

        return self.output
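The three-term accumulation above relies on the identity dot(t, log(x)) + sum(log(1 - x)) - dot(t, log(1 - x)) = sum(t*log(x) + (1 - t)*log(1 - x)); a minimal numeric check of that identity (not taken from the original module):

import torch

torch.manual_seed(0)
x = torch.rand(8).clamp(1e-4, 1 - 1e-4)   # predicted probabilities
t = (torch.rand(8) > 0.5).float()         # binary targets
lhs = torch.dot(t, torch.log(x)) + torch.log(1 - x).sum() - torch.dot(t, torch.log(1 - x))
rhs = (t * torch.log(x) + (1 - t) * torch.log(1 - x)).sum()
print(torch.allclose(lhs, rhs))           # True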
Example #7
def project_1D(w, d):
    """ Project vector w to vector d and get the length of the projection.

        Args:
            w: vectorized weights
            d: vectorized direction

        Returns:
            the projection scalar
    """
    assert len(w) == len(d), 'dimension does not match for w and d'
    scale = torch.dot(w, d)/d.norm()
    return scale.item()
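A small sanity check: projecting a vector onto itself returns its norm, since dot(w, w)/||w|| = ||w||:

import torch

d = torch.tensor([3., 4.])     # norm 5
print(project_1D(d, d))        # 5.0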
Example #8
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        assert len(self.param_groups) == 1

        loss = None
        if closure is not None:
            loss = closure()

        group = self.param_groups[0]
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        grad = self._gather_flat_grad_with_weight_decay(weight_decay)

        # NOTE: SGDHD has only global state, but we register it as state for
        # the first param, because this helps with casting in load_state_dict
        state = self.state[self._params[0]]
        # State initialization
        if len(state) == 0:
            state['grad_prev'] = torch.zeros_like(grad)

        grad_prev = state['grad_prev']
        # Hypergradient for SGD
        h = torch.dot(grad, grad_prev)
        # Hypergradient descent of the learning rate:
        group['lr'] += group['hypergrad_lr'] * h

        if momentum != 0:
            if 'momentum_buffer' not in state:
                buf = state['momentum_buffer'] = torch.zeros_like(grad)
                buf.mul_(momentum).add_(grad)
            else:
                buf = state['momentum_buffer']
                buf.mul_(momentum).add_(1 - dampening, grad)
            if nesterov:
                grad.add_(momentum, buf)
            else:
                grad = buf

        state['grad_prev'] = grad

        self._add_grad(-group['lr'], grad)

        return loss
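The lr update above follows from the chain rule: with theta_t = theta_{t-1} - alpha * grad_{t-1}, the derivative of the loss with respect to alpha is -dot(grad_t, grad_{t-1}), so gradient descent on alpha adds hypergrad_lr * dot(grad, grad_prev). A scalar toy run of that rule (illustrative only, not the optimizer class itself):

x, lr, hypergrad_lr = 5.0, 0.1, 0.01
grad_prev = 0.0
for _ in range(20):
    grad = x                            # f(x) = 0.5*x**2, so f'(x) = x
    lr += hypergrad_lr * grad * grad_prev
    x -= lr * grad
    grad_prev = grad
print(x, lr)                            # x decays toward 0 while lr adapts online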
Example #9
    def compute_weight(self, module):
        weight = module._parameters[self.name + '_org']
        u = module._buffers[self.name + '_u']
        height = weight.size(0)
        weight_mat = weight.view(height, -1)
        for _ in range(self.n_power_iterations):
            # The spectral norm of the weight equals `u^T W v`, where `u` and `v`
            # are the first left and right singular vectors.
            # This power iteration produces approximations of `u` and `v`.
            v = normalize(torch.matmul(weight_mat.t(), u), dim=0, eps=self.eps)
            u = normalize(torch.matmul(weight_mat, v), dim=0, eps=self.eps)

        sigma = torch.dot(u, torch.matmul(weight_mat, v))
        weight.data /= sigma
        return weight, u
Example #10
    def compute_weight(self, module):
        weight = getattr(module, self.name + '_org')
        u = getattr(module, self.name + '_u')
        height = weight.size(0)
        weight_mat = weight.view(height, -1)
        with torch.no_grad():
            for _ in range(self.n_power_iterations):
                # The spectral norm of the weight equals `u^T W v`, where `u` and `v`
                # are the first left and right singular vectors.
                # This power iteration produces approximations of `u` and `v`.
                v = normalize(torch.matmul(weight_mat.t(), u), dim=0, eps=self.eps)
                u = normalize(torch.matmul(weight_mat, v), dim=0, eps=self.eps)

            sigma = torch.dot(u, torch.matmul(weight_mat, v))
        weight = weight / sigma
        return weight, u
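Both variants estimate the largest singular value of the reshaped weight; a minimal check against a direct SVD (torch.linalg.svdvals assumes a recent PyTorch):

import torch
from torch.nn.functional import normalize

torch.manual_seed(0)
W = torch.randn(8, 5)
u = normalize(torch.randn(8), dim=0)
for _ in range(50):                       # power iteration, as in the snippets above
    v = normalize(W.t() @ u, dim=0)
    u = normalize(W @ v, dim=0)
sigma = torch.dot(u, W @ v)
print(sigma.item(), torch.linalg.svdvals(W)[0].item())  # nearly identical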
Example #11
 def compute_weight(self, module):
   weight = getattr(module, self.name + '_orig')
   u = getattr(module, self.name + '_u')
   weight_mat = weight
   if self.dim != 0:
     # permute dim to front
      weight_mat = weight_mat.permute(
          self.dim, *[d for d in range(weight_mat.dim()) if d != self.dim])
   height = weight_mat.size(0)
   weight_mat = weight_mat.reshape(height, -1)
   with torch.no_grad():
     for _ in range(self.n_power_iterations):
       v = F.normalize(torch.matmul(weight_mat.t(), u), dim=0, eps=self.eps)
       u = F.normalize(torch.matmul(weight_mat, v), dim=0, eps=self.eps)
   sigma = torch.dot(u, torch.matmul(weight_mat, v))
   weight = weight / sigma
   return weight, u
Example #12
 def test(self, dataset):
     self.model.eval()
     total_loss = 0
     predictions = torch.zeros(len(dataset))
     indices = torch.arange(1, dataset.num_classes + 1)
     for idx in tqdm(range(len(dataset)), desc='Testing epoch ' + str(self.epoch)):
         ltree, lsent, rtree, rsent, label = dataset[idx]
         linput, rinput = Var(lsent, volatile=True), Var(rsent, volatile=True)
         target = Var(map_label_to_target(label, dataset.num_classes), volatile=True)
         if self.args.cuda:
             linput, rinput = linput.cuda(), rinput.cuda()
             target = target.cuda()
         output = self.model(ltree, linput, rtree, rinput)
         loss = self.criterion(output, target)
         total_loss += loss.data[0]
         output = output.data.squeeze().cpu()
         predictions[idx] = torch.dot(indices, torch.exp(output))
     return total_loss / len(dataset), predictions
Example #13
def lovasz_hinge_flat(logits, labels):
    """
    Binary Lovasz hinge loss
      logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
      labels: [P] Tensor, binary ground truth labels (0 or 1)
    """
    if len(labels) == 0:
        # only void pixels, the gradients should be 0
        return logits.sum() * 0.
    signs = 2. * labels.float() - 1.
    errors = (1. - logits * Variable(signs))
    errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
    perm = perm.data
    gt_sorted = labels[perm]
    grad = lovasz_grad(gt_sorted)
    loss = torch.dot(F.relu(errors_sorted), Variable(grad))
    return loss
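The snippet depends on a lovasz_grad helper that is not shown here. For reference, a sketch matching the public Lovasz-Softmax reference implementation (reproduced from memory; verify against the repository you use):

def lovasz_grad(gt_sorted):
    # Gradient of the Lovasz extension w.r.t. sorted errors (Alg. 1 in Berman et al., 2018)
    p = len(gt_sorted)
    gts = gt_sorted.sum()
    intersection = gts - gt_sorted.float().cumsum(0)
    union = gts + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1. - intersection / union
    if p > 1:  # cover the 1-pixel case
        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
    return jaccard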
Example #14
    def backward(self, gradient, image):
        # lazy import
        import torch
        from torch.autograd import Variable

        assert gradient.ndim == 1

        gradient = torch.from_numpy(gradient)
        if self.cuda:  # pragma: no cover
            gradient = gradient.cuda()
        gradient = Variable(gradient)

        image = self._process_input(image)
        assert image.ndim == 3
        images = image[np.newaxis]
        images = torch.from_numpy(images)
        if self.cuda:  # pragma: no cover
            images = images.cuda()
        images = Variable(images, requires_grad=True)
        predictions = self._model(images)

        predictions = predictions[0]

        assert gradient.dim() == 1
        assert predictions.dim() == 1
        assert gradient.size() == predictions.size()

        loss = torch.dot(predictions, gradient)
        loss.backward()
        # should be the same as predictions.backward(gradient=gradient)

        grad = images.grad

        grad = grad.data
        if self.cuda:  # pragma: no cover
            grad = grad.cpu()
        grad = grad.numpy()
        grad = self._process_gradient(grad)
        grad = np.squeeze(grad, axis=0)
        assert grad.shape == image.shape

        return grad
Example #15
def lovasz_softmax_flat(probas, labels, only_present=False):
    """
    Multi-class Lovasz-Softmax loss
      probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
      labels: [P] Tensor, ground truth labels (between 0 and C - 1)
      only_present: average only on classes present in ground truth
    """
    C = probas.size(1)
    losses = []
    for c in range(C):
        fg = (labels == c).float() # foreground for class c
        if only_present and fg.sum() == 0:
            continue
        errors = (Variable(fg) - probas[:, c]).abs()
        errors_sorted, perm = torch.sort(errors, 0, descending=True)
        perm = perm.data
        fg_sorted = fg[perm]
        losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
    return mean(losses)
Example #16
    def test_remote_tensor_binary_methods(self):

        hook = TorchHook(verbose=False)
        local = hook.local_worker
        remote = VirtualWorker(hook, 0)
        local.add_worker(remote)

        x = torch.FloatTensor([1, 2, 3, 4, 5]).send(remote)
        y = torch.FloatTensor([1, 2, 3, 4, 5]).send(remote)
        assert (x.add_(y).get() == torch.FloatTensor([2,4,6,8,10])).all()

        x = torch.FloatTensor([1, 2, 3, 4]).send(remote)
        y = torch.FloatTensor([[1, 2, 3, 4]]).send(remote)
        z = torch.matmul(x, y.t())
        assert (torch.equal(z.get(), torch.FloatTensor([30])))

        z = torch.add(x, y)
        assert (torch.equal(z.get(), torch.FloatTensor([[2, 4, 6, 8]])))

        x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
        y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
        z = torch.cross(x, y, dim=1)
        assert (torch.equal(z.get(), torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]])))

        x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
        y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
        z = torch.dist(x, y)
        t = torch.FloatTensor([z])
        assert (torch.equal(t, torch.FloatTensor([0.])))

        x = torch.FloatTensor([1, 2, 3]).send(remote)
        y = torch.FloatTensor([1, 2, 3]).send(remote)
        z = torch.dot(x, y)
        t = torch.FloatTensor([z])
        assert torch.equal(t, torch.FloatTensor([14]))

        z = torch.eq(x, y)
        assert (torch.equal(z.get(), torch.ByteTensor([1, 1, 1])))

        z = torch.ge(x, y)
        assert (torch.equal(z.get(), torch.ByteTensor([1, 1, 1])))
Example #17
    def test_local_tensor_binary_methods(self):
        ''' Unit tests for methods mentioned on issue 1385
        https://github.com/OpenMined/PySyft/issues/1385'''

        x = torch.FloatTensor([1, 2, 3, 4])
        y = torch.FloatTensor([[1, 2, 3, 4]])
        z = torch.matmul(x, y.t())
        assert (torch.equal(z, torch.FloatTensor([30])))

        z = torch.add(x, y)
        assert (torch.equal(z, torch.FloatTensor([[2, 4, 6, 8]])))

        x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
        y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
        z = torch.cross(x, y, dim=1)
        assert (torch.equal(z, torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]])))

        x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
        y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
        z = torch.dist(x, y)
        assert (torch.equal(torch.FloatTensor([z]), torch.FloatTensor([0])))

        x = torch.FloatTensor([1, 2, 3])
        y = torch.FloatTensor([1, 2, 3])
        z = torch.dot(x, y)
        # There is an issue with some Macs getting 0.0 instead
        # Solved here: https://github.com/pytorch/pytorch/issues/5609
        assert torch.equal(torch.FloatTensor([z]), torch.FloatTensor([14]))

        z = torch.eq(x, y)
        assert (torch.equal(z, torch.ByteTensor([1, 1, 1])))

        z = torch.ge(x, y)
        assert (torch.equal(z, torch.ByteTensor([1, 1, 1])))

        x = torch.FloatTensor([1, 2, 3, 4, 5])
        y = torch.FloatTensor([1, 2, 3, 4, 5])
        assert (x.add_(y) == torch.FloatTensor([2, 4, 6, 8, 10])).all()
Example #18
    def step(self, closure, b=None, M_inv=None):
        """
        Performs a single optimization step.

        Arguments:
            closure (callable): A closure that re-evaluates the model
                and returns a tuple of the loss and the output.
            b (callable, optional): A closure that calculates the vector b in
                the minimization problem x^T A x + x^T b.
            M_inv (callable, optional): The INVERSE preconditioner of A.
        """
        assert len(self.param_groups) == 1

        group = self.param_groups[0]
        alpha = group['alpha']
        delta_decay = group['delta_decay']
        cg_max_iter = group['cg_max_iter']
        damping = group['damping']
        use_gnm = group['use_gnm']
        verbose = group['verbose']

        state = self.state[self._params[0]]
        state.setdefault('func_evals', 0)
        state.setdefault('n_iter', 0)

        loss_before, output = closure()
        current_evals = 1
        state['func_evals'] += 1

        # Gather current parameters and respective gradients
        flat_params = parameters_to_vector(self._params)
        flat_grad = self._gather_flat_grad()

        # Define linear operator
        if use_gnm:
            # Generalized Gauss-Newton vector product
            def A(x):
                return self._Gv(loss_before, output, x, damping)
        else:
            # Hessian-vector product
            def A(x):
                return self._Hv(flat_grad, x, damping)

        if M_inv is not None:
            m_inv = M_inv()

            # Preconditioner recipe (Section 20.13)
            if m_inv.dim() == 1:
                m = (m_inv + damping) ** (-0.85)

                def M(x):
                    return m * x
            else:
                m = torch.inverse(m_inv + damping * torch.eye(*m_inv.shape))

                def M(x):
                    return m @ x
        else:
            M = None

        b = flat_grad.detach() if b is None else b().detach().flatten()

        # Initializing Conjugate-Gradient (Section 20.10)
        if state.get('init_delta') is not None:
            init_delta = delta_decay * state.get('init_delta')
        else:
            init_delta = torch.zeros_like(flat_params)

        eps = torch.finfo(b.dtype).eps

        # Conjugate-Gradient
        deltas, Ms = self._CG(A=A, b=b.neg(), x0=init_delta,
                              M=M, max_iter=cg_max_iter,
                              tol=1e1 * eps, eps=eps, martens=True)

        # Update parameters
        delta = state['init_delta'] = deltas[-1]
        M = Ms[-1]

        vector_to_parameters(flat_params + delta, self._params)
        loss_now = closure()[0]
        current_evals += 1
        state['func_evals'] += 1

        # Conjugate-Gradient backtracking (Section 20.8.7)
        if verbose:
            print("Loss before CG: {}".format(float(loss_before)))
            print("Loss before BT: {}".format(float(loss_now)))

        for (d, m) in zip(reversed(deltas[:-1][::2]), reversed(Ms[:-1][::2])):
            vector_to_parameters(flat_params + d, self._params)
            loss_prev = closure()[0]
            if float(loss_prev) > float(loss_now):
                break
            delta = d
            M = m
            loss_now = loss_prev

        if verbose:
            print("Loss after BT:  {}".format(float(loss_now)))

        # The Levenberg-Marquardt Heuristic (Section 20.8.5)
        reduction_ratio = (float(loss_now) -
                           float(loss_before)) / M if M != 0 else 1

        if reduction_ratio < 0.25:
            group['damping'] *= 3 / 2
        elif reduction_ratio > 0.75:
            group['damping'] *= 2 / 3
        if reduction_ratio < 0:
            group['init_delta'] = 0

        # Line Searching (Section 20.8.8)
        beta = 0.8
        c = 1e-2
        min_improv = min(c * torch.dot(b, delta), 0)

        for _ in range(60):
            if float(loss_now) <= float(loss_before) + alpha * min_improv:
                break

            alpha *= beta
            vector_to_parameters(flat_params + alpha * delta, self._params)
            loss_now = closure()[0]
        else:  # No good update found
            alpha = 0.0
            loss_now = loss_before

        # Update the parameters (this time for real)
        vector_to_parameters(flat_params + alpha * delta, self._params)

        if verbose:
            print("Loss after LS:  {0} (lr: {1:.3f})".format(
                float(loss_now), alpha))
            print("Tikhonov damping: {0:.3f} (reduction ratio: {1:.3f})".format(
                group['damping'], reduction_ratio), end='\n\n')

        return loss_now
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--exp_name', default='ijba_eval')
    parser.add_argument('-g', '--gpu', type=int, default=0)

    parser.add_argument('-d', '--data_dir', 
                        default='/home/renyi/arunirc/data1/datasets/CS2')
    parser.add_argument('-p', '--protocol_dir', 
                        default='/home/renyi/arunirc/data1/datasets/IJB-A/IJB-A_11_sets/')
    parser.add_argument('--fold', type=int, default=1, choices=[1,10])
    parser.add_argument('--sqrt', action='store_true', default=False,
                        help='Add signed sqrt normalization')
    parser.add_argument('--cosine', action='store_true', default=False,
                        help='Use cosine similarity instead of L2 distance')
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('-m', '--model_path', 
                        default=MODEL_PATH, 
                        help='Path to pre-trained model')
    parser.add_argument('--model_type', default=MODEL_TYPE,
                        choices=['resnet50', 'resnet101', 'resnet101-512d', 'resnet101-512d-norm'])
    
    args = parser.parse_args()


    # CUDA setup
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    cuda = torch.cuda.is_available()
    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True # enable if all images are same size    


    # -----------------------------------------------------------------------------
    # 1. Model
    # -----------------------------------------------------------------------------
    num_class = 8631
    if args.model_type == 'resnet50':
        model = torchvision.models.resnet50(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101':
        model = torchvision.models.resnet101(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101-512d':
        model = torchvision.models.resnet101(pretrained=False)
        layers = []
        layers.append(torch.nn.Linear(2048, 512))
        layers.append(torch.nn.Linear(512, num_class))
        model.fc = torch.nn.Sequential(*layers)
    elif args.model_type == 'resnet101-512d-norm':
        model = torchvision.models.resnet101(pretrained=False)
        layers = []
        layers.append(torch.nn.Linear(2048, 512))
        layers.append(models.NormFeat(scale_factor=50.0))
        layers.append(torch.nn.Linear(512, num_class))
        model.fc = torch.nn.Sequential(*layers)
    else:
        raise NotImplementedError
    
    checkpoint = torch.load(args.model_path)       
    if checkpoint['arch'] == 'DataParallel':
        model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4])
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.module # get network module from inside its DataParallel wrapper
    else:
        model.load_state_dict(checkpoint['model_state_dict'])

    if cuda:
        model = model.cuda()

    # Convert the trained network into a "feature extractor"
    feature_map = list(model.children())
    if args.model_type == 'resnet101-512d' or args.model_type == 'resnet101-512d-norm':
        model.eval()
        extractor = model
        extractor.fc = nn.Sequential(extractor.fc[0])
    else: 
        feature_map.pop()
        extractor = nn.Sequential(*feature_map)
    extractor.eval() # ALWAYS set to evaluation mode (fixes BatchNorm, dropout, etc.)



    # -----------------------------------------------------------------------------
    # 2. Dataset
    # -----------------------------------------------------------------------------
    fold_id = 1
    file_ext = '.jpg'
    RGB_MEAN = [ 0.485, 0.456, 0.406 ]
    RGB_STD = [ 0.229, 0.224, 0.225 ]

    test_transform = transforms.Compose([
        # transforms.Scale(224),
        # transforms.CenterCrop(224),
        transforms.Scale((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean = RGB_MEAN,
                             std = RGB_STD),
    ])


    pairs_path = osp.join(args.protocol_dir, 'split%d' % fold_id, 
                          'verify_comparisons_%d.csv' % fold_id)
    pairs = utils.read_ijba_pairs(pairs_path)
    protocol_file = osp.join(args.protocol_dir, 'split%d' % fold_id, 
                          'verify_metadata_%d.csv' % fold_id)
    metadata = utils.get_ijba_1_1_metadata(protocol_file) # dict
    assert np.all(np.unique(pairs) == np.unique(metadata['template_id']))  # sanity-check
    path_list = np.array([osp.join(args.data_dir, str(x)+file_ext) 
                         for x in metadata['sighting_id'] ]) # face crops saved as <sighting_id.jpg>

    # Create data loader
    test_loader = torch.utils.data.DataLoader(
                        data_loader.IJBADataset(
                        path_list, test_transform, split=fold_id), 
                        batch_size=args.batch_size, shuffle=False )

    # testing
    # for i in range(len(test_loader.dataset)):
    #     img = test_loader.dataset.__getitem__(i)
    #     sz = img.shape
    #     if sz[0] != 3:
    #         print sz




    # -----------------------------------------------------------------------------
    # 3. Feature extraction
    # -----------------------------------------------------------------------------
    print('Feature extraction...')
    cache_dir = osp.join(here, 'cache-' + args.model_type)
    if not osp.exists(cache_dir):
        os.makedirs(cache_dir)

    feat_path = osp.join(cache_dir, 'feat-fold-%d.mat' % fold_id)

    if not osp.exists(feat_path):
        features = []
        for batch_idx, images in tqdm.tqdm(enumerate(test_loader), 
                                            total=len(test_loader), 
                                            desc='Extracting features'): 
            x = Variable(images, volatile=True) # test-time memory conservation
            if cuda:
                x = x.cuda()
            feat = extractor(x)
            if cuda:
                feat = feat.data.cpu() # free up GPU
            else:
                feat = feat.data
            features.append(feat)

        features = torch.cat(features, dim=0) # (n_batch*batch_sz) x 512
        sio.savemat(feat_path, {'feat': features.cpu().numpy() })
    else:
        dat = sio.loadmat(feat_path)
        features = torch.FloatTensor(dat['feat'])
        del dat
        print('Loaded.')


    # -----------------------------------------------------------------------------
    # 4. Verification
    # -----------------------------------------------------------------------------
    scores = []
    labels = []

    # labels: is_same_subject
    print('Computing pair labels . . . ')
    for pair in tqdm.tqdm(pairs): # TODO - check tqdm
        sel_t0 = np.where(metadata['template_id'] == pair[0])
        sel_t1 = np.where(metadata['template_id'] == pair[1])
        subject0 = np.unique(metadata['subject_id'][sel_t0])
        subject1 = np.unique(metadata['subject_id'][sel_t1])
        labels.append(int(subject0 == subject1))
    labels = np.array(labels)
    print('done')

    # templates: average pool, then L2-normalize
    print('Pooling templates . . . ')
    pooled_features = []
    template_set = np.unique(metadata['template_id'])
    for tid in tqdm.tqdm(template_set):
        sel = np.where(metadata['template_id'] == tid)
        # pool template: 1 x n x 512 -> 1 x 512
        feat = features[sel,:].mean(1)
        if args.sqrt:  # signed-square-root normalization
            feat = torch.mul(torch.sign(feat),torch.sqrt(torch.abs(feat)+1e-12))
        pooled_features.append(F.normalize(feat, p=2, dim=1) )    
    pooled_features = torch.cat(pooled_features, dim=0) # (n_batch*batch_sz) x 512
    print('done')

    print('Computing pair distances . . . ')
    for pair in tqdm.tqdm(pairs):
        sel_t0 = np.where(template_set == pair[0])
        sel_t1 = np.where(template_set == pair[1])
        if args.cosine:
            feat_dist = torch.dot(torch.squeeze(pooled_features[sel_t0]), 
                                  torch.squeeze(pooled_features[sel_t1]))
        else:
            feat_dist = (pooled_features[sel_t0] - pooled_features[sel_t1]).norm(p=2, dim=1)
            feat_dist = -torch.squeeze(feat_dist)
            feat_dist = feat_dist.numpy()
        scores.append(feat_dist) # score: negative of L2-distance
    scores = np.array(scores)

    # Metrics: TAR (tpr) at FAR (fpr)
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels, scores)
    fpr_levels = [0.0001, 0.001, 0.01, 0.1]
    f_interp = interpolate.interp1d(fpr, tpr)
    tpr_at_fpr = [ f_interp(x) for x in fpr_levels ]

    for (far, tar) in zip(fpr_levels, tpr_at_fpr):
        print('TAR @ FAR=%.4f : %.4f' % (far, tar))

    res = {}
    res['TAR'] = tpr_at_fpr
    res['FAR'] = fpr_levels
    with open( osp.join(cache_dir, 'result-1-1-fold-%d.yaml' % fold_id), 
              'w') as f:
        yaml.dump(res, f, default_flow_style=False)

    sio.savemat(osp.join(cache_dir, 'roc-1-1-fold-%d.mat' % fold_id), 
                {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds, 
                    'tpr_at_fpr': tpr_at_fpr})
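Because the pooled features are L2-normalized before scoring, the torch.dot in the cosine branch equals cosine similarity; a quick standalone check with hypothetical feature vectors:

import torch
import torch.nn.functional as F

a = F.normalize(torch.randn(1, 512), p=2, dim=1)
b = F.normalize(torch.randn(1, 512), p=2, dim=1)
print(torch.dot(a.squeeze(), b.squeeze()).item(),
      F.cosine_similarity(a, b).item())   # identical up to float error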
Example #20
    def __getitem__(self, index):
        scene_id = self.scene_index[index // NUM_SAMPLE_PER_SCENE]
        sample_id = index % NUM_SAMPLE_PER_SCENE
        sample_path = os.path.join(self.image_folder, 'scene_' + str(scene_id),
                                   'sample_' + str(sample_id))

        images = []
        for image_name in image_names:
            image_path = os.path.join(sample_path, image_name)
            image = Image.open(image_path)
            image.load()
            images.append(self.transform["image"](image))
        image_tensor = torch.stack(images)

        data_entries = self.annotation_dataframe[
            (self.annotation_dataframe['scene'] == scene_id)
            & (self.annotation_dataframe['sample'] == sample_id)]
        corners = data_entries[[
            'fl_x', 'fr_x', 'bl_x', 'br_x', 'fl_y', 'fr_y', 'bl_y', 'br_y'
        ]].to_numpy()
        categories = data_entries.category_id.to_numpy()

        ego_path = os.path.join(sample_path, 'ego.png')
        ego_image = Image.open(ego_path)
        ego_image.load()
        ego_image = torchvision.transforms.functional.to_tensor(ego_image)
        road_image = convert_map_to_road_map(ego_image)
        road_image = self.transform["road"](road_image.type(torch.FloatTensor))

        #         print(torch.as_tensor(corners).view(-1, 2, 4).transpose(1,2).flatten(1,2))
        bounding_box = torch.as_tensor(corners).view(
            -1, 2, 4)  #.transpose(1,2)#.flatten(1,2)
        bounding_box[:, 0] = (bounding_box[:, 0] * 10) + 400
        bounding_box[:, 1] = (-bounding_box[:, 1] * 10) + 400
        bounding_box = (bounding_box * 256) / 800
        bounding_box = bounding_box.transpose(1, 2)
        # print(bounding_box[:, :, 0].shape)

        bbox = torch.zeros(bounding_box.shape[0], 4)
        # print(bbox.shape, bounding_box.shape)

        # bbox = (bbox * 256)/800

        bbox_new = torch.zeros(bounding_box.shape[0], 5)
        # print(bbox.shape, bounding_box.shape)
        # bbox[:, 0] = bounding_box[:, :, 0].min(dim=1)[0]
        # bbox[:, 1] = bounding_box[:, :, 1].min(dim=1)[0]
        # bbox[:, 2] = bounding_box[:, :, 0].max(dim=1)[0]
        # bbox[:, 3] = bounding_box[:, :, 1].max(dim=1)[0]

        # Compute rotation angle from the center point

        for i, box in enumerate(bounding_box):

            if box[0][0] <= box[2][0] and box[0][1] >= box[1][1]:
                br = box[0]
                bl = box[1]
                fr = box[2]
                fl = box[3]
            else:
                fl = box[0]
                fr = box[1]
                bl = box[2]
                br = box[3]

            # print("before:",box)
            centerpoint = (fl + br) / 2
            if fl[0] > fr[0]:  # negative angle

                if fr[0] != centerpoint[0]:
                    theta = torch.atan(
                        (fr[1] - centerpoint[1]) / abs(fr[0] - centerpoint[0]))
                else:
                    theta = (np.pi / 2)

                a = bl - centerpoint
                b = fl - centerpoint
                tempangle = torch.acos(
                    torch.dot(a, b) / (torch.norm(a, 2) * torch.norm(b, 2)))
                beta = (np.pi - tempangle) / 2

                if fr[0] > centerpoint[0]:
                    gamma = -(theta - beta)
                else:
                    gamma = -(np.pi - theta - beta)

                # print ("-----test----")
                # print (torch.norm(a, 2))
                # print (torch.norm(b, 2))
                # print (theta)
                # print (beta)
                # print (gamma)
            elif fl[0] < fr[0]:  # positive angle

                if centerpoint[0] != br[0]:
                    theta = torch.atan(
                        (br[1] - centerpoint[1]) / abs(centerpoint[0] - br[0]))
                else:
                    theta = np.pi / 2

                a = fl - centerpoint
                b = bl - centerpoint
                tempangle = torch.acos(
                    torch.dot(a, b) / (torch.norm(a, 2) * torch.norm(b, 2)))
                beta = (np.pi - tempangle) / 2

                if br[0] > centerpoint[0]:
                    gamma = (theta - beta)
                else:
                    gamma = (np.pi - theta - beta)

            else:
                gamma = 0
            # print((gamma*180)/np.pi)

            #theta = np.arctan((fr[1] - br[1])/(fr[0]-br[0]))
            bbox_new[i, 4] = gamma

            translation_matrix = torch.tensor([[1, 0, centerpoint[0]],
                                               [0, 1, centerpoint[1]],
                                               [0, 0, 1]])
            reverse_translation_matrix = torch.tensor([[1, 0, -centerpoint[0]],
                                                       [0, 1, -centerpoint[1]],
                                                       [0, 0, 1]])
            rotation_matrix = torch.tensor(
                [[torch.cos(-gamma), -torch.sin(-gamma), 0],
                 [torch.sin(-gamma), torch.cos(-gamma), 0], [0, 0, 1]])
            # print(translation_matrix,reverse_translation_matrix,rotation_matrix)
            # print(box.shape)
            box = torch.cat([
                box.transpose(0, 1),
                torch.ones(box.shape[0]).type(torch.DoubleTensor).unsqueeze(0)
            ],
                            dim=0)
            # print(box)
            bbox_rotated = torch.matmul(
                translation_matrix,
                torch.matmul(rotation_matrix,
                             torch.matmul(reverse_translation_matrix,
                                          box)))[:2]
            # print(bbox_rotated)
            # print("\nrotation matrix shape:",rotation_matrix.shape)
            # rotation_matrix = torch.from_numpy(rotation_matrix)
            # bbox_rotated = torch.matmul(rotation_matrix, torch.transpose(box, 0, 1))
            # print("\nbbox_rotated shape:",bbox_rotated.shape)
            # print("\nrotated_bbox:", bbox_rotated)
            # print("\nbbox new shape:",bbox_new.shape)
            if box[0][0] <= box[2][0] and box[0][1] >= box[1][1]:

                bbox_new[i, 0] = bbox_rotated[0, 1]
                bbox_new[i, 1] = bbox_rotated[1, 1]
                bbox_new[i, 2] = bbox_rotated[0, 2]
                bbox_new[i, 3] = bbox_rotated[1, 2]

            else:

                bbox_new[i, 0] = bbox_rotated[0, 0]
                bbox_new[i, 1] = bbox_rotated[1, 0]
                bbox_new[i, 2] = bbox_rotated[0, 3]
                bbox_new[i, 3] = bbox_rotated[1, 3]

            # print("\nafter:",bbox_new[i])
            # if len(bbox_rotated[bbox_rotated<0])>0:

        # print(bbox[0])
        # print(scene_id, sample_id, bounding_box.shape)
        classes = torch.as_tensor(categories).view(-1, 1)

        # print(bbox_new.shape,classes.shape)

        if self.args.gen_semantic_map:
            semantic_map_path = os.path.join(sample_path, "semantic_map.npy")
            semantic_map = np.load(semantic_map_path)
            semantic_map = F.one_hot(
                torch.tensor(semantic_map).to(torch.int64), 11)

        else:  # self.args.gen_object_map:
            semantic_map_path = os.path.join(sample_path, "object_map.npy")
            semantic_map = np.load(semantic_map_path)
            semantic_map = F.one_hot(
                torch.tensor(semantic_map).to(torch.int64), 3)

        semantic_map = semantic_map.transpose(1, 2).transpose(0, 1)

        # plt.imshow(semantic_map)

        if self.extra_info:
            actions = data_entries.action_id.to_numpy()
            # You can change binary_lane to False to get a non-binary lane image
            lane_image = convert_map_to_lane_map(ego_image, binary_lane=True)

            action = torch.as_tensor(actions)
            ego = self.transform["road"](ego_image)
            road = lane_image

            # print(scene_id, sample_id, bounding_box[0])
            # print(bounding_box.shape,classes.shape)
            # print(classes)
            # exit(0)
            return index, image_tensor, bbox_new, classes, action, ego, road_image, semantic_map

        else:
            return index, image_tensor, bbox_new, classes
Example #21
# -- Matrix Multiplication --
x1 = torch.rand((2, 5))
x2 = torch.rand((5, 3))
x3 = torch.mm(x1, x2)  # Matrix multiplication of x1 and x2, out shape: 2x3
x3 = x1.mm(x2)  # same as the line above

# -- Matrix Exponentiation --
matrix_exp = torch.rand(5, 5)
print(matrix_exp.matrix_power(3))  # same as matrix_exp.mm(matrix_exp).mm(matrix_exp)

# -- Element-wise Multiplication --
x = torch.tensor([1, 2, 3])  # x and y defined here so the snippet runs standalone
y = torch.tensor([9, 8, 7])
z = x * y  # z = [9, 16, 21] = [1*9, 2*8, 3*7]

# -- Dot product --
z = torch.dot(x, y)  # Dot product, in this case z = 1*9 + 2*8 + 3*7

# -- Batch Matrix Multiplication --
batch = 32
n = 10
m = 20
p = 30
tensor1 = torch.rand((batch, n, m))
tensor2 = torch.rand((batch, m, p))
out_bmm = torch.bmm(tensor1, tensor2)  # Will be shape: (b x n x p)

# -- Example of broadcasting --
x1 = torch.rand((5, 5))
x2 = torch.ones((1, 5))
z = (
    x1 - x2
)  # x2 (1x5) is broadcast across each row of x1 (5x5); z has shape 5x5
Example #22
#--------------------------------------------------------------------

# Matrix modes_to_nodes
val_r_inv = torch.inverse(val_r)

# Compute coordinate modes
coords_modes = torch.mm(val_r_inv, coords)

# Initialize interpolated coordinates
interp_coords = torch.mm(val_i, coords_modes)

# Initialize jacobian
jacobian = torch.empty(3, 3, nnodes_if, dtype=torch.float64)
for inode in range(0, nnodes_if):
    jacobian[0, 0, inode] = torch.dot(ddxi_i[inode, :], coords_modes[:, 0])
    jacobian[0, 1, inode] = torch.dot(ddeta_i[inode, :], coords_modes[:, 0])
    jacobian[0, 2, inode] = torch.dot(ddzeta_i[inode, :], coords_modes[:, 0])
    jacobian[1, 0, inode] = torch.dot(ddxi_i[inode, :], coords_modes[:, 1])
    jacobian[1, 1, inode] = torch.dot(ddeta_i[inode, :], coords_modes[:, 1])
    jacobian[1, 2, inode] = torch.dot(ddzeta_i[inode, :], coords_modes[:, 1])
    jacobian[2, 0, inode] = torch.dot(ddxi_i[inode, :], coords_modes[:, 2])
    jacobian[2, 1, inode] = torch.dot(ddeta_i[inode, :], coords_modes[:, 2])
    jacobian[2, 2, inode] = torch.dot(ddzeta_i[inode, :], coords_modes[:, 2])
    update_progress("Computing Jacobian                 ",
                    inode / (nnodes_if - 1))
if coord_sys == 'CYLINDRICAL':
    scaling_factor = torch.mm(val_i, coords_modes[:, 0])
    for inode in range(0, nnodes_if):
        jacobian[1, 0, inode] = jacobian[1, 0, inode] * scaling_factor[inode]
        jacobian[1, 1, inode] = jacobian[1, 1, inode] * scaling_factor[inode]
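The nine per-node dot products in the loop above can be collapsed into a single einsum; a sketch with placeholder shapes (nnodes_if and nmodes assumed), stacking the derivative matrices so that jacobian[i, j, n] = dot(D[j][n, :], coords_modes[:, i]):

import torch

nnodes_if, nmodes = 8, 20                  # placeholder sizes
ddxi_i = torch.randn(nnodes_if, nmodes)
ddeta_i = torch.randn(nnodes_if, nmodes)
ddzeta_i = torch.randn(nnodes_if, nmodes)
coords_modes = torch.randn(nmodes, 3)

D = torch.stack([ddxi_i, ddeta_i, ddzeta_i])             # (3, nnodes_if, nmodes)
jacobian = torch.einsum('jnm,mi->ijn', D, coords_modes)  # (3, 3, nnodes_if)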
Example #23
    def train(self, num_episodes, save_path, batch_size):
        loss_file = open("results/losses.txt", "w")
        reward_file = open("results/rewards.txt", "w")

        step = 0
        for e in tqdm.tqdm(range(1, num_episodes + 1)):
            state = self.env.reset()
            done = False
            episode_reward = 0
            if self.prioritized_sample:
                gradient_accum = [
                    torch.zeros(param.shape).to(self.device)
                    for param in self.training_model.parameters()
                ]
            while not done:
                step += 1
                self.epsilon = max(self.epsilon_min,
                                   self.epsilon * self.epsilon_decay)
                action = self.get_action(state)
                # self.env.render()
                new_state, reward, done, _ = self.env.step(action.item())
                self.replay_buffer.push(
                    [state, action, new_state if not done else None, reward])
                episode_reward += reward
                state = new_state

                if len(self.replay_buffer) > self.minimum_buffer_size:
                    if self.prioritized_sample:
                        states, actions, new_states, rewards, mask, importance_weights, indices = self.replay_buffer.sample(
                            batch_size)
                        importance_weights = importance_weights.to(self.device)
                    else:
                        states, actions, new_states, rewards, mask = self.replay_buffer.sample(
                            batch_size)

                    rewards = rewards.to(self.device)
                    mask = mask.to(self.device)
                    best_actions = self.get_best_actions(new_states)
                    target_q_values = self.get_target_q_value(
                        new_states, best_actions) * mask
                    expected_q_values = self.get_training_q_value(
                        states, actions)
                    errors = (rewards + self.gamma * target_q_values -
                              expected_q_values).pow(2)

                    if self.prioritized_sample:
                        self.replay_buffer.update(errors.detach().cpu(),
                                                  indices)

                    loss = errors.mean().sqrt()
                    loss_file.write("{}\n".format(loss.item()))
                    self.optimizer.zero_grad()
                    loss.backward()

                    if self.prioritized_sample:
                        with torch.no_grad():
                            for i, param in enumerate(
                                    self.training_model.parameters()):
                                gradient_accum[i] += torch.dot(
                                    importance_weights, errors) * param.grad

                    with torch.no_grad():
                        for param in self.training_model.parameters():
                            param.grad.data.clamp_(-1, 1)

                    self.optimizer.step()

                    if (self.transfer_frequency >
                            0) and (step % self.transfer_frequency == 0):
                        self.target_model.load_state_dict(
                            self.training_model.state_dict())

            if self.prioritized_sample:
                with torch.no_grad():
                    for i, param in enumerate(
                            self.training_model.parameters()):
                        new_param = param + gradient_accum[i]
                        param.data.copy_(new_param)

            reward_file.write("{}\t{}\n".format(e, episode_reward))

        self.target_model.load_state_dict(self.training_model.state_dict())
        torch.save(self.target_model.state_dict(), save_path)
        loss_file.close()
        reward_file.close()
        self.env.close()
Example #24
    def compute_weight(self, module, do_power_iteration):
        # NB: If `do_power_iteration` is set, the `u` and `v` vectors are
        #     updated in power iteration **in-place**. This is very important
        #     because in `DataParallel` forward, the vectors (being buffers) are
        #     broadcast from the parallelized module to each module replica,
        #     which is a new module object created on the fly. And each replica
        #     runs its own spectral norm power iteration. So simply assigning
        #     the updated vectors to the module this function runs on will cause
        #     the update to be lost forever. And the next time the parallelized
        #     module is replicated, the same randomly initialized vectors are
        #     broadcast and used!
        #
        #     Therefore, to make the change propagate back, we rely on two
        #     important behaviors (also enforced via tests):
        #       1. `DataParallel` doesn't clone storage if the broadcast tensor
        #          is already on the correct device; and it makes sure that the
        #          parallelized module is already on `device[0]`.
        #       2. If the out tensor in `out=` kwarg has correct shape, it will
        #          just fill in the values.
        #     Therefore, since the same power iteration is performed on all
        #     devices, simply updating the tensors in-place will make sure that
        #     the module replica on `device[0]` will update the _u vector on the
        #     parallelized module (by shared storage).
        #
        #    However, after we update `u` and `v` in-place, we need to **clone**
        #    them before using them to normalize the weight. This is to support
        #    backpropagating through two forward passes, e.g., the common pattern in
        #    GAN training: loss = D(real) - D(fake). Otherwise, the engine will
        #    complain that variables needed to do backward for the first forward
        #    (i.e., the `u` and `v` vectors) are changed in the second forward.
        weight = getattr(module, self.name + '_orig')
        u = getattr(module, self.name + '_u')
        v = getattr(module, self.name + '_v')
        sigma_log = getattr(module, self.name + '_sigma')  # for logging

        # get settings from conv-module (for transposed convolution)
        stride = module.stride
        padding = module.padding

        if do_power_iteration:
            with torch.no_grad():
                for _ in range(self.n_power_iterations):
                    v_s = conv_transpose2d(u.view(self.out_shape),
                                           weight,
                                           stride=stride,
                                           padding=padding,
                                           output_padding=0)
                    # Note: out flag for in-place changes
                    v = normalize(v_s.view(-1), dim=0, eps=self.eps, out=v)

                    u_s = conv2d(v.view(self.input_dim),
                                 weight,
                                 stride=stride,
                                 padding=padding,
                                 bias=None)
                    u = normalize(u_s.view(-1), dim=0, eps=self.eps, out=u)
                if self.n_power_iterations > 0:
                    # See above on why we need to clone
                    u = u.clone()
                    v = v.clone()
        weight_v = conv2d(v.view(self.input_dim),
                          weight,
                          stride=stride,
                          padding=padding,
                          bias=None)
        weight_v = weight_v.view(-1)
        sigma = torch.dot(u.view(-1), weight_v)
        # enforce spectral norm only as constraint
        factorReverse = torch.max(
            torch.ones(1).to(weight.device), sigma / self.coeff)
        # for logging
        weight_v_det = weight_v.detach()
        u_det = u.detach()
        torch.max(torch.dot(u_det.view(-1), weight_v_det),
                  torch.dot(u_det.view(-1), weight_v_det),
                  out=sigma_log)

        # rescaling
        weight = weight / (factorReverse + 1e-5)  # for stability
        return weight
Example #25
    def compute_weight(self, module, do_power_iteration):
        # NB: If `do_power_iteration` is set, the `u` and `v` vectors are
        #     updated in power iteration **in-place**. This is very important
        #     because in `DataParallel` forward, the vectors (being buffers) are
        #     broadcast from the parallelized module to each module replica,
        #     which is a new module object created on the fly. And each replica
        #     runs its own spectral norm power iteration. So simply assigning
        #     the updated vectors to the module this function runs on will cause
        #     the update to be lost forever. And the next time the parallelized
        #     module is replicated, the same randomly initialized vectors are
        #     broadcast and used!
        #
        #     Therefore, to make the change propagate back, we rely on two
        #     important behaviors (also enforced via tests):
        #       1. `DataParallel` doesn't clone storage if the broadcast tensor
        #          is already on the correct device; and it makes sure that the
        #          parallelized module is already on `device[0]`.
        #       2. If the out tensor in `out=` kwarg has correct shape, it will
        #          just fill in the values.
        #     Therefore, since the same power iteration is performed on all
        #     devices, simply updating the tensors in-place will make sure that
        #     the module replica on `device[0]` will update the _u vector on the
        #     parallelized module (by shared storage).
        #
        #    However, after we update `u` and `v` in-place, we need to **clone**
        #    them before using them to normalize the weight. This is to support
        #    backpropagating through two forward passes, e.g., the common pattern in
        #    GAN training: loss = D(real) - D(fake). Otherwise, the engine will
        #    complain that variables needed to do backward for the first forward
        #    (i.e., the `u` and `v` vectors) are changed in the second forward.
        weight = getattr(module, self.name + '_orig')
        u = getattr(module, self.name + '_u')
        v = getattr(module, self.name + '_v')
        sigma_log = getattr(module, self.name + '_sigma')  # for logging
        weight_mat = self.reshape_weight_to_matrix(weight)

        if do_power_iteration:
            with torch.no_grad():
                for _ in range(self.n_power_iterations):
                    # The spectral norm of the weight equals `u^T W v`, where `u` and `v`
                    # are the first left and right singular vectors.
                    # This power iteration produces approximations of `u` and `v`.
                    v = normalize(torch.mv(weight_mat.t(), u),
                                  dim=0,
                                  eps=self.eps,
                                  out=v)
                    u = normalize(torch.mv(weight_mat, v),
                                  dim=0,
                                  eps=self.eps,
                                  out=u)
                if self.n_power_iterations > 0:
                    # See above on why we need to clone
                    u = u.clone()
                    v = v.clone()

        sigma = torch.dot(u, torch.mv(weight_mat, v))
        # soft normalization: only when sigma larger than coeff
        factor = torch.max(torch.ones(1).to(weight.device), sigma / self.coeff)
        weight = weight / factor
        # for logging
        sigma_det = sigma.detach()
        torch.max(torch.ones(1).to(weight.device),
                  sigma_det / self.coeff,
                  out=sigma_log)
        return weight
Example #26
def Bearing(xA_4d, xB_4d):
    dp = xA_4d[:2] - xB_4d[:2]
    v = xA_4d[2:]
    cos_theta = torch.dot(dp, v) / (torch.norm(dp) * torch.norm(v) + 1E-6)
    return cos_theta
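A quick usage sketch, with the first two components as position and the last two as velocity:

import torch

xA = torch.tensor([2., 0., 1., 0.])   # A at (2, 0), moving along +x
xB = torch.tensor([0., 0., 0., 0.])   # B at the origin
print(Bearing(xA, xB))                # ~1.0: A's velocity aligns with the displacement from B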
Example #27
def compute_nll_from_model(data,
                           pathmodel,
                           pathweights,
                           image_shape,
                           num_classes,
                           nb_step=5,
                           optim_default=partial(optim.SGD,
                                                 lr=5e-5,
                                                 momentum=0.),
                           dataloader=False):

    print("Compute NLL from Model")
    torch.random.manual_seed(0)
    np.random.seed(0)

    lls = {}
    grad_total = {}
    grad_stat_total = {}
    likelihood_ratio_statistic = {}

    grad_total[0] = []
    for k in range(nb_step + 1):
        lls[k] = []
        #   grad_total[k] = []
        grad_stat_total[k] = []
        likelihood_ratio_statistic[k] = []

    model = load_model_from_param(pathmodel, pathweights, num_classes,
                                  image_shape).cuda()

    if not dataloader:
        dataloader_aux = [(tqdm.tqdm(data), None)]
    else:
        dataloader_aux = tqdm.tqdm(iter(data))
    for data_list, _ in dataloader_aux:
        for x in data_list:
            # load weights.  print the weights.
            model_copy = load_model_from_param(pathmodel, pathweights,
                                               num_classes,
                                               image_shape).cuda()
            optimizer = optim_default(model_copy.parameters())
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                break

            model_copy.zero_grad()

            grads = []
            diff_param = []
            x = x.to(device_test).unsqueeze(0)
            _, nll, _ = model_copy(x, y_onehot=None)
            nll.backward()
            lls[0].append(-nll.detach().cpu().item())
            optimizer.step()
            for name_copy, param_copy in model_copy.named_parameters():
                if param_copy.grad is not None:
                    grads.append(-param_copy.grad.view(-1))
            grad_total[0].append(
                torch.sum(lr * (torch.cat(grads)**2)).detach().cpu().item())

            for (name_copy,
                 param_copy), (name,
                               param) in zip(model_copy.named_parameters(),
                                             model.named_parameters()):
                assert (name_copy == name)
                if param_copy.grad is not None:
                    aux_diff_param = param_copy.data.detach(
                    ) - param.data.detach()
                    diff_param.append(aux_diff_param.view(-1))
            grads = torch.flatten(torch.cat(grads))
            diff_param = torch.flatten(torch.cat(diff_param))
            if not torch.isinf(torch.abs(torch.dot(grads, diff_param))).any():
                grad_stat_total[0].append(
                    torch.abs(torch.dot(grads,
                                        diff_param)).detach().cpu().item())

            for k in range(1, nb_step + 1):
                model_copy.zero_grad()
                diff_param = []
                _, nll, _ = model_copy(x, y_onehot=None)
                nll.backward()
                if not torch.isinf(-nll).any():
                    lls[k].append(-nll.detach().cpu().item())
                else:
                    print("INF NLL")
                    lls[k].append(torch.sign(nll).detach().cpu().item() * 1e8)

                optimizer.step()
                for (name_copy,
                     param_copy), (name,
                                   param) in zip(model_copy.named_parameters(),
                                                 model.named_parameters()):
                    assert (name_copy == name)
                    if param_copy.grad is not None:
                        aux_diff_param = param_copy.data.detach(
                        ) - param.data.detach()
                        diff_param.append(aux_diff_param.view(-1))
                diff_param = torch.flatten(torch.cat(diff_param))

                if not torch.isinf(torch.abs(torch.dot(grads,
                                                       diff_param))).any():
                    grad_stat_total[k].append(
                        torch.abs(torch.dot(grads,
                                            diff_param)).detach().cpu().item())
                else:
                    print("Inf grad stat")

    grad_total[0] = np.array(grad_total[0])
    for key in grad_stat_total.keys():
        lls[key] = np.array(lls[key])
        likelihood_ratio_statistic[key] = lls[key] - lls[0]
        likelihood_ratio_statistic[key] = likelihood_ratio_statistic[key][
            np.where(np.abs(likelihood_ratio_statistic[key]) < 1e7)]
        grad_stat_total[key] = np.array(grad_stat_total[key])

    return lls, grad_total, grad_stat_total, likelihood_ratio_statistic
Exemple #28
0
def compute_nll(data,
                model,
                nb_step=1,
                optim_default=partial(optim.SGD, lr=1e-5, momentum=0.),
                dataloader=False):
    print("Compute NLL")
    torch.random.manual_seed(0)
    np.random.seed(0)

    lls = {}
    grad_total = {}
    grad_stat_total = {}
    likelihood_ratio_statistic = {}

    for k in range(nb_step + 1):
        lls[k] = []
        grad_total[k] = []
        grad_stat_total[k] = []
        likelihood_ratio_statistic[k] = []

    if not dataloader:
        dataloader_aux = [(tqdm.tqdm(data), None)]
    else:
        dataloader_aux = tqdm.tqdm(iter(data))
    for data_list, _ in dataloader_aux:
        for x in data_list:
            # make a fresh deep copy of the model for each sample
            model_copy = copy.deepcopy(model).to(device_test)
            optimizer = optim_default(model_copy.parameters())
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                break

            model_copy.zero_grad()

            grads = []
            diff_param = []
            x = x.to(device_test).unsqueeze(0)
            _, nll, _ = model_copy(x, y_onehot=None)
            nll.backward()
            lls[0].append(-nll.detach().cpu().item())
            optimizer.step()
            for name_copy, param_copy in model_copy.named_parameters():
                if param_copy.grad is not None:
                    grads.append(-param_copy.grad.view(-1))
            grad_total[0].append(
                torch.sum(lr * (torch.cat(grads)**2)).detach().cpu().item())

            for (name_copy,
                 param_copy), (name,
                               param) in zip(model_copy.named_parameters(),
                                             model.named_parameters()):
                assert (name_copy == name)
                if param_copy.grad is not None:
                    aux_diff_param = param_copy.data - param.data
                    diff_param.append(aux_diff_param.view(-1))
            grads = torch.flatten(torch.cat(grads))
            diff_param = torch.flatten(torch.cat(diff_param))
            grad_stat_total[0].append(
                torch.abs(torch.dot(grads, diff_param)).detach().cpu().item())

            for k in range(1, nb_step + 1):
                model_copy.zero_grad()
                diff_param = []
                _, nll, _ = model_copy(x, y_onehot=None)
                nll.backward()
                lls[k].append(-nll.detach().cpu().item())
                optimizer.step()
                for (name_copy,
                     param_copy), (name,
                                   param) in zip(model_copy.named_parameters(),
                                                 model.named_parameters()):
                    assert (name_copy == name)
                    if param_copy.grad is not None:
                        aux_diff_param = param_copy.data - param.data
                        diff_param.append(aux_diff_param.view(-1))

                grad_total[k].append(
                    torch.sum((grads**2) * lr).detach().cpu().item())
                diff_param = torch.flatten(torch.cat(diff_param))
                grad_stat_total[k].append(
                    torch.abs(torch.dot(grads,
                                        diff_param)).detach().cpu().item())

    for key in grad_total.keys():
        grad_total[key] = np.array(grad_total[key])
        lls[key] = np.array(lls[key])
        likelihood_ratio_statistic[key] = lls[key] - lls[0]
        grad_stat_total[key] = np.array(grad_stat_total[key])

    return lls, grad_total, grad_stat_total, likelihood_ratio_statistic
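# Hedged note (not in the original): grad_stat_total above accumulates
# |<g, theta' - theta>|, a first-order Taylor estimate of the NLL change after
# the optimizer steps. A minimal standalone illustration of that quantity:
import torch

theta = torch.tensor([1., 2.], requires_grad=True)
loss = (theta ** 2).sum()
loss.backward()
g = theta.grad.clone()
delta = -0.1 * g                     # one plain SGD step with lr = 0.1
print(torch.dot(g, delta))           # tensor(-2.): predicted loss decrease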
def estimate_metrics(pred, random_query, binary_target, sup_net, switch_vec):
    query_pred = torch.gather(pred, 1, random_query.view(-1, 1)).squeeze(1)

    num_s = torch.tensor(np.sum(switch_vec).item(),
                         dtype=torch.float32).to(device)
    # ipdb.set_trace()

    _, class_pred = torch.max(pred, dim=1)
    binary_pred = class_pred.eq(random_query).type(torch.cuda.LongTensor)
    correct = binary_target.eq(binary_pred).sum()
    accuracy = correct.type(torch.cuda.FloatTensor) / binary_target.size(0)

    bce_loss = bce(query_pred, binary_target.type(torch.cuda.FloatTensor))

    metrics = {}
    s_hist = {}
    ortho_mtrx = {}

    metrics['accuracy'] = accuracy
    metrics['bce_loss'] = bce_loss * args.lambda_bce
    metrics['l1_loss_total'] = torch.from_numpy(np.float32([0.])).to(device)
    metrics['orthogonality_loss_total'] = torch.from_numpy(np.float32(
        [0.])).to(device)
    metrics['quantization_loss_total'] = torch.from_numpy(np.float32(
        [0.])).to(device)
    metrics['total_loss'] = torch.from_numpy(np.float32([0.])).to(device)
    # ipdb.set_trace()
    metrics['total_loss'] = metrics['total_loss'] + metrics['bce_loss']

    one_hot = torch.zeros((10, 10)).fill_(1).to(device)
    s_one_hot = torch.zeros(10, 10).type(torch.cuda.FloatTensor)
    s_queries = torch.from_numpy(np.array(list(range(10)))).to(device)
    s_one_hot = s_one_hot.scatter_(dim=1,
                                   index=s_queries.view(-1, 1),
                                   src=one_hot)

    s_vectors_all = sup_net(s_one_hot)

    for k in range(len(switch_vec)):
        if switch_vec[k]:
            s_vectors = s_vectors_all[k]
            for i in range(10):
                s_hist['s_layer_{}_class_{}'.format(
                    k, i)] = s_vectors[i].cpu().data.numpy()

            sparsity_loss = l1(s_vectors,
                               torch.zeros_like(s_vectors).to(device))

            orth_loss = torch.from_numpy(np.float32([0.])).to(device)

            for i in range(10):
                for j in range(i, 10):
                    orth_loss = orth_loss + torch.dot(s_vectors[i],
                                                      s_vectors[j])

            ortho_mtrx['layer_{}'.format(k)] = np.zeros((10, 10))
            for i in range(10):
                for j in range(10):
                    ortho_mtrx['layer_{}'.format(k)][i][j] = torch.dot(
                        s_vectors[i], s_vectors[j]).cpu().data.numpy()

            quantization_target = s_vectors.detach() > 0.5
            quantization_loss = mse(
                s_vectors, quantization_target.type(torch.cuda.FloatTensor))

            orth_loss = orth_loss / 45

            # ipdb.set_trace()
            metrics['l1_loss_{}'.format(k)] = sparsity_loss * args.lambda_l1
            metrics['l1_loss_total'] = metrics['l1_loss_total'] + metrics[
                'l1_loss_{}'.format(k)]

            metrics['orthogonality_loss_{}'.format(
                k)] = orth_loss * args.lambda_ortho
            metrics['orthogonality_loss_total'] = metrics[
                'orthogonality_loss_total'] + metrics[
                    'orthogonality_loss_{}'.format(k)]

            metrics['quantization_loss_{}'.format(
                k)] = quantization_loss * args.lambda_quant
            metrics['quantization_loss_total'] = metrics[
                'quantization_loss_total'] + metrics[
                    'quantization_loss_{}'.format(k)]

    # ipdb.set_trace()

    metrics['total_loss'] = metrics['total_loss'] + metrics['l1_loss_total']/num_s + \
                            metrics['orthogonality_loss_total']/num_s + \
                            metrics['quantization_loss_total']/num_s

    return metrics, s_hist, ortho_mtrx
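# Hedged standalone sketch (not in the original): the double loop computing
# orth_loss sums dot(s_i, s_j) over all pairs with i <= j, i.e. the upper
# triangle (diagonal included) of the Gram matrix S S^T. A vectorized
# equivalent, keeping the original division by 45:
import torch

S = torch.randn(10, 16)                      # 10 class vectors of dim 16
orth_loss_vec = torch.triu(S @ S.t()).sum() / 45
print(orth_loss_vec)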
Exemple #30
0
def train_model(num_epochs, dataset_name, datadir, feature, model_name,
                fraction, select_every, optim_type, learning_rate, run, device,
                log_dir, trn_batch_size, strategy):

    # Loading the Dataset
    trainset, validset, testset, num_cls = load_dataset_custom(
        datadir, dataset_name, feature)
    N = len(trainset)
    val_batch_size = 1000
    tst_batch_size = 1000

    # Creating the Data Loaders
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=trn_batch_size,
                                              shuffle=False,
                                              pin_memory=True)

    valloader = torch.utils.data.DataLoader(validset,
                                            batch_size=val_batch_size,
                                            shuffle=False,
                                            pin_memory=True)

    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=tst_batch_size,
                                             shuffle=False,
                                             pin_memory=True)

    # Budget for subset selection
    bud = int(fraction * N)
    print("Budget, fraction and N:", bud, fraction, N)

    # Subset Selection and creating the subset data loader
    start_idxs = np.random.choice(N, size=bud, replace=False)
    idxs = start_idxs
    data_sub = Subset(trainset, idxs)
    subset_trnloader = torch.utils.data.DataLoader(data_sub,
                                                   batch_size=trn_batch_size,
                                                   shuffle=False,
                                                   pin_memory=True)

    # Variables to store accuracies
    gammas = torch.ones(len(idxs)).to(device)
    substrn_losses = np.zeros(num_epochs)
    val_losses = np.zeros(num_epochs)
    timing = np.zeros(num_epochs)
    val_acc = np.zeros(num_epochs)
    tst_acc = np.zeros(num_epochs)
    subtrn_acc = np.zeros(num_epochs)

    # Results logging file
    print_every = 3

    all_logs_dir = log_dir + '/' + str(uuid.uuid4())
    while os.path.exists(all_logs_dir):
        all_logs_dir = log_dir + '/' + str(uuid.uuid4())
    print(all_logs_dir)
    subprocess.run(["mkdir", "-p", all_logs_dir])
    path_logfile = os.path.join(all_logs_dir, 'log.txt')
    logfile = open(path_logfile, 'w')
    exp_name = dataset_name + '_fraction:' + str(fraction) + '_epochs:' + str(num_epochs) + \
               '_selEvery:' + str(select_every) + '_variant' + '_runs' + str(run)
    print(exp_name)

    # Model Creation
    model = create_model(model_name, num_cls, device)
    model1 = create_model(model_name, num_cls, device)
    # Loss Functions
    criterion, criterion_nored = loss_function()

    # Getting the optimizer and scheduler
    optimizer, scheduler = optimizer_with_scheduler(optim_type, model,
                                                    num_epochs, learning_rate)

    if strategy == 'GradMatch':
        # OMPGradMatch Selection strategy
        setf_model = OMPGradMatchStrategy(trainloader,
                                          valloader,
                                          model1,
                                          criterion,
                                          learning_rate,
                                          device,
                                          num_cls,
                                          True,
                                          'PerClassPerGradient',
                                          False,
                                          lam=0.5,
                                          eps=1e-100)
    elif strategy == 'GradMatchPB':
        setf_model = OMPGradMatchStrategy(trainloader,
                                          valloader,
                                          model1,
                                          criterion,
                                          learning_rate,
                                          device,
                                          num_cls,
                                          True,
                                          'PerBatch',
                                          False,
                                          lam=0,
                                          eps=1e-100)

    elif strategy == 'GradMatch-Explore':
        # OMPGradMatch Selection strategy
        setf_model = OMPGradMatchStrategy(trainloader,
                                          valloader,
                                          model1,
                                          criterion,
                                          learning_rate,
                                          device,
                                          num_cls,
                                          True,
                                          'PerClassPerGradient',
                                          False,
                                          lam=0.5,
                                          eps=1e-100)
        # Random-Online Selection strategy
        rand_setf_model = RandomStrategy(trainloader, online=True)

    elif strategy == 'GradMatchPB-Explore':
        # OMPGradMatch Selection strategy
        setf_model = OMPGradMatchStrategy(trainloader,
                                          valloader,
                                          model1,
                                          criterion,
                                          learning_rate,
                                          device,
                                          num_cls,
                                          True,
                                          'PerBatch',
                                          False,
                                          lam=0,
                                          eps=1e-100)
        # Random-Online Selection strategy
        rand_setf_model = RandomStrategy(trainloader, online=True)

    elif strategy == 'Random':
        # Random Selection strategy
        setf_model = RandomStrategy(trainloader, online=False)

    elif strategy == 'Random-Online':
        # Random-Online Selection strategy
        setf_model = RandomStrategy(trainloader, online=True)

    print("=======================================", file=logfile)
    kappa_epochs = int(0.5 * num_epochs)
    full_epochs = floor(kappa_epochs / int(fraction * 100))

    for i in range(num_epochs):
        subtrn_loss = 0
        subtrn_correct = 0
        subtrn_total = 0
        subset_selection_time = 0

        if (strategy in [
                'GLISTER', 'GradMatch', 'GradMatchPB', 'CRAIG', 'CRAIGPB'
        ]) and (((i + 1) % select_every) == 0):
            start_time = time.time()
            cached_state_dict = copy.deepcopy(model.state_dict())
            clone_dict = copy.deepcopy(model.state_dict())
            if strategy in ['CRAIG', 'CRAIGPB']:
                subset_idxs, gammas = setf_model.select(
                    int(bud), clone_dict, 'lazy')
            else:
                subset_idxs, gammas = setf_model.select(int(bud), clone_dict)
            model.load_state_dict(cached_state_dict)
            idxs = subset_idxs
            if strategy in ['GradMatch', 'GradMatchPB', 'CRAIG', 'CRAIGPB']:
                gammas = torch.from_numpy(np.array(gammas)).to(device).to(
                    torch.float32)
            subset_selection_time += (time.time() - start_time)

        elif (strategy in [
                'GLISTER-Explore', 'GradMatch-Explore', 'GradMatchPB-Explore',
                'CRAIG-Explore', 'CRAIGPB-Explore'
        ]):
            start_time = time.time()
            if i < full_epochs:
                subset_idxs, gammas = rand_setf_model.select(int(bud))
                idxs = subset_idxs
                gammas = gammas.to(device)
            elif ((i % select_every == 0) and (i >= kappa_epochs)):
                cached_state_dict = copy.deepcopy(model.state_dict())
                clone_dict = copy.deepcopy(model.state_dict())
                if strategy in ['CRAIG-Explore', 'CRAIGPB-Explore']:
                    subset_idxs, gammas = setf_model.select(
                        int(bud), clone_dict, 'lazy')
                else:
                    subset_idxs, gammas = setf_model.select(
                        int(bud), clone_dict)
                model.load_state_dict(cached_state_dict)
                idxs = subset_idxs
                if strategy in [
                        'GradMatch-Explore', 'GradMatchPB-Explore',
                        'CRAIG-Explore', 'CRAIGPB-Explore'
                ]:
                    gammas = torch.from_numpy(np.array(gammas)).to(device).to(
                        torch.float32)
            subset_selection_time += (time.time() - start_time)

        print("selEpoch: %d, Selection Ended at:" % (i),
              str(datetime.datetime.now()))
        data_sub = Subset(trainset, idxs)
        subset_trnloader = torch.utils.data.DataLoader(
            data_sub,
            batch_size=trn_batch_size,
            shuffle=False,
            pin_memory=True)

        model.train()
        batch_wise_indices = list(subset_trnloader.batch_sampler)
        if strategy in ['CRAIG', 'CRAIGPB', 'GradMatch', 'GradMatchPB']:
            start_time = time.time()
            for batch_idx, (inputs, targets) in enumerate(subset_trnloader):
                inputs, targets = inputs.to(device), targets.to(
                    device,
                    non_blocking=True)  # targets can have non_blocking=True.
                optimizer.zero_grad()
                outputs = model(inputs)
                losses = criterion_nored(outputs, targets)
                loss = torch.dot(
                    losses, gammas[batch_wise_indices[batch_idx]]) / (
                        gammas[batch_wise_indices[batch_idx]].sum())
                loss.backward()
                subtrn_loss += loss.item()
                optimizer.step()
                _, predicted = outputs.max(1)
                subtrn_total += targets.size(0)
                subtrn_correct += predicted.eq(targets).sum().item()
            train_time = time.time() - start_time

        elif strategy in [
                'CRAIGPB-Explore', 'CRAIG-Explore', 'GradMatch-Explore',
                'GradMatchPB-Explore'
        ]:
            start_time = time.time()
            if i < full_epochs:
                for batch_idx, (inputs, targets) in enumerate(trainloader):
                    inputs, targets = inputs.to(device), targets.to(
                        device, non_blocking=True
                    )  # targets can have non_blocking=True.
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    subtrn_loss += loss.item()
                    optimizer.step()
                    _, predicted = outputs.max(1)
                    subtrn_total += targets.size(0)
                    subtrn_correct += predicted.eq(targets).sum().item()

            elif i >= kappa_epochs:
                for batch_idx, (inputs,
                                targets) in enumerate(subset_trnloader):
                    inputs, targets = inputs.to(device), targets.to(
                        device, non_blocking=True
                    )  # targets can have non_blocking=True.
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    losses = criterion_nored(outputs, targets)
                    loss = torch.dot(
                        losses, gammas[batch_wise_indices[batch_idx]]) / (
                            gammas[batch_wise_indices[batch_idx]].sum())
                    loss.backward()
                    subtrn_loss += loss.item()
                    optimizer.step()
                    _, predicted = outputs.max(1)
                    subtrn_total += targets.size(0)
                    subtrn_correct += predicted.eq(targets).sum().item()
            train_time = time.time() - start_time

        elif strategy in ['Full']:
            start_time = time.time()
            for batch_idx, (inputs, targets) in enumerate(trainloader):
                inputs, targets = inputs.to(device), targets.to(
                    device,
                    non_blocking=True)  # targets can have non_blocking=True.
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                subtrn_loss += loss.item()
                optimizer.step()
                _, predicted = outputs.max(1)
                subtrn_total += targets.size(0)
                subtrn_correct += predicted.eq(targets).sum().item()
            train_time = time.time() - start_time

        scheduler.step()
        timing[i] = train_time + subset_selection_time
        # print("Epoch timing is: " + str(timing[i]))

        val_loss = 0
        val_correct = 0
        val_total = 0
        tst_correct = 0
        tst_total = 0
        tst_loss = 0
        model.eval()

        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(valloader):
                # print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(
                    device, non_blocking=True)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += targets.size(0)
                val_correct += predicted.eq(targets).sum().item()

            for batch_idx, (inputs, targets) in enumerate(testloader):
                # print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(
                    device, non_blocking=True)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                tst_loss += loss.item()
                _, predicted = outputs.max(1)
                tst_total += targets.size(0)
                tst_correct += predicted.eq(targets).sum().item()

        val_acc[i] = val_correct / val_total
        tst_acc[i] = tst_correct / tst_total
        subtrn_acc[i] = subtrn_correct / subtrn_total
        substrn_losses[i] = subtrn_loss
        val_losses[i] = val_loss
        print('Epoch:', i + 1, 'Validation Accuracy: ', val_acc[i],
              'Test Accuracy: ', tst_acc[i], 'Train Accuracy:', subtrn_acc[i],
              'Time: ', timing[i])
    print(strategy + " Selection Run---------------------------------")
    print("Final SubsetTrn:", subtrn_loss)
    print("Validation Loss and Accuracy:", val_loss, val_acc.max())
    print("Test Data Loss and Accuracy:", tst_loss, tst_acc.max())
    print('-----------------------------------')

    # Results logging into the file
    print(strategy, file=logfile)
    print(
        '---------------------------------------------------------------------',
        file=logfile)
    val = "Validation Accuracy, "
    tst = "Test Accuracy, "
    trn = "Train Accuracy, "
    time_str = "Time, "

    for i in range(num_epochs):
        time_str = time_str + "," + str(timing[i])
        val = val + "," + str(val_acc[i])
        trn = trn + "," + str(subtrn_acc[i])
        tst = tst + "," + str(tst_acc[i])

    print(timing, file=logfile)
    print(val, file=logfile)
    print(trn, file=logfile)
    print(tst, file=logfile)

    omp_timing = np.array(timing)
    omp_cum_timing = list(generate_cumulative_timing(omp_timing))
    omp_tst_acc = list(tst_acc)
    print("Total time taken by " + strategy + " = " + str(omp_cum_timing[-1]))
    logfile.close()
    return {
        'loss': -tst_acc.max(),
        'max_val_acc': val_acc.max(),
        'train_acc': subtrn_acc.max(),
        'status': STATUS_OK
    }
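# Hedged standalone sketch (not in the original): the subset-training loss
# above is a gamma-weighted mean of the per-sample losses,
# loss = dot(losses, gammas) / gammas.sum().
import torch

losses = torch.tensor([0.5, 1.0, 2.0])
gammas = torch.tensor([1.0, 2.0, 1.0])
print(torch.dot(losses, gammas) / gammas.sum())  # tensor(1.1250)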
def f(x):
    rewards = np.zeros(n_cats)
    rewards[0] = 1.
    rewards = torch.tensor(rewards).float()
    # print (x, rewards)
    return torch.dot(x, rewards)
Exemple #32
0
 def calc_potential_energy(self, xx):
     xx = xx - self.bias
     potential_energy = torch.dot(xx, torch.matmul(self.weight_matrix, xx))
     return potential_energy
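# Hedged standalone sketch of the method above: the potential energy is the
# quadratic form (x - bias)^T W (x - bias).
import torch

W = torch.tensor([[2., 0.], [0., 1.]])
bias = torch.tensor([1., 1.])
xx = torch.tensor([2., 3.]) - bias
print(torch.dot(xx, torch.mv(W, xx)))  # 2*1**2 + 1*2**2 = tensor(6.)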
Exemple #33
0
def get_coordinates(v, basis_vectors, offset_v):

    adjusted_v = v - offset_v
    coeffs = [float(torch.dot(adjusted_v, b) / torch.norm(b)) for b in basis_vectors]
    return coeffs
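# Hedged usage sketch: each coefficient is dot(v - offset_v, b) / |b|, the
# signed length of the projection of (v - offset_v) onto each basis direction.
import torch

basis = [torch.tensor([1., 0.]), torch.tensor([0., 2.])]
print(get_coordinates(torch.tensor([3., 4.]), basis, torch.tensor([0., 0.])))
# -> [3.0, 4.0]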
Exemple #34
0
import os
import torch

A = torch.tensor([1, 2, 3], dtype=torch.float)
B = torch.tensor([4, 5, 6], dtype=torch.float)

result = torch.dot(A, B)

print(result)
print(result.item())
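# Expected output (hedged; 1*4 + 2*5 + 3*6 = 32):
# tensor(32.)
# 32.0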
    def train(self, env, expert, render=False):
        num_iters = self.train_config["num_iters"]
        num_steps_per_iter = self.train_config["num_steps_per_iter"]
        horizon = self.train_config["horizon"]
        lambda_ = self.train_config["lambda"]
        gae_gamma = self.train_config["gae_gamma"]
        gae_lambda = self.train_config["gae_lambda"]
        eps = self.train_config["epsilon"]
        max_kl = self.train_config["max_kl"]
        cg_damping = self.train_config["cg_damping"]
        normalize_advantage = self.train_config["normalize_advantage"]

        opt_d = torch.optim.Adam(self.d.parameters())

        exp_rwd_iter = []

        exp_obs = []
        exp_acts = []

        steps = 0
        while steps < num_steps_per_iter:
            ep_obs = []
            ep_rwds = []

            t = 0
            done = False

            ob = env.reset()

            while not done and steps < num_steps_per_iter:
                act = expert.act(ob)

                ep_obs.append(ob)
                exp_obs.append(ob)
                exp_acts.append(act)

                if render:
                    env.render()
                ob, rwd, done, info = env.step(act)

                ep_rwds.append(rwd)

                t += 1
                steps += 1

                if horizon is not None:
                    if t >= horizon:
                        break

            if done:
                exp_rwd_iter.append(np.sum(ep_rwds))

            ep_obs = FloatTensor(ep_obs)
            ep_rwds = FloatTensor(ep_rwds)

        exp_rwd_mean = np.mean(exp_rwd_iter)
        print("Expert Reward Mean: {}".format(exp_rwd_mean))

        exp_obs = FloatTensor(exp_obs)
        exp_acts = FloatTensor(np.array(exp_acts))

        rwd_iter_means = []
        for i in range(num_iters):
            rwd_iter = []

            obs = []
            acts = []
            rets = []
            advs = []
            gms = []

            steps = 0
            while steps < num_steps_per_iter:
                ep_obs = []
                ep_acts = []
                ep_rwds = []
                ep_costs = []
                ep_disc_costs = []
                ep_gms = []
                ep_lmbs = []

                t = 0
                done = False

                ob = env.reset()

                while not done and steps < num_steps_per_iter:
                    act = self.act(ob)

                    ep_obs.append(ob)
                    obs.append(ob)

                    ep_acts.append(act)
                    acts.append(act)

                    if render:
                        env.render()
                    ob, rwd, done, info = env.step(act)

                    ep_rwds.append(rwd)
                    ep_gms.append(gae_gamma**t)
                    ep_lmbs.append(gae_lambda**t)

                    t += 1
                    steps += 1

                    if horizon is not None:
                        if t >= horizon:
                            break

                if done:
                    rwd_iter.append(np.sum(ep_rwds))

                ep_obs = FloatTensor(ep_obs)
                # ep_acts = FloatTensor(np.array(ep_acts)).to(torch.device("cuda"))
                ep_acts = FloatTensor(np.array(ep_acts))
                ep_rwds = FloatTensor(ep_rwds)
                # ep_disc_rwds = FloatTensor(ep_disc_rwds)
                ep_gms = FloatTensor(ep_gms)
                ep_lmbs = FloatTensor(ep_lmbs)

                ep_costs = (-1) * torch.log(self.d(ep_obs, ep_acts))\
                    .squeeze().detach()
                ep_disc_costs = ep_gms * ep_costs

                ep_disc_rets = FloatTensor(
                    [sum(ep_disc_costs[i:]) for i in range(t)])
                ep_rets = ep_disc_rets / ep_gms

                rets.append(ep_rets)

                self.v.eval()
                curr_vals = self.v(ep_obs).detach()
                next_vals = torch.cat(
                    (self.v(ep_obs)[1:], FloatTensor([[0.]]))).detach()
                ep_deltas = ep_costs.unsqueeze(-1)\
                    + gae_gamma * next_vals\
                    - curr_vals

                ep_advs = torch.FloatTensor([
                    ((ep_gms * ep_lmbs)[:t - j].unsqueeze(-1) *
                     ep_deltas[j:]).sum() for j in range(t)
                ])
                advs.append(ep_advs)

                gms.append(ep_gms)

            rwd_iter_means.append(np.mean(rwd_iter))
            print("Iterations: {},   Reward Mean: {}".format(
                i + 1, np.mean(rwd_iter)))

            obs = FloatTensor(obs)
            # acts = FloatTensor(np.array(acts)).to(torch.device("cuda"))
            acts = FloatTensor(np.array(acts))
            rets = torch.cat(rets)
            advs = torch.cat(advs)
            gms = torch.cat(gms)

            if normalize_advantage:
                advs = (advs - advs.mean()) / advs.std()

            self.d.train()
            exp_scores = self.d.get_logits(exp_obs, exp_acts)
            nov_scores = self.d.get_logits(obs, acts)

            opt_d.zero_grad()
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                exp_scores, torch.zeros_like(exp_scores)
            ) \
                + torch.nn.functional.binary_cross_entropy_with_logits(
                    nov_scores, torch.ones_like(nov_scores)
                )
            loss.backward()
            opt_d.step()

            self.v.train()
            old_params = get_flat_params(self.v).detach()
            old_v = self.v(obs).detach()

            def constraint():
                return ((old_v - self.v(obs))**2).mean()

            grad_diff = get_flat_grads(constraint(), self.v)

            def Hv(v):
                hessian = get_flat_grads(torch.dot(grad_diff, v), self.v)\
                    .detach()

                return hessian

            g = get_flat_grads(
                ((-1) * (self.v(obs).squeeze() - rets)**2).mean(),
                self.v).detach()
            s = conjugate_gradient(Hv, g).detach()

            Hs = Hv(s).detach()
            alpha = torch.sqrt(2 * eps / torch.dot(s, Hs))

            new_params = old_params + alpha * s

            set_params(self.v, new_params)

            self.pi.train()
            old_params = get_flat_params(self.pi).detach()
            old_distb = self.pi(obs)

            def L():
                distb = self.pi(obs)

                return (advs.to(torch.device("cuda")) * torch.exp(
                    distb.log_prob(acts) - old_distb.log_prob(acts).detach())
                        ).mean()

            def kld():
                distb = self.pi(obs)

                if self.discrete:
                    old_p = old_distb.probs.detach()
                    p = distb.probs

                    return (old_p * (torch.log(old_p) - torch.log(p)))\
                        .sum(-1)\
                        .mean()

                else:
                    old_mean = old_distb.mean.detach()
                    old_cov = old_distb.covariance_matrix.sum(-1).detach()
                    mean = distb.mean
                    cov = distb.covariance_matrix.sum(-1)

                    return (0.5) * ((old_cov / cov).sum(-1) +
                                    (((old_mean - mean)**2) / cov).sum(-1) -
                                    self.action_dim + torch.log(cov).sum(-1) -
                                    torch.log(old_cov).sum(-1)).mean()

            grad_kld_old_param = get_flat_grads(kld(), self.pi)

            def Hv(v):
                hessian = get_flat_grads(torch.dot(grad_kld_old_param, v),
                                         self.pi).detach()

                return hessian + cg_damping * v

            g = get_flat_grads(L(), self.pi).detach()

            s = conjugate_gradient(Hv, g).detach()
            Hs = Hv(s).detach()

            new_params = rescale_and_linesearch(g, s, Hs, max_kl, L, kld,
                                                old_params, self.pi)

            disc_causal_entropy = ((-1) * gms * self.pi(obs).log_prob(acts))\
                .mean()
            grad_disc_causal_entropy = get_flat_grads(disc_causal_entropy,
                                                      self.pi)
            new_params += lambda_ * grad_disc_causal_entropy

            set_params(self.pi, new_params)

        return exp_rwd_mean, rwd_iter_means
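# Hedged standalone sketch (not in the original): both Hv closures above
# compute Hessian-vector products by double backprop, H v = grad(<grad f, v>),
# so the full Hessian is never materialized.
import torch

x = torch.tensor([1., 2.], requires_grad=True)
f = (x ** 3).sum()                              # Hessian is diag(6 * x)
g, = torch.autograd.grad(f, x, create_graph=True)
v = torch.tensor([1., 1.])
hv, = torch.autograd.grad(torch.dot(g, v), x)
print(hv)                                       # tensor([ 6., 12.])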
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        assert len(self.param_groups) == 1

        loss = None
        if closure is not None:
            loss = closure()

        group = self.param_groups[0]
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        grad = self._gather_flat_grad_with_weight_decay(weight_decay)

        # NOTE: this optimizer has only global state, but we register it as
        # state for the first param, because this helps with casting in
        # load_state_dict
        state = self.state[self._params[0]]
        # State initialization
        if len(state) == 0:
            state['step'] = 0
            state['grad_prev'] = torch.zeros_like(grad)
            # Accumulated momentum for the hypergradient
            state['momentum_buffer_h'] = grad.new_tensor(0)

        state['step'] += 1

        grad_prev = state['grad_prev']

        # Hypergradient for SGD optimizer
        h = torch.dot(grad, grad_prev)
        h = -h
        ''' Hypergradient descent with momentum (HD-momentum) coefficients

        Parameters
        ----------
        momentum_h : momentum coefficient for the hypergradient
        dampening_h : dampening coefficient for the hypergradient
        nesterov_h : bool; if True, use Nesterov momentum for the
            learning-rate update, else use SGD with momentum
        '''
        momentum_h = group['momentum_h']
        dampening_h = group['dampening_h']
        nesterov_h = group['nesterov_h']

        # Hypergradient descent with momentum (HD momentum) for the learning rate
        if momentum_h and state['step'] > 1:
            buf_h = state['momentum_buffer_h']
            buf_h.mul_(momentum_h).add_(1 - dampening_h, h)
            state['momentum_buffer_h'] = buf_h
            if nesterov_h:
                h.add_(momentum_h, buf_h)
            else:
                h = buf_h

        group['lr'] -= group['hypergrad_lr'] * h

        if momentum != 0:
            if 'momentum_buffer' not in state:
                buf = state['momentum_buffer'] = torch.zeros_like(grad)
                buf.mul_(momentum).add_(grad)
            else:
                buf = state['momentum_buffer']
                buf.mul_(momentum).add_(1 - dampening, grad)
            if nesterov:
                grad.add_(momentum, buf)
            else:
                grad = buf

        state['grad_prev'] = grad

        self._add_grad(-group['lr'], grad)

        return loss
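# Hedged standalone sketch (not in the original): the hypergradient above is
# h = -<grad_t, grad_{t-1}>, and the learning rate is adapted with
# lr <- lr - hypergrad_lr * h, so lr grows while successive gradients agree.
import torch

grad_prev = torch.tensor([1., 1.])
grad = torch.tensor([1., 2.])
h = -torch.dot(grad, grad_prev)   # aligned gradients -> negative h
lr = 0.1 - 0.01 * h.item()        # hypergrad_lr = 0.01
print(lr)                         # 0.13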
Exemple #37
0
 def get_pure_mspbe(self):
     A_theta_minus_b = torch.mv(self.A, self.theta) - self.b
     return (1/2) * torch.dot(A_theta_minus_b, torch.mv(self.C_inv, A_theta_minus_b))
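# Hedged note (not in the original): this is the mean squared projected
# Bellman error, MSPBE(theta) = 0.5 * (A theta - b)^T C^{-1} (A theta - b).
# A standalone numeric check:
import torch

A = torch.eye(2)
b = torch.tensor([1., 2.])
C_inv = torch.eye(2)
theta = torch.zeros(2)
r = torch.mv(A, theta) - b
print(0.5 * torch.dot(r, torch.mv(C_inv, r)))  # tensor(2.5000)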
Exemple #38
0
def test(learner, args, train_envs, test_envs, log_dir):
    batch_sampler = sampler(args.batch_size, args.num_bandits)
    batch_sampler.build(args.num_tasks_train, train_envs, args.batch_size)
    max_kl = args.max_kl
    cg_iters = args.cg_iters
    cg_damping = args.cg_damping
    ls_max_steps = args.ls_max_steps
    ls_backtrack_ratio = args.ls_backtrack_ratio
    train_rew = []
    test_rew = []
    for i in range(args.num_updates):
        #print(i)
        adapt_params = []
        inner_losses = []
        adapt_episodes = []
        rew_rem = []
        rew_rem_test = []
        for j in range(args.num_tasks_test):
            e = batch_sampler.sample(test_envs[j], learner)
            inner_loss = learner.cal_loss(e.s, e.a, e.r)
            params = learner.update_params(inner_loss, args.inner_lr,
                                           args.first_order)
            a_e = batch_sampler.sample_policy(test_envs[j], learner, params)
            mean_rew = torch.mean(a_e.r).data.numpy()
            rew_rem_test.append(mean_rew)

        for j in range(args.num_tasks_train):
            e = batch_sampler.sample(train_envs[j], learner)
            inner_loss = learner.cal_loss(e.s, e.a, e.r)
            params = learner.update_params(inner_loss, args.inner_lr,
                                           args.first_order)
            a_e = batch_sampler.sample_policy(train_envs[j], learner, params)
            adapt_params.append(params)
            adapt_episodes.append(a_e)
            inner_losses.append(inner_loss)
            mean_rew = torch.mean(a_e.r).data.numpy()
            rew_rem.append(mean_rew)

        print(i, np.mean(rew_rem), np.mean(rew_rem_test))
        #print(batch_sampler.poss/batch_sampler.cnt)
        train_rew.append(np.mean(rew_rem))
        test_rew.append(np.mean(rew_rem_test))
        old_loss, _, old_pis = learner.surrogate_loss(adapt_episodes,
                                                      inner_losses)
        grads = torch.autograd.grad(old_loss,
                                    learner.parameters(),
                                    retain_graph=True)
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = learner.hessian_vector_product(
            adapt_episodes, inner_losses, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
        lagrange_multiplier = torch.sqrt(shs / max_kl)

        step = stepdir / lagrange_multiplier

        # Save the old parameters
        old_params = parameters_to_vector(learner.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 learner.parameters())
            loss, kl, _ = learner.surrogate_loss(adapt_episodes,
                                                 inner_losses,
                                                 old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, learner.parameters())

        if (i + 1) % 10 == 0:
            test_input = torch.FloatTensor([[1]])
            test_output = learner.forward(test_input).data.numpy()[0]
            plt.figure()
            plt.bar(np.arange(len(test_output)), test_output)
            plt.savefig(log_dir + 'figures/before%i.png' % i)
            plt.close()
            for j in range(args.num_tasks_train):
                test_output = learner.forward(test_input,
                                              adapt_params[j]).data.numpy()[0]
                plt.figure()
                plt.bar(np.arange(len(test_output)), test_output)
                plt.savefig(log_dir + 'figures/after%i_%i.png' % (j, i))
                plt.close()

    np.save(log_dir + 'train_rew' + str(args.inner_lr) + '.npy', train_rew)
    np.save(log_dir + 'test_rew' + str(args.inner_lr) + '.npy', test_rew)
    plt.figure()
    plt.plot(train_rew)
    plt.show()
    plt.figure()
    plt.plot(train_rew)
    plt.savefig(log_dir + 'train_rew.png')
    plt.close()
    plt.figure()
    plt.plot(test_rew)
    plt.savefig(log_dir + 'test_rew.png')
    plt.figure()
    plt.bar(np.arange(len(batch_sampler.poss)),
            batch_sampler.poss / batch_sampler.cnt)
    plt.savefig(log_dir + 'sample.png')
    plt.close()

    return
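# Hedged standalone check (not in the original): with shs = 0.5 * s^T H s,
# dividing the conjugate-gradient direction by sqrt(shs / max_kl) makes the
# quadratic KL estimate 0.5 * step^T H step equal to max_kl, the trust-region
# radius.
import torch

H = torch.eye(2)
s = torch.tensor([2., 0.])
max_kl = 0.01
shs = 0.5 * torch.dot(s, torch.mv(H, s))
step = s / torch.sqrt(shs / max_kl)
print(0.5 * torch.dot(step, torch.mv(H, step)))  # tensor(0.0100)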
Exemple #39
0
def adj_broyden_correl(opa_freq,
                       n_runs=1,
                       random_prescribed=True,
                       dataset='imagenet',
                       model_size='LARGE'):
    # setup
    model = setup_model(opa_freq is not None, dataset, model_size)
    if dataset == 'imagenet':
        traindir = os.path.join(config.DATASET.ROOT + '/images',
                                config.DATASET.TRAIN_SET)
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(config.MODEL.IMAGE_SIZE[0]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        train_dataset = datasets.ImageFolder(traindir, transform_train)
    else:
        normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                         (0.2023, 0.1994, 0.2010))
        augment_list = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip()
        ] if config.DATASET.AUGMENT else []
        transform_train = transforms.Compose(augment_list + [
            transforms.ToTensor(),
            normalize,
        ])
        train_dataset = datasets.CIFAR10(root=f'{config.DATASET.ROOT}',
                                         train=True,
                                         download=True,
                                         transform=transform_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        num_workers=10,
        pin_memory=True,
        worker_init_fn=partial(worker_init_fn, seed=42),
    )
    methods_results = {
        method_name: {
            'correl': [],
            'ratio': []
        }
        for method_name in ['shine-adj-br', 'shine', 'shine-opa', 'fpn']
    }
    methods_solvers = {
        'shine': broyden,
        'shine-adj-br': adj_broyden,
        'shine-opa': adj_broyden,
        'fpn': broyden,
    }
    random_results = {'correl': [], 'ratio': []}
    iter_loader = iter(train_loader)
    for i_run in range(n_runs):
        input, target = next(iter_loader)
        target = target.cuda(non_blocking=True)
        x_list, z_list = model.feature_extraction(input.cuda())
        model.fullstage._reset(z_list)
        model.fullstage_copy._copy(model.fullstage)
        # fixed point solving
        x_list = [x.clone().detach().requires_grad_() for x in x_list]
        cutoffs = [(elem.size(1), elem.size(2), elem.size(3))
                   for elem in z_list]
        args = (27, int(1e9), None)
        nelem = sum([elem.nelement() for elem in z_list])
        eps = 1e-5 * np.sqrt(nelem)
        z1_est = DEQFunc2d.list2vec(z_list)
        directions_dir = {
            'random': torch.randn(z1_est.shape),
            'prescribed': torch.randn(z1_est.shape),
        }
        for method_name in methods_results.keys():
            z1_est = torch.zeros_like(z1_est)
            g = lambda x: DEQFunc2d.g(model.fullstage_copy, x, x_list, cutoffs,
                                      *args)
            if random_prescribed:
                inverse_direction_fun = lambda x: directions_dir['prescribed']
            else:
                model.copy_modules()
                loss_function = lambda y_est: model.get_fixed_point_loss(
                    y_est, target)

                def inverse_direction_fun_vec(x):
                    x_temp = x.clone().detach().requires_grad_()
                    with torch.enable_grad():
                        x_list = DEQFunc2d.vec2list(x_temp, cutoffs)
                        loss = loss_function(x_list)
                    loss.backward()
                    dl_dx = x_temp.grad
                    return dl_dx

                inverse_direction_fun = inverse_direction_fun_vec

            solver = methods_solvers[method_name]
            if 'opa' in method_name:
                add_kwargs = dict(
                    inverse_direction_freq=opa_freq,
                    inverse_direction_fun=inverse_direction_fun
                    if opa_freq is not None else None,
                )
            else:
                add_kwargs = {}
            result_info = solver(
                g,
                z1_est,
                threshold=config.MODEL.F_THRES,
                eps=eps,
                name="forward",
                **add_kwargs,
            )
            z1_est = result_info['result']
            Us = result_info['Us']
            VTs = result_info['VTs']
            nstep = result_info['lowest_step']
            if opa_freq is not None:
                nstep += (nstep - 1) // opa_freq
            # compute true incoming gradient if needed
            if not random_prescribed:
                directions_dir['prescribed'] = inverse_direction_fun_vec(
                    z1_est)
                # making sure the random direction norm is not unrealistic
                directions_dir[
                    'random'] = directions_dir['random'] * torch.norm(
                        directions_dir['prescribed']) / torch.norm(
                            directions_dir['random'])
            # inversion on random gradients
            z1_temp = z1_est.clone().detach().requires_grad_()
            with torch.enable_grad():
                y = DEQFunc2d.g(model.fullstage_copy, z1_temp, x_list, cutoffs,
                                *args)

            eps = 2e-10
            for direction_name, direction in directions_dir.items():

                def g(x):
                    y.backward(x, retain_graph=True)
                    res = z1_temp.grad + direction
                    z1_temp.grad.zero_()
                    return res

                result_info_inversion = broyden(
                    g,
                    direction,  # we initialize Jacobian Free style
                    # in order to accelerate the convergence
                    threshold=35,
                    eps=eps,
                    name="backward",
                )
                true_inv = result_info_inversion['result']
                inv_dir = {
                    'fpn':
                    direction,
                    'shine':
                    -rmatvec(Us[:, :, :, :nstep - 1], VTs[:, :nstep - 1],
                             direction),
                }
                inv_dir['shine-opa'] = inv_dir['shine']
                inv_dir['shine-adj-br'] = inv_dir['shine']
                approx_inv = inv_dir[method_name]
                correl = torch.dot(
                    torch.flatten(true_inv),
                    torch.flatten(approx_inv),
                )
                scaling = torch.norm(true_inv) * torch.norm(approx_inv)
                correl = correl / scaling
                ratio = torch.norm(true_inv) / torch.norm(approx_inv)
                if direction_name == 'prescribed':
                    methods_results[method_name]['correl'].append(
                        correl.item())
                    methods_results[method_name]['ratio'].append(ratio.item())
                else:
                    if method_name == 'fpn':
                        random_results['correl'].append(correl.item())
                        random_results['ratio'].append(ratio.item())
            y.backward(torch.zeros_like(true_inv), retain_graph=False)
    return methods_results, random_results
Exemple #40
0
 def get_att_score(
         self, dec_output, enc_output
 ):  # enc_outputs [batch_size, num_directions(=1) * n_hidden]
     score = self.attn(enc_output)  # score : [batch_size, n_hidden]
     return torch.dot(dec_output.view(-1),
                      score.view(-1))  # the inner product yields a scalar score
def main(
    data_path,
    model_path,
    w2i_path,
    hidden_size,
    classifier_path=None,
    intervention=False,
    learning_rate=None,
    component_names=None,
    generate_labels=False):
    
    # load word-to-index vocabulary
    with open(w2i_path, 'r') as f:
        vocab_lines = f.readlines()

        w2i = {}
        for i, line in enumerate(vocab_lines):
            w2i[line.strip()] = i
        unk_idx = w2i['<unk>']


    vocab_size = len(w2i)

    # load and initialise model
    lstm = Forward_LSTM(vocab_size,
                        hidden_size,
                        hidden_size,
                        vocab_size,
                        w2i_path,
                        model_path)

    # initialise hidden state for time step -1
    # the hidden state will not be reset for each sentence
    relevant_activations = {}
    relevant_activations['hx_l0'] = torch.zeros(hidden_size)
    relevant_activations['cx_l0'] = torch.zeros(hidden_size)
    relevant_activations['hx_l1'] = torch.zeros(hidden_size)
    relevant_activations['cx_l1'] = torch.zeros(hidden_size)


    # load testing data
    with open(data_path, 'r') as f_in:
        test_set = f_in.readlines()[1:]


    # collect scores for all subcategories
    scores_original_nvv = []
    scores_nonce_nvv = []
    scores_original_vnpcv = []
    scores_nonce_vnpcv = []
    scores_original = []
    scores_nonce = []
    scores = []


    if intervention:
        classifiers = defaultdict()

        # load diagnostic classifiers for the intervention
        for act in component_names:
            with open("{}/{}.pickle".format(classifier_path, act), 'rb') as trained_classifier:
                classifiers[act] = pickle.load(trained_classifier)


    # process sentences
    for line_idx in range(0, len(test_set), 2):

        # read two consecutive lines with the following structure:
        # 0:pattern 1:constr_id 2:sent_id 3:correct_number 4:form 5:class 6:type
        # 7:prefix 8:n_attr 9:punct 10:freq 11:len_context 12:len_prefix 13:sent

        sent_data1 = test_set[line_idx].split('\t')
        sent_data2 = test_set[line_idx + 1].split('\t')

        # L__NOUN_VERB_VERB or R__VERB_NOUN_CCONJ_VERB
        pattern1 = sent_data1[0]
        pattern2 = sent_data2[0]

        assert(pattern1[0] in ['R', 'L'] and pattern2[0] in ['R', 'L'])
        assert(pattern1[0] == pattern2[0])
        construction_id = 0 if pattern1[0] == 'R' else 1

        assert(sent_data1[3] == sent_data2[3])
        if not generate_labels:
            label = 0 if sent_data1[3].strip() == 'sing' else 1

        assert(sent_data1[5] != sent_data2[5])
        if sent_data1[5] == 'correct':
            correct_form = sent_data1[4]
            wrong_form = sent_data2[4]
        else:
            correct_form = sent_data2[4]
            wrong_form = sent_data1[4]

        assert(sent_data1[6] == sent_data2[6])
        type_of_sent = sent_data1[6]

        assert(sent_data1[11] == sent_data2[11])
        context_length = int(sent_data1[11])

        assert(sent_data1[12] == sent_data2[12])
        target_idx = int(sent_data1[12])

        subject_idx = target_idx - context_length

        assert(sent_data1[13] == sent_data2[13])
        sentence = sent_data1[13].split()

        # process sentence
        for t, word in enumerate(sentence):
            output, layer0, layer1 = lstm(word,
                                          relevant_activations['hx_l0'],
                                          relevant_activations['cx_l0'],
                                          relevant_activations['hx_l1'],
                                          relevant_activations['cx_l1'])

            relevant_activations['hx_l0'] = layer0[0]
            relevant_activations['hx_l1'] = layer1[0]
            relevant_activations['cx_l0'] = layer0[1]
            relevant_activations['cx_l1'] = layer1[1]

            if t == target_idx - 1:
                vocab_probs = F.log_softmax(
                    output.view(-1, len(w2i)), dim=1)[0]

            # intervention at subject timestep
            if intervention and t == subject_idx:
                for act in component_names:
                    weight, bias = classifiers[act]
                    weight = Variable(torch.tensor(
                        weight, dtype=torch.double).squeeze(0), requires_grad=False)
                    bias = Variable(torch.tensor(
                        bias, dtype=torch.double), requires_grad=False)
                    current_activation = Variable(torch.tensor(
                        relevant_activations[act], dtype=torch.double), requires_grad=True)

                    total_prob = torch.tensor(1.0, dtype=torch.double)
                    class_1_prob = torch.dot(weight, current_activation) + bias
                    class_1_prob = F.sigmoid(class_1_prob)
                    class_0_prob = total_prob - class_1_prob

                    class_0_log_prob = torch.log(class_0_prob)
                    class_1_log_prob = torch.log(class_1_prob)

                    params = [current_activation]
                    optimiser = torch.optim.SGD(params, lr=learning_rate) 
                    optimiser.zero_grad()

                    # stack the two scalar log-probs into a (1, 2) prediction row
                    prediction = torch.stack(
                        (class_0_log_prob, class_1_log_prob)).unsqueeze(0)

                    # unsupervised intervention requires generated labels
                    if generate_labels:
                        label = 0 if class_0_prob > class_1_prob else 1

                    gold_label = torch.tensor(label).unsqueeze(0)

                    criterion = nn.NLLLoss()
                    loss = criterion(prediction, gold_label)
                    loss.backward()
                    optimiser.step()

                    relevant_activations[act] = current_activation.detach().float()


        correct_form_score = vocab_probs[w2i[correct_form]].data
        wrong_form_score = vocab_probs[w2i[wrong_form]].data

        if (correct_form_score > wrong_form_score).all():
            score = 1
        else:
            score = 0

        scores.append(score)

        if construction_id == 0 and type_of_sent == 'original':
            scores_original_vnpcv.append(score)
            scores_original.append(score)

        if construction_id == 1 and type_of_sent == 'original':
            scores_original_nvv.append(score)
            scores_original.append(score)

        if construction_id == 0 and type_of_sent == 'generated':
            scores_nonce_vnpcv.append(score)
            scores_nonce.append(score)

        if construction_id == 1 and type_of_sent == 'generated':
            scores_nonce_nvv.append(score)
            scores_nonce.append(score)

    assert(len(scores) == len(test_set) / 2)


    # Print accuracy results
    print('Original   V NP Conj V   ', np.sum(
        scores_original_vnpcv) / len(scores_original_vnpcv))
    print('Nonce      V NP Conj V   ', np.sum(
        scores_nonce_vnpcv) / len(scores_nonce_vnpcv))
    print('Original   N V V         ', np.sum(
        scores_original_nvv) / len(scores_original_nvv))
    print('Nonce      N V V         ', np.sum(
        scores_nonce_nvv) / len(scores_nonce_nvv))
    print('Original   Overall       ', np.sum(
        scores_original) / len(scores_original))
    print('Nonce      Overall       ', np.sum(scores_nonce) / len(scores_nonce))
    print('Overall                  ', np.sum(scores) / len(scores))
 def cosine(a, b):
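     # cosine similarity: <a, b> / (||a||_2 * ||b||_2)
     # (torch.nn.functional.cosine_similarity is a batched equivalent)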
     numerator = torch.dot(a, b)
     denominator = torch.norm(a, 2) * torch.norm(b, 2)
     return float(numerator / denominator)
 def get_att_score(self, hidden, encoder_hidden):
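     # project the encoder state through self.attn, then score its alignment
     # with the decoder hidden state as a single dot-product scalar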
     score = self.attn(encoder_hidden)
     return torch.dot(hidden.view(-1), score.view(-1))
Exemple #44
0
# print(max_1)
# print(value)
# print(index)
# print(max_1_0)
# print(max_1_1)
'''
(tensor([2., 4., 6., 8.]), tensor([1, 1, 1, 1]))
tensor([2., 4., 6., 8.])
tensor([1, 1, 1, 1])
tensor([2., 4., 6., 8.])
tensor([1, 1, 1, 1])
'''

### Dot product
tensor = torch.Tensor([1, 2, 3, 4, 5])
dot = torch.dot(tensor, tensor)
# print(dot)
'''
tensor(55.)
'''
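
# Note: torch.dot only accepts 1-D tensors of the same length; for 2-D inputs
# use torch.mm or torch.matmul instead.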

### Mathematical functions
tensor = torch.Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])

sqrt = torch.sqrt(tensor)
exp = torch.exp(tensor)
log = torch.log(tensor)
# print(sqrt)
# print(exp)
# print(log)
Exemple #45
0
def matrix_factorization(R, K, steps=100, lr=0.002):
    """
    Input
    -----
    R - Tensor: Ratings matrix
        Dimensions: N-users by M-items
    K - Int: Number of latent features
    steps - Int: Number of training iterations (default 100)
    lr - Float: SGD learning rate (default 0.002)

    Output
    ------
    P: User-feature matrix
        Dimensions: N-users by K-features
    Qt: Transpose of Item-feature matrix
        Dimensions: K-features by M-items

    Reference
    ---------
    https://towardsdatascience.com/recommendation-system-matrix-factorization-d61978660b4b
    """
    # Initialize matrices P and Qt with random values in [0, 1)
    # (float tensors, so the fractional SGD updates below can be applied)
    N_users, M_items = R.size()
    P = torch.rand(N_users, K)
    Qt = torch.rand(K, M_items)
    beta = 0.02
    prev_e = float("inf")
    # Setup training loop
    for step in range(steps):
        # Calculate and apply gradients
        for user_i in range(N_users):
            for item_j in range(M_items):
                if R[user_i][item_j] > 0:
                    pred = torch.dot(P[user_i,:], Qt[:,item_j])
                    err_ij = R[user_i][item_j] - pred

                    for k in range(K):
                        # Calculate and shift the gradient
                        P[user_i][k] = P[user_i][k] + lr * (2*err_ij*Qt[k][item_j] - beta*P[user_i][k])
                        Qt[k][item_j] = Qt[k][item_j] + lr * (2*err_ij*P[user_i][k] - beta*Qt[k][item_j])

        # Calculate difference in loss
        e = 0.0
        for user_i in range(N_users):
            for item_j in range(M_items):
                if R[user_i][item_j] > 0:
                    e = e + pow(R[user_i][item_j] - torch.dot(P[user_i,:],Qt[:,item_j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[user_i][k],2) + pow(Qt[k][item_j],2))

        # stop once the step-over-step improvement in loss becomes small
        if 0 < (prev_e - e) < 50:
            break
        prev_e = e

        if step % 1 == 0:
            print("step: %s, loss: %s" % (step+1, e))
    return P, Qt
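
# A minimal usage sketch (illustrative, not from the original source): factorize
# a toy 4-user x 3-item ratings matrix with K=2 latent features, where 0 marks a
# missing rating, then reconstruct the dense approximation.
R = torch.Tensor([[5, 3, 0],
                  [4, 0, 1],
                  [1, 1, 5],
                  [0, 1, 4]])
P, Qt = matrix_factorization(R, K=2)
approx = torch.mm(P, Qt)  # approx[i][j] estimates user i's rating of item j
# print(approx)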




            def Hv(v):
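                # Pearlmutter's trick: the gradient of the scalar
                # (grad_kld_old_param . v) w.r.t. the policy parameters is the
                # Hessian-vector product, so the Hessian is never formed explicitly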
                hessian = get_flat_grads(torch.dot(grad_kld_old_param, v),
                                         self.pi).detach()

                return hessian + cg_damping * v
Exemple #47
0
def main(dom="driving",
         reptype="wordfeat",
         splittype="LOOtask",
         excludeid=2,
         taskrepsize=2,
         modeltype="neural",
         gpmode=0,
         pval=0.1,
         seed=0,
         nfolds=10):

    modelname = modeltype + "_" + str(taskrepsize) + "_" + str(
        gpmode) + "_" + dom + "_" + splittype + "_" + str(excludeid)

    # check what kind of modifications to the GP we are using
    print("Modelname: ", modelname)
    usepriormean, usepriorpoints = getGPParams(gpmode)

    verbose = False

    torch.manual_seed(seed)  # set up our seed for reproducibility
    np.random.seed(seed)

    # load the data
    data, nparts = loadData(dom)
    # print(data)

    # recreate word vectors if needed
    # e.g., when you download new word features from glove.
    recreate_word_vectors = False
    if recreate_word_vectors:
        recreateWordVectors()

    # load word features
    wordfeatures = loadWordFeatures(dom, loadpickle=True)
    print(wordfeatures.shape)

    # in the experiments in the paper, we use the word features directly. However,
    # you can also use tsne or pca dim-reduced features.
    tsnefeatures = computeTSNEFeatures(wordfeatures)
    pcafeatures = computePCAFeatures(wordfeatures)

    allfeatures = {
        "wordfeat": wordfeatures,
        "tsne": tsnefeatures,
        "pca": pcafeatures
    }

    # create primary dataset
    dataset = createDataset(data, reptype, allfeatures)

    # create dataset splits
    expdata = getTrainTestValSplit(data,
                                   dataset,
                                   splittype,
                                   excludeid=excludeid,
                                   pval=pval,
                                   nfolds=nfolds)

    nfeats = allfeatures[reptype].shape[1]

    # we don't use an initial projection matrix. You can substitute one here if you like
    Ainit = None

    inptasksobs = Variable(dtype(expdata["tasksobsfeats_train"]),
                           requires_grad=False)
    inptasksperf = Variable(dtype(expdata["tasksobsperf_train"]),
                            requires_grad=False)
    inptaskspred = Variable(dtype(expdata["taskspredfeats_train"]),
                            requires_grad=False)
    outtrustpred = Variable(dtype(expdata["trustpred_train"]),
                            requires_grad=False)

    inptasksobs_val = Variable(dtype(expdata["tasksobsfeats_val"]),
                               requires_grad=False)
    inptasksperf_val = Variable(dtype(expdata["tasksobsperf_val"]),
                                requires_grad=False)
    inptaskspred_val = Variable(dtype(expdata["taskspredfeats_val"]),
                                requires_grad=False)
    outtrustpred_val = Variable(dtype(expdata["trustpred_val"]),
                                requires_grad=False)

    inptasksobs_test = Variable(dtype(expdata["tasksobsfeats_test"]),
                                requires_grad=False)
    inptasksperf_test = Variable(dtype(expdata["tasksobsperf_test"]),
                                 requires_grad=False)
    inptaskspred_test = Variable(dtype(expdata["taskspredfeats_test"]),
                                 requires_grad=False)
    outtrustpred_test = Variable(dtype(expdata["trustpred_test"]),
                                 requires_grad=False)

    learning_rate = 1e-3

    if modeltype == "gp":
        learning_rate = 1e-1
        obsseqlen = 2
        phiinit = 1.0
        weight_decay = 0.01  #0.01
        modelparams = {
            "inputsize": inptasksobs.shape[2],
            "reptype": reptype,
            "taskrepsize": taskrepsize,
            "phiinit": phiinit,
            "Ainit": None,  # np.array(Ainit),
            "obsseqlen": obsseqlen,
            "verbose": verbose,
            "usepriormean": usepriormean,
            "usepriorpoints": usepriorpoints
        }
    elif modeltype == "neural":
        perfrepsize = taskrepsize
        numGRUlayers = 2
        nperf = 2
        weight_decay = 0.00
        modelparams = {
            "perfrepsize": perfrepsize,
            "numGRUlayers": numGRUlayers,
            "nperf": nperf,
            "verbose": verbose,
            "taskrepsize": taskrepsize,
            "Ainit": None,  #np.array(Ainit), 
            "nfeats": inptasksobs.shape[2]
        }
    elif modeltype == "lineargaussian":
        obsseqlen = 2
        weight_decay = 0.01
        modelparams = {
            "inputsize": inptasksobs.shape[2],
            "obsseqlen": obsseqlen,
        }
    elif modeltype == "constant":
        obsseqlen = 2
        weight_decay = 0.01
        modelparams = {
            "inputsize": inptasksobs.shape[2],
            "obsseqlen": obsseqlen,
        }
    else:
        raise ValueError("No such model")

    verbose = False
    reportperiod = 1

    # these two parameters control the early stopping:
    # we save the stopcount-th model after the best validation loss is achieved,
    # but keep the model running for burnin longer in case a better
    # model is attained
    if splittype == "3participant":
        stopcount = 3
        burnin = 50
    elif splittype == "LOOtask":
        stopcount = 3
        burnin = 50

    t0 = time.time()
    bestvalloss = 1e10

    modeldir = "savedmodels"

    for rep in range(1):
        print("REP", rep)
        model = initModel(modeltype, modelname, parameters=modelparams)
        # optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
        #if modeltype == "neural"
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=weight_decay)
        #optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate, max_iter=10, max_eval=20)
        #optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate)
        counter = 0

        torch.save(model, os.path.join(modeldir, model.modelname + ".pth"))
        restartopt = False
        t = 1
        #l2comp = nn.L2Loss()
        while t < 500:

            def closure():
                N = inptaskspred.shape[0]
                predtrust = model(inptasksobs, inptasksperf, inptaskspred)
                predtrust = torch.squeeze(predtrust)
                # logloss = torch.mean(torch.pow(predtrust - outtrustpred, 2.0)) # / 2*torch.exp(obsnoise))
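                # Bernoulli negative log-likelihood averaged over N predictions:
                # -(1/N) * sum_i [ y_i*log(p_i) + (1 - y_i)*log(1 - p_i) ]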
                loss = -(
                    torch.dot(outtrustpred, torch.log(predtrust)) + torch.dot(
                        (1 - outtrustpred), torch.log(1.0 - predtrust))) / N

                optimizer.zero_grad()
                loss.backward()
                return loss

            optimizer.step(closure)

            if t % reportperiod == 0:
                # compute training loss
                predtrust = model(inptasksobs, inptasksperf, inptaskspred)
                predtrust = torch.squeeze(predtrust)
                loss = -(torch.dot(outtrustpred, torch.log(predtrust)) +
                         torch.dot(
                             (1 - outtrustpred), torch.log(1.0 - predtrust))
                         ) / inptaskspred.shape[0]

                # compute validation loss
                predtrust_val = model(inptasksobs_val, inptasksperf_val,
                                      inptaskspred_val)
                predtrust_val = torch.squeeze(predtrust_val)
                valloss = -(torch.dot(
                    outtrustpred_val, torch.log(predtrust_val)) + torch.dot(
                        (1 - outtrustpred_val), torch.log(1.0 - predtrust_val))
                            ) / predtrust_val.shape[0]

                # compute prediction loss
                predtrust_test = torch.squeeze(
                    model(inptasksobs_test, inptasksperf_test,
                          inptaskspred_test))
                predloss = -(torch.dot(
                    outtrustpred_test, torch.log(predtrust_test)) + torch.dot(
                        (1 - outtrustpred_test), torch.log(
                            1.0 - predtrust_test))) / predtrust_test.shape[0]

                #print(model.wb, model.wtp, model.trust0, model.sigma0)

                # check for NaNs / Infs in the test predictions
                checkval = np.sum(np.array(predtrust_test.data))
                if np.isnan(checkval) or np.isinf(checkval):
                    # check if we have already restarted once
                    if restartopt:
                        # we've already restarted once; give up and break out
                        print("Already restarted once. Quitting")
                        break

                    # reinitialize model and switch optimizer
                    print("NaN value encountered. Restarting opt")
                    model = initModel(modeltype,
                                      modelname,
                                      parameters=modelparams)
                    optimizer = torch.optim.Adam(model.parameters(),
                                                 lr=learning_rate)
                    t = 1
                    counter = 0
                    restartopt = True
                else:
                    # print(predtrust_test.data, outtrustpred_test.data)
                    mae = metrics.mean_absolute_error(predtrust_test.data,
                                                      outtrustpred_test.data)

                    print(t, loss.data[0], valloss.data[0], predloss.data[0],
                          mae)
                    optimizer.zero_grad()

                    # if validation loss has increased for stopcount iterations

                    augname = model.modelname + "_" + str(excludeid) + ".pth"
                    if valloss.data[0] <= bestvalloss:
                        torch.save(model, os.path.join(modeldir, augname))
                        print(valloss.data[0], bestvalloss, "Model saved")
                        bestvalloss = valloss.data[0]
                        counter = 0
                    else:
                        if counter < stopcount and (valloss.data[0] -
                                                    bestvalloss) <= 0.1:
                            torch.save(model, os.path.join(modeldir, augname))
                            print(valloss.data[0], bestvalloss,
                                  "Model saved : POST", counter)
                        counter += 1

            if counter >= stopcount and t > burnin:
                #torch.save(model, modeldir+ model.modelname + ".pth")
                break

            t = t + 1

    t1 = time.time()
    print("Total time: ", t1 - t0)
    model = torch.load(
        os.path.join(modeldir, modelname + "_" + str(excludeid) + ".pth"))

    # make predictions using trained model and compute metrics
    predtrust_test = torch.squeeze(
        model(inptasksobs_test, inptasksperf_test, inptaskspred_test))

    res = np.zeros((predtrust_test.shape[0], 2))
    res[:, 0] = predtrust_test.data[:]
    res[:, 1] = outtrustpred_test.data[:]
    print(res)

    mae = metrics.mean_absolute_error(predtrust_test.data,
                                      outtrustpred_test.data)
    predloss = -(torch.dot(outtrustpred_test, torch.log(predtrust_test)) + torch.dot((1 - outtrustpred_test),
                                                                                     torch.log(1.0 - predtrust_test))) / \
               predtrust_test.shape[0]
    predloss = predloss.data[0]

    return (mae, predloss, res)
            def Hv(v):
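                # same Pearlmutter-style Hessian-vector product as above,
                # here without conjugate-gradient damping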
                hessian = get_flat_grads(torch.dot(grad_diff, v), self.v)\
                    .detach()

                return hessian
Exemple #49
0
def regularize_expectation(loss):
    u = loss / loss.sum()
    return torch.dot(loss, u)
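
# Equivalent to (loss ** 2).sum() / loss.sum(): the expectation of the loss
# under its own normalized weights, which emphasizes high-loss elements.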
Exemple #50
0
 def get_att_score(self, dec_output, enc_output):
     score = self.attn(enc_output)  # score : [batch_size, n_hidden]
     return torch.dot(dec_output.view(-1), score.view(-1))  # scalar value
Exemple #51
0
def matmul(tensor1, tensor2, out=None):
    r"""Matrix product of two tensors.

    The behavior depends on the dimensionality of the tensors as follows:

    - If both tensors are 1-dimensional, the dot product (scalar) is returned.
    - If both arguments are 2-dimensional, the matrix-matrix product is returned.
    - If the first argument is 1-dimensional and the second argument is 2-dimensional,
      a 1 is prepended to its dimension for the purpose of the matrix multiply.
      After the matrix multiply, the prepended dimension is removed.
    - If the first argument is 2-dimensional and the second argument is 1-dimensional,
      the matrix-vector product is returned.
    - If both arguments are at least 1-dimensional and at least one argument is
      N-dimensional (where N > 2), then a batched matrix multiply is returned.  If the first
      argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the
      batched matrix multiply and removed after.  If the second argument is 1-dimensional, a
      1 is appended to its dimension for the purpose of the batched matrix multiply and removed after.
      The non-matrix (i.e. batch) dimensions are :ref:`broadcasted <broadcasting-semantics>` (and thus
      must be broadcastable).  For example, if :attr:`tensor1` is a
      :math:`(j \times 1 \times n \times m)` tensor and :attr:`tensor2` is a :math:`(k \times m \times p)`
      tensor, :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor.

    .. note::

        The 1-dimensional dot product version of this function does not support an :attr:`out` parameter.

    Arguments:
        tensor1 (Tensor): the first tensor to be multiplied
        tensor2 (Tensor): the second tensor to be multiplied
        out (Tensor, optional): the output tensor
    """
    dim_tensor1 = tensor1.dim()
    dim_tensor2 = tensor2.dim()
    if dim_tensor1 == 1 and dim_tensor2 == 1:
        if out is None:
            return torch.dot(tensor1, tensor2)
        else:
            raise ValueError("out must be None for 1-d tensor matmul, returns a scalar")
    if dim_tensor1 == 2 and dim_tensor2 == 1:
        if out is None:
            return torch.mv(tensor1, tensor2)
        else:
            return torch.mv(tensor1, tensor2, out=out)
    elif dim_tensor1 == 1 and dim_tensor2 == 2:
        if out is None:
            return torch.mm(tensor1.unsqueeze(0), tensor2).squeeze_(0)
        else:
            return torch.mm(tensor1.unsqueeze(0), tensor2, out=out).squeeze_(0)
    elif dim_tensor1 == 2 and dim_tensor2 == 2:
        if out is None:
            return torch.mm(tensor1, tensor2)
        else:
            return torch.mm(tensor1, tensor2, out=out)
    elif dim_tensor1 >= 3 and (dim_tensor2 == 1 or dim_tensor2 == 2):
        # optimization: use mm instead of bmm by folding tensor1's batch into
        # its leading matrix dimension.

        if dim_tensor2 == 1:
            tensor2 = tensor2.unsqueeze(-1)

        size1 = tensor1.size()
        size2 = tensor2.size()
        output_size = size1[:-1] + size2[-1:]

        # fold the batch into the first dimension
        tensor1 = tensor1.contiguous().view(-1, size1[-1])

        if out is None or not out.is_contiguous():
            output = torch.mm(tensor1, tensor2)
        else:
            output = torch.mm(tensor1, tensor2, out=out)

        output = output.view(output_size)

        if dim_tensor2 == 1:
            output = output.squeeze(-1)

        if out is not None:
            out.set_(output)
            return out

        return output
    elif (dim_tensor1 >= 1 and dim_tensor2 >= 1) and (dim_tensor1 >= 3 or dim_tensor2 >= 3):
        # ensure each tensor size is at least 3-dimensional
        tensor1_exp_size = torch.Size((1,) * max(3 - tensor1.dim(), 0) + tensor1.size())
        # rhs needs to be a separate case since we can't freely expand 1s on the rhs, but can on lhs
        if dim_tensor2 == 1:
            tensor2 = tensor2.unsqueeze(1)
        tensor2_exp_size = torch.Size((1,) * max(3 - tensor2.dim(), 0) + tensor2.size())

        # expand the batch portion (i.e. cut off matrix dimensions and expand rest)
        expand_batch_portion = torch._C._infer_size(tensor1_exp_size[:-2], tensor2_exp_size[:-2])

        # flatten expanded batches
        tensor1_expanded = tensor1.expand(*(expand_batch_portion + tensor1_exp_size[-2:])) \
            .contiguous().view(reduce(mul, expand_batch_portion), *tensor1_exp_size[-2:])
        tensor2_expanded = tensor2.expand(*(expand_batch_portion + tensor2_exp_size[-2:])) \
            .contiguous().view(reduce(mul, expand_batch_portion), *tensor2_exp_size[-2:])

        # reshape batches back into result
        total_expansion = expand_batch_portion + (tensor1_exp_size[-2], tensor2_exp_size[-1])

        def maybeSqueeze(tensor):
            if dim_tensor1 == 1:
                return tensor.squeeze(-2)
            elif dim_tensor2 == 1:
                return tensor.squeeze(-1)
            else:
                return tensor

        if out is None or not out.is_contiguous():
            output = torch.bmm(tensor1_expanded, tensor2_expanded)
        else:
            output = torch.bmm(tensor1_expanded, tensor2_expanded, out=out)

        output = maybeSqueeze(output.view(total_expansion))

        if out is not None:
            out.set_(output)
            return out

        return output

    raise ValueError("both arguments to __matmul__ need to be at least 1D, "
                     "but they are {}D and {}D".format(dim_tensor1, dim_tensor2))
Exemple #52
0
def regularize_expectation_exp(loss):
    u = loss.exp() / loss.exp().sum()
    return torch.dot(loss, u)
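
# For a 1-D loss, u equals torch.softmax(loss, dim=0), so this computes a
# softmax-weighted expectation of the loss, again emphasizing the largest values.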