def verify_loss(self, f):
    p = 0
    n_pair = f.size(1)
    for i in range(n_pair):
        for t in range(n_pair):
            # Note: the original denominator was torch.dot(f[:, i], f), which
            # fails because torch.dot takes two 1-D tensors; normalizing by
            # the squared norm of f[:, i] is the apparent intent.
            p += torch.dot(f[:, i], f[:, t]) / torch.dot(f[:, i], f[:, i])
    return p.mean()
def test_local_var_binary_methods(self):
    """Unit tests for methods mentioned on issue 1385
    https://github.com/OpenMined/PySyft/issues/1385"""
    x = torch.FloatTensor([1, 2, 3, 4])
    y = torch.FloatTensor([[1, 2, 3, 4]])
    z = torch.matmul(x, y.t())
    assert torch.equal(z, torch.FloatTensor([30]))

    z = torch.add(x, y)
    assert torch.equal(z, torch.FloatTensor([[2, 4, 6, 8]]))

    x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    z = torch.cross(x, y, dim=1)
    assert torch.equal(z, torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]]))

    x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    z = torch.dist(x, y)
    t = torch.FloatTensor([z])
    assert torch.equal(t, torch.FloatTensor([0.]))

    x = torch.FloatTensor([1, 2, 3])
    y = torch.FloatTensor([1, 2, 3])
    z = torch.dot(x, y)
    t = torch.FloatTensor([z])
    assert torch.equal(t, torch.FloatTensor([14]))

    z = torch.eq(x, y)
    assert torch.equal(z, torch.ByteTensor([1, 1, 1]))

    z = torch.ge(x, y)
    assert torch.equal(z, torch.ByteTensor([1, 1, 1]))
def test_remote_var_binary_methods(self):
    """Unit tests for methods mentioned on issue 1385
    https://github.com/OpenMined/PySyft/issues/1385"""
    hook = TorchHook(verbose=False)
    local = hook.local_worker
    remote = VirtualWorker(hook, 1)
    local.add_worker(remote)

    x = Var(torch.FloatTensor([1, 2, 3, 4])).send(remote)
    y = Var(torch.FloatTensor([[1, 2, 3, 4]])).send(remote)
    z = torch.matmul(x, y.t())
    assert torch.equal(z.get(), Var(torch.FloatTensor([30])))

    z = torch.add(x, y)
    assert torch.equal(z.get(), Var(torch.FloatTensor([[2, 4, 6, 8]])))

    x = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
    y = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
    z = torch.cross(x, y, dim=1)
    assert torch.equal(z.get(), Var(torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]])))

    x = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
    y = Var(torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])).send(remote)
    z = torch.dist(x, y)
    assert torch.equal(z.get(), Var(torch.FloatTensor([0.])))

    x = Var(torch.FloatTensor([1, 2, 3])).send(remote)
    y = Var(torch.FloatTensor([1, 2, 3])).send(remote)
    z = torch.dot(x, y)
    # The original printed this comparison; an assertion matches the rest of the test.
    assert torch.equal(z.get(), Var(torch.FloatTensor([14])))

    z = torch.eq(x, y)
    assert torch.equal(z.get(), Var(torch.ByteTensor([1, 1, 1])))

    z = torch.ge(x, y)
    assert torch.equal(z.get(), Var(torch.ByteTensor([1, 1, 1])))
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Adam does not support sparse gradients, '
                                   'please consider SparseAdam instead')

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p.data)

            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            beta1, beta2 = group['betas']

            state['step'] += 1

            if group['weight_decay'] != 0:
                grad = grad.add(group['weight_decay'], p.data)

            if state['step'] > 1:
                prev_bias_correction1 = 1 - beta1 ** (state['step'] - 1)
                prev_bias_correction2 = 1 - beta2 ** (state['step'] - 1)
                # Hypergradient for Adam:
                h = torch.dot(
                    grad.view(-1),
                    torch.div(exp_avg, exp_avg_sq.sqrt().add_(group['eps'])).view(-1)
                ) * math.sqrt(prev_bias_correction2) / prev_bias_correction1
                # Hypergradient descent of the learning rate:
                tmp = group['hypergrad_lr'] * h
                group['lr'] += tmp.double().cpu()

            # Decay the first and second moment running average coefficient
            exp_avg.mul_(beta1).add_(1 - beta1, grad)
            exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
            denom = exp_avg_sq.sqrt().add_(group['eps'])

            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']
            step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

            p.data.addcdiv_(-step_size, exp_avg, denom)

    return loss
def cal_angle(vec1, vec2):
    """ Calculate cosine similarity between two torch tensors or two ndarrays
        Args:
            vec1, vec2: two tensors or numpy ndarrays
    """
    # Original bugs fixed: the second isinstance checked vec1 twice, and
    # .item() was applied to the norm product instead of the full ratio.
    if isinstance(vec1, torch.Tensor) and isinstance(vec2, torch.Tensor):
        return (torch.dot(vec1, vec2) / (vec1.norm() * vec2.norm())).item()
    elif isinstance(vec1, np.ndarray) and isinstance(vec2, np.ndarray):
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
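# A quick usage sketch for cal_angle above (vector values are hypothetical,
# chosen so the expected cosine is easy to check by hand):
import numpy as np
import torch

a = torch.tensor([1.0, 0.0])
b = torch.tensor([1.0, 1.0])
print(cal_angle(a, b))  # ~0.7071, i.e. cos(45 degrees)
print(cal_angle(np.array([1.0, 0.0]), np.array([1.0, 1.0])))  # same value via the numpy branch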
def updateOutput(self, input, target):
    # - log(input) * target - log(1 - input) * (1 - target)
    if input.nelement() != target.nelement():
        raise RuntimeError("input and target size mismatch")

    if self.buffer is None:
        self.buffer = input.new()

    buffer = self.buffer
    weights = self.weights

    buffer.resize_as_(input)

    if weights is not None and target.dim() != 1:
        weights = self.weights.view(1, target.size(1)).expand_as(target)

    # log(input) * target
    torch.add(input, self.eps, out=buffer).log_()
    if weights is not None:
        buffer.mul_(weights)

    target_1d = target.contiguous().view(-1)
    # don't save a 1-d view of buffer: it should already be contiguous, and it's
    # used as a non-1d tensor later.
    output = torch.dot(target_1d, buffer.contiguous().view(-1))

    # log(1 - input) * (1 - target)
    torch.mul(input, -1, out=buffer).add_(1 + self.eps).log_()
    if weights is not None:
        buffer.mul_(weights)

    output = output + torch.sum(buffer)
    output = output - torch.dot(target_1d, buffer.contiguous().view(-1))

    if self.sizeAverage:
        output = output / input.nelement()

    self.output = -output.item()

    return self.output
def project_1D(w, d):
    """ Project vector w to vector d and get the length of the projection.

        Args:
            w: vectorized weights
            d: vectorized direction

        Returns:
            the projection scalar
    """
    assert len(w) == len(d), 'dimension does not match for w and d'
    scale = torch.dot(w, d) / d.norm()
    return scale.item()
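# Sanity check for project_1D (hypothetical vectors): projecting w onto itself
# returns ||w||, and projecting onto a unit axis returns that component of w.
import torch

w = torch.tensor([3.0, 4.0])
print(project_1D(w, w))                         # 5.0 = dot(w, w) / ||w|| = 25 / 5
print(project_1D(w, torch.tensor([1.0, 0.0])))  # 3.0, the x-component of w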
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    assert len(self.param_groups) == 1

    loss = None
    if closure is not None:
        loss = closure()

    group = self.param_groups[0]
    weight_decay = group['weight_decay']
    momentum = group['momentum']
    dampening = group['dampening']
    nesterov = group['nesterov']

    grad = self._gather_flat_grad_with_weight_decay(weight_decay)

    # NOTE: SGDHD has only global state, but we register it as state for
    # the first param, because this helps with casting in load_state_dict
    state = self.state[self._params[0]]
    # State initialization
    if len(state) == 0:
        state['grad_prev'] = torch.zeros_like(grad)

    grad_prev = state['grad_prev']
    # Hypergradient for SGD
    h = torch.dot(grad, grad_prev)
    # Hypergradient descent of the learning rate:
    group['lr'] += group['hypergrad_lr'] * h

    if momentum != 0:
        if 'momentum_buffer' not in state:
            buf = state['momentum_buffer'] = torch.zeros_like(grad)
            buf.mul_(momentum).add_(grad)
        else:
            buf = state['momentum_buffer']
            buf.mul_(momentum).add_(1 - dampening, grad)
        if nesterov:
            grad.add_(momentum, buf)
        else:
            grad = buf

    state['grad_prev'] = grad

    self._add_grad(-group['lr'], grad)

    return loss
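# Minimal numeric sketch of the hypergradient rule used in the SGD-HD step()
# above: lr <- lr + hypergrad_lr * (g_t . g_{t-1}). All values are hypothetical.
import torch

g_prev = torch.tensor([0.5, -1.0, 0.25])  # gradient at step t-1
g_curr = torch.tensor([0.4, -0.8, 0.30])  # gradient at step t
lr, hypergrad_lr = 0.1, 1e-3

h = torch.dot(g_curr, g_prev)   # positive when successive gradients align
lr += hypergrad_lr * h.item()
print(lr)                       # 0.101075: aligned gradients grow the step size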
def compute_weight(self, module):
    weight = module._parameters[self.name + '_org']
    u = module._buffers[self.name + '_u']

    height = weight.size(0)
    weight_mat = weight.view(height, -1)
    for _ in range(self.n_power_iterations):
        # The spectral norm of the weight equals `u^T W v`, where `u` and `v`
        # are the first left and right singular vectors.
        # This power iteration produces approximations of `u` and `v`.
        v = normalize(torch.matmul(weight_mat.t(), u), dim=0, eps=self.eps)
        u = normalize(torch.matmul(weight_mat, v), dim=0, eps=self.eps)

    sigma = torch.dot(u, torch.matmul(weight_mat, v))
    weight.data /= sigma
    return weight, u
def compute_weight(self, module):
    weight = getattr(module, self.name + '_org')
    u = getattr(module, self.name + '_u')

    height = weight.size(0)
    weight_mat = weight.view(height, -1)
    with torch.no_grad():
        for _ in range(self.n_power_iterations):
            # The spectral norm of the weight equals `u^T W v`, where `u` and `v`
            # are the first left and right singular vectors.
            # This power iteration produces approximations of `u` and `v`.
            v = normalize(torch.matmul(weight_mat.t(), u), dim=0, eps=self.eps)
            u = normalize(torch.matmul(weight_mat, v), dim=0, eps=self.eps)

    sigma = torch.dot(u, torch.matmul(weight_mat, v))
    weight = weight / sigma
    return weight, u
def compute_weight(self, module):
    weight = getattr(module, self.name + '_orig')
    u = getattr(module, self.name + '_u')
    weight_mat = weight
    if self.dim != 0:
        # permute dim to front
        weight_mat = weight_mat.permute(
            self.dim, *[d for d in range(weight_mat.dim()) if d != self.dim])
    height = weight_mat.size(0)
    weight_mat = weight_mat.reshape(height, -1)
    with torch.no_grad():
        for _ in range(self.n_power_iterations):
            v = F.normalize(torch.matmul(weight_mat.t(), u), dim=0, eps=self.eps)
            u = F.normalize(torch.matmul(weight_mat, v), dim=0, eps=self.eps)
    sigma = torch.dot(u, torch.matmul(weight_mat, v))
    weight = weight / sigma
    return weight, u
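# Standalone check of the power-iteration estimate used by the compute_weight
# variants above: u^T W v should approach the largest singular value of W.
# The matrix and iteration count are arbitrary; torch.linalg.svdvals assumes
# a reasonably recent PyTorch.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
W = torch.randn(8, 5)
u = F.normalize(torch.randn(8), dim=0)
for _ in range(50):
    v = F.normalize(torch.mv(W.t(), u), dim=0)
    u = F.normalize(torch.mv(W, v), dim=0)
sigma = torch.dot(u, torch.mv(W, v))
print(sigma.item(), torch.linalg.svdvals(W)[0].item())  # the two should agree closely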
def test(self, dataset):
    self.model.eval()
    total_loss = 0
    predictions = torch.zeros(len(dataset))
    indices = torch.arange(1, dataset.num_classes + 1)
    for idx in tqdm(range(len(dataset)), desc='Testing epoch ' + str(self.epoch)):
        ltree, lsent, rtree, rsent, label = dataset[idx]
        linput, rinput = Var(lsent, volatile=True), Var(rsent, volatile=True)
        target = Var(map_label_to_target(label, dataset.num_classes), volatile=True)
        if self.args.cuda:
            linput, rinput = linput.cuda(), rinput.cuda()
            target = target.cuda()
        output = self.model(ltree, linput, rtree, rinput)
        loss = self.criterion(output, target)
        total_loss += loss.data[0]
        output = output.data.squeeze().cpu()
        # Expected rating: class indices dotted with exp(output), which treats
        # output as log-probabilities over the rating classes.
        predictions[idx] = torch.dot(indices, torch.exp(output))
    return total_loss / len(dataset), predictions
def lovasz_hinge_flat(logits, labels):
    """
    Binary Lovasz hinge loss
      logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
      labels: [P] Tensor, binary ground truth labels (0 or 1)
    """
    if len(labels) == 0:
        # only void pixels, the gradients should be 0
        return logits.sum() * 0.
    signs = 2. * labels.float() - 1.
    errors = (1. - logits * Variable(signs))
    errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
    perm = perm.data
    gt_sorted = labels[perm]
    grad = lovasz_grad(gt_sorted)
    loss = torch.dot(F.relu(errors_sorted), Variable(grad))
    return loss
def backward(self, gradient, image):
    # lazy import
    import torch
    from torch.autograd import Variable

    assert gradient.ndim == 1

    gradient = torch.from_numpy(gradient)
    if self.cuda:  # pragma: no cover
        gradient = gradient.cuda()
    gradient = Variable(gradient)

    image = self._process_input(image)
    assert image.ndim == 3
    images = image[np.newaxis]
    images = torch.from_numpy(images)
    if self.cuda:  # pragma: no cover
        images = images.cuda()
    images = Variable(images, requires_grad=True)
    predictions = self._model(images)
    predictions = predictions[0]

    assert gradient.dim() == 1
    assert predictions.dim() == 1
    assert gradient.size() == predictions.size()

    loss = torch.dot(predictions, gradient)
    loss.backward()
    # should be the same as predictions.backward(gradient=gradient)

    grad = images.grad
    grad = grad.data
    if self.cuda:  # pragma: no cover
        grad = grad.cpu()
    grad = grad.numpy()
    grad = self._process_gradient(grad)
    grad = np.squeeze(grad, axis=0)
    assert grad.shape == image.shape
    return grad
def lovasz_softmax_flat(probas, labels, only_present=False):
    """
    Multi-class Lovasz-Softmax loss
      probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
      labels: [P] Tensor, ground truth labels (between 0 and C - 1)
      only_present: average only on classes present in ground truth
    """
    C = probas.size(1)
    losses = []
    for c in range(C):
        fg = (labels == c).float()  # foreground for class c
        if only_present and fg.sum() == 0:
            continue
        errors = (Variable(fg) - probas[:, c]).abs()
        errors_sorted, perm = torch.sort(errors, 0, descending=True)
        perm = perm.data
        fg_sorted = fg[perm]
        losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
    return mean(losses)
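# Both Lovasz losses above depend on a lovasz_grad helper that is not shown in
# these snippets. A sketch of it, following the reference Lovasz-Softmax
# implementation: the gradient of the Lovasz extension of the Jaccard loss
# with respect to the sorted errors.
def lovasz_grad(gt_sorted):
    p = len(gt_sorted)
    gts = gt_sorted.sum()
    intersection = gts - gt_sorted.float().cumsum(0)
    union = gts + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1. - intersection / union
    if p > 1:  # cover the 1-pixel case
        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
    return jaccard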
def test_remote_tensor_binary_methods(self):
    hook = TorchHook(verbose=False)
    local = hook.local_worker
    remote = VirtualWorker(hook, 0)
    local.add_worker(remote)

    x = torch.FloatTensor([1, 2, 3, 4, 5]).send(remote)
    y = torch.FloatTensor([1, 2, 3, 4, 5]).send(remote)
    assert (x.add_(y).get() == torch.FloatTensor([2, 4, 6, 8, 10])).all()

    x = torch.FloatTensor([1, 2, 3, 4]).send(remote)
    y = torch.FloatTensor([[1, 2, 3, 4]]).send(remote)
    z = torch.matmul(x, y.t())
    assert torch.equal(z.get(), torch.FloatTensor([30]))

    z = torch.add(x, y)
    assert torch.equal(z.get(), torch.FloatTensor([[2, 4, 6, 8]]))

    x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
    y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
    z = torch.cross(x, y, dim=1)
    assert torch.equal(z.get(), torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]]))

    x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
    y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]]).send(remote)
    z = torch.dist(x, y)
    t = torch.FloatTensor([z])
    assert torch.equal(t, torch.FloatTensor([0.]))

    x = torch.FloatTensor([1, 2, 3]).send(remote)
    y = torch.FloatTensor([1, 2, 3]).send(remote)
    z = torch.dot(x, y)
    t = torch.FloatTensor([z])
    assert torch.equal(t, torch.FloatTensor([14]))

    z = torch.eq(x, y)
    assert torch.equal(z.get(), torch.ByteTensor([1, 1, 1]))

    z = torch.ge(x, y)
    assert torch.equal(z.get(), torch.ByteTensor([1, 1, 1]))
def test_local_tensor_binary_methods(self):
    """Unit tests for methods mentioned on issue 1385
    https://github.com/OpenMined/PySyft/issues/1385"""
    x = torch.FloatTensor([1, 2, 3, 4])
    y = torch.FloatTensor([[1, 2, 3, 4]])
    z = torch.matmul(x, y.t())
    assert torch.equal(z, torch.FloatTensor([30]))

    z = torch.add(x, y)
    assert torch.equal(z, torch.FloatTensor([[2, 4, 6, 8]]))

    x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    z = torch.cross(x, y, dim=1)
    assert torch.equal(z, torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]]))

    x = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    y = torch.FloatTensor([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
    z = torch.dist(x, y)
    assert torch.equal(torch.FloatTensor([z]), torch.FloatTensor([0]))

    x = torch.FloatTensor([1, 2, 3])
    y = torch.FloatTensor([1, 2, 3])
    z = torch.dot(x, y)
    # There is an issue with some Macs getting 0.0 instead
    # Solved here: https://github.com/pytorch/pytorch/issues/5609
    assert torch.equal(torch.FloatTensor([z]), torch.FloatTensor([14]))

    z = torch.eq(x, y)
    assert torch.equal(z, torch.ByteTensor([1, 1, 1]))

    z = torch.ge(x, y)
    assert torch.equal(z, torch.ByteTensor([1, 1, 1]))

    x = torch.FloatTensor([1, 2, 3, 4, 5])
    y = torch.FloatTensor([1, 2, 3, 4, 5])
    assert (x.add_(y) == torch.FloatTensor([2, 4, 6, 8, 10])).all()
def step(self, closure, b=None, M_inv=None):
    """
    Performs a single optimization step.

    Arguments:
        closure (callable): A closure that re-evaluates the model
            and returns a tuple of the loss and the output.
        b (callable, optional): A closure that calculates the vector b in
            the minimization problem x^T . A . x + x^T b.
        M (callable, optional): The INVERSE preconditioner of A
    """
    assert len(self.param_groups) == 1

    group = self.param_groups[0]
    alpha = group['alpha']
    delta_decay = group['delta_decay']
    cg_max_iter = group['cg_max_iter']
    damping = group['damping']
    use_gnm = group['use_gnm']
    verbose = group['verbose']

    state = self.state[self._params[0]]
    state.setdefault('func_evals', 0)
    state.setdefault('n_iter', 0)

    loss_before, output = closure()
    current_evals = 1
    state['func_evals'] += 1

    # Gather current parameters and respective gradients
    flat_params = parameters_to_vector(self._params)
    flat_grad = self._gather_flat_grad()

    # Define linear operator
    if use_gnm:
        # Generalized Gauss-Newton vector product
        def A(x):
            return self._Gv(loss_before, output, x, damping)
    else:
        # Hessian-vector product
        def A(x):
            return self._Hv(flat_grad, x, damping)

    if M_inv is not None:
        m_inv = M_inv()

        # Preconditioner recipe (Section 20.13)
        if m_inv.dim() == 1:
            m = (m_inv + damping) ** (-0.85)

            def M(x):
                return m * x
        else:
            m = torch.inverse(m_inv + damping * torch.eye(*m_inv.shape))

            def M(x):
                return m @ x
    else:
        M = None

    b = flat_grad.detach() if b is None else b().detach().flatten()

    # Initializing Conjugate-Gradient (Section 20.10)
    if state.get('init_delta') is not None:
        init_delta = delta_decay * state.get('init_delta')
    else:
        init_delta = torch.zeros_like(flat_params)

    eps = torch.finfo(b.dtype).eps

    # Conjugate-Gradient
    deltas, Ms = self._CG(A=A, b=b.neg(), x0=init_delta, M=M,
                          max_iter=cg_max_iter, tol=1e1 * eps, eps=eps,
                          martens=True)

    # Update parameters
    delta = state['init_delta'] = deltas[-1]
    M = Ms[-1]

    vector_to_parameters(flat_params + delta, self._params)
    loss_now = closure()[0]
    current_evals += 1
    state['func_evals'] += 1

    # Conjugate-Gradient backtracking (Section 20.8.7)
    if verbose:
        print("Loss before CG: {}".format(float(loss_before)))
        print("Loss before BT: {}".format(float(loss_now)))

    for (d, m) in zip(reversed(deltas[:-1][::2]), reversed(Ms[:-1][::2])):
        vector_to_parameters(flat_params + d, self._params)
        loss_prev = closure()[0]
        if float(loss_prev) > float(loss_now):
            break
        delta = d
        M = m
        loss_now = loss_prev

    if verbose:
        print("Loss after BT: {}".format(float(loss_now)))

    # The Levenberg-Marquardt Heuristic (Section 20.8.5)
    reduction_ratio = (float(loss_now) - float(loss_before)) / M if M != 0 else 1

    if reduction_ratio < 0.25:
        group['damping'] *= 3 / 2
    elif reduction_ratio > 0.75:
        group['damping'] *= 2 / 3
    if reduction_ratio < 0:
        group['init_delta'] = 0

    # Line Searching (Section 20.8.8)
    beta = 0.8
    c = 1e-2
    min_improv = min(c * torch.dot(b, delta), 0)

    for _ in range(60):
        if float(loss_now) <= float(loss_before) + alpha * min_improv:
            break
        alpha *= beta
        vector_to_parameters(flat_params + alpha * delta, self._params)
        loss_now = closure()[0]
    else:
        # No good update found
        alpha = 0.0
        loss_now = loss_before

    # Update the parameters (this time for real)
    vector_to_parameters(flat_params + alpha * delta, self._params)

    if verbose:
        print("Loss after LS: {0} (lr: {1:.3f})".format(float(loss_now), alpha))
        print("Tikhonov damping: {0:.3f} (reduction ratio: {1:.3f})".format(
            group['damping'], reduction_ratio), end='\n\n')

    return loss_now
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--exp_name', default='ijba_eval')
    parser.add_argument('-g', '--gpu', type=int, default=0)
    parser.add_argument('-d', '--data_dir',
                        default='/home/renyi/arunirc/data1/datasets/CS2')
    parser.add_argument('-p', '--protocol_dir',
                        default='/home/renyi/arunirc/data1/datasets/IJB-A/IJB-A_11_sets/')
    parser.add_argument('--fold', type=int, default=1, choices=[1, 10])
    parser.add_argument('--sqrt', action='store_true', default=False,
                        help='Add signed sqrt normalization')
    parser.add_argument('--cosine', action='store_true', default=False,
                        help='Use cosine similarity instead of L2 distance')
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('-m', '--model_path', default=MODEL_PATH,
                        help='Path to pre-trained model')
    parser.add_argument('--model_type', default=MODEL_TYPE,
                        choices=['resnet50', 'resnet101',
                                 'resnet101-512d', 'resnet101-512d-norm'])
    args = parser.parse_args()

    # CUDA setup
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    cuda = torch.cuda.is_available()
    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True  # enable if all images are same size

    # -----------------------------------------------------------------------------
    # 1. Model
    # -----------------------------------------------------------------------------
    num_class = 8631
    if args.model_type == 'resnet50':
        model = torchvision.models.resnet50(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101':
        model = torchvision.models.resnet101(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101-512d':
        model = torchvision.models.resnet101(pretrained=False)
        layers = []
        layers.append(torch.nn.Linear(2048, 512))
        layers.append(torch.nn.Linear(512, num_class))
        model.fc = torch.nn.Sequential(*layers)
    elif args.model_type == 'resnet101-512d-norm':
        model = torchvision.models.resnet101(pretrained=False)
        layers = []
        layers.append(torch.nn.Linear(2048, 512))
        layers.append(models.NormFeat(scale_factor=50.0))
        layers.append(torch.nn.Linear(512, num_class))
        model.fc = torch.nn.Sequential(*layers)
    else:
        raise NotImplementedError

    checkpoint = torch.load(args.model_path)

    if checkpoint['arch'] == 'DataParallel':
        model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4])
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.module  # get network module from inside its DataParallel wrapper
    else:
        model.load_state_dict(checkpoint['model_state_dict'])

    if cuda:
        model = model.cuda()

    # Convert the trained network into a "feature extractor"
    feature_map = list(model.children())
    if args.model_type in ('resnet101-512d', 'resnet101-512d-norm'):
        model.eval()
        extractor = model
        extractor.fc = nn.Sequential(extractor.fc[0])
    else:
        feature_map.pop()
        extractor = nn.Sequential(*feature_map)
    extractor.eval()  # ALWAYS set to evaluation mode (fixes BatchNorm, dropout, etc.)

    # -----------------------------------------------------------------------------
    # 2. Dataset
    # -----------------------------------------------------------------------------
    fold_id = 1
    file_ext = '.jpg'
    RGB_MEAN = [0.485, 0.456, 0.406]
    RGB_STD = [0.229, 0.224, 0.225]
    test_transform = transforms.Compose([
        # transforms.Scale(224),
        # transforms.CenterCrop(224),
        transforms.Scale((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])

    pairs_path = osp.join(args.protocol_dir, 'split%d' % fold_id,
                          'verify_comparisons_%d.csv' % fold_id)
    pairs = utils.read_ijba_pairs(pairs_path)
    protocol_file = osp.join(args.protocol_dir, 'split%d' % fold_id,
                             'verify_metadata_%d.csv' % fold_id)
    metadata = utils.get_ijba_1_1_metadata(protocol_file)  # dict
    assert np.all(np.unique(pairs) == np.unique(metadata['template_id']))  # sanity-check
    # face crops saved as <sighting_id.jpg>
    path_list = np.array([osp.join(args.data_dir, str(x) + file_ext)
                          for x in metadata['sighting_id']])

    # Create data loader
    test_loader = torch.utils.data.DataLoader(
        data_loader.IJBADataset(path_list, test_transform, split=fold_id),
        batch_size=args.batch_size, shuffle=False)

    # testing
    # for i in range(len(test_loader.dataset)):
    #     img = test_loader.dataset.__getitem__(i)
    #     sz = img.shape
    #     if sz[0] != 3:
    #         print(sz)

    # -----------------------------------------------------------------------------
    # 3. Feature extraction
    # -----------------------------------------------------------------------------
    print('Feature extraction...')
    cache_dir = osp.join(here, 'cache-' + args.model_type)
    if not osp.exists(cache_dir):
        os.makedirs(cache_dir)

    feat_path = osp.join(cache_dir, 'feat-fold-%d.mat' % fold_id)

    if not osp.exists(feat_path):
        features = []
        for batch_idx, images in tqdm.tqdm(enumerate(test_loader),
                                           total=len(test_loader),
                                           desc='Extracting features'):
            x = Variable(images, volatile=True)  # test-time memory conservation
            if cuda:
                x = x.cuda()
            feat = extractor(x)
            if cuda:
                feat = feat.data.cpu()  # free up GPU
            else:
                feat = feat.data
            features.append(feat)

        features = torch.cat(features, dim=0)  # (n_batch*batch_sz) x 512
        sio.savemat(feat_path, {'feat': features.cpu().numpy()})
    else:
        dat = sio.loadmat(feat_path)
        features = torch.FloatTensor(dat['feat'])
        del dat
        print('Loaded.')

    # -----------------------------------------------------------------------------
    # 4. Verification
    # -----------------------------------------------------------------------------
    scores = []
    labels = []  # labels: is_same_subject

    print('Computing pair labels . . . ')
    for pair in tqdm.tqdm(pairs):  # TODO - check tqdm
        sel_t0 = np.where(metadata['template_id'] == pair[0])
        sel_t1 = np.where(metadata['template_id'] == pair[1])
        subject0 = np.unique(metadata['subject_id'][sel_t0])
        subject1 = np.unique(metadata['subject_id'][sel_t1])
        labels.append(int(subject0 == subject1))
    labels = np.array(labels)
    print('done')

    # templates: average pool, then L2-normalize
    print('Pooling templates . . . ')
    pooled_features = []
    template_set = np.unique(metadata['template_id'])
    for tid in tqdm.tqdm(template_set):
        sel = np.where(metadata['template_id'] == tid)
        # pool template: 1 x n x 512 -> 1 x 512
        feat = features[sel, :].mean(1)
        if args.sqrt:
            # signed-square-root normalization
            feat = torch.mul(torch.sign(feat), torch.sqrt(torch.abs(feat) + 1e-12))
        pooled_features.append(F.normalize(feat, p=2, dim=1))
    pooled_features = torch.cat(pooled_features, dim=0)  # (n_batch*batch_sz) x 512
    print('done')

    print('Computing pair distances . . . ')
    for pair in tqdm.tqdm(pairs):
        sel_t0 = np.where(template_set == pair[0])
        sel_t1 = np.where(template_set == pair[1])
        if args.cosine:
            feat_dist = torch.dot(torch.squeeze(pooled_features[sel_t0]),
                                  torch.squeeze(pooled_features[sel_t1]))
        else:
            feat_dist = (pooled_features[sel_t0] - pooled_features[sel_t1]).norm(p=2, dim=1)
            feat_dist = -torch.squeeze(feat_dist)  # score: negative of L2-distance
            feat_dist = feat_dist.numpy()
        scores.append(feat_dist)
    scores = np.array(scores)

    # Metrics: TAR (tpr) at FAR (fpr)
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels, scores)
    fpr_levels = [0.0001, 0.001, 0.01, 0.1]
    f_interp = interpolate.interp1d(fpr, tpr)
    tpr_at_fpr = [f_interp(x) for x in fpr_levels]

    for (far, tar) in zip(fpr_levels, tpr_at_fpr):
        print('TAR @ FAR=%.4f : %.4f' % (far, tar))

    res = {}
    res['TAR'] = tpr_at_fpr
    res['FAR'] = fpr_levels
    with open(osp.join(cache_dir, 'result-1-1-fold-%d.yaml' % fold_id), 'w') as f:
        yaml.dump(res, f, default_flow_style=False)

    sio.savemat(osp.join(cache_dir, 'roc-1-1-fold-%d.mat' % fold_id),
                {'fpr': fpr, 'tpr': tpr,
                 'thresholds': thresholds, 'tpr_at_fpr': tpr_at_fpr})
def __getitem__(self, index):
    scene_id = self.scene_index[index // NUM_SAMPLE_PER_SCENE]
    sample_id = index % NUM_SAMPLE_PER_SCENE
    sample_path = os.path.join(self.image_folder,
                               'scene_' + str(scene_id),
                               'sample_' + str(sample_id))

    images = []
    for image_name in image_names:
        image_path = os.path.join(sample_path, image_name)
        image = Image.open(image_path)
        image.load()
        images.append(self.transform["image"](image))
    image_tensor = torch.stack(images)

    data_entries = self.annotation_dataframe[
        (self.annotation_dataframe['scene'] == scene_id)
        & (self.annotation_dataframe['sample'] == sample_id)]
    corners = data_entries[['fl_x', 'fr_x', 'bl_x', 'br_x',
                            'fl_y', 'fr_y', 'bl_y', 'br_y']].to_numpy()
    categories = data_entries.category_id.to_numpy()

    ego_path = os.path.join(sample_path, 'ego.png')
    ego_image = Image.open(ego_path)
    ego_image.load()
    ego_image = torchvision.transforms.functional.to_tensor(ego_image)
    road_image = convert_map_to_road_map(ego_image)
    road_image = self.transform["road"](road_image.type(torch.FloatTensor))

    # Map world coordinates to the 800 x 800 map, then scale to 256 x 256
    bounding_box = torch.as_tensor(corners).view(-1, 2, 4)
    bounding_box[:, 0] = (bounding_box[:, 0] * 10) + 400
    bounding_box[:, 1] = (-bounding_box[:, 1] * 10) + 400
    bounding_box = (bounding_box * 256) / 800
    bounding_box = bounding_box.transpose(1, 2)

    bbox = torch.zeros(bounding_box.shape[0], 4)
    bbox_new = torch.zeros(bounding_box.shape[0], 5)

    # Compute rotation angle from the center point
    for i, box in enumerate(bounding_box):
        if box[0][0] <= box[2][0] and box[0][1] >= box[1][1]:
            br = box[0]
            bl = box[1]
            fr = box[2]
            fl = box[3]
        else:
            fl = box[0]
            fr = box[1]
            bl = box[2]
            br = box[3]

        centerpoint = (fl + br) / 2
        if fl[0] > fr[0]:
            # negative angle
            if fr[0] != centerpoint[0]:
                theta = torch.atan((fr[1] - centerpoint[1]) / abs(fr[0] - centerpoint[0]))
            else:
                theta = (np.pi / 2)
            a = bl - centerpoint
            b = fl - centerpoint
            tempangle = torch.acos(torch.dot(a, b) / (torch.norm(a, 2) * torch.norm(b, 2)))
            beta = (np.pi - tempangle) / 2
            if fr[0] > centerpoint[0]:
                gamma = -(theta - beta)
            else:
                gamma = -(np.pi - theta - beta)
        elif fl[0] < fr[0]:
            # positive angle
            if centerpoint[0] != br[0]:
                theta = torch.atan((br[1] - centerpoint[1]) / abs(centerpoint[0] - br[0]))
            else:
                theta = np.pi / 2
            a = fl - centerpoint
            b = bl - centerpoint
            tempangle = torch.acos(torch.dot(a, b) / (torch.norm(a, 2) * torch.norm(b, 2)))
            beta = (np.pi - tempangle) / 2
            if br[0] > centerpoint[0]:
                gamma = (theta - beta)
            else:
                gamma = (np.pi - theta - beta)
        else:
            # use a tensor so torch.cos / torch.sin below accept it
            # (the original assigned the Python int 0 here)
            gamma = torch.tensor(0.)

        bbox_new[i, 4] = gamma

        translation_matrix = torch.tensor([[1, 0, centerpoint[0]],
                                           [0, 1, centerpoint[1]],
                                           [0, 0, 1]])
        reverse_translation_matrix = torch.tensor([[1, 0, -centerpoint[0]],
                                                   [0, 1, -centerpoint[1]],
                                                   [0, 0, 1]])
        rotation_matrix = torch.tensor([[torch.cos(-gamma), -torch.sin(-gamma), 0],
                                        [torch.sin(-gamma), torch.cos(-gamma), 0],
                                        [0, 0, 1]])

        # Homogeneous coordinates: append a row of ones to the 2 x 4 corner matrix
        box = torch.cat([box.transpose(0, 1),
                         torch.ones(box.shape[0]).type(torch.DoubleTensor).unsqueeze(0)],
                        dim=0)
        # Rotate the box about its center: translate to origin, rotate, translate back
        bbox_rotated = torch.matmul(
            translation_matrix,
            torch.matmul(rotation_matrix,
                         torch.matmul(reverse_translation_matrix, box)))[:2]

        if box[0][0] <= box[2][0] and box[0][1] >= box[1][1]:
            bbox_new[i, 0] = bbox_rotated[0, 1]
            bbox_new[i, 1] = bbox_rotated[1, 1]
            bbox_new[i, 2] = bbox_rotated[0, 2]
            bbox_new[i, 3] = bbox_rotated[1, 2]
        else:
            bbox_new[i, 0] = bbox_rotated[0, 0]
            bbox_new[i, 1] = bbox_rotated[1, 0]
            bbox_new[i, 2] = bbox_rotated[0, 3]
            bbox_new[i, 3] = bbox_rotated[1, 3]

    classes = torch.as_tensor(categories).view(-1, 1)

    if self.args.gen_semantic_map:
        semantic_map_path = os.path.join(sample_path, "semantic_map.npy")
        semantic_map = np.load(semantic_map_path)
        semantic_map = F.one_hot(torch.tensor(semantic_map).to(torch.int64), 11)
    else:  # self.args.gen_object_map
        semantic_map_path = os.path.join(sample_path, "object_map.npy")
        semantic_map = np.load(semantic_map_path)
        semantic_map = F.one_hot(torch.tensor(semantic_map).to(torch.int64), 3)
    semantic_map = semantic_map.transpose(1, 2).transpose(0, 1)

    if self.extra_info:
        actions = data_entries.action_id.to_numpy()
        # You can change the binary_lane to False to get a lane with
        lane_image = convert_map_to_lane_map(ego_image, binary_lane=True)
        action = torch.as_tensor(actions)
        ego = self.transform["road"](ego_image)
        road = lane_image
        return index, image_tensor, bbox_new, classes, action, ego, road_image, semantic_map
    else:
        return index, image_tensor, bbox_new, classes
import torch

# -- Matrix Multiplication --
x1 = torch.rand((2, 5))
x2 = torch.rand((5, 3))
x3 = torch.mm(x1, x2)  # Matrix multiplication of x1 and x2, out shape: 2x3
x3 = x1.mm(x2)  # Similar as line above

# -- Matrix Exponentiation --
matrix_exp = torch.rand(5, 5)
print(matrix_exp.matrix_power(3))  # is same as matrix_exp (mm) matrix_exp (mm) matrix_exp

# -- Element wise Multiplication --
x = torch.tensor([1, 2, 3])  # defined here so the snippet runs on its own
y = torch.tensor([9, 8, 7])
z = x * y  # z = [9, 16, 21] = [1*9, 2*8, 3*7]

# -- Dot product --
z = torch.dot(x, y)  # Dot product, in this case z = 1*9 + 2*8 + 3*7

# -- Batch Matrix Multiplication --
batch = 32
n = 10
m = 20
p = 30
tensor1 = torch.rand((batch, n, m))
tensor2 = torch.rand((batch, m, p))
out_bmm = torch.bmm(tensor1, tensor2)  # Will be shape: (b x n x p)

# -- Example of broadcasting --
x1 = torch.rand((5, 5))
x2 = torch.ones((1, 5))
z = x1 - x2  # x2 is broadcast across the 5 rows of x1, out shape: 5x5
# --------------------------------------------------------------------
# Matrix modes_to_nodes
val_r_inv = torch.inverse(val_r)

# Compute coordinate modes
coords_modes = torch.mm(val_r_inv, coords)

# Interpolated coordinates
interp_coords = torch.mm(val_i, coords_modes)

# Initialize jacobian
jacobian = torch.empty(3, 3, nnodes_if, dtype=torch.float64)

for inode in range(0, nnodes_if):
    jacobian[0, 0, inode] = torch.dot(ddxi_i[inode, :], coords_modes[:, 0])
    jacobian[0, 1, inode] = torch.dot(ddeta_i[inode, :], coords_modes[:, 0])
    jacobian[0, 2, inode] = torch.dot(ddzeta_i[inode, :], coords_modes[:, 0])
    jacobian[1, 0, inode] = torch.dot(ddxi_i[inode, :], coords_modes[:, 1])
    jacobian[1, 1, inode] = torch.dot(ddeta_i[inode, :], coords_modes[:, 1])
    jacobian[1, 2, inode] = torch.dot(ddzeta_i[inode, :], coords_modes[:, 1])
    jacobian[2, 0, inode] = torch.dot(ddxi_i[inode, :], coords_modes[:, 2])
    jacobian[2, 1, inode] = torch.dot(ddeta_i[inode, :], coords_modes[:, 2])
    jacobian[2, 2, inode] = torch.dot(ddzeta_i[inode, :], coords_modes[:, 2])
    update_progress("Computing Jacobian ", inode / (nnodes_if - 1))

if coord_sys == 'CYLINDRICAL':
    # torch.mv, not torch.mm as in the original: coords_modes[:, 0] is a
    # 1-D column slice, so matrix-vector product is the operation that runs.
    scaling_factor = torch.mv(val_i, coords_modes[:, 0])
    for inode in range(0, nnodes_if):
        jacobian[1, 0, inode] = jacobian[1, 0, inode] * scaling_factor[inode]
        jacobian[1, 1, inode] = jacobian[1, 1, inode] * scaling_factor[inode]
def train(self, num_episodes, save_path, batch_size):
    loss_file = open("results/losses.txt", "w")
    reward_file = open("results/rewards.txt", "w")
    step = 0
    for e in tqdm.tqdm(range(1, num_episodes + 1)):
        state = self.env.reset()
        done = False
        episode_reward = 0
        if self.prioritized_sample:
            gradient_accum = [torch.zeros(param.shape).to(self.device)
                              for param in self.training_model.parameters()]
        while not done:
            step += 1
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
            action = self.get_action(state)
            # self.env.render()
            new_state, reward, done, _ = self.env.step(action.item())
            self.replay_buffer.push(
                [state, action, new_state if not done else None, reward])
            episode_reward += reward
            state = new_state

            if len(self.replay_buffer) > self.minimum_buffer_size:
                if self.prioritized_sample:
                    states, actions, new_states, rewards, mask, importance_weights, indices = \
                        self.replay_buffer.sample(batch_size)
                    importance_weights = importance_weights.to(self.device)
                else:
                    states, actions, new_states, rewards, mask = \
                        self.replay_buffer.sample(batch_size)
                rewards = rewards.to(self.device)
                mask = mask.to(self.device)

                best_actions = self.get_best_actions(new_states)
                target_q_values = self.get_target_q_value(new_states, best_actions) * mask
                expected_q_values = self.get_training_q_value(states, actions)
                errors = (rewards + self.gamma * target_q_values - expected_q_values).pow(2)
                if self.prioritized_sample:
                    self.replay_buffer.update(errors.detach().cpu(), indices)
                loss = errors.mean().sqrt()
                loss_file.write("{}\n".format(loss.item()))
                self.optimizer.zero_grad()
                loss.backward()
                if self.prioritized_sample:
                    with torch.no_grad():
                        for i, param in enumerate(self.training_model.parameters()):
                            gradient_accum[i] += torch.dot(importance_weights, errors) * param.grad
                with torch.no_grad():
                    for param in self.training_model.parameters():
                        param.grad.data.clamp_(-1, 1)
                self.optimizer.step()

            if (self.transfer_frequency > 0) and (step % self.transfer_frequency == 0):
                self.target_model.load_state_dict(self.training_model.state_dict())

        if self.prioritized_sample:
            with torch.no_grad():
                for i, param in enumerate(self.training_model.parameters()):
                    new_param = param + gradient_accum[i]
                    param.data.copy_(new_param)

        reward_file.write("{}\t{}\n".format(e, episode_reward))

    self.target_model.load_state_dict(self.training_model.state_dict())
    torch.save(self.target_model.state_dict(), save_path)
    loss_file.close()
    reward_file.close()
    self.env.close()
def compute_weight(self, module, do_power_iteration):
    # NB: If `do_power_iteration` is set, the `u` and `v` vectors are
    #     updated in power iteration **in-place**. This is very important
    #     because in `DataParallel` forward, the vectors (being buffers) are
    #     broadcast from the parallelized module to each module replica,
    #     which is a new module object created on the fly. And each replica
    #     runs its own spectral norm power iteration. So simply assigning
    #     the updated vectors to the module this function runs on will cause
    #     the update to be lost forever. And the next time the parallelized
    #     module is replicated, the same randomly initialized vectors are
    #     broadcast and used!
    #
    #     Therefore, to make the change propagate back, we rely on two
    #     important behaviors (also enforced via tests):
    #     1. `DataParallel` doesn't clone storage if the broadcast tensor
    #        is already on the correct device; and it makes sure that the
    #        parallelized module is already on `device[0]`.
    #     2. If the out tensor in `out=` kwarg has the correct shape, it will
    #        just fill in the values.
    #     Therefore, since the same power iteration is performed on all
    #     devices, simply updating the tensors in-place will make sure that
    #     the module replica on `device[0]` will update the _u vector on the
    #     parallelized module (by shared storage).
    #
    #     However, after we update `u` and `v` in-place, we need to **clone**
    #     them before using them to normalize the weight. This is to support
    #     backproping through two forward passes, e.g., the common pattern in
    #     GAN training: loss = D(real) - D(fake). Otherwise, engine will
    #     complain that variables needed to do backward for the first forward
    #     (i.e., the `u` and `v` vectors) are changed in the second forward.
    weight = getattr(module, self.name + '_orig')
    u = getattr(module, self.name + '_u')
    v = getattr(module, self.name + '_v')
    sigma_log = getattr(module, self.name + '_sigma')  # for logging

    # get settings from conv-module (for transposed convolution)
    stride = module.stride
    padding = module.padding

    if do_power_iteration:
        with torch.no_grad():
            for _ in range(self.n_power_iterations):
                v_s = conv_transpose2d(u.view(self.out_shape), weight,
                                       stride=stride, padding=padding,
                                       output_padding=0)
                # Note: out flag for in-place changes
                v = normalize(v_s.view(-1), dim=0, eps=self.eps, out=v)

                u_s = conv2d(v.view(self.input_dim), weight, stride=stride,
                             padding=padding, bias=None)
                u = normalize(u_s.view(-1), dim=0, eps=self.eps, out=u)
            if self.n_power_iterations > 0:
                # See above on why we need to clone
                u = u.clone()
                v = v.clone()

    weight_v = conv2d(v.view(self.input_dim), weight, stride=stride,
                      padding=padding, bias=None)
    weight_v = weight_v.view(-1)
    sigma = torch.dot(u.view(-1), weight_v)
    # enforce spectral norm only as constraint
    factorReverse = torch.max(torch.ones(1).to(weight.device),
                              sigma / self.coeff)
    # for logging
    weight_v_det = weight_v.detach()
    u_det = u.detach()
    torch.max(torch.dot(u_det.view(-1), weight_v_det),
              torch.dot(u_det.view(-1), weight_v_det),
              out=sigma_log)
    # rescaling
    weight = weight / (factorReverse + 1e-5)  # for stability
    return weight
def compute_weight(self, module, do_power_iteration):
    # NB: If `do_power_iteration` is set, the `u` and `v` vectors are
    #     updated in power iteration **in-place**. This is very important
    #     because in `DataParallel` forward, the vectors (being buffers) are
    #     broadcast from the parallelized module to each module replica,
    #     which is a new module object created on the fly. And each replica
    #     runs its own spectral norm power iteration. So simply assigning
    #     the updated vectors to the module this function runs on will cause
    #     the update to be lost forever. And the next time the parallelized
    #     module is replicated, the same randomly initialized vectors are
    #     broadcast and used!
    #
    #     Therefore, to make the change propagate back, we rely on two
    #     important behaviors (also enforced via tests):
    #     1. `DataParallel` doesn't clone storage if the broadcast tensor
    #        is already on the correct device; and it makes sure that the
    #        parallelized module is already on `device[0]`.
    #     2. If the out tensor in `out=` kwarg has the correct shape, it will
    #        just fill in the values.
    #     Therefore, since the same power iteration is performed on all
    #     devices, simply updating the tensors in-place will make sure that
    #     the module replica on `device[0]` will update the _u vector on the
    #     parallelized module (by shared storage).
    #
    #     However, after we update `u` and `v` in-place, we need to **clone**
    #     them before using them to normalize the weight. This is to support
    #     backproping through two forward passes, e.g., the common pattern in
    #     GAN training: loss = D(real) - D(fake). Otherwise, engine will
    #     complain that variables needed to do backward for the first forward
    #     (i.e., the `u` and `v` vectors) are changed in the second forward.
    weight = getattr(module, self.name + '_orig')
    u = getattr(module, self.name + '_u')
    v = getattr(module, self.name + '_v')
    sigma_log = getattr(module, self.name + '_sigma')  # for logging
    weight_mat = self.reshape_weight_to_matrix(weight)

    if do_power_iteration:
        with torch.no_grad():
            for _ in range(self.n_power_iterations):
                # The spectral norm of the weight equals `u^T W v`, where `u`
                # and `v` are the first left and right singular vectors.
                # This power iteration produces approximations of `u` and `v`.
                v = normalize(torch.mv(weight_mat.t(), u), dim=0, eps=self.eps, out=v)
                u = normalize(torch.mv(weight_mat, v), dim=0, eps=self.eps, out=u)
            if self.n_power_iterations > 0:
                # See above on why we need to clone
                u = u.clone()
                v = v.clone()

    sigma = torch.dot(u, torch.mv(weight_mat, v))
    # soft normalization: only when sigma is larger than coeff
    factor = torch.max(torch.ones(1).to(weight.device), sigma / self.coeff)
    weight = weight / factor

    # for logging
    sigma_det = sigma.detach()
    torch.max(torch.ones(1).to(weight.device), sigma_det / self.coeff,
              out=sigma_log)
    return weight
def Bearing(xA_4d, xB_4d):
    # 4-D state: position in the first two entries, velocity in the last two
    dp = xA_4d[:2] - xB_4d[:2]
    v = xA_4d[2:]
    cos_theta = torch.dot(dp, v) / (torch.norm(dp) * torch.norm(v) + 1E-6)
    return cos_theta
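# Usage sketch for Bearing (hypothetical 4-D states [x, y, vx, vy]): the result
# is ~1 when A's velocity points along the direction from B to A.
import torch

xA = torch.tensor([2.0, 0.0, 1.0, 0.0])  # A at (2, 0), moving along +x
xB = torch.tensor([0.0, 0.0, 0.0, 0.0])  # B at the origin
print(Bearing(xA, xB))                   # ~1.0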
def compute_nll_from_model(data, pathmodel, pathweights, image_shape,
                           num_classes, nb_step=5,
                           optim_default=partial(optim.SGD, lr=5e-5, momentum=0.),
                           dataloader=False):
    print("Compute NLL from Model")
    torch.random.manual_seed(0)
    np.random.seed(0)
    lls = {}
    grad_total = {}
    grad_stat_total = {}
    likelihood_ratio_statistic = {}
    grad_total[0] = []
    for k in range(nb_step + 1):
        lls[k] = []
        # grad_total[k] = []
        grad_stat_total[k] = []
        likelihood_ratio_statistic[k] = []

    model = load_model_from_param(pathmodel, pathweights, num_classes,
                                  image_shape).cuda()

    if not dataloader:
        dataloader_aux = [(tqdm.tqdm(data), None)]
    else:
        dataloader_aux = tqdm.tqdm(iter(data))

    for data_list, _ in dataloader_aux:
        for x in data_list:
            # reload the trained weights into a fresh copy for this input
            model_copy = load_model_from_param(pathmodel, pathweights,
                                               num_classes, image_shape).cuda()
            optimizer = optim_default(model_copy.parameters())
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                break
            model_copy.zero_grad()
            grads = []
            diff_param = []
            x = x.to(device_test).unsqueeze(0)
            _, nll, _ = model_copy(x, y_onehot=None)
            nll.backward()
            lls[0].append(-nll.detach().cpu().item())
            optimizer.step()

            for name_copy, param_copy in model_copy.named_parameters():
                if param_copy.grad is not None:
                    grads.append(-param_copy.grad.view(-1))
            grad_total[0].append(
                torch.sum(lr * (torch.cat(grads) ** 2)).detach().cpu().item())

            for (name_copy, param_copy), (name, param) in zip(
                    model_copy.named_parameters(), model.named_parameters()):
                assert (name_copy == name)
                if param_copy.grad is not None:
                    aux_diff_param = param_copy.data.detach() - param.data.detach()
                    diff_param.append(aux_diff_param.view(-1))

            grads = torch.flatten(torch.cat(grads))
            diff_param = torch.flatten(torch.cat(diff_param))
            if not torch.isinf(torch.abs(torch.dot(grads, diff_param))).any():
                grad_stat_total[0].append(
                    torch.abs(torch.dot(grads, diff_param)).detach().cpu().item())

            for k in range(1, nb_step + 1):
                model_copy.zero_grad()
                diff_param = []
                _, nll, _ = model_copy(x, y_onehot=None)
                nll.backward()
                if not torch.isinf(-nll).any():
                    lls[k].append(-nll.detach().cpu().item())
                else:
                    print("INF NLL")
                    lls[k].append(torch.sign(nll).detach().cpu().item() * 1e8)
                optimizer.step()

                for (name_copy, param_copy), (name, param) in zip(
                        model_copy.named_parameters(), model.named_parameters()):
                    assert (name_copy == name)
                    if param_copy.grad is not None:
                        aux_diff_param = param_copy.data.detach() - param.data.detach()
                        diff_param.append(aux_diff_param.view(-1))
                diff_param = torch.flatten(torch.cat(diff_param))
                if not torch.isinf(torch.abs(torch.dot(grads, diff_param))).any():
                    grad_stat_total[k].append(
                        torch.abs(torch.dot(grads, diff_param)).detach().cpu().item())
                else:
                    print("Inf grad stat")

    grad_total[0] = np.array(grad_total[0])
    for key in grad_stat_total.keys():
        lls[key] = np.array(lls[key])
        likelihood_ratio_statistic[key] = lls[key] - lls[0]
        likelihood_ratio_statistic[key] = likelihood_ratio_statistic[key][
            np.where(np.abs(likelihood_ratio_statistic[key]) < 1e7)]
        grad_stat_total[key] = np.array(grad_stat_total[key])

    return lls, grad_total, grad_stat_total, likelihood_ratio_statistic
def compute_nll(data, model, nb_step=1,
                optim_default=partial(optim.SGD, lr=1e-5, momentum=0.),
                dataloader=False):
    print("Compute NLL")
    torch.random.manual_seed(0)
    np.random.seed(0)
    lls = {}
    grad_total = {}
    grad_stat_total = {}
    likelihood_ratio_statistic = {}
    for k in range(nb_step + 1):
        lls[k] = []
        grad_total[k] = []
        grad_stat_total[k] = []
        likelihood_ratio_statistic[k] = []

    if not dataloader:
        dataloader_aux = [(tqdm.tqdm(data), None)]
    else:
        dataloader_aux = tqdm.tqdm(iter(data))

    for data_list, _ in dataloader_aux:
        for x in data_list:
            # fine-tune a fresh copy of the model on the single input x
            model_copy = copy.deepcopy(model).to(device_test)
            optimizer = optim_default(model_copy.parameters())
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                break
            model_copy.zero_grad()
            grads = []
            diff_param = []
            x = x.to(device_test).unsqueeze(0)
            _, nll, _ = model_copy(x, y_onehot=None)
            nll.backward()
            lls[0].append(-nll.detach().cpu().item())
            optimizer.step()

            for name_copy, param_copy in model_copy.named_parameters():
                if param_copy.grad is not None:
                    grads.append(-param_copy.grad.view(-1))
            grad_total[0].append(
                torch.sum(lr * (torch.cat(grads) ** 2)).detach().cpu().item())

            for (name_copy, param_copy), (name, param) in zip(
                    model_copy.named_parameters(), model.named_parameters()):
                assert (name_copy == name)
                if param_copy.grad is not None:
                    aux_diff_param = param_copy.data - param.data
                    diff_param.append(aux_diff_param.view(-1))

            grads = torch.flatten(torch.cat(grads))
            diff_param = torch.flatten(torch.cat(diff_param))
            grad_stat_total[0].append(
                torch.abs(torch.dot(grads, diff_param)).detach().cpu().item())

            for k in range(1, nb_step + 1):
                model_copy.zero_grad()
                diff_param = []
                _, nll, _ = model_copy(x, y_onehot=None)
                nll.backward()
                lls[k].append(-nll.detach().cpu().item())
                optimizer.step()

                for (name_copy, param_copy), (name, param) in zip(
                        model_copy.named_parameters(), model.named_parameters()):
                    assert (name_copy == name)
                    if param_copy.grad is not None:
                        aux_diff_param = param_copy.data - param.data
                        diff_param.append(aux_diff_param.view(-1))

                grad_total[k].append(
                    torch.sum((grads ** 2) * lr).detach().cpu().item())
                diff_param = torch.flatten(torch.cat(diff_param))
                grad_stat_total[k].append(
                    torch.abs(torch.dot(grads, diff_param)).detach().cpu().item())

    for key in grad_total.keys():
        grad_total[key] = np.array(grad_total[key])
        lls[key] = np.array(lls[key])
        likelihood_ratio_statistic[key] = lls[key] - lls[0]
        grad_stat_total[key] = np.array(grad_stat_total[key])

    return lls, grad_total, grad_stat_total, likelihood_ratio_statistic
def estimate_metrics(pred, random_query, binary_target, sup_net, switch_vec):
    query_pred = torch.gather(pred, 1, random_query.view(-1, 1)).squeeze(1)
    num_s = torch.tensor(np.sum(switch_vec).item(), dtype=torch.float32).to(device)

    _, class_pred = torch.max(pred, dim=1)
    binary_pred = class_pred.eq(random_query).type(torch.cuda.LongTensor)
    correct = binary_target.eq(binary_pred).sum()
    accuracy = correct.type(torch.cuda.FloatTensor) / binary_target.size(0)
    bce_loss = bce(query_pred, binary_target.type(torch.cuda.FloatTensor))

    metrics = {}
    s_hist = {}
    ortho_mtrx = {}
    metrics['accuracy'] = accuracy
    metrics['bce_loss'] = bce_loss * args.lambda_bce
    metrics['l1_loss_total'] = torch.from_numpy(np.float32([0.])).to(device)
    metrics['orthogonality_loss_total'] = torch.from_numpy(np.float32([0.])).to(device)
    metrics['quantization_loss_total'] = torch.from_numpy(np.float32([0.])).to(device)
    metrics['total_loss'] = torch.from_numpy(np.float32([0.])).to(device)
    metrics['total_loss'] = metrics['total_loss'] + metrics['bce_loss']

    one_hot = torch.zeros((10, 10)).fill_(1).to(device)
    s_one_hot = torch.zeros(10, 10).type(torch.cuda.FloatTensor)
    s_queries = torch.from_numpy(np.array(list(range(10)))).to(device)
    s_one_hot = s_one_hot.scatter_(dim=1, index=s_queries.view(-1, 1), src=one_hot)
    s_vectors_all = sup_net(s_one_hot)

    for k in range(len(switch_vec)):
        if switch_vec[k]:
            s_vectors = s_vectors_all[k]
            for i in range(10):
                s_hist['s_layer_{}_class_{}'.format(k, i)] = \
                    s_vectors[i].cpu().data.numpy()

            sparsity_loss = l1(s_vectors, torch.zeros_like(s_vectors).to(device))

            orth_loss = torch.from_numpy(np.float32([0.])).to(device)
            for i in range(10):
                for j in range(i, 10):
                    orth_loss = orth_loss + torch.dot(s_vectors[i], s_vectors[j])

            ortho_mtrx['layer_{}'.format(k)] = np.zeros((10, 10))
            for i in range(10):
                for j in range(10):
                    ortho_mtrx['layer_{}'.format(k)][i][j] = torch.dot(
                        s_vectors[i], s_vectors[j]).cpu().data.numpy()

            quantization_target = s_vectors.detach() > 0.5
            quantization_loss = mse(
                s_vectors, quantization_target.type(torch.cuda.FloatTensor))

            orth_loss = orth_loss / 45

            metrics['l1_loss_{}'.format(k)] = sparsity_loss * args.lambda_l1
            metrics['l1_loss_total'] = metrics['l1_loss_total'] + \
                metrics['l1_loss_{}'.format(k)]
            metrics['orthogonality_loss_{}'.format(k)] = orth_loss * args.lambda_ortho
            metrics['orthogonality_loss_total'] = metrics['orthogonality_loss_total'] + \
                metrics['orthogonality_loss_{}'.format(k)]
            metrics['quantization_loss_{}'.format(k)] = quantization_loss * args.lambda_quant
            metrics['quantization_loss_total'] = metrics['quantization_loss_total'] + \
                metrics['quantization_loss_{}'.format(k)]

    metrics['total_loss'] = metrics['total_loss'] + metrics['l1_loss_total'] / num_s + \
        metrics['orthogonality_loss_total'] / num_s + \
        metrics['quantization_loss_total'] / num_s

    return metrics, s_hist, ortho_mtrx
def train_model(num_epochs, dataset_name, datadir, feature, model_name, fraction,
                select_every, optim_type, learning_rate, run, device, log_dir,
                trn_batch_size, strategy):
    # Loading the dataset
    trainset, validset, testset, num_cls = load_dataset_custom(datadir, dataset_name, feature)
    N = len(trainset)
    val_batch_size = 1000
    tst_batch_size = 1000

    # Creating the data loaders
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=trn_batch_size,
                                              shuffle=False, pin_memory=True)
    valloader = torch.utils.data.DataLoader(validset, batch_size=val_batch_size,
                                            shuffle=False, pin_memory=True)
    testloader = torch.utils.data.DataLoader(testset, batch_size=tst_batch_size,
                                             shuffle=False, pin_memory=True)

    # Budget for subset selection
    bud = int(fraction * N)
    print("Budget, fraction and N:", bud, fraction, N)

    # Subset selection and creating the subset data loader
    start_idxs = np.random.choice(N, size=bud, replace=False)
    idxs = start_idxs
    data_sub = Subset(trainset, idxs)
    subset_trnloader = torch.utils.data.DataLoader(data_sub, batch_size=trn_batch_size,
                                                   shuffle=False, pin_memory=True)

    # Variables to store losses, accuracies and timings
    gammas = torch.ones(len(idxs)).to(device)
    substrn_losses = np.zeros(num_epochs)
    val_losses = np.zeros(num_epochs)
    timing = np.zeros(num_epochs)
    val_acc = np.zeros(num_epochs)
    tst_acc = np.zeros(num_epochs)
    subtrn_acc = np.zeros(num_epochs)

    # Results logging file
    print_every = 3
    all_logs_dir = log_dir + '/' + str(uuid.uuid4())
    while os.path.exists(all_logs_dir):
        all_logs_dir = log_dir + '/' + str(uuid.uuid4())
    print(all_logs_dir)
    subprocess.run(["mkdir", "-p", all_logs_dir])
    path_logfile = os.path.join(all_logs_dir, 'log.txt')
    logfile = open(path_logfile, 'w')
    exp_name = dataset_name + '_fraction:' + str(fraction) + '_epochs:' + str(num_epochs) + \
               '_selEvery:' + str(select_every) + '_variant' + '_runs' + str(run)
    print(exp_name)

    # Model creation
    model = create_model(model_name, num_cls, device)
    model1 = create_model(model_name, num_cls, device)

    # Loss functions
    criterion, criterion_nored = loss_function()

    # Getting the optimizer and scheduler
    optimizer, scheduler = optimizer_with_scheduler(optim_type, model, num_epochs, learning_rate)

    if strategy == 'GradMatch':
        # OMPGradMatch selection strategy
        setf_model = OMPGradMatchStrategy(trainloader, valloader, model1, criterion,
                                          learning_rate, device, num_cls, True,
                                          'PerClassPerGradient', False, lam=0.5, eps=1e-100)
    elif strategy == 'GradMatchPB':
        setf_model = OMPGradMatchStrategy(trainloader, valloader, model1, criterion,
                                          learning_rate, device, num_cls, True,
                                          'PerBatch', False, lam=0, eps=1e-100)
    elif strategy == 'GradMatch-Explore':
        # OMPGradMatch selection strategy
        setf_model = OMPGradMatchStrategy(trainloader, valloader, model1, criterion,
                                          learning_rate, device, num_cls, True,
                                          'PerClassPerGradient', False, lam=0.5, eps=1e-100)
        # Random-Online selection strategy for the exploration phase
        rand_setf_model = RandomStrategy(trainloader, online=True)
    elif strategy == 'GradMatchPB-Explore':
        # OMPGradMatch selection strategy
        setf_model = OMPGradMatchStrategy(trainloader, valloader, model1, criterion,
                                          learning_rate, device, num_cls, True,
                                          'PerBatch', False, lam=0, eps=1e-100)
        # Random-Online selection strategy for the exploration phase
        rand_setf_model = RandomStrategy(trainloader, online=True)
    elif strategy == 'Random':
        # Random selection strategy
        setf_model = RandomStrategy(trainloader, online=False)
    elif strategy == 'Random-Online':
        # Random-Online selection strategy
        setf_model = RandomStrategy(trainloader, online=True)

    print("=======================================", file=logfile)
    kappa_epochs = int(0.5 * num_epochs)
    full_epochs = floor(kappa_epochs / int(fraction * 100))

    for i in range(num_epochs):
        subtrn_loss = 0
        subtrn_correct = 0
        subtrn_total = 0
        subset_selection_time = 0
        train_time = 0  # default, in case no training branch below matches

        if (strategy in ['GLISTER', 'GradMatch', 'GradMatchPB', 'CRAIG', 'CRAIGPB']) and \
                (((i + 1) % select_every) == 0):
            start_time = time.time()
            cached_state_dict = copy.deepcopy(model.state_dict())
            clone_dict = copy.deepcopy(model.state_dict())
            if strategy in ['CRAIG', 'CRAIGPB']:
                subset_idxs, gammas = setf_model.select(int(bud), clone_dict, 'lazy')
            else:
                subset_idxs, gammas = setf_model.select(int(bud), clone_dict)
            model.load_state_dict(cached_state_dict)
            idxs = subset_idxs
            if strategy in ['GradMatch', 'GradMatchPB', 'CRAIG', 'CRAIGPB']:
                gammas = torch.from_numpy(np.array(gammas)).to(device).to(torch.float32)
            subset_selection_time += (time.time() - start_time)
        elif strategy in ['GLISTER-Explore', 'GradMatch-Explore', 'GradMatchPB-Explore',
                          'CRAIG-Explore', 'CRAIGPB-Explore']:
            start_time = time.time()
            if i < full_epochs:
                subset_idxs, gammas = rand_setf_model.select(int(bud))
                idxs = subset_idxs
                gammas = gammas.to(device)
            elif (i % select_every == 0) and (i >= kappa_epochs):
                cached_state_dict = copy.deepcopy(model.state_dict())
                clone_dict = copy.deepcopy(model.state_dict())
                if strategy in ['CRAIG-Explore', 'CRAIGPB-Explore']:
                    subset_idxs, gammas = setf_model.select(int(bud), clone_dict, 'lazy')
                else:
                    subset_idxs, gammas = setf_model.select(int(bud), clone_dict)
                model.load_state_dict(cached_state_dict)
                idxs = subset_idxs
                if strategy in ['GradMatch-Explore', 'GradMatchPB-Explore',
                                'CRAIG-Explore', 'CRAIGPB-Explore']:
                    gammas = torch.from_numpy(np.array(gammas)).to(device).to(torch.float32)
            subset_selection_time += (time.time() - start_time)

        print("selEpoch: %d, Selection Ended at:" % (i), str(datetime.datetime.now()))
        data_sub = Subset(trainset, idxs)
        subset_trnloader = torch.utils.data.DataLoader(data_sub, batch_size=trn_batch_size,
                                                       shuffle=False, pin_memory=True)

        model.train()
        batch_wise_indices = list(subset_trnloader.batch_sampler)
        if strategy in ['CRAIG', 'CRAIGPB', 'GradMatch', 'GradMatchPB']:
            start_time = time.time()
            for batch_idx, (inputs, targets) in enumerate(subset_trnloader):
                # targets can be moved with non_blocking=True
                inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                optimizer.zero_grad()
                outputs = model(inputs)
                losses = criterion_nored(outputs, targets)
                # weighted subset loss: dot product of per-sample losses and gammas
                loss = torch.dot(losses, gammas[batch_wise_indices[batch_idx]]) / \
                       (gammas[batch_wise_indices[batch_idx]].sum())
                loss.backward()
                subtrn_loss += loss.item()
                optimizer.step()
                _, predicted = outputs.max(1)
                subtrn_total += targets.size(0)
                subtrn_correct += predicted.eq(targets).sum().item()
            train_time = time.time() - start_time
        elif strategy in ['CRAIGPB-Explore', 'CRAIG-Explore', 'GradMatch-Explore',
                          'GradMatchPB-Explore']:
            start_time = time.time()
            if i < full_epochs:
                for batch_idx, (inputs, targets) in enumerate(trainloader):
                    inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    subtrn_loss += loss.item()
                    optimizer.step()
                    _, predicted = outputs.max(1)
                    subtrn_total += targets.size(0)
                    subtrn_correct += predicted.eq(targets).sum().item()
            elif i >= kappa_epochs:
                for batch_idx, (inputs, targets) in enumerate(subset_trnloader):
                    inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    losses = criterion_nored(outputs, targets)
                    loss = torch.dot(losses, gammas[batch_wise_indices[batch_idx]]) / \
                           (gammas[batch_wise_indices[batch_idx]].sum())
                    loss.backward()
                    subtrn_loss += loss.item()
                    optimizer.step()
                    _, predicted = outputs.max(1)
                    subtrn_total += targets.size(0)
                    subtrn_correct += predicted.eq(targets).sum().item()
            train_time = time.time() - start_time
        elif strategy in ['Full']:
            start_time = time.time()
            for batch_idx, (inputs, targets) in enumerate(trainloader):
                inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                subtrn_loss += loss.item()
                optimizer.step()
                _, predicted = outputs.max(1)
                subtrn_total += targets.size(0)
                subtrn_correct += predicted.eq(targets).sum().item()
            train_time = time.time() - start_time

        scheduler.step()
        timing[i] = train_time + subset_selection_time
        # print("Epoch timing is: " + str(timing[i]))

        val_loss = 0
        val_correct = 0
        val_total = 0
        tst_correct = 0
        tst_total = 0
        tst_loss = 0
        model.eval()
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(valloader):
                inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += targets.size(0)
                val_correct += predicted.eq(targets).sum().item()
            for batch_idx, (inputs, targets) in enumerate(testloader):
                inputs, targets = inputs.to(device), targets.to(device, non_blocking=True)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                tst_loss += loss.item()
                _, predicted = outputs.max(1)
                tst_total += targets.size(0)
                tst_correct += predicted.eq(targets).sum().item()

        val_acc[i] = val_correct / val_total
        tst_acc[i] = tst_correct / tst_total
        subtrn_acc[i] = subtrn_correct / subtrn_total
        substrn_losses[i] = subtrn_loss
        val_losses[i] = val_loss
        print('Epoch:', i + 1, 'Validation Accuracy: ', val_acc[i],
              'Test Accuracy: ', tst_acc[i], 'Train Accuracy:', subtrn_acc[i],
              'Time: ', timing[i])

    print(strategy + " Selection Run---------------------------------")
    print("Final SubsetTrn:", subtrn_loss)
    print("Validation Loss and Accuracy:", val_loss, val_acc.max())
    print("Test Data Loss and Accuracy:", tst_loss, tst_acc.max())
    print('-----------------------------------')

    # Results logging into the file
    print(strategy, file=logfile)
    print('---------------------------------------------------------------------', file=logfile)
    val = "Validation Accuracy, "
    tst = "Test Accuracy, "
    trn = "Train Accuracy, "
    time_str = "Time, "
    for i in range(num_epochs):
        time_str = time_str + "," + str(timing[i])
        val = val + "," + str(val_acc[i])
        trn = trn + "," + str(subtrn_acc[i])
        tst = tst + "," + str(tst_acc[i])
    print(time_str, file=logfile)  # was print(timing, ...), which ignored the assembled string
    print(val, file=logfile)
    print(trn, file=logfile)
    print(tst, file=logfile)

    omp_timing = np.array(timing)
    omp_cum_timing = list(generate_cumulative_timing(omp_timing))
    omp_tst_acc = list(tst_acc)  # filter() here was missing its predicate argument
    print("Total time taken by " + strategy + " = " + str(omp_cum_timing[-1]))
    logfile.close()
    return {
        'loss': -tst_acc.max(),
        'max_val_acc': val_acc.max(),
        'train_acc': subtrn_acc.max(),
        'status': STATUS_OK
    }
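# The weighted subset loss above is a dot product between per-sample losses
# and selection weights (gammas), normalized by the weight sum. A minimal
# standalone sketch with toy values:
import torch

losses = torch.tensor([0.5, 1.0, 2.0])
gammas = torch.tensor([1.0, 0.0, 3.0])
weighted = torch.dot(losses, gammas) / gammas.sum()
print(weighted)  # tensor(1.6250), i.e. (0.5*1 + 2.0*3) / 4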
def f(x):
    # Reward vector is one-hot: only the first category pays out.
    rewards = np.zeros(n_cats)
    rewards[0] = 1.
    rewards = torch.tensor(rewards).float()
    # print(x, rewards)
    return torch.dot(x, rewards)
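# A minimal usage sketch for f above; n_cats is a hypothetical global the
# snippet assumes. With a one-hot reward the dot product just selects x[0].
import numpy as np
import torch

n_cats = 4
x = torch.tensor([0.7, 0.1, 0.1, 0.1])
print(f(x))  # tensor(0.7000)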
def calc_potential_energy(self, xx):
    # Quadratic form: (x - bias)^T W (x - bias)
    xx = xx - self.bias
    potential_energy = torch.dot(xx, torch.matmul(self.weight_matrix, xx))
    return potential_energy
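# A standalone sketch of the quadratic-form energy above; the class wrapper
# is omitted and bias/weight_matrix are toy stand-ins.
import torch

bias = torch.zeros(2)
weight_matrix = torch.tensor([[2.0, 0.0], [0.0, 1.0]])
x = torch.tensor([1.0, 3.0])
energy = torch.dot(x - bias, torch.mv(weight_matrix, x - bias))
print(energy)  # tensor(11.), since 2*1^2 + 1*3^2 = 11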
def get_coordinates(v, basis_vectors, offset_v):
    # Signed length of the projection of (v - offset_v) onto each basis
    # direction; for an orthonormal basis these are exact coordinates.
    adjusted_v = v - offset_v
    coeffs = [float(torch.dot(adjusted_v, b) / torch.norm(b)) for b in basis_vectors]
    return coeffs
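# Quick check of get_coordinates against the standard basis (a hypothetical
# call): with orthonormal axes the coefficients are just the shifted components.
import torch

basis = [torch.tensor([1.0, 0.0]), torch.tensor([0.0, 1.0])]
print(get_coordinates(torch.tensor([3.0, 4.0]), basis, torch.tensor([1.0, 1.0])))
# [2.0, 3.0]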
import torch

A = torch.tensor([1, 2, 3], dtype=torch.float)
B = torch.tensor([4, 5, 6], dtype=torch.float)
result = torch.dot(A, B)
print(result)         # tensor(32.)
print(result.item())  # 32.0
def train(self, env, expert, render=False):
    num_iters = self.train_config["num_iters"]
    num_steps_per_iter = self.train_config["num_steps_per_iter"]
    horizon = self.train_config["horizon"]
    lambda_ = self.train_config["lambda"]
    gae_gamma = self.train_config["gae_gamma"]
    gae_lambda = self.train_config["gae_lambda"]
    eps = self.train_config["epsilon"]
    max_kl = self.train_config["max_kl"]
    cg_damping = self.train_config["cg_damping"]
    normalize_advantage = self.train_config["normalize_advantage"]

    opt_d = torch.optim.Adam(self.d.parameters())

    exp_rwd_iter = []
    exp_obs = []
    exp_acts = []

    # Collect expert demonstrations
    steps = 0
    while steps < num_steps_per_iter:
        ep_obs = []
        ep_rwds = []

        t = 0
        done = False

        ob = env.reset()
        while not done and steps < num_steps_per_iter:
            act = expert.act(ob)

            ep_obs.append(ob)
            exp_obs.append(ob)
            exp_acts.append(act)

            if render:
                env.render()
            ob, rwd, done, info = env.step(act)

            ep_rwds.append(rwd)

            t += 1
            steps += 1

            if horizon is not None:
                if t >= horizon:
                    break

        if done:
            exp_rwd_iter.append(np.sum(ep_rwds))

        ep_obs = FloatTensor(ep_obs)
        ep_rwds = FloatTensor(ep_rwds)

    exp_rwd_mean = np.mean(exp_rwd_iter)
    print("Expert Reward Mean: {}".format(exp_rwd_mean))

    exp_obs = FloatTensor(exp_obs)
    exp_acts = FloatTensor(np.array(exp_acts))

    rwd_iter_means = []
    for i in range(num_iters):
        rwd_iter = []

        obs = []
        acts = []
        rets = []
        advs = []
        gms = []

        # Collect rollouts from the current policy
        steps = 0
        while steps < num_steps_per_iter:
            ep_obs = []
            ep_acts = []
            ep_rwds = []
            ep_costs = []
            ep_disc_costs = []
            ep_gms = []
            ep_lmbs = []

            t = 0
            done = False

            ob = env.reset()
            while not done and steps < num_steps_per_iter:
                act = self.act(ob)

                ep_obs.append(ob)
                obs.append(ob)

                ep_acts.append(act)
                acts.append(act)

                if render:
                    env.render()
                ob, rwd, done, info = env.step(act)

                ep_rwds.append(rwd)
                ep_gms.append(gae_gamma ** t)
                ep_lmbs.append(gae_lambda ** t)

                t += 1
                steps += 1

                if horizon is not None:
                    if t >= horizon:
                        break

            if done:
                rwd_iter.append(np.sum(ep_rwds))

            ep_obs = FloatTensor(ep_obs)
            # ep_acts = FloatTensor(np.array(ep_acts)).to(torch.device("cuda"))
            ep_acts = FloatTensor(np.array(ep_acts))
            ep_rwds = FloatTensor(ep_rwds)
            # ep_disc_rwds = FloatTensor(ep_disc_rwds)
            ep_gms = FloatTensor(ep_gms)
            ep_lmbs = FloatTensor(ep_lmbs)

            # Costs come from the discriminator: -log D(s, a)
            ep_costs = (-1) * torch.log(self.d(ep_obs, ep_acts)).squeeze().detach()
            ep_disc_costs = ep_gms * ep_costs

            ep_disc_rets = FloatTensor([sum(ep_disc_costs[i:]) for i in range(t)])
            ep_rets = ep_disc_rets / ep_gms

            rets.append(ep_rets)

            self.v.eval()
            curr_vals = self.v(ep_obs).detach()
            next_vals = torch.cat((self.v(ep_obs)[1:], FloatTensor([[0.]]))).detach()
            ep_deltas = ep_costs.unsqueeze(-1) + gae_gamma * next_vals - curr_vals

            # Generalized Advantage Estimation
            ep_advs = torch.FloatTensor([
                ((ep_gms * ep_lmbs)[:t - j].unsqueeze(-1) * ep_deltas[j:]).sum()
                for j in range(t)
            ])
            advs.append(ep_advs)
            gms.append(ep_gms)

        rwd_iter_means.append(np.mean(rwd_iter))
        print("Iterations: {}, Reward Mean: {}".format(i + 1, np.mean(rwd_iter)))

        obs = FloatTensor(obs)
        # acts = FloatTensor(np.array(acts)).to(torch.device("cuda"))
        acts = FloatTensor(np.array(acts))
        rets = torch.cat(rets)
        advs = torch.cat(advs)
        gms = torch.cat(gms)

        if normalize_advantage:
            advs = (advs - advs.mean()) / advs.std()

        # Discriminator update: expert targets are 0, novice targets are 1
        self.d.train()
        exp_scores = self.d.get_logits(exp_obs, exp_acts)
        nov_scores = self.d.get_logits(obs, acts)

        opt_d.zero_grad()
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            exp_scores, torch.zeros_like(exp_scores)
        ) + torch.nn.functional.binary_cross_entropy_with_logits(
            nov_scores, torch.ones_like(nov_scores)
        )
        loss.backward()
        opt_d.step()

        # Value-function update via a trust-region style natural gradient step
        self.v.train()
        old_params = get_flat_params(self.v).detach()
        old_v = self.v(obs).detach()

        def constraint():
            return ((old_v - self.v(obs)) ** 2).mean()

        grad_diff = get_flat_grads(constraint(), self.v)

        def Hv(v):
            # Hessian-vector product via double backprop
            hessian = get_flat_grads(torch.dot(grad_diff, v), self.v).detach()
            return hessian

        g = get_flat_grads(((-1) * (self.v(obs).squeeze() - rets) ** 2).mean(),
                           self.v).detach()
        s = conjugate_gradient(Hv, g).detach()
        Hs = Hv(s).detach()
        alpha = torch.sqrt(2 * eps / torch.dot(s, Hs))
        new_params = old_params + alpha * s
        set_params(self.v, new_params)

        # Policy update (TRPO)
        self.pi.train()
        old_params = get_flat_params(self.pi).detach()
        old_distb = self.pi(obs)

        def L():
            distb = self.pi(obs)
            # NOTE: the hard-coded .to(torch.device("cuda")) assumes a GPU run
            return (advs.to(torch.device("cuda")) * torch.exp(
                distb.log_prob(acts) - old_distb.log_prob(acts).detach()
            )).mean()

        def kld():
            distb = self.pi(obs)
            if self.discrete:
                old_p = old_distb.probs.detach()
                p = distb.probs
                return (old_p * (torch.log(old_p) - torch.log(p))).sum(-1).mean()
            else:
                old_mean = old_distb.mean.detach()
                old_cov = old_distb.covariance_matrix.sum(-1).detach()
                mean = distb.mean
                cov = distb.covariance_matrix.sum(-1)
                return (0.5) * (
                    (old_cov / cov).sum(-1)
                    + (((old_mean - mean) ** 2) / cov).sum(-1)
                    - self.action_dim
                    + torch.log(cov).sum(-1)
                    - torch.log(old_cov).sum(-1)
                ).mean()

        grad_kld_old_param = get_flat_grads(kld(), self.pi)

        def Hv(v):
            hessian = get_flat_grads(torch.dot(grad_kld_old_param, v), self.pi).detach()
            return hessian + cg_damping * v

        g = get_flat_grads(L(), self.pi).detach()
        s = conjugate_gradient(Hv, g).detach()
        Hs = Hv(s).detach()
        new_params = rescale_and_linesearch(g, s, Hs, max_kl, L, kld, old_params, self.pi)

        # Causal-entropy regularization of the policy gradient
        disc_causal_entropy = ((-1) * gms * self.pi(obs).log_prob(acts)).mean()
        grad_disc_causal_entropy = get_flat_grads(disc_causal_entropy, self.pi)
        new_params += lambda_ * grad_disc_causal_entropy

        set_params(self.pi, new_params)

    return exp_rwd_mean, rwd_iter_means
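# The conjugate_gradient helper used above is not shown in this snippet. A
# minimal sketch of the standard algorithm, built around torch.dot, might
# look like this (Av is any function computing a Hessian-vector product;
# the name conjugate_gradient_sketch is hypothetical):
import torch

def conjugate_gradient_sketch(Av, b, max_iter=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()            # residual b - A x, with x = 0 initially
    p = b.clone()            # search direction
    rs_old = torch.dot(r, r)
    for _ in range(max_iter):
        Ap = Av(p)
        alpha = rs_old / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_new = torch.dot(r, r)
        if rs_new < residual_tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x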
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    assert len(self.param_groups) == 1

    loss = None
    if closure is not None:
        loss = closure()

    group = self.param_groups[0]
    weight_decay = group['weight_decay']
    momentum = group['momentum']
    dampening = group['dampening']
    nesterov = group['nesterov']

    grad = self._gather_flat_grad_with_weight_decay(weight_decay)

    # NOTE: this optimizer has only global state, but we register it as state
    # for the first param, because this helps with casting in load_state_dict
    state = self.state[self._params[0]]
    # State initialization
    if len(state) == 0:
        state['step'] = 0
        state['grad_prev'] = torch.zeros_like(grad)
        # Accumulated momentum for the hypergradient
        state['momentum_buffer_h'] = grad.new_tensor(0)

    state['step'] += 1
    grad_prev = state['grad_prev']

    # Hypergradient for SGD
    h = -torch.dot(grad, grad_prev)

    # Hypergradient descent with momentum (HD-momentum) coefficients:
    #   momentum_h  : momentum coefficient for the hypergradient
    #   dampening_h : dampening coefficient for the hypergradient
    #   nesterov_h  : bool; if True, use Nesterov momentum for the l.r.
    #                 update, else use SGD + momentum
    momentum_h = group['momentum_h']
    dampening_h = group['dampening_h']
    nesterov_h = group['nesterov_h']

    # Hypergradient descent with momentum (HD momentum) for the learning rate
    if momentum_h and state['step'] > 1:
        buf_h = state['momentum_buffer_h']
        buf_h.mul_(momentum_h).add_(1 - dampening_h, h)
        state['momentum_buffer_h'] = buf_h
        if nesterov_h:
            h.add_(momentum_h, buf_h)
        else:
            h = buf_h

    group['lr'] -= group['hypergrad_lr'] * h

    if momentum != 0:
        if 'momentum_buffer' not in state:
            buf = state['momentum_buffer'] = torch.zeros_like(grad)
            buf.mul_(momentum).add_(grad)
        else:
            buf = state['momentum_buffer']
            buf.mul_(momentum).add_(1 - dampening, grad)
        if nesterov:
            grad.add_(momentum, buf)
        else:
            grad = buf

    state['grad_prev'] = grad

    self._add_grad(-group['lr'], grad)

    return loss
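# Sketch of the hypergradient idea used above, outside any optimizer class:
# h is the negated dot product of successive gradients, and the learning
# rate moves opposite to h. Values here are toy stand-ins.
import torch

grad = torch.tensor([0.5, -1.0])
grad_prev = torch.tensor([0.4, -0.8])
h = -torch.dot(grad, grad_prev)  # aligned gradients give negative h
lr = 0.1 - 1e-3 * h.item()       # so the learning rate increases
print(lr)  # 0.101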
def get_pure_mspbe(self):
    # MSPBE = 1/2 (A theta - b)^T C^{-1} (A theta - b)
    A_theta_minus_b = torch.mv(self.A, self.theta) - self.b
    return (1 / 2) * torch.dot(A_theta_minus_b, torch.mv(self.C_inv, A_theta_minus_b))
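# A standalone sketch of the MSPBE quadratic form above, with toy stand-ins
# for A, b, C_inv and theta:
import torch

A = torch.tensor([[1.0, 0.0], [0.0, 2.0]])
b = torch.tensor([1.0, 1.0])
C_inv = torch.eye(2)
theta = torch.tensor([2.0, 2.0])
residual = torch.mv(A, theta) - b  # [1., 3.]
print(0.5 * torch.dot(residual, torch.mv(C_inv, residual)))  # tensor(5.)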
def test(learner, args, train_envs, test_envs, log_dir):
    batch_sampler = sampler(args.batch_size, args.num_bandits)
    batch_sampler.build(args.num_tasks_train, train_envs, args.batch_size)

    max_kl = args.max_kl
    cg_iters = args.cg_iters
    cg_damping = args.cg_damping
    ls_max_steps = args.ls_max_steps
    ls_backtrack_ratio = args.ls_backtrack_ratio

    train_rew = []
    test_rew = []
    for i in range(args.num_updates):
        adapt_params = []
        inner_losses = []
        adapt_episodes = []
        rew_rem = []
        rew_rem_test = []

        # Evaluate one-step adaptation on the held-out test tasks
        for j in range(args.num_tasks_test):
            e = batch_sampler.sample(test_envs[j], learner)
            inner_loss = learner.cal_loss(e.s, e.a, e.r)
            params = learner.update_params(inner_loss, args.inner_lr, args.first_order)
            a_e = batch_sampler.sample_policy(test_envs[j], learner, params)
            mean_rew = torch.mean(a_e.r).data.numpy()
            rew_rem_test.append(mean_rew)

        # Adapt on the training tasks and keep the episodes for the meta-update
        for j in range(args.num_tasks_train):
            e = batch_sampler.sample(train_envs[j], learner)
            inner_loss = learner.cal_loss(e.s, e.a, e.r)
            params = learner.update_params(inner_loss, args.inner_lr, args.first_order)
            a_e = batch_sampler.sample_policy(train_envs[j], learner, params)
            adapt_params.append(params)
            adapt_episodes.append(a_e)
            inner_losses.append(inner_loss)
            mean_rew = torch.mean(a_e.r).data.numpy()
            rew_rem.append(mean_rew)

        print(i, np.mean(rew_rem), np.mean(rew_rem_test))
        train_rew.append(np.mean(rew_rem))
        test_rew.append(np.mean(rew_rem_test))

        old_loss, _, old_pis = learner.surrogate_loss(adapt_episodes, inner_losses)
        grads = torch.autograd.grad(old_loss, learner.parameters(), retain_graph=True)
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = learner.hessian_vector_product(
            adapt_episodes, inner_losses, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product, grads, cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
        lagrange_multiplier = torch.sqrt(shs / max_kl)
        step = stepdir / lagrange_multiplier

        # Save the old parameters
        old_params = parameters_to_vector(learner.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step, learner.parameters())
            loss, kl, _ = learner.surrogate_loss(adapt_episodes, inner_losses,
                                                 old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            # No acceptable step found: restore the old parameters
            vector_to_parameters(old_params, learner.parameters())

        if (i + 1) % 10 == 0:
            test_input = torch.FloatTensor([[1]])
            test_output = learner.forward(test_input).data.numpy()[0]
            plt.figure()
            plt.bar(np.arange(len(test_output)), test_output)
            plt.savefig(log_dir + 'figures/before%i.png' % i)
            plt.close()
            for j in range(args.num_tasks_train):
                test_output = learner.forward(test_input, adapt_params[j]).data.numpy()[0]
                plt.figure()
                plt.bar(np.arange(len(test_output)), test_output)
                plt.savefig(log_dir + 'figures/after%i_%i.png' % (j, i))
                plt.close()

    np.save(log_dir + 'train_rew' + str(args.inner_lr) + '.npy', train_rew)
    np.save(log_dir + 'test_rew' + str(args.inner_lr) + '.npy', test_rew)

    plt.figure()
    plt.plot(train_rew)
    plt.show()
    plt.figure()
    plt.plot(train_rew)
    plt.savefig(log_dir + 'train_rew.png')
    plt.close()
    plt.figure()
    plt.plot(test_rew)
    plt.savefig(log_dir + 'test_rew.png')
    plt.figure()
    plt.bar(np.arange(len(batch_sampler.poss)), batch_sampler.poss / batch_sampler.cnt)
    plt.savefig(log_dir + 'sample.png')
    plt.close()
    return
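# Why the Lagrange multiplier above: scaling the CG direction s by
# sqrt(2*max_kl / s^T H s) makes the quadratic KL estimate of the scaled
# step exactly max_kl. A tiny numeric check with a diagonal Hessian stand-in:
import torch

H = torch.diag(torch.tensor([2.0, 4.0]))
s = torch.tensor([1.0, 1.0])
max_kl = 0.01
shs = 0.5 * torch.dot(s, torch.mv(H, s))         # tensor(3.)
step = s / torch.sqrt(shs / max_kl)
print(0.5 * torch.dot(step, torch.mv(H, step)))  # tensor(0.0100)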
def adj_broyden_correl(opa_freq, n_runs=1, random_prescribed=True,
                       dataset='imagenet', model_size='LARGE'):
    # setup
    model = setup_model(opa_freq is not None, dataset, model_size)
    if dataset == 'imagenet':
        traindir = os.path.join(config.DATASET.ROOT + '/images', config.DATASET.TRAIN_SET)
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(config.MODEL.IMAGE_SIZE[0]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        train_dataset = datasets.ImageFolder(traindir, transform_train)
    else:
        normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                         (0.2023, 0.1994, 0.2010))
        augment_list = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip()
        ] if config.DATASET.AUGMENT else []
        transform_train = transforms.Compose(augment_list + [
            transforms.ToTensor(),
            normalize,
        ])
        train_dataset = datasets.CIFAR10(root=f'{config.DATASET.ROOT}', train=True,
                                         download=True, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        num_workers=10,
        pin_memory=True,
        worker_init_fn=partial(worker_init_fn, seed=42),
    )
    methods_results = {
        method_name: {'correl': [], 'ratio': []}
        for method_name in ['shine-adj-br', 'shine', 'shine-opa', 'fpn']
    }
    methods_solvers = {
        'shine': broyden,
        'shine-adj-br': adj_broyden,
        'shine-opa': adj_broyden,
        'fpn': broyden,
    }
    random_results = {'correl': [], 'ratio': []}
    iter_loader = iter(train_loader)
    for i_run in range(n_runs):
        input, target = next(iter_loader)
        target = target.cuda(non_blocking=True)
        x_list, z_list = model.feature_extraction(input.cuda())
        model.fullstage._reset(z_list)
        model.fullstage_copy._copy(model.fullstage)
        # fixed point solving
        x_list = [x.clone().detach().requires_grad_() for x in x_list]
        cutoffs = [(elem.size(1), elem.size(2), elem.size(3)) for elem in z_list]
        args = (27, int(1e9), None)
        nelem = sum([elem.nelement() for elem in z_list])
        eps = 1e-5 * np.sqrt(nelem)
        z1_est = DEQFunc2d.list2vec(z_list)
        directions_dir = {
            'random': torch.randn(z1_est.shape),
            'prescribed': torch.randn(z1_est.shape),
        }
        for method_name in methods_results.keys():
            z1_est = torch.zeros_like(z1_est)
            g = lambda x: DEQFunc2d.g(model.fullstage_copy, x, x_list, cutoffs, *args)
            if random_prescribed:
                inverse_direction_fun = lambda x: directions_dir['prescribed']
            else:
                model.copy_modules()
                loss_function = lambda y_est: model.get_fixed_point_loss(y_est, target)

                def inverse_direction_fun_vec(x):
                    x_temp = x.clone().detach().requires_grad_()
                    with torch.enable_grad():
                        x_list = DEQFunc2d.vec2list(x_temp, cutoffs)
                        loss = loss_function(x_list)
                    loss.backward()
                    dl_dx = x_temp.grad
                    return dl_dx

                inverse_direction_fun = inverse_direction_fun_vec
            solver = methods_solvers[method_name]
            if 'opa' in method_name:
                add_kwargs = dict(
                    inverse_direction_freq=opa_freq,
                    inverse_direction_fun=inverse_direction_fun
                    if opa_freq is not None else None,
                )
            else:
                add_kwargs = {}
            result_info = solver(
                g,
                z1_est,
                threshold=config.MODEL.F_THRES,
                eps=eps,
                name="forward",
                **add_kwargs,
            )
            z1_est = result_info['result']
            Us = result_info['Us']
            VTs = result_info['VTs']
            nstep = result_info['lowest_step']
            if opa_freq is not None:
                nstep += (nstep - 1) // opa_freq
            # compute true incoming gradient if needed
            if not random_prescribed:
                directions_dir['prescribed'] = inverse_direction_fun_vec(z1_est)
            # making sure the random direction norm is not unrealistic
            directions_dir['random'] = directions_dir['random'] * torch.norm(
                directions_dir['prescribed']) / torch.norm(directions_dir['random'])
            # inversion on random gradients
            z1_temp = z1_est.clone().detach().requires_grad_()
            with torch.enable_grad():
                y = DEQFunc2d.g(model.fullstage_copy, z1_temp, x_list, cutoffs, *args)
            eps = 2e-10
            for direction_name, direction in directions_dir.items():

                def g(x):
                    y.backward(x, retain_graph=True)
                    res = z1_temp.grad + direction
                    z1_temp.grad.zero_()
                    return res

                result_info_inversion = broyden(
                    g,
                    direction,  # we initialize Jacobian-Free style
                                # in order to accelerate the convergence
                    threshold=35,
                    eps=eps,
                    name="backward",
                )
                true_inv = result_info_inversion['result']
                inv_dir = {
                    'fpn': direction,
                    'shine': -rmatvec(Us[:, :, :, :nstep - 1], VTs[:, :nstep - 1], direction),
                }
                inv_dir['shine-opa'] = inv_dir['shine']
                inv_dir['shine-adj-br'] = inv_dir['shine']
                approx_inv = inv_dir[method_name]
                # cosine similarity between the true and approximate inverses
                correl = torch.dot(
                    torch.flatten(true_inv),
                    torch.flatten(approx_inv),
                )
                scaling = torch.norm(true_inv) * torch.norm(approx_inv)
                correl = correl / scaling
                ratio = torch.norm(true_inv) / torch.norm(approx_inv)
                if direction_name == 'prescribed':
                    methods_results[method_name]['correl'].append(correl.item())
                    methods_results[method_name]['ratio'].append(ratio.item())
                else:
                    if method_name == 'fpn':
                        random_results['correl'].append(correl.item())
                        random_results['ratio'].append(ratio.item())
            y.backward(torch.zeros_like(true_inv), retain_graph=False)
    return methods_results, random_results
def get_att_score(self, dec_output, enc_output):
    # enc_output: [batch_size, num_directions(=1) * n_hidden]
    score = self.attn(enc_output)  # score : [batch_size, n_hidden]
    # the inner product yields a scalar attention energy
    return torch.dot(dec_output.view(-1), score.view(-1))
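# A minimal sketch of the dot-product attention score above, with self.attn
# replaced by an explicit nn.Linear (sizes are hypothetical):
import torch
import torch.nn as nn

n_hidden = 4
attn = nn.Linear(n_hidden, n_hidden)
dec_output = torch.randn(1, n_hidden)
enc_output = torch.randn(1, n_hidden)
score = torch.dot(dec_output.view(-1), attn(enc_output).view(-1))
print(score)  # a single scalar attention energy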
def main(data_path, model_path, w2i_path, hidden_size, classifier_path=None,
         intervention=False, learning_rate=None, component_names=None,
         generate_labels=False):
    # load word-to-index vocabulary
    with open(w2i_path, 'r') as f:
        vocab_lines = f.readlines()

    w2i = {}
    for i, line in enumerate(vocab_lines):
        w2i[line.strip()] = i
    unk_idx = w2i['<unk>']
    vocab_size = len(w2i)

    # load and initialise model
    lstm = Forward_LSTM(vocab_size, hidden_size, hidden_size, vocab_size,
                        w2i_path, model_path)

    # initialise hidden state for time step -1;
    # the hidden state will not be reset for each sentence
    relevant_activations = {}
    relevant_activations['hx_l0'] = torch.Tensor(torch.zeros(hidden_size))
    relevant_activations['cx_l0'] = torch.Tensor(torch.zeros(hidden_size))
    relevant_activations['hx_l1'] = torch.Tensor(torch.zeros(hidden_size))
    relevant_activations['cx_l1'] = torch.Tensor(torch.zeros(hidden_size))

    # load testing data
    with open(data_path, 'r') as f_in:
        test_set = f_in.readlines()[1:]

    # collect scores for all subcategories
    scores_original_nvv = []
    scores_nonce_nvv = []
    scores_original_vnpcv = []
    scores_nonce_vnpcv = []
    scores_original = []
    scores_nonce = []
    scores = []

    if intervention:
        classifiers = defaultdict()
        # load diagnostic classifiers for the intervention
        for act in component_names:
            with open("{}/{}.pickle".format(classifier_path, act), 'rb') as trained_classifier:
                classifiers[act] = pickle.load(trained_classifier)

    # process sentences
    for line_idx in range(0, len(test_set), 2):
        # read two consecutive lines with the following structure:
        # 0:pattern 1:constr_id 2:sent_id 3:correct_number 4:form 5:class 6:type
        # 7:prefix 8:n_attr 9:punct 10:freq 11:len_context 12:len_prefix 13:sent
        sent_data1 = test_set[line_idx].split('\t')
        sent_data2 = test_set[line_idx + 1].split('\t')

        # L__NOUN_VERB_VERB or R__VERB_NOUN_CCONJ_VERB
        pattern1 = sent_data1[0]
        pattern2 = sent_data2[0]
        assert(pattern1[0] in ['R', 'L'] and pattern2[0] in ['R', 'L'])
        assert(pattern1[0] == pattern2[0])
        construction_id = 0 if pattern1[0] == 'R' else 1

        assert(sent_data1[3] == sent_data2[3])
        if not generate_labels:
            label = 0 if sent_data1[3].strip() == 'sing' else 1

        assert(sent_data1[5] != sent_data2[5])
        if sent_data1[5] == 'correct':
            correct_form = sent_data1[4]
            wrong_form = sent_data2[4]
        else:
            correct_form = sent_data2[4]
            wrong_form = sent_data1[4]

        assert(sent_data1[6] == sent_data2[6])
        type_of_sent = sent_data1[6]

        assert(sent_data1[11] == sent_data2[11])
        context_length = int(sent_data1[11])

        assert(sent_data1[12] == sent_data2[12])
        target_idx = int(sent_data1[12])
        subject_idx = target_idx - context_length

        assert(sent_data1[13] == sent_data2[13])
        sentence = sent_data1[13].split()

        # process sentence
        for t, word in enumerate(sentence):
            output, layer0, layer1 = lstm(word,
                                          relevant_activations['hx_l0'],
                                          relevant_activations['cx_l0'],
                                          relevant_activations['hx_l1'],
                                          relevant_activations['cx_l1'])
            relevant_activations['hx_l0'] = layer0[0]
            relevant_activations['hx_l1'] = layer1[0]
            relevant_activations['cx_l0'] = layer0[1]
            relevant_activations['cx_l1'] = layer1[1]

            if t == target_idx - 1:
                vocab_probs = F.log_softmax(output.view(-1, len(w2i)), dim=1)[0]

            # intervention at subject timestep
            if intervention and t == subject_idx:
                for act in component_names:
                    weight, bias = classifiers[act]
                    weight = Variable(torch.tensor(weight, dtype=torch.double).squeeze(0),
                                      requires_grad=False)
                    bias = Variable(torch.tensor(bias, dtype=torch.double),
                                    requires_grad=False)
                    current_activation = Variable(
                        torch.tensor(relevant_activations[act], dtype=torch.double),
                        requires_grad=True)

                    # logistic-regression probe: p(class 1) = sigmoid(w . h + b)
                    total_prob = torch.tensor(1.0, dtype=torch.double)
                    class_1_prob = torch.dot(weight, current_activation) + bias
                    class_1_prob = F.sigmoid(class_1_prob)
                    class_0_prob = total_prob - class_1_prob
                    class_0_log_prob = torch.log(class_0_prob)
                    class_1_log_prob = torch.log(class_1_prob)

                    params = [current_activation]
                    optimiser = torch.optim.SGD(params, lr=learning_rate)
                    optimiser.zero_grad()

                    prediction = (class_0_log_prob, class_1_log_prob)
                    prediction = torch.tensor(torch.cat(prediction)).unsqueeze(0)

                    # unsupervised intervention requires generated labels
                    if generate_labels:
                        label = 0 if class_0_prob > class_1_prob else 1
                    gold_label = torch.tensor(label).unsqueeze(0)

                    criterion = nn.NLLLoss()
                    loss = criterion(prediction, gold_label)
                    loss.backward()
                    optimiser.step()

                    relevant_activations[act] = torch.tensor(current_activation,
                                                             dtype=torch.float)

        correct_form_score = vocab_probs[w2i[correct_form]].data
        wrong_form_score = vocab_probs[w2i[wrong_form]].data

        if (correct_form_score > wrong_form_score).all():
            score = 1
        else:
            score = 0
        scores.append(score)

        if construction_id == 0 and type_of_sent == 'original':
            scores_original_vnpcv.append(score)
            scores_original.append(score)
        if construction_id == 1 and type_of_sent == 'original':
            scores_original_nvv.append(score)
            scores_original.append(score)
        if construction_id == 0 and type_of_sent == 'generated':
            scores_nonce_vnpcv.append(score)
            scores_nonce.append(score)
        if construction_id == 1 and type_of_sent == 'generated':
            scores_nonce_nvv.append(score)
            scores_nonce.append(score)

    assert(len(scores) == len(test_set) / 2)

    # Print accuracy results
    print('Original V NP Conj V ', np.sum(scores_original_vnpcv) / len(scores_original_vnpcv))
    print('Nonce V NP Conj V ', np.sum(scores_nonce_vnpcv) / len(scores_nonce_vnpcv))
    print('Original N V V ', np.sum(scores_original_nvv) / len(scores_original_nvv))
    print('Nonce N V V ', np.sum(scores_nonce_nvv) / len(scores_nonce_nvv))
    print('Original Overall ', np.sum(scores_original) / len(scores_original))
    print('Nonce Overall ', np.sum(scores_nonce) / len(scores_nonce))
    print('Overall ', np.sum(scores) / len(scores))
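# The intervention above nudges an activation so a diagnostic classifier
# predicts the gold label. A stripped-down sketch of that single SGD step
# (weight, bias and the activation are toy stand-ins):
import torch

w = torch.tensor([1.0, -1.0], dtype=torch.double)
b = torch.tensor(0.0, dtype=torch.double)
h = torch.tensor([0.2, 0.5], dtype=torch.double, requires_grad=True)
p1 = torch.sigmoid(torch.dot(w, h) + b)
loss = -torch.log(1 - p1)  # push the probe toward class 0
loss.backward()
with torch.no_grad():
    h -= 0.5 * h.grad      # one gradient step on the activation itself
print(h)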
def cosine(a, b):
    numerator = torch.dot(a, b)
    denominator = torch.norm(a, 2) * torch.norm(b, 2)
    return float(numerator / denominator)
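# Quick check of cosine above: orthogonal vectors give 0, parallel give 1.
import torch

print(cosine(torch.tensor([1.0, 0.0]), torch.tensor([0.0, 2.0])))  # 0.0
print(cosine(torch.tensor([1.0, 2.0]), torch.tensor([2.0, 4.0])))  # 1.0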
def get_att_score(self, hidden, encoder_hidden):
    score = self.attn(encoder_hidden)
    return torch.dot(hidden.view(-1), score.view(-1))
# print(max_1)
# print(value)
# print(index)
# print(max_1_0)
# print(max_1_1)
'''
(tensor([2., 4., 6., 8.]), tensor([1, 1, 1, 1]))
tensor([2., 4., 6., 8.])
tensor([1, 1, 1, 1])
tensor([2., 4., 6., 8.])
tensor([1, 1, 1, 1])
'''

### Dot product
tensor = torch.Tensor([1, 2, 3, 4, 5])
dot = torch.dot(tensor, tensor)
# print(dot)
'''
tensor(55.)
'''

### Mathematical functions
tensor = torch.Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
sqrt = torch.sqrt(tensor)
exp = torch.exp(tensor)
log = torch.log(tensor)
# print(sqrt)
# print(exp)
# print(log)
'''
def matrix_factorization(R, K, steps=100, lr=0.002):
    """
    Input
    -----
    R - Tensor: Ratings matrix
        Dimensions: N-users by M-items
    K - Int: Number of latent features

    Output
    ------
    P: User-feature matrix
        Dimensions: N-users by K-features
    Qt: Transpose of Item-feature matrix
        Dimensions: K-features by M-items

    Reference
    ---------
    https://towardsdatascience.com/recommendation-system-matrix-factorization-d61978660b4b
    """
    # Initialize P and Qt with random integer values in [1, 5), cast to float
    # so the fractional gradient updates below are not silently truncated
    N_users, M_items = R.size()
    P = torch.randint(1, 5, (N_users, K)).float()
    Qt = torch.randint(1, 5, (K, M_items)).float()
    beta = 0.02
    prev_e = float("inf")

    # Training loop
    for step in range(steps):
        # Calculate and apply gradients for every observed rating
        for user_i in range(N_users):
            for item_j in range(M_items):
                if R[user_i][item_j] > 0:
                    pred = torch.dot(P[user_i, :], Qt[:, item_j])
                    err_ij = R[user_i][item_j] - pred
                    for k in range(K):
                        # Calculate and apply the regularized gradient step
                        P[user_i][k] = P[user_i][k] + lr * (2 * err_ij * Qt[k][item_j] - beta * P[user_i][k])
                        Qt[k][item_j] = Qt[k][item_j] + lr * (2 * err_ij * P[user_i][k] - beta * Qt[k][item_j])

        # Calculate the regularized squared error
        e = 0.0
        for user_i in range(N_users):
            for item_j in range(M_items):
                if R[user_i][item_j] > 0:
                    e = e + pow(R[user_i][item_j] - torch.dot(P[user_i, :], Qt[:, item_j]), 2)
                    for k in range(K):
                        e = e + (beta / 2) * (pow(P[user_i][k], 2) + pow(Qt[k][item_j], 2))

        # Stop early once the loss improvement is small
        if 0 < (prev_e - e) < 50:
            break
        prev_e = e
        if step % 1 == 0:
            print("step: %s, loss: %s" % (step + 1, e))
    return P, Qt
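# Tiny smoke test for matrix_factorization above, on a 3x3 ratings matrix
# with missing entries encoded as 0 (hypothetical data):
import torch

R = torch.tensor([[5.0, 3.0, 0.0],
                  [4.0, 0.0, 1.0],
                  [0.0, 1.0, 5.0]])
P, Qt = matrix_factorization(R, K=2)
print(torch.mm(P, Qt))  # reconstruction, approximating the observed entries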
def main(dom="driving", reptype="wordfeat", splittype="LOOtask", excludeid=2, taskrepsize=2, modeltype="neural", gpmode=0, pval=0.1, seed=0, nfolds=10): modelname = modeltype + "_" + str(taskrepsize) + "_" + str( gpmode) + "_" + dom + "_" + splittype + "_" + str(excludeid) # check what kind of modifications to the GP we are using print("Modelname: ", modelname) usepriormean, usepriorpoints = getGPParams(gpmode) verbose = False torch.manual_seed(seed) # set up our seed for reproducibility np.random.seed(seed) # load the data data, nparts = loadData(dom) # print(data) # recreate word vectors if needed # e.g., when you download new word features from glove. recreate_word_vectors = False if recreate_word_vectors: recreateWordVectors() # load word features wordfeatures = loadWordFeatures(dom, loadpickle=True) print(wordfeatures.shape) # in the experiments in the paper, we use the word features directly. However, # you can also use tsne or pca dim-reduced features. tsnefeatures = computeTSNEFeatures(wordfeatures) pcafeatures = computePCAFeatures(wordfeatures) allfeatures = { "wordfeat": wordfeatures, "tsne": tsnefeatures, "pca": pcafeatures } # create primary dataset dataset = createDataset(data, reptype, allfeatures) # create dataset splits expdata = getTrainTestValSplit(data, dataset, splittype, excludeid=excludeid, pval=pval, nfolds=nfolds) nfeats = allfeatures[reptype].shape[1] # we don't use an initial projection matrix. You can substitute one here if you like Ainit = None inptasksobs = Variable(dtype(expdata["tasksobsfeats_train"]), requires_grad=False) inptasksperf = Variable(dtype(expdata["tasksobsperf_train"]), requires_grad=False) inptaskspred = Variable(dtype(expdata["taskspredfeats_train"]), requires_grad=False) outtrustpred = Variable(dtype(expdata["trustpred_train"]), requires_grad=False) inptasksobs_val = Variable(dtype(expdata["tasksobsfeats_val"]), requires_grad=False) inptasksperf_val = Variable(dtype(expdata["tasksobsperf_val"]), requires_grad=False) inptaskspred_val = Variable(dtype(expdata["taskspredfeats_val"]), requires_grad=False) outtrustpred_val = Variable(dtype(expdata["trustpred_val"]), requires_grad=False) inptasksobs_test = Variable(dtype(expdata["tasksobsfeats_test"]), requires_grad=False) inptasksperf_test = Variable(dtype(expdata["tasksobsperf_test"]), requires_grad=False) inptaskspred_test = Variable(dtype(expdata["taskspredfeats_test"]), requires_grad=False) outtrustpred_test = Variable(dtype(expdata["trustpred_test"]), requires_grad=False) learning_rate = 1e-3 if modeltype == "gp": learning_rate = 1e-1 usepriormean = usepriormean obsseqlen = 2 phiinit = 1.0 weight_decay = 0.01 #0.01 modelparams = { "inputsize": inptasksobs.shape[2], "reptype": reptype, "taskrepsize": taskrepsize, "phiinit": phiinit, "Ainit": None, # np.array(Ainit), "obsseqlen": obsseqlen, "verbose": verbose, "usepriormean": usepriormean, "usepriorpoints": usepriorpoints } elif modeltype == "neural": perfrepsize = taskrepsize numGRUlayers = 2 nperf = 2 weight_decay = 0.00 modelparams = { "perfrepsize": perfrepsize, "numGRUlayers": numGRUlayers, "nperf": nperf, "verbose": verbose, "taskrepsize": taskrepsize, "Ainit": None, #np.array(Ainit), "nfeats": inptasksobs.shape[2] } elif modeltype == "lineargaussian": obsseqlen = 2 weight_decay = 0.01 modelparams = { "inputsize": inptasksobs.shape[2], "obsseqlen": obsseqlen, } elif modeltype == "constant": obsseqlen = 2 weight_decay = 0.01 modelparams = { "inputsize": inptasksobs.shape[2], "obsseqlen": obsseqlen, } else: raise ValueError("No such model") 
verbose = False reportperiod = 1 # these two parameters control the early stopping # we save the stopcount-th model after the best validation is achived # but keep the model running for burnin longer in case a better # model is attained if splittype == "3participant": stopcount = 3 burnin = 50 elif splittype == "LOOtask": stopcount = 3 burnin = 50 t0 = time.time() bestvalloss = 1e10 modeldir = "savedmodels" for rep in range(1): print("REP", rep) model = initModel(modeltype, modelname, parameters=modelparams) # optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate) #if modeltype == "neural" optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) #optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate, max_iter=10, max_eval=20) #optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate) counter = 0 torch.save(model, os.path.join(modeldir, model.modelname + ".pth")) restartopt = False t = 1 #l2comp = nn.L2Loss() while t < 500: def closure(): N = inptaskspred.shape[0] predtrust = model(inptasksobs, inptasksperf, inptaskspred) predtrust = torch.squeeze(predtrust) # logloss = torch.mean(torch.pow(predtrust - outtrustpred, 2.0)) # / 2*torch.exp(obsnoise)) loss = -( torch.dot(outtrustpred, torch.log(predtrust)) + torch.dot( (1 - outtrustpred), torch.log(1.0 - predtrust))) / N optimizer.zero_grad() loss.backward() return loss optimizer.step(closure) if t % reportperiod == 0: # compute training loss predtrust = model(inptasksobs, inptasksperf, inptaskspred) predtrust = torch.squeeze(predtrust) loss = -(torch.dot(outtrustpred, torch.log(predtrust)) + torch.dot( (1 - outtrustpred), torch.log(1.0 - predtrust)) ) / inptaskspred.shape[0] # compute validation loss predtrust_val = model(inptasksobs_val, inptasksperf_val, inptaskspred_val) predtrust_val = torch.squeeze(predtrust_val) valloss = -(torch.dot( outtrustpred_val, torch.log(predtrust_val)) + torch.dot( (1 - outtrustpred_val), torch.log(1.0 - predtrust_val)) ) / predtrust_val.shape[0] # compute prediction loss predtrust_test = torch.squeeze( model(inptasksobs_test, inptasksperf_test, inptaskspred_test)) predloss = -(torch.dot( outtrustpred_test, torch.log(predtrust_test)) + torch.dot( (1 - outtrustpred_test), torch.log( 1.0 - predtrust_test))) / predtrust_test.shape[0] #print(model.wb, model.wtp, model.trust0, model.sigma0) #check for nans checkval = np.sum(np.array(predtrust_test.data)) if np.isnan(checkval) or np.isinf(checkval): # check if we have already restarted once if restartopt: #we've already done this, fail out. #break out. print("Already restarted once. Quitting") break # reinitialize model and switch optimizer print("NaN value encountered. 
Restarting opt") model = initModel(modeltype, modelname, parameters=modelparams) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) t = 1 counter = 0 restartopt = True else: # print(predtrust_test.data, outtrustpred_test.data) mae = metrics.mean_absolute_error(predtrust_test.data, outtrustpred_test.data) print(t, loss.data[0], valloss.data[0], predloss.data[0], mae) optimizer.zero_grad() # if validation loss has increased for stopcount iterations augname = model.modelname + "_" + str(excludeid) + ".pth" if valloss.data[0] <= bestvalloss: torch.save(model, os.path.join(modeldir, augname)) print(valloss.data[0], bestvalloss, "Model saved") bestvalloss = valloss.data[0] counter = 0 else: if counter < stopcount and (valloss.data[0] - bestvalloss) <= 0.1: torch.save(model, os.path.join(modeldir, augname)) print(valloss.data[0], bestvalloss, "Model saved : POST", counter) counter += 1 if counter >= stopcount and t > burnin: #torch.save(model, modeldir+ model.modelname + ".pth") break t = t + 1 t1 = time.time() print("Total time: ", t1 - t0) model = torch.load( os.path.join(modeldir, modelname + "_" + str(excludeid) + ".pth")) # make predictions using trained model and compute metrics predtrust_test = torch.squeeze( model(inptasksobs_test, inptasksperf_test, inptaskspred_test)) res = np.zeros((predtrust_test.shape[0], 2)) res[:, 0] = predtrust_test.data[:] res[:, 1] = outtrustpred_test.data[:] print(res) mae = metrics.mean_absolute_error(predtrust_test.data, outtrustpred_test.data) predloss = -(torch.dot(outtrustpred_test, torch.log(predtrust_test)) + torch.dot((1 - outtrustpred_test), torch.log(1.0 - predtrust_test))) / \ predtrust_test.shape[0] predloss = predloss.data[0] return (mae, predloss, res)
def regularize_expectation(loss):
    # Weight each element of `loss` by its share of the total loss
    u = loss / loss.sum()
    return torch.dot(loss, u)
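# For a non-negative loss vector this weighting equals sum(loss**2)/sum(loss),
# which upweights the largest elements; a quick check:
import torch

loss = torch.tensor([1.0, 3.0])
print(regularize_expectation(loss))    # tensor(2.5000)
print((loss ** 2).sum() / loss.sum())  # tensor(2.5000)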
def get_att_score(self, dec_output, enc_output):
    score = self.attn(enc_output)  # score : [batch_size, n_hidden]
    return torch.dot(dec_output.view(-1), score.view(-1))  # scalar value
def matmul(tensor1, tensor2, out=None):
    r"""Matrix product of two tensors.

    The behavior depends on the dimensionality of the tensors as follows:

    - If both tensors are 1-dimensional, the dot product (scalar) is returned.
    - If both arguments are 2-dimensional, the matrix-matrix product is returned.
    - If the first argument is 1-dimensional and the second argument is
      2-dimensional, a 1 is prepended to its dimension for the purpose of the
      matrix multiply. After the matrix multiply, the prepended dimension is removed.
    - If the first argument is 2-dimensional and the second argument is
      1-dimensional, the matrix-vector product is returned.
    - If both arguments are at least 1-dimensional and at least one argument is
      N-dimensional (where N > 2), then a batched matrix multiply is returned.
      If the first argument is 1-dimensional, a 1 is prepended to its dimension
      for the purpose of the batched matrix multiply and removed after.
      If the second argument is 1-dimensional, a 1 is appended to its dimension
      for the purpose of the batched matrix multiply and removed after.
      The non-matrix (i.e. batch) dimensions are
      :ref:`broadcasted <broadcasting-semantics>` (and thus must be broadcastable).
      For example, if :attr:`tensor1` is a :math:`(j \times 1 \times n \times m)`
      tensor and :attr:`tensor2` is a :math:`(k \times m \times p)` tensor,
      :attr:`out` will be an :math:`(j \times k \times n \times p)` tensor.

    .. note::

        The 1-dimensional dot product version of this function does not support
        an :attr:`out` parameter.

    Arguments:
        tensor1 (Tensor): the first tensor to be multiplied
        tensor2 (Tensor): the second tensor to be multiplied
        out (Tensor, optional): the output tensor
    """
    dim_tensor1 = tensor1.dim()
    dim_tensor2 = tensor2.dim()
    if dim_tensor1 == 1 and dim_tensor2 == 1:
        if out is None:
            return torch.dot(tensor1, tensor2)
        else:
            raise ValueError("out must be None for 1-d tensor matmul, returns a scalar")
    if dim_tensor1 == 2 and dim_tensor2 == 1:
        if out is None:
            return torch.mv(tensor1, tensor2)
        else:
            return torch.mv(tensor1, tensor2, out=out)
    elif dim_tensor1 == 1 and dim_tensor2 == 2:
        if out is None:
            return torch.mm(tensor1.unsqueeze(0), tensor2).squeeze_(0)
        else:
            return torch.mm(tensor1.unsqueeze(0), tensor2, out=out).squeeze_(0)
    elif dim_tensor1 == 2 and dim_tensor2 == 2:
        if out is None:
            return torch.mm(tensor1, tensor2)
        else:
            return torch.mm(tensor1, tensor2, out=out)
    elif dim_tensor1 >= 3 and (dim_tensor2 == 1 or dim_tensor2 == 2):
        # optimization: use mm instead of bmm by folding tensor1's batch into
        # its leading matrix dimension.
        if dim_tensor2 == 1:
            tensor2 = tensor2.unsqueeze(-1)

        size1 = tensor1.size()
        size2 = tensor2.size()
        output_size = size1[:-1] + size2[-1:]

        # fold the batch into the first dimension
        tensor1 = tensor1.contiguous().view(-1, size1[-1])

        if out is None or not out.is_contiguous():
            output = torch.mm(tensor1, tensor2)
        else:
            output = torch.mm(tensor1, tensor2, out=out)

        output = output.view(output_size)

        if dim_tensor2 == 1:
            output = output.squeeze(-1)

        if out is not None:
            out.set_(output)
            return out

        return output
    elif (dim_tensor1 >= 1 and dim_tensor2 >= 1) and (dim_tensor1 >= 3 or dim_tensor2 >= 3):
        # ensure each tensor size is at least 3-dimensional
        tensor1_exp_size = torch.Size((1,) * max(3 - tensor1.dim(), 0) + tensor1.size())

        # rhs needs to be a separate case since we can't freely expand 1s on
        # the rhs, but can on lhs
        if dim_tensor2 == 1:
            tensor2 = tensor2.unsqueeze(1)
        tensor2_exp_size = torch.Size((1,) * max(3 - tensor2.dim(), 0) + tensor2.size())

        # expand the batch portion (i.e. cut off matrix dimensions and expand rest)
        expand_batch_portion = torch._C._infer_size(tensor1_exp_size[:-2],
                                                    tensor2_exp_size[:-2])

        # flatten expanded batches
        tensor1_expanded = tensor1.expand(*(expand_batch_portion + tensor1_exp_size[-2:])) \
            .contiguous().view(reduce(mul, expand_batch_portion), *tensor1_exp_size[-2:])
        tensor2_expanded = tensor2.expand(*(expand_batch_portion + tensor2_exp_size[-2:])) \
            .contiguous().view(reduce(mul, expand_batch_portion), *tensor2_exp_size[-2:])

        # reshape batches back into result
        total_expansion = expand_batch_portion + (tensor1_exp_size[-2], tensor2_exp_size[-1])

        def maybeSqueeze(tensor):
            if dim_tensor1 == 1:
                return tensor.squeeze(-2)
            elif dim_tensor2 == 1:
                return tensor.squeeze(-1)
            else:
                return tensor

        if out is None or not out.is_contiguous():
            output = torch.bmm(tensor1_expanded, tensor2_expanded)
        else:
            output = torch.bmm(tensor1_expanded, tensor2_expanded, out=out)

        output = maybeSqueeze(output.view(total_expansion))

        if out is not None:
            out.set_(output)
            return out

        return output

    raise ValueError("both arguments to __matmul__ need to be at least 1D, "
                     "but they are {}D and {}D".format(dim_tensor1, dim_tensor2))
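# A few calls exercising the dispatch in matmul above: 1-d x 1-d falls
# through to torch.dot, 2-d x 1-d to torch.mv, and 2-d x 2-d to torch.mm.
import torch

v = torch.tensor([1.0, 2.0])
M = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
print(matmul(v, v))  # tensor(5.) -- dot product
print(matmul(M, v))  # tensor([1., 2.]) -- matrix-vector product
print(matmul(M, M))  # 2x2 identity -- matrix-matrix product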
def regularize_expectation_exp(loss):
    # Softmax weighting: exp(loss_i) / sum_j exp(loss_j)
    u = loss.exp() / loss.exp().sum()
    return torch.dot(loss, u)
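# The weighting above is exactly softmax(loss); torch.softmax is equivalent
# and numerically safer for large loss values:
import torch

loss = torch.tensor([0.0, 1.0, 2.0])
print(regularize_expectation_exp(loss))
print(torch.dot(loss, torch.softmax(loss, dim=0)))  # same value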