def _test_n_k_q_combination(self, n, k, q):
    """Check Matching Network predictions for one (n, k, q) configuration.

    Draws the first batch from a loader driven by ``NShotTaskSampler``,
    builds a softmax "attention" over cosine distances between the query
    rows and the support rows, and asserts that the predictions have shape
    ``(q * k, k)`` with every row summing to 1.

    # Arguments
        n: Value passed as the n-shot setting of the sampler
        k: Value passed as the k-way setting of the sampler
        q: Value passed as the number of queries of the sampler
    """
    task_sampler = NShotTaskSampler(self.dataset, 100, n, k, q)
    n_shot_taskloader = DataLoader(self.dataset, batch_sampler=task_sampler)

    # Only the first task produced by the loader is needed
    for x, y in n_shot_taskloader:
        break

    # The first n * k rows are treated as the support set, the rest as
    # queries; the first column is dropped from both.
    # NOTE(review): the remaining columns are presumably the dummy label
    # features of the test dataset - confirm against the dataset class.
    num_support = n * k
    support = x[:num_support, 1:]
    queries = x[num_support:, 1:]

    # In-place random noise so that no distance is exactly 0
    support.add_(torch.rand_like(support))
    queries.add_(torch.rand_like(queries))

    distances = pairwise_distances(queries, support, 'cosine')

    # "Attention" is a softmax over the negated distances, moved to the GPU
    attention = (-distances).softmax(dim=1).cuda()

    y_pred = matching_net_predictions(attention, n, k, q)

    expected_shape = (q * k, k)
    self.assertEqual(
        y_pred.shape,
        expected_shape,
        'Matching Network predictions must have shape (q * k, k).')

    y_pred_sum = y_pred.sum(dim=1)
    expected_sum = torch.ones_like(y_pred_sum).double()
    rows_sum_to_one = torch.all(torch.isclose(y_pred_sum, expected_sum))
    self.assertTrue(
        rows_sum_to_one,
        'Matching Network predictions probabilities must sum to 1 for each '
        'query sample.')
def proto_net_episode(model: Module,
                      optimiser: Optimizer,
                      loss_fn: Callable,
                      input_ids: torch.Tensor,
                      attention_mask: torch.Tensor,
                      y: torch.Tensor,
                      n_shot: int,
                      k_way: int,
                      q_queries: int,
                      distance: str,
                      train: bool):
    """Run one Prototypical Network episode and optionally update the model.

    # Arguments
        model: Prototypical Network; called as model(input_ids, attention_mask)
        optimiser: Optimiser used for the gradient step when train is True
        loss_fn: Loss applied to (log-probabilities, y). Should be
            cross-entropy
        input_ids: First input forwarded to the model for every sample
        attention_mask: Second input forwarded to the model for every sample
        y: Input labels of few shot classification task
        n_shot: Number of examples per class in the support set
        k_way: Number of classes in the few shot classification task
        q_queries: Number of examples per class in the query set (not read by
            this function; kept for a uniform episode signature)
        distance: Distance metric forwarded to pairwise_distances
        train: Whether (True) or not (False) to perform a parameter update

    # Returns
        loss: Loss of the Prototypical Network on this task
        y_pred: Predicted class probabilities for the query set on this task
    """
    if train:
        model.train()
        # Start the episode from clean gradients
        optimiser.zero_grad()
    else:
        model.eval()

    embeddings = model(input_ids, attention_mask)

    # The first n_shot * k_way embeddings are the support set and everything
    # after them is the query set.
    num_support = n_shot * k_way
    support, queries = embeddings[:num_support], embeddings[num_support:]
    prototypes = compute_prototypes(support, k_way, n_shot)

    # Distance from every query to every class prototype; closer prototypes
    # receive larger scores once negated.
    distances = pairwise_distances(queries, prototypes, distance)
    scores = -distances

    # log p(y = k | x), then the loss against the episode labels
    log_p_y = scores.log_softmax(dim=1)
    loss = loss_fn(log_p_y, y)

    # Class probabilities for the queries
    y_pred = scores.softmax(dim=1)

    if train:
        loss.backward()
        optimiser.step()

    return loss, y_pred
def evaluate_batch(self, x, y):
    """Score one episode batch against class prototypes.

    The batch is embedded with ``self.model``; the first
    ``self.n_shot * self.k_way`` embeddings are used as the support set and
    the remainder as queries.

    # Arguments
        x: Input samples forwarded to self.model
        y: Labels passed to self.loss_fn together with the log-probabilities

    # Returns
        loss: Value of self.loss_fn on the query log-probabilities
        y_pred: Softmax over the negated query-to-prototype distances
        distances: Output of pairwise_distances(queries, prototypes, ...)
        prototypes: Output of compute_prototypes for the support set
        support: Support-set embeddings
        queries: Query-set embeddings
    """
    embeddings = self.model(x)

    num_support = self.n_shot * self.k_way
    support, queries = embeddings[:num_support], embeddings[num_support:]

    prototypes = compute_prototypes(support, self.k_way, self.n_shot)
    distances = pairwise_distances(queries, prototypes, self.distance_metric)

    # Smaller distance means a higher score
    scores = -distances
    loss = self.loss_fn(scores.log_softmax(dim=1), y)
    y_pred = scores.softmax(dim=1)

    return loss, y_pred, distances, prototypes, support, queries
def get_class_fit(support, prototypes, k_way, n_shot, distance_metric):
    """Fit per-class statistics of support-to-prototype probabilities.

    Each support sample's distance to every prototype is squashed with a
    sigmoid of the negated distance. The result is reshaped to
    ``(k_way, n_shot, -1)`` and, for each class index ``i``, the values at
    ``[i, :, i]`` are handed to ``fit``.

    # Arguments
        support: Support-set embeddings
            (presumably ordered as k_way groups of n_shot samples - confirm
            against the sampler)
        prototypes: Class prototypes compared against the support samples
        k_way: Number of classes
        n_shot: Number of support samples per class
        distance_metric: Metric forwarded to pairwise_distances

    # Returns
        mu_stds: List with one [mu, std] pair per class, where the pair is
            whatever two values ``fit`` returns for that class
    """
    squash = nn.Sigmoid()
    support_distances = pairwise_distances(support, prototypes, distance_metric)

    per_class_prob = squash(-support_distances).reshape(k_way, n_shot, -1)
    # Hand plain NumPy data to `fit`
    per_class_prob = per_class_prob.cpu().detach().numpy()

    class_fits = (fit(per_class_prob[i, :, i]) for i in range(k_way))
    return [[pos_mu, pos_std] for pos_mu, pos_std in class_fits]
def autoencoder_episode(model: Module,
                        optimiser: Optimizer,
                        loss_fn: Callable,
                        x: torch.Tensor,
                        y: torch.Tensor,
                        n_shot: int,
                        k_way: int,
                        q_queries: int,
                        distance: str,
                        train: bool):
    """Run one episode of the baseline nearest-neighbour model.

    Queries are scored directly against the individual support embeddings;
    no class prototypes are computed.

    # Arguments
        model: Embedding model, called on the reshaped input
        optimiser: Optimiser used for the gradient step when train is True
        loss_fn: Loss applied to (log-probabilities, y)
        x: Input samples; reshaped to (num_samples, 1, -1) before embedding
        y: Input labels of few shot classification task
        n_shot: Number of examples per class in the support set
        k_way: Number of classes in the few shot classification task
        q_queries: Number of examples per class in the query set (not read by
            this function)
        distance: Distance metric forwarded to pairwise_distances
        train: Whether (True) or not (False) to perform a parameter update

    # Returns
        loss: Loss of the model on this task
        y_pred: Softmax over the negated query-to-support distances
    """
    if train:
        model.train()
        # Start the episode from clean gradients
        optimiser.zero_grad()
    else:
        model.eval()

    num_samples = x.size(0)
    x = x.view(num_samples, 1, -1)

    embeddings = model(x)

    # The first n_shot * k_way embeddings are the support set and everything
    # after them is the query set.
    num_support = n_shot * k_way
    support, queries = embeddings[:num_support], embeddings[num_support:]

    # Distance from every query to every support sample
    distances = pairwise_distances(queries, support, distance)
    scores = -distances

    log_p_y = scores.log_softmax(dim=1)
    loss = loss_fn(log_p_y, y)

    y_pred = scores.softmax(dim=1)

    if train:
        loss.backward()
        optimiser.step()

    return loss, y_pred
def matching_net_episode(model: Module,
                         optimiser,
                         loss_fn: Loss,
                         x: torch.Tensor,
                         y: torch.Tensor,
                         n_shot: int,
                         k_way: int,
                         q_queries: int,
                         distance: str,
                         fce: bool,
                         train: bool):
    """Run one Matching Network episode and optionally update the model.

    # Arguments
        model: Matching Network exposing `encoder`, and `g` / `f` when fce
            is enabled
        optimiser: Optimiser used for the gradient step when train is True
        loss_fn: Loss applied to (log of clipped predictions, y)
        x: Input samples of few shot classification task
        y: Input labels of few shot classification task
        n_shot: Number of examples per class in the support set
        k_way: Number of classes in the few shot classification task
        q_queries: Number of examples per class in the query set
        distance: Distance metric forwarded to pairwise_distances
        fce: Whether or not to use fully conditional embeddings
        train: Whether (True) or not (False) to perform a parameter update

    # Returns
        loss: Loss of the Matching Network on this task
        y_pred: Predicted class probabilities for the query set on this task
            (unclipped)
    """
    if train:
        model.train()
        # Start the episode from clean gradients
        optimiser.zero_grad()
    else:
        model.eval()

    embeddings = model.encoder(x)

    # The first n_shot * k_way embeddings are the support set and everything
    # after them is the query set.
    num_support = n_shot * k_way
    support = embeddings[:num_support]
    queries = embeddings[num_support:]

    if fce:
        # The support set is fed to `g` as a sequence: a singleton dimension
        # is inserted at position 1 before the call and removed again from
        # its first output. Per the original notes, `g` is the bidirectional
        # LSTM with skip connection from appendix A.2 of the paper.
        support_seq = support.unsqueeze(1)
        support, _, _ = model.g(support_seq)
        support = support.squeeze(1)

        # Query embeddings conditioned on the support set (appendix A.1 of
        # the paper, per the original notes)
        queries = model.f(support, queries)

    # Distance from every query to every support sample
    distances = pairwise_distances(queries, support, distance)

    # "Attention" is a softmax over the negated support-query distances
    attention = (-distances).softmax(dim=1)

    # Equation (1) of Matching Networks:
    # y_hat = \sum_{i=1}^{k} a(x_hat, x_i) y_i
    y_pred = matching_net_predictions(attention, n_shot, k_way, q_queries)

    # Keep predictions away from 0 and 1 so the log below stays finite
    log_y_pred = y_pred.clamp(EPSILON, 1 - EPSILON).log()
    loss = loss_fn(log_y_pred, y)

    if not train:
        return loss, y_pred

    loss.backward()
    # Gradient norm is capped at 1 because training was found to be unstable
    clip_grad_norm_(model.parameters(), 1)
    optimiser.step()

    return loss, y_pred
def proto_net_episode(model: Module,
                      optimiser: Optimizer,
                      loss_fn: Callable,
                      x: torch.Tensor,
                      y: torch.Tensor,
                      n_shot: int,
                      k_way: int,
                      q_queries: int,
                      distance: str,
                      train: bool,
                      stnmodel=None,
                      stnoptim=None,
                      args=None,):
    """Run one Prototypical Network episode, optionally with an STN in front.

    When an STN is supplied it transforms the inputs first. During training
    the STN takes its own step on ``-loss + args.stn_reg_coeff *
    stnidentityloss(theta)``, after which the classifier is evaluated again
    on the detached, transformed inputs and takes its own step.

    NOTE(review): the model is run twice per episode even when no STN is
    given or train is False; the first pass is then unused.

    # Arguments
        model: Prototypical Network to be trained.
        optimiser: Optimiser used for the model's gradient step
        loss_fn: Loss applied to (log-probabilities, y). Should be
            cross-entropy
        x: Input samples of few shot classification task
        y: Input labels of few shot classification task
        n_shot: Number of examples per class in the support set
        k_way: Number of classes in the few shot classification task
        q_queries: Number of examples per class in the query set (not read by
            this function)
        distance: Distance metric forwarded to pairwise_distances
        train: Whether (True) or not (False) to perform a parameter update
        stnmodel: Optional STN module; returns (samples, theta, info)
        stnoptim: Optimiser for stnmodel; required when stnmodel is given and
            train is True
        args: Object providing `targetonly` and `stn_reg_coeff`; required
            when stnmodel is given

    # Returns
        loss: Loss of the Prototypical Network on this task
        y_pred: Predicted class probabilities for the query set on this task
        x: The (possibly STN-transformed) inputs, detached from the graph
    """
    use_stn = bool(stnmodel)
    num_support = n_shot * k_way

    def _episode_loss(samples, *metric_args):
        # Embed, build prototypes from the first n_shot * k_way embeddings and
        # score the remaining (query) embeddings against them.
        embeddings = model(samples)
        support = embeddings[:num_support]
        queries = embeddings[num_support:]
        prototypes = compute_prototypes(support, k_way, n_shot)
        dists = pairwise_distances(queries, prototypes, distance, *metric_args)
        log_p_y = (-dists).log_softmax(dim=1)
        return loss_fn(log_p_y, y), dists

    if train:
        model.train()
        optimiser.zero_grad()
        if use_stn:
            stnmodel.train()
            stnoptim.zero_grad()
    else:
        model.eval()
        if use_stn:
            stnmodel.eval()

    # If there is an STN, let it modify the samples first
    theta = None
    if use_stn:
        if args.targetonly:
            # Support and query samples go through the STN separately, with a
            # flag of 1 and 0 respectively.
            # NOTE(review): meaning of the flag is defined by the STN - confirm.
            xsup, thetasup, _ = stnmodel(x[:num_support], 1)
            xtar, thetatar, _ = stnmodel(x[num_support:], 0)
            x = torch.cat([xsup, xtar], 0)
            theta = torch.cat([thetasup, thetatar], 0)
        else:
            x, theta, _ = stnmodel(x)

    # First pass: inputs stay attached to the STN graph, and the model is also
    # handed to pairwise_distances.
    loss, _ = _episode_loss(x, model)

    if use_stn and train:
        # The STN is trained on the negated task loss plus an identity
        # regulariser on theta
        loss = -loss + args.stn_reg_coeff * stnidentityloss(theta)
        loss.backward()
        stnoptim.step()
        # Discard the gradients the STN loss left on the model's parameters
        optimiser.zero_grad()

    # Second pass: the classifier is scored on the detached inputs
    loss, distances = _episode_loss(x.detach())

    # Prediction probabilities are softmax over negated distances
    y_pred = (-distances).softmax(dim=1)

    if train:
        loss.backward()
        optimiser.step()

    return loss, y_pred, x.detach()