def get_log_probs_at_index(self, text_list, word_index):
    """Gets the probability of the word at index `word_index` according to GPT-2.

    Assumes that all items in `text_list` have the same prefix up until `word_index`.
    """
    prefix = text_list[0].text_until_word_index(word_index)

    if not utils.has_letter(prefix):
        # This language model perplexity is not defined with respect to
        # a word without a prefix. If the prefix is null, just return the
        # log-probability 0.0.
        return torch.zeros(len(text_list), dtype=torch.float)

    token_ids = self.tokenizer.encode(prefix)
    tokens_tensor = torch.tensor([token_ids])
    tokens_tensor = tokens_tensor.to(utils.device)

    with torch.no_grad():
        outputs = self.model(tokens_tensor)
    predictions = outputs[0]

    probs = []
    for attacked_text in text_list:
        next_word_ids = self.tokenizer.encode(attacked_text.words[word_index])
        next_word_prob = predictions[0, -1, next_word_ids[0]]
        probs.append(next_word_prob)
    return probs
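# A minimal, self-contained sketch of the same idea using the Hugging Face
# transformers API directly (not part of the class above): score candidate
# next words given a shared prefix. The model name, prefix, and candidate
# words below are illustrative assumptions, not values from this codebase.
#
#     import torch
#     from transformers import GPT2LMHeadModel, GPT2Tokenizer
#
#     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#     model = GPT2LMHeadModel.from_pretrained("gpt2")
#     model.eval()
#
#     prefix = "The quick brown"
#     candidates = ["fox", "dog", "idea"]
#
#     input_ids = torch.tensor([tokenizer.encode(prefix)])
#     with torch.no_grad():
#         logits = model(input_ids)[0]          # shape: (1, seq_len, vocab_size)
#     log_probs = torch.log_softmax(logits[0, -1], dim=-1)
#
#     for word in candidates:
#         # Like the method above, score only the first sub-token of each candidate.
#         first_id = tokenizer.encode(word)[0]
#         print(word, log_probs[first_id].item())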
def _get_replacement_words_by_grad(self, text, indices_to_replace):
    """Returns a list containing all possible words to replace `word` with,
    based off of the model's gradient.

    Arguments:
        text (TokenizedText): The full text input to perturb
        indices_to_replace (list[int]): indices of the words to replace
    """
    self.model.train()

    lookup_table = self.model.lookup_table.to(utils.get_device())
    lookup_table_transpose = lookup_table.transpose(0, 1)

    # Set backward hook on the word embeddings for input x.
    emb_hook = Hook(self.model.word_embeddings, backward=True)

    self.model.zero_grad()
    predictions = self._call_model(text)
    original_label = predictions.argmax()
    y_true = torch.Tensor([original_label]).long().to(utils.get_device())
    loss = self.loss(predictions, y_true)
    loss.backward()

    # Gradient of the loss w.r.t. the word embeddings.
    emb_grad = emb_hook.output[0].to(utils.get_device()).squeeze()

    # Grad differences between all flips and the original word (eq. 1 from paper).
    vocab_size = lookup_table.size(0)
    diffs = torch.zeros(len(indices_to_replace), vocab_size)
    indices_to_replace = list(indices_to_replace)
    for j, word_idx in enumerate(indices_to_replace):
        # Get the grad w.r.t the one-hot index of the word.
        b_grads = emb_grad[word_idx].view(1, -1).mm(lookup_table_transpose).squeeze()
        a_grad = b_grads[text.ids[0][word_idx]]
        diffs[j] = b_grads - a_grad

    # Don't change to the pad token.
    diffs[:, self.model.tokenizer.pad_id] = float("-inf")

    # Find the best indices within the 2-d tensor by flattening.
    word_idxs_sorted_by_grad = (-diffs).flatten().argsort()

    candidates = []
    num_words_in_text, num_words_in_vocab = diffs.shape
    for idx in word_idxs_sorted_by_grad.tolist():
        idx_in_diffs = idx // num_words_in_vocab
        idx_in_vocab = idx % num_words_in_vocab
        idx_in_sentence = indices_to_replace[idx_in_diffs]
        word = self.model.tokenizer.convert_id_to_word(idx_in_vocab)
        if not utils.has_letter(word):
            # Skip candidates that contain no letters (e.g. punctuation or numbers).
            continue
        candidates.append((word, idx_in_sentence))
        if len(candidates) == self.top_n:
            break

    self.model.eval()
    return candidates
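# A minimal sketch of the first-order swap score from eq. (1): for a word at
# position i with embedding gradient g_i, the estimated loss increase from
# flipping to vocab word v is (e_v - e_orig) . g_i, computed for all v at once
# as (E @ g_i) - (E @ g_i)[orig_id]. The sizes and values below are toy
# assumptions, not taken from the real model.
#
#     import torch
#
#     vocab_size, emb_dim = 1000, 64                    # toy sizes
#     lookup_table = torch.randn(vocab_size, emb_dim)   # E: embedding matrix
#     emb_grad = torch.randn(emb_dim)                   # g_i: grad of loss w.r.t. embedding at position i
#     orig_id = 42                                      # vocab id of the original word
#
#     b_grads = lookup_table.mv(emb_grad)   # E @ g_i, one score per vocab word
#     diffs = b_grads - b_grads[orig_id]    # subtract the original word's score
#     best_ids = (-diffs).argsort()[:10]    # the 10 most loss-increasing swaps
#     print(best_ids)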
def _get_replacement_words_by_grad(self, attacked_text, indices_to_replace):
    """Returns a list containing all possible words to replace `word` with,
    based off of the model's gradient.

    Arguments:
        attacked_text (AttackedText): The full text input to perturb
        indices_to_replace (list[int]): indices of the words to replace
    """
    lookup_table = self.model.get_input_embeddings().weight.data.cpu()

    grad_output = self.model_wrapper.get_grad(attacked_text.tokenizer_input)
    emb_grad = torch.tensor(grad_output["gradient"])
    text_ids = grad_output["ids"]

    # Grad differences between all flips and the original word (eq. 1 from paper).
    vocab_size = lookup_table.size(0)
    diffs = torch.zeros(len(indices_to_replace), vocab_size)
    indices_to_replace = list(indices_to_replace)
    for j, word_idx in enumerate(indices_to_replace):
        # Make sure the word is in bounds.
        if word_idx >= len(emb_grad):
            continue
        # Get the grad w.r.t the one-hot index of the word.
        b_grads = lookup_table.mv(emb_grad[word_idx]).squeeze()
        a_grad = b_grads[text_ids[word_idx]]
        diffs[j] = b_grads - a_grad

    # Don't change to the pad token.
    diffs[:, self.tokenizer.pad_token_id] = float("-inf")

    # Find the best indices within the 2-d tensor by flattening.
    word_idxs_sorted_by_grad = (-diffs).flatten().argsort()

    candidates = []
    num_words_in_text, num_words_in_vocab = diffs.shape
    for idx in word_idxs_sorted_by_grad.tolist():
        idx_in_diffs = idx // num_words_in_vocab
        idx_in_vocab = idx % num_words_in_vocab
        idx_in_sentence = indices_to_replace[idx_in_diffs]
        word = self.tokenizer.convert_id_to_word(idx_in_vocab)
        if (not utils.has_letter(word)) or (len(utils.words_from_text(word)) != 1):
            # Skip candidates with no letters, or that split into multiple words.
            continue
        candidates.append((word, idx_in_sentence))
        if len(candidates) == self.top_n:
            break

    return candidates
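# A small sketch of how the flattened argsort indices map back to
# (position-in-text, vocab-id) pairs, as done in the candidate loop above.
# The 2 x 3 `diffs` matrix holds toy values for illustration only.
#
#     import torch
#
#     diffs = torch.tensor([[0.1, 0.9, 0.3],
#                           [0.7, 0.2, 0.8]])   # 2 positions x 3 vocab words
#     num_positions, vocab_size = diffs.shape
#     for flat_idx in (-diffs).flatten().argsort().tolist():
#         pos = flat_idx // vocab_size          # which word position in the text
#         vocab_id = flat_idx % vocab_size      # which vocabulary word to swap in
#         print(pos, vocab_id, diffs[pos, vocab_id].item())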