from typing import List, Tuple

import numpy as np
import torch

from allennlp.common.util import JsonDict
from allennlp.data import Instance, Token
from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.interpret.attackers import utils
from allennlp.interpret.attackers.attacker import DEFAULT_IGNORE_TOKENS
from allennlp.nn import util


def _remove_one_token(
    instance: Instance,
    input_field_to_attack: str,
    grads: np.ndarray,
    ignore_tokens: List[str],
) -> Tuple[Instance, int]:
    """
    Finds the token with the smallest gradient and removes it.
    """
    # Compute the L2 norm of the gradient for every token.
    grads_mag = [np.sqrt(grad.dot(grad)) for grad in grads]

    # Skip all ignore_tokens by setting their gradient norm to infinity.
    text_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token_idx, token in enumerate(text_field.tokens):
        if token.text in ignore_tokens:
            grads_mag[token_idx] = float("inf")

    # For NER, skip all tokens that are not tagged "O" (outside any entity), so
    # the tokens being predicted are never removed.
    if "tags" in instance:
        tag_field: SequenceLabelField = instance["tags"]  # type: ignore
        labels: List[str] = tag_field.labels  # type: ignore
        for idx, label in enumerate(labels):
            if label != "O":
                grads_mag[idx] = float("inf")

    smallest = np.argmin(grads_mag)
    if grads_mag[smallest] == float("inf"):
        # Every remaining token is ignored, so there is nothing left to remove.
        return instance, smallest

    # Remove the token with the smallest gradient norm.
    inputs_before_smallest = text_field.tokens[0:smallest]
    inputs_after_smallest = text_field.tokens[smallest + 1:]
    text_field.tokens = inputs_before_smallest + inputs_after_smallest

    # Keep the tag sequence aligned with the shortened token sequence.
    if "tags" in instance:
        tag_field_before_smallest = tag_field.labels[0:smallest]
        tag_field_after_smallest = tag_field.labels[smallest + 1:]
        tag_field.labels = tag_field_before_smallest + tag_field_after_smallest  # type: ignore
        tag_field.sequence_field = text_field

    instance.indexed = False
    return instance, smallest
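# Usage sketch (an illustration, not part of the original module): drive input
# reduction by repeatedly removing the lowest-gradient token until the model's
# prediction changes.  It assumes an AllenNLP `Predictor` whose `get_gradients`
# returns a (gradients, outputs) pair keyed by "grad_input_1", and that
# `outputs["label"]` identifies the prediction; both are assumptions here.
def _example_reduce_input(predictor, instance: Instance) -> List[Token]:
    grads, outputs = predictor.get_gradients([instance])
    original_label = outputs["label"]
    text_field: TextField = instance["tokens"]  # type: ignore
    while len(text_field.tokens) > 1:
        old_length = len(text_field.tokens)
        instance, _ = _remove_one_token(
            instance, "tokens", grads["grad_input_1"][0], ignore_tokens=DEFAULT_IGNORE_TOKENS
        )
        if len(text_field.tokens) == old_length:
            break  # Everything left is ignored; nothing more can be removed.
        grads, outputs = predictor.get_gradients([instance])
        if outputs["label"] != original_label:
            break  # The reduced input no longer yields the original prediction.
    return text_field.tokens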
def text_to_instance(self, input_json) -> Instance:
    # A minimal stub: the raw JSON is wrapped directly in an `Instance` and
    # marked as already indexed, so no re-indexing will be attempted.
    instance = Instance(input_json)
    instance.indexed = True
    return instance
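# For contrast with the stub above, a more typical `text_to_instance` (a
# sketch, not part of the original module) tokenizes raw text and wraps it in
# fields before building the `Instance`.  The whitespace tokenizer and
# single-id indexer are illustrative choices, not requirements.
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer


def _example_text_to_instance(sentence: str) -> Instance:
    tokens = WhitespaceTokenizer().tokenize(sentence)
    text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
    return Instance({"tokens": text_field})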
def attack_instance(
    self,
    instance: Instance,
    inputs: JsonDict,
    input_field_to_attack: str = "tokens",
    grad_input_field: str = "grad_input_1",
    ignore_tokens: List[str] = None,
    target: JsonDict = None,
) -> Tuple[List[Token], JsonDict]:
    if self.embedding_matrix is None:
        self.initialize()
    ignore_tokens = DEFAULT_IGNORE_TOKENS if ignore_tokens is None else ignore_tokens

    # If `target` is `None`, we move away from the current prediction; otherwise we move
    # _towards_ the target.
    sign = -1 if target is None else 1

    # Get a list of the fields that we want to check to see if they change.
    fields_to_compare = utils.get_fields_to_compare(inputs, instance, input_field_to_attack)

    # We'll be modifying the tokens in this text field below, and grabbing the modified
    # list after the `while` loop.
    text_field: TextField = instance[input_field_to_attack]  # type: ignore

    # Because we can save computation by getting grads and outputs at the same time, we do
    # them together at the end of the loop, even though we use grads at the beginning and
    # outputs at the end.  This is our initial gradient for the beginning of the loop; the
    # output can be ignored here.
    grads, outputs = self.predictor.get_gradients([instance])

    # Ignore any token that is in the ignore_tokens list by marking it as already flipped.
    flipped: List[int] = []
    for index, token in enumerate(text_field.tokens):
        if token.text in ignore_tokens:
            flipped.append(index)

    if "clusters" in outputs:
        # Coref unfortunately needs a special case here.  We don't want to flip words in
        # the same predicted coref cluster, but we can't really specify a list of tokens,
        # because, e.g., "he" could show up in several different clusters.
        # TODO(mattg): perhaps there's a way to get `predictions_to_labeled_instances` to
        # return the set of tokens that shouldn't be changed for each instance?  E.g., you
        # could imagine setting a field on the `Token` object that we could then read here...
        for cluster in outputs["clusters"]:
            for mention in cluster:
                for index in range(mention[0], mention[1] + 1):
                    flipped.append(index)

    while True:
        # Compute the (squared) L2 norm of the gradient for every token; the ordering is
        # the same as for the true L2 norm, so the square root is unnecessary.
        grad = grads[grad_input_field][0]
        grads_magnitude = [g.dot(g) for g in grad]

        # Only flip a token once.
        for index in flipped:
            grads_magnitude[index] = -1

        # We flip the token with the highest gradient norm.
        index_of_token_to_flip = np.argmax(grads_magnitude)
        if grads_magnitude[index_of_token_to_flip] == -1:
            # We've already flipped all of the tokens, so we give up.
            break
        flipped.append(index_of_token_to_flip)

        text_field_tensors = text_field.as_tensor(text_field.get_padding_lengths())
        input_tokens = util.get_token_ids_from_text_field_tensors(text_field_tensors)
        original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]

        # Get the replacement token id using the first-order Taylor approximation.
        new_id = self._first_order_taylor(
            grad[index_of_token_to_flip], original_id_of_token_to_flip, sign
        )
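        # A note on the step above (a summary of `_first_order_taylor`, not new
        # behavior): each candidate word w is scored by the first-order Taylor
        # approximation of the loss change from swapping the original embedding
        # e_orig for e_w, i.e. (e_w - e_orig) . grad.  With sign = -1 (no target)
        # the chosen word is the one that most increases the loss on the current
        # prediction; with sign = +1 it is the one that most decreases the loss
        # with respect to the target.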
        # Flip the token.  We need to tell the instance to re-index itself, so the text
        # field will actually update.
        new_token = Token(self.vocab._index_to_token[self.namespace][new_id])  # type: ignore
        text_field.tokens[index_of_token_to_flip] = new_token
        instance.indexed = False

        # Get model predictions on the modified instance, then label it.
        grads, outputs = self.predictor.get_gradients([instance])  # predictions
        for key, output in outputs.items():
            if isinstance(output, torch.Tensor):
                outputs[key] = output.detach().cpu().numpy().squeeze()
            elif isinstance(output, list):
                outputs[key] = output[0]

        # TODO(mattg): taking the first result here seems brittle, if we're in a case
        # where there are multiple predictions.
        labeled_instance = self.predictor.predictions_to_labeled_instances(instance, outputs)[0]

        # If we've met our stopping criterion, we stop.
        has_changed = utils.instance_has_changed(labeled_instance, fields_to_compare)
        if target is None and has_changed:
            # With no target, we just want to change the prediction.
            break
        if target is not None and not has_changed:
            # With a given target, we want to *match* the target, which we check by
            # `not has_changed`.
            break

    return text_field.tokens, outputs
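# Usage sketch (illustrative, not part of the module): running the attack
# end-to-end.  The archive path, the registered predictor name, and the
# attacker class name `Hotflip` are assumptions about the surrounding code.
#
#     from allennlp.models.archival import load_archive
#     from allennlp.predictors import Predictor
#
#     archive = load_archive("model.tar.gz")  # placeholder path
#     predictor = Predictor.from_archive(archive, "text_classifier")
#     attacker = Hotflip(predictor)
#     inputs = {"sentence": "this movie was surprisingly good"}
#     instance = predictor._json_to_instance(inputs)
#     final_tokens, outputs = attacker.attack_instance(instance, inputs)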