Example no. 1
0
def _remove_one_token(instance: Instance,
                      input_field_to_attack: str,
                      grads: np.ndarray,
                      ignore_tokens: List[str]) -> Tuple[Instance, int]:
    """
    Finds the token with the smallest gradient and removes it.
    """
    # Compute L2 norm of all grads.
    grads_mag = [np.sqrt(grad.dot(grad)) for grad in grads]

    # Skip all ignore_tokens by setting grad to infinity
    text_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token_idx, token in enumerate(text_field.tokens):
        if token in ignore_tokens:
            grads_mag[token_idx] = float("inf")

    # For NER, skip all tokens that are not in outside
    if "tags" in instance:
        tag_field: SequenceLabelField = instance["tags"]  # type: ignore
        labels: List[str] = tag_field.labels  # type: ignore
        for idx, label in enumerate(labels):
            if label != "O":
                grads_mag[idx] = float("inf")

    smallest = np.argmin(grads_mag)
    if smallest == float("inf"):  # if all are ignored tokens, return.
        return instance, smallest

    # remove smallest
    inputs_before_smallest = text_field.tokens[0:smallest]
    inputs_after_smallest = text_field.tokens[smallest + 1:]
    text_field.tokens = inputs_before_smallest + inputs_after_smallest

    if "tags" in instance:
        tag_field_before_smallest = tag_field.labels[0:smallest]
        tag_field_after_smallest = tag_field.labels[smallest + 1:]
        tag_field.labels = tag_field_before_smallest + tag_field_after_smallest  # type: ignore
        tag_field.sequence_field = text_field

    instance.indexed = False
    return instance, smallest
 def text_to_instance(self, input_json) -> Instance:
     """Wrap the raw JSON input in an ``Instance`` marked as already indexed."""
     wrapped = Instance(input_json)
     wrapped.indexed = True
     return wrapped
Example no. 3
0
    def attack_instance(
        self,
        instance: Instance,
        inputs: JsonDict,
        input_field_to_attack: str = "tokens",
        grad_input_field: str = "grad_input_1",
        ignore_tokens: List[str] = None,
        target: JsonDict = None,
    ) -> Tuple[List[Token], JsonDict]:
        """
        HotFlip-style attack on a single instance: repeatedly replaces the
        token with the largest gradient norm by a new vocabulary token chosen
        via a first-order Taylor approximation, until the model's prediction
        changes (untargeted) or no longer differs from ``target`` (targeted),
        or every token has been flipped once.

        # Parameters
        instance : the instance to attack; its text field is mutated in place.
        inputs : the original JSON inputs, used to decide which output fields
            to compare for the stopping criterion.
        input_field_to_attack : name of the ``TextField`` whose tokens are
            flipped (default ``"tokens"``).
        grad_input_field : key under which the relevant gradient array is
            found in the dict returned by ``predictor.get_gradients``.
        ignore_tokens : token texts that must never be flipped; defaults to
            ``DEFAULT_IGNORE_TOKENS`` when ``None``.
        target : if given, we move *towards* this output instead of away from
            the current prediction.

        # Returns
        The final token list and the model outputs for the last attacked
        instance.
        """
        # Lazily build the embedding matrix the Taylor step needs.
        if self.embedding_matrix is None:
            self.initialize()

        ignore_tokens = DEFAULT_IGNORE_TOKENS if ignore_tokens is None else ignore_tokens

        # If `target` is `None`, we move away from the current prediction, otherwise we move
        # _towards_ the target.
        sign = -1 if target is None else 1

        # Gets a list of the fields that we want to check to see if they change.
        fields_to_compare = utils.get_fields_to_compare(
            inputs, instance, input_field_to_attack)

        # We'll be modifying the tokens in this text field below, and grabbing the modified
        # list after the `while` loop.
        text_field: TextField = instance[input_field_to_attack]  # type: ignore

        # Because we can save computation by getting grads and outputs at the same time, we do
        # them together at the end of the loop, even though we use grads at the beginning and
        # outputs at the end.  This is our initial gradient for the beginning of the loop.  The
        # output can be ignored here.
        grads, outputs = self.predictor.get_gradients([instance])

        # Ignore any token that is in the ignore_tokens list by setting the token to already
        # flipped.
        flipped: List[int] = []
        for index, token in enumerate(text_field.tokens):
            if token.text in ignore_tokens:
                flipped.append(index)
        if "clusters" in outputs:
            # Coref unfortunately needs a special case here.  We don't want to flip words in
            # the same predicted coref cluster, but we can't really specify a list of tokens,
            # because, e.g., "he" could show up in several different clusters.
            # TODO(mattg): perhaps there's a way to get `predictions_to_labeled_instances` to
            # return the set of tokens that shouldn't be changed for each instance?  E.g., you
            # could imagine setting a field on the `Token` object, that we could then read
            # here...
            for cluster in outputs["clusters"]:
                for mention in cluster:
                    for index in range(mention[0], mention[1] + 1):
                        flipped.append(index)

        while True:
            # Compute L2 norm of all grads.
            grad = grads[grad_input_field][0]
            # NOTE: this is the *squared* norm (no sqrt), which is fine since
            # we only use it to pick the argmax.
            grads_magnitude = [g.dot(g) for g in grad]

            # only flip a token once — flipped tokens get a sentinel value
            # of -1 so argmax never selects them again.
            for index in flipped:
                grads_magnitude[index] = -1

            # We flip the token with highest gradient norm.
            index_of_token_to_flip = numpy.argmax(grads_magnitude)
            if grads_magnitude[index_of_token_to_flip] == -1:
                # If we've already flipped all of the tokens, we give up.
                break
            flipped.append(index_of_token_to_flip)

            # Recover the current vocabulary id of the token we are flipping.
            text_field_tensors = text_field.as_tensor(
                text_field.get_padding_lengths())
            input_tokens = util.get_token_ids_from_text_field_tensors(
                text_field_tensors)
            original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]

            # Get new token using taylor approximation.
            new_id = self._first_order_taylor(grad[index_of_token_to_flip],
                                              original_id_of_token_to_flip,
                                              sign)

            # Flip token.  We need to tell the instance to re-index itself, so the text field
            # will actually update.
            new_token = Token(self.vocab._index_to_token[self.namespace]
                              [new_id])  # type: ignore
            text_field.tokens[index_of_token_to_flip] = new_token
            instance.indexed = False

            # Get model predictions on instance, and then label the instances
            grads, outputs = self.predictor.get_gradients([instance
                                                           ])  # predictions
            # Normalize outputs so the stopping check below sees plain numpy
            # arrays / single-instance results rather than batched tensors.
            for key, output in outputs.items():
                if isinstance(output, torch.Tensor):
                    outputs[key] = output.detach().cpu().numpy().squeeze()
                elif isinstance(output, list):
                    outputs[key] = output[0]

            # TODO(mattg): taking the first result here seems brittle, if we're in a case where
            # there are multiple predictions.
            labeled_instance = self.predictor.predictions_to_labeled_instances(
                instance, outputs)[0]

            # If we've met our stopping criterion, we stop.
            has_changed = utils.instance_has_changed(labeled_instance,
                                                     fields_to_compare)
            if target is None and has_changed:
                # With no target, we just want to change the prediction.
                break
            if target is not None and not has_changed:
                # With a given target, we want to *match* the target, which we check by
                # `not has_changed`.
                break
        return text_field.tokens, outputs