Exemple #1
0
 def predictions_to_labeled_instances(
     self, instance: Instance, outputs: Dict[str, numpy.ndarray]
 ) -> List[Instance]:
     """
     Builds one labeled `Instance` per predicted coreference cluster.

     Every returned instance is a duplicate of `instance` with a `span_labels` field in which
     the spans that belong to the cluster are labeled `0` and all other spans `-1`, and with
     the metadata entry `clusters` set to a list holding only that cluster.  When
     `outputs["clusters"]` is empty, a single instance is returned whose spans are all labeled
     `-1` and whose `clusters` metadata is an empty list.
     """
     # Indexing into an Instance loses the field types, which is why the `type: ignore`
     # markers are needed below.
     clusters = outputs["clusters"]
     all_spans: ListField = instance["spans"]  # type: ignore

     def labeled_copy(labels, cluster_list):
         # Duplicate the input and attach the span labels plus the matching cluster metadata.
         copy = instance.duplicate()
         copy.add_field(
             "span_labels", SequenceLabelField(labels, all_spans), self._model.vocab
         )
         copy["metadata"].metadata["clusters"] = cluster_list  # type: ignore
         return copy

     labeled_instances = []
     for cluster in clusters:
         labels = []
         for span in all_spans:
             endpoints = (span.span_start, span.span_end)  # type: ignore
             labels.append(0 if endpoints in cluster else -1)
         labeled_instances.append(labeled_copy(labels, [cluster]))

     if labeled_instances:
         return labeled_instances

     # No predicted clusters; we just give an empty coref prediction.
     return [labeled_copy([-1] * len(all_spans), [])]  # type: ignore
Exemple #2
0
    def labeled_json_to_labeled_instances(
            self, json_dict: JsonDict) -> Dict[int, Instance]:
        """
        Converts labeled JSON entries into indexed instances, keyed by integer index.

        The keys of `json_dict` are strings that parse as integers; entries are processed in
        ascending integer order. The first entry of each sequence supplies the `'words'`
        used to build the shared token field, and the sequence is considered finished once as
        many entries as it has words have been consumed. Every entry yields a duplicate of
        the sequence's base instance with a `'tags'` field built from the entry's `'tags'`,
        indexed against a vocabulary that is filled on the fly.
        """
        vocab = Vocabulary()
        labeled = {}
        sequence_start = 0
        # `None` means "the next entry opens a new sequence".
        sequence_length = None

        for index, key in sorted((int(raw_key), raw_key) for raw_key in json_dict.keys()):
            entry = json_dict[key]
            if sequence_length is None:
                sequence_length = len(entry['words'])
                text_field = TextField(
                    [Token(word['text']) for word in entry['words']], {})
                base_instance = Instance({'tokens': text_field})

            # Position of this entry relative to the start of the current sequence.
            position = index - sequence_start
            categories = [json_to_cat(tag) for tag in entry['tags']]

            current = base_instance.duplicate()
            tags_field = ConstructiveSupertagField(
                categories, text_field, [position])
            vocab.add_tokens_to_namespace(tags_field.labels, 'labels')
            current.add_field('tags', tags_field)
            current.index_fields(vocab)
            labeled[index] = current

            if position + 1 == sequence_length:
                # The sequence is complete; the next entry starts a fresh one.
                sequence_start += sequence_length
                sequence_length = None

        return labeled
 def predictions_to_labeled_instances(
     self, instance: Instance, outputs: Dict[str, numpy.ndarray]
 ) -> List[Instance]:
     """
     Returns a one-element list holding a duplicate of `instance` whose `label` field is the
     index of the largest entry in `outputs["probs"]`.
     """
     labeled = instance.duplicate()
     predicted_index = int(numpy.argmax(outputs["probs"]))
     # The label is already an integer index, so the field must not be indexed again.
     label_field = LabelField(predicted_index, skip_indexing=True)
     labeled.add_field("label", label_field)
     return [labeled]
Exemple #4
0
 def predictions_to_labeled_instances(
     self, instance: Instance, outputs: Dict[str, numpy.ndarray]
 ) -> List[Instance]:
     """
     Returns a one-element list holding a duplicate of `instance` whose `label` field is the
     index of the largest entry in `outputs["probs"]`.
     """
     labeled = instance.duplicate()
     predicted_index = int(numpy.argmax(outputs["probs"]))
     # Indexing is skipped because the label is already the integer representation of a
     # string such as "entailment".
     label_field = LabelField(predicted_index, skip_indexing=True)
     labeled.add_field("label", label_field)
     return [labeled]
Exemple #5
0
 def predictions_to_labeled_instances(
         self, instance: Instance,
         outputs: Dict[str, np.ndarray]) -> List[Instance]:
     """
     Returns a one-element list holding a duplicate of `instance` whose `label` field is the
     index of the largest entry in `outputs["class_probabilities"]`.

     The return annotation is `List[Instance]` because the method returns a list, not a bare
     `Instance`.
     """
     new_instance = instance.duplicate()
     label = np.argmax(outputs["class_probabilities"])
     # The label is already an integer index, so the field must not be indexed again.
     new_instance.add_field("label",
                            LabelField(int(label), skip_indexing=True))
     return [new_instance]
Exemple #6
0
    def predictions_to_labeled_instances(
        self, instance: Instance, outputs: Dict[str, numpy.ndarray]
    ):
        """
        Returns a one-element list holding a duplicate of `instance` with a `target_ids`
        text field added.

        The targets are built from the first element of every entry in
        `outputs["top_tokens"][0]` (presumably the best-scoring candidate for each masked
        position; confirm against the model's output format) and reuse the token indexers of
        the instance's `tokens` field.
        """
        labeled = instance.duplicate()
        tokens: TextField = instance["tokens"]  # type: ignore

        targets = []
        for candidates in outputs["top_tokens"][0]:
            targets.append(Token(candidates[0]))

        target_field = TextField(targets, tokens._token_indexers)
        labeled.add_field("target_ids", target_field, vocab=self._model.vocab)
        return [labeled]
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, np.ndarray]) -> List[Instance]:
        """
        Returns a one-element list holding a duplicate of `instance` with one tag field per
        model key.

        For every `name` in `self._model.all_model_keys`, the argmax over the last axis of
        `outputs[f"{name}_class_probabilities"]` is truncated to the length of the `tokens`
        field and stored as the sequence label field `f"{name}_tags"`.
        """
        labeled = instance.duplicate()
        tokens: TextField = instance["tokens"]

        for task in self._model.all_model_keys:
            class_probabilities = outputs[f"{task}_class_probabilities"]
            best_classes = np.argmax(class_probabilities, axis=-1)
            # Keep only as many predictions as there are tokens.
            tag_ids = best_classes[:len(tokens)].tolist()
            tag_field = SequenceLabelField(tag_ids, tokens)
            labeled.add_field(f"{task}_tags", tag_field, self._model.vocab)

        return [labeled]
Exemple #8
0
    def test_duplicate(self):
        # Regression test: `duplicate()` has to work when a `TextField` carries a
        # `PretrainedTransformerIndexer`. See https://github.com/allenai/allennlp/issues/4270.
        indexers = {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
        original = Instance({"words": TextField([Token("hello")], indexers)})

        copy = original.duplicate()
        assert copy == original

        # A field added to the original afterwards must not affect the duplicate.
        original.add_field("labels", LabelField("some_label"))
        assert "labels" not in copy.fields
        assert copy != original  # sanity check on the '__eq__' method.
Exemple #9
0
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        Returns a one-element list holding a duplicate of `instance` with an `answer_span`
        field added.

        The two indices in `outputs["best_span"]` are shifted by the number of sequence-pair
        start tokens, question tokens and sequence-pair middle tokens, so that the span
        indexes into the `question_with_context` field.
        """
        labeled = instance.duplicate()
        best_start = int(outputs["best_span"][0])
        best_end = int(outputs["best_span"][1])

        # Everything that precedes the context inside `question_with_context`.
        tokenizer = self._dataset_reader._tokenizer
        context_offset = len(tokenizer.sequence_pair_start_tokens)
        context_offset += len(instance["metadata"]["question_tokens"])
        context_offset += len(tokenizer.sequence_pair_mid_tokens)

        labeled.add_field(
            "answer_span",
            SpanField(
                context_offset + best_start,
                context_offset + best_end,
                instance["question_with_context"],
            ),
        )
        return [labeled]
Exemple #10
0
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        Builds one labeled `Instance` per predicted entity span (BIOUL tags only).

        `outputs["tags"]` is scanned for single-token spans (tags starting with `U`) and
        multi-token spans (a tag starting with `B` up to the next tag starting with `L`).
        For every span a duplicate of `instance` is returned whose `tags` field keeps the
        predicted tags inside the span and uses `"O"` everywhere else, together with an
        `ignore_loss_on_o_tags` flag set to `True`.

        A `B` span that is never closed by an `L` tag is treated as running to the last
        token instead of raising an `IndexError`.  An empty list is returned when no span
        is predicted.
        """
        predicted_tags = outputs["tags"]
        num_tags = len(predicted_tags)
        predicted_spans = []

        i = 0
        while i < num_tags:
            tag = predicted_tags[i]
            # if its a U, add it to the list
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if its a B, keep going until you hit an L.
            elif tag[0] == "B":
                begin_idx = i
                # Stop at the last token when the prediction is malformed and the span is
                # never closed; indexing past the end would raise an IndexError.
                while tag[0] != "L" and i + 1 < num_tags:
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = instance.duplicate()
            text_field: TextField = instance["tokens"]
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
            instances.append(new_instance)

        return instances
Exemple #11
0
    def predictions_to_labeled_instances(
        self, instance: Instance, outputs: Dict[str, numpy.ndarray]
    ) -> List[Instance]:
        """
        Returns a one-element list holding a duplicate of `instance` labeled with the
        model's own prediction.

        Two output formats are handled:

        * BiDAF (`"best_span"` in `outputs`): adds `span_start` and `span_end` index fields
          over the `passage` field.
        * NAQANet (`"answer"` in `outputs`): depending on the predicted `answer_type`, adds
          one of `answer_as_counts`, `answer_as_passage_spans`,
          `answer_as_add_sub_expressions` or `answer_as_question_spans`.

        If neither key is present, or the answer type is not one of the four handled ones,
        the duplicate is returned without extra fields.
        """
        labeled = instance.duplicate()

        def word_span_for(token_offsets, char_span):
            # Convert character span indices into word span indices.  No early exit: when
            # several tokens match, the last one wins; an end without a match stays `None`.
            first_word = None
            last_word = None
            for position, offset in enumerate(token_offsets):
                if offset[0] == char_span[0]:
                    first_word = position
                if offset[1] == char_span[1]:
                    last_word = position
            return first_word, last_word

        def add_span_answer(offsets_key, sequence_key, target_name):
            # TODO(mattg): Currently we only handle one predicted span.
            char_span = outputs["answer"]["spans"][0]
            token_offsets = labeled["metadata"].metadata[offsets_key]  # type: ignore
            first_word, last_word = word_span_for(token_offsets, char_span)
            sequence = labeled[sequence_key]  # type: ignore
            span_list = ListField([SpanField(first_word, last_word, sequence)])
            labeled.add_field(target_name, span_list, self._model.vocab)

        # For BiDAF
        if "best_span" in outputs:
            start_index = outputs["best_span"][0]
            end_index = outputs["best_span"][1]
            passage = labeled["passage"]  # type: ignore
            labeled.add_field(
                "span_start", IndexField(int(start_index), passage), self._model.vocab
            )
            labeled.add_field(
                "span_end", IndexField(int(end_index), passage), self._model.vocab
            )
            return [labeled]

        # For NAQANet model. It has the fields: answer_as_passage_spans,
        # answer_as_question_spans, answer_as_add_sub_expressions, answer_as_counts.
        # We need labels for all.
        if "answer" not in outputs:
            return [labeled]

        answer_type = outputs["answer"]["answer_type"]

        if answer_type == "count":
            # When the problem is a counting problem
            count_label = LabelField(int(outputs["answer"]["count"]), skip_indexing=True)
            labeled.add_field("answer_as_counts", ListField([count_label]), self._model.vocab)

        elif answer_type == "passage_span":
            # When the answer is in the passage
            add_span_answer("passage_token_offsets", "passage", "answer_as_passage_spans")

        elif answer_type == "arithmetic":
            # When the answer is an arithmetic calculation: label the sign given to each
            # number the model encountered in the passage.
            predicted_numbers = outputs["answer"]["numbers"]
            numbers_field: ListField = instance["number_indices"]  # type: ignore

            # Negative signs are given the class label 2 (for 0 and 1, the sign matches the
            # label).
            sign_labels = [
                2 if number["sign"] == -1 else number["sign"] for number in predicted_numbers
            ]
            # There's a dummy number added in the dataset reader to handle passages with no
            # numbers; it has a label of 0 (not included).
            sign_labels.append(0)

            expression_list = ListField([SequenceLabelField(sign_labels, numbers_field)])
            labeled.add_field(
                "answer_as_add_sub_expressions", expression_list, self._model.vocab
            )

        elif answer_type == "question_span":
            # When the answer is in the question
            add_span_answer("question_token_offsets", "question", "answer_as_question_spans")

        return [labeled]
Exemple #12
0
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:

        ```text
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org
        ```

        We create three instances.

        ```text
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-Loc  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org
        ```

        We additionally add a flag to these instances to tell the model to only compute loss on
        non-O tags, so that we get gradients that are specific to the particular span prediction
        that each instance represents.

        A `B` span that is never closed by an `L` tag is treated as running to the last token
        instead of raising an `IndexError`.  An empty list is returned when no span is
        predicted.
        """
        predicted_tags = outputs["tags"]
        num_tags = len(predicted_tags)
        predicted_spans = []

        i = 0
        while i < num_tags:
            tag = predicted_tags[i]
            # if its a U, add it to the list
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if its a B, keep going until you hit an L.
            elif tag[0] == "B":
                begin_idx = i
                # Stop at the last token when the prediction is malformed and the span is
                # never closed; indexing past the end would raise an IndexError.
                while tag[0] != "L" and i + 1 < num_tags:
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = instance.duplicate()
            text_field: TextField = instance["tokens"]  # type: ignore
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
            instances.append(new_instance)

        return instances