Ejemplo n.º 1
0
    def text_to_instance(self, index: int, field_type: str):  # type: ignore
        """
        Build an Instance that holds one field of each kind used here.

        The text is the fixed sentence "The number is <index> ." and several of
        the other fields (index, adjacency, span) refer back to that text field.
        `field_type` is accepted but not used by this method.
        """
        tokens = [
            Token(word)
            for word in ("The", "number", "is", str(index), ".")
        ]
        sentence = TextField(
            tokens,
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )

        # Fields are created in the same order in which they appear in the
        # resulting mapping.
        fields = {}
        fields["text"] = sentence
        fields["label"] = LabelField(index, skip_indexing=True)
        fields["flag"] = FlagField(23)
        # The index is wrapped around the batch size before it is stored.
        fields["index"] = IndexField(index % self.batch_size, sentence)
        fields["metadata"] = MetadataField(
            {"some_key": "This will not be logged as a histogram."})
        fields["adjacency"] = AdjacencyField([(0, 1), (1, 2)], sentence)
        fields["multilabel"] = MultiLabelField(["l1", "l2"])
        fields["span"] = SpanField(2, 3, sentence)
        fields["tensor"] = TensorField(torch.randn(2, 3))
        return Instance(fields)
Ejemplo n.º 2
0
    def test_batch_tensors_crashes_with_non_uniform_values(self):
        """Batching values that are not all the same must raise ValueError."""
        field = FlagField(True)
        mismatched_batches = (
            [True, False, True],
            [1, 2, 3, 4],
            ["different", "string", "flags"],
        )
        for batch in mismatched_batches:
            with pytest.raises(ValueError):
                field.batch_tensors(batch)
Ejemplo n.º 3
0
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        Create one labeled copy of `instance` for every predicted span.

        The tags in `outputs["tags"]` are scanned from left to right. A tag whose
        first character is "U" is a span on its own. A tag whose first character
        is "B" opens a span that runs up to and including the next tag whose
        first character is "L". For each span, a tag sequence is built that keeps
        the predicted tags inside the span and replaces every other tag with "O".

        If a "B" span is never closed by an "L" tag, the span extends to the last
        tag of the sequence instead of reading past the end of the predictions.

        Parameters:
            instance: The instance to duplicate. Its "tokens" field is the
                sequence the new tags are attached to.
            outputs: Model outputs. Only the "tags" entry is read; it must hold
                one non-empty tag string per token.

        Returns:
            A list with one new instance per span, in the order in which the
            spans start. Each one carries a "tags" field and an
            "ignore_loss_on_o_tags" flag set to True. The list is empty when no
            tag starts with "U" or "B".
        """
        predicted_tags = outputs["tags"]
        num_tags = len(predicted_tags)
        predicted_spans = []

        i = 0
        while i < num_tags:
            tag = predicted_tags[i]
            # if its a U, add it to the list
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if its a B, keep going until you hit an L.
            elif tag[0] == "B":
                begin_idx = i
                # Bound the scan by the sequence length: predictions are not
                # guaranteed to contain a closing L tag, and indexing past the
                # end would raise an IndexError.
                while tag[0] != "L" and i + 1 < num_tags:
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = instance.duplicate()
            text_field: TextField = instance["tokens"]
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
            instances.append(new_instance)

        return instances
Ejemplo n.º 4
0
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:

        ```text
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org
        ```

        We create three instances.

        ```text
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-Loc  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org
        ```

        We additionally add a flag to these instances to tell the model to only compute loss on
        non-O tags, so that we get gradients that are specific to the particular span prediction
        that each instance represents.

        If a "B" tag is never followed by an "L" tag, the span is taken to run up to the
        last token instead of reading past the end of the predicted tags.

        Parameters:
            instance: The instance to copy. Its "tokens" field is the sequence the
                new tags are attached to.
            outputs: Model outputs. Only the "tags" entry is read; it must hold one
                non-empty tag string per token.

        Returns:
            One new instance per predicted span, in the order in which the spans
            start. The list is empty when no tag starts with "U" or "B".
        """
        predicted_tags = outputs["tags"]
        num_tags = len(predicted_tags)
        predicted_spans = []

        i = 0
        while i < num_tags:
            tag = predicted_tags[i]
            # if its a U, add it to the list
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if its a B, keep going until you hit an L.
            elif tag[0] == "B":
                begin_idx = i
                # Bound the scan by the sequence length: predictions are not
                # guaranteed to contain a closing L tag, and indexing past the
                # end would raise an IndexError.
                while tag[0] != "L" and i + 1 < num_tags:
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance["tokens"]  # type: ignore
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
            instances.append(new_instance)

        return instances
Ejemplo n.º 5
0
 def test_get_padding_lengths_returns_nothing(self):
     """A flag field reports an empty mapping of padding lengths."""
     padding_lengths = FlagField(True).get_padding_lengths()
     assert padding_lengths == {}
Ejemplo n.º 6
0
 def test_batch_tensors_returns_single_value(self):
     """Batching five identical flag values yields that one value."""
     expected = True
     flags = [FlagField(expected) for _ in range(5)]

     tensors = []
     for flag in flags:
         tensors.append(flag.as_tensor({}))

     first_flag = flags[0]
     assert first_flag.batch_tensors(tensors) == expected
Ejemplo n.º 7
0
 def test_human_readable_repr(self):
     """The human readable form of a True flag is the object True itself."""
     readable = FlagField(True).human_readable_repr()
     assert readable is True
Ejemplo n.º 8
0
 def test_printing_doesnt_crash(self):
     """Printing a flag field must complete without raising."""
     print(FlagField(True))
Ejemplo n.º 9
0
 def test_as_tensor_just_returns_value(self):
     """as_tensor hands back a value equal to the wrapped one, for several value types."""
     samples = (True, 3.234, "this is a string")
     for expected in samples:
         produced = FlagField(expected).as_tensor({})
         assert produced == expected