def text_to_instance(self, index: int, field_type: str):  # type: ignore
    field = TextField(
        [Token(t) for t in ["The", "number", "is", str(index), "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    return Instance(
        {
            "text": field,
            "label": LabelField(index, skip_indexing=True),
            "flag": FlagField(23),
            "index": IndexField(index % self.batch_size, field),
            "metadata": MetadataField(
                {"some_key": "This will not be logged as a histogram."}
            ),
            "adjacency": AdjacencyField([(0, 1), (1, 2)], field),
            "multilabel": MultiLabelField(["l1", "l2"]),
            "span": SpanField(2, 3, field),
            "tensor": TensorField(torch.randn(2, 3)),
        }
    )
def test_batch_tensors_crashes_with_non_uniform_values(self):
    field = FlagField(True)
    with pytest.raises(ValueError):
        field.batch_tensors([True, False, True])

    with pytest.raises(ValueError):
        field.batch_tensors([1, 2, 3, 4])

    with pytest.raises(ValueError):
        field.batch_tensors(["different", "string", "flags"])
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    predicted_tags = outputs["tags"]
    predicted_spans = []

    i = 0
    while i < len(predicted_tags):
        tag = predicted_tags[i]
        # If it's a U, the span is a single token; add it to the list.
        if tag[0] == "U":
            current_tags = [
                t if idx == i else "O" for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        # If it's a B, keep going until you hit an L.
        elif tag[0] == "B":
            begin_idx = i
            while tag[0] != "L":
                i += 1
                tag = predicted_tags[i]
            end_idx = i
            current_tags = [
                t if begin_idx <= idx <= end_idx else "O"
                for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        i += 1

    # Create a new instance for each predicted span.
    instances = []
    for labels in predicted_spans:
        new_instance = instance.duplicate()
        text_field: TextField = instance["tokens"]
        new_instance.add_field(
            "tags", SequenceLabelField(labels, text_field), self._model.vocab
        )
        new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
        instances.append(new_instance)
    return instances
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    This function currently only handles BIOUL tags.

    Imagine an NER model predicts three named entities (each one with potentially
    multiple tokens). For each individual entity, we create a new Instance that has
    the label set to only that entity, with the rest of the tokens labeled as outside.
    We then return a list of those Instances.

    For example:

    ```text
    Mary  went  to  Seattle  to  visit  Microsoft  Research
    U-Per  O    O   U-Loc    O   O      B-Org      L-Org
    ```

    We create three instances.

    ```text
    Mary  went  to  Seattle  to  visit  Microsoft  Research
    U-Per  O    O   O        O   O      O          O

    Mary  went  to  Seattle  to  visit  Microsoft  Research
    O      O    O   U-Loc    O   O      O          O

    Mary  went  to  Seattle  to  visit  Microsoft  Research
    O      O    O   O        O   O      B-Org      L-Org
    ```

    We additionally add a flag to these instances to tell the model to only compute
    loss on non-O tags, so that we get gradients that are specific to the particular
    span prediction that each instance represents.
    """
    predicted_tags = outputs["tags"]
    predicted_spans = []

    i = 0
    while i < len(predicted_tags):
        tag = predicted_tags[i]
        # If it's a U, the span is a single token; add it to the list.
        if tag[0] == "U":
            current_tags = [
                t if idx == i else "O" for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        # If it's a B, keep going until you hit an L.
        elif tag[0] == "B":
            begin_idx = i
            while tag[0] != "L":
                i += 1
                tag = predicted_tags[i]
            end_idx = i
            current_tags = [
                t if begin_idx <= idx <= end_idx else "O"
                for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        i += 1

    # Create a new instance for each predicted span.
    instances = []
    for labels in predicted_spans:
        new_instance = deepcopy(instance)
        text_field: TextField = instance["tokens"]  # type: ignore
        new_instance.add_field(
            "tags", SequenceLabelField(labels, text_field), self._model.vocab
        )
        new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
        instances.append(new_instance)
    return instances
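
# A minimal usage sketch, not taken from the original source: `predictor` is
# assumed to be a trained sentence-tagger predictor and `instance` an Instance
# whose tokens sit in a "tokens" TextField; both names (and the exact tag
# sequence) are hypothetical. Given BIOUL tags containing three entities, the
# method above should return one new Instance per span, each carrying its own
# "tags" SequenceLabelField plus the "ignore_loss_on_o_tags" FlagField.
#
# outputs = {"tags": ["U-Per", "O", "O", "U-Loc", "O", "O", "B-Org", "L-Org"]}
# new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
# assert len(new_instances) == 3  # one Instance per predicted entity span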
def test_get_padding_lengths_returns_nothing(self):
    flag_field = FlagField(True)
    assert flag_field.get_padding_lengths() == {}
def test_batch_tensors_returns_single_value(self):
    value = True
    fields = [FlagField(value) for _ in range(5)]
    values = [field.as_tensor({}) for field in fields]
    batched_value = fields[0].batch_tensors(values)
    assert batched_value == value
def test_human_readable_repr(self):
    flag = FlagField(True)
    assert flag.human_readable_repr() is True
def test_printing_doesnt_crash(self):
    flag = FlagField(True)
    print(flag)
def test_as_tensor_just_returns_value(self):
    for value in [True, 3.234, "this is a string"]:
        assert FlagField(value).as_tensor({}) == value