Example #1
from spacy.training import biluo_tags_to_offsets, offsets_to_biluo_tags


def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    # `en_tokenizer` is a pytest fixture from spaCy's test suite that
    # provides a plain English tokenizer
    text = "I flew to Silicon Valley via London."
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
    assert biluo_tags_converted == biluo_tags
    offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
    # keep only entries that actually carry a label
    offsets_converted = [ent for ent in offsets_converted if ent[2]]
    assert offsets_converted == offsets
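Outside spaCy's test suite the `en_tokenizer` fixture is not available. A minimal standalone sketch of the same round trip, assuming spaCy v3 (where both helpers live in `spacy.training`) and a blank English pipeline as the tokenizer:

import spacy
from spacy.training import biluo_tags_to_offsets, offsets_to_biluo_tags

nlp = spacy.blank("en")  # tokenizer-only pipeline, no trained model required
doc = nlp("I flew to Silicon Valley via London.")

tags = offsets_to_biluo_tags(doc, [(10, 24, "LOC"), (29, 35, "GPE")])
print(tags)                              # ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
print(biluo_tags_to_offsets(doc, tags))  # [(10, 24, 'LOC'), (29, 35, 'GPE')]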
Example #2
from typing import Dict, List, Optional

from spacy.tokens import Doc
from spacy.training import biluo_tags_to_offsets

# assumed source of `to_bioul` (AllenNLP's BIO -> BIOUL converter)
from allennlp.data.dataset_readers.dataset_utils.span_utils import to_bioul


def offsets_from_tags(
    doc: Doc,
    tags: List[str],
    label_encoding: Optional[str] = "BIOUL",
    only_token_spans: bool = False,
) -> List[Dict]:
    """Converts BIOUL or BIO tags to offsets

    Parameters
    ----------
    doc
        A spaCy Doc built from the original text with the backbone tokenizer
    tags
        A list of BIOUL or BIO tags
    label_encoding
        The label encoding of the tags: BIOUL or BIO
    only_token_spans
        If True, the returned offsets contain only token indexes, no character offsets. Default is False

    Returns
    -------
    offsets
        A list of dicts with the start and end character/token indexes with respect to the doc and the span label:
        `{"start": int, "end": int, "start_token": int, "end_token": int, "label": str}`
    """
    # spaCy's biluo_tags_to_offsets surprisingly does not check this ...
    if len(doc) != len(tags):
        raise ValueError(
            f"Number of tokens and tags must be the same, "
            f"but len({list(doc)}) != len({tags})"
        )

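    # BIO has no explicit span-end markers, so normalize to BIOUL first,
    # e.g. ["B-LOC", "I-LOC"] -> ["B-LOC", "L-LOC"]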
    if label_encoding == "BIO":
        tags = to_bioul(tags, encoding="BIO")

    offsets = []
    for start, end, label in biluo_tags_to_offsets(doc, tags):
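        # the offsets come from token-aligned tags, so char_span cannot return None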
        span = doc.char_span(start, end)
        data = {
            "start_token": span.start,
            "end_token": span.end,
            "label": label,
        }
        if not only_token_spans:
            data.update({"start": start, "end": end})
        offsets.append(data)
    return offsets
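A hedged usage sketch for `offsets_from_tags`, reusing the sentence from Example #1 with plain BIO tags and a blank English pipeline; the expected output follows from the conversion shown above:

import spacy

nlp = spacy.blank("en")
doc = nlp("I flew to Silicon Valley via London.")
bio_tags = ["O", "O", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"]

print(offsets_from_tags(doc, bio_tags, label_encoding="BIO"))
# [{'start_token': 3, 'end_token': 5, 'label': 'LOC', 'start': 10, 'end': 24},
#  {'start_token': 6, 'end_token': 7, 'label': 'GPE', 'start': 29, 'end': 35}]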