Example #1
def test_scispacy_tokenization():
    from flair.tokenization import SciSpacyTokenizer

    tokenizer = SciSpacyTokenizer()

    tokens = tokenizer.tokenize("HBeAg(+) patients")

    assert len(tokens) == 5
    assert tokens[0].text == "HBeAg"
    assert tokens[0].start_pos == 0
    assert tokens[1].text == "("
    assert tokens[1].start_pos == 5
    assert tokens[2].text == "+"
    assert tokens[2].start_pos == 6
    assert tokens[3].text == ")"
    assert tokens[3].start_pos == 7
    assert tokens[4].text == "patients"
    assert tokens[4].start_pos == 9

    tokens = tokenizer.tokenize("HBeAg(+)/HBsAg(+)")

    assert len(tokens) == 9

    assert tokens[0].text == "HBeAg"
    assert tokens[0].start_pos == 0
    assert tokens[1].text == "("
    assert tokens[1].start_pos == 5
    assert tokens[2].text == "+"
    assert tokens[2].start_pos == 6
    assert tokens[3].text == ")"
    assert tokens[3].start_pos == 7
    assert tokens[4].text == "/"
    assert tokens[4].start_pos == 8
    assert tokens[5].text == "HBsAg"
    assert tokens[5].start_pos == 9
    assert tokens[6].text == "("
    assert tokens[6].start_pos == 14
    assert tokens[7].text == "+"
    assert tokens[7].start_pos == 15
    assert tokens[8].text == ")"
    assert tokens[8].start_pos == 16

    tokens = tokenizer.tokenize("doxorubicin (DOX)-induced")

    assert len(tokens) == 5
    assert tokens[0].text == "doxorubicin"
    assert tokens[1].text == "("
    assert tokens[2].text == "DOX"
    assert tokens[3].text == ")"
    assert tokens[4].text == "-induced"
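
The same tokenizer can also be handed to a flair Sentence directly. The sketch below is a minimal usage example, assuming a flair version in which SciSpacyTokenizer lives in flair.tokenization and the scispacy package plus the en_core_sci_sm model are installed; exact Token attribute names (e.g. start_pos) may differ between flair releases.

# Minimal usage sketch (assumes scispacy and en_core_sci_sm are installed).
from flair.data import Sentence
from flair.tokenization import SciSpacyTokenizer

sentence = Sentence("HBeAg(+) patients", use_tokenizer=SciSpacyTokenizer())
for token in sentence:
    print(token.text, token.start_pos)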
Example #2
from typing import Type

import flair
from flair.datasets.biomedical import HunerDataset


def test_sanity_no_misaligned_entities(CorpusType: Type[HunerDataset]):
    # CorpusType is already a class, so use __name__ directly (not __class__.__name__).
    dataset_name = CorpusType.__name__.lower()
    base_path = flair.cache_root / "datasets"
    data_folder = base_path / dataset_name

    from flair.tokenization import SciSpacyTokenizer
    tokenizer = SciSpacyTokenizer()

    corpus = CorpusType()
    internal = corpus.to_internal(data_folder)
    for doc_id, doc_text in internal.documents.items():
        misaligned_starts = []
        misaligned_ends = []

        token_starts = set()
        token_ends = set()
        # Collect the character offsets of every token boundary produced by the tokenizer.
        for token in tokenizer.tokenize(doc_text):
            token_starts.add(token.start_pos)
            token_ends.add(token.start_pos + len(token.text))

        entities = internal.entities_per_document[doc_id]
        entity_starts = [i.char_span.start for i in entities]
        entity_ends = [i.char_span.stop for i in entities]

        # An entity boundary is misaligned if it does not coincide with any token boundary.
        for start in entity_starts:
            if start not in token_starts:
                misaligned_starts.append(start)

        for end in entity_ends:
            if end not in token_ends:
                misaligned_ends.append(end)

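        # Tolerate at most 10% of entities with misaligned boundaries (tokenization quirks).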
        assert len(misaligned_starts) <= len(entities) // 10
        assert len(misaligned_ends) <= len(entities) // 10
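
The CorpusType argument suggests this check is meant to be parametrized over HunerDataset subclasses with pytest. The snippet below is only a sketch of one possible wiring; the corpus classes HUNER_GENE_VARIOME and HUNER_CHEMICAL_CHEBI are examples and may not match the HUNER_* classes shipped by your flair version.

# Hypothetical parametrization sketch; swap in whatever HUNER_* corpora your flair version provides.
import pytest
from flair.datasets.biomedical import HUNER_CHEMICAL_CHEBI, HUNER_GENE_VARIOME

@pytest.mark.parametrize("CorpusType", [HUNER_GENE_VARIOME, HUNER_CHEMICAL_CHEBI])
def test_sanity_no_misaligned_entities(CorpusType):
    ...  # body as in Example #2 above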