def test_sanity_no_misaligned_entities(CorpusType: Type[HunerDataset]):
    """Check that entity character offsets align with SciSpaCy token boundaries.

    For every document in the corpus, tokenize the text and verify that each
    entity's start/end offset coincides with some token boundary. A small
    fraction (up to 10%) of misaligned entities is tolerated per document.
    """
    # CorpusType is the class itself, so take __name__ directly;
    # CorpusType.__class__.__name__ would give the metaclass name (e.g. "type").
    dataset_name = CorpusType.__name__.lower()
    base_path = flair.cache_root / "datasets"
    data_folder = base_path / dataset_name

    from flair.tokenization import SciSpacyTokenizer
    tokenizer = SciSpacyTokenizer()

    corpus = CorpusType()
    internal = corpus.to_internal(data_folder)
    for doc_id, doc_text in internal.documents.items():
        misaligned_starts = []
        misaligned_ends = []

        # Collect the character offsets of all token boundaries.
        token_starts = set()
        token_ends = set()
        for token, token_start in zip(*tokenizer.tokenize(doc_text)):
            token_starts.add(token_start)
            token_ends.add(token_start + len(token))

        entities = internal.entities_per_document[doc_id]
        entity_starts = [i.char_span.start for i in entities]
        entity_ends = [i.char_span.stop for i in entities]

        # Compare entity offsets against the TOKEN boundary sets. (The original
        # compared each list against itself, which is trivially always true.)
        for start in entity_starts:
            if start not in token_starts:
                misaligned_starts.append(start)

        for end in entity_ends:
            if end not in token_ends:
                # Originally appended to misaligned_starts, leaving the
                # misaligned_ends assertion below vacuous.
                misaligned_ends.append(end)

        # Allow at most 10% of entities per document to be misaligned.
        assert len(misaligned_starts) <= len(entities) // 10
        assert len(misaligned_ends) <= len(entities) // 10
# ---------------------------------- Example #2 ----------------------------------
def test_scispacy_tokenization():
    """SciSpaCy splits biomedical strings like "HBeAg(+)" at punctuation."""
    from flair.tokenization import SciSpacyTokenizer

    tokenizer = SciSpacyTokenizer()

    # Each case lists the expected (surface form, start offset) pairs.
    expected = [("HBeAg", 0), ("(", 5), ("+", 6), (")", 7), ("patients", 9)]
    tokens = tokenizer.tokenize("HBeAg(+) patients")
    assert len(tokens) == len(expected)
    for token, (text, start) in zip(tokens, expected):
        assert token.text == text
        assert token.start_pos == start

    expected = [
        ("HBeAg", 0),
        ("(", 5),
        ("+", 6),
        (")", 7),
        ("/", 8),
        ("HBsAg", 9),
        ("(", 14),
        ("+", 15),
        (")", 16),
    ]
    tokens = tokenizer.tokenize("HBeAg(+)/HBsAg(+)")
    assert len(tokens) == len(expected)
    for token, (text, start) in zip(tokens, expected):
        assert token.text == text
        assert token.start_pos == start

    # Only the surface forms are checked for this case, not the offsets.
    tokens = tokenizer.tokenize("doxorubicin (DOX)-induced")
    assert len(tokens) == 5
    assert [t.text for t in tokens] == ["doxorubicin", "(", "DOX", ")", "-induced"]
# ---------------------------------- Example #3 ----------------------------------
def test_create_sentence_using_scispacy_tokenizer():
    """Building a Sentence with SciSpacyTokenizer yields the expected tokens and offsets."""
    sentence: Sentence = Sentence(
        "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron",
        use_tokenizer=SciSpacyTokenizer()
    )

    expected_texts = [
        "Spinal", "and", "bulbar", "muscular", "atrophy", "(", "SBMA", ")",
        "is", "an", "inherited", "motor", "neuron",
    ]
    expected_starts = [0, 7, 11, 18, 27, 35, 36, 40, 42, 45, 48, 58, 64]

    assert len(sentence.tokens) == len(expected_texts)
    for token, text, start in zip(sentence.tokens, expected_texts, expected_starts):
        assert token.text == text
        assert token.start_pos == start

    # "(SBMA)" is glued to its neighbours: no whitespace follows "(" or "SBMA".
    assert sentence.tokens[4].whitespace_after == True
    assert sentence.tokens[5].whitespace_after == False
    assert sentence.tokens[6].whitespace_after == False
    assert sentence.tokens[7].whitespace_after == True
# ---------------------------------- Example #4 ----------------------------------
from flair.data import Sentence
from flair.models import MultiTagger
from flair.tokenization import SciSpacyTokenizer

# Tokenizer for biomedical text (SciSpaCy-backed, per the class name — see
# the flair.tokenization docs to confirm the underlying spaCy model).
scp = SciSpacyTokenizer()
# NOTE(review): loading "hunflair" fetches pretrained biomedical NER models;
# this performs network/disk I/O on first use.
tagger = MultiTagger.load("hunflair")
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 22 11:12:40 2020

@author: MAGESHWARAN
"""

# -----------------Template for using hunflair for BioMedical NER--------------
from flair.data import Sentence
from flair.models import MultiTagger
from flair.tokenization import SciSpacyTokenizer

# Build the sentence, tokenizing the biomedical text with SciSpaCy.
scispacy_tokenizer = SciSpacyTokenizer()
sentence = Sentence(
    "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome",
    use_tokenizer=scispacy_tokenizer)

# Load the pretrained biomedical taggers (downloads models on first use).
tagger = MultiTagger.load("hunflair")

# Annotate the sentence in place.
tagger.predict(sentence)

# Show the sentence together with its predicted tags.
print(sentence.to_tagged_string())

# Entities can span several words; iterate the annotated disease spans.
for disease in sentence.get_spans("hunflair-disease"):
    print(disease)

# Can be converted to dictionary, to get additional information