Ejemplo n.º 1
0
def test_chexpert_extractor():
    """Check that the extractor reproduces the annotations stored on disk.

    The example collection is loaded, the passage annotations are stripped
    from a deep copy of every document, the extractor is re-run on that
    stripped copy, and the regenerated passage-level annotations are compared
    with the ones that came from the file: same count, same ``total_span``
    and same ``observation`` / ``annotator`` infons.
    """
    extractor = RegExExtractor(
        __tests_dir / 'data/patterns/chexpert_phrases.yml', 'CheXpert labeler')

    example_dir = get_example_dir()
    with open(example_dir / '1.chexpert.xml') as fp:
        collection = bioc.load(fp)

    def passage_annotations(doc):
        # Sort by offset so both sides are compared in the same order.
        return sorted(bioc.annotations(doc, bioc.PASSAGE),
                      key=lambda ann: ann.total_span.offset)

    for gold_doc in collection.documents:
        # Work on a copy so the annotations loaded from disk stay intact and
        # can serve as the reference.
        predicted_doc = copy.deepcopy(gold_doc)
        for passage in predicted_doc.passages:
            del passage.annotations[:]
        extractor(predicted_doc)

        predicted_anns = passage_annotations(predicted_doc)
        gold_anns = passage_annotations(gold_doc)

        # Message order: extractor output first, annotations from file second.
        assert len(predicted_anns) == len(gold_anns), \
            '{} vs {}'.format(len(predicted_anns), len(gold_anns))
        for predicted_ann, gold_ann in zip(predicted_anns, gold_anns):
            assert predicted_ann.total_span == gold_ann.total_span
            for key in ('observation', 'annotator'):
                assert predicted_ann.infons[key] == gold_ann.infons[key]
Ejemplo n.º 2
0
def test_annotations():
    """Check what ``bioc.annotations`` yields for each level of ``collection``.

    Every yielded item is expected to expose the annotation together with the
    document / passage / sentence it was found in; a container that does not
    apply at the requested level is expected to be ``None``.
    """
    # No explicit level: annotations '1' through '5' are all reported.
    found = list(bioc.annotations(collection))
    assert {item.annotation.id for item in found} == {'1', '2', '3', '4', '5'}

    # Document level: only annotation '5', attached to document '1'.
    found = list(bioc.annotations(collection, level=DOCUMENT))
    assert {item.annotation.id for item in found} == {'5'}
    assert {item.document.id for item in found} == {'1'}
    assert {item.passage for item in found} == {None}

    # Passage level: annotations '1' and '2' in the passage at offset 0.
    found = list(bioc.annotations(collection, level=PASSAGE))
    assert {item.annotation.id for item in found} == {'1', '2'}
    assert {item.document.id for item in found} == {'1'}
    assert {item.passage.offset for item in found} == {0}
    assert {item.sentence for item in found} == {None}

    # Sentence level: annotations '3' and '4' from document '2'.
    found = list(bioc.annotations(collection, level=SENTENCE))
    assert {item.annotation.id for item in found} == {'3', '4'}
    assert {item.document.id for item in found} == {'2'}
    assert {item.passage.offset for item in found} == {27}
    assert {item.sentence.offset for item in found} == {27, 34}

    # The first document yields nothing at sentence level.
    found = list(bioc.annotations(collection.documents[0], level=SENTENCE))
    assert not found

    # Anything that is not a BioC object is rejected.
    with pytest.raises(TypeError):
        next(bioc.annotations('Foo'))
Ejemplo n.º 3
0
def test_sibling_intext_citations(table_article):
    """Check passage text and the citation annotation of the parsed article.

    The ``table_article`` fixture text is converted with ``docs2bioc`` (format
    ``'pmcxml'``, sentence trimming disabled). At least one passage must
    contain the PyMOL sentence, and one annotation must carry the combined
    citation text ``[14],[16],[23]–[25]``.
    """
    passages = []
    annotations = []

    source = StringIO(table_article)
    for doc in docs2bioc(source, 'pmcxml', trim_sentences=False):
        passages += doc.passages
        annotations += bioc.annotations(doc)

    # Evaluate every passage eagerly before reducing with any().
    sentence_hits = [
        'inspected using the graphics program PyMOL.' in passage.text
        for passage in passages
    ]
    assert any(sentence_hits)

    citation_texts = [ann.infons['citation_text'] for ann in annotations]
    assert '[14],[16],[23]\u2013[25]' in citation_texts
Ejemplo n.º 4
0
def test_annotations():
    """Check ``bioc.annotations`` at each level and for invalid arguments.

    Covers the ids returned for the default, sentence and document levels of
    ``collection``, the sentence-level ids of the second document, and the
    errors raised for a non-BioC object (``TypeError``) and for levels that
    are invalid for the given object (``ValueError``).
    """
    # Default level.
    annotations = list(bioc.annotations(collection))
    assert 2 == len(annotations)
    assert '1' == annotations[0].id
    assert '2' == annotations[1].id

    annotations = list(bioc.annotations(collection, level=bioc.SENTENCE))
    assert 2 == len(annotations)
    assert '3' == annotations[0].id
    assert '4' == annotations[1].id

    annotations = list(bioc.annotations(collection, level=bioc.DOCUMENT))
    assert 1 == len(annotations)
    assert '5' == annotations[0].id

    # A single document can be queried as well.
    annotations = list(
        bioc.annotations(collection.documents[1], level=bioc.SENTENCE))
    assert 2 == len(annotations)
    assert '4' == annotations[1].id

    with pytest.raises(TypeError):
        next(bioc.annotations('Foo'))

    with pytest.raises(ValueError):
        next(bioc.annotations(collection, level=-1))

    # A passage cannot be queried at document level.
    with pytest.raises(ValueError):
        next(
            bioc.annotations(collection.documents[0].passages[0],
                             level=bioc.DOCUMENT))

    # A sentence cannot be queried at document or passage level. Each call
    # gets its own ``pytest.raises`` block: the context manager exits on the
    # first exception, so a second call inside the same block would never
    # be executed.
    sentence = collection.documents[1].passages[0].sentences[0]
    with pytest.raises(ValueError):
        next(bioc.annotations(sentence, level=bioc.DOCUMENT))

    with pytest.raises(ValueError):
        next(bioc.annotations(sentence, level=bioc.PASSAGE))
Ejemplo n.º 5
0
 def __extract_tags(self, tagged_article, document, wanted_tags):
     """
     Extract the words of the wanted annotation types and lemmatize them.

     Every annotation of ``document`` whose ``infons['type']`` equals one of
     ``wanted_tags`` has its text lemmatized to reduce redundancy and is
     added to ``tagged_article`` under that type. An annotation is added
     once, even when ``wanted_tags`` lists its type several times.
     :param tagged_article: Object that receives the words through
         ``add_annotation(tag, word)``.
     :param document: The document corresponding to the tagged_article.
     :param wanted_tags: Iterable of annotation types whose words are saved.
     :return: The same ``tagged_article`` object, after the words were added.
     """
     # Materialize once so a one-shot iterable still applies to every
     # annotation; a tuple keeps the element-wise equality comparison.
     wanted = tuple(wanted_tags)
     for anno in bioc.annotations(document):
         tag = anno.infons['type']
         if tag not in wanted:
             continue
         # e.g. rats --> rat, so to reduce redundancy
         word = LEMMATIZER.lemmatize(anno.text)
         tagged_article.add_annotation(tag, word)
     return tagged_article