Exemple #1
0
def _preprocess(
    examples: List[str],
    sort_rels: bool = True,
    entity_hinting: Optional[EntityHinting] = None,
) -> List[str]:
    """Turn raw examples into seq2rel annotations by way of the PubTator format.

    Args:
        examples: Unpacked into exactly three items: abstracts, annotations and labels.
            NOTE(review): the `List[str]` annotation does not obviously match this
            three-way unpacking; confirm against the caller.
        sort_rels: Passed through unchanged to `util.pubtator_to_seq2rel`.
        entity_hinting: Passed through unchanged to `util.pubtator_to_seq2rel`. When truthy,
            the `concepts` and `skip_malformed` keyword arguments are forwarded as well.

    Returns:
        The value returned by `util.pubtator_to_seq2rel`.
    """
    # The extra keyword arguments are only sent along when entity hinting is requested.
    hinting_kwargs = {}
    if entity_hinting:
        hinting_kwargs = {"concepts": ["gene", "disease"], "skip_malformed": True}

    abstracts, anns, labels = examples
    pubtator_text = _convert_to_pubtator(abstracts=abstracts, anns=anns, labels=labels)
    parsed = util.parse_pubtator(
        pubtator_content=pubtator_text,
        text_segment=util.TextSegment.both,
    )

    return util.pubtator_to_seq2rel(
        parsed,
        sort_rels=sort_rels,
        entity_hinting=entity_hinting,
        **hinting_kwargs,
    )
Exemple #2
0
def _preprocess(
    pubtator_content: str,
    sort_rels: bool = True,
    entity_hinting: Optional[EntityHinting] = None,
    filter_hypernyms: bool = False,
) -> List[str]:
    """Parse PubTator-formatted text and convert it to seq2rel annotations.

    Args:
        pubtator_content: PubTator-formatted text handed to `util.parse_pubtator`.
        sort_rels: Passed through unchanged to `util.pubtator_to_seq2rel`.
        entity_hinting: Passed through unchanged to `util.pubtator_to_seq2rel`. When truthy,
            the `concepts` and `skip_malformed` keyword arguments are forwarded as well.
        filter_hypernyms: When true, `_filter_hypernyms` is run on the parsed annotations
            before they are converted.

    Returns:
        The value returned by `util.pubtator_to_seq2rel`.
    """
    # The extra keyword arguments are only sent along when entity hinting is requested.
    hinting_kwargs = {}
    if entity_hinting:
        hinting_kwargs = {"concepts": ["chemical", "disease"], "skip_malformed": True}

    parsed = util.parse_pubtator(
        pubtator_content=pubtator_content,
        text_segment=util.TextSegment.both,
    )

    # This step is specific to the CDR corpus. It contains many negative relations that are
    # actually valid, but are not annotated because they involve a disease entity that is the
    # hypernym of a disease entity in a positive relation. They must be filtered out before
    # evaluation, so this call finds all such cases and adds them to the filtered_relations
    # field of the annotations. See: https://arxiv.org/abs/1909.00228 for details.
    if filter_hypernyms:
        _filter_hypernyms(parsed)

    return util.pubtator_to_seq2rel(
        parsed,
        sort_rels=sort_rels,
        entity_hinting=entity_hinting,
        **hinting_kwargs,
    )
def test_parse_pubtator_compound_ent() -> None:
    """Check parsing of an annotation line that carries two `|`-separated entity IDs.

    The last annotation line lists two IDs together with per-ID mention text; the expected
    result has one mention recorded under each of those IDs.
    """
    # A truncated example taken from the CDR dataset
    pmid = "17854040"
    title_text = (
        "Mutations associated with lamivudine-resistance in therapy-na  ve hepatitis B virus (HBV)"
        " infected patients with and without HIV co-infection: implications for antiretroviral"
        " therapy in HBV and HIV co-infected South African patients. infected patients with and"
        " without HIV co-infection: implications for antiretroviral therapy in HBV and HIV"
        " co-infected South African patients.")
    abstract_text = (
        "This was an exploratory study to investigate lamivudine-resistant hepatitis B virus (HBV)"
        " strains in selected lamivudine-na  ve HBV carriers with and without human"
        " immunodeficiency virus (HIV) co-infection in South African patients. Thirty-five"
        " lamivudine-naive HBV infected patients with or without HIV co-infection were studied: 15"
        " chronic HBV mono-infected patients and 20 HBV-HIV co-infected patients."
    )
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t26\t36\tlamivudine\tChemical\tD019259
    {pmid}\t59\t61\tna\tChemical\tD012964
    {pmid}\t66\t98\thepatitis B virus (HBV) infected\tDisease\tD006509
    {pmid}\t125\t141\tHIV co-infection\tDisease\tD015658
    {pmid}\t186\t209\tHBV and HIV co-infected\tDisease\tD006509|D015658	HBV infected|HIV infected
    """

    # Expected entities, keyed by entity ID.
    expected_entities = {
        "D019259": schemas.PubtatorEntity(
            mentions=["lamivudine"],
            offsets=[(26, 36)],
            label="Chemical",
        ),
        "D012964": schemas.PubtatorEntity(
            mentions=["na"],
            offsets=[(59, 61)],
            label="Chemical",
        ),
        # The two entries below each pick up one extra mention from the compound line.
        "D006509": schemas.PubtatorEntity(
            mentions=["hepatitis B virus (HBV) infected", "HBV infected"],
            offsets=[(66, 98), (186, 209)],
            label="Disease",
        ),
        "D015658": schemas.PubtatorEntity(
            mentions=["HIV co-infection", "HIV infected"],
            offsets=[(125, 141), (194, 209)],
            label="Disease",
        ),
    }
    expected = schemas.PubtatorAnnotation(
        pmid=pmid,
        text=f"{title_text} {abstract_text}",
        entities=expected_entities,
    )

    parsed = util.parse_pubtator(pubtator_content, text_segment=util.TextSegment.both)
    first = parsed[0]
    # One assert per field keeps failure output readable.
    assert first.text == expected.text
    assert first.entities == expected.entities
    assert first.relations == expected.relations
def test_parse_pubtator_no_abstract_raises_value_error() -> None:
    """Check that an empty abstract raises ValueError when the abstract is requested."""
    # A truncated example taken from the CDR dataset
    pmid = "2339463"
    title_text = "Cerebral sinus thrombosis as a potential hazard of antifibrinolytic treatment in menorrhagia."
    abstract_text = ""
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t0\t25\tCerebral sinus thrombosis\tDisease
    """

    # Both segments that include the abstract must fail when the abstract is empty.
    for segment in (util.TextSegment.abstract, util.TextSegment.both):
        with pytest.raises(ValueError):
            util.parse_pubtator(pubtator_content, text_segment=segment)
def test_parse_pubtator_skip_malformed_raises_value_error() -> None:
    """Check that `skip_malformed` decides whether this input raises ValueError."""
    # A truncated example taken from the CDR dataset
    pmid = "2339463"
    title_text = "Cerebral sinus thrombosis as a potential hazard of antifibrinolytic treatment in menorrhagia."
    abstract_text = (
        "We describe a 42-year-old woman who developed superior sagittal and left transverse sinus"
        " thrombosis associated with prolonged epsilon-aminocaproic acid therapy for menorrhagia."
    )
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t0\t25\tCerebral sinus thrombosis\tDisease
    """

    # With skip_malformed=True the call must complete without raising...
    util.parse_pubtator(pubtator_content, skip_malformed=True)

    # ...whereas with skip_malformed=False the same input must raise.
    with pytest.raises(ValueError):
        util.parse_pubtator(pubtator_content, skip_malformed=False)
Exemple #6
0
def _preprocess(
    examples: List[Dict[str, Any]],
    rel_labels: Optional[Dict[str, str]] = None,
    sort_rels: bool = True,
) -> List[str]:
    """Convert raw examples to seq2rel annotations by way of the PubTator format.

    Args:
        examples: Raw examples handed to `_convert_to_pubtator`.
        rel_labels: Passed through unchanged to `_convert_to_pubtator`.
        sort_rels: Passed through unchanged to `util.pubtator_to_seq2rel`.

    Returns:
        The value returned by `util.pubtator_to_seq2rel`.
    """
    pubtator_text = _convert_to_pubtator(examples, rel_labels=rel_labels)
    # Only the abstract segment is requested from the parser here.
    parsed = util.parse_pubtator(
        pubtator_content=pubtator_text,
        text_segment=util.TextSegment.abstract,
    )
    return util.pubtator_to_seq2rel(parsed, sort_rels=sort_rels)
Exemple #7
0
def _preprocess(
    examples: List[Dict[str, Any]],
    sort_rels: bool = True,
    entity_hinting: Optional[EntityHinting] = None,
) -> List[str]:
    """Convert raw examples to seq2rel annotations by way of the PubTator format.

    Args:
        examples: Raw examples handed to `_convert_to_pubtator`.
        sort_rels: Passed through unchanged to `util.pubtator_to_seq2rel`.
        entity_hinting: Passed through unchanged to `util.pubtator_to_seq2rel`. When truthy,
            the `concepts` and `skip_malformed` keyword arguments are forwarded as well.

    Returns:
        The value returned by `util.pubtator_to_seq2rel`.
    """
    # The extra keyword arguments are only sent along when entity hinting is requested.
    hinting_kwargs = {}
    if entity_hinting:
        hinting_kwargs = {
            "concepts": ["chemical", "gene", "mutation"],
            "skip_malformed": True,
        }

    pubtator_text = _convert_to_pubtator(examples)
    # Only the abstract segment is requested from the parser here.
    parsed = util.parse_pubtator(
        pubtator_content=pubtator_text,
        text_segment=util.TextSegment.abstract,
    )
    return util.pubtator_to_seq2rel(
        parsed,
        sort_rels=sort_rels,
        entity_hinting=entity_hinting,
        **hinting_kwargs,
    )
def test_parse_pubtator() -> None:
    """Check `util.parse_pubtator` output for the title, abstract and combined segments."""
    # A truncated example taken from the CDR dataset
    pmid = "18020536"
    title_text = (
        "Associations between use of benzodiazepines or related drugs and health, physical"
        " abilities and cognitive function: a non-randomised clinical study in the elderly."
    )
    abstract_text = (
        "OBJECTIVE: To describe associations between the use of benzodiazepines or related drugs"
        " (BZDs/RDs) and health, functional abilities and cognitive function in the elderly."
        " METHODS: A non-randomised clinical study of patients aged > or =65 years admitted to"
        " acute hospital wards during 1 month. 164 patients (mean age +/- standard deviation [SD]"
        " 81.6 +/- 6.8 years) were admitted. Of these, nearly half (n = 78) had used BZDs/RDs"
        " before admission, and the remainder (n = 86) were non-users. Cognitive ability was"
        " assessed by the Mini-Mental State Examination (MMSE). Patients scoring > or =20 MMSE"
        " sum points were interviewed (n = 79) and questioned regarding symptoms and functional"
        " abilities during the week prior to admission.")
    # Include a dummy annotation with ID == -1. These should be ignored.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t28\t43\tbenzodiazepines\tChemical\tD001569
    {pmid}\t219\t234\tbenzodiazepines\tChemical\tD001569
    {pmid}\t253\t257\tBZDs\tChemical\tD001569
    {pmid}\t583\t587\tBZDs\tChemical\tD001569
    {pmid}\t1817\t1826\ttiredness\tDisease\tD005221
    {pmid}\t0\t0\tArbitrary\tArbitrary\t-1
    {pmid}\tCID\tD001569\tD005221
    """

    # Expected entities when only the title is parsed.
    title_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines"],
            offsets=[(28, 43)],
            label="Chemical",
        ),
    }
    # Expected entities when only the abstract is parsed.
    abstract_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines", "BZDs", "BZDs"],
            offsets=[(219, 234), (253, 257), (583, 587)],
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }
    # Expected entities for title + abstract: the title mentions come first, then the abstract's.
    title_chemical = title_entities["D001569"]
    abstract_chemical = abstract_entities["D001569"]
    both_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=title_chemical.mentions + abstract_chemical.mentions,
            offsets=title_chemical.offsets + abstract_chemical.offsets,
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }

    cid_relation = ("D001569", "D005221", "CID")
    # One row per text segment: (segment, expected text, expected entities, expected relations).
    cases = [
        # Title only
        (util.TextSegment.title, title_text, title_entities, []),
        # Abstract only
        (util.TextSegment.abstract, abstract_text, abstract_entities, [cid_relation]),
        # Both
        (
            util.TextSegment.both,
            f"{title_text} {abstract_text}",
            both_entities,
            [cid_relation],
        ),
    ]

    for segment, wanted_text, wanted_entities, wanted_relations in cases:
        expected = schemas.PubtatorAnnotation(
            pmid=pmid,
            text=wanted_text,
            entities=wanted_entities,
            relations=wanted_relations,
        )
        parsed = util.parse_pubtator(pubtator_content, text_segment=segment)
        first = parsed[0]
        # Breaking up the asserts leads to much clearer outputs when the test fails
        assert first.text == expected.text
        assert first.entities == expected.entities
        assert first.relations == expected.relations