def _preprocess(
    examples: List[str],
    sort_rels: bool = True,
    entity_hinting: Optional[EntityHinting] = None,
) -> List[str]:
    """Turn raw examples into seq2rel annotations via an intermediate PubTator string.

    Args:
        examples: Unpacked into exactly three items, ``(abstracts, anns, labels)``,
            which are handed to ``_convert_to_pubtator``.
        sort_rels: Forwarded unchanged to ``util.pubtator_to_seq2rel``.
        entity_hinting: Forwarded unchanged to ``util.pubtator_to_seq2rel``. When
            truthy, the gene/disease concepts and ``skip_malformed=True`` are
            forwarded as well.

    Returns:
        Whatever ``util.pubtator_to_seq2rel`` returns for the parsed annotations.
    """
    # The extra options are only supplied when entity hinting was requested.
    if entity_hinting:
        hinting_options = {"concepts": ["gene", "disease"], "skip_malformed": True}
    else:
        hinting_options = {}

    abstracts, anns, labels = examples
    parsed = util.parse_pubtator(
        pubtator_content=_convert_to_pubtator(abstracts=abstracts, anns=anns, labels=labels),
        text_segment=util.TextSegment.both,
    )
    return util.pubtator_to_seq2rel(
        parsed,
        sort_rels=sort_rels,
        entity_hinting=entity_hinting,
        **hinting_options,
    )
def _preprocess(
    pubtator_content: str,
    sort_rels: bool = True,
    entity_hinting: Optional[EntityHinting] = None,
    filter_hypernyms: bool = False,
) -> List[str]:
    """Parse PubTator-formatted content and convert it into seq2rel annotations.

    Args:
        pubtator_content: PubTator-formatted text handed to ``util.parse_pubtator``.
        sort_rels: Forwarded unchanged to ``util.pubtator_to_seq2rel``.
        entity_hinting: Forwarded unchanged to ``util.pubtator_to_seq2rel``. When
            truthy, the chemical/disease concepts and ``skip_malformed=True`` are
            forwarded as well.
        filter_hypernyms: When true, ``_filter_hypernyms`` is applied to the parsed
            annotations before conversion.

    Returns:
        Whatever ``util.pubtator_to_seq2rel`` returns for the parsed annotations.
    """
    # The extra options are only supplied when entity hinting was requested.
    if entity_hinting:
        hinting_options = {"concepts": ["chemical", "disease"], "skip_malformed": True}
    else:
        hinting_options = {}

    parsed = util.parse_pubtator(
        pubtator_content=pubtator_content,
        text_segment=util.TextSegment.both,
    )

    # This step is specific to the CDR corpus. It contains many negative relations that
    # are actually valid but were left unannotated because they involve a disease entity
    # that is the hypernym of a disease entity in a positive relation. These have to be
    # filtered out before evaluation, so the helper finds all such cases and adds them to
    # the filtered_relations field of the annotations.
    # See: https://arxiv.org/abs/1909.00228 for details.
    if filter_hypernyms:
        _filter_hypernyms(parsed)

    return util.pubtator_to_seq2rel(
        parsed,
        sort_rels=sort_rels,
        entity_hinting=entity_hinting,
        **hinting_options,
    )
def test_parse_pubtator_compound_ent() -> None:
    """A compound mention must be split into one entry per underlying entity."""
    # A truncated example taken from the CDR dataset
    pmid = "17854040"
    title_text = (
        "Mutations associated with lamivudine-resistance in therapy-na ve hepatitis B virus (HBV)"
        " infected patients with and without HIV co-infection: implications for antiretroviral"
        " therapy in HBV and HIV co-infected South African patients. infected patients with and"
        " without HIV co-infection: implications for antiretroviral therapy in HBV and HIV"
        " co-infected South African patients."
    )
    abstract_text = (
        "This was an exploratory study to investigate lamivudine-resistant hepatitis B virus (HBV)"
        " strains in selected lamivudine-na ve HBV carriers with and without human"
        " immunodeficiency virus (HIV) co-infection in South African patients. Thirty-five"
        " lamivudine-naive HBV infected patients with or without HIV co-infection were studied: 15"
        " chronic HBV mono-infected patients and 20 HBV-HIV co-infected patients."
    )
    # PubTator is line-oriented and tab-delimited: one record per line. The last row is the
    # compound mention; its trailing column lists the individual mentions, aligned by "|"
    # with the identifiers in the preceding column, so it must be tab-separated as well.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t26\t36\tlamivudine\tChemical\tD019259
    {pmid}\t59\t61\tna\tChemical\tD012964
    {pmid}\t66\t98\thepatitis B virus (HBV) infected\tDisease\tD006509
    {pmid}\t125\t141\tHIV co-infection\tDisease\tD015658
    {pmid}\t186\t209\tHBV and HIV co-infected\tDisease\tD006509|D015658\tHBV infected|HIV infected
    """

    expected = schemas.PubtatorAnnotation(
        pmid=pmid,
        text=f"{title_text} {abstract_text}",
        entities={
            "D019259": schemas.PubtatorEntity(
                mentions=["lamivudine"],
                offsets=[(26, 36)],
                label="Chemical",
            ),
            "D012964": schemas.PubtatorEntity(
                mentions=["na"],
                offsets=[(59, 61)],
                label="Chemical",
            ),
            "D006509": schemas.PubtatorEntity(
                mentions=["hepatitis B virus (HBV) infected", "HBV infected"],
                offsets=[(66, 98), (186, 209)],
                label="Disease",
            ),
            "D015658": schemas.PubtatorEntity(
                mentions=["HIV co-infection", "HIV infected"],
                offsets=[(125, 141), (194, 209)],
                label="Disease",
            ),
        },
    )

    actual = util.parse_pubtator(pubtator_content, text_segment=util.TextSegment.both)

    # Separate asserts make it obvious which field diverged when the test fails.
    assert actual[0].text == expected.text
    assert actual[0].entities == expected.entities
    assert actual[0].relations == expected.relations
def test_parse_pubtator_no_abstract_raises_value_error() -> None:
    """Requesting the abstract of an article that has none must raise ``ValueError``."""
    # A truncated example taken from the CDR dataset
    pmid = "2339463"
    title_text = (
        "Cerebral sinus thrombosis as a potential hazard of antifibrinolytic treatment in"
        " menorrhagia."
    )
    abstract_text = ""
    # PubTator is line-oriented: one record per line. The annotation row carries an
    # identifier column so that it is well-formed; otherwise the ValueError expected
    # below could be triggered by the malformed row instead of the missing abstract.
    # NOTE(review): D012851 is believed to be the CDR identifier for this mention - confirm.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t0\t25\tCerebral sinus thrombosis\tDisease\tD012851
    """

    # A ValueError should be raised if no abstract provided but text_segment is "abstract" or "both"
    for text_segment in (util.TextSegment.abstract, util.TextSegment.both):
        with pytest.raises(ValueError):
            _ = util.parse_pubtator(pubtator_content, text_segment=text_segment)
def test_parse_pubtator_skip_malformed_raises_value_error() -> None:
    """A malformed annotation row is tolerated only when ``skip_malformed`` is true."""
    # A truncated example taken from the CDR dataset
    pmid = "2339463"
    title_text = (
        "Cerebral sinus thrombosis as a potential hazard of antifibrinolytic treatment in"
        " menorrhagia."
    )
    abstract_text = (
        "We describe a 42-year-old woman who developed superior sagittal and left transverse sinus"
        " thrombosis associated with prolonged epsilon-aminocaproic acid therapy for menorrhagia."
    )
    # PubTator is line-oriented: one record per line. The annotation row deliberately
    # omits the identifier column, which is what makes it malformed.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t0\t25\tCerebral sinus thrombosis\tDisease
    """

    # There should be no error when skip_malformed is True...
    _ = util.parse_pubtator(pubtator_content, skip_malformed=True)
    # ...and an error when it is False
    with pytest.raises(ValueError):
        _ = util.parse_pubtator(pubtator_content, skip_malformed=False)
def _preprocess(
    examples: List[Dict[str, Any]],
    rel_labels: Optional[Dict[str, str]] = None,
    sort_rels: bool = True,
) -> List[str]:
    """Turn raw examples into seq2rel annotations via an intermediate PubTator string.

    Args:
        examples: Raw examples handed to ``_convert_to_pubtator``.
        rel_labels: Forwarded unchanged to ``_convert_to_pubtator``.
        sort_rels: Forwarded unchanged to ``util.pubtator_to_seq2rel``.

    Returns:
        Whatever ``util.pubtator_to_seq2rel`` returns for the parsed annotations.
    """
    # Only the abstract segment is requested from the parser here.
    parsed = util.parse_pubtator(
        pubtator_content=_convert_to_pubtator(examples, rel_labels=rel_labels),
        text_segment=util.TextSegment.abstract,
    )
    return util.pubtator_to_seq2rel(parsed, sort_rels=sort_rels)
def _preprocess(
    examples: List[Dict[str, Any]],
    sort_rels: bool = True,
    entity_hinting: Optional[EntityHinting] = None,
) -> List[str]:
    """Turn raw examples into seq2rel annotations via an intermediate PubTator string.

    Args:
        examples: Raw examples handed to ``_convert_to_pubtator``.
        sort_rels: Forwarded unchanged to ``util.pubtator_to_seq2rel``.
        entity_hinting: Forwarded unchanged to ``util.pubtator_to_seq2rel``. When
            truthy, the chemical/gene/mutation concepts and ``skip_malformed=True``
            are forwarded as well.

    Returns:
        Whatever ``util.pubtator_to_seq2rel`` returns for the parsed annotations.
    """
    # The extra options are only supplied when entity hinting was requested.
    if entity_hinting:
        hinting_options = {
            "concepts": ["chemical", "gene", "mutation"],
            "skip_malformed": True,
        }
    else:
        hinting_options = {}

    # Only the abstract segment is requested from the parser here.
    parsed = util.parse_pubtator(
        pubtator_content=_convert_to_pubtator(examples),
        text_segment=util.TextSegment.abstract,
    )
    return util.pubtator_to_seq2rel(
        parsed,
        sort_rels=sort_rels,
        entity_hinting=entity_hinting,
        **hinting_options,
    )
def test_parse_pubtator() -> None:
    """Parse one article with each text segment and check text, entities and relations."""
    # A truncated example taken from the CDR dataset
    pmid = "18020536"
    title_text = (
        "Associations between use of benzodiazepines or related drugs and health, physical"
        " abilities and cognitive function: a non-randomised clinical study in the elderly."
    )
    abstract_text = (
        "OBJECTIVE: To describe associations between the use of benzodiazepines or related drugs"
        " (BZDs/RDs) and health, functional abilities and cognitive function in the elderly."
        " METHODS: A non-randomised clinical study of patients aged > or =65 years admitted to"
        " acute hospital wards during 1 month. 164 patients (mean age +/- standard deviation [SD]"
        " 81.6 +/- 6.8 years) were admitted. Of these, nearly half (n = 78) had used BZDs/RDs"
        " before admission, and the remainder (n = 86) were non-users. Cognitive ability was"
        " assessed by the Mini-Mental State Examination (MMSE). Patients scoring > or =20 MMSE"
        " sum points were interviewed (n = 79) and questioned regarding symptoms and functional"
        " abilities during the week prior to admission."
    )
    # PubTator is line-oriented: one record per line.
    # Include a dummy annotation with ID == -1. These should be ignored.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t28\t43\tbenzodiazepines\tChemical\tD001569
    {pmid}\t219\t234\tbenzodiazepines\tChemical\tD001569
    {pmid}\t253\t257\tBZDs\tChemical\tD001569
    {pmid}\t583\t587\tBZDs\tChemical\tD001569
    {pmid}\t1817\t1826\ttiredness\tDisease\tD005221
    {pmid}\t0\t0\tArbitrary\tArbitrary\t-1
    {pmid}\tCID\tD001569\tD005221
    """

    title_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines"],
            offsets=[(28, 43)],
            label="Chemical",
        ),
    }
    abstract_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines", "BZDs", "BZDs"],
            offsets=[(219, 234), (253, 257), (583, 587)],
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }
    # With both segments, the title mentions come first, followed by the abstract mentions.
    both_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=title_entities["D001569"].mentions + abstract_entities["D001569"].mentions,
            offsets=title_entities["D001569"].offsets + abstract_entities["D001569"].offsets,
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }

    # Title only. No relation is expected: only one of the relation's entities is in the title.
    expected = schemas.PubtatorAnnotation(
        pmid=pmid,
        text=title_text,
        entities=title_entities,
        relations=[],
    )
    actual = util.parse_pubtator(pubtator_content, text_segment=util.TextSegment.title)
    # Breaking up the asserts leads to much clearer outputs when the test fails
    assert actual[0].text == expected.text
    assert actual[0].entities == expected.entities
    assert actual[0].relations == expected.relations

    # Abstract only
    expected = schemas.PubtatorAnnotation(
        pmid=pmid,
        text=abstract_text,
        entities=abstract_entities,
        relations=[("D001569", "D005221", "CID")],
    )
    actual = util.parse_pubtator(pubtator_content, text_segment=util.TextSegment.abstract)
    assert actual[0].text == expected.text
    assert actual[0].entities == expected.entities
    assert actual[0].relations == expected.relations

    # Both: the expected text is the title and the abstract joined by a single space.
    expected = schemas.PubtatorAnnotation(
        pmid=pmid,
        text=f"{title_text} {abstract_text}",
        entities=both_entities,
        relations=[("D001569", "D005221", "CID")],
    )
    actual = util.parse_pubtator(pubtator_content, text_segment=util.TextSegment.both)
    assert actual[0].text == expected.text
    assert actual[0].entities == expected.entities
    assert actual[0].relations == expected.relations