def test_insert_hints_overlapping() -> None:
    """Check that `insert_hints` produces the expected text when entity offsets overlap."""
    text = (
        "Mutation pattern in clinically asymptomatic coagulation factor VII deficiency. A total of"
        " 122 subjects, referred after presurgery screening or checkup for prolonged prothrombin"
        " time, were characterized for the presence of coagulation factor VII deficiency."
    )
    # The gene and disease spans below overlap each other at both sites in the text.
    gene = schemas.PubtatorEntity(
        mentions=["coagulation factor VII", "coagulation factor VII"],
        offsets=[(44, 67), (222, 244)],
        label="Gene",
    )
    disease = schemas.PubtatorEntity(
        mentions=["factor VII deficiency", "factor VII deficiency"],
        offsets=[(56, 78), (234, 255)],
        label="Disease",
    )
    annotation = schemas.PubtatorAnnotation(
        pmid="8844208",
        text=text,
        entities={"2155": gene, "D005168": disease},
    )
    hints = "coagulation factor vii @GENE@ factor vii deficiency @DISEASE@"
    expected = f"{hints} {HINT_SEP_SYMBOL} {text}"

    annotation.insert_hints()

    assert annotation.text == expected
def test_insert_hints_no_mutation() -> None:
    """Check that `insert_hints` changes `text` and leaves all other attributes as they were."""
    text = (
        "Different lobular distributions of altered hepatocyte tight junctions in rat models of"
        " intrahepatic and extrahepatic cholestasis."
    )
    entities = {
        "D002780": schemas.PubtatorEntity(
            mentions=["intrahepatic cholestasis"],
            offsets=[(87, 128)],
            label="Disease",
        ),
        "D001651": schemas.PubtatorEntity(
            mentions=["extrahepatic cholestasis"],
            offsets=[(104, 128)],
            label="Disease",
        ),
    }
    annotation = schemas.PubtatorAnnotation(pmid="9862868", text=text, entities=entities)
    # Snapshot the full state up front so every attribute can be compared after the call.
    snapshot = copy.deepcopy(annotation)

    annotation.insert_hints()

    # Only the text is allowed to differ from the snapshot.
    assert annotation.text != snapshot.text
    assert annotation.pmid == snapshot.pmid
    assert annotation.entities == snapshot.entities
    assert annotation.relations == snapshot.relations
def test_insert_hints_compound() -> None:
    """Check that `insert_hints` produces the expected text for compound entities."""
    text = (
        "Different lobular distributions of altered hepatocyte tight junctions in rat models of"
        " intrahepatic and extrahepatic cholestasis."
    )
    # Both diseases come from the single phrase "intrahepatic and extrahepatic cholestasis".
    intrahepatic = schemas.PubtatorEntity(
        mentions=["intrahepatic cholestasis"],
        offsets=[(87, 128)],
        label="Disease",
    )
    extrahepatic = schemas.PubtatorEntity(
        mentions=["extrahepatic cholestasis"],
        offsets=[(104, 128)],
        label="Disease",
    )
    annotation = schemas.PubtatorAnnotation(
        pmid="9862868",
        text=text,
        entities={"D002780": intrahepatic, "D001651": extrahepatic},
    )
    hints = "intrahepatic cholestasis @DISEASE@ extrahepatic cholestasis @DISEASE@"
    expected = f"{hints} {HINT_SEP_SYMBOL} {text}"

    annotation.insert_hints()

    assert annotation.text == expected
def test_parse_pubtator_compound_ent() -> None:
    """Check `util.parse_pubtator` on a compound entity annotation.

    The last annotation row carries two pipe-separated IDs (`D006509|D015658`) and a trailing
    column with one individual mention per ID. Each ID is expected to receive its own mention
    and its own offsets.
    """
    # A truncated example taken from the CDR dataset
    pmid = "17854040"
    title_text = (
        "Mutations associated with lamivudine-resistance in therapy-na ve hepatitis B virus (HBV)"
        " infected patients with and without HIV co-infection: implications for antiretroviral"
        " therapy in HBV and HIV co-infected South African patients. infected patients with and"
        " without HIV co-infection: implications for antiretroviral therapy in HBV and HIV"
        " co-infected South African patients."
    )
    abstract_text = (
        "This was an exploratory study to investigate lamivudine-resistant hepatitis B virus (HBV)"
        " strains in selected lamivudine-na ve HBV carriers with and without human"
        " immunodeficiency virus (HIV) co-infection in South African patients. Thirty-five"
        " lamivudine-naive HBV infected patients with or without HIV co-infection were studied: 15"
        " chronic HBV mono-infected patients and 20 HBV-HIV co-infected patients."
    )
    # Every column separator is an explicit `\t` escape, including the one in front of the
    # individual-mentions column of the compound row, so the fixture stays tab-separated even
    # if an editor or formatter normalizes literal whitespace.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t26\t36\tlamivudine\tChemical\tD019259
    {pmid}\t59\t61\tna\tChemical\tD012964
    {pmid}\t66\t98\thepatitis B virus (HBV) infected\tDisease\tD006509
    {pmid}\t125\t141\tHIV co-infection\tDisease\tD015658
    {pmid}\t186\t209\tHBV and HIV co-infected\tDisease\tD006509|D015658\tHBV infected|HIV infected
    """

    expected = schemas.PubtatorAnnotation(
        pmid=pmid,
        text=f"{title_text} {abstract_text}",
        entities={
            "D019259": schemas.PubtatorEntity(
                mentions=["lamivudine"],
                offsets=[(26, 36)],
                label="Chemical",
            ),
            "D012964": schemas.PubtatorEntity(mentions=["na"], offsets=[(59, 61)], label="Chemical"),
            "D006509": schemas.PubtatorEntity(
                mentions=["hepatitis B virus (HBV) infected", "HBV infected"],
                offsets=[(66, 98), (186, 209)],
                label="Disease",
            ),
            "D015658": schemas.PubtatorEntity(
                mentions=["HIV co-infection", "HIV infected"],
                offsets=[(125, 141), (194, 209)],
                label="Disease",
            ),
        },
    )
    actual = util.parse_pubtator(pubtator_content, text_segment=util.TextSegment.both)

    # Breaking up the asserts leads to much clearer outputs when the test fails
    assert actual[0].text == expected.text
    assert actual[0].entities == expected.entities
    assert actual[0].relations == expected.relations
def test_pubtator_entity_to_string() -> None:
    """Check `PubtatorEntity.to_string` with the default sorting and with `sort=False`."""
    # The fixture deliberately contains:
    # - multi-word mentions
    # - overlapping mentions
    # - multiple duplicate mentions
    # - at least two unique mentions (case-insensitive)
    # - mentions that are not already ordered by first appearance
    mentions = [
        "factor vii deficiency",
        "factor vii deficiency",
        "Factor VII Deficiency",
        "factor vii deficient",
    ]
    offsets = [(200, 221), (100, 121), (20, 41), (0, 21)]
    ent = schemas.PubtatorEntity(mentions=mentions, offsets=offsets, label="Disease")

    # Sorting is, and should remain, the default.
    sorted_string = ent.to_string()
    expected = f"factor vii deficient {COREF_SEP_SYMBOL} factor vii deficiency @DISEASE@"
    assert sorted_string == expected

    # With sort=False the mention order is not fixed, so only order-independent properties
    # are checked: the overall length and the presence of every expected fragment.
    unsorted_string = ent.to_string(sort=False)
    assert len(unsorted_string) == len(expected)
    for fragment in (
        "factor vii deficient",
        "factor vii deficiency",
        "@DISEASE@",
        COREF_SEP_SYMBOL,
    ):
        assert fragment in unsorted_string
def test_insert_hints() -> None:
    """Check that `insert_hints` produces the expected text for a handful of edge cases."""
    # A truncated example taken from the GDA dataset. Edge cases it covers:
    # - a coreferent mention
    # - entities that differ only in case
    # - a parenthesized entity
    # - multiple identical mentions of one entity
    text = (
        "Apolipoprotein E epsilon4 allele, elevated midlife total cholesterol level, and high"
        " midlife systolic blood pressure are independent risk factors for late-life Alzheimer disease."
        " BACKGROUND: Presence of the apolipoprotein E (apoE) epsilon4 allele, which is involved in"
        " cholesterol metabolism, is the most important genetic risk factor for Alzheimer disease."
        " Elevated midlife values for total cholesterol level and blood pressure have been"
        " implicated recently as risk factors for Alzheimer disease."
    )
    gene = schemas.PubtatorEntity(
        mentions=["Apolipoprotein E", "apolipoprotein E", "apoE"],
        offsets=[(0, 17), (207, 223), (225, 229)],
        label="Gene",
    )
    disease = schemas.PubtatorEntity(
        mentions=["Alzheimer disease", "Alzheimer disease", "Alzheimer disease"],
        offsets=[(160, 177), (339, 356), (479, 496)],
        label="Disease",
    )
    annotation = schemas.PubtatorAnnotation(
        pmid="12160362",
        text=text,
        entities={"348": gene, "D000544": disease},
    )
    gene_hint = f"apolipoprotein e {COREF_SEP_SYMBOL} apoe @GENE@"
    disease_hint = "alzheimer disease @DISEASE@"
    expected = f"{gene_hint} {disease_hint} {HINT_SEP_SYMBOL} {text}"

    annotation.insert_hints()

    assert annotation.text == expected
def test_filter_hypernyms(self):
    """Check that `cdr._filter_hypernyms` records the expected filtered relation."""
    text = (
        "Carbamazepine-induced cardiac dysfunction. A patient with sinus bradycardia and"
        " atrioventricular block, induced by carbamazepine, prompted an extensive"
        " literature review of all previously reported cases."
    )
    entities = {
        "D002220": schemas.PubtatorEntity(
            mentions=["Carbamazepine", "carbamazepine"],
            offsets=[(0, 13), (115, 128)],
            label="Chemical",
        ),
        "D006331": schemas.PubtatorEntity(
            mentions=["cardiac dysfunction"],
            offsets=[(22, 41)],
            label="Disease",
        ),
        "D001919": schemas.PubtatorEntity(
            mentions=["bradycardia"],
            offsets=[(64, 75)],
            label="Disease",
        ),
        "D054537": schemas.PubtatorEntity(
            mentions=["atrioventricular block"],
            offsets=[(80, 102)],
            label="Disease",
        ),
    }
    # Only the two specific diseases appear in annotated relations; D006331 does not.
    relations = [("D002220", "D001919", "CID"), ("D002220", "D054537", "CID")]
    annotation = schemas.PubtatorAnnotation(
        text=text,
        pmid="",
        entities=entities,
        relations=relations,
    )

    cdr._filter_hypernyms([annotation])

    # D006331 is a hypernym of D001919 and/or D054537 and so it should be filtered.
    assert annotation.filtered_relations == [("D002220", "D006331", "CID")]
def test_pubtator_entity_get_offsets() -> None:
    """Check that `PubtatorEntity.get_offsets` returns `(0, 21)` for the unordered offsets below."""
    offsets = [(200, 221), (100, 121), (20, 41), (0, 21)]
    # Real mentions and a real label are not needed to test this method; use empty placeholders.
    ent = schemas.PubtatorEntity(mentions=[""] * 4, offsets=offsets, label="")

    assert ent.get_offsets() == (0, 21)
def test_pubtator_annotation_to_string() -> None:
    """Check `PubtatorAnnotation.to_string` with the default sorting and with `sort=False`."""
    # The fixture contains:
    # - at least one entity with multiple mentions, including a unique mention
    # - at least two relations with different head entities
    # - at least one n-ary relation
    # - relations that are not already ordered by first appearance
    entities = {
        "D008094": schemas.PubtatorEntity(
            mentions=["lithium", "lithium", "Li", "Li"],
            offsets=[(54, 61), (111, 118), (941, 943), (1333, 1335)],
            label="Chemical",
        ),
        "D006973": schemas.PubtatorEntity(
            mentions=["hypertension", "hypertension"],
            offsets=[(1000, 1012), (1500, 1512)],
            label="Disease",
        ),
        "D011507": schemas.PubtatorEntity(
            mentions=["proteinuria", "proteinuria"],
            offsets=[(975, 986), (1466, 1477)],
            label="Disease",
        ),
        "D007676": schemas.PubtatorEntity(
            mentions=["chronic renal failure", "chronic renal failure"],
            offsets=[(70, 91), (1531, 1552)],
            label="Disease",
        ),
    }
    relations = [
        ("D008094", "D006973", "CID"),
        ("D008094", "D011507", "CID"),
        ("D008094", "D007676", "CID"),
        # This is an artificial n-ary relation.
        ("D008094", "D006973", "D011507", "CID"),
    ]
    # Text and PMID are not needed to test this method.
    ann = schemas.PubtatorAnnotation(pmid="", text="", entities=entities, relations=relations)

    # Sorting is, and should remain, the default.
    sorted_string = ann.to_string()
    chemical = f"lithium {COREF_SEP_SYMBOL} li @CHEMICAL@"
    expected = " ".join(
        [
            f"{chemical} chronic renal failure @DISEASE@ @CID@",
            f"{chemical} proteinuria @DISEASE@ @CID@",
            f"{chemical} hypertension @DISEASE@ @CID@",
            f"{chemical} hypertension @DISEASE@ proteinuria @DISEASE@ @CID@",
        ]
    )
    assert sorted_string == expected

    # With sort=False the order is not fixed, so only order-independent properties are
    # checked: the overall length and the presence of every expected fragment.
    unsorted_string = ann.to_string(sort=False)
    assert len(unsorted_string) == len(expected)
    for fragment in (
        "lithium",
        "li",
        "chronic renal failure",
        "proteinuria",
        "hypertension",
        "@CHEMICAL@",
        "@DISEASE@",
        "@CID@",
        COREF_SEP_SYMBOL,
    ):
        assert fragment in unsorted_string
def test_parse_pubtator() -> None:
    """Check `util.parse_pubtator` for the title, abstract and combined text segments."""
    # A truncated example taken from the CDR dataset
    pmid = "18020536"
    title_text = (
        "Associations between use of benzodiazepines or related drugs and health, physical"
        " abilities and cognitive function: a non-randomised clinical study in the elderly."
    )
    abstract_text = (
        "OBJECTIVE: To describe associations between the use of benzodiazepines or related drugs"
        " (BZDs/RDs) and health, functional abilities and cognitive function in the elderly."
        " METHODS: A non-randomised clinical study of patients aged > or =65 years admitted to"
        " acute hospital wards during 1 month. 164 patients (mean age +/- standard deviation [SD]"
        " 81.6 +/- 6.8 years) were admitted. Of these, nearly half (n = 78) had used BZDs/RDs"
        " before admission, and the remainder (n = 86) were non-users. Cognitive ability was"
        " assessed by the Mini-Mental State Examination (MMSE). Patients scoring > or =20 MMSE"
        " sum points were interviewed (n = 79) and questioned regarding symptoms and functional"
        " abilities during the week prior to admission."
    )
    # Include a dummy annotation with ID == -1. These should be ignored.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t28\t43\tbenzodiazepines\tChemical\tD001569
    {pmid}\t219\t234\tbenzodiazepines\tChemical\tD001569
    {pmid}\t253\t257\tBZDs\tChemical\tD001569
    {pmid}\t583\t587\tBZDs\tChemical\tD001569
    {pmid}\t1817\t1826\ttiredness\tDisease\tD005221
    {pmid}\t0\t0\tArbitrary\tArbitrary\t-1
    {pmid}\tCID\tD001569\tD005221
    """

    title_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines"],
            offsets=[(28, 43)],
            label="Chemical",
        ),
    }
    abstract_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines", "BZDs", "BZDs"],
            offsets=[(219, 234), (253, 257), (583, 587)],
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }
    # For the combined segment, the chemical's title mentions come first, then the abstract's.
    title_chemical = title_entities["D001569"]
    abstract_chemical = abstract_entities["D001569"]
    both_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=title_chemical.mentions + abstract_chemical.mentions,
            offsets=title_chemical.offsets + abstract_chemical.offsets,
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }

    def check_segment(expected, segment) -> None:
        """Parse the fixture for `segment` and compare the first result against `expected`."""
        actual = util.parse_pubtator(pubtator_content, text_segment=segment)
        # Breaking up the asserts leads to much clearer outputs when the test fails
        assert actual[0].text == expected.text
        assert actual[0].entities == expected.entities
        assert actual[0].relations == expected.relations

    # Title only
    check_segment(
        schemas.PubtatorAnnotation(
            pmid=pmid,
            text=title_text,
            entities=title_entities,
            relations=[],
        ),
        util.TextSegment.title,
    )

    # Abstract only
    check_segment(
        schemas.PubtatorAnnotation(
            pmid=pmid,
            text=abstract_text,
            entities=abstract_entities,
            relations=[("D001569", "D005221", "CID")],
        ),
        util.TextSegment.abstract,
    )

    # Both
    check_segment(
        schemas.PubtatorAnnotation(
            pmid=pmid,
            text=f"{title_text} {abstract_text}",
            entities=both_entities,
            relations=[("D001569", "D005221", "CID")],
        ),
        util.TextSegment.both,
    )
def test_query_pubtator() -> None:
    """Check `util.query_pubtator` for the title, abstract and combined text segments.

    Queries a single PMID with the "gene" concept and expects exactly one annotation per call.
    NOTE(review): presumably this performs a live request to PubTator; confirm before running
    it in an environment without network access.
    """
    pmid = "19285439"
    title_text = (
        "The ubiquitin ligase RNF5 regulates antiviral responses by mediating degradation"
        " of the adaptor protein MITA."
    )
    abstract_text = (
        "Viral infection activates transcription factors NF-kappaB and IRF3, which collaborate to"
        " induce type I interferons (IFNs) and elicit innate antiviral response. MITA (also known"
        " as STING) has recently been identified as an adaptor that links virus-sensing receptors"
        " to IRF3 activation. Here, we showed that the E3 ubiquitin ligase RNF5 interacted with"
        " MITA in a viral-infection-dependent manner. Overexpression of RNF5 inhibited"
        " virus-triggered IRF3 activation, IFNB1 expression, and cellular antiviral response,"
        " whereas knockdown of RNF5 had opposite effects. RNF5 targeted MITA at Lys150 for"
        " ubiquitination and degradation after viral infection. Both MITA and RNF5 were located at"
        " the mitochondria and endoplasmic reticulum (ER) and viral infection caused their"
        " redistribution to the ER and mitochondria, respectively. We further found that"
        " virus-induced ubiquitination and degradation of MITA by RNF5 occurred at the"
        " mitochondria. These findings suggest that RNF5 negatively regulates virus-triggered"
        " signaling by targeting MITA for ubiquitination and degradation at the mitochondria."
    )

    title_entities = {
        "6048": schemas.PubtatorEntity(
            mentions=["RNF5"],
            offsets=[(21, 25)],
            label="Gene",
        ),
        "340061": schemas.PubtatorEntity(mentions=["MITA"], offsets=[(104, 108)], label="Gene"),
    }
    abstract_entities = {
        "4790": schemas.PubtatorEntity(
            mentions=["NF-kappaB"],
            offsets=[(158, 167)],
            label="Gene",
        ),
        "3661": schemas.PubtatorEntity(
            mentions=["IRF3", "IRF3", "IRF3"],
            offsets=[(172, 176), (378, 382), (554, 558)],
            label="Gene",
        ),
        "340061": schemas.PubtatorEntity(
            mentions=["MITA", "STING", "MITA", "MITA", "MITA", "MITA", "MITA"],
            offsets=[
                (270, 274),
                (290, 295),
                (461, 465),
                (684, 688),
                (762, 766),
                (1000, 1004),
                (1136, 1140),
            ],
            label="Gene",
        ),
        "6048": schemas.PubtatorEntity(
            mentions=["RNF5", "RNF5", "RNF5", "RNF5", "RNF5", "RNF5", "RNF5"],
            offsets=[
                (440, 444),
                (523, 527),
                (643, 647),
                (670, 674),
                (771, 775),
                (1008, 1012),
                (1071, 1075),
            ],
            label="Gene",
        ),
        "3456": schemas.PubtatorEntity(mentions=["IFNB1"], offsets=[(571, 576)], label="Gene"),
    }
    # For the combined segment, an entity's title mentions come first, then the abstract's.
    both_entities = {
        "6048": schemas.PubtatorEntity(
            mentions=title_entities["6048"].mentions + abstract_entities["6048"].mentions,
            offsets=title_entities["6048"].offsets + abstract_entities["6048"].offsets,
            label="Gene",
        ),
        "340061": schemas.PubtatorEntity(
            mentions=title_entities["340061"].mentions + abstract_entities["340061"].mentions,
            offsets=title_entities["340061"].offsets + abstract_entities["340061"].offsets,
            label="Gene",
        ),
        "4790": schemas.PubtatorEntity(
            mentions=["NF-kappaB"],
            offsets=[(158, 167)],
            label="Gene",
        ),
        "3661": schemas.PubtatorEntity(
            mentions=abstract_entities["3661"].mentions,
            offsets=abstract_entities["3661"].offsets,
            label="Gene",
        ),
        "3456": schemas.PubtatorEntity(mentions=["IFNB1"], offsets=[(571, 576)], label="Gene"),
    }

    def check_segment(expected, segment) -> None:
        """Query PubTator for `segment` and compare the single result against `expected`."""
        actual = util.query_pubtator(pmids=[pmid], concepts=["gene"], text_segment=segment)
        # Breaking up the asserts leads to much clearer outputs when the test fails
        assert len(actual) == 1
        assert actual[expected.pmid].text == expected.text
        assert actual[expected.pmid].entities == expected.entities
        assert actual[expected.pmid].relations == expected.relations

    # Title only
    check_segment(
        schemas.PubtatorAnnotation(
            pmid=pmid,
            text=title_text,
            entities=title_entities,
            relations=[],
        ),
        util.TextSegment.title,
    )

    # Abstract only
    check_segment(
        schemas.PubtatorAnnotation(
            pmid=pmid,
            text=abstract_text,
            entities=abstract_entities,
            relations=[],
        ),
        util.TextSegment.abstract,
    )

    # Both
    check_segment(
        schemas.PubtatorAnnotation(
            pmid=pmid,
            text=f"{title_text} {abstract_text}",
            entities=both_entities,
            relations=[],
        ),
        util.TextSegment.both,
    )