def test_get_url():
    """Verify that utils.get_url caches HTTP responses.

    Fetches the same efetch URL twice: the first call primes the request
    cache, the second must be served from it (`from_cache` is True).
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=19894120"
    # First call primes the cache; second call should hit the cache.
    r = utils.get_url(url)
    r = utils.get_url(url)
    assert r.from_cache
def orthologize_context(
        orthologize_target: str,
        annotations: List[Mapping[str, Any]]) -> List[Mapping[str, Any]]:
    """Orthologize context

    Replace the Species annotation with the orthologize target and record
    the original species as an "OrigSpecies" annotation.

    Args:
        orthologize_target: term id of the species to orthologize to
        annotations: list of annotation dicts; mutated in place
            (the previous hints said Mapping, but the code indexes,
            re-assigns and appends — this is a list)

    Returns:
        the (mutated) annotations list
    """
    url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{orthologize_target}'
    r = utils.get_url(url)
    # Fall back to a placeholder label if the term record has none
    species_label = r.json().get("label", "unlabeled")

    orthologized_from = {}
    for idx, annotation in enumerate(annotations):
        if annotation["type"] == "Species":
            # Remember the species we are replacing
            orthologized_from = {
                "id": annotation["id"],
                "label": annotation["label"],
            }
            annotations[idx] = {
                "type": "Species",
                "id": orthologize_target,
                "label": species_label,
            }

    # Only add OrigSpecies if a Species annotation was actually replaced
    if "id" in orthologized_from:
        annotations.append({
            "type": "OrigSpecies",
            "id": f'Orig-{orthologized_from["id"]}',
            "label": f'Orig-{orthologized_from["label"]}',
        })

    return annotations
def orthologize_context(orthologize_target: str,
                        annotations: Mapping[str, Any]) -> Mapping[str, Any]:
    """Orthologize context

    Replace Species context with new orthologize target and add a
    annotation type of OrthologizedFrom.

    NOTE(review): this module contains a second, functionally identical
    definition of this function — the duplicate should probably be removed.
    """
    terms_url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{orthologize_target}'
    resp = utils.get_url(terms_url)
    target_label = resp.json().get("label", "unlabeled")

    prior_species = {}
    for pos, anno in enumerate(annotations):
        if anno["type"] != "Species":
            continue
        prior_species = {"id": anno["id"], "label": anno["label"]}
        annotations[pos] = {
            "type": "Species",
            "id": orthologize_target,
            "label": target_label,
        }

    if "id" in prior_species:
        annotations.append({
            "type": "OrigSpecies",
            "id": f'Orig-{prior_species["id"]}',
            "label": f'Orig-{prior_species["label"]}',
        })

    return annotations
def enhance_pubmed_annotations(pubmed: Mapping[str, Any]) -> Mapping[str, Any]:
    """Enhance pubmed namespace IDs

    Add additional entity and annotation types to annotations.
    Use preferred id for namespaces as needed.
    Add strings from Title, Abstract matching Pubtator BioConcept spans.

    NOTE - basically duplicated code with bel_api:api.services.pubmed

    Args:
        pubmed: pubmed doc with "title", "abstract" and "annotations" keys

    Returns:
        pubmed object with enhanced annotations (also mutated in place)
    """
    text = pubmed["title"] + pubmed["abstract"]

    annotations = {}

    for nsarg in pubmed["annotations"]:
        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{url_path_param_quoting(nsarg)}'
        log.info(f"URL: {url}")
        r = get_url(url)
        log.info(f"Result: {r}")

        new_nsarg = ""
        if r and r.status_code == 200:
            term = r.json()
            new_nsarg = bel_utils.convert_nsarg(term["id"], decanonicalize=True)

            pubmed["annotations"][nsarg]["name"] = term["name"]
            pubmed["annotations"][nsarg]["label"] = term["label"]
            pubmed["annotations"][nsarg]["entity_types"] = list(
                set(pubmed["annotations"][nsarg]["entity_types"] +
                    term.get("entity_types", [])))
            pubmed["annotations"][nsarg]["annotation_types"] = list(
                set(pubmed["annotations"][nsarg]["annotation_types"] +
                    term.get("annotation_types", [])))

        # BUGFIX: only re-key when the lookup produced a different preferred
        # id. Previously a failed lookup left new_nsarg == "" and the
        # annotation was re-keyed under the empty string, losing its key.
        if new_nsarg and new_nsarg != nsarg:
            annotations[new_nsarg] = copy.deepcopy(pubmed["annotations"][nsarg])
        else:
            annotations[nsarg] = copy.deepcopy(pubmed["annotations"][nsarg])

    for nsarg in annotations:
        for idx, span in enumerate(annotations[nsarg]["spans"]):
            # Spans appear to be 1-based offsets into title+abstract
            # -- TODO confirm against Pubtator span convention
            string = text[span["begin"] - 1:span["end"] - 1]
            annotations[nsarg]["spans"][idx]["text"] = string

    pubmed["annotations"] = copy.deepcopy(annotations)

    return pubmed
def get_pubmed(pmid: str) -> Mapping[str, Any]:
    """Get pubmed xml for pmid and convert to JSON

    Remove MESH terms if they are duplicated in the compound term set.

    ArticleDate vs PubDate gets complicated:
    https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
    see <ArticleDate> and <PubDate>. Only getting pub_year at this point
    from the <PubDate> element.

    Args:
        pmid: pubmed id number as a string

    Returns:
        pubmed json
    """
    doc = {
        "abstract": "",
        "pmid": pmid,
        "title": "",
        "authors": [],
        "pub_date": "",
        # NOTE(review): "joural_iso_title" looks like a typo for
        # "journal_iso_title" but is kept — downstream consumers may
        # depend on this exact key.
        "joural_iso_title": "",
        "journal_title": "",
        "doi": "",
        "compounds": [],
        "mesh": [],
    }

    pubmed_url = PUBMED_TMPL.replace("PMID", str(pmid))
    r = None
    try:
        r = get_url(pubmed_url)
        content = r.content
        log.info(f"Getting Pubmed URL {pubmed_url}")
        root = etree.fromstring(content)
    except Exception as e:
        # BUGFIX: r may be None (or the request itself may have raised),
        # so guard before reading status_code.
        status = r.status_code if r is not None else "unknown"
        log.error(
            f"Bad Pubmed request, status: {status} error: {e}",
            url=f'{PUBMED_TMPL.replace("PMID", pmid)}',
        )
        return {"doc": {}, "message": f"Cannot get PMID: {pubmed_url}"}

    doc["pmid"] = root.xpath("//PMID/text()")[0]
    # (removed leftover debug print of the PMID)
    if doc["pmid"] != pmid:
        log.error("Requested PMID doesn't match record PMID", url=pubmed_url)

    # Journal articles and book records have different XML layouts
    if root.find("PubmedArticle") is not None:
        doc = parse_journal_article_record(doc, root)
    elif root.find("PubmedBookArticle") is not None:
        doc = parse_book_record(doc, root)

    return doc
def convert_nsarg(
    nsarg: str,
    api_url: str = None,
    namespace_targets: Mapping[str, List[str]] = None,
    canonicalize: bool = False,
    decanonicalize: bool = False,
) -> str:
    """[De]Canonicalize NSArg

    Args:
        nsarg (str): bel statement string or partial string (e.g. subject or object)
        api_url (str): BEL.bio api url to use, e.g. https://api.bel.bio/v1
        namespace_targets (Mapping[str, List[str]]): formatted as in configuration file example
        canonicalize (bool): use canonicalize endpoint/namespace targets
        decanonicalize (bool): use decanonicalize endpoint/namespace targets

    Results:
        str: converted NSArg
    """
    # Fall back to the configured API url when none was supplied
    if not api_url:
        api_url = config["bel_api"]["servers"]["api_url"]
        if not api_url:
            log.error("Missing api url - cannot convert namespace")
            return None

    # namespace_targets are passed to the endpoint as a JSON-encoded param
    params = None
    if namespace_targets:
        namespace_targets_str = json.dumps(namespace_targets)
        params = {"namespace_targets": namespace_targets_str}

    if not namespace_targets:
        if canonicalize:
            api_url = api_url + "/terms/{}/canonicalized"
        elif decanonicalize:
            api_url = api_url + "/terms/{}/decanonicalized"
        else:
            # Neither flag set and no targets given — nothing to do
            log.warning(
                "Missing (de)canonical flag - cannot convert namespaces")
            return nsarg
    else:
        # NOTE(review): when namespace_targets is given, the canonicalized
        # endpoint is used even if decanonicalize=True — the original
        # comment says this override is intentional; confirm.
        api_url = api_url + "/terms/{}/canonicalized"  # overriding with namespace_targets

    request_url = api_url.format(url_path_param_quoting(nsarg))

    r = get_url(request_url, params=params, timeout=10)
    if r and r.status_code == 200:
        # Use the converted term id; keep the original on a missing key
        nsarg = r.json().get("term_id", nsarg)
    elif not r or r.status_code == 404:
        log.error(f"[de]Canonicalization endpoint missing: {request_url}")

    # Returns the original nsarg unchanged on any non-200 response
    return nsarg
def validate_arg_values(ast, bo):
    """Recursively validate arg (NSArg and StrArg) values

    Check that NSArgs are found in BELbio API and match appropriate entity_type.
    Check that StrArgs match their value - either default namespace or regex string

    Generate a WARNING if not.

    Args:
        bo: bel object (validation messages are appended to
            bo.validation_messages as ("WARNING", msg) tuples)

    Returns:
        bel object
    """
    # Without an API endpoint there is nothing to validate against
    if not bo.api_url:
        log.info("No API endpoint defined")
        return bo

    log.debug(f"AST: {ast}")

    # Test NSArg terms
    if isinstance(ast, NSArg):
        term_id = "{}:{}".format(ast.namespace, ast.value)
        value_types = ast.value_types
        log.debug(f"Value types: {value_types} AST value: {ast.value}")

        # Default namespaces are defined in the bel_specification file
        if ast.namespace == "DEFAULT":  # may use the DEFAULT namespace or not
            for value_type in value_types:
                # Accept either the full name or the abbreviation of a
                # default-namespace entry
                default_namespace = [
                    ns["name"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [
                    ns["abbreviation"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ]

                if ast.value in default_namespace:
                    log.debug(
                        "Default namespace valid term: {}".format(term_id))
                    break
            else:  # if for loop doesn't hit the break, run this else
                log.debug("Default namespace invalid term: {}".format(term_id))
                bo.validation_messages.append(
                    ("WARNING", f"Default Term: {term_id} not found"))

        # Process normal, non-default-namespace terms
        else:
            request_url = bo.api_url + "/terms/{}".format(
                url_path_param_quoting(term_id))
            log.info(f"Validate Arg Values url {request_url}")
            r = get_url(request_url)
            if r and r.status_code == 200:
                result = r.json()
                # function signature term value_types doesn't match up with API term entity_types
                log.debug(
                    f'AST.value_types {ast.value_types} Entity types {result.get("entity_types", [])}'
                )

                # Check that entity types match
                if len(
                        set(ast.value_types).intersection(
                            result.get("entity_types", []))) == 0:
                    log.debug(
                        "Invalid Term - statement term {} allowable entity types: {} do not match API term entity types: {}"
                        .format(term_id, ast.value_types,
                                result.get("entity_types", [])))
                    bo.validation_messages.append((
                        "WARNING",
                        "Invalid Term - statement term {} allowable entity types: {} do not match API term entity types: {}"
                        .format(term_id, ast.value_types,
                                result.get("entity_types", [])),
                    ))

                # Warn when the statement uses a term id the API marks obsolete
                if term_id in result.get("obsolete_ids", []):
                    bo.validation_messages.append((
                        "WARNING",
                        f'Obsolete term: {term_id} Current term: {result["id"]}'
                    ))

            elif r.status_code == 404:
                bo.validation_messages.append(
                    ("WARNING", f"Term: {term_id} not found in namespace"))

            else:
                log.error(f"Status {r.status_code} - Bad URL: {request_url}")

    # Process StrArgs
    if isinstance(ast, StrArg):
        log.debug(f" Check String Arg: {ast.value} {ast.value_types}")
        for value_type in ast.value_types:
            # Is this a regex to match against
            if re.match("/", value_type):
                value_type = re.sub("^/", "", value_type)
                value_type = re.sub("/$", "", value_type)
                match = re.match(value_type, ast.value)
                if match:
                    break
            if value_type in bo.spec["namespaces"]:
                default_namespace = [
                    ns["name"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [
                    ns["abbreviation"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ]
                if ast.value in default_namespace:
                    break
        else:  # If for loop doesn't hit the break, no matches found, therefore for StrArg value is bad
            bo.validation_messages.append((
                "WARNING",
                f"String value {ast.value} does not match default namespace value or regex pattern: {ast.value_types}",
            ))

    # Recursively process every NSArg by processing BELAst and Functions
    if hasattr(ast, "args"):
        for arg in ast.args:
            validate_arg_values(arg, bo)

    return bo
def nsarg_completions(
    completion_text: str,
    entity_types: list,
    bel_spec: BELSpec,
    namespace: str,
    species_id: str,
    bel_fmt: str,
    size: int,
):
    """Namespace completions

    Args:
        completion_text: prefix/string to complete against
        entity_types: used to filter namespace search results
        bel_spec: used to search default namespaces
        namespace: used to filter namespace search results
        species_id: used to filter namespace search results
        bel_fmt: used to select full name or abbrev for default namespaces
        size: how many completions to return

    Results:
        list of replacement text objects
    """
    minimal_nsarg_completion_len = 1

    species = [species_id]
    namespaces = [namespace]
    replace_list = []

    if len(completion_text) >= minimal_nsarg_completion_len:
        # Use BEL.bio API module if running bel module in BEL.bio API,
        # otherwise call BEL.bio API endpoint - is there a better way to handle this?
        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/completions/{url_path_param_quoting(completion_text)}'
        params = {
            "size": size,
            "entity_types": entity_types,
            "namespaces": namespaces,
            "species": species,
        }

        r = get_url(url, params=params)
        # BUGFIX: get_url may return None on failure — guard before
        # dereferencing status_code (matches other call sites in this file).
        if r and r.status_code == 200:
            ns_completions = r.json()
        else:
            status = r.status_code if r is not None else "no response"
            log.error(f"Status code of {status} for {url}")
            ns_completions = {}

        for complete in ns_completions.get("completions", []):
            replace_list.append({
                "replacement": complete["id"],
                "label": f"{complete['id']} ({complete['label']})",
                "highlight": complete["highlight"][-1],
                "type": "NSArg",
            })

    # Check default namespaces (name for long form, abbreviation otherwise);
    # default-namespace matches are prioritized ahead of API completions
    for entity_type in entity_types:
        default_namespace = bel_spec["namespaces"].get(entity_type, [])
        if default_namespace:
            for obj in default_namespace["info"]:
                replacement = None
                if bel_fmt == "long" and re.match(completion_text, obj["name"],
                                                  re.IGNORECASE):
                    replacement = obj["name"]
                elif bel_fmt in ["short", "medium"] and re.match(
                        completion_text, obj["abbreviation"], re.IGNORECASE):
                    replacement = obj["abbreviation"]

                if replacement:
                    highlight = replacement.replace(
                        completion_text, f"<em>{completion_text}</em>")
                    replace_list.insert(
                        0,
                        {
                            "replacement": replacement,
                            "label": replacement,
                            "highlight": highlight,
                            "type": "NSArg",
                        },
                    )

    return replace_list[:size]
def get_pubtator(pmid):
    """Get Pubtator Bioconcepts from Pubmed Abstract

    Re-configure the denotations into an annotation dictionary format
    and collapse duplicate terms so that their spans are in a list.

    Args:
        pmid: pubmed id as a string

    Returns:
        pubtator dict with "annotations" replacing "denotations",
        or None if Pubtator could not be reached
    """
    r = get_url(PUBTATOR_TMPL.replace("PMID", pmid), timeout=10)
    if r and r.status_code == 200:
        pubtator = r.json()[0]
    else:
        # BUGFIX: r may be None when the request failed entirely — guard
        # before reading status_code (previously raised AttributeError here).
        status = r.status_code if r is not None else "no response"
        log.error(
            f"Cannot access Pubtator, status: {status} url: {PUBTATOR_TMPL.replace('PMID', pmid)}"
        )
        return None

    known_types = ["CHEBI", "Chemical", "Disease", "Gene", "Species"]

    for idx, anno in enumerate(pubtator["denotations"]):
        # Objects are either "type:id" or "type:namespace:id" — try the
        # three-part form first since the two-part regex also matches it
        s_match = re.match(r"(\w+):(\w+)", anno["obj"])
        c_match = re.match(r"(\w+):(\w+):(\w+)", anno["obj"])
        if c_match:
            (ctype, namespace, cid) = (c_match.group(1), c_match.group(2),
                                       c_match.group(3))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")
            if namespace not in known_types:
                log.info(f"{namespace} not in known_types for Pubtator")

            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(namespace, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(
                    ctype, None)
        elif s_match:
            (ctype, cid) = (s_match.group(1), s_match.group(2))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")

            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(ctype, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(
                    ctype, None)

    # Collapse duplicate terms so their spans collect into one list
    annotations = {}
    for anno in pubtator["denotations"]:
        log.info(anno)
        if anno["obj"] not in annotations:
            annotations[anno["obj"]] = {"spans": [anno["span"]]}
            annotations[anno["obj"]]["entity_types"] = [
                anno.get("entity_type", [])
            ]
            annotations[anno["obj"]]["annotation_types"] = [
                anno.get("annotation_type", [])
            ]
        else:
            annotations[anno["obj"]]["spans"].append(anno["span"])

    del pubtator["denotations"]
    pubtator["annotations"] = copy.deepcopy(annotations)

    return pubtator
def get_pubmed(pmid: str) -> Mapping[str, Any]:
    """Get pubmed xml for pmid and convert to JSON

    Remove MESH terms if they are duplicated in the compound term set.

    ArticleDate vs PubDate gets complicated:
    https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
    see <ArticleDate> and <PubDate>. Only getting pub_year at this point
    from the <PubDate> element.

    NOTE(review): this module defines get_pubmed twice; this later
    definition shadows the earlier one — confirm which is intended.

    Args:
        pmid: pubmed id number as a string

    Returns:
        pubmed json
    """
    pubmed_url = PUBMED_TMPL.replace("PMID", str(pmid))
    r = get_url(pubmed_url)
    log.info(f"Getting Pubmed URL {pubmed_url}")

    try:
        root = etree.fromstring(r.content)

        doc = {"abstract": ""}
        doc["pmid"] = root.xpath("//PMID/text()")[0]
        doc["title"] = next(iter(root.xpath("//ArticleTitle/text()")), "")

        # TODO https://stackoverflow.com/questions/4770191/lxml-etree-element-text-doesnt-return-the-entire-text-from-an-element
        # (removed leftover debug print of the first AbstractText node)
        for abstracttext in root.xpath("//Abstract/AbstractText"):
            abstext = node_text(abstracttext)
            label = abstracttext.get("Label", None)
            if label:
                doc["abstract"] += f"{label}: {abstext}\n"
            else:
                doc["abstract"] += f"{abstext}\n"
        doc["abstract"] = doc["abstract"].rstrip()

        doc["authors"] = []
        for author in root.xpath("//Author"):
            last_name = next(iter(author.xpath("LastName/text()")), "")
            first_name = next(iter(author.xpath("ForeName/text()")), "")
            initials = next(iter(author.xpath("Initials/text()")), "")
            # Some records only carry initials instead of a forename
            if not first_name and initials:
                first_name = initials
            doc["authors"].append(f"{last_name}, {first_name}")

        pub_year = next(
            iter(root.xpath("//Journal/JournalIssue/PubDate/Year/text()")),
            None)
        pub_mon = next(
            iter(root.xpath("//Journal/JournalIssue/PubDate/Month/text()")),
            "Jan")
        pub_day = next(
            iter(root.xpath("//Journal/JournalIssue/PubDate/Day/text()")),
            "01")
        pub_date = process_pub_date(pub_year, pub_mon, pub_day)

        doc["pub_date"] = pub_date
        doc["journal_title"] = next(
            iter(root.xpath("//Journal/Title/text()")), "")
        # NOTE(review): "joural_iso_title" key kept as-is (likely a typo for
        # journal_iso_title) since consumers may rely on this exact key
        doc["joural_iso_title"] = next(
            iter(root.xpath("//Journal/ISOAbbreviation/text()")), "")
        doc["doi"] = next(
            iter(root.xpath('//ArticleId[@IdType="doi"]/text()')), None)

        doc["compounds"] = []
        for chem in root.xpath("//ChemicalList/Chemical/NameOfSubstance"):
            chem_id = chem.get("UI")
            doc["compounds"].append({
                "id": f"MESH:{chem_id}",
                "name": chem.text
            })

        # Skip MESH headings already listed in the compound set
        compounds = [cmpd["id"] for cmpd in doc["compounds"]]
        doc["mesh"] = []
        for mesh in root.xpath("//MeshHeading/DescriptorName"):
            mesh_id = f"MESH:{mesh.get('UI')}"
            if mesh_id in compounds:
                continue
            doc["mesh"].append({"id": mesh_id, "name": mesh.text})

        return doc

    except Exception as e:
        # BUGFIX: r may be None when get_url failed — guard before reading
        # status_code so the real error is reported instead of AttributeError.
        status = r.status_code if r is not None else "no response"
        log.error(
            f"Bad Pubmed request, status: {status} error: {e}",
            url=f'{PUBMED_TMPL.replace("PMID", pmid)}',
        )
        return {"message": f"Cannot get PMID: {pubmed_url}"}