def github_belspec_files(spec_dir, force: bool = False):
    """Get belspec files from Github repo

    Args:
        spec_dir: directory to store the BEL Specification and derived files
        force: force update of BEL Specifications from Github -
            skipped if local files less than 1 day old
    """

    if not force:
        # BUG FIX: compare like with like - os.path.getmtime() is converted by
        # fromtimestamp() to *local* time, so the reference point must be local
        # now() as well (the original used naive utcnow(), skewing the 1-day
        # staleness window by the UTC offset)
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        for fn in glob.glob(f"{spec_dir}/bel*yaml"):
            if datetime.datetime.fromtimestamp(os.path.getmtime(fn)) > yesterday:
                log.info(
                    "Skipping BEL Specification update - specs less than 1 day old"
                )
                return

    repo_url = "https://api.github.com/repos/belbio/bel_specifications/contents/specifications"

    params = {}
    github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "")
    if github_access_token:
        params = {"access_token": github_access_token}

    r = http_client.get(repo_url, params=params)
    if r.status_code != 200:
        sys.exit(
            f"Could not get BEL Spec directory listing from Github -- Status: {r.status_code} Msg: {r.content}"
        )

    for f in r.json():
        url = f["download_url"]
        fn = os.path.basename(url)

        # Normalize *.yml filenames to *.yaml so the staleness glob above finds them
        if "yaml" not in fn and "yml" in fn:
            fn = fn.replace("yml", "yaml")

        r = http_client.get(url, params=params, allow_redirects=True)
        if r.status_code == 200:
            # BUG FIX: context manager closes the file handle promptly
            # (original open(...).write(...) leaked the handle)
            with open(f"{spec_dir}/{fn}", "wb") as fh:
                fh.write(r.content)
        else:
            sys.exit(
                f"Could not get BEL Spec file {url} from Github -- Status: {r.status_code} Msg: {r.content}"
            )
def orthologize_context(
    orthologize_target: str, annotations: List[Mapping[str, Any]]
) -> List[Mapping[str, Any]]:
    """Orthologize context

    Replace the Species annotation with the orthologize target species and
    record the original species as an "OrigSpecies" annotation.

    NOTE: type hints corrected - annotations is a list of annotation dicts
    (it is indexed by position and appended to), not a Mapping. The docstring
    previously claimed an "OrthologizedFrom" annotation type; the code adds
    "OrigSpecies".

    Args:
        orthologize_target: term id of the target species
        annotations: list of annotation dicts with "type", "id" and "label"
            keys; mutated in place

    Returns:
        the (mutated) annotations list
    """

    url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{orthologize_target}'
    r = http_client.get(url)
    species_label = r.json().get("label", "unlabeled")

    orthologized_from = {}
    for idx, annotation in enumerate(annotations):
        if annotation["type"] == "Species":
            # Remember the species we are replacing so it can be recorded below
            orthologized_from = {"id": annotation["id"], "label": annotation["label"]}
            annotations[idx] = {
                "type": "Species",
                "id": orthologize_target,
                "label": species_label,
            }

    if "id" in orthologized_from:
        annotations.append({
            "type": "OrigSpecies",
            "id": f'Orig-{orthologized_from["id"]}',
            "label": f'Orig-{orthologized_from["label"]}',
        })

    return annotations
def get_ebnf_template():
    """Get EBNF template from Github belbio/bel_specifications repo

    Returns:
        str: local filepath of the EBNF template; on download failure a
        previously downloaded copy may be returned (an error is logged if
        no local copy exists at all)
    """
    spec_dir = config["bel"]["lang"]["specifications"]
    local_fp = f"{spec_dir}/bel.ebnf.j2"

    repo_url = (
        "https://api.github.com/repos/belbio/bel_specifications/contents/resources/bel.ebnf.j2"
    )

    params = {}
    github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "")
    if github_access_token:
        params = {"access_token": github_access_token}

    try:
        # Get download url for template file
        r = http_client.get(repo_url, params=params)
        if r.status_code == 200:
            template_url = r.json()["download_url"]

            # Get template file (only attempted if the download url was
            # retrieved - the original fell through with template_url unbound
            # and produced a misleading NameError in the inner handler)
            r = http_client.get(template_url, params=params, allow_redirects=True)
            if r.status_code == 200:
                with open(local_fp, "wt") as fh:
                    fh.write(r.text)
            else:
                log.warning(
                    f"Could not download EBNF file from Github -- Status: {r.status_code} Msg: {r.text}"
                )
        else:
            log.warning("Could not get EBNF file download url from Github")
    except Exception:
        log.warning("Could not download BEL EBNF template file")

    # BUG FIX: original tested f"{spec_dir}/local_fp" - the *literal* string
    # "local_fp" - so this check never looked at the real template path
    if not os.path.exists(local_fp):
        log.error("No BEL EBNF template file available")

    return local_fp
def get_pubmed(pmid: str) -> Mapping[str, Any]:
    """Get pubmed xml for pmid and convert to JSON

    Remove MESH terms if they are duplicated in the compound term set

    ArticleDate vs PubDate gets complicated:
    https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
    see <ArticleDate> and <PubDate>
    Only getting pub_year at this point from the <PubDate> element.

    Args:
        pmid: pubmed id number as a string

    Returns:
        pubmed json
    """

    doc = {
        "abstract": "",
        "pmid": pmid,
        "title": "",
        "authors": [],
        "pub_date": "",
        # NOTE(review): "joural_iso_title" looks like a typo for
        # "journal_iso_title", but downstream consumers may depend on the
        # misspelled key, so it is preserved here - confirm before renaming
        "joural_iso_title": "",
        "journal_title": "",
        "doi": "",
        "compounds": [],
        "mesh": [],
    }

    pubmed_url = PUBMED_TMPL.replace("PMID", str(pmid))
    r = None  # BUG FIX: pre-bind so the handler below can't hit an unbound name
    try:
        r = http_client.get(pubmed_url)
        content = r.content
        log.info(f"Getting Pubmed URL {pubmed_url}")
        root = etree.fromstring(content)
    except Exception as e:
        # If the GET itself raised, r is still None - report "unknown" instead
        # of raising a secondary AttributeError inside the handler
        status = r.status_code if r is not None else "unknown"
        log.exception(
            f"Bad Pubmed request, status: {status} error: {e}",
            url=f'{PUBMED_TMPL.replace("PMID", pmid)}',
        )
        return {"doc": {}, "message": f"Cannot get PMID: {pubmed_url}"}

    # BUG FIX: removed leftover debug print of the PMID
    doc["pmid"] = root.xpath("//PMID/text()")[0]
    if doc["pmid"] != pmid:
        log.error("Requested PMID doesn't match record PMID", url=pubmed_url)

    if root.find("PubmedArticle") is not None:
        doc = parse_journal_article_record(doc, root)
    elif root.find("PubmedBookArticle") is not None:
        doc = parse_book_record(doc, root)

    return doc
def convert_nsarg(
    nsarg: str,
    api_url: str = None,
    namespace_targets: Mapping[str, List[str]] = None,
    canonicalize: bool = False,
    decanonicalize: bool = False,
) -> str:
    """[De]Canonicalize NSArg

    Args:
        nsarg (str): bel statement string or partial string (e.g. subject or object)
        api_url (str): BEL.bio api url to use, e.g. https://api.bel.bio/v1
        namespace_targets (Mapping[str, List[str]]): formatted as in configuration file example
        canonicalize (bool): use canonicalize endpoint/namespace targets
        decanonicalize (bool): use decanonicalize endpoint/namespace targets

    Results:
        str: converted NSArg
    """

    # Fall back to the configured API url when none was supplied
    if not api_url:
        api_url = config["bel_api"]["servers"]["api_url"]
        if not api_url:
            log.error("Missing api url - cannot convert namespace")
            return None

    params = None
    if namespace_targets:
        # Explicit namespace targets take precedence over the flags and
        # always go through the canonicalized endpoint
        params = {"namespace_targets": json.dumps(namespace_targets)}
        endpoint_tmpl = api_url + "/terms/{}/canonicalized"
    elif canonicalize:
        endpoint_tmpl = api_url + "/terms/{}/canonicalized"
    elif decanonicalize:
        endpoint_tmpl = api_url + "/terms/{}/decanonicalized"
    else:
        log.warning("Missing (de)canonical flag - cannot convert namespaces")
        return nsarg

    request_url = endpoint_tmpl.format(url_path_param_quoting(nsarg))
    r = http_client.get(request_url, params=params, timeout=10)

    if r and r.status_code == 200:
        nsarg = r.json().get("term_id", nsarg)
    elif not r or r.status_code == 404:
        log.error(f"[de]Canonicalization endpoint missing: {request_url}")

    return nsarg
def enhance_pubmed_annotations(pubmed: Mapping[str, Any]) -> Mapping[str, Any]:
    """Enhance pubmed namespace IDs

    Add additional entity and annotation types to annotations
    Use preferred id for namespaces as needed
    Add strings from Title, Abstract matching Pubtator BioConcept spans

    NOTE - basically duplicated code with bel_api:api.services.pubmed

    Args:
        pubmed

    Returns:
        pubmed object
    """

    text = pubmed["title"] + pubmed["abstract"]

    enhanced = {}
    for nsarg in pubmed["annotations"]:
        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{url_path_param_quoting(nsarg)}'
        log.info(f"URL: {url}")
        r = http_client.get(url)
        log.info(f"Result: {r}")

        new_nsarg = ""
        if r and r.status_code == 200:
            term = r.json()
            new_nsarg = bel_ast.convert_nsarg(term["id"], decanonicalize=True)

            # Merge the term's entity/annotation types into the annotation
            anno = pubmed["annotations"][nsarg]
            anno["name"] = term["name"]
            anno["label"] = term["label"]
            anno["entity_types"] = list(
                set(anno["entity_types"] + term.get("entity_types", []))
            )
            anno["annotation_types"] = list(
                set(anno["annotation_types"] + term.get("annotation_types", []))
            )

        # Re-key by the decanonicalized id when it differs from the original
        key = new_nsarg if new_nsarg != nsarg else nsarg
        enhanced[key] = copy.deepcopy(pubmed["annotations"][nsarg])

    # Attach the matched text for each span
    # NOTE(review): spans appear to be 1-based offsets - confirm against Pubtator
    for nsarg in enhanced:
        for idx, span in enumerate(enhanced[nsarg]["spans"]):
            enhanced[nsarg]["spans"][idx]["text"] = text[span["begin"] - 1:span["end"] - 1]

    pubmed["annotations"] = copy.deepcopy(enhanced)

    return pubmed
def validate_context(self, context: Mapping[str, Any]) -> Tuple[bool, List[Tuple[str, str]]]:
    """ Validate context

    Args:
        context (Mapping[str, Any]): context dictionary of type, id and label

    Returns:
        Tuple[bool, List[Tuple[str, str]]]:
            bool: Is valid?  Yes = True, No = False
            List[Tuple[str, str]]: Validation issues, empty if valid,
            tuple is ('ERROR|WARNING', msg)
            e.g. [('WARNING', "Context ID not found")]
    """

    url = f'{self.endpoint}/terms/{context["id"]}'
    res = http_client.get(url)

    # Anything other than a 200 means the term could not be resolved
    if res.status_code != 200:
        return (False, [("WARNING", f'Context {context["id"]} not found at {url}')])

    return (True, [])
def get_pubtator(pmid):
    """Get Pubtator Bioconcepts from Pubmed Abstract

    Re-configure the denotations into an annotation dictionary format
    and collapse duplicate terms so that their spans are in a list.

    Args:
        pmid: pubmed id as a string

    Returns:
        pubtator dict with "annotations" replacing "denotations",
        or None if Pubtator could not be reached
    """

    r = http_client.get(PUBTATOR_TMPL.replace("PMID", pmid), timeout=10)
    if r and r.status_code == 200:
        pubtator = r.json()[0]
    else:
        # BUG FIX: r can be falsy here (the guard above proves it), so avoid
        # dereferencing r.status_code and raising AttributeError in the handler
        status = r.status_code if r else "unknown"
        log.error(
            f"Cannot access Pubtator, status: {status} url: {PUBTATOR_TMPL.replace('PMID', pmid)}"
        )
        return None

    known_types = ["CHEBI", "Chemical", "Disease", "Gene", "Species"]

    for idx, anno in enumerate(pubtator["denotations"]):
        s_match = re.match(r"(\w+):(\w+)", anno["obj"])
        c_match = re.match(r"(\w+):(\w+):(\w+)", anno["obj"])

        # Check the three-part form (type:namespace:id) first - the two-part
        # regex also matches a prefix of three-part ids
        if c_match:
            (ctype, namespace, cid) = (c_match.group(1), c_match.group(2), c_match.group(3))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")
            if namespace not in known_types:
                log.info(f"{namespace} not in known_types for Pubtator")

            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(namespace, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(ctype, None)

        elif s_match:
            (ctype, cid) = (s_match.group(1), s_match.group(2))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")

            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(ctype, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(ctype, None)

    # Collapse duplicate objects, collecting all of their spans under one key
    annotations = {}
    for anno in pubtator["denotations"]:
        log.info(anno)
        if anno["obj"] not in annotations:
            annotations[anno["obj"]] = {"spans": [anno["span"]]}
            annotations[anno["obj"]]["entity_types"] = [anno.get("entity_type", [])]
            annotations[anno["obj"]]["annotation_types"] = [anno.get("annotation_type", [])]
        else:
            annotations[anno["obj"]]["spans"].append(anno["span"])

    del pubtator["denotations"]
    pubtator["annotations"] = copy.deepcopy(annotations)

    return pubtator
def validate_arg_values(ast, bo):
    """Recursively validate arg (NSArg and StrArg) values

    Check that NSArgs are found in BELbio API and match appropriate entity_type.
    Check that StrArgs match their value - either default namespace or regex string

    Generate a WARNING if not.

    Args:
        ast: AST node to validate (NSArg, StrArg, or any node with .args to recurse into)
        bo: bel object

    Returns:
        bel object
    """

    # Without an API endpoint we cannot validate terms - return unchanged
    if not bo.api_url:
        log.info("No API endpoint defined")
        return bo

    log.debug(f"AST: {ast}")

    # Test NSArg terms
    if isinstance(ast, NSArg):
        term_id = "{}:{}".format(ast.namespace, ast.value)
        value_types = ast.value_types
        log.debug(f"Value types: {value_types} AST value: {ast.value}")

        # Default namespaces are defined in the bel_specification file
        if ast.namespace == "DEFAULT":  # may use the DEFAULT namespace or not
            for value_type in value_types:
                # Accept either the full name or the abbreviation of a default
                # namespace entry
                default_namespace = [
                    ns["name"] for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [ns["abbreviation"] for ns in bo.spec["namespaces"][value_type]["info"]]

                if ast.value in default_namespace:
                    log.debug("Default namespace valid term: {}".format(term_id))
                    break
            else:  # if for loop doesn't hit the break, run this else
                log.debug("Default namespace invalid term: {}".format(term_id))
                bo.validation_messages.append(("WARNING", f"Default Term: {term_id} not found"))

        # Process normal, non-default-namespace terms
        else:
            request_url = bo.api_url + "/terms/{}".format(url_path_param_quoting(term_id))
            log.info(f"Validate Arg Values url {request_url}")
            r = http_client.get(request_url)

            if r and r.status_code == 200:
                result = r.json()
                # function signature term value_types doesn't match up with API term entity_types
                log.debug(
                    f'AST.value_types {ast.value_types} Entity types {result.get("entity_types", [])}'
                )

                # Check that entity types match
                if len(set(ast.value_types).intersection(result.get("entity_types", []))) == 0:
                    log.debug(
                        "Invalid Term - Assertion term {} allowable entity types: {} do not match API term entity types: {}".format(
                            term_id, ast.value_types, result.get("entity_types", [])
                        )
                    )
                    bo.validation_messages.append(
                        (
                            "WARNING",
                            "Invalid Term - Assertion term {} allowable entity types: {} do not match API term entity types: {}".format(
                                term_id, ast.value_types, result.get("entity_types", [])
                            ),
                        )
                    )

                # Warn when the term id is listed as obsolete by the API
                if term_id in result.get("obsolete_ids", []):
                    bo.validation_messages.append(
                        ("WARNING", f'Obsolete term: {term_id} Current term: {result["id"]}')
                    )

            elif r.status_code == 404:
                bo.validation_messages.append(
                    ("WARNING", f"Term: {term_id} not found in namespace")
                )

            else:
                log.error(f"Status {r.status_code} - Bad URL: {request_url}")

    # Process StrArgs
    if isinstance(ast, StrArg):
        log.debug(f" Check String Arg: {ast.value} {ast.value_types}")
        for value_type in ast.value_types:
            # Is this a regex to match against (value types wrapped in /.../
            # are treated as regex patterns)
            if re.match("/", value_type):
                value_type = re.sub("^/", "", value_type)
                value_type = re.sub("/$", "", value_type)
                match = re.match(value_type, ast.value)
                if match:
                    break
            if value_type in bo.spec["namespaces"]:
                # Accept name or abbreviation from the default namespace
                default_namespace = [
                    ns["name"] for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [ns["abbreviation"] for ns in bo.spec["namespaces"][value_type]["info"]]
                if ast.value in default_namespace:
                    break
        else:  # If for loop doesn't hit the break, no matches found, therefore for StrArg value is bad
            bo.validation_messages.append(
                (
                    "WARNING",
                    f"String value {ast.value} does not match default namespace value or regex pattern: {ast.value_types}",
                )
            )

    # Recursively process every NSArg by processing BELAst and Functions
    if hasattr(ast, "args"):
        for arg in ast.args:
            validate_arg_values(arg, bo)

    return bo
def process_nanopub(nanopub_url,
                    orthologize_targets: list = [],
                    overwrite: bool = False,
                    token: str = None):
    """Fetch a nanopub by URL, convert it to edges and load them into the edgestore.

    NOTE(review): orthologize_targets uses a mutable default argument - it is
    only passed through to nanopub_to_edges here, but confirm no callee
    mutates it.

    Args:
        nanopub_url: URL of the nanopub; the last path component is used as
            the nanopub id
        orthologize_targets: species targets passed through to
            bel.edge.edges.nanopub_to_edges
        overwrite: when False, skip processing if the edgestore already holds
            an edge whose nanopub is at least as new (compared via gd_updateTS)
        token: optional Bearer token for the nanopub GET request

    Returns:
        Mapping with processing status - msg, edges_cnt, assertions_cnt,
        assertions, success and errors keys (the "skipped, older" path returns
        msg, success and "e" keys instead)
    """
    log.debug(
        "Process nanopub parameters",
        nanopub_url=nanopub_url,
        orthologize_targets=orthologize_targets,
        overwrite=overwrite,
    )
    log.info("Processing nanopub", nanopub_url=nanopub_url)

    url_comps = urllib.parse.urlparse(nanopub_url)
    nanopub_id = os.path.basename(url_comps.path)
    # domain = url_comps.netloc

    start_time = datetime.datetime.now()

    # collect nanopub
    headers = {}
    if token:
        headers = {"Authorization": f"Bearer {token}"}
    r = http_client.get(nanopub_url, headers=headers)
    nanopub = r.json()

    end_time1 = datetime.datetime.now()
    delta_ms = f"{(end_time1 - start_time).total_seconds() * 1000:.1f}"
    log.debug("Timing - Get nanopub", delta_ms=delta_ms, nanopub=nanopub)

    # Human-readable assertion strings, included in every status result
    assertions = []
    for assertion in nanopub["nanopub"].get("assertions", []):
        assertions.append(
            f"{assertion['subject']} {assertion['relation']} {assertion['object']}"
        )

    if not nanopub:
        log.error(f"Could not GET nanopub: {nanopub_url}")
        return {
            "msg": f"Could not GET nanopub: {nanopub_url}",
            "edges_cnt": 0,
            "assertions_cnt": 0,
            "assertions": assertions,
            "success": False,
            "errors": [],
        }

    nanopub["source_url"] = nanopub_url

    # Is nanopub in edge newer than from queue? If so, skip
    if not overwrite:
        # collect one edge for nanopub from edgestore
        edge = get_edges_for_nanopub(nanopub_id)
        if edge:
            # check if edge nanopub is newer
            # log.info("Nanopub to Edge comparison", nanopub=nanopub, edge=edge)
            if edge["metadata"].get("gd_updateTS", None):
                if nanopub["nanopub"]["metadata"]["gd_updateTS"] <= edge[
                        "metadata"]["gd_updateTS"]:
                    log.info(
                        "Nanopub older than edge nanopub",
                        nanopub_dt=nanopub["nanopub"]["metadata"]["gd_updateTS"],
                        edge_dt=edge["metadata"]["gd_updateTS"],
                    )
                    return {
                        "msg": "Nanopub older than edge nanopub",
                        "success": True,
                        "e": ""
                    }

    # end_time2 must be set even when overwrite=True - it anchors the next
    # timing delta below
    end_time2 = datetime.datetime.now()
    delta_ms = f"{(end_time2 - end_time1).total_seconds() * 1000:.1f}"
    log.debug("Timing - Get edge to check nanopub", delta_ms=delta_ms)

    results = bel.edge.edges.nanopub_to_edges(
        nanopub, orthologize_targets=orthologize_targets)

    end_time3 = datetime.datetime.now()
    delta_ms = f"{(end_time3 - end_time2).total_seconds() * 1000:.1f}"

    if results["success"]:
        # db_results is only consumed by the commented-out log below
        db_results = load_edges_into_db(nanopub_id,
                                        nanopub["source_url"],
                                        edges=results["edges"])
        # log.info("Convert nanopub to edges", db_results=db_results, results=results)

        end_time4 = datetime.datetime.now()
        delta_ms = f"{(end_time4 - end_time3).total_seconds() * 1000:.1f}"
        log.debug("Timing - Load edges into edgestore", delta_ms=delta_ms)
        delta_ms = f"{(end_time4 - start_time).total_seconds() * 1000:.1f}"
        log.debug("Timing - Process nanopub into edges", delta_ms=delta_ms)

        return {
            "msg": f"Loaded {len(results['edges'])} edges into edgestore",
            "edges_cnt": len(results["edges"]),
            "assertions_cnt": len(nanopub["nanopub"]["assertions"]),
            "assertions": assertions,
            "success": True,
            "errors": results["errors"],
        }
    else:
        log.error(
            f'Could not process nanopub {nanopub_id} into edges - error: {results["errors"]}'
        )
        return {
            "msg": f'Could not process nanopub into edges - error: {results["errors"]}',
            "edges_cnt": 0,
            "assertions_cnt": len(nanopub["nanopub"]["assertions"]),
            "assertions": assertions,
            "success": False,
            "errors": results["errors"],
        }
def nsarg_completions(
    completion_text: str,
    entity_types: list,
    bel_spec: BELSpec,
    namespace: str,
    species_id: str,
    bel_fmt: str,
    size: int,
):
    """Namespace completions

    Args:
        completion_text
        entity_types: used to filter namespace search results
        bel_spec: used to search default namespaces
        namespace: used to filter namespace search results
        species_id: used to filter namespace search results
        bel_fmt: used to select full name or abbrev for default namespaces
        size: how many completions to return

    Results:
        list of replacement text objects
    """

    minimal_nsarg_completion_len = 1

    species = [species_id]
    namespaces = [namespace]
    replace_list = []

    # BUG FIX: treat user-entered completion text as a literal string when
    # matching - unescaped input like "p(" raised re.error in re.match below
    pattern = re.escape(completion_text)

    if len(completion_text) >= minimal_nsarg_completion_len:
        # Use BEL.bio API module if running bel module in BEL.bio API, otherwise call BEL.bio API endpoint
        # is there a better way to handle this?

        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/completions/{url_path_param_quoting(completion_text)}'
        params = {
            "size": size,
            "entity_types": entity_types,
            "namespaces": namespaces,
            "species": species,
        }

        # The species filter doesn't apply when completing Species terms themselves
        if "Species" in entity_types:
            params.pop("species", "")

        log.info(
            "NSArg completion",
            api_url=config["bel_api"]["servers"]["api_url"],
            url=url,
            params=params,
        )

        r = http_client.get(url, params=params)
        if r.status_code == 200:
            ns_completions = r.json()
        else:
            log.error(f"Status code of {r.status_code} for {url}")
            ns_completions = {}

        for complete in ns_completions.get("completions", []):
            replace_list.append({
                "replacement": complete["id"],
                "label": f"{complete['id']} ({complete['label']})",
                "highlight": complete["highlight"][-1],
                "type": "NSArg",
            })

    # Check default namespaces - matches are inserted ahead of API results
    for entity_type in entity_types:
        default_namespace = bel_spec["namespaces"].get(entity_type, [])
        if default_namespace:
            for obj in default_namespace["info"]:
                replacement = None
                if bel_fmt == "long" and re.match(pattern, obj["name"], re.IGNORECASE):
                    replacement = obj["name"]
                elif bel_fmt in ["short", "medium"] and re.match(
                        pattern, obj["abbreviation"], re.IGNORECASE):
                    replacement = obj["abbreviation"]

                if replacement:
                    highlight = replacement.replace(
                        completion_text, f"<em>{completion_text}</em>")
                    replace_list.insert(
                        0,
                        {
                            "replacement": replacement,
                            "label": replacement,
                            "highlight": highlight,
                            "type": "NSArg",
                        },
                    )

    return replace_list[:size]