Example #1
def github_belspec_files(spec_dir, force: bool = False):
    """Get belspec files from Github repo


    Args:
        spec_dir: directory to store the BEL Specification and derived files
        force: force update of BEL Specifications from Github - skipped if local files less than 1 day old
    """

    if not force:
        dtnow = datetime.datetime.utcnow()
        delta = datetime.timedelta(1)
        yesterday = dtnow - delta

        for fn in glob.glob(f"{spec_dir}/bel*yaml"):
            if datetime.datetime.fromtimestamp(
                    os.path.getmtime(fn)) > yesterday:
                log.info(
                    "Skipping BEL Specification update - specs less than 1 day old"
                )
                return

    repo_url = "https://api.github.com/repos/belbio/bel_specifications/contents/specifications"
    params = {}
    github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "")
    if github_access_token:
        params = {"access_token": github_access_token}

    r = http_client.get(repo_url, params=params)
    if r.status_code == 200:
        results = r.json()
        for f in results:
            url = f["download_url"]
            fn = os.path.basename(url)

            if "yaml" not in fn and "yml" in fn:
                fn = fn.replace("yml", "yaml")

            r = http_client.get(url, params=params, allow_redirects=True)
            if r.status_code == 200:
                open(f"{spec_dir}/{fn}", "wb").write(r.content)
            else:
                sys.exit(
                    f"Could not get BEL Spec file {url} from Github -- Status: {r.status_code}  Msg: {r.content}"
                )
    else:
        sys.exit(
            f"Could not get BEL Spec directory listing from Github -- Status: {r.status_code}  Msg: {r.content}"
        )
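A minimal usage sketch for the function above; the directory path is hypothetical and github_belspec_files is assumed to be in scope:

spec_dir = "/tmp/bel_specs"  # hypothetical local directory for spec files
github_belspec_files(spec_dir)              # no-op if local specs are less than 1 day old
github_belspec_files(spec_dir, force=True)  # re-download regardless of file age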
Example #2
File: edges.py  Project: sailfish009/bel-1
def orthologize_context(orthologize_target: str,
                        annotations: List[Mapping[str, Any]]) -> List[Mapping[str, Any]]:
    """Orthologize context

    Replace Species context with new orthologize target and add a annotation type of OrthologizedFrom
    """

    url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{orthologize_target}'
    r = http_client.get(url)
    species_label = r.json().get("label", "unlabeled")

    orthologized_from = {}
    for idx, annotation in enumerate(annotations):
        if annotation["type"] == "Species":
            orthologized_from = {
                "id": annotation["id"],
                "label": annotation["label"]
            }
            annotations[idx] = {
                "type": "Species",
                "id": orthologize_target,
                "label": species_label
            }

    if "id" in orthologized_from:
        annotations.append({
            "type": "OrigSpecies",
            "id": f'Orig-{orthologized_from["id"]}',
            "label": f'Orig-{orthologized_from["label"]}',
        })

    return annotations
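A sketch of the annotation shapes handled above; the species IDs and labels are hypothetical, and the new Species label is actually looked up from the terms API:

annotations = [{"type": "Species", "id": "TAX:9606", "label": "human"}]  # hypothetical input
annotations = orthologize_context("TAX:10090", annotations)
# Roughly:
# [{"type": "Species", "id": "TAX:10090", "label": "<label from API>"},
#  {"type": "OrigSpecies", "id": "Orig-TAX:9606", "label": "Orig-human"}]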
Example #3
def get_ebnf_template():
    """Get EBNF template from Github belbio/bel_specifications repo"""

    spec_dir = config["bel"]["lang"]["specifications"]
    local_fp = f"{spec_dir}/bel.ebnf.j2"

    repo_url = (
        "https://api.github.com/repos/belbio/bel_specifications/contents/resources/bel.ebnf.j2"
    )

    params = {}
    github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "")
    if github_access_token:
        params = {"access_token": github_access_token}

    try:
        # Get download url for template file
        r = http_client.get(repo_url, params=params)

        if r.status_code == 200:
            template_url = r.json()["download_url"]
        else:
            log.warning("Could not get EBNF file download url from Github")
            raise RuntimeError("Could not get EBNF file download url from Github")

        # Get template file
        try:
            r = http_client.get(template_url,
                                params=params,
                                allow_redirects=True)
            if r.status_code == 200:
                open(local_fp, "wt").write(r.text)
            else:
                log.warning(
                    f"Could not download EBNF file from Github -- Status: {r.status_code}  Msg: {r.text}"
                )

        except Exception as e:
            log.warning(f"Could not download EBNF file from Github -- Msg: {e}")

    except Exception as e:
        log.warning("Could not download BEL EBNF template file")
        if not os.path.exists(f"{spec_dir}/local_fp"):
            log.error("No BEL EBNF template file available")

    return local_fp
Example #4
File: pubmed.py  Project: sailfish009/bel-1
def get_pubmed(pmid: str) -> Mapping[str, Any]:
    """Get pubmed xml for pmid and convert to JSON

    Remove MESH terms if they are duplicated in the compound term set

    ArticleDate vs PubDate gets complicated: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html see <ArticleDate> and <PubDate>
    Only getting pub_year at this point from the <PubDate> element.

    Args:
        pmid: pubmed id number as a string

    Returns:
        pubmed json
    """

    doc = {
        "abstract": "",
        "pmid": pmid,
        "title": "",
        "authors": [],
        "pub_date": "",
        "joural_iso_title": "",
        "journal_title": "",
        "doi": "",
        "compounds": [],
        "mesh": [],
    }

    try:
        pubmed_url = PUBMED_TMPL.replace("PMID", str(pmid))
        r = http_client.get(pubmed_url)
        content = r.content
        log.info(f"Getting Pubmed URL {pubmed_url}")
        root = etree.fromstring(content)

    except Exception as e:
        log.exception(
            f"Bad Pubmed request, error: {e}",
            url=f'{PUBMED_TMPL.replace("PMID", pmid)}',
        )
        return {"doc": {}, "message": f"Cannot get PMID: {pmid}"}

    doc["pmid"] = root.xpath("//PMID/text()")[0]
    print("PMID", doc["pmid"])

    if doc["pmid"] != pmid:
        log.error("Requested PMID doesn't match record PMID", url=pubmed_url)

    if root.find("PubmedArticle") is not None:
        doc = parse_journal_article_record(doc, root)
    elif root.find("PubmedBookArticle") is not None:
        doc = parse_book_record(doc, root)

    return doc
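A usage sketch, assuming the function above and a hypothetical PMID:

doc = get_pubmed("12345678")  # hypothetical PMID
if doc.get("pmid"):           # the failure path returns {"doc": {}, "message": ...}
    print(doc["title"], doc["journal_title"], doc["pub_date"])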
Example #5
File: ast.py  Project: sailfish009/bel-1
def convert_nsarg(
    nsarg: str,
    api_url: str = None,
    namespace_targets: Mapping[str, List[str]] = None,
    canonicalize: bool = False,
    decanonicalize: bool = False,
) -> str:
    """[De]Canonicalize NSArg

    Args:
        nsarg (str): bel statement string or partial string (e.g. subject or object)
        api_url (str): BEL.bio api url to use, e.g. https://api.bel.bio/v1
        namespace_targets (Mapping[str, List[str]]): formatted as in configuration file example
        canonicalize (bool): use canonicalize endpoint/namespace targets
        decanonicalize (bool): use decanonicalize endpoint/namespace targets

    Returns:
        str: converted NSArg
    """

    if not api_url:
        api_url = config["bel_api"]["servers"]["api_url"]
        if not api_url:
            log.error("Missing api url - cannot convert namespace")
            return None

    params = None
    if namespace_targets:
        namespace_targets_str = json.dumps(namespace_targets)
        params = {"namespace_targets": namespace_targets_str}

    if not namespace_targets:
        if canonicalize:
            api_url = api_url + "/terms/{}/canonicalized"
        elif decanonicalize:
            api_url = api_url + "/terms/{}/decanonicalized"
        else:
            log.warning(
                "Missing (de)canonical flag - cannot convert namespaces")
            return nsarg
    else:
        api_url = api_url + "/terms/{}/canonicalized"  # overriding with namespace_targets

    request_url = api_url.format(url_path_param_quoting(nsarg))

    r = http_client.get(request_url, params=params, timeout=10)

    if r and r.status_code == 200:
        nsarg = r.json().get("term_id", nsarg)
    elif not r or r.status_code == 404:
        log.error(f"[de]Canonicalization endpoint missing: {request_url}")

    return nsarg
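A usage sketch; the term and the converted values are hypothetical and depend on the configured BEL.bio API:

canonical = convert_nsarg("HGNC:AKT1", canonicalize=True)  # e.g. an entrez-style canonical form
readable = convert_nsarg(canonical, decanonicalize=True)   # back to a human-readable namespace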
Example #6
File: pubmed.py  Project: sailfish009/bel-1
def enhance_pubmed_annotations(pubmed: Mapping[str, Any]) -> Mapping[str, Any]:
    """Enhance pubmed namespace IDs

    Add additional entity and annotation types to annotations
    Use preferred id for namespaces as needed
    Add strings from Title, Abstract matching Pubtator BioConcept spans

    NOTE - basically duplicated code with bel_api:api.services.pubmed

    Args:
        pubmed

    Returns:
        pubmed object
    """

    text = pubmed["title"] + pubmed["abstract"]

    annotations = {}

    for nsarg in pubmed["annotations"]:
        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{url_path_param_quoting(nsarg)}'
        log.info(f"URL: {url}")
        r = http_client.get(url)
        log.info(f"Result: {r}")
        new_nsarg = ""
        if r and r.status_code == 200:
            term = r.json()
            new_nsarg = bel_ast.convert_nsarg(term["id"], decanonicalize=True)

            pubmed["annotations"][nsarg]["name"] = term["name"]
            pubmed["annotations"][nsarg]["label"] = term["label"]
            pubmed["annotations"][nsarg]["entity_types"] = list(
                set(pubmed["annotations"][nsarg]["entity_types"] +
                    term.get("entity_types", [])))
            pubmed["annotations"][nsarg]["annotation_types"] = list(
                set(pubmed["annotations"][nsarg]["annotation_types"] +
                    term.get("annotation_types", [])))

        if new_nsarg and new_nsarg != nsarg:
            annotations[new_nsarg] = copy.deepcopy(
                pubmed["annotations"][nsarg])
        else:
            annotations[nsarg] = copy.deepcopy(pubmed["annotations"][nsarg])

    for nsarg in annotations:
        for idx, span in enumerate(annotations[nsarg]["spans"]):
            string = text[span["begin"] - 1:span["end"] - 1]
            annotations[nsarg]["spans"][idx]["text"] = string

    pubmed["annotations"] = copy.deepcopy(annotations)

    return pubmed
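A sketch of the pubmed annotation structure this function expects and fills in; the keys, offsets, and text are hypothetical:

pubmed = {
    "title": "AKT1 signaling in cancer.",
    "abstract": "We study AKT1 ...",
    "annotations": {
        "HGNC:AKT1": {  # keyed by NSArg
            "name": "", "label": "",
            "entity_types": [], "annotation_types": [],
            "spans": [{"begin": 1, "end": 5}],
        }
    },
}
pubmed = enhance_pubmed_annotations(pubmed)
# Each span now also carries the matched text slice, e.g. spans[0]["text"]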
Example #7
    def validate_context(self, context: Mapping[str, Any]) -> Tuple[bool, List[Tuple[str, str]]]:
        """ Validate context

        Args:
            context (Mapping[str, Any]): context dictionary of type, id and label

        Returns:
            Tuple[bool, List[Tuple[str, str]]]:
                bool: Is valid?  Yes = True, No = False
                List[Tuple[str, str]]: Validation issues, empty if valid, tuple is ('ERROR|WARNING', msg)
                    e.g. [('WARNING', "Context ID not found")]
        """

        url = f'{self.endpoint}/terms/{context["id"]}'

        res = http_client.get(url)
        if res.status_code == 200:
            return (True, [])
        else:
            return (False, [("WARNING", f'Context {context["id"]} not found at {url}')])
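A usage sketch, assuming an instance (here called validator, hypothetical) with a configured endpoint; the context values are also hypothetical:

context = {"type": "Species", "id": "TAX:9606", "label": "human"}  # hypothetical
valid, messages = validator.validate_context(context)
if not valid:
    for level, msg in messages:
        print(level, msg)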
Example #8
File: pubmed.py  Project: sailfish009/bel-1
def get_pubtator(pmid):
    """Get Pubtator Bioconcepts from Pubmed Abstract

    Re-configure the denotations into an annotation dictionary format
    and collapse duplicate terms so that their spans are in a list.
    """
    r = http_client.get(PUBTATOR_TMPL.replace("PMID", pmid), timeout=10)
    if r and r.status_code == 200:
        pubtator = r.json()[0]
    else:
        log.error(
            f"Cannot access Pubtator, status: {r.status_code} url: {PUBTATOR_TMPL.replace('PMID', pmid)}"
        )
        return None

    known_types = ["CHEBI", "Chemical", "Disease", "Gene", "Species"]

    for idx, anno in enumerate(pubtator["denotations"]):
        s_match = re.match(r"(\w+):(\w+)", anno["obj"])
        c_match = re.match(r"(\w+):(\w+):(\w+)", anno["obj"])
        if c_match:
            (ctype, namespace, cid) = (c_match.group(1), c_match.group(2),
                                       c_match.group(3))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")
            if namespace not in known_types:
                log.info(f"{namespace} not in known_types for Pubtator")

            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(namespace, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(
                    ctype, None)
        elif s_match:
            (ctype, cid) = (s_match.group(1), s_match.group(2))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")

            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(ctype, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(
                    ctype, None)

    annotations = {}
    for anno in pubtator["denotations"]:
        log.info(anno)
        if anno["obj"] not in annotations:
            annotations[anno["obj"]] = {"spans": [anno["span"]]}
            annotations[anno["obj"]]["entity_types"] = [
                anno.get("entity_type", [])
            ]
            annotations[anno["obj"]]["annotation_types"] = [
                anno.get("annotation_type", [])
            ]

        else:
            annotations[anno["obj"]]["spans"].append(anno["span"])

    del pubtator["denotations"]
    pubtator["annotations"] = copy.deepcopy(annotations)

    return pubtator
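A sketch of the reshaping described in the docstring; the PMID, IDs, and spans are hypothetical:

pubtator = get_pubtator("12345678")  # hypothetical PMID
# Pubtator denotations such as
#   [{"obj": "Gene:207", "span": {"begin": 10, "end": 14}},
#    {"obj": "Gene:207", "span": {"begin": 52, "end": 56}}]
# collapse into one annotation keyed by the converted namespace ID:
#   pubtator["annotations"]["<converted id>:207"]["spans"]
#       -> [{"begin": 10, "end": 14}, {"begin": 52, "end": 56}]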
Example #9
def validate_arg_values(ast, bo):
    """Recursively validate arg (NSArg and StrArg) values

    Check that NSArgs are found in BELbio API and match appropriate entity_type.
    Check that StrArgs match their value - either default namespace or regex string

    Generate a WARNING if not.

    Args:
        ast: AST node (BELAst, Function, NSArg or StrArg) to validate
        bo: bel object

    Returns:
        bel object
    """

    if not bo.api_url:
        log.info("No API endpoint defined")
        return bo

    log.debug(f"AST: {ast}")

    # Test NSArg terms
    if isinstance(ast, NSArg):
        term_id = "{}:{}".format(ast.namespace, ast.value)
        value_types = ast.value_types
        log.debug(f"Value types: {value_types}  AST value: {ast.value}")
        # Default namespaces are defined in the bel_specification file
        if ast.namespace == "DEFAULT":  # may use the DEFAULT namespace or not
            for value_type in value_types:
                default_namespace = [
                    ns["name"] for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [ns["abbreviation"] for ns in bo.spec["namespaces"][value_type]["info"]]

                if ast.value in default_namespace:
                    log.debug("Default namespace valid term: {}".format(term_id))
                    break
            else:  # if for loop doesn't hit the break, run this else
                log.debug("Default namespace invalid term: {}".format(term_id))
                bo.validation_messages.append(("WARNING", f"Default Term: {term_id} not found"))

        # Process normal, non-default-namespace terms
        else:
            request_url = bo.api_url + "/terms/{}".format(url_path_param_quoting(term_id))
            log.info(f"Validate Arg Values url {request_url}")
            r = http_client.get(request_url)
            if r and r.status_code == 200:
                result = r.json()
                # function signature term value_types doesn't match up with API term entity_types

                log.debug(
                    f'AST.value_types  {ast.value_types}  Entity types {result.get("entity_types", [])}'
                )

                # Check that entity types match
                if len(set(ast.value_types).intersection(result.get("entity_types", []))) == 0:
                    log.debug(
                        "Invalid Term - Assertion term {} allowable entity types: {} do not match API term entity types: {}".format(
                            term_id, ast.value_types, result.get("entity_types", [])
                        )
                    )
                    bo.validation_messages.append(
                        (
                            "WARNING",
                            "Invalid Term - Assertion term {} allowable entity types: {} do not match API term entity types: {}".format(
                                term_id, ast.value_types, result.get("entity_types", [])
                            ),
                        )
                    )

                if term_id in result.get("obsolete_ids", []):
                    bo.validation_messages.append(
                        ("WARNING", f'Obsolete term: {term_id}  Current term: {result["id"]}')
                    )

            elif r.status_code == 404:
                bo.validation_messages.append(
                    ("WARNING", f"Term: {term_id} not found in namespace")
                )
            else:
                log.error(f"Status {r.status_code} - Bad URL: {request_url}")

    # Process StrArgs
    if isinstance(ast, StrArg):
        log.debug(f"  Check String Arg: {ast.value}  {ast.value_types}")
        for value_type in ast.value_types:
            # Is this a regex to match against
            if re.match("/", value_type):
                value_type = re.sub("^/", "", value_type)
                value_type = re.sub("/$", "", value_type)
                match = re.match(value_type, ast.value)
                if match:
                    break
            if value_type in bo.spec["namespaces"]:
                default_namespace = [
                    ns["name"] for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [ns["abbreviation"] for ns in bo.spec["namespaces"][value_type]["info"]]
                if ast.value in default_namespace:
                    break
        else:  # If the for loop doesn't hit the break, no match was found, therefore the StrArg value is bad
            bo.validation_messages.append(
                (
                    "WARNING",
                    f"String value {ast.value} does not match default namespace value or regex pattern: {ast.value_types}",
                )
            )

    # Recursively process every NSArg by processing BELAst and Functions
    if hasattr(ast, "args"):
        for arg in ast.args:
            validate_arg_values(arg, bo)

    return bo
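A small sketch of the StrArg value_type convention checked above: a value_type wrapped in slashes is treated as a regex (the pattern here is hypothetical):

import re

value_type = r"/^\d+$/"  # hypothetical regex-style value_type
pattern = re.sub("/$", "", re.sub("^/", "", value_type))
assert re.match(pattern, "42")  # a purely numeric StrArg value matches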
Example #10
def process_nanopub(nanopub_url,
                    orthologize_targets: list = [],
                    overwrite: bool = False,
                    token: str = None):

    log.debug(
        "Process nanopub parameters",
        nanopub_url=nanopub_url,
        orthologize_targets=orthologize_targets,
        overwrite=overwrite,
    )
    log.info("Processing nanopub", nanopub_url=nanopub_url)

    url_comps = urllib.parse.urlparse(nanopub_url)
    nanopub_id = os.path.basename(url_comps.path)
    # domain = url_comps.netloc

    start_time = datetime.datetime.now()

    # collect nanopub
    headers = {}
    if token:
        headers = {"Authorization": f"Bearer {token}"}

    r = http_client.get(nanopub_url, headers=headers)

    nanopub = r.json()

    end_time1 = datetime.datetime.now()
    delta_ms = f"{(end_time1 - start_time).total_seconds() * 1000:.1f}"
    log.debug("Timing - Get nanopub", delta_ms=delta_ms, nanopub=nanopub)

    if not nanopub:
        log.error(f"Could not GET nanopub: {nanopub_url}")

        return {
            "msg": f"Could not GET nanopub: {nanopub_url}",
            "edges_cnt": 0,
            "assertions_cnt": 0,
            "assertions": [],
            "success": False,
            "errors": [],
        }

    assertions = []
    for assertion in nanopub["nanopub"].get("assertions", []):
        assertions.append(
            f"{assertion['subject']} {assertion['relation']} {assertion['object']}"
        )

    nanopub["source_url"] = nanopub_url

    # Is nanopub in edge newer than from queue? If so, skip
    if not overwrite:
        # collect one edge for nanopub from edgestore
        edge = get_edges_for_nanopub(nanopub_id)
        if edge:
            # check if edge nanopub is newer
            # log.info("Nanopub to Edge comparison", nanopub=nanopub, edge=edge)
            if edge["metadata"].get("gd_updateTS", None):
                if nanopub["nanopub"]["metadata"]["gd_updateTS"] <= edge[
                        "metadata"]["gd_updateTS"]:
                    log.info(
                        "Nanopub older than edge nanopub",
                        nanopub_dt=nanopub["nanopub"]["metadata"]
                        ["gd_updateTS"],
                        edge_dt=edge["metadata"]["gd_updateTS"],
                    )
                    return {
                        "msg": "Nanopub older than edge nanopub",
                        "success": True,
                        "e": ""
                    }

    end_time2 = datetime.datetime.now()
    delta_ms = f"{(end_time2 - end_time1).total_seconds() * 1000:.1f}"
    log.debug("Timing - Get edge to check nanopub", delta_ms=delta_ms)

    results = bel.edge.edges.nanopub_to_edges(
        nanopub, orthologize_targets=orthologize_targets)

    end_time3 = datetime.datetime.now()
    delta_ms = f"{(end_time3 - end_time2).total_seconds() * 1000:.1f}"

    if results["success"]:

        db_results = load_edges_into_db(nanopub_id,
                                        nanopub["source_url"],
                                        edges=results["edges"])

        # log.info("Convert nanopub to edges", db_results=db_results, results=results)

        end_time4 = datetime.datetime.now()
        delta_ms = f"{(end_time4 - end_time3).total_seconds() * 1000:.1f}"
        log.debug("Timing - Load edges into edgestore", delta_ms=delta_ms)

        delta_ms = f"{(end_time4 - start_time).total_seconds() * 1000:.1f}"
        log.debug("Timing - Process nanopub into edges", delta_ms=delta_ms)

        return {
            "msg": f"Loaded {len(results['edges'])} edges into edgestore",
            "edges_cnt": len(results["edges"]),
            "assertions_cnt": len(nanopub["nanopub"]["assertions"]),
            "assertions": assertions,
            "success": True,
            "errors": results["errors"],
        }

    else:
        log.error(
            f'Could not process nanopub {nanopub_id} into edges - error: {results["errors"]}'
        )
        return {
            "msg":
            f'Could not process nanopub into edges - error: {results["errors"]}',
            "edges_cnt": 0,
            "assertions_cnt": len(nanopub["nanopub"]["assertions"]),
            "assertions": assertions,
            "success": False,
            "errors": results["errors"],
        }
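A usage sketch; the nanopub URL and token are hypothetical:

result = process_nanopub(
    "https://nanopubstore.example.com/nanopubs/np-0001",  # hypothetical URL
    orthologize_targets=["TAX:10090"],
    token="example-bearer-token",  # hypothetical token
)
if result["success"]:
    print(result["msg"], result.get("edges_cnt", 0))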
Example #11
def nsarg_completions(
    completion_text: str,
    entity_types: list,
    bel_spec: BELSpec,
    namespace: str,
    species_id: str,
    bel_fmt: str,
    size: int,
):
    """Namespace completions

    Args:
        completion_text
        entity_types: used to filter namespace search results
        bel_spec: used to search default namespaces
        namespace: used to filter namespace search results
        species_id: used to filter namespace search results
        bel_fmt: used to select full name or abbrev for default namespaces
        size: how many completions to return

    Returns:
        list of replacement text objects
    """

    minimal_nsarg_completion_len = 1

    species = [species_id]
    namespaces = [namespace]
    replace_list = []

    if len(completion_text) >= minimal_nsarg_completion_len:
        # Use BEL.bio API module if running bel module in BEL.bio API, otherwise call BEL.bio API endpoint
        #   is there a better way to  handle this?

        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/completions/{url_path_param_quoting(completion_text)}'

        params = {
            "size": size,
            "entity_types": entity_types,
            "namespaces": namespaces,
            "species": species,
        }

        if "Species" in entity_types:
            params.pop("species", "")

        log.info(
            "NSArg completion",
            api_url=config["bel_api"]["servers"]["api_url"],
            url=url,
            params=params,
        )

        r = http_client.get(url, params=params)

        if r.status_code == 200:
            ns_completions = r.json()
        else:
            log.error(f"Status code of {r.status_code} for {url}")
            ns_completions = {}

        for complete in ns_completions.get("completions", []):
            replace_list.append({
                "replacement": complete["id"],
                "label": f"{complete['id']} ({complete['label']})",
                "highlight": complete["highlight"][-1],
                "type": "NSArg",
            })

    # Check default namespaces
    for entity_type in entity_types:
        default_namespace = bel_spec["namespaces"].get(entity_type, [])
        if default_namespace:
            for obj in default_namespace["info"]:
                replacement = None
                if bel_fmt == "long" and re.match(completion_text, obj["name"],
                                                  re.IGNORECASE):
                    replacement = obj["name"]
                elif bel_fmt in ["short", "medium"] and re.match(
                        completion_text, obj["abbreviation"], re.IGNORECASE):
                    replacement = obj["abbreviation"]

                if replacement:
                    highlight = replacement.replace(
                        completion_text, f"<em>{completion_text}</em>")
                    replace_list.insert(
                        0,
                        {
                            "replacement": replacement,
                            "label": replacement,
                            "highlight": highlight,
                            "type": "NSArg",
                        },
                    )

    return replace_list[:size]
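A usage sketch and the shape of the returned replacement objects; the arguments and values are hypothetical:

completions = nsarg_completions(
    completion_text="AKT",
    entity_types=["Protein"],
    bel_spec=bel_spec,       # hypothetical loaded BEL specification
    namespace="HGNC",
    species_id="TAX:9606",
    bel_fmt="medium",
    size=10,
)
# Each entry looks roughly like:
#   {"replacement": "HGNC:AKT1", "label": "HGNC:AKT1 (AKT1)",
#    "highlight": "...<em>AKT</em>...", "type": "NSArg"}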