Example #1
0
def test_get_url():
    """A repeated fetch of the same URL should be served from the cache."""

    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=19894120'

    # with pytest.raises(requests.exceptions.Timeout):
    #     r = utils.get_url(url, timeout=0.0001)

    # First call populates the cache; the second must report a cache hit.
    utils.get_url(url)
    repeat_response = utils.get_url(url)
    assert repeat_response.from_cache
Example #2
0
File: edges.py Project: chriscurran89/bel
def orthologize_context(orthologize_target: str,
                        annotations: Mapping[str, Any]) -> Mapping[str, Any]:
    """Orthologize context

    Replace the Species annotation with the new orthologize target and record
    the original species as an extra annotation of type OrigSpecies.

    Args:
        orthologize_target: species term key to orthologize to; interpolated
            into the terms API URL and used as the new Species annotation id
        annotations: annotation dicts, each with "type"/"id"/"label" keys.
            NOTE(review): annotated as Mapping but used as a mutable sequence
            (enumerate + index assignment + append) — confirm declared type.

    Returns:
        the same annotations object, mutated in place
    """

    # Look up the human-readable label for the target species term
    url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{orthologize_target}'
    r = utils.get_url(url)
    species_label = r.json().get("label", "unlabeled")

    # Remembers the original Species annotation's id/label, if one was found
    orthologized_from = {}
    for idx, annotation in enumerate(annotations):
        if annotation["type"] == "Species":
            orthologized_from = {
                "id": annotation["id"],
                "label": annotation["label"]
            }
            # Overwrite the Species annotation with the orthologized target
            annotations[idx] = {
                "type": "Species",
                "id": orthologize_target,
                "label": species_label
            }

    # Preserve the pre-orthologization species as an OrigSpecies annotation
    if "id" in orthologized_from:
        annotations.append({
            "type": "OrigSpecies",
            "id": f'Orig-{orthologized_from["id"]}',
            "label": f'Orig-{orthologized_from["label"]}',
        })

    return annotations
Example #3
0
File: edges.py Project: BenDavidAaron/bel
def orthologize_context(orthologize_target: str,
                        annotations: Mapping[str, Any]) -> Mapping[str, Any]:
    """Orthologize context

    Swap the Species annotation for the orthologize target and append an
    OrigSpecies annotation recording the species it was orthologized from.
    Mutates and returns the given annotations.
    """

    # Resolve the display label for the target species from the terms API
    term_url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{orthologize_target}'
    term_response = utils.get_url(term_url)
    target_label = term_response.json().get('label', 'unlabeled')

    original_species = {}
    for pos, anno in enumerate(annotations):
        if anno['type'] != 'Species':
            continue
        original_species = {'id': anno['id'], 'label': anno['label']}
        annotations[pos] = {
            'type': 'Species',
            'id': orthologize_target,
            'label': target_label,
        }

    if 'id' in original_species:
        annotations.append({
            'type': 'OrigSpecies',
            'id': f'Orig-{original_species["id"]}',
            'label': f'Orig-{original_species["label"]}'
        })

    return annotations
Example #4
0
def enhance_pubmed_annotations(pubmed: Mapping[str, Any]) -> Mapping[str, Any]:
    """Enhance pubmed namespace IDs

    Add additional entity and annotation types to annotations
    Use preferred id for namespaces as needed
    Add strings from Title, Abstract matching Pubtator BioConcept spans

    NOTE - basically duplicated code with bel_api:api.services.pubmed

    Args:
        pubmed: pubmed doc with "title", "abstract" and an "annotations"
            mapping keyed by nsarg

    Returns:
        pubmed object (annotations replaced with the enhanced copy)
    """

    text = pubmed["title"] + pubmed["abstract"]

    annotations = {}

    for nsarg in pubmed["annotations"]:
        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{url_path_param_quoting(nsarg)}'
        log.info(f"URL: {url}")
        r = get_url(url)
        log.info(f"Result: {r}")
        new_nsarg = ""
        if r and r.status_code == 200:
            term = r.json()
            # Preferred (decanonicalized) id for the term
            new_nsarg = bel_utils.convert_nsarg(term["id"],
                                                decanonicalize=True)

            pubmed["annotations"][nsarg]["name"] = term["name"]
            pubmed["annotations"][nsarg]["label"] = term["label"]
            pubmed["annotations"][nsarg]["entity_types"] = list(
                set(pubmed["annotations"][nsarg]["entity_types"] +
                    term.get("entity_types", [])))
            pubmed["annotations"][nsarg]["annotation_types"] = list(
                set(pubmed["annotations"][nsarg]["annotation_types"] +
                    term.get("annotation_types", [])))

        # Fix: only re-key when the lookup produced a non-empty, different id.
        # Previously a failed lookup left new_nsarg == "" and the annotation
        # was stored under the empty-string key, losing the original nsarg.
        if new_nsarg and new_nsarg != nsarg:
            annotations[new_nsarg] = copy.deepcopy(
                pubmed["annotations"][nsarg])
        else:
            annotations[nsarg] = copy.deepcopy(pubmed["annotations"][nsarg])

    # Attach the matched source text to each span
    # NOTE(review): spans appear to be 1-based offsets into title+abstract
    # (hence the -1 adjustments) — confirm against Pubtator span semantics
    for nsarg in annotations:
        for idx, span in enumerate(annotations[nsarg]["spans"]):
            string = text[span["begin"] - 1:span["end"] - 1]
            annotations[nsarg]["spans"][idx]["text"] = string

    pubmed["annotations"] = copy.deepcopy(annotations)

    return pubmed
Example #5
0
def get_pubmed(pmid: str) -> Mapping[str, Any]:
    """Get pubmed xml for pmid and convert to JSON

    Remove MESH terms if they are duplicated in the compound term set

    ArticleDate vs PubDate gets complicated: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html see <ArticleDate> and <PubDate>
    Only getting pub_year at this point from the <PubDate> element.

    Args:
        pmid: pubmed id number as a string

    Returns:
        pubmed json
    """

    # Skeleton of every field a pubmed doc carries
    # NOTE(review): "joural_iso_title" is misspelled but matches the key used
    # elsewhere in this module — renaming it would break consumers.
    doc = {
        "abstract": "",
        "pmid": pmid,
        "title": "",
        "authors": [],
        "pub_date": "",
        "joural_iso_title": "",
        "journal_title": "",
        "doi": "",
        "compounds": [],
        "mesh": [],
    }

    pubmed_url = PUBMED_TMPL.replace("PMID", str(pmid))
    r = None
    try:
        log.info(f"Getting Pubmed URL {pubmed_url}")
        r = get_url(pubmed_url)
        root = etree.fromstring(r.content)

    except Exception as e:
        # Fix: r could be unbound (get_url raised) or None (no response) —
        # reading r.status_code unconditionally crashed the error handler.
        status = r.status_code if r is not None else "no response"
        log.error(
            f"Bad Pubmed request, status: {status} error: {e}",
            url=f'{PUBMED_TMPL.replace("PMID", pmid)}',
        )
        return {"doc": {}, "message": f"Cannot get PMID: {pubmed_url}"}

    doc["pmid"] = root.xpath("//PMID/text()")[0]

    if doc["pmid"] != pmid:
        log.error("Requested PMID doesn't match record PMID", url=pubmed_url)

    # Dispatch on record type: journal article vs book
    if root.find("PubmedArticle") is not None:
        doc = parse_journal_article_record(doc, root)
    elif root.find("PubmedBookArticle") is not None:
        doc = parse_book_record(doc, root)

    return doc
Example #6
0
def convert_nsarg(
    nsarg: str,
    api_url: str = None,
    namespace_targets: Mapping[str, List[str]] = None,
    canonicalize: bool = False,
    decanonicalize: bool = False,
) -> str:
    """[De]Canonicalize NSArg

    Args:
        nsarg (str): bel statement string or partial string (e.g. subject or object)
        api_url (str): BEL.bio api url to use, e.g. https://api.bel.bio/v1
        namespace_targets (Mapping[str, List[str]]): formatted as in configuration file example
        canonicalize (bool): use canonicalize endpoint/namespace targets
        decanonicalize (bool): use decanonicalize endpoint/namespace targets

    Results:
        str: converted NSArg (None if no api url could be determined)
    """

    # Fall back to the configured API url when none was supplied
    if not api_url:
        api_url = config["bel_api"]["servers"]["api_url"]
    if not api_url:
        log.error("Missing api url - cannot convert namespace")
        return None

    params = None
    if namespace_targets:
        # Explicit namespace targets override the flags and always hit the
        # canonicalized endpoint
        params = {"namespace_targets": json.dumps(namespace_targets)}
        endpoint = "/terms/{}/canonicalized"
    elif canonicalize:
        endpoint = "/terms/{}/canonicalized"
    elif decanonicalize:
        endpoint = "/terms/{}/decanonicalized"
    else:
        log.warning(
            "Missing (de)canonical flag - cannot convert namespaces")
        return nsarg

    request_url = (api_url + endpoint).format(url_path_param_quoting(nsarg))

    r = get_url(request_url, params=params, timeout=10)

    if r and r.status_code == 200:
        nsarg = r.json().get("term_id", nsarg)
    elif not r or r.status_code == 404:
        log.error(f"[de]Canonicalization endpoint missing: {request_url}")

    return nsarg
Example #7
0
def validate_arg_values(ast, bo):
    """Recursively validate arg (NSArg and StrArg) values

    Check that NSArgs are found in BELbio API and match appropriate entity_type.
    Check that StrArgs match their value - either default namespace or regex string

    Generate a WARNING if not.

    Args:
        ast: AST node (BELAst, Function, NSArg or StrArg) to validate;
            children are walked via its ``args`` attribute
        bo: bel object

    Returns:
        bel object
    """

    # Without an API endpoint NSArg terms cannot be validated at all
    if not bo.api_url:
        log.info("No API endpoint defined")
        return bo

    log.debug(f"AST: {ast}")

    # Test NSArg terms
    if isinstance(ast, NSArg):
        term_id = "{}:{}".format(ast.namespace, ast.value)
        value_types = ast.value_types
        log.debug(f"Value types: {value_types}  AST value: {ast.value}")
        # Default namespaces are defined in the bel_specification file
        if ast.namespace == "DEFAULT":  # may use the DEFAULT namespace or not
            for value_type in value_types:
                # A default term may match either the full name or the
                # abbreviation listed in the spec for this value_type
                default_namespace = [
                    ns["name"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [
                    ns["abbreviation"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ]

                if ast.value in default_namespace:
                    log.debug(
                        "Default namespace valid term: {}".format(term_id))
                    break
            else:  # if for loop doesn't hit the break, run this else
                log.debug("Default namespace invalid term: {}".format(term_id))
                bo.validation_messages.append(
                    ("WARNING", f"Default Term: {term_id} not found"))

        # Process normal, non-default-namespace terms
        else:
            request_url = bo.api_url + "/terms/{}".format(
                url_path_param_quoting(term_id))
            log.info(f"Validate Arg Values url {request_url}")
            r = get_url(request_url)
            if r and r.status_code == 200:
                result = r.json()
                # function signature term value_types doesn't match up with API term entity_types

                log.debug(
                    f'AST.value_types  {ast.value_types}  Entity types {result.get("entity_types", [])}'
                )

                # Check that entity types match
                if len(
                        set(ast.value_types).intersection(
                            result.get("entity_types", []))) == 0:
                    log.debug(
                        "Invalid Term - statement term {} allowable entity types: {} do not match API term entity types: {}"
                        .format(term_id, ast.value_types,
                                result.get("entity_types", [])))
                    bo.validation_messages.append((
                        "WARNING",
                        "Invalid Term - statement term {} allowable entity types: {} do not match API term entity types: {}"
                        .format(term_id, ast.value_types,
                                result.get("entity_types", [])),
                    ))

                # Warn when the term id is listed as obsolete by the API
                if term_id in result.get("obsolete_ids", []):
                    bo.validation_messages.append((
                        "WARNING",
                        f'Obsolete term: {term_id}  Current term: {result["id"]}'
                    ))

            elif r.status_code == 404:
                bo.validation_messages.append(
                    ("WARNING", f"Term: {term_id} not found in namespace"))
            else:
                log.error(f"Status {r.status_code} - Bad URL: {request_url}")

    # Process StrArgs
    if isinstance(ast, StrArg):
        log.debug(f"  Check String Arg: {ast.value}  {ast.value_types}")
        for value_type in ast.value_types:
            # Is this a regex to match against
            if re.match("/", value_type):
                # Strip the /.../ delimiters before using it as a pattern
                value_type = re.sub("^/", "", value_type)
                value_type = re.sub("/$", "", value_type)
                match = re.match(value_type, ast.value)
                if match:
                    break
            if value_type in bo.spec["namespaces"]:
                # Same name/abbreviation lookup as for DEFAULT NSArg terms
                default_namespace = [
                    ns["name"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ] + [
                    ns["abbreviation"]
                    for ns in bo.spec["namespaces"][value_type]["info"]
                ]
                if ast.value in default_namespace:
                    break
        else:  # If for loop doesn't hit the break, no matches found, therefore for StrArg value is bad
            bo.validation_messages.append((
                "WARNING",
                f"String value {ast.value} does not match default namespace value or regex pattern: {ast.value_types}",
            ))

    # Recursively process every NSArg by processing BELAst and Functions
    if hasattr(ast, "args"):
        for arg in ast.args:
            validate_arg_values(arg, bo)

    return bo
Example #8
0
def nsarg_completions(
    completion_text: str,
    entity_types: list,
    bel_spec: BELSpec,
    namespace: str,
    species_id: str,
    bel_fmt: str,
    size: int,
):
    """Namespace completions

    Args:
        completion_text
        entity_types: used to filter namespace search results
        bel_spec: used to search default namespaces
        namespace: used to filter namespace search results
        species_id: used to filter namespace search results
        bel_fmt: used to select full name or abbrev for default namespaces
        size: how many completions to return

    Results:
        list of replacement text objects
    """

    minimal_nsarg_completion_len = 1

    species = [species_id]
    namespaces = [namespace]
    replace_list = []

    if len(completion_text) >= minimal_nsarg_completion_len:
        # Use BEL.bio API module if running bel module in BEL.bio API, otherwise call BEL.bio API endpoint
        #   is there a better way to  handle this?

        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/completions/{url_path_param_quoting(completion_text)}'
        params = {
            "size": size,
            "entity_types": entity_types,
            "namespaces": namespaces,
            "species": species,
        }
        r = get_url(url, params=params)

        # Fix: guard against r being None before reading status_code —
        # other get_url call sites all check `if r and r.status_code == 200`
        if r and r.status_code == 200:
            ns_completions = r.json()
        else:
            status = r.status_code if r else "no response"
            log.error(f"Status code of {status} for {url}")
            ns_completions = {}

        for complete in ns_completions.get("completions", []):
            replace_list.append({
                "replacement": complete["id"],
                "label": f"{complete['id']} ({complete['label']})",
                "highlight": complete["highlight"][-1],
                "type": "NSArg",
            })

    # Check default namespaces - matched defaults are inserted ahead of API results
    for entity_type in entity_types:
        default_namespace = bel_spec["namespaces"].get(entity_type, [])
        if default_namespace:
            for obj in default_namespace["info"]:
                replacement = None
                # bel_fmt selects whether to complete against full names or abbreviations
                if bel_fmt == "long" and re.match(completion_text, obj["name"],
                                                  re.IGNORECASE):
                    replacement = obj["name"]
                elif bel_fmt in ["short", "medium"] and re.match(
                        completion_text, obj["abbreviation"], re.IGNORECASE):
                    replacement = obj["abbreviation"]

                if replacement:
                    highlight = replacement.replace(
                        completion_text, f"<em>{completion_text}</em>")
                    replace_list.insert(
                        0,
                        {
                            "replacement": replacement,
                            "label": replacement,
                            "highlight": highlight,
                            "type": "NSArg",
                        },
                    )

    return replace_list[:size]
Example #9
0
def get_pubtator(pmid):
    """Get Pubtator Bioconcepts from Pubmed Abstract

    Re-configure the denotations into an annotation dictionary format
    and collapse duplicate terms so that their spans are in a list.

    Args:
        pmid: pubmed id as a string (substituted into PUBTATOR_TMPL)

    Returns:
        pubtator dict with an "annotations" mapping, or None on request failure
    """
    r = get_url(PUBTATOR_TMPL.replace("PMID", pmid), timeout=10)
    if r and r.status_code == 200:
        pubtator = r.json()[0]
    else:
        # Fix: r can be None (no response), in which case reading
        # r.status_code crashed this error branch with an AttributeError
        status = r.status_code if r else "no response"
        log.error(
            f"Cannot access Pubtator, status: {status} url: {PUBTATOR_TMPL.replace('PMID', pmid)}"
        )
        return None

    known_types = ["CHEBI", "Chemical", "Disease", "Gene", "Species"]

    for idx, anno in enumerate(pubtator["denotations"]):
        # obj is either "type:id" (s_match) or "type:namespace:id" (c_match);
        # the more specific 3-part form is checked first
        s_match = re.match(r"(\w+):(\w+)", anno["obj"])
        c_match = re.match(r"(\w+):(\w+):(\w+)", anno["obj"])
        if c_match:
            (ctype, namespace, cid) = (c_match.group(1), c_match.group(2),
                                       c_match.group(3))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")
            if namespace not in known_types:
                log.info(f"{namespace} not in known_types for Pubtator")

            # Re-map Pubtator's namespace/type vocabulary to local vocabulary
            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(namespace, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(
                    ctype, None)
        elif s_match:
            (ctype, cid) = (s_match.group(1), s_match.group(2))

            if ctype not in known_types:
                log.info(f"{ctype} not in known_types for Pubtator")

            pubtator["denotations"][idx][
                "obj"] = f'{pubtator_ns_convert.get(ctype, "UNKNOWN")}:{cid}'
            pubtator["denotations"][idx][
                "entity_type"] = pubtator_entity_convert.get(ctype, None)
            pubtator["denotations"][idx][
                "annotation_type"] = pubtator_annotation_convert.get(
                    ctype, None)

    # Collapse duplicate terms: one entry per obj, spans accumulated in a list
    annotations = {}
    for anno in pubtator["denotations"]:
        log.info(anno)
        if anno["obj"] not in annotations:
            annotations[anno["obj"]] = {"spans": [anno["span"]]}
            annotations[anno["obj"]]["entity_types"] = [
                anno.get("entity_type", [])
            ]
            annotations[anno["obj"]]["annotation_types"] = [
                anno.get("annotation_type", [])
            ]

        else:
            annotations[anno["obj"]]["spans"].append(anno["span"])

    del pubtator["denotations"]
    pubtator["annotations"] = copy.deepcopy(annotations)

    return pubtator
Example #10
0
def get_pubmed(pmid: str) -> Mapping[str, Any]:
    """Get pubmed xml for pmid and convert to JSON

    Remove MESH terms if they are duplicated in the compound term set

    ArticleDate vs PubDate gets complicated: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html see <ArticleDate> and <PubDate>
    Only getting pub_year at this point from the <PubDate> element.

    Args:
        pmid: pubmed id number as a string

    Returns:
        pubmed json
    """
    pubmed_url = PUBMED_TMPL.replace("PMID", str(pmid))
    r = get_url(pubmed_url)
    log.info(f"Getting Pubmed URL {pubmed_url}")

    try:
        root = etree.fromstring(r.content)
        doc = {"abstract": ""}
        doc["pmid"] = root.xpath("//PMID/text()")[0]
        doc["title"] = next(iter(root.xpath("//ArticleTitle/text()")), "")

        # TODO https://stackoverflow.com/questions/4770191/lxml-etree-element-text-doesnt-return-the-entire-text-from-an-element
        # Concatenate all AbstractText sections, prefixing labeled sections
        for abstracttext in root.xpath("//Abstract/AbstractText"):
            abstext = node_text(abstracttext)

            label = abstracttext.get("Label", None)
            if label:
                doc["abstract"] += f"{label}: {abstext}\n"
            else:
                doc["abstract"] += f"{abstext}\n"

        doc["abstract"] = doc["abstract"].rstrip()

        doc["authors"] = []
        for author in root.xpath("//Author"):
            last_name = next(iter(author.xpath("LastName/text()")), "")
            first_name = next(iter(author.xpath("ForeName/text()")), "")
            initials = next(iter(author.xpath("Initials/text()")), "")
            # Fall back to initials when no forename is present
            if not first_name and initials:
                first_name = initials
            doc["authors"].append(f"{last_name}, {first_name}")

        # Month/day default when PubDate only carries a year
        pub_year = next(
            iter(root.xpath("//Journal/JournalIssue/PubDate/Year/text()")),
            None)
        pub_mon = next(
            iter(root.xpath("//Journal/JournalIssue/PubDate/Month/text()")),
            "Jan")
        pub_day = next(
            iter(root.xpath("//Journal/JournalIssue/PubDate/Day/text()")),
            "01")

        pub_date = process_pub_date(pub_year, pub_mon, pub_day)

        doc["pub_date"] = pub_date
        doc["journal_title"] = next(iter(root.xpath("//Journal/Title/text()")),
                                    "")
        # NOTE(review): "joural_iso_title" key is misspelled but matches the
        # key used elsewhere in this module — renaming would break consumers.
        doc["joural_iso_title"] = next(
            iter(root.xpath("//Journal/ISOAbbreviation/text()")), "")
        doc["doi"] = next(
            iter(root.xpath('//ArticleId[@IdType="doi"]/text()')), None)

        doc["compounds"] = []
        for chem in root.xpath("//ChemicalList/Chemical/NameOfSubstance"):
            chem_id = chem.get("UI")
            doc["compounds"].append({
                "id": f"MESH:{chem_id}",
                "name": chem.text
            })

        # Skip MESH headings that duplicate a compound term
        compounds = [cmpd["id"] for cmpd in doc["compounds"]]
        doc["mesh"] = []
        for mesh in root.xpath("//MeshHeading/DescriptorName"):
            mesh_id = f"MESH:{mesh.get('UI')}"
            if mesh_id in compounds:
                continue
            doc["mesh"].append({"id": mesh_id, "name": mesh.text})

        return doc
    except Exception as e:
        # Fix: r may be None when get_url fails; reading r.status_code
        # unconditionally crashed this error handler with an AttributeError
        status = r.status_code if r is not None else "no response"
        log.error(
            f"Bad Pubmed request, status: {status} error: {e}",
            url=f'{PUBMED_TMPL.replace("PMID", pmid)}',
        )
        return {"message": f"Cannot get PMID: {pubmed_url}"}