Example #1
def estimate_percent_nodes_covered_by_backup_method(kg: str):
    print(
        f"Estimating the percent of {kg} nodes mappable by the 'backup' NGD method (uses eUtils)"
    )
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    percentages_mapped = []
    num_batches = 10
    batch_size = 10
    for number in range(num_batches):
        print(f"  Batch {number + 1}")
        # Get random selection of nodes from the KG
        query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
        results = _run_cypher_query(query, kg)
        canonical_curie_info = synonymizer.get_canonical_curies(
            [result['a.id'] for result in results])
        recognized_curies = {
            input_curie
            for input_curie in canonical_curie_info
            if canonical_curie_info.get(input_curie)
        }

        # Use the back-up NGD method to try to grab PMIDs for each
        num_with_pmids = 0
        for curie in recognized_curies:
            # Try to map this to a MESH term using the backup method (the chokepoint)
            node_id = canonical_curie_info[curie].get('preferred_curie')
            node_name = canonical_curie_info[curie].get('preferred_name')
            node_type = canonical_curie_info[curie].get('preferred_type')
            try:
                pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
            except Exception:
                print(f"ERROR using back-up method: {traceback.format_exc()}")
            else:
                if pmids and any(pmids):
                    num_with_pmids += 1
                    print(
                        f"    Found {len(pmids[0])} PMIDs for {node_id}, {node_name}."
                    )
                else:
                    print(f"    Not found. ({node_id}, {node_name})")
        # Guard against a batch where no input curies were recognized
        if recognized_curies:
            percentage_with_pmids = (num_with_pmids / len(recognized_curies)) * 100
            print(
                f"    {percentage_with_pmids}% of nodes were mapped to PMIDs using backup method."
            )
            percentages_mapped.append(percentage_with_pmids)

    print(f"  Percentages for all batches: {percentages_mapped}.")
    average = sum(percentages_mapped) / len(percentages_mapped)
    print(
        f"Final estimate of backup method's coverage of {kg} nodes: {round(average)}%"
    )
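For context, a minimal driver sketch for the function above. The `NormGoogleDistance` import matches the one shown in Example #6; the `node_synonymizer` module path, the KG name "KG2", and the availability of the module's `_run_cypher_query` helper are assumptions for illustration:

import traceback

from NormGoogleDistance import NormGoogleDistance  # same import as in Example #6
from node_synonymizer import NodeSynonymizer  # assumed module path

# Estimate eUtils-backed coverage for a hypothetical KG named "KG2"
estimate_percent_nodes_covered_by_backup_method("KG2")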
Example #2
def pubmed_mesh_ngd(term1, term2):  # noqa: E501
    """Query to get the Normalized Google Distance between two MeSH terms based on co-occurrence in all PubMed article annotations.

    :param term1: First of two terms. Order not important.
    :type term1: str
    :param term2: Second of two terms. Order not important.
    :type term2: str

    :rtype: MeshNgdResponse
    """
    cwd = os.getcwd()
    os.chdir(
        os.path.dirname(os.path.abspath(__file__)) +
        "/../../../../../reasoningtool/kg-construction")
    try:
        ngd = NormGoogleDistance()
        response = ngd.api_ngd(term1, term2)
    finally:
        os.chdir(cwd)  # restore the working directory even if the query fails
    return response
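A hedged usage sketch; the two MeSH terms below are illustrative, and per the docstring their order does not matter:

# Both arguments are MeSH terms; order is not important
response = pubmed_mesh_ngd("Aspirin", "Myocardial Infarction")
print(response)  # a MeshNgdResponse, per the docstring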
Example #3
    def get_all_from_oxo(self, curie_id, map_to=None):
        """
        Take a curie id and get all the mappings that OXO has for the given id.

        :param curie_id: The string for the curie id to submit to OXO (e.g. 'HP:0001947')
        :param map_to: A string (or list of strings) containing the prefix(es) for the resulting ids. If set to None it will return all mappings. (default is None)

        :return: A list of strings containing the found mapped ids, or None if none were found
        """
        if map_to is None:
            map_to = ''
        if not isinstance(curie_id, str):
            curie_id = str(curie_id)
        if curie_id.startswith('REACT:'):
            curie_id = curie_id.replace('REACT', 'Reactome')
        prefix = curie_id.split(':')[0]
        res = NormGoogleDistance.query_oxo(curie_id)
        synonym_ids = None
        if res is not None:
            res = res.json()
            synonym_ids = set()
            n_res = res['page']['totalElements']
            if int(n_res) > 0:
                mappings = res['_embedded']['mappings']
                for mapping in mappings:
                    # map_to may be a single prefix string or a list of prefixes
                    if isinstance(map_to, list):
                        for elm in map_to:
                            if mapping['fromTerm']['curie'].startswith(prefix):
                                if mapping['toTerm']['curie'].startswith(elm):
                                    synonym_ids.add(mapping['toTerm']['curie'])
                            elif mapping['toTerm']['curie'].startswith(prefix):
                                if mapping['fromTerm']['curie'].startswith(elm):
                                    synonym_ids.add(mapping['fromTerm']['curie'])
                    else:
                        if mapping['fromTerm']['curie'].startswith(prefix):
                            if mapping['toTerm']['curie'].startswith(map_to):
                                synonym_ids.add(mapping['toTerm']['curie'])
                        elif mapping['toTerm']['curie'].startswith(prefix):
                            if mapping['fromTerm']['curie'].startswith(map_to):
                                synonym_ids.add(mapping['fromTerm']['curie'])
            if len(synonym_ids) == 0:
                synonym_ids = None
            else:
                synonym_ids = list(synonym_ids)
        return synonym_ids
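Since `get_all_from_oxo` is an instance method and its hosting class is not shown in the snippet, assume `ngd` is an instance of that class; the CURIE below is the docstring's own example, and the 'DOID' prefix is illustrative:

# Map the docstring's example CURIE to DOID identifiers only
synonyms = ngd.get_all_from_oxo('HP:0001947', map_to='DOID')
if synonyms is None:
    print("No OXO mappings found")
else:
    print(synonyms)  # list of mapped CURIE strings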
Example #4
    def add_node_pmids(self):
        """
        Iterate over all the nodes in the knowledge graph and decorate them with PMIDs from PubMed abstracts.
        :return: response
        """
        self.response.debug("Adding node PMIDs")
        self.response.info(
            "Adding PubMed IDs to nodes based on occurrence in PubMed abstracts"
        )
        self.response.warning(
            "Utilizing API calls to NCBI eUtils, so this may take a while...")
        name = "pubmed_ids"
        attribute_type = "data:0971"
        value = ""
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

        # iterate over KG nodes, add the information
        try:
            for node in self.message.knowledge_graph.nodes:
                # Make sure the node_attributes are not None
                if not node.node_attributes:
                    node.node_attributes = []
                # now actually retrieve the PMIDs for this node
                node_curie = node.id
                node_name = node.name
                # get_pmids_for_all is batch-oriented, so take the first (and only) result
                pmids = NGD.get_pmids_for_all([node_curie], [node_name])[0]

                if 'max_num' in self.parameters:
                    pmids = pmids[0:self.parameters['max_num']]
                value = pmids
                # populate the PubMed-ID node attribute and append it to the list
                pmid_node_attribute = NodeAttribute(
                    type=attribute_type, name=name, value=value, url=url)
                node.node_attributes.append(pmid_node_attribute)
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                "Something went wrong adding the PubMed ID attributes")
        else:
            self.response.info("PubMed IDs successfully added to nodes")

        return self.response
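As the inline comment notes, `get_pmids_for_all` is batch-oriented; a small sketch of calling it directly, with an illustrative CURIE/name pair:

ngd = NormGoogleDistance()
# One PMID list comes back per (curie, name) input pair
pmid_lists = ngd.get_pmids_for_all(['CHEMBL.COMPOUND:CHEMBL25'], ['aspirin'])
aspirin_pmids = pmid_lists[0]  # hence the trailing [0] in add_node_pmids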
Example #5
def report_on_curies_missed_by_local_ngd(kg: str):
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    curie_to_pmid_db = SqliteDict("./curie_to_pmids.sqlite")
    batch_size = 50

    # Get random selection of nodes from the KG
    query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
    results = _run_cypher_query(query, kg)
    canonical_curie_info = synonymizer.get_canonical_curies(
        [result['a.id'] for result in results])
    recognized_curies = {
        input_curie
        for input_curie in canonical_curie_info
        if canonical_curie_info.get(input_curie)
    }

    # Figure out which of these local ngd misses
    misses = set()
    for curie in recognized_curies:
        canonical_curie = canonical_curie_info[curie].get('preferred_curie')
        if canonical_curie not in curie_to_pmid_db:
            misses.add(curie)
    percent_missed = round((len(misses) / len(recognized_curies)) * 100)
    print(
        f"Local ngd missed {len(misses)} of {len(recognized_curies)} curies ({percent_missed}%)"
    )

    # Try eUtils for each of the curies local ngd missed
    num_eutils_found = 0
    try:
        with open('misses_found_by_eutils.json', 'r') as file_to_add_to:
            found_dict = json.load(file_to_add_to)
    except Exception:
        found_dict = dict()
    for missed_curie in misses:
        # Try eUtils for this node
        node_id = canonical_curie_info[missed_curie].get('preferred_curie')
        node_name = canonical_curie_info[missed_curie].get('preferred_name')
        node_type = canonical_curie_info[missed_curie].get('preferred_type')
        try:
            pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
        except Exception:
            print(f"ERROR using back-up method: {traceback.format_exc()}")
        else:
            if pmids and any(pmids):
                num_eutils_found += 1
                print(
                    f"    Found {len(pmids[0])} PMIDs for {node_id}, {node_name}."
                )
                found_dict[node_id] = {'name': node_name, 'type': node_type}
            else:
                print(f"    Not found. ({node_id}, {node_name})")

    # Report some findings
    # Guard against division by zero when local ngd missed nothing
    percent_found_by_eutils = round((num_eutils_found / len(misses)) * 100) if misses else 0
    print(
        f"Eutils found {num_eutils_found} out of {len(misses)} curies that local ngd missed ({percent_found_by_eutils}%)"
    )
    found_types = [
        node_info['type'] for node_id, node_info in found_dict.items()
    ]
    counter = collections.Counter(found_types)
    print(counter)

    # Save the data to a JSON file for access later
    with open('misses_found_by_eutils.json', 'w') as output_file:
        json.dump(found_dict, output_file)
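A quick follow-up sketch for re-loading the saved JSON and re-tallying the type counts, mirroring the Counter logic above:

import collections
import json

with open('misses_found_by_eutils.json') as file:
    found_dict = json.load(file)
print(collections.Counter(info['type'] for info in found_dict.values()))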
Example #6
import requests_cache
import CachedMethods
from NormGoogleDistance import NormGoogleDistance
import time
import pandas
import csv

df = pandas.read_csv('nodes_id_name.csv')

# Resolve a MeSH term for each (id, name) pair in the CSV
for curie_id, node_name in zip(df['id'], df['name']):
    mesh_term = NormGoogleDistance.get_mesh_term_for_all(curie_id, node_name)
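The script assumes `nodes_id_name.csv` has `id` and `name` columns; a sketch that writes an illustrative two-row fixture (the CURIEs and names are made up for demonstration):

import pandas

pandas.DataFrame({
    'id': ['HP:0001947', 'CHEMBL.COMPOUND:CHEMBL25'],  # illustrative CURIEs
    'name': ['Renal tubular acidosis', 'aspirin'],
}).to_csv('nodes_id_name.csv', index=False)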
Example #7
    def add_node_pmids(self):
        """
        Iterate over all the nodes in the knowledge graph and decorate them with PMIDs from PubMed abstracts.
        :return: response
        """
        self.response.debug("Adding node PMIDs")
        self.response.info(
            "Adding PubMed IDs to nodes based on occurrence in PubMed abstracts"
        )
        name = "pubmed_ids"
        attribute_type = "EDAM:data_0971"
        value = ""
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        ncbi_warning_flag = True

        # iterate over KG nodes, add the information
        try:
            canonicalized_curie_lookup = self.ngd._get_canonical_curies_map(
                list(set(self.message.knowledge_graph.nodes.keys())))
            self.ngd.load_curie_to_pmids_data(
                canonicalized_curie_lookup.values())
            for key, node in self.message.knowledge_graph.nodes.items():
                # Make sure the attributes are not None
                if not node.attributes:
                    node.attributes = []
                node_curie = key
                node_name = node.name
                # consult the local curie-to-PMIDs map first
                pmids = self.ngd.curie_to_pmids_map.get(node_curie)
                if pmids is None or len(pmids) < 1:
                    if ncbi_warning_flag:
                        self.response.warning(
                            "Utilizing API calls to NCBI eUtils, so this may take a while..."
                        )
                        ncbi_warning_flag = False
                    try:
                        # fall back to eUtils; get_pmids_for_all is batch-oriented,
                        # so take the first (and only) result
                        pmids = NGD.get_pmids_for_all(
                            [node_curie], [node_name])[0]
                    except Exception:
                        self.response.warning(
                            f"There was an error retrieving the PMIDs for {node_curie} from NCBI eUtils."
                        )
                        pmids = []
                else:
                    pmids = [f"PMID:{pmid}" for pmid in pmids]
                if 'max_num' in self.parameters:
                    pmids = pmids[0:self.parameters['max_num']]
                value = pmids
                # populate the PubMed-ID node attribute and append it to the list
                pmid_node_attribute = NodeAttribute(
                    attribute_type_id=attribute_type,
                    original_attribute_name=name,
                    value=value,
                    value_url=url)
                node.attributes.append(pmid_node_attribute)
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                "Something went wrong adding the PubMed ID attributes")
        else:
            self.response.info("PubMed IDs successfully added to nodes")

        return self.response
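Compared to Example #4, this version consults a local curie-to-PMIDs map before falling back to eUtils; a condensed sketch of that lookup-then-fallback pattern (the helper name `pmids_with_fallback` is hypothetical):

def pmids_with_fallback(ngd, node_curie, node_name):
    """Hypothetical helper condensing the lookup-then-fallback logic above."""
    pmids = ngd.curie_to_pmids_map.get(node_curie)
    if pmids:  # local hit: normalize to 'PMID:...' strings
        return [f"PMID:{pmid}" for pmid in pmids]
    # local miss: fall back to the batch-oriented eUtils method
    return NGD.get_pmids_for_all([node_curie], [node_name])[0]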