Exemple #1
0
def generate_adeft_terms():
    """Generate synonym Terms for the shortforms handled by Adeft models.

    Each available Adeft shortform is loaded, and every grounding of the
    form ``NAMESPACE:ID`` listed by its disambiguator yields one Term whose
    text is the shortform. Groundings that are 'ungrounded', that lack a
    namespace prefix, or that use a namespace other than HGNC, GO, MESH,
    CHEBI, FPLX or UP are skipped (the latter with a warning).

    Returns
    -------
    list of Term
        De-duplicated terms sorted by normalized text, with ties broken by
        shortform, namespace and identifier so that the order is the same
        from run to run.
    """
    from adeft import available_shortforms
    from adeft.disambiguate import load_disambiguator
    all_term_args = set()
    for shortform in available_shortforms:
        da = load_disambiguator(shortform)
        for grounding in da.names.keys():
            if grounding == 'ungrounded' or ':' not in grounding:
                continue
            db_ns, db_id = grounding.split(':', maxsplit=1)
            if db_ns == 'HGNC':
                standard_name = hgnc_client.get_hgnc_name(db_id)
            elif db_ns == 'GO':
                standard_name = go_client.get_go_label(db_id)
            elif db_ns == 'MESH':
                standard_name = mesh_client.get_mesh_name(db_id)
            elif db_ns == 'CHEBI':
                standard_name = chebi_client.get_chebi_name_from_id(db_id)
            elif db_ns == 'FPLX':
                # FamPlex identifiers double as the standard name
                standard_name = db_id
            elif db_ns == 'UP':
                standard_name = uniprot_client.get_gene_name(db_id)
            else:
                logger.warning('Unknown grounding namespace from Adeft: %s' %
                               db_ns)
                continue
            term_args = (normalize(shortform), shortform, db_ns, db_id,
                         standard_name, 'synonym', 'adeft')
            # The set removes duplicates of identical entries
            all_term_args.add(term_args)
    # Sorting on the normalized text alone would leave entries that share
    # that text in set-iteration order, which is not stable between
    # interpreter runs. The sort key therefore also includes the shortform,
    # namespace and identifier; the remaining fields are either looked up
    # from these or constant, and the standard name is deliberately left
    # out of the comparison.
    terms = [
        Term(*term_args)
        for term_args in sorted(all_term_args, key=lambda x: x[:4])
    ]
    return terms
Exemple #2
0
def get_specific_chebi_id(chebi_ids, name):
    """Pick the most relevant ChEBI ID for an entity out of several.

    NOTE: this function is mainly factored out to be able to use caching;
    according to the original note it needs a frozenset as input for that
    to work.

    Parameters
    ----------
    chebi_ids : iterable of str
        Candidate ChEBI IDs, possibly including secondary or invalid ones.
    name : str
        The name of the entity the IDs were given for.

    Returns
    -------
    The manual override registered for the name if there is one; otherwise
    a candidate whose ChEBI name equals the name ignoring case; otherwise
    whatever chebi_client.get_specific_id returns for the remaining
    non-generic primary IDs.
    """
    # A manual override for this name wins over everything else
    override = manual_chebi_map.get(name)
    if override:
        return override

    # Secondary IDs are replaced by their primaries. Occasionally, invalid
    # ChEBI IDs are given that have no primary ID (None), so those are
    # filtered out in a second step.
    mapped_ids = set(map(chebi_client.get_primary_id, chebi_ids))
    valid_ids = {mapped for mapped in mapped_ids if mapped is not None}
    # Generic IDs are never useful for grounding
    candidates = valid_ids - generic_chebi_ids

    # Name-based grounding: look up all candidate names first, then return
    # the first candidate whose name matches the entity name
    named_candidates = [
        (candidate, chebi_client.get_chebi_name_from_id(candidate))
        for candidate in candidates
    ]
    for candidate, candidate_name in named_candidates:
        if candidate_name and candidate_name.lower() == name.lower():
            return candidate

    # No name match, so distill the candidates down to the most specific
    # one based on the hierarchy
    return chebi_client.get_specific_id(candidates)
Exemple #3
0
    def standardize_agent_name(agent, standardize_refs=True):
        """Standardize the name of an Agent based on grounding information.

        The name is chosen from the Agent's prioritized grounding, i.e.,
        the namespace and ID returned by its get_grounding method. For a
        FamPlex grounding the FamPlex ID is used as the name. For HGNC, UP,
        CHEBI, MESH and GO groundings the corresponding standard name is
        looked up and used if the lookup succeeds. If there is no grounding,
        the namespace is not one of the above, or the lookup yields nothing,
        the name is not changed.

        Parameters
        ----------
        agent : indra.statements.Agent
            An INDRA Agent whose name attribute should be standardized based
            on grounding information. If None, nothing is done.
        standardize_refs : Optional[bool]
            If True, this function assumes that the Agent's db_refs need to
            be standardized, e.g., HGNC mapped to UP.
            Default: True

        Returns
        -------
        None
            The Agent is modified in place.
        """
        # We return immediately for None Agents
        if agent is None:
            return

        if standardize_refs:
            agent.db_refs = GroundingMapper.standardize_db_refs(agent.db_refs)

        # We next look for prioritized grounding, if missing, we return
        db_ns, db_id = agent.get_grounding()
        if not db_ns or not db_id:
            return

        # If there's a FamPlex ID, prefer that for the name
        if db_ns == 'FPLX':
            agent.name = agent.db_refs['FPLX']
        elif db_ns == 'HGNC':
            # NOTE(review): an earlier comment here said get_grounding
            # returns an HGNC symbol, yet the value is passed to
            # get_hgnc_name, which suggests an HGNC ID - confirm.
            hgnc_name = hgnc_client.get_hgnc_name(db_id)
            # As in the other branches, a failed lookup must not wipe out
            # the existing name
            if hgnc_name:
                agent.name = hgnc_name
        elif db_ns == 'UP':
            # Try for the gene name
            gene_name = uniprot_client.get_gene_name(agent.db_refs['UP'],
                                                     web_fallback=False)
            if gene_name:
                agent.name = gene_name
        elif db_ns == 'CHEBI':
            chebi_name = \
                chebi_client.get_chebi_name_from_id(agent.db_refs['CHEBI'])
            if chebi_name:
                agent.name = chebi_name
        elif db_ns == 'MESH':
            mesh_name = mesh_client.get_mesh_name(agent.db_refs['MESH'], False)
            if mesh_name:
                agent.name = mesh_name
        elif db_ns == 'GO':
            go_name = go_client.get_go_label(agent.db_refs['GO'])
            if go_name:
                agent.name = go_name
        return
def generate_chebi_terms():
    """Generate Terms for ChEBI standard names and synonyms.

    Standard names are read from chebi_entries.tsv in the INDRA resources
    folder. Synonyms come from names_3star.tsv, read from the same folder
    if present and otherwise loaded from the ChEBI FTP site with pandas.
    Synonym rows whose compound ID has no primary ChEBI ID, or whose
    primary ID has no name, are skipped, and duplicate synonym entries are
    only added once.

    Returns
    -------
    list of Term
        Name terms followed by synonym terms.
    """
    entries_path = os.path.join(indra_resources, 'chebi_entries.tsv')
    logger.info('Loading %s' % entries_path)
    terms = []
    for entry in read_csv(entries_path, header=True, delimiter='\t'):
        curie = 'CHEBI:' + entry['CHEBI_ID']
        entry_name = entry['NAME']
        terms.append(Term(normalize(entry_name), entry_name, 'CHEBI', curie,
                          entry_name, 'name', 'chebi'))
    logger.info('Loaded %d terms' % len(terms))

    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    synonyms_path = os.path.join(indra_resources, 'names_3star.tsv')
    if os.path.exists(synonyms_path):
        rows = read_csv(synonyms_path, header=True, delimiter='\t')
    else:
        # No local copy, so fall back to reading the remote file
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
                    'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.'
                    % chebi_url)
        df = pd.read_csv(chebi_url, sep='\t')
        rows = (record for _, record in df.iterrows())

    seen = set()
    for row in rows:
        primary_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not primary_id:
            logger.info('Could not get valid CHEBI ID for %s' %
                        row['COMPOUND_ID'])
            continue
        synonym = str(row['NAME'])
        standard_name = \
            chebi_client.get_chebi_name_from_id(primary_id, offline=True)
        if standard_name is None:
            logger.info('Could not get valid name for %s' % primary_id)
            continue

        # The argument tuple doubles as the key for duplicate detection
        synonym_args = (normalize(synonym), synonym, 'CHEBI',
                        'CHEBI:%s' % primary_id, standard_name, 'synonym',
                        'chebi')
        if synonym_args not in seen:
            terms.append(Term(*synonym_args))
            seen.add(synonym_args)
    logger.info('Loaded %d terms' % len(terms))
    return terms
Exemple #5
0
def generate_famplex_terms(ignore_mappings=False):
    """Generate Terms from the FamPlex grounding map.

    Each row of grounding_map.csv holds an entity text followed by
    alternating namespace and identifier columns. One Term is produced per
    row from the first namespace found when checking in the order FPLX,
    HGNC, UP, CHEBI, GO, MESH. Rows with none of these are skipped.

    Parameters
    ----------
    ignore_mappings : Optional[bool]
        If True, MESH groundings are kept as MESH even when mesh_mappings
        has an entry for them. Default: False

    Returns
    -------
    list of Term
        One term per usable row, in file order.
    """
    map_path = os.path.join(indra_resources, 'famplex', 'grounding_map.csv')
    logger.info('Loading %s' % map_path)
    terms = []
    for row in read_csv(map_path, delimiter=','):
        text = row[0]
        norm_text = normalize(text)
        # Pair up namespace/ID columns, dropping pairs with an empty part
        refs = {ns: ref for ns, ref in zip(row[1::2], row[2::2])
                if (ns and ref)}
        if 'FPLX' in refs:
            fplx_id = refs['FPLX']
            terms.append(Term(norm_text, text, 'FPLX', fplx_id, fplx_id,
                              'assertion', 'famplex'))
        elif 'HGNC' in refs:
            # The grounding map stores the symbol, the Term needs the ID
            symbol = refs['HGNC']
            terms.append(Term(norm_text, text, 'HGNC',
                              hgnc_client.get_hgnc_id(symbol), symbol,
                              'assertion', 'famplex', '9606'))
        elif 'UP' in refs:
            term_ns = 'UP'
            term_id = refs['UP']
            term_name = term_id
            organism = None
            if uniprot_client.is_human(term_id):
                organism = '9606'
                hgnc_id = uniprot_client.get_hgnc_id(term_id)
                if hgnc_id:
                    # Human proteins with an HGNC entry are represented
                    # as HGNC terms
                    term_name = hgnc_client.get_hgnc_name(hgnc_id)
                    term_ns = 'HGNC'
                    term_id = hgnc_id
                else:
                    logger.warning('No gene name for %s' % term_id)
            terms.append(Term(norm_text, text, term_ns, term_id, term_name,
                              'assertion', 'famplex', organism))
        elif 'CHEBI' in refs:
            chebi_curie = refs['CHEBI']
            # The first six characters are dropped for the name lookup
            # (presumably the 'CHEBI:' prefix)
            chebi_name = chebi_client.get_chebi_name_from_id(chebi_curie[6:])
            terms.append(Term(norm_text, text, 'CHEBI', chebi_curie,
                              chebi_name, 'assertion', 'famplex'))
        elif 'GO' in refs:
            go_id = refs['GO']
            terms.append(Term(norm_text, text, 'GO', go_id,
                              go_client.get_go_label(go_id), 'assertion',
                              'famplex'))
        elif 'MESH' in refs:
            mesh_id = refs['MESH']
            mapped = mesh_mappings.get(mesh_id)
            if mapped and not ignore_mappings:
                term_ns, term_id, term_name = mapped
            else:
                term_ns, term_id, term_name = \
                    'MESH', mesh_id, mesh_client.get_mesh_name(mesh_id)
            terms.append(Term(norm_text, text, term_ns, term_id, term_name,
                              'assertion', 'famplex'))
        # TODO: handle HMDB, PUBCHEM, CHEMBL; such rows are skipped for now
    return terms
Exemple #6
0
def generate_chebi_terms():
    """Generate Terms for ChEBI standard names and synonyms.

    Standard-name terms are obtained via _generate_obo_terms. Synonyms come
    from names_3star.tsv, read from the INDRA resources folder if present
    and otherwise loaded from the ChEBI FTP site with pandas. Synonym rows
    are skipped if their compound ID has no primary ChEBI ID, if the
    primary ID has no name, or if they look like an amino acid sequence
    abbreviation. Duplicate synonym entries are only added once.

    Returns
    -------
    list of Term
        Name terms followed by synonym terms.
    """
    # We can get standard names directly from the OBO
    terms = _generate_obo_terms('chebi', ignore_mappings=True, map_to_ns={})

    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    synonyms_path = os.path.join(indra_resources, 'names_3star.tsv')
    if os.path.exists(synonyms_path):
        rows = read_csv(synonyms_path, header=True, delimiter='\t')
    else:
        # No local copy, so fall back to reading the remote file
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
                    'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.' %
                    chebi_url)
        df = pd.read_csv(chebi_url, sep='\t')
        rows = (record for _, record in df.iterrows())

    seen = set()
    for row in rows:
        chebi_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not chebi_id:
            logger.info('Could not get valid CHEBI ID for %s' %
                        row['COMPOUND_ID'])
            continue
        synonym = str(row['NAME'])
        standard_name = \
            chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
        if standard_name is None:
            logger.info('Could not get valid name for %s' % chebi_id)
            continue
        # Entries of the form Glu-Lys with synonyms like EK are skipped
        # since these are highly ambiguous with other acronyms, and are
        # unlikely to be used in practice.
        if is_aa_sequence(standard_name) and \
                re.match(r'(^[A-Z-]+$)', synonym):
            continue

        # The argument tuple doubles as the key for duplicate detection
        synonym_args = (normalize(synonym), synonym, 'CHEBI', chebi_id,
                        standard_name, 'synonym', 'chebi')
        if synonym_args not in seen:
            terms.append(Term(*synonym_args))
            seen.add(synonym_args)
    logger.info('Loaded %d terms' % len(terms))
    return terms
Exemple #7
0
    def _get_complex_agents(self, complex_id):
        """Returns a list of agents corresponding to each of the constituents
        in a SIGNOR complex."""
        agents = []
        components = self._recursively_lookup_complex(complex_id)

        for c in components:
            db_refs = {}
            if c.startswith('CHEBI'):
                db_refs['CHEBI'] = c
                name = chebi_client.get_chebi_name_from_id(c)
            else:
                name = uniprot_client.get_gene_name(c)
                if name is None:
                    db_refs['SIGNOR'] = c
                else:
                    db_refs['UP'] = c
                    hgnc_id = uniprot_client.get_hgnc_id(c)
                    if hgnc_id:
                        name = hgnc_client.get_hgnc_name(hgnc_id)
                        db_refs['HGNC'] = hgnc_id

                famplex_key = ('SIGNOR', c)
                if famplex_key in famplex_map:
                    db_refs['FPLX'] = famplex_map[famplex_key]
                    if not name:
                        # Set agent name to Famplex name if
                        # the Uniprot name is not available
                        name = db_refs['FPLX']
                elif not name:
                    # We neither have a Uniprot nor Famplex grounding
                    logger.info('Have neither a Uniprot nor Famplex grounding '
                                'for "%s" in complex %s' % (c, complex_id))
                    if not name:
                        # Set the agent name to the Signor name if neither the
                        # Uniprot nor Famplex names are available
                        name = db_refs['SIGNOR']
            assert name is not None
            agents.append(Agent(name, db_refs=db_refs))
        return agents
Exemple #8
0
def update_hmdb_chebi_map():
    """Regenerate the HMDB to ChEBI mapping table from the HMDB XML dump.

    Downloads hmdb_metabolites.zip, streams through the metabolite entries
    in the contained XML file, and writes hmdb_to_chebi.tsv with one
    HMDB_ID/CHEBI_ID row per metabolite that has both an accession and a
    ChEBI ID for which a ChEBI name can be found. Rows are sorted by HMDB
    ID. Both files are placed in the folder given by the module-level
    ``path``.
    """
    logger.info('--Updating HMDB to ChEBI entries----')
    ns = {'hmdb': 'http://www.hmdb.ca'}
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    fname = os.path.join(path, 'hmdb_metabolites.zip')
    logger.info('Downloading %s' % url)
    urlretrieve(url, fname)
    metabolite_tag = '{%s}metabolite' % ns['hmdb']
    accession_tag = '{%s}accession' % ns['hmdb']
    chebi_tag = '{%s}chebi_id' % ns['hmdb']
    mappings = []
    hmdb_id = None
    chebi_id = None
    with ZipFile(fname) as input_zip:
        with input_zip.open('hmdb_metabolites.xml') as fh:
            for event, elem in ET.iterparse(fh, events=('start', 'end')):
                if event == 'start':
                    # A new metabolite entry begins: forget the IDs of
                    # the previous one. Nothing is read or cleared on
                    # start events since an element's text is not
                    # guaranteed to be available until its end event.
                    if elem.tag == metabolite_tag:
                        hmdb_id = None
                        chebi_id = None
                    continue
                # Important: we only look at accession if there's no HMDB
                # ID yet, otherwise we pick up secondary accession tags
                if elem.tag == accession_tag:
                    if not hmdb_id:
                        hmdb_id = elem.text
                elif elem.tag == chebi_tag:
                    chebi_id = elem.text
                elif elem.tag == metabolite_tag:
                    if hmdb_id and chebi_id:
                        name = chebi_client.get_chebi_name_from_id(chebi_id)
                        if name:
                            mappings.append([hmdb_id, chebi_id])
                        else:
                            print('Likely invalid ChEBI mapping: ',
                                  hmdb_id, chebi_id)
                # The element is complete and its values have been copied,
                # so its content can be released to keep memory use low
                elem.clear()
    fname = os.path.join(path, 'hmdb_to_chebi.tsv')
    mappings = [['HMDB_ID', 'CHEBI_ID']] + sorted(mappings, key=lambda x: x[0])
    write_unicode_csv(fname, mappings, delimiter='\t')
Exemple #9
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. If no standard name is
        found, this is the name that was passed in (for unmapped Entrez
        IDs, the name prefixed with 'E').
    db_refs : dict or None
        The grounding for the given entity. None if the namespace is not
        handled or if the name could not be mapped to an identifier.

    """
    # Stays None unless one of the branches below finds a grounding
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            # Several current IDs are possible; the first one is used
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        # Add UniProt and miRBase groundings where available
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        # Prefer the HGNC name if the protein maps to HGNC, otherwise
        # use the UniProt gene name
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns == 'FPLX':
        # The name is used as the FamPlex ID as is
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        # This label is replaced by another one before the ID lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For MGI/RGD, the name is looked up in the mouse/rat lookup tables to
    # get a UniProt ID. If there is no entry, db_refs stays None. The name
    # itself is left unchanged in both cases.
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex; the Selventa
    # grounding is kept even if no FamPlex mapping exists
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        # The Entrez ID is kept as a grounding whether or not it maps
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            # Without an HGNC mapping, the name becomes the Entrez ID
            # prefixed with 'E'
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        name = chebi_client.get_chebi_name_from_id(chebi_id)
    # SDIS, SCHEM, TEXT: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        # Unknown namespace: the name is returned as is with no grounding
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, db_refs
Exemple #10
0
def test_chebi_id_to_name():
    """Check that a prefixed ChEBI ID resolves to the expected name."""
    looked_up = chebi_client.get_chebi_name_from_id('CHEBI:63637')
    # The looked up value is the assertion message to ease debugging
    assert 'vemurafenib' == looked_up, looked_up
Exemple #11
0
def _urn_to_db_refs(urn):
    """Converts a Medscan URN to an INDRA db_refs dictionary with grounding
    information.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict
        A dictionary with grounding information, mapping databases to database
        identifiers. If the Medscan URN is not recognized, returns an empty
        dictionary.
    db_name : str
        The Famplex name, if available; otherwise the HGNC name if available;
        otherwise None
    """
    # Convert a urn to a db_refs dictionary
    if urn is None:
        return {}, None

    m = URN_PATT.match(urn)
    if m is None:
        return None, None

    urn_type, urn_id = m.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id

            # Convert the HGNC ID to a Uniprot ID
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id

            # Try to lookup HGNC name; if it's available, set it to the
            # agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in [
            'agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
            'agi-ncimcelltype'
    ]:
        if urn_id.startswith('C') and urn_id[1:].isdigit():
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is MESH
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type == 'agi-gocomplex':
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id
    elif urn_type == 'agi-go':
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # If we have a GO or MESH grounding, see if there is a corresponding
    # Famplex grounding
    db_sometimes_maps_to_famplex = ['GO', 'MESH']
    for db in db_sometimes_maps_to_famplex:
        if db in db_refs:
            key = (db, db_refs[db])
            if key in famplex_map:
                db_refs['FPLX'] = famplex_map[key]

    # If the urn corresponds to an eccode, groudn to famplex if that eccode
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        tokens = urn.split(':')
        eccode = tokens[2]
        key = ('ECCODE', eccode)
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding
    key = ('MEDSCAN', urn)
    if key in famplex_map:
        db_refs['FPLX'] = famplex_map[key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name
"""This script helps identify entries in PubChem.tsv that systematically
lead to incorrect groundings and should therefore be removed."""

import os
import re
from indra.databases import chebi_client

if __name__ == '__main__':
    # Locate the PubChem knowledge base file relative to this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    kb_dir = os.path.join(script_dir, os.pardir, 'src', 'main', 'resources',
                          'org', 'clulab', 'reach', 'kb')
    resource_fname = os.path.join(kb_dir, 'PubChem.tsv')

    # Collect the lines to keep; lines without a tab are dropped
    kept_lines = []
    with open(resource_fname, 'r') as fh:
        for line in fh:
            if '\t' not in line:
                continue
            txt, pubchem_id = (part.strip() for part in line.split('\t'))
            # Only two-capital-letter texts are candidates for removal
            if re.match(r'^[A-Z][A-Z]$', txt):
                chebi_id = chebi_client.get_chebi_id_from_pubchem(pubchem_id)
                chebi_name = chebi_client.get_chebi_name_from_id(chebi_id)
                # Drop the entry if the ChEBI name is seven characters long
                # and contains a dash (presumably names like Glu-Lys)
                if chebi_name and len(chebi_name) == 7 and '-' in chebi_name:
                    continue
            kept_lines.append(line)
    # Overwrite the resource file with the lines that were kept
    with open(resource_fname, 'w') as fh:
        fh.writelines(kept_lines)
Exemple #13
0
def test_chebi_id_to_name():
    """Check that an unprefixed ChEBI ID resolves offline to its name."""
    looked_up = chebi_client.get_chebi_name_from_id('63637', offline=True)
    # The looked up value is the assertion message to ease debugging
    assert 'vemurafenib' == looked_up, looked_up
Exemple #14
0
def _urn_to_db_refs(urn):
    """Converts a Medscan URN to an INDRA db_refs dictionary with grounding
    information.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict
        A dictionary with grounding information, mapping databases to database
        identifiers. If the Medscan URN is not recognized, returns an empty
        dictionary.
    db_name : str
        The Famplex name, if available; otherwise the HGNC name if available;
        otherwise None
    """
    # Convert a urn to a db_refs dictionary
    if urn is None:
        return {}, None

    m = URN_PATT.match(urn)
    if m is None:
        return None, None

    urn_type, urn_id = m.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id

            # Convert the HGNC ID to a Uniprot ID
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id

            # Try to lookup HGNC name; if it's available, set it to the
            # agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype']:
        if urn_id.startswith('C') and urn_id[1:].isdigit():
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is MESH
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type == 'agi-gocomplex':
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id
    elif urn_type == 'agi-go':
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # If we have a GO or MESH grounding, see if there is a corresponding
    # Famplex grounding
    db_sometimes_maps_to_famplex = ['GO', 'MESH']
    for db in db_sometimes_maps_to_famplex:
        if db in db_refs:
            key = (db, db_refs[db])
            if key in famplex_map:
                db_refs['FPLX'] = famplex_map[key]

    # If the urn corresponds to an eccode, groudn to famplex if that eccode
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        tokens = urn.split(':')
        eccode = tokens[2]
        key = ('ECCODE', eccode)
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding
    key = ('MEDSCAN', urn)
    if key in famplex_map:
        db_refs['FPLX'] = famplex_map[key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name