Esempio n. 1
0
def run_adeft_disambiguation(stmt, agent_list, idx, new_agent, agent_txt):
    """Re-ground an Agent with Adeft and record the predicted scores.

    Parameters
    ----------
    stmt : object
        Statement being processed. The ``annotations`` dict of its first
        Evidence (if it has any Evidence) receives the Adeft scores, and
        the Statement is passed to ``_get_text_for_grounding`` to obtain
        the text that is disambiguated.
    agent_list : list
        The Agents of the Statement. Only its length is used here, to size
        the per-agent list of Adeft scores.
    idx : int
        Position in ``agent_list`` of the Agent being disambiguated.
    new_agent : object
        The Agent whose ``db_refs`` and ``name`` are overwritten in place
        when Adeft produces a grounding.
    agent_txt : str
        The agent text; used as the key into ``adeft_disambiguators`` and
        stored as the ``TEXT`` entry of the new ``db_refs``.

    Returns
    -------
    None
        All results are communicated by mutating ``new_agent`` and the
        Evidence annotations.
    """
    # Initialize annotations if needed so Adeft predicted
    # probabilities can be added to Agent annotations
    annots = stmt.evidence[0].annotations if stmt.evidence else {}
    agent_annots = annots.setdefault('agents', {})
    if 'adeft' not in agent_annots:
        # The scores are kept as a flat list with one slot per agent so
        # that the slot for this agent can be addressed with idx below.
        agent_annots['adeft'] = [None for _ in agent_list]

    grounding_text = _get_text_for_grounding(stmt, agent_txt)
    if not grounding_text:
        return

    res = adeft_disambiguators[agent_txt].disambiguate([grounding_text])
    ns_and_id, standard_name, disamb_scores = res[0]
    # If the highest score is ungrounded we don't do anything
    # TODO: should we explicitly remove grounding if we conclude it
    # doesn't match any of the choices?
    if ns_and_id == 'ungrounded':
        return

    # Only split on the first colon since the ID part may contain colons
    db_ns, db_id = ns_and_id.split(':', maxsplit=1)
    new_agent.db_refs = {'TEXT': agent_txt, db_ns: db_id}
    new_agent.name = standard_name
    logger.info('Disambiguated %s to: %s, %s:%s',
                agent_txt, standard_name, db_ns, db_id)
    if db_ns == 'HGNC':
        hgnc_sym = hgnc_client.get_hgnc_name(db_id)
        GroundingMapper.standardize_agent_db_refs(new_agent,
                                                  {'HGNC': hgnc_sym},
                                                  do_rename=False)
    agent_annots['adeft'][idx] = disamb_scores
Esempio n. 2
0
 def _get_hgnc_name(self, hgnc_id):
     """Return the HGNC name for an HGNC ID, memoizing client lookups.

     The result of every lookup (whatever the client returned) is stored
     in ``self._hgnc_cache`` so the client is asked at most once per ID.
     """
     cache = self._hgnc_cache
     try:
         return cache[hgnc_id]
     except KeyError:
         # First request for this ID: fall through to the client lookup
         pass
     looked_up = hgnc_client.get_hgnc_name(hgnc_id)
     cache[hgnc_id] = looked_up
     return looked_up
Esempio n. 3
0
    def _make_db_refs(self, entrez_id, text_id):
        """Looks up the HGNC ID and name, as well as the Uniprot ID.

        Parameters
        ----------
        entrez_id : str
            Entrez gene ID.
        text_id : str or None
            A plain text systematic name, or None if not listed in the
            Biogrid data. The placeholder value '-' is treated like None.

        Returns
        -------
        hgnc_name : str or None
            Official HGNC symbol for the gene, or None if the Entrez ID
            could not be mapped to an HGNC ID.
        db_refs : dict
            db_refs grounding dictionary, used when constructing the Agent
            object. Contains 'TEXT', 'HGNC' and 'UP' entries only for the
            pieces of information that are available.
        """
        db_refs = {}
        if text_id != '-' and text_id is not None:
            db_refs['TEXT'] = text_id

        hgnc_name = None
        hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
        if hgnc_id is not None:
            # Only query the name and Uniprot ID once we actually have an
            # HGNC ID; there is nothing to look up for a failed mapping.
            hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
            db_refs['HGNC'] = hgnc_id
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id is not None:
                db_refs['UP'] = up_id
        return (hgnc_name, db_refs)
Esempio n. 4
0
def _get_agent_grounding(agent):
    """Convert an agent to the corresponding PyBEL DSL object.

    The object is to be filled with variants later. Groundings are tried
    in a fixed priority order and the first one present decides the
    result; None is returned if no usable grounding is found, or if the
    agent has an HGNC ID for which no HGNC name can be looked up.
    """
    def _first_id(key):
        # A db_refs entry may be a list, in which case its first element
        # is taken as the identifier.
        value = agent.db_refs.get(key)
        return value[0] if isinstance(value, list) else value

    hgnc_id = _first_id('HGNC')
    if hgnc_id:
        hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
        if hgnc_name:
            return protein('HGNC', hgnc_name)
        logger.warning('Agent %s with HGNC ID %s has no HGNC name.',
                       agent, hgnc_id)
        return None

    # (db_refs key, namespace used for the protein node), by priority
    protein_namespaces = (
        ('UP', 'UP'),
        ('FPLX', 'FPLX'),
        ('PF', 'PFAM'),
        ('IP', 'IP'),
        ('FA', 'NXPFA'),
    )
    for refs_key, node_ns in protein_namespaces:
        db_id = _first_id(refs_key)
        if db_id:
            return protein(node_ns, db_id)

    chebi_id = _first_id('CHEBI')
    if chebi_id:
        # The namespace prefix is dropped from the identifier if present
        prefix = 'CHEBI:'
        if chebi_id.startswith(prefix):
            chebi_id = chebi_id[len(prefix):]
        return abundance('CHEBI', chebi_id)

    pubchem_id = _first_id('PUBCHEM')
    if pubchem_id:
        return abundance('PUBCHEM', pubchem_id)

    for process_ns in ('GO', 'MESH'):
        db_id = _first_id(process_ns)
        if db_id:
            return bioprocess(process_ns, db_id)

    return None
Esempio n. 5
0
 def _extract_protein(self, name, gene_id):
     """Return an Agent for a gene given its name and Entrez gene ID.

     The Entrez ID is always stored under 'EGID'. If it maps to an HGNC
     ID, that ID (and the Uniprot ID, when one is found) is added to the
     grounding and the name reported by the HGNC client replaces `name`.
     """
     db_refs = {'EGID': gene_id}
     hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
     if hgnc_id is None:
         # No HGNC mapping: keep the name we were given
         return Agent(name, db_refs=db_refs)

     db_refs['HGNC'] = hgnc_id
     uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
     if uniprot_id:
         db_refs['UP'] = uniprot_id
     # With an HGNC ID available, the gene name is standardized
     standard_name = hgnc_client.get_hgnc_name(hgnc_id)
     return Agent(standard_name, db_refs=db_refs)
Esempio n. 6
0
 def get_db_refs(egid):
     """Map an Entrez gene ID to an HGNC symbol and HGNC/Uniprot grounding.

     Returns
     -------
     tuple
         ``(hgnc_name, {'HGNC': hgnc_id, 'UP': up_id})`` when the HGNC ID,
         the HGNC name and the Uniprot ID can all be found. Otherwise
         ``(None, {})``, after logging which of the lookups failed.
     """
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     if hgnc_id:
         symbol = hgnc_client.get_hgnc_name(hgnc_id)
         if symbol:
             uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
             if uniprot_id:
                 return (symbol, {'HGNC': hgnc_id, 'UP': uniprot_id})
             failure = ("No Uniprot ID for EGID / HGNC ID / Symbol "
                        "%s / %s / %s" % (egid, hgnc_id, symbol))
         else:
             failure = "No HGNC name for HGNC ID: %s" % hgnc_id
     else:
         failure = "No HGNC ID for Entrez ID: %s" % egid
     # Every incomplete mapping is reported and yields an empty result
     logger.info(failure)
     return (None, {})
Esempio n. 7
0
def go_gene_pair(stmt):
    """Extract a (process name, GO ID, gene symbol) triple from a Statement.

    Each non-None agent of the Statement is inspected: an agent with an
    'HGNC' grounding provides the gene symbol (looked up from its HGNC
    ID), otherwise an agent with a 'GO' grounding provides the GO ID and
    the process name. If several agents qualify, the last one wins.

    Returns
    -------
    tuple
        ``(bp_name, go_id, gene_symbol)`` if both a GO ID and a gene
        symbol were found, otherwise ``(None, None, None)``.
    """
    go_id = gene_symbol = bp_name = None
    for agent in stmt.agent_list():
        if agent is None:
            continue
        refs = agent.db_refs
        if 'HGNC' in refs:
            gene_symbol = hgnc_client.get_hgnc_name(refs.get('HGNC'))
        elif 'GO' in refs:
            go_id, bp_name = refs.get('GO'), agent.name
    if go_id is None or gene_symbol is None:
        return (None, None, None)
    return (bp_name, go_id, gene_symbol)
Esempio n. 8
0
 def get_grounding(self):
     """Return a (namespace, identifier) pair chosen from self.db_refs.

     Priority is FPLX, then HGNC (reported with the HGNC name rather than
     the ID), then UP. A human Uniprot entry is reported as
     ``('HGNC', gene_name)`` when a gene name is available; a non-human
     entry is reported as ``('UP', up_id)``. ``(None, None)`` is returned
     when none of these apply.
     """
     import indra.databases.hgnc_client as hgc
     import indra.databases.uniprot_client as upc

     def _single(value):
         # Entries may be lists, in which case the first element is used
         return value[0] if isinstance(value, list) else value

     fplx_id = self.db_refs.get('FPLX')
     if fplx_id:
         return ('FPLX', fplx_id)

     hgnc_id = self.db_refs.get('HGNC')
     if hgnc_id:
         return ('HGNC', hgc.get_hgnc_name(str(_single(hgnc_id))))

     up_id = self.db_refs.get('UP')
     if not up_id:
         return (None, None)
     up_id = _single(up_id)
     if not upc.is_human(up_id):
         return ('UP', up_id)
     # Human protein: report the gene name (no web lookup) if there is one
     gene_name = upc.get_gene_name(up_id, web_fallback=False)
     if gene_name:
         return ('HGNC', gene_name)
     return (None, None)
Esempio n. 9
0
 def get_agent(concept, entity):
     """Create an Agent from a concept URI and the URI of its namespace.

     The term extracted from `concept` is used as the Agent name unless
     the namespace-specific handling finds a standardized name. Grounding
     is collected in a db_refs dict keyed by namespace.

     Parameters
     ----------
     concept
         URI passed to `term_from_uri` to obtain the entity's term.
     entity
         URI passed to `namespace_from_uri` to obtain the namespace.

     Returns
     -------
     Agent
         Agent with the chosen name and the collected db_refs.
     """
     name = term_from_uri(concept)
     namespace = namespace_from_uri(entity)
     db_refs = {}
     # Default name for every case that finds no standardized one
     agent_name = name

     def _add_uniprot_ref(hgnc_id):
         # Shared by the HGNC and EGID cases: add the Uniprot ID that
         # corresponds to the HGNC ID, or warn if there is none.
         up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if up_id:
             db_refs['UP'] = up_id
         else:
             logger.warning('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.' %
                            (name, hgnc_id))

     if namespace == 'HGNC':
         hgnc_id = hgnc_client.get_hgnc_id(name)
         if hgnc_id is None:
             logger.warning("Couldn't get HGNC ID for HGNC symbol %s" %
                            name)
         else:
             db_refs['HGNC'] = str(hgnc_id)
             _add_uniprot_ref(hgnc_id)
     elif namespace in ('MGI', 'RGD'):
         db_refs[namespace] = name
     elif namespace in ('PFH', 'SFAM', 'NCH', 'SCOMP'):
         # BEL families and complexes are handled identically except for
         # the wording of the warning.
         if namespace in ('PFH', 'SFAM'):
             not_found = 'Could not find mapping for BEL family: %s'
         else:
             not_found = 'Could not find mapping for BEL complex: %s'
         indra_name = bel_to_indra.get(name)
         db_refs[namespace] = name
         if indra_name is None:
             msg = not_found % name
             logger.warning(msg)
         else:
             db_refs['BE'] = indra_name
             db_refs['TEXT'] = name
             agent_name = indra_name
     elif namespace == 'CHEBI':
         chebi_id = chebi_name_id.get(name)
         if chebi_id:
             db_refs['CHEBI'] = chebi_id
         else:
             logger.warning('CHEBI name %s not found in map.' % name)
     elif namespace == 'EGID':
         hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
         db_refs['EGID'] = name
         if hgnc_id is None:
             logger.warning('Could not map EGID%s to HGNC.' % name)
             agent_name = 'E%s' % name
         else:
             db_refs['HGNC'] = str(hgnc_id)
             agent_name = hgnc_client.get_hgnc_name(hgnc_id)
             _add_uniprot_ref(hgnc_id)
     else:
         logger.warning('Unhandled entity namespace: %s' % namespace)
         print('%s, %s' % (concept, entity))
     return Agent(agent_name, db_refs=db_refs)
Esempio n. 10
0
def _urn_to_db_refs(urn):
    """Translate a Medscan URN into INDRA grounding and a standard name.

    Parameters
    ----------
    urn : str or None
        A Medscan URN

    Returns
    -------
    db_refs : dict or None
        A dictionary with grounding information, mapping databases to
        database identifiers. An empty dictionary is returned if `urn` is
        None, and None is returned if the URN does not match `URN_PATT`.
        A URN that matches but whose type is not handled yields whatever
        groundings the Famplex lookups below add (possibly none).
    db_name : str or None
        The Famplex ID if a Famplex grounding was found; otherwise the GO
        label if there is a GO grounding; otherwise the name found while
        handling the URN type (ChEBI, HGNC or MeSH name), or None.
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None
    urn_type, urn_id = match.groups()

    db_refs = {}
    db_name = None

    def _map_to_famplex(key):
        # Add a Famplex grounding if the key is in the equivalences table
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # CAS identifier, converted to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # Entrez identifier, converted to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            # Add the Uniprot ID corresponding to the HGNC ID
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id
            # The HGNC name (if there is one) becomes the agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ('agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype'):
        looks_like_umls = urn_id.startswith('C') and urn_id[1:].isdigit()
        if looks_like_umls:
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is a (URL-quoted) MESH name
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Identifier is GO in both cases
        db_refs['GO'] = 'GO:%s' % urn_id

    # A GO or MESH grounding may have a corresponding Famplex grounding
    for db_ns in ('GO', 'MESH'):
        if db_ns in db_refs:
            _map_to_famplex((db_ns, db_refs[db_ns]))

    # If the URN corresponds to an EC code, ground to Famplex if that code
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        _map_to_famplex(('ECCODE', urn.split(':')[2]))

    # The Medscan URN itself may also map to a Famplex ID
    _map_to_famplex(('MEDSCAN', urn))

    # A Famplex grounding takes precedence for the entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name
Esempio n. 11
0
    cxa = CxAssembler(stmts)
    cxa.make_model(add_indra_json=False)
    cxa.save_model(save_file)
    return cxa


if __name__ == '__main__':
    # Script entry point: fetch a network from NDEx, collect the names of
    # its genes, and assemble network, prior and REACH statements.

    # Load NDEx credentials (expects 'user' and 'password' keys in the
    # JSON file in the current working directory)
    with open('ndex_cred.json', 'rt') as f:
        ndex_cred = json.load(f)
    # Get the network
    ncp = ndex_cx.process_ndex_network('df1fea48-8cfb-11e7-a10d-0ac135e8bacf',
                                       username=ndex_cred['user'],
                                       password=ndex_cred['password'])
    # NOTE(review): indexing db_refs['HGNC'] raises KeyError for any agent
    # without an HGNC grounding -- confirm all network agents have one.
    gene_names = [
        hgnc_client.get_hgnc_name(ag.db_refs['HGNC'])
        for ag in ncp.get_agents()
    ]
    # The PMID collection step below is disabled: it is wrapped in a bare
    # string literal, which has no effect when the script runs.
    """
    # Get PMIDs for reading
    entrez_pmids = get_pmids(gene_names)
    network_pmids = ncp.get_pmids()
    pmids = list(set(entrez_pmids + network_pmids))
    save_pmids_for_reading(pmids, 'dna_damage_pmids.txt')
    """

    # Build the model from the network statements, the REACH statements
    # loaded from a pickle, and the prior built for the network's genes
    prior_stmts = build_prior(gene_names, 'prior_stmts.pkl')
    reach_stmts = ac.load_statements('reach_stmts.pkl')
    stmts = ncp.statements + reach_stmts + prior_stmts
    stmts = run_assembly(stmts, 'unfiltered_assembled_stmts.pkl')
Esempio n. 12
0
def test_get_hgnc_name():
    """HGNC ID 3236 must resolve to the symbol EGFR, as a unicode string."""
    symbol = hgnc_client.get_hgnc_name('3236')
    assert symbol == 'EGFR'
    assert unicode_strs(symbol)
Esempio n. 13
0
 def _get_hgnc_name(hgnc_id):
     """Return what the HGNC client reports as the name for `hgnc_id`."""
     return hgnc_client.get_hgnc_name(hgnc_id)
Esempio n. 14
0
    def _get_agent_from_ref(self, ref):
        """Build an Agent from a reference element, or return None.

        Parameters
        ----------
        ref : element
            Element providing `attrib`, and `var` children named 'uid',
            'name' and 'raw-text' that are looked up with `find`.

        Returns
        -------
        Agent or None
            None for elements whose category is 'collection' and for
            elements that yield no agent name (no raw text and no name
            derived from the UID). Otherwise an Agent whose db_refs hold
            the raw text and the groundings derived from the UID.
        """
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            return None

        def _var_text(path):
            # Text content of the matching tag, or None if the tag is
            # missing or has no text
            tag = ref.find(path)
            if tag is not None and tag.text:
                return tag.text
            return None

        uid = _var_text("var/[@name='uid']")
        # The name tag is read like the others but is not used further
        name = _var_text("var/[@name='name']")
        raw_text = _var_text("var/[@name='raw-text']")

        db_refs = {}
        # Save raw text if available
        if raw_text:
            db_refs['TEXT'] = raw_text
        agent_name = raw_text

        # Only a UID of the exact form NAMESPACE:ID is used to reconstruct
        # grounding for the Agent
        uid_parts = uid.split(':') if uid is not None else []
        if len(uid_parts) == 2:
            db_ns, db_id = uid_parts
            be_id = get_bioentities_mapping(db_ns, db_id)
            if be_id:
                db_refs[db_ns] = db_id
                db_refs['BE'] = be_id
                agent_name = be_id
            elif db_ns in ['UP', 'Uniprot']:
                db_refs['UP'] = db_id
                gene_name = uniprot_client.get_gene_name(db_id)
                if gene_name:
                    agent_name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif db_ns == 'NCIT':
                db_refs['NCIT'] = db_id
                target = ncit_map.get(db_id)
                if target:
                    # The NCIT ID maps to a (namespace, ID) pair in
                    # another database
                    db_refs[target[0]] = target[1]
                    if target[0] == 'HGNC':
                        up_id = hgnc_client.get_uniprot_id(target[1])
                        agent_name = hgnc_client.get_hgnc_name(target[1])
                        if up_id:
                            db_refs['UP'] = up_id
                    elif target[0] == 'UP':
                        agent_name = uniprot_client.get_gene_name(target[1])
                        if agent_name:
                            hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                            if hgnc_id:
                                db_refs['HGNC'] = hgnc_id
            elif db_ns == 'FA':
                db_refs['NXPFA'] = db_id
            elif db_ns == 'XFAM':
                # Only the part before the first dot is kept
                db_refs['PF'] = db_id.split('.')[0]
            elif db_ns == 'CHEBI':
                db_refs['CHEBI'] = 'CHEBI:' + db_id
            elif db_ns in ['GO', 'MESH', 'BE',
                           'PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
                # These name spaces are stored as they are
                db_refs[db_ns] = db_id
            else:
                logger.warning('Unknown database name space %s' % db_ns)

        if not agent_name:
            # A lookup above may have cleared the name; fall back to the
            # raw text, and give up if there is none
            if raw_text is None:
                return None
            agent_name = raw_text

        assert agent_name

        return Agent(agent_name, db_refs=db_refs)
Esempio n. 15
0
def test_get_hgnc_name_nonexistent():
    """An HGNC ID without an entry must yield None instead of a name."""
    missing_name = hgnc_client.get_hgnc_name('123456')
    assert missing_name is None
Esempio n. 16
0
 def _get_agent(concept, entity):
     """Create an Agent from a concept URI and the URI of its namespace.

     The term extracted from `concept` is the Agent name by default; BEL
     families and complexes with a known mapping are renamed to the mapped
     name, and Entrez genes that map to HGNC take the name reported by the
     HGNC client. Grounding is collected in a db_refs dict keyed by
     namespace.

     Parameters
     ----------
     concept
         URI passed to `term_from_uri` to obtain the entity's term.
     entity
         URI passed to `namespace_from_uri` to obtain the namespace.

     Returns
     -------
     Agent
         Agent with the chosen name and the collected db_refs.
     """
     name = term_from_uri(concept)
     namespace = namespace_from_uri(entity)
     db_refs = {}
     # Used as the Agent name unless a case below replaces it
     agent_name = name

     def _add_uniprot_ref(hgnc_id):
         # Shared by the HGNC and EGID cases: add the Uniprot ID that
         # corresponds to the HGNC ID, or warn if there is none.
         up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if up_id:
             db_refs['UP'] = up_id
         else:
             logger.warning('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.' %
                            (name, hgnc_id))

     if namespace == 'HGNC':
         hgnc_id = hgnc_client.get_hgnc_id(name)
         if hgnc_id is None:
             logger.warning("Couldn't get HGNC ID for HGNC symbol %s" %
                            name)
         else:
             db_refs['HGNC'] = str(hgnc_id)
             _add_uniprot_ref(hgnc_id)
     elif namespace in ('MGI', 'RGD'):
         db_refs[namespace] = name
     elif namespace in ('PFH', 'SFAM', 'NCH', 'SCOMP'):
         # BEL families and complexes are handled identically except for
         # the wording of the warning.
         if namespace in ('PFH', 'SFAM'):
             not_found = 'Could not find mapping for BEL family: %s'
         else:
             not_found = 'Could not find mapping for BEL complex: %s'
         indra_name = bel_to_indra.get(name)
         db_refs[namespace] = name
         if indra_name is None:
             msg = not_found % name
             logger.warning(msg)
         else:
             db_refs['FPLX'] = indra_name
             db_refs['TEXT'] = name
             agent_name = indra_name
     elif namespace == 'CHEBI':
         chebi_id = chebi_name_id.get(name)
         if chebi_id:
             db_refs['CHEBI'] = chebi_id
         else:
             logger.warning('CHEBI name %s not found in map.' % name)
     elif namespace == 'EGID':
         hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
         db_refs['EGID'] = name
         if hgnc_id is None:
             logger.warning('Could not map EGID%s to HGNC.' % name)
             agent_name = 'E%s' % name
         else:
             db_refs['HGNC'] = str(hgnc_id)
             agent_name = hgnc_client.get_hgnc_name(hgnc_id)
             _add_uniprot_ref(hgnc_id)
     else:
         logger.warning('Unhandled entity namespace: %s' % namespace)
         print('%s, %s' % (concept, entity))
     return Agent(agent_name, db_refs=db_refs)
Esempio n. 17
0
def test_get_hgnc_name_nonexistent():
    """An HGNC ID without an entry must yield None.

    The None result is also passed through the `unicode_strs` check.
    """
    result = hgnc_client.get_hgnc_name('123456')
    assert result is None
    assert unicode_strs(result)
Esempio n. 18
0
 def _get_db_refs(entity_term):
     """Derive an agent name and db_refs from an entity term's xrefs.

     Parameters
     ----------
     entity_term : dict
         Must provide 'text' (the entity text) and 'xrefs', an iterable of
         dicts with a 'namespace' and, for all handled namespaces, an 'id'.

     Returns
     -------
     agent_name : str
         The entity text, unless an xref provides a standard name (gene
         name from Uniprot, HGNC symbol, or Famplex ID).
     db_refs : dict
         Grounding collected from the xrefs, plus the entity text under
         'TEXT'.
     """
     agent_name = entity_term['text']
     db_refs = {}
     for xref in entity_term['xrefs']:
         ns = xref['namespace']
         if ns == 'uniprot':
             up_id = xref['id']
             db_refs['UP'] = up_id
             # The official gene name from UniProt becomes the agent name
             gene_name = up_client.get_gene_name(up_id)
             if gene_name is not None:
                 agent_name = gene_name
                 # For human proteins, the gene name may correspond to an
                 # HGNC ID, which is then added as well
                 if up_client.is_human(up_id):
                     hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                     if hgnc_id:
                         db_refs['HGNC'] = hgnc_id
         elif ns == 'hgnc':
             hgnc_id = xref['id']
             db_refs['HGNC'] = hgnc_id
             # The standard gene symbol becomes the agent name
             hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
             if hgnc_name:
                 agent_name = hgnc_name
             # Add the corresponding Uniprot ID if there is one
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 db_refs['UP'] = up_id
         elif ns in ('pfam', 'interpro'):
             # Both are stored under a short key and may additionally map
             # to a Famplex entry, which then provides the agent name
             refs_key = 'PF' if ns == 'pfam' else 'IP'
             be_id = famplex_map.get((refs_key, xref['id']))
             if be_id:
                 db_refs['FPLX'] = be_id
                 agent_name = be_id
             db_refs[refs_key] = xref['id']
         elif ns in ('chebi', 'pubchem', 'mesh', 'hmdb'):
             # Stored as is, under the upper-cased namespace
             db_refs[ns.upper()] = xref['id']
         elif ns == 'go':
             go_id = xref['id']
             # Handle secondary to primary mapping if necessary
             primary_id = go_client.get_primary_id(go_id)
             db_refs['GO'] = primary_id if primary_id else go_id
         elif ns == 'simple_chemical':
             # Only HMDB identifiers are kept from this namespace
             if xref['id'].startswith('HMDB'):
                 db_refs['HMDB'] = xref['id']
         elif ns == 'be':
             db_refs['FPLX'] = xref['id']
             agent_name = db_refs['FPLX']
         elif ns in ['uaz']:
             # These name spaces are ignored
             continue
         else:
             logger.warning('Unhandled xref namespace: %s' % ns)
     db_refs['TEXT'] = entity_term['text']
     return agent_name, db_refs
Esempio n. 19
0
def get_agent(node_data, node_modifier_data=None):
    """Build an Agent from a node data dictionary, or return None.

    Parameters
    ----------
    node_data : dict
        Node data keyed by the `pc` constants. It must contain
        `pc.FUNCTION`, and `pc.NAMESPACE` for all non-complex nodes;
        `pc.NAME`, `pc.IDENTIFIER`, `pc.MEMBERS` and `pc.FUSION` are
        optional.
    node_modifier_data : optional
        Modifier data for the node. It is only passed on to
        `_get_activity_condition`, `_get_translocation_target` and
        `_has_unhandled_modifiers`.

    Returns
    -------
    Agent or None
        None is returned if the node function is not handled, the node is
        a fusion, a complex has no members or has a member that cannot be
        converted, no identifier information could be obtained, or the
        node has unhandled modifiers.

    Raises
    ------
    ValueError
        If the node has an identifier in the MGI or RGD namespace.
    AssertionError
        If the node has neither an identifier nor a name, or if no gene
        name is found for a UP identifier.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex; these are not handled and yield None
        members = node_data.get(pc.MEMBERS)
        if members is None:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions (members are converted without any
        # modifier data)
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any([bc.agent is None for bc in bound_conditions]):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    # (db_refs stays None if the namespace is not handled, which makes the
    # function return None further below)
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                # From here on, name is the HGNC name, not the Entrez ID
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            print("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            # Only human proteins are mapped to HGNC via the gene name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            print("Unhandled namespace with identifier: %s: %s (%s)" %
                  (ns, name, node_data))
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    # Get the translocation target, which is used as the agent's location
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
Esempio n. 20
0
def analyze(filename):
    """Compare extracted Complex statements against BioGRID interactions.

    The statements loaded from the given file are grounding-mapped and
    de-duplicated, then the Complexes whose members are all grounded to HGNC
    are compared with the Complexes BioGRID returns for the same genes.
    As a side effect, the Complexes that have no BioGRID support are written
    to the tab-separated file 'unmatched_complexes.tsv'.

    Parameters
    ----------
    filename : str
        Path passed on to load_file; the loaded object is expected to map
        paper identifiers to lists of statements.

    Returns
    -------
    dict
        A dict with the keys 'indra_only', 'bg_only' and 'indra_and_bg',
        each holding the list of unique statements whose evidence comes
        only from 'reach', only from 'biogrid', or from both.
    """
    results = load_file(filename)

    all_stmts = [stmt for paper_stmts in results.values()
                 for stmt in paper_stmts]

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()

    # Get complexes
    complexes = [s for s in pa.unique_stmts if isinstance(s, Complex)]
    # Keep only the complexes in which every agent has an HGNC grounding
    protein_complexes = [s for s in complexes
                         if all('HGNC' in ag.db_refs
                                for ag in s.agent_list())]

    logger.info('Mapping gene IDs to gene symbols')
    gene_ids = {ag.db_refs['HGNC'] for stmt in protein_complexes
                for ag in stmt.members}
    genes = [hgnc_client.get_hgnc_name(gene_id) for gene_id in gene_ids]

    # Get complexes from BioGrid in batches of limited size
    num_genes_per_query = 50
    bg_complexes = []
    for start in range(0, len(genes), num_genes_per_query):
        # Slicing past the end of the list is safe, so the last batch is
        # simply shorter than the others
        batch = genes[start:start + num_genes_per_query]
        logger.info("Querying biogrid for %s" % str(batch))
        bg_complexes += bg.get_statements(batch)

    # Filter out Biogrid statements not involving genes in the gene list
    # (this will make duplicate removal more efficient). A set makes each
    # membership test constant time instead of a scan of the gene list.
    gene_names = set(genes)
    bg_filt = [stmt for stmt in bg_complexes
               if stmt.members[0].name in gene_names
               and stmt.members[1].name in gene_names]
    # Might as well free up some memory
    del bg_complexes

    logger.info("Combining duplicates with biogrid...")
    pa = Preassembler(hierarchies, bg_filt + protein_complexes)
    pa.combine_duplicates()

    indra_only = []
    bg_only = []
    indra_and_bg = []
    for stmt in pa.unique_stmts:
        sources = {ev.source_api for ev in stmt.evidence}
        from_reach = 'reach' in sources
        from_biogrid = 'biogrid' in sources
        if from_reach and from_biogrid:
            indra_and_bg.append(stmt)
        elif from_reach:
            indra_only.append(stmt)
        elif from_biogrid:
            bg_only.append(stmt)
        # Statements supported by neither source are not reported

    rows = [[stmt.members[0].name, stmt.members[1].name,
             str(len(stmt.evidence))]
            for stmt in indra_only]
    write_unicode_csv('unmatched_complexes.tsv', rows, delimiter='\t')

    return {'indra_only': indra_only,
            'bg_only': bg_only,
            'indra_and_bg': indra_and_bg}
Esempio n. 21
0
    def _get_agent_from_entity(self, entity_id):
        """Return an Agent built from the entity frame with the given ID.

        Parameters
        ----------
        entity_id : str
            The frame ID to look up among the entity frames of the tree.

        Returns
        -------
        Agent or None
            An Agent carrying the groundings, modification conditions and
            mutations found on the entity frame. None is returned if the
            query gives no result or no frame matches the given ID.
        """
        qstr = "$.entities.frames[(@.frame_id is \'%s\')]" % entity_id
        res = self.tree.execute(qstr)
        if res is None:
            return None
        try:
            entity_term = next(res)
        except StopIteration:
            logger.debug(' %s is not an entity' % entity_id)
            return None
        # This is the default name, which can be overwritten
        # below for specific database entries
        agent_name = self._get_valid_name(entity_term['text'])
        db_refs = {}
        for xr in entity_term['xrefs']:
            ns = xr['namespace']
            if ns == 'uniprot':
                up_id = xr['id']
                db_refs['UP'] = up_id
                # Look up official names in UniProt
                gene_name = up_client.get_gene_name(up_id)
                if gene_name is not None:
                    agent_name = self._get_valid_name(gene_name)
                    # If the gene name corresponds to an HGNC ID, add it to the
                    # db_refs
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif ns == 'hgnc':
                hgnc_id = xr['id']
                db_refs['HGNC'] = hgnc_id
                # Look up the standard gene symbol and set as name
                hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
                if hgnc_name:
                    agent_name = hgnc_name
                # Look up the corresponding uniprot id
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
            elif ns == 'pfam':
                be_id = bioentities_map.get(('PF', xr['id']))
                if be_id:
                    db_refs['BE'] = be_id
                db_refs['PF'] = xr['id']
            elif ns == 'interpro':
                be_id = bioentities_map.get(('IP', xr['id']))
                if be_id:
                    db_refs['BE'] = be_id
                # InterPro IDs go under 'IP', the same key used for the
                # family lookup above; storing them under 'PF' would
                # mislabel them as (or overwrite) a Pfam grounding
                db_refs['IP'] = xr['id']
            elif ns == 'chebi':
                db_refs['CHEBI'] = xr['id']
            elif ns == 'pubchem':
                db_refs['PUBCHEM'] = 'PUBCHEM:%s' % xr['id']
            elif ns == 'go':
                db_refs['GO'] = xr['id']
            elif ns == 'mesh':
                db_refs['MESH'] = xr['id']
            elif ns == 'hmdb':
                db_refs['HMDB'] = xr['id']
            elif ns == 'simple_chemical':
                # Only HMDB identifiers are picked up from this name space
                if xr['id'].startswith('HMDB'):
                    db_refs['HMDB'] = xr['id']
            elif ns == 'be':
                db_refs['BE'] = xr['id']
            # These name spaces are ignored
            elif ns in ['uaz']:
                pass
            else:
                logger.warning('Unhandled xref namespace: %s' % ns)
        db_refs['TEXT'] = entity_term['text']

        mod_terms = entity_term.get('modifications')
        mods = []
        muts = []
        if mod_terms is not None:
            for m in mod_terms:
                if m['type'].lower() == 'mutation':
                    # Evidence is usually something like "V600E"
                    # We could parse this to get the amino acid
                    # change that happened.
                    mutation_str = m.get('evidence')
                    # TODO: sometimes mutation_str is "mutant", "Mutant",
                    # "mutants" - this indicates that there is a mutation
                    # but not the specific type. We should encode this
                    # somehow as a "blank" mutation condition
                    mut = self._parse_mutation(mutation_str)
                    if mut is not None:
                        muts.append(mut)
                else:
                    mc = self._get_mod_condition(m)
                    if mc is not None:
                        mods.append(mc)

        agent = Agent(agent_name, db_refs=db_refs, mods=mods, mutations=muts)
        return agent
Esempio n. 22
0
def _fix_agent(agent):
    if agent is None:
        return
    # First we fix some name spaces
    db_refs_tmp = copy(agent.db_refs)
    for db_ns, db_id in agent.db_refs.items():
        # Change FA name space
        if db_ns == 'FA':
            db_refs_tmp.pop('FA', None)
            db_refs_tmp['NXPFA'] = db_id
        # Change IPR name space
        elif db_ns == 'IPR':
            db_refs_tmp.pop('IPR', None)
            db_refs_tmp['IP'] = db_id
        # Change XFAM name space
        elif db_ns == 'XFAM':
            db_refs_tmp.pop('XFAM', None)
            db_refs_tmp['PF'] = db_id.split('.')[0]
        elif db_ns == 'GO':
            if db_id.startswith('GO:'):
                db_refs_tmp['GO'] = db_id
            else:
                db_refs_tmp['GO'] = 'GO:' + db_id
        # Change PCID name space
        elif db_ns == 'PCID':
            db_refs_tmp.pop('PCID', None)
            db_refs_tmp['PUBCHEM'] = db_id
    agent.db_refs = db_refs_tmp
    # Check if we have a FPLX entry and handle old BE mappings
    if 'BE' in agent.db_refs:
        agent.db_refs['FPLX'] = agent.db_refs.pop('BE')
    be_id = agent.db_refs.get('FPLX')
    # Try to map to FPLX from NXP, IPR, PF, NCIT
    if not be_id:
        for db_ns, db_id in agent.db_refs.items():
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                break
    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]
    # Check what entries we have
    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')
    # FPLX takes precedence if we have it
    if be_id:
        agent.db_refs['FPLX'] = be_id
        agent.name = be_id
    elif hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
    elif up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
        # If it doesn't have a gene name, it's better to just
        # use the raw string name otherwise Sparser sets
        # has Uniprot IDs or mnemonics as the name
        else:
            name = agent.db_refs.get('TEXT', agent.name)
            agent.name = name
Esempio n. 23
0
    # Load the statements to process from the input file given in the
    # arguments
    statement_list = ac.load_statements(args.input_file)

    # Make a dictionary mapping the raw text mention to db_refs
    logger.info('Extracting grounding information')
    text_to_refs = {}
    # Number of statements processed so far and the last whole percentage
    # that was logged, used to report progress once per percent
    counter = 0
    percent_done = 0
    start_time = time.time()
    for statement in statement_list:
        # NOTE(review): assumes agent_list() yields no None entries,
        # since a.db_refs is accessed unconditionally -- confirm
        for a in statement.agent_list():
            # Copy the db_refs so that removing TEXT below does not change
            # the Agent itself
            db_refs = copy.copy(a.db_refs)
            text = db_refs.pop('TEXT', None)

            # Convert HGNC ids to names
            if 'HGNC' in db_refs and string_is_integer(db_refs['HGNC']):
                db_refs['HGNC'] = get_hgnc_name(db_refs['HGNC'])

            # Only keep entries that have some grounding besides the text;
            # a later Agent with the same text overwrites an earlier one,
            # and Agents without a TEXT entry all share the key None
            if len(db_refs.keys()) > 0:
                text_to_refs[text] = db_refs
        counter = counter + 1

        # Percentage of statements processed, rounded down
        progress = math.floor(100.0 * float(counter)
                              / float(len(statement_list)))
        # Log only when the rounded-down percentage has increased
        if progress > percent_done:
            percent_done = progress
            ellapsed_min = (time.time()-start_time) / 60.0
            logger.info(('%d%% done with processing statements '
                         '(%f minutes elapsed)')
                        % (percent_done, ellapsed_min))
    logger.info('\tDone!')
Esempio n. 24
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            #logger.warning('Skipping collection Agent.')
            return None

        # Find the name, uid and raw-text tags first and get their text
        # content if available
        uid_tag = ref.find("var/[@name='uid']")
        name_tag = ref.find("var/[@name='name']")
        text_tag = ref.find("var/[@name='raw-text']")
        if name_tag is not None and name_tag.text:
            name = name_tag.text
        else:
            name = None
        if uid_tag is not None and uid_tag.text:
            uid = uid_tag.text
        else:
            uid = None
        if text_tag is not None and text_tag.text:
            raw_text = text_tag.text
        else:
            raw_text = None

        # TODO: factor this out and reuse fix_agents
        db_refs = {}
        # Save raw text if available
        if raw_text:
            db_refs['TEXT'] = raw_text
        agent_name = raw_text
        # If we have a proper UID then we try to reconstruct an Agent from that
        if uid is not None and len(uid.split(':')) == 2:
            db_ns, db_id = uid.split(':')
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                db_refs[db_ns] = db_id
                db_refs['FPLX'] = be_id
                agent_name = be_id
            elif db_ns in ['UP', 'Uniprot']:
                db_refs['UP'] = db_id
                gene_name = uniprot_client.get_gene_name(db_id)
                if gene_name:
                    agent_name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif db_ns == 'NCIT':
                db_refs['NCIT'] = db_id
                target = ncit_map.get(db_id)
                if target:
                    db_refs[target[0]] = target[1]
                    if target[0] == 'HGNC':
                        up_id = hgnc_client.get_uniprot_id(target[1])
                        agent_name = hgnc_client.get_hgnc_name(target[1])
                        if up_id:
                            db_refs['UP'] = up_id
                    elif target[0] == 'UP':
                        agent_name = uniprot_client.get_gene_name(target[1])
                        if agent_name:
                            hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                            if hgnc_id:
                                db_refs['HGNC'] = hgnc_id
            elif db_ns == 'FA':
                db_refs['NXP'] = 'FA:' + db_id
            elif db_ns == 'XFAM':
                db_refs['PF'] = db_id.split('.')[0]
            elif db_ns == 'CHEBI':
                db_refs['CHEBI'] = 'CHEBI:' + db_id
            elif db_ns in ['GO', 'MESH', 'FPLX']:
                db_refs[db_ns] = db_id
            # Handle old BE mappings and add them as FPLX
            elif db_ns == 'BE':
                db_refs['FPLX'] = db_id
            elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
                db_refs[db_ns] = db_id
            else:
                logger.warning('Unknown database name space %s' % db_ns)
        if not agent_name:
            if raw_text is not None:
                agent_name = raw_text
            else:
                return None

        assert(agent_name)

        agent = Agent(agent_name, db_refs=db_refs)
        return agent
Esempio n. 25
0
def test_get_hgnc_name_nonexistent():
    """Looking up the name of an HGNC ID with no entry gives None."""
    missing_id = '123456'
    name = hgnc_client.get_hgnc_name(missing_id)
    assert name is None
    assert unicode_strs(name)
Esempio n. 26
0
def test_get_hgnc_name():
    """HGNC ID 3236 is expected to resolve to the symbol EGFR."""
    egfr_id = '3236'
    symbol = hgnc_client.get_hgnc_name(egfr_id)
    assert symbol == 'EGFR'
Esempio n. 27
0
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        A dict with the keys 'entityText', 'entityId' (a list of ID dicts
        or None), 'charStart' and 'charEnd'. Each ID dict has the keys
        'source' and 'idString'; 'entityType' is also read for entries
        whose source is 'Unk'.

    Returns
    -------
    agent : Agent or None
        The Agent built from the entity, or None if the entity has more
        than one Entrez or more than one UniProt entry.
    raw_coords : tuple or None
        The (charStart, charEnd) pair of the entity, or None if no Agent
        was built.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}
    entries = entity_info['entityId']
    if entries is None:
        entries = []
    # Entities with ambiguous Entrez or UniProt grounding are skipped
    ref_counts = Counter(entry['source'] for entry in entries)
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entries:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            hgnc_id = uniprot_client.get_hgnc_id(id_dict['idString'])
            if hgnc_id:
                # Check to see if we have a conflict with an HGNC id
                # found from the Entrez id. If so, overwrite with this
                # one, in which we have greater faith.
                if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                    # hgnc_id is the one inferred from UniProt here, while
                    # refs['HGNC'] still holds the one set from Entrez
                    msg = ('Inferred HGNC:%s from UP:%s does not'
                           ' match HGNC:%s from EGID:%s') % \
                          (hgnc_id, refs['UP'], refs['HGNC'],
                           refs['EGID'])
                    logger.info(msg)
                refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(id_dict['idString'])
                if gene_name is not None:
                    name = gene_name
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                # Best effort: a mutation that cannot be processed is
                # logged and left out rather than failing the whole entity
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
Esempio n. 28
0
def test_get_hgnc_name():
    """HGNC ID 3236 is expected to resolve to the unicode symbol EGFR."""
    egfr_id = '3236'
    symbol = hgnc_client.get_hgnc_name(egfr_id)
    assert symbol == 'EGFR'
    assert unicode_strs(symbol)
Esempio n. 29
0
def _urn_to_db_refs(urn):
    """Converts a Medscan URN to an INDRA db_refs dictionary with grounding
    information.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict
        A dictionary with grounding information, mapping databases to database
        identifiers. If the Medscan URN is not recognized, returns an empty
        dictionary.
    db_name : str
        The Famplex name, if available; otherwise the HGNC name if available;
        otherwise None
    """
    # Convert a urn to a db_refs dictionary
    if urn is None:
        return {}, None

    m = URN_PATT.match(urn)
    if m is None:
        return None, None

    urn_type, urn_id = m.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id

            # Convert the HGNC ID to a Uniprot ID
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id

            # Try to lookup HGNC name; if it's available, set it to the
            # agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype']:
        if urn_id.startswith('C') and urn_id[1:].isdigit():
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is MESH
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type == 'agi-gocomplex':
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id
    elif urn_type == 'agi-go':
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # If we have a GO or MESH grounding, see if there is a corresponding
    # Famplex grounding
    db_sometimes_maps_to_famplex = ['GO', 'MESH']
    for db in db_sometimes_maps_to_famplex:
        if db in db_refs:
            key = (db, db_refs[db])
            if key in famplex_map:
                db_refs['FPLX'] = famplex_map[key]

    # If the urn corresponds to an eccode, groudn to famplex if that eccode
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        tokens = urn.split(':')
        eccode = tokens[2]
        key = ('ECCODE', eccode)
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding
    key = ('MEDSCAN', urn)
    if key in famplex_map:
        db_refs['FPLX'] = famplex_map[key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name
Esempio n. 30
0
 def _make_agent(self, hprd_id, refseq_id=None):
     if hprd_id is None or hprd_id is nan:
         return None
     # Get the basic info (HGNC name/symbol, Entrez ID) from the
     # ID mappings dataframe
     try:
         egid = self.id_df.loc[hprd_id].EGID
     except KeyError:
         logger.info('HPRD ID %s not found in mappings table.' % hprd_id)
         return None
     if not egid:
         logger.info('No Entrez ID for HPRD ID %s' % hprd_id)
         return None
     # Get the HGNC ID
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     # If we couldn't get an HGNC ID for the Entrez ID, this means that
     # the Entrez ID has been discontinued or replaced.
     if not hgnc_id:
         self.no_hgnc_for_egid.append(egid)
         return None
     # Get the (possibly updated) HGNC Symbol
     hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
     assert hgnc_name is not None
     # See if we can get a Uniprot ID from the HGNC symbol--if there is
     # a RefSeq ID we wil also try to use it to get an isoform specific
     # UP ID, but we will have this one to fall back on. But if we can't
     # get one here, then we skip the Statement
     up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
     if not up_id_from_hgnc:
         self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id))
         return None
     # If we have provided the RefSeq ID, it's because we need to make
     # sure that we are getting the right isoform-specific ID (for sequence
     # positions of PTMs). Here we try to get the Uniprot ID from the
     # Refseq->UP mappings in the protmapper.uniprot_client.
     if refseq_id is not None:
         # Get the Uniprot IDs from the uniprot client
         up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
                                                     reviewed_only=True)
         # Nothing for this RefSeq ID (quite likely because the RefSeq ID
         # is obsolete; take the UP ID from HGNC
         if len(up_ids) == 0:
             self.no_up_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # More than one reviewed entry--no thanks, we'll take the one from
         # HGNC instead
         elif len(up_ids) > 1:
             self.many_ups_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # We got a unique, reviewed UP entry for the RefSeq ID
         else:
             up_id = up_ids[0]
             # If it's the canonical isoform, strip off the '-1'
             if up_id.endswith('-1'):
                 up_id = up_id.split('-')[0]
     # For completeness, get the Refseq ID from the HPRD ID table
     else:
         refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
         up_id = up_id_from_hgnc
     # Make db_refs, return Agent
     db_refs = {'HGNC': hgnc_id, 'UP': up_id, 'EGID': egid,
                'REFSEQ_PROT': refseq_id}
     return Agent(hgnc_name, db_refs=db_refs)
Esempio n. 31
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. This is the name given
        as input if no standard name was looked up.
    db_refs : dict or None
        The grounding for the given entity, or None if the name space is
        not handled or the name could not be grounded in it.

    """
    # In these name spaces the name itself is used as the identifier.
    # For MGI/RGD this makes it clear what name space the name belongs to.
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    if ns in ('FPLX', 'MGI', 'RGD', 'SDIS', 'SCHEM', 'TEXT'):
        return name, {ns: name}

    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        if isinstance(hgnc_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s'
                        % (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        grounding = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            grounding['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            grounding['MIRBASE'] = mirbase_id
        return name, grounding

    if ns in ('UNIPROT', 'UP'):
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        # and check next if it's a mnemonic itself
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        else:
            up_id = uniprot_client.get_id_from_mnemonic(name)
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        grounding = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            grounding['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
        return name, grounding

    if ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        return go_client.get_go_label(go_id), {'GO': go_id}

    if ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        return mesh_name, {'MESH': mesh_id}

    # Map Selventa families to FamPlexes
    if ns == 'SFAM':
        grounding = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            grounding['FPLX'] = indra_name
            name = indra_name
        return name, grounding

    # Map Entrez genes to HGNC/UP
    if ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        grounding = {'EGID': name}
        if hgnc_id is None:
            logger.info('Could not map EGID%s to HGNC.' % name)
            return 'E%s' % name, grounding
        grounding['HGNC'] = hgnc_id
        name = hgnc_client.get_hgnc_name(hgnc_id)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            grounding['UP'] = up_id
        else:
            logger.info('HGNC entity %s with HGNC ID %s has no '
                        'corresponding Uniprot ID.',
                        name, hgnc_id)
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            grounding['MIRBASE'] = mirbase_id
        return name, grounding

    if ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        grounding = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            grounding['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        return name, grounding

    if ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if not chebi_id:
            logger.info('CHEBI name %s not found in map.' % name)
            return name, None
        return name, {'CHEBI': chebi_id}

    if ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if not tid:
            logger.info('Could not get taxonomy ID for %s' % name)
            return name, None
        return name, {'TAXONOMY': tid}

    logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                      node_data))
    return name, None