Beispiel #1
0
 def _get_db_refs(bpe):
     """Collect the database references available for a BioPAX element.

     For a protein, the HGNC and UniProt IDs are looked up and, when only
     one of them is available, the other is derived from it. For a small
     molecule only the ChEBI ID is looked up. For any other element the
     ChEBI, HGNC and UniProt IDs are all looked up.

     Parameters
     ----------
     bpe
         The BioPAX element to collect references for.

     Returns
     -------
     dict
         A dict holding any of the keys 'HGNC', 'UP' and 'CHEBI' for which
         the corresponding lookup produced a value other than None.
     """
     refs = {}
     if _is_protein(bpe):
         hgnc = BiopaxProcessor._get_hgnc_id(bpe)
         up = BiopaxProcessor._get_uniprot_id(bpe)
         if hgnc and not up:
             # Only the HGNC ID is known: map it to a UniProt ID
             up = hgnc_client.get_uniprot_id(hgnc)
         elif up and not hgnc and uniprot_client.is_human(up):
             # Only the UniProt ID is known: go through the gene name,
             # which is attempted for human entries only
             gene_name = uniprot_client.get_gene_name(up, False)
             if gene_name:
                 hgnc = hgnc_client.get_hgnc_id(gene_name)
         for key, value in (('HGNC', hgnc), ('UP', up)):
             if value is not None:
                 refs[key] = value
         return refs

     small_molecule = _is_small_molecule(bpe)
     lookups = [('CHEBI', BiopaxProcessor._get_chebi_id)]
     if not small_molecule:
         # Elements of unknown kind get every grounding we can find
         lookups.append(('HGNC', BiopaxProcessor._get_hgnc_id))
         lookups.append(('UP', BiopaxProcessor._get_uniprot_id))
     for key, lookup in lookups:
         value = lookup(bpe)
         if value is not None:
             refs[key] = value
     return refs
Beispiel #2
0
def filter_human_only(stmts_in, **kwargs):
    """Filter out statements that are not grounded to human genes.

    A statement is removed if any of its agents (ignoring None agents) has
    a 'UP' entry in its db_refs that uniprot_client.is_human rejects.
    Agents without a 'UP' entry never cause a statement to be removed.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.

    """
    def _is_non_human(agent):
        # True only for an agent carrying a UniProt ID that is not human
        if agent is None:
            return False
        up_id = agent.db_refs.get('UP')
        return bool(up_id) and not uniprot_client.is_human(up_id)

    pkl_name = kwargs.get('save')
    logger.info('Filtering %d statements for human genes only...' %
                len(stmts_in))
    stmts_out = [stmt for stmt in stmts_in
                 if not any(_is_non_human(ag) for ag in stmt.agent_list())]
    logger.info('%d statements after filter...' % len(stmts_out))
    if pkl_name:
        dump_statements(stmts_out, pkl_name)
    return stmts_out
Beispiel #3
0
def read_phosphosite(fname):
    """Read a phosphosite CSV table into Phosphorylation statements.

    Each row is expected to provide the columns 'SUB_ID', 'SUB_GENE',
    'Actual_site', 'phosphosite', 'KIN_ID' and 'KINASE_GENE_SYMBOL'.
    The substrate and the kinase are each grounded from their UniProt ID
    column when it is set, and from their gene symbol column otherwise.

    Parameters
    ----------
    fname : str
        Path (or anything else accepted by pandas.read_csv) of the CSV
        table to read.

    Returns
    -------
    statements : list
        One Phosphorylation statement per row whose kinase was not
        rejected as non-human.
    antibody_map : dict
        Maps each value of the 'phosphosite' column to a list of substrate
        Agents carrying the phosphorylation as a ModCondition, without
        duplicates in (name, residue, position). Rows with a non-human
        kinase still contribute to this map.
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}
    for _, row in df.iterrows():
        # Ground the substrate. Both branches must define the same three
        # names (symbol, HGNC ID, UniProt ID) used to build the Agent.
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc})

        # The site is a residue letter optionally followed by a position
        site = row['Actual_site']
        residue = site[0]
        position = site[1:] if len(site) > 1 else None

        # Register the modified substrate as a readout of this phosphosite
        sub_readout = deepcopy(sub)
        sub_readout.mods = [ModCondition('phosphorylation', residue,
                                         position)]
        readouts = antibody_map.setdefault(row['phosphosite'], [])
        already_known = any(p.name == sub.name and
                            p.mods[0].residue == residue and
                            p.mods[0].position == position
                            for p in readouts)
        if not already_known:
            readouts.append(sub_readout)

        # Ground the kinase; rows with a non-human kinase yield no statement
        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc})

        ev = Evidence(source_api='phosphosite')
        statements.append(Phosphorylation(kin, sub, residue, position,
                                          evidence=[ev]))
    return statements, antibody_map
Beispiel #4
0
def _human_only_filter(stmts_in):
    """Return the statements that involve no known non-human protein.

    An agent counts as non-human only if its db_refs has no 'HGNC' entry
    and has a 'UP' entry that uniprot_client.is_human rejects. None
    agents are ignored.
    """
    def _is_non_human(agent):
        refs = agent.db_refs
        # An HGNC grounding is taken as sufficient evidence of a human gene
        if refs.get("HGNC"):
            return False
        up_id = refs.get("UP")
        return bool(up_id) and not uniprot_client.is_human(up_id)

    kept = []
    for stmt in stmts_in:
        present = [ag for ag in stmt.agent_list() if ag is not None]
        if not any(_is_non_human(ag) for ag in present):
            kept.append(stmt)
    return kept
Beispiel #5
0
def test_all_protein_db_refs():
    """Check that few human UniProt IDs in the model lack an HGNC ID."""
    unmapped = set()
    for model_obj in bp.model.getObjects().toArray():
        element = bpc._cast_biopax_element(model_obj)
        if not bpc._is_protein(element):
            continue
        refs = bpc.BiopaxProcessor._get_db_refs(element)
        up_id = refs.get('UP')
        hgnc_id = refs.get('HGNC')
        # Only human entries are expected to map to HGNC
        if up_id and uniprot_client.is_human(up_id) and not hgnc_id:
            unmapped.add(up_id)
    # The number of unmapped entries should not increase
    # so we check for an upper limit here
    assert len(unmapped) < 95
Beispiel #6
0
 def get_grounding(self):
     """Return a (namespace, value) tuple grounding this object.

     The entries of self.db_refs are tried in the order FPLX, HGNC, UP:
     an FPLX entry is returned as is; an HGNC entry is returned as the
     name given by hgnc_client.get_hgnc_name; a human UP entry is returned
     under 'HGNC' with its gene name, and a non-human one under 'UP'.
     When an HGNC or UP entry is a list, its first element is used.

     Returns
     -------
     tuple
         The (namespace, value) pair, or (None, None) if no grounding was
         found, including the case of a human UP entry without gene name.
     """
     import indra.databases.hgnc_client as hgc
     import indra.databases.uniprot_client as upc

     def _first(value):
         # Some entries may hold a list of IDs; only the first one is used
         return value[0] if isinstance(value, list) else value

     fplx_id = self.db_refs.get('FPLX')
     if fplx_id:
         return ('FPLX', fplx_id)

     hgnc_id = self.db_refs.get('HGNC')
     if hgnc_id:
         return ('HGNC', hgc.get_hgnc_name(str(_first(hgnc_id))))

     up_id = self.db_refs.get('UP')
     if not up_id:
         return (None, None)
     up_id = _first(up_id)
     if not upc.is_human(up_id):
         return ('UP', up_id)
     gene_name = upc.get_gene_name(up_id, web_fallback=False)
     if gene_name:
         return ('HGNC', gene_name)
     return (None, None)
Beispiel #7
0
def is_non_human_protein(bio_ontology, node):
    """Return True if the node is in the 'UP' namespace and is not human."""
    if bio_ontology.get_ns(node) != 'UP':
        return False
    # Only UniProt nodes are checked against the UniProt client
    return not uniprot_client.is_human(bio_ontology.get_id(node))
Beispiel #8
0
def test_not_is_human():
    """is_human must be falsy for the UniProt ID P31938."""
    human = uniprot_client.is_human('P31938')
    assert not human
def test_noentry_is_human():
    """is_human must be falsy for the ID 'XXXX'."""
    human = uniprot_client.is_human('XXXX')
    assert not human
def test_is_human():
    """is_human must be truthy for the UniProt ID P00533."""
    human = uniprot_client.is_human('P00533')
    assert human
Beispiel #11
0
def test_is_human():
    """The UniProt ID P00533 must be reported as human."""
    up_id = 'P00533'
    assert uniprot_client.is_human(up_id)
def test_not_is_human():
    """The UniProt ID P31938 must not be reported as human."""
    up_id = 'P31938'
    assert not uniprot_client.is_human(up_id)
Beispiel #13
0
 def _get_db_refs(entity_term):
     """Return the agent name and db_refs for an entity term.

     Parameters
     ----------
     entity_term : dict
         A dict with a 'text' entry and an 'xrefs' entry; each xref is a
         dict with a 'namespace' and an 'id'.

     Returns
     -------
     agent_name : str
         The entity text, replaced by a UniProt gene name, an HGNC name or
         a FamPlex ID when a later xref provides one.
     db_refs : dict
         The groundings collected from the xrefs, always ending with a
         'TEXT' entry holding the original entity text.
     """
     # Namespaces whose ID is copied unchanged into db_refs under this key
     direct_keys = {'chebi': 'CHEBI', 'pubchem': 'PUBCHEM', 'go': 'GO',
                    'mesh': 'MESH', 'hmdb': 'HMDB'}
     # Family namespaces: the key is used both to store the ID and as the
     # first element of the famplex_map lookup key
     family_keys = {'pfam': 'PF', 'interpro': 'IP'}

     name = entity_term['text']
     refs = {}
     for xref in entity_term['xrefs']:
         ns = xref['namespace']
         if ns == 'uniprot':
             up_id = xref['id']
             refs['UP'] = up_id
             # Look up official names in UniProt
             gene_name = up_client.get_gene_name(up_id)
             if gene_name is None:
                 continue
             name = gene_name
             # Only human gene names are looked up in HGNC
             if up_client.is_human(up_id):
                 hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                 if hgnc_id:
                     refs['HGNC'] = hgnc_id
         elif ns == 'hgnc':
             hgnc_id = xref['id']
             refs['HGNC'] = hgnc_id
             # Use the standard gene symbol as the name if there is one
             hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
             if hgnc_name:
                 name = hgnc_name
             # Add the corresponding UniProt ID if there is one
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 refs['UP'] = up_id
         elif ns in family_keys:
             key = family_keys[ns]
             fplx_id = famplex_map.get((key, xref['id']))
             if fplx_id:
                 refs['FPLX'] = fplx_id
                 name = fplx_id
             refs[key] = xref['id']
         elif ns in direct_keys:
             refs[direct_keys[ns]] = xref['id']
         elif ns == 'simple_chemical':
             if xref['id'].startswith('HMDB'):
                 refs['HMDB'] = xref['id']
         elif ns == 'be':
             refs['FPLX'] = xref['id']
             name = refs['FPLX']
         elif ns != 'uaz':
             # 'uaz' is deliberately ignored; anything else is unexpected
             logger.warning('Unhandled xref namespace: %s' % ns)
     refs['TEXT'] = entity_term['text']
     return name, refs
Beispiel #14
0
def _get_agent(node_data, node_modifier_data=None):
    """Return an Agent built from node data, or None if it is not handled.

    Parameters
    ----------
    node_data : dict
        The data of the node, read through the keys defined in `pc`
        (FUNCTION, FUSION, MEMBERS, NAME, NAMESPACE, IDENTIFIER).
    node_modifier_data : Optional[dict]
        Modifier data of the node, used to derive the activity condition
        and the translocation target and to detect unhandled modifiers.

    Returns
    -------
    Agent or None
        The Agent, or None if the node function is not handled, the node
        is a fusion, a complex has no members or has a member that cannot
        be turned into an Agent, no grounding could be determined, or the
        node has unhandled modifiers.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if a UP
        identifier has no gene name.
    ValueError
        If an identifier is given in the MGI or RGD namespace.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = ('No node data' if not node_modifier_data else
                    node_modifier_data.get(pc.CNAME))
        logger.info("Nodes of type %s not handled: %s" % (node_func, mod_data))
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" %
                    str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex. An empty member list is treated the same way
        # as a missing one so that members[0] below cannot fail.
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = _get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [
            BoundCondition(_get_agent(m), True) for m in members[1:]
        ]
        # Check the bound_conditions for any None agents
        if any(bc.agent is None for bc in bound_conditions):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' % (name, hgnc_id))
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace: %s: %s (%s)",
                        ns, name, node_data)
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            # Only human gene names are looked up in HGNC
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace with identifier: %s: %s (%s)",
                        ns, name, node_data)
    # db_refs is still None only if no branch above produced a grounding
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s' %
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name,
               db_refs=db_refs,
               mods=mods,
               mutations=muts,
               activity=ac,
               location=to_loc)
    return ag
Beispiel #15
0
def test_noentry_is_human():
    """The ID 'XXXX' must not be reported as human."""
    missing_id = 'XXXX'
    assert not uniprot_client.is_human(missing_id)
Beispiel #16
0
def get_agent(node_data, node_modifier_data=None):
    """Return an Agent built from node data, or None if it is not handled.

    Parameters
    ----------
    node_data : dict
        The data of the node, read through the keys defined in `pc`
        (FUNCTION, FUSION, MEMBERS, NAME, NAMESPACE, IDENTIFIER).
    node_modifier_data : Optional[dict]
        Modifier data of the node, used to derive the activity condition
        and the translocation target and to detect unhandled modifiers.

    Returns
    -------
    Agent or None
        The Agent, or None if the node function is not handled, the node
        is a fusion, a complex has no members or has a member that cannot
        be turned into an Agent, no grounding could be determined, or the
        node has unhandled modifiers.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if a UP
        identifier has no gene name.
    ValueError
        If an identifier is given in the MGI or RGD namespace.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex. An empty member list is treated the same way
        # as a missing one so that members[0] below cannot fail.
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any(bc.agent is None for bc in bound_conditions):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace: %s: %s (%s)",
                        ns, name, node_data)
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            # Only human gene names are looked up in HGNC
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace with identifier: %s: %s (%s)",
                        ns, name, node_data)
    # db_refs is still None only if no branch above produced a grounding
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
Beispiel #17
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict or None
        The grounding for the given entity, or None if the name could not
        be grounded in the given name space or the name space is not
        handled. Every path returns a (name, db_refs) pair.
    """
    db_refs = None
    if ns == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # The name is first tried as a UniProt ID, then as a mnemonic
        gene_name = uniprot_client.get_gene_name(name)
        if gene_name:
            up_id = name
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
                gene_name = uniprot_client.get_gene_name(up_id)
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        # Only human gene names are looked up in HGNC
        if uniprot_client.is_human(up_id):
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        # NOTE(review): if get_mesh_id_name returns an (id, name) tuple
        # rather than a bare ID, this check is always true and a tuple is
        # stored under 'MESH' -- confirm against mesh_client.
        mesh_id = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID fro %s' % name)
            return name, None
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns in ('MGI', 'RGD'):
        db_refs = {ns: name}
    # Map Selventa families to FamPlexes
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.',
                            name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.info('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            # Return a pair like every other failure path so that callers
            # unpacking the result do not fail on a bare None
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # CHEBI
    elif ns == 'CHEBI':
        # Try the local name map first, then fall back on the client
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM'):
        db_refs = {ns: name}
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs
Beispiel #18
0
def is_non_human_protein(bio_ontology, node):
    """Return True if the given ontology node is a non-human protein.

    Only nodes in the 'UP' namespace are checked against the UniProt
    client; nodes in any other namespace yield False.
    """
    is_uniprot = bio_ontology.get_ns(node) == 'UP'
    return bool(is_uniprot and
                not uniprot_client.is_human(bio_ontology.get_id(node)))
Beispiel #19
0
    def standardize_db_refs(db_refs):
        """Return a standardized db refs dict for a given db refs dict.

        The given dict is updated in place and also returned. A missing
        HGNC ID is derived from a human UP ID and a missing or conflicting
        UP ID from an HGNC ID; missing CHEBI and PUBCHEM IDs are derived
        from PUBCHEM, HMDB and CHEBI IDs. Existing chemical IDs are never
        overwritten; inconsistencies are only logged.

        Parameters
        ----------
        db_refs : dict
            A dict of db refs that may not be standardized, i.e., may be
            missing an available UP ID corresponding to an existing HGNC ID.

        Returns
        -------
        dict
            The db_refs dict with standardized entries.
        """
        def _with_chebi_prefix(raw_id):
            # Mapped IDs may come back without the 'CHEBI:' prefix
            if raw_id and not raw_id.startswith('CHEBI:'):
                return 'CHEBI:%s' % raw_id
            return raw_id

        up = db_refs.get('UP')
        hgnc = db_refs.get('HGNC')
        if up and not hgnc and uniprot_client.is_human(up):
            # UP without HGNC: go through the gene name to find an HGNC ID
            symbol = uniprot_client.get_gene_name(up, False)
            if symbol:
                hgnc = hgnc_client.get_hgnc_id(symbol)
                if hgnc:
                    db_refs['HGNC'] = hgnc
        elif hgnc:
            # HGNC is available: look up the UP ID it maps to
            up_from_hgnc = hgnc_client.get_uniprot_id(hgnc)
            if up_from_hgnc:
                conflict = up and up != up_from_hgnc
                if not conflict:
                    db_refs['UP'] = up_from_hgnc
                elif ', ' not in up_from_hgnc:
                    # A single mapped ID wins over the conflicting original;
                    # a mapped value listing several IDs is skipped and the
                    # original UP ID is kept
                    logger.debug('Inconsistent groundings UP:%s not equal to '
                                 'UP:%s mapped from HGNC:%s, standardizing to '
                                 'UP:%s' %
                                 (up, up_from_hgnc, hgnc, up_from_hgnc))
                    db_refs['UP'] = up_from_hgnc

        # Chemical groundings: collect every mapping that is available
        pubchem = db_refs.get('PUBCHEM')
        chebi = db_refs.get('CHEBI')
        hmdb = db_refs.get('HMDB')
        chebi_from_pubchem = None
        pubchem_from_chebi = None
        chebi_from_hmdb = None
        if pubchem:
            chebi_from_pubchem = _with_chebi_prefix(
                chebi_client.get_chebi_id_from_pubchem(pubchem))
        if chebi:
            pubchem_from_chebi = chebi_client.get_pubchem_id(chebi)
        if hmdb:
            chebi_from_hmdb = _with_chebi_prefix(
                chebi_client.get_chebi_id_from_hmdb(hmdb))

        # Exactly one of the following cases is acted on, in this order
        if pubchem and chebi and pubchem_from_chebi and \
                pubchem != pubchem_from_chebi:
            logger.debug('Inconsistent groundings PUBCHEM:%s not equal to '
                         'PUBCHEM:%s mapped from %s, standardizing to '
                         'PUBCHEM:%s.' %
                         (pubchem, pubchem_from_chebi, chebi, pubchem))
        elif pubchem and chebi and chebi_from_pubchem and \
                chebi != chebi_from_pubchem:
            logger.debug('Inconsistent groundings %s not equal to '
                         '%s mapped from PUBCHEM:%s, standardizing to '
                         '%s.' % (chebi, chebi_from_pubchem, pubchem, chebi))
        elif pubchem and not chebi and chebi_from_pubchem:
            # PUBCHEM without CHEBI: add the mapped CHEBI ID
            db_refs['CHEBI'] = chebi_from_pubchem
        elif hmdb and chebi and chebi_from_hmdb and \
                chebi_from_hmdb != chebi:
            logger.debug('Inconsistent groundings %s not equal to '
                         '%s mapped from %s, standardizing to '
                         '%s.' % (chebi, chebi_from_hmdb, hmdb, chebi))
        elif hmdb and not chebi and chebi_from_hmdb:
            # HMDB without CHEBI: add the mapped CHEBI ID
            db_refs['CHEBI'] = chebi_from_hmdb
        elif chebi and not pubchem and pubchem_from_chebi:
            # CHEBI without PUBCHEM: add the mapped PUBCHEM ID
            db_refs['PUBCHEM'] = pubchem_from_chebi
        return db_refs