Beispiel #1
0
 def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
     """Reconcile the UP and HGNC entries of a grounding and assign it.

     The 'HGNC' entry of ``map_db_refs`` is read as a gene symbol and, when
     the lookup succeeds, overwritten in place with the HGNC ID. The
     (possibly modified) dict is then set as ``agent.db_refs``.

     Parameters
     ----------
     agent : object
         Agent whose ``db_refs`` (and optionally ``name``) are set.
     map_db_refs : dict
         Grounding entry; it is mutated in place.
     do_rename : bool
         If True, the agent name is set from the FPLX entry if present,
         otherwise from the HGNC symbol given in the map, otherwise from
         the gene name looked up from the UniProt ID.

     Raises
     ------
     ValueError
         If only a symbol is given and it has no HGNC ID, or if both a
         UniProt ID and a symbol are given and either no gene name is found
         for the UniProt ID or the gene name differs from the symbol.
     """
     uniprot_id = map_db_refs.get('UP')
     symbol = map_db_refs.get('HGNC')
     gene_name = None

     if uniprot_id and symbol:
         # Both given: the symbol must agree with what UniProt reports
         gene_name = uniprot_client.get_gene_name(uniprot_id)
         if not gene_name:
             raise ValueError('No gene name found for Uniprot ID %s '
                              '(expected %s)' % (uniprot_id, symbol))
         if gene_name != symbol:
             raise ValueError('Gene name %s for Uniprot ID %s does not '
                              'match HGNC symbol %s given in grounding map.'
                              % (gene_name, uniprot_id, symbol))
         hgnc_id = hgnc_client.get_hgnc_id(symbol)
         if hgnc_id:
             map_db_refs['HGNC'] = hgnc_id
         else:
             # Unlike the symbol-only case, this is logged rather than raised
             logger.error('No HGNC ID corresponding to gene symbol %s in '
                          'grounding map.' % symbol)
     elif uniprot_id:
         # Only UniProt given: derive the HGNC ID through the gene name
         gene_name = uniprot_client.get_gene_name(uniprot_id, False)
         if gene_name:
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 map_db_refs['HGNC'] = hgnc_id
     elif symbol:
         # Only a symbol given: swap it for the HGNC ID, then add UniProt
         hgnc_id = hgnc_client.get_hgnc_id(symbol)
         if not hgnc_id:
             raise ValueError('No HGNC ID corresponding to gene symbol %s '
                              'in grounding map.' % symbol)
         map_db_refs['HGNC'] = hgnc_id
         mapped_up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if mapped_up_id:
             map_db_refs['UP'] = mapped_up_id

     # The agent takes over the grounding-map entry itself (no copy)
     agent.db_refs = map_db_refs
     if not do_rename:
         return

     # Name priority: FamPlex ID, then the given symbol, then the gene name
     fplx_id = agent.db_refs.get('FPLX')
     if fplx_id:
         agent.name = fplx_id
     elif symbol is not None:
         agent.name = symbol
     elif gene_name is not None:
         agent.name = gene_name
def test_query_protein_deprecated():
    """Query accession Q8NHX1 and expect the gene name MAPK3.

    The gene name is checked once with the default arguments and once with
    ``web_fallback=False``.
    """
    accession = 'Q8NHX1'
    assert uniprot_client.query_protein(accession) is not None

    symbol = uniprot_client.get_gene_name(accession)
    assert symbol == 'MAPK3'
    assert unicode_strs(symbol)

    symbol = uniprot_client.get_gene_name(accession, web_fallback=False)
    assert symbol == 'MAPK3'
    assert unicode_strs(symbol)
def test_query_protein_deprecated():
    """Check that Q8NHX1 can be queried and maps to gene name MAPK3.

    The gene-name lookup is exercised both with default arguments and with
    the web fallback explicitly disabled.
    """
    up_id = 'Q8NHX1'
    entry = uniprot_client.query_protein(up_id)
    assert entry is not None
    # First pass: default arguments; second pass: no web fallback
    for lookup_kwargs in ({}, {'web_fallback': False}):
        name = uniprot_client.get_gene_name(up_id, **lookup_kwargs)
        assert name == 'MAPK3'
        assert unicode_strs(name)
Beispiel #4
0
def read_phosphosite(fname):
    """Build Phosphorylation statements and an antibody map from a CSV file.

    The columns read from each row are 'SUB_ID', 'SUB_GENE', 'Actual_site',
    'phosphosite', 'KIN_ID' and 'KINASE_GENE_SYMBOL'.

    Parameters
    ----------
    fname : str
        Path (or anything accepted by ``pandas.read_csv``) of the table.

    Returns
    -------
    statements : list
        One Phosphorylation statement per row whose kinase is not reported
        as non-human.
    antibody_map : dict
        Maps each 'phosphosite' value to the list of distinct phosphorylated
        substrate Agents (distinct by name, residue and position).
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}
    for _, row in df.iterrows():
        # Substrate: prefer the UniProt ID, fall back to the gene symbol.
        # Both branches now fill the same sub_hgnc_id variable; previously
        # the fallback branch left the name used below unset (or stale from
        # an earlier row).
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc_id)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc_id})

        # 'Actual_site' is a residue letter optionally followed by a position
        site = row['Actual_site']
        residue = site[0]
        position = site[1:] if len(site) > 1 else None

        # Register the modified substrate as a readout of this phosphosite
        # unless an equivalent one is already recorded
        sub_readout = deepcopy(sub)
        sub_readout.mods = [ModCondition('phosphorylation', residue,
                                         position)]
        readouts = antibody_map.setdefault(row['phosphosite'], [])
        already_known = any(
            p.name == sub.name and p.mods[0].residue == residue and
            p.mods[0].position == position
            for p in readouts)
        if not already_known:
            readouts.append(sub_readout)

        # Kinase: same lookup strategy as for the substrate
        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                # The readout above is kept; only the statement is skipped
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc_id = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            kin_hgnc_id = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc_id)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc_id})

        ev = Evidence(source_api='phosphosite')
        statements.append(
            Phosphorylation(kin, sub, residue, position, evidence=[ev]))
    return statements, antibody_map
Beispiel #5
0
def get_all_gene_names(data):
    """Return the sorted list of valid gene names found in antibody data.

    Parameters
    ----------
    data : dict
        Must provide ``data['antibody']['Gene Name']`` and
        ``data['antibody']['UniProt ID']`` as parallel iterables of strings;
        each string may hold several comma-separated entries.

    Returns
    -------
    list of str
        Union of the given gene names and the gene names looked up from the
        UniProt IDs, minus given names for which no HGNC ID was found.
        Diagnostics about invalid and inconsistent entries are printed.
    """
    gene_names = data['antibody']['Gene Name']
    uniprot_ids = data['antibody']['UniProt ID']
    all_genes = set()
    invalid_genes = set()
    for gn, upid in zip(gene_names, uniprot_ids):
        # Some entries are lists of genes separated by commas
        # and we also strip off extra spaces
        names = [x.strip() for x in gn.split(',')]
        ids = [x.strip() for x in upid.split(',')]
        names_from_ids = [uniprot_client.get_gene_name(x) for x in ids]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid_genes.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            # A failed lookup yields None; stringify so that reporting the
            # inconsistency does not itself raise a TypeError
            print('- Genes from uniprot IDs: %s' %
                  ','.join(str(n) for n in names_from_ids))
        # Add both the gene names and the gene names derived from UniProt
        # IDs; failed lookups (None) are not gene names and would also make
        # the final sort fail
        all_genes.update(names)
        all_genes.update(n for n in names_from_ids if n is not None)
    # Finally remove the invalid gene names
    return sorted(all_genes - invalid_genes)
Beispiel #6
0
    def _get_complex_agents(self, complex_id):
        """Return a list of Agents, one per constituent of a SIGNOR complex.

        Each component returned by ``self._recursively_lookup_complex`` is
        first tried as a UniProt ID; if no gene name is found it is kept as
        a SIGNOR ID. A FamPlex mapping for the component is added when
        ``famplex_map`` has one.
        """
        agents = []
        for comp_id in self._recursively_lookup_complex(complex_id):
            name = uniprot_client.get_gene_name(comp_id)
            if name is None:
                db_refs = {'SIGNOR': comp_id}
            else:
                db_refs = {'UP': comp_id}
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id

            fplx_key = ('SIGNOR', comp_id)
            if fplx_key in famplex_map:
                db_refs['FPLX'] = famplex_map[fplx_key]
                # The FamPlex name is used only when UniProt gave no name
                name = name or db_refs['FPLX']
            elif not name:
                # We neither have a Uniprot nor Famplex grounding, so the
                # SIGNOR ID doubles as the agent name
                logger.info('Have neither a Uniprot nor Famplex grounding ' +
                            'for ' + comp_id)
                name = db_refs['SIGNOR']

            assert name is not None
            agents.append(Agent(name, db_refs=db_refs))
        return agents
Beispiel #7
0
    def _get_agent_from_ref(self, ref):
        """Return an Agent built from a reference XML element, or None.

        None is returned for elements whose 'category' attribute is
        'collection' and for elements without a 'name' var. The 'raw-text'
        var, if present, becomes the TEXT grounding. A 'uid' starting with
        'UP:' is treated as a UniProt mnemonic: if it resolves to an ID, that
        ID is added as UP grounding and the gene name, when available,
        replaces the agent name.
        """
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            logger.warning('Skipping collection Agent.')
            return None

        name_tag = ref.find("var/[@name='name']")
        if name_tag is None:
            return None
        name = name_tag.text

        uid_tag = ref.find("var/[@name='uid']")
        uid = None if uid_tag is None else uid_tag.text

        db_refs = {}
        text_tag = ref.find("var/[@name='raw-text']")
        if text_tag is not None:
            db_refs['TEXT'] = text_tag.text

        if uid is not None and uid.startswith('UP:'):
            # Everything after the 'UP:' prefix is the mnemonic
            up_id = uniprot_client.get_id_from_mnemonic(uid[3:])
            if up_id is not None:
                up_name = uniprot_client.get_gene_name(up_id)
                if up_name is not None:
                    name = up_name
                db_refs['UP'] = up_id

        assert name is not None

        return Agent(name, db_refs=db_refs)
Beispiel #8
0
    def _extract_protein(self, line):
        """Return an Agent for the protein described by a data line, or None.

        ``line`` must provide the keys 'Protein Name' and
        'Protein HMS LINCS ID'. None is returned when the ID is empty.
        Otherwise the db_refs come from ``self._lc.get_protein_refs``; if
        they contain a UP entry with a known gene name, that gene name
        becomes the agent name and its HGNC ID (if any) is added.
        """
        # Extract key information from the line.
        prot_name = line['Protein Name']
        prot_id = line['Protein HMS LINCS ID']

        # Some lines are missing protein information
        if not prot_id:
            return None

        # Get available db-refs.
        db_refs = dict(self._lc.get_protein_refs(prot_id))

        # The resource only gives a UP ID (not HGNC), so try to derive the
        # gene name and HGNC ID from it
        up_id = db_refs.get('UP')
        gene_name = uniprot_client.get_gene_name(up_id) if up_id else None
        if gene_name:
            prot_name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id

        # Create the agent.
        return Agent(prot_name, db_refs=db_refs)
Beispiel #9
0
    def _get_agent_from_ref(self, ref):
        """Build an Agent from a reference XML element.

        Returns None when the element's 'category' attribute is 'collection'
        or when it has no 'name' var. Otherwise the Agent is named after the
        'name' var, unless the 'uid' var holds a 'UP:'-prefixed mnemonic that
        resolves to a UniProt ID with a gene name, in which case the gene
        name is used. TEXT (from 'raw-text') and UP groundings are attached
        when available.
        """
        # TODO: handle collections
        is_collection = ref.attrib.get('category') == 'collection'
        if is_collection:
            logger.warning('Skipping collection Agent.')
            return None

        name_var = ref.find("var/[@name='name']")
        if name_var is None:
            return None
        agent_name = name_var.text

        uid_var = ref.find("var/[@name='uid']")
        uid = uid_var.text if uid_var is not None else None

        raw_text_var = ref.find("var/[@name='raw-text']")
        db_refs = {}
        if raw_text_var is not None:
            db_refs['TEXT'] = raw_text_var.text

        if uid is not None and uid.startswith('UP:'):
            mnemonic = uid[3:]
            up_id = uniprot_client.get_id_from_mnemonic(mnemonic)
            if up_id is not None:
                gene_name = uniprot_client.get_gene_name(up_id)
                if gene_name is not None:
                    agent_name = gene_name
                db_refs['UP'] = up_id

        assert agent_name is not None

        return Agent(agent_name, db_refs=db_refs)
Beispiel #10
0
 def _get_db_refs(bpe):
     """Collect the db_refs dict for a BioPAX entity.

     For proteins, HGNC and UniProt IDs are read from the entity and a
     missing one is derived from the other (the UniProt-to-HGNC direction
     only for human entries). For small molecules only a ChEBI ID is
     collected. For anything else ChEBI, HGNC and UniProt IDs are all
     collected as found, without cross-mapping. Entries that are None are
     left out of the result.
     """
     if _is_protein(bpe):
         hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
         uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
         # Handle missing HGNC/UP ids by mapping from whichever one we have
         if hgnc_id and not uniprot_id:
             uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
         elif uniprot_id and not hgnc_id and \
                 uniprot_client.is_human(uniprot_id):
             hgnc_name = uniprot_client.get_gene_name(uniprot_id, False)
             if hgnc_name:
                 hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
         found = (('HGNC', hgnc_id), ('UP', uniprot_id))
     elif _is_small_molecule(bpe):
         found = (('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),)
     else:
         found = (('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),
                  ('HGNC', BiopaxProcessor._get_hgnc_id(bpe)),
                  ('UP', BiopaxProcessor._get_uniprot_id(bpe)))
     return {db_ns: db_id for db_ns, db_id in found if db_id is not None}
Beispiel #11
0
    def _extract_protein(self, line):
        """Return an Agent for the protein described by a data line, or None.

        ``line`` must provide the keys 'Protein Name' and
        'Protein HMS LINCS ID'. None is returned when the ID is empty.
        Otherwise the db_refs come from ``self._lc.get_protein_refs``. If
        they hold a UP entry, the HGNC ID mapped from it is added and the
        HGNC name becomes the agent name; without an HGNC ID the UniProt
        gene name is used as the name when one is found.
        """
        # Extract key information from the line.
        prot_name = line['Protein Name']
        prot_id = line['Protein HMS LINCS ID']

        # Some lines are missing protein information
        if not prot_id:
            return None

        # Get available db-refs.
        db_refs = dict(self._lc.get_protein_refs(prot_id))

        # The resource only gives a UP ID (not HGNC), so standardize the
        # name through HGNC where possible and through UniProt otherwise
        up_id = db_refs.get('UP')
        if up_id:
            hgnc_id = uniprot_client.get_hgnc_id(up_id)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
                prot_name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                prot_name = uniprot_client.get_gene_name(up_id) or prot_name

        # Create the agent.
        return Agent(prot_name, db_refs=db_refs)
Beispiel #12
0
def fix_stmts(stmts):
    """Clean up a list of statements and return the ones worth keeping.

    Evidence PMIDs carrying a 'PMID' prefix have that prefix removed.
    RegulateActivity statements without a subject and Translocation
    statements without any location are dropped. Agents with a UP grounding
    are renamed to the UniProt gene name and, for human entries, given the
    matching HGNC ID.

    Parameters
    ----------
    stmts : list
        Statements to process; kept statements are modified in place.

    Returns
    -------
    list
        The statements that were not dropped, in their original order.
    """
    new_stmts = []
    for stmt in stmts:
        for ev in stmt.evidence:
            if ev.pmid and ev.pmid.startswith('PMID'):
                # Drop the leading 'PMID' marker; the previous slice [:-4]
                # cut off the last four characters of the ID instead
                ev.pmid = ev.pmid[len('PMID'):]
        # Skip if no subject
        if isinstance(stmt, RegulateActivity) and stmt.subj is None:
            continue
        # Skip if no locations
        if isinstance(stmt, Translocation) and \
                not (stmt.from_location or stmt.to_location):
            continue
        for agent in stmt.agent_list():
            if agent is None:
                continue
            upid = agent.db_refs.get('UP')
            if not upid:
                continue
            gene_name = uniprot_client.get_gene_name(upid)
            if not gene_name:
                continue
            agent.name = gene_name
            # HGNC IDs are only looked up for human proteins
            if uniprot_client.is_human(upid):
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    agent.db_refs['HGNC'] = hgnc_id

        new_stmts.append(stmt)
    return new_stmts
def get_genes_for_go_ids(go_ids):
    """Return the set of gene names annotated with any of the given GO IDs.

    Rows of the module-level ``goa`` table whose 'GO_ID' is in ``go_ids``
    are selected, and the gene name of each distinct 'DB_ID' is looked up
    through ``uniprot_client``. IDs without a gene name are left out.
    NOTE(review): only exact GO_ID matches are selected here, so child terms
    are covered only if the caller already included them - confirm.
    """
    wanted = set(go_ids)
    selected = goa[goa['GO_ID'].isin(wanted)]
    gene_names = set()
    for up_id in sorted(set(selected['DB_ID'])):
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            gene_names.add(gene_name)
    return gene_names
Beispiel #14
0
 def rename_agents(self, stmts):
     """Return a deep copy of the statements with standardized agent names.

     An agent with a 'BE' (Bioentities) entry is named after it. Otherwise,
     an agent with a 'UP' entry is named after the UniProt gene name (looked
     up without web fallback) and receives the corresponding HGNC ID when
     one exists. All other agents keep their name; a fallback to the TEXT
     entry exists in the code history but is disabled.
     """
     # Work on a copy so the caller's statements stay untouched
     mapped_stmts = deepcopy(stmts)
     for stmt in mapped_stmts:
         for agent in stmt.agent_list():
             if agent is None:
                 continue
             refs = agent.db_refs
             # A Bioentities ID takes priority for the name
             be_id = refs.get('BE')
             if be_id:
                 agent.name = be_id
                 continue
             # Next choice: the gene name behind the UniProt ID
             up_id = refs.get('UP')
             if not up_id:
                 continue
             gene_name = uniprot_client.get_gene_name(up_id,
                                                      web_fallback=False)
             if not gene_name:
                 # Lookup failed: continue with no change
                 continue
             agent.name = gene_name
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 refs['HGNC'] = hgnc_id
     return mapped_stmts
Beispiel #15
0
def generate_adeft_terms():
    """Return Terms for every grounded Adeft shortform, sorted by first field.

    For each available shortform the disambiguator is loaded and each of its
    groundings of the form 'NAMESPACE:ID' is turned into a Term with status
    'synonym' and source 'adeft'. The standard name is looked up per
    namespace (HGNC, GO, MESH, CHEBI, UP); for FPLX the ID itself is the
    name. Groundings in other namespaces are logged and skipped.
    """
    from adeft import available_shortforms
    from adeft.disambiguate import load_disambiguator

    # Namespace -> callable producing the standard name for an ID
    name_lookups = {
        'HGNC': lambda db_id: hgnc_client.get_hgnc_name(db_id),
        'GO': lambda db_id: go_client.get_go_label(db_id),
        'MESH': lambda db_id: mesh_client.get_mesh_name(db_id),
        'CHEBI': lambda db_id: chebi_client.get_chebi_name_from_id(db_id),
        'FPLX': lambda db_id: db_id,
        'UP': lambda db_id: uniprot_client.get_gene_name(db_id),
    }

    unique_term_args = set()
    for shortform in available_shortforms:
        disambiguator = load_disambiguator(shortform)
        for grounding in disambiguator.names.keys():
            if grounding == 'ungrounded' or ':' not in grounding:
                continue
            # Only the first colon separates namespace from ID
            db_ns, db_id = grounding.split(':', maxsplit=1)
            lookup = name_lookups.get(db_ns)
            if lookup is None:
                logger.warning('Unknown grounding namespace from Adeft: %s' %
                               db_ns)
                continue
            standard_name = lookup(db_id)
            unique_term_args.add((normalize(shortform), shortform, db_ns,
                                  db_id, standard_name, 'synonym', 'adeft'))

    ordered_args = sorted(list(unique_term_args), key=lambda x: x[0])
    return [Term(*term_args) for term_args in ordered_args]
Beispiel #16
0
    def _get_complex_agents(self, complex_id):
        """Build the Agents that make up a SIGNOR complex.

        The component IDs come from ``self._recursively_lookup_complex``.
        A component with a UniProt gene name is grounded to UP (plus HGNC
        when an ID exists); any other component is grounded to SIGNOR. If
        ``famplex_map`` knows the component, an FPLX grounding is added.
        The name is the gene name if available, else the FamPlex name, else
        the SIGNOR ID.
        """
        components = self._recursively_lookup_complex(complex_id)
        agents = []
        for component in components:
            gene_name = uniprot_client.get_gene_name(component)
            db_refs = {}
            if gene_name is not None:
                db_refs['UP'] = component
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
            else:
                db_refs['SIGNOR'] = component

            name = gene_name
            signor_key = ('SIGNOR', component)
            if signor_key in famplex_map:
                db_refs['FPLX'] = famplex_map[signor_key]
                if not name:
                    # No Uniprot name: fall back to the Famplex name
                    name = db_refs['FPLX']
            elif not name:
                # Neither Uniprot nor Famplex grounding: use the Signor name
                logger.info('Have neither a Uniprot nor Famplex grounding ' +
                            'for ' + component)
                name = db_refs['SIGNOR']

            assert name is not None
            agents.append(Agent(name, db_refs=db_refs))
        return agents
Beispiel #17
0
 def rename_agents(self, stmts):
     """Standardize agent names on a deep copy of the given statements.

     Naming priority per agent: the 'BE' (Bioentities) entry if present;
     otherwise the gene name behind the 'UP' entry, looked up without web
     fallback, in which case the HGNC ID is also added when available.
     Agents with neither, or whose lookup fails, are left unchanged (the
     TEXT-based fallback is currently disabled).

     Returns the list of copied statements; the input is not modified.
     """
     mapped_stmts = deepcopy(stmts)
     for mapped_stmt in mapped_stmts:
         present_agents = [ag for ag in mapped_stmt.agent_list()
                           if ag is not None]
         for agent in present_agents:
             bioentities_id = agent.db_refs.get('BE')
             uniprot_id = agent.db_refs.get('UP')
             if bioentities_id:
                 # Prefer the Bioentities ID for the name
                 agent.name = bioentities_id
             elif uniprot_id:
                 # Take the gene name from Uniprot next
                 gene_name = uniprot_client.get_gene_name(
                     uniprot_id, web_fallback=False)
                 if gene_name:
                     agent.name = gene_name
                     hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                     if hgnc_id:
                         agent.db_refs['HGNC'] = hgnc_id
     return mapped_stmts
Beispiel #18
0
def get_agent_from_grounding(grounding, up_web_fallback=False):
    """Return an INDRA Agent based on a grounding annotation.

    Parameters
    ----------
    grounding : str
        Annotation of the form 'namespace:id' where the namespace is one of
        'uniprotkb', 'refseq' or 'ddbj/embl/genbank'. A UniProt ID may carry
        a '-PRO...' feature suffix, which is stored as UPPRO.
    up_web_fallback : bool
        If True and name standardization reports no success for an agent
        with a UP grounding, the gene name is looked up with web fallback.

    Returns
    -------
    Agent
        Agent named after the raw ID unless a standardized name was found.
        Unsupported namespaces and non-PRO suffixes fail an assertion.
    """
    db_ns, db_id = grounding.split(':')
    # Assume UniProt or RefSeq IDs
    assert db_ns in {'uniprotkb', 'refseq', 'ddbj/embl/genbank'}, db_ns

    if db_ns == 'refseq':
        db_refs = {'REFSEQ_PROT': db_id}
    elif db_ns != 'uniprotkb':
        db_refs = {'GENBANK': db_id}
    elif '-' not in db_id:
        db_refs = {'UP': db_id}
    else:
        up_id, feat_id = db_id.split('-')
        # Assume it's a feature ID
        assert feat_id.startswith('PRO'), feat_id
        db_refs = {'UP': up_id, 'UPPRO': feat_id}

    agent = Agent(db_id, db_refs=db_refs)
    standardized = standardize_agent_name(agent)
    # Handle special case of unreviewed UP entries
    if up_web_fallback and not standardized and 'UP' in db_refs:
        name = uniprot_client.get_gene_name(db_refs['UP'],
                                            web_fallback=True)
        if name:
            agent.name = name
    return agent
Beispiel #19
0
    def standardize_agent_name(agent, standardize_refs=True):
        """Standardize the name of an Agent based on grounding information.

        The namespace and ID returned by ``agent.get_grounding()`` decide the
        name: a FamPlex ID is used directly; for HGNC the name is looked up
        from the grounding ID; for UP the gene name (without web fallback),
        for CHEBI, MESH and GO the respective standard name is used if the
        lookup returns one. If there is no grounding or the namespace is not
        one of these, the name is not changed.

        Parameters
        ----------
        agent : indra.statements.Agent
            An INDRA Agent whose name attribute should be standardized based
            on grounding information. None is accepted and ignored.
        standardize_refs : Optional[bool]
            If True, the Agent's db_refs are first replaced by the result of
            GroundingMapper.standardize_db_refs, e.g., HGNC mapped to UP.
            Default: True
        """
        # Nothing to do for None Agents
        if agent is None:
            return

        if standardize_refs:
            agent.db_refs = GroundingMapper.standardize_db_refs(agent.db_refs)

        # Without a prioritized grounding there is nothing to name after
        db_ns, db_id = agent.get_grounding()
        if not db_ns or not db_id:
            return

        # FamPlex and HGNC names are assigned unconditionally
        if db_ns == 'FPLX':
            agent.name = agent.db_refs['FPLX']
            return
        if db_ns == 'HGNC':
            # db_id is whatever get_grounding returned for HGNC
            agent.name = hgnc_client.get_hgnc_name(db_id)
            return

        # The remaining namespaces only rename when the lookup succeeds
        if db_ns == 'UP':
            new_name = uniprot_client.get_gene_name(agent.db_refs['UP'],
                                                    web_fallback=False)
        elif db_ns == 'CHEBI':
            new_name = \
                chebi_client.get_chebi_name_from_id(agent.db_refs['CHEBI'])
        elif db_ns == 'MESH':
            new_name = mesh_client.get_mesh_name(agent.db_refs['MESH'], False)
        elif db_ns == 'GO':
            new_name = go_client.get_go_label(agent.db_refs['GO'])
        else:
            return
        if new_name:
            agent.name = new_name
Beispiel #20
0
    def _get_name_by_id(self, entity_id):
        """Return a valid name for the TERM with the given ID, or None.

        The name is chosen from the term's 'dbid' attribute, a '|'-separated
        list of groundings: an HGNC ID is preferred, then a UniProt ID, and
        finally the text of the term's name tag. When several HGNC IDs are
        listed and the term has a 'lisp' attribute with scores, the
        highest-scoring ID is used; otherwise the first listed ID is used.

        Parameters
        ----------
        entity_id : str
            Value of the 'id' attribute of the TERM element to look up in
            ``self.tree``.

        Returns
        -------
        The result of ``self._get_valid_name`` for the chosen name, or None
        if the term or its name tag is missing.
        """
        entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
        if entity_term is None:
            logger.debug('Term %s for entity not found' % entity_id)
            return None
        name = entity_term.find("name")
        if name is None:
            logger.debug('Entity without a name')
            return None
        # A missing dbid simply means there is no grounding information;
        # look it up explicitly rather than with a catch-all except
        dbid = entity_term.attrib.get('dbid')
        if dbid is None:
            return self._get_valid_name(name.text)

        dbids = dbid.split('|')
        hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
        up_ids = [i for i in dbids if i.startswith('UP')]

        # TODO: handle protein families like 14-3-3 with IDs like
        # XFAM:PF00244.15, FA:00007
        if hgnc_ids:
            hgnc_id = None
            if len(hgnc_ids) > 1:
                lisp_str = entity_term.attrib.get('lisp')
                if lisp_str is not None:
                    # Each '(TERM :ID ' chunk may carry an HGNC ID and score
                    scores = {}
                    for part in lisp_str.split('(TERM :ID '):
                        res = re.findall(r'HGNC::\|(.*)\|', part)
                        if res:
                            score = re.findall(r':SCORE ([^ ]+)', part)[0]
                            scores[res[0]] = float(score)
                    if scores:
                        sorted_ids = sorted(scores.items(),
                                            key=operator.itemgetter(1))
                        hgnc_id = sorted_ids[-1][0]
            if hgnc_id is None:
                # Single ID, no lisp attribute, or no scores found in it:
                # fall back to the first listed HGNC ID (previously the
                # no-scores case left hgnc_id unbound)
                hgnc_id = re.match(r'HGNC\:([0-9]*)',
                                   hgnc_ids[0]).groups()[0]
            hgnc_name = self._get_hgnc_name(hgnc_id)
            return self._get_valid_name(hgnc_name)
        elif up_ids:
            # Report multiple UniProt IDs; this used to test hgnc_ids, which
            # is always empty in this branch
            if len(up_ids) > 1:
                logger.debug('%d UniProt IDs reported.' % len(up_ids))
            up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
            # First try to get HGNC name
            hgnc_name = up_client.get_hgnc_name(up_id)
            if hgnc_name is not None:
                return self._get_valid_name(hgnc_name)
            # Next, try to get the gene name
            gene_name = up_client.get_gene_name(up_id)
            if gene_name is not None:
                return self._get_valid_name(gene_name)
        # By default, return the text of the name tag
        name_txt = name.text.strip('|')
        return self._get_valid_name(name_txt)
Beispiel #21
0
def print_reading_contribs(reader_sites, psp_sites):
    """Print the sites that only the readers contributed.

    Both arguments are dicts keyed by 5-tuples of the form
    (ctrl_id, ctrl_ns, up_id, residue, pos). For every key present in
    ``reader_sites`` but not in ``psp_sites`` one line
    'ctrl_id -> gene-residue+position' is printed, ordered by controller
    ID, UniProt ID and position. A notice is printed first when no gene
    name is found (the line then shows None as the gene).
    """
    reader_only = set(reader_sites.keys()).difference(psp_sites.keys())
    ordered_sites = sorted(reader_only,
                           key=lambda site: (site[0], site[2], site[4]))
    for ctrl_id, ctrl_ns, up_id, residue, pos in ordered_sites:
        target_name = uniprot_client.get_gene_name(up_id, web_fallback=False)
        if target_name is None:
            print('Could not get gene name for %s' % up_id)
        print('%s -> %s-%s%s' % (ctrl_id, target_name, residue, int(pos)))
Beispiel #22
0
def get_db_refs_by_ident(ns, ident, node_data):
    """Return standard name and grounding based on a namespace and an ID.

    Parameters
    ----------
    ns : str
        A name space in which the given identifier is interpreted.
    ident : str
        The identifier in the given name space to get grounding for.
    node_data : dict
        Node data; its name entry is the default name, and it is also used
        for logging (where its ``identifier`` attribute is read).

    Returns
    -------
    name : str
        The standardized name for the given entity. For HGNC and UP this is
        the looked-up name; for other name spaces the name from node_data.
    db_refs : dict
        The grounding for the given entity, or None for an unhandled name
        space. Both values are None if an HGNC or UP name lookup fails.
    """
    name = node_data.get(pc.NAME)

    if ns == 'HGNC':
        name = hgnc_client.get_hgnc_name(ident)
        if not name:
            return None, None
        db_refs = {'HGNC': ident}
        # Add UniProt and miRBase mappings where they exist
        up_id = _get_up_id(ident)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(ident)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
        return name, db_refs

    if ns == 'UP':
        name = uniprot_client.get_gene_name(ident)
        if not name:
            return None, None
        db_refs = {'UP': ident}
        # Only human entries are mapped to HGNC
        if uniprot_client.is_human(ident):
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
            else:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
        return name, db_refs

    if ns == 'MIRBASE':
        return name, {'MIRBASE': ident}
    # These name spaces are passed through under their own key
    if ns in ('MGI', 'RGD', 'CHEBI', 'HMDB', 'MESH'):
        return name, {ns: ident}
    if ns == 'PUBCHEM.COMPOUND':
        return name, {'PUBCHEM': ident}

    logger.info("Unhandled namespace %s with name %s and "
                "identifier %s (%s)." % (ns, name,
                                         node_data.identifier,
                                         node_data))
    return name, None
Beispiel #23
0
def protein_map_from_twg(twg):
    """Build a map of entity texts to validated human protein groundings.

    For each entity text, the Uniprot IDs among its groundings are checked:
    an ID is accepted if its mnemonic ends in '_HUMAN' and its gene name
    equals the entity text, ignoring case. The returned dict can be used to
    update/expand the grounding map.

    Parameters
    ----------
    twg : list of tuple
        list of tuples of the form output by agent_texts_with_grounding;
        the first element is the agent text and the second a list of
        (namespace, ID, ...) entries.

    Returns
    -------
    protein_map : dict
        dict keyed on agent text with associated values
        {'TEXT': agent_text, 'UP': uniprot_id}. If several Uniprot IDs
        match one text, the last match is kept.
    """
    protein_map = {}
    matched = 0
    unmatched = 0
    logger.info('Building grounding map for human proteins')
    for agent_text, grounding_list, _ in twg:
        # Texts without any 'UP' (Uniprot) grounding yield an empty list
        # here and are thereby skipped
        uniprot_ids = [entry[1] for entry in grounding_list
                       if entry[0] == 'UP']
        for uniprot_id in uniprot_ids:
            # Non-human (or unknown) proteins are ignored and not counted
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            if mnemonic is None or not mnemonic.endswith('_HUMAN'):
                continue
            # Match the gene name against the agent text
            gene_name = uniprot_client.get_gene_name(uniprot_id)
            if gene_name is not None and \
                    agent_text.upper() == gene_name.upper():
                matched += 1
                protein_map[agent_text] = {'TEXT': agent_text,
                                           'UP': uniprot_id}
            else:
                unmatched += 1
    logger.info('Exact matches for %d proteins' % matched)
    logger.info('No match (or no gene name) for %d proteins' % unmatched)
    return protein_map
Beispiel #24
0
def get_drug_targets(fname='drug_grounding.csv'):
    """Map drug abbreviations to the gene names of their UniProt targets.

    Parameters
    ----------
    fname : str
        Path to a CSV file that is read without a header row. Column 1 is
        used as the drug abbreviation and column 6 as a comma-separated
        string of UniProt IDs.

    Returns
    -------
    targets : dict
        Dict keyed by drug abbreviation. Each value is a list holding the
        result of ``uniprot_client.get_gene_name`` for every UniProt ID
        listed for that drug.
    """
    table = pandas.read_csv(fname, index_col=None, header=None)
    return {
        abbrev: [uniprot_client.get_gene_name(up_id)
                 for up_id in up_id_field.split(',')]
        for abbrev, up_id_field in zip(table[1], table[6])
    }
Beispiel #25
0
def get_drug_inhibition_stmts(drug):
    """Return Inhibition statements for a drug based on ChEMBL activity data.

    Parameters
    ----------
    drug : Agent
        Agent representing drug with MESH or CHEBI grounding

    Returns
    -------
    stmts : list of INDRA statements
        One Inhibition statement per UniProt accession of every protein
        target for which at least one piece of evidence could be built.
        None is returned instead when the drug has neither a ChEBI nor a
        MESH grounding.
    """
    chebi_id = drug.db_refs.get('CHEBI')
    mesh_id = drug.db_refs.get('MESH')
    if not chebi_id and not mesh_id:
        logger.error('Drug missing ChEBI or MESH grounding.')
        return None
    # ChEBI takes precedence over MESH when both are available
    if chebi_id:
        drug_chembl_id = chebi_client.get_chembl_id(chebi_id)
    else:
        drug_chembl_id = get_chembl_id(mesh_id)
    logger.info('Drug: %s' % (drug_chembl_id))

    query_dict = {
        'query': 'activity',
        'params': {'molecule_chembl_id': drug_chembl_id, 'limit': 10000},
    }
    activities = send_query(query_dict)['activities']
    targ_act_dict = activities_by_target(activities)
    protein_targets = get_protein_targets_only(list(targ_act_dict))
    # Keep the activity IDs of protein targets only
    protein_act_dict = {target: targ_act_dict[target]
                        for target in protein_targets}

    stmts = []
    for target_chembl_id, activity_ids in protein_act_dict.items():
        target_activities = [act for act in activities
                             if act['activity_id'] in activity_ids]
        components = protein_targets[target_chembl_id]['target_components']
        target_upids = [comp['accession'] for comp in components]
        # Assays for which no evidence can be built are left out
        evidence = []
        for assay in target_activities:
            ev = get_evidence(assay)
            if ev:
                evidence.append(ev)
        if not evidence:
            continue
        for target_upid in target_upids:
            agent_name = uniprot_client.get_gene_name(target_upid)
            target_agent = Agent(agent_name, db_refs={'UP': target_upid})
            stmts.append(Inhibition(drug, target_agent, evidence=evidence))
    return stmts
Beispiel #26
0
def get_unannotated_antibody_genes(data):
    """Return the gene names corresponding to unannotated ABs.

    Parameters
    ----------
    data : object
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of str
        Sorted, de-duplicated gene names looked up for every UniProt ID in
        the comma-separated values of ``unannotated_ab_map``. UniProt IDs
        for which no gene name is found are left out.
    """
    gene_names = set()
    for up_id_field in unannotated_ab_map.values():
        for up_id in up_id_field.split(','):
            gene_name = uniprot_client.get_gene_name(up_id)
            # The lookup may come back empty (None); None cannot be sorted
            # together with strings on Python 3, so only keep real names.
            if gene_name:
                gene_names.add(gene_name)
    return sorted(gene_names)
Beispiel #27
0
def get_unannotated_antibody_genes(data):
    """Return the gene names corresponding to unannotated ABs.

    Parameters
    ----------
    data : object
        Unused; accepted only for compatibility with existing callers.

    Returns
    -------
    list of str
        Sorted list of the distinct gene names found for the UniProt IDs
        listed (comma-separated) in the values of ``unannotated_ab_map``.
        Lookups that yield no gene name are skipped.
    """
    looked_up = (
        uniprot_client.get_gene_name(up_id)
        for up_id_field in unannotated_ab_map.values()
        for up_id in up_id_field.split(',')
    )
    # Drop empty lookup results (None): sorting a mix of None and str
    # raises TypeError on Python 3.
    return sorted({gene_name for gene_name in looked_up if gene_name})
Beispiel #28
0
def load_brca_sites():
    """Load the set of (gene name, site) pairs from the BRCA dataset file.

    Returns
    -------
    sites : set of tuple
        One ``(gene_name, site_info)`` tuple per data row, where
        ``site_info`` is the second underscore-separated field of the first
        column and ``gene_name`` is looked up from the UniProt ID in the
        last column.
    """
    filename = 'sources/Merged_dataset_normalized_subset.csv'
    sites = set()
    # The first row is skipped when reading the file
    for row in read_unicode_csv(filename, skiprows=1):
        site_info = row[0].split('_')[1]
        sites.add((uniprot_client.get_gene_name(row[-1]), site_info))
    return sites
Beispiel #29
0
def get_drug_targets(fname='drug_grounding.csv'):
    """Return the target gene names of each drug in a grounding table.

    Parameters
    ----------
    fname : str
        Path to a header-less CSV file whose column 1 holds drug
        abbreviations and whose column 6 holds comma-separated UniProt IDs.

    Returns
    -------
    targets : dict
        Dict from drug abbreviation to the list of gene names returned by
        ``uniprot_client.get_gene_name`` for the drug's UniProt IDs.
    """
    df = pandas.read_csv(fname, index_col=None, header=None)
    rows = zip(df[1], df[6])
    lookup = uniprot_client.get_gene_name
    targets = {}
    for abbrev, up_id_field in rows:
        targets[abbrev] = list(map(lookup, up_id_field.split(',')))
    return targets
Beispiel #30
0
 def _get_agent(self, ent_name, ent_type, id, database):
     """Return an Agent for the given SIGNOR entity.

     If `id` is a SIGNOR complex found in ``self.complex_map``, the first
     constituent Agent is returned with the remaining constituents attached
     as BoundConditions. Otherwise a single Agent is built whose grounding
     depends on the entry of ``_type_db_map`` for ``(ent_type, database)``.

     Parameters
     ----------
     ent_name : str
         Name of the entity as given in the source data. Used as the Agent
         name whenever no standardized name is found.
     ent_type : str
         Type of the entity, e.g. 'proteinfamily'.
     id : str
         Identifier of the entity in `database`.
     database : str
         Name of the database that `id` refers to.

     Returns
     -------
     Agent
         The Agent representing the entity.

     Raises
     ------
     ValueError
         If the entity has a grounding type other than 'UP', is not a
         protein family, and `database` is not one of the expected ones.
     """
     if database == 'SIGNOR' and id in self.complex_map:
         agents = self._get_complex_agents(id)
         # Return the first agent with the remaining agents as a bound
         # condition
         agent = agents[0]
         agent.bound_conditions = [BoundCondition(a, True)
                                   for a in agents[1:]]
         return agent

     gnd_type = _type_db_map[(ent_type, database)]
     if gnd_type == 'UP':
         up_id = id
         db_refs = {'UP': up_id}
         hgnc_id = uniprot_client.get_hgnc_id(up_id)
         if hgnc_id:
             db_refs['HGNC'] = hgnc_id
             name = hgnc_client.get_hgnc_name(hgnc_id)
         else:
             name = uniprot_client.get_gene_name(up_id)
         # The name lookups can come back empty; use the name from the
         # source data rather than creating an Agent without a name.
         if not name:
             name = ent_name
     # Map SIGNOR protein families to FamPlex families
     elif ent_type == 'proteinfamily':
         # Keep the SIGNOR family ID in db_refs
         db_refs = {database: id}
         key = (database, id)
         # Use SIGNOR name unless we have a mapping in FamPlex
         name = ent_name
         famplex_id = famplex_map.get(key)
         if famplex_id is None:
             logger.info('Could not find %s in FamPlex map' % str(key))
         else:
             db_refs['FPLX'] = famplex_id
             name = famplex_id
     # Other possible groundings are PUBCHEM, SIGNOR, etc.
     elif gnd_type is not None:
         if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase',
                             'DRUGBANK'):
             raise ValueError('Unexpected database %s' % database)
         if database == 'PUBCHEM' and id.startswith('CID:'):
             # We take off the CID: prefix plus fix an issue with
             # SIGNOR's format in which it leaves extra spaces around
             # the ID, as in 'CID: 923'
             id = id[4:].strip()
         db_refs = {gnd_type: id}
         name = ent_name
     # If no grounding, include as an untyped/ungrounded node
     else:
         name = ent_name
         db_refs = {}
     return Agent(name, db_refs=db_refs)
Beispiel #31
0
def protein_map_from_twg(twg):
    """Build a map of entity texts that validates protein grounding.

    Goes through the groundings of the entity texts extracted from the
    statements and picks out those grounded to a human Uniprot entry whose
    gene name is a case-insensitive match to the entity text. The returned
    dict can be used to update/expand the grounding map.

    Parameters
    ----------
    twg : list of tuple
        list of tuples of the form output by agent_texts_with_grounding

    Returns
    -------
    protein_map : dict
        dict keyed on agent text with associated values
        {'TEXT': agent_text, 'UP': uniprot_id}. Entries are for agent texts
        where a human protein grounded to this agent_text was found in
        Uniprot.
    """
    protein_map = {}
    matched = 0
    unmatched = 0
    logger.info('Building grounding map for human proteins')
    for agent_text, grounding_list, _ in twg:
        # Texts without any 'UP' (Uniprot) grounding contribute nothing
        uniprot_ids = [entry[1] for entry in grounding_list
                       if entry[0] == 'UP']
        for uniprot_id in uniprot_ids:
            # Only human proteins are considered
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            if mnemonic is None or not mnemonic.endswith('_HUMAN'):
                continue
            # Compare the gene name with the agent text, ignoring case; a
            # missing gene name counts as unmatched
            gene_name = uniprot_client.get_gene_name(uniprot_id)
            if gene_name is not None and \
                    agent_text.upper() == gene_name.upper():
                matched += 1
                protein_map[agent_text] = {'TEXT': agent_text,
                                           'UP': uniprot_id}
            else:
                unmatched += 1
    logger.info('Exact matches for %d proteins' % matched)
    logger.info('No match (or no gene name) for %d proteins' % unmatched)
    return protein_map
Beispiel #32
0
def get_drug_targets(fname=None):
    """Return the target gene names of each drug in a grounding table.

    Parameters
    ----------
    fname : str or None
        Path to a header-less, UTF-8 encoded CSV file. When empty or None,
        the module-level ``drug_grounding_file`` is used. Column 1 is read
        as the drug abbreviation and column 6 as a comma-separated string
        of UniProt IDs.

    Returns
    -------
    targets : dict
        Dict from drug abbreviation to the list of gene names returned by
        ``uniprot_client.get_gene_name`` for the drug's UniProt IDs.
    """
    table = pandas.read_csv(fname or drug_grounding_file, index_col=None,
                            header=None, encoding='utf-8')
    return {
        abbrev: [uniprot_client.get_gene_name(up_id)
                 for up_id in up_id_field.split(',')]
        for abbrev, up_id_field in zip(table[1], table[6])
    }
Beispiel #33
0
def _fix_agent(agent):
    if agent is None:
        return
    # First we fix some name spaces
    db_refs_tmp = copy(agent.db_refs)
    for db_ns, db_id in agent.db_refs.items():
        # Change FA name space
        if db_ns == 'FA':
            db_refs_tmp.pop('FA', None)
            db_refs_tmp['NXPFA'] = db_id
        # Change IPR name space
        elif db_ns == 'IPR':
            db_refs_tmp.pop('IPR', None)
            db_refs_tmp['IP'] = db_id
        # Change XFAM name space
        elif db_ns == 'XFAM':
            db_refs_tmp.pop('XFAM', None)
            db_refs_tmp['PF'] = db_id.split('.')[0]
    agent.db_refs = db_refs_tmp
    # Check if we have a BE entry
    be_id = agent.db_refs.get('BE')
    # Try to map to BE from NXP, IPR, PF, NCIT
    if not be_id:
        for db_ns, db_id in agent.db_refs.items():
            be_id = bioentities_map.get((db_ns, db_id))
            if be_id:
                break
    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]
    # Check what entries we have
    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')
    # BE takes precedence if we have it
    if be_id:
        agent.db_refs['BE'] = be_id
        agent.name = be_id
    elif hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
    elif up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
Beispiel #34
0
    def rename_agents(self, stmts):
        """Return copies of the given statements with updated agent names.

        The input list and its statements are left untouched; all changes
        are made on a deep copy.

        An agent with a FamPlex grounding is named after its FamPlex ID.
        Otherwise, if it has a Uniprot ID, the gene name for that ID is
        looked up (without web fallback); when one is found it becomes the
        agent name and the matching HGNC ID, if any, is added to the
        agent's db_refs. In all other cases the agent keeps its name.

        Parameters
        ----------
        stmts : list of :py:class:`indra.statements.Statement`
            List of statements whose Agents need their names updated.

        Returns
        -------
        mapped_stmts : list of :py:class:`indra.statements.Statement`
            A new list of Statements with updated Agent names
        """
        mapped_stmts = deepcopy(stmts)
        for stmt in mapped_stmts:
            for agent in stmt.agent_list():
                if agent is None:
                    continue
                # If there's a FamPlex ID, prefer that for the name
                fplx_id = agent.db_refs.get('FPLX')
                if fplx_id:
                    agent.name = fplx_id
                    continue
                # Take a HGNC name from Uniprot next
                up_id = agent.db_refs.get('UP')
                if not up_id:
                    continue
                gene_name = uniprot_client.get_gene_name(up_id,
                                                         web_fallback=False)
                # If the lookup fails, the agent is left unchanged
                if not gene_name:
                    continue
                agent.name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    agent.db_refs['HGNC'] = hgnc_id
        return mapped_stmts
Beispiel #35
0
    def rename_agents(self, stmts):
        """Return a new list of statements whose agents have updated names.

        Works on a deep copy, so the statements passed in are not modified.

        Naming rules, in order of precedence: a FamPlex ID in the agent's
        db_refs is used as the name; otherwise the gene name looked up from
        the agent's Uniprot ID (no web fallback) is used, and the HGNC ID
        for that gene name is added to db_refs when it exists. Agents for
        which neither applies keep their original name.

        Parameters
        ----------
        stmts : list of :py:class:`indra.statements.Statement`
            List of statements whose Agents need their names updated.

        Returns
        -------
        mapped_stmts : list of :py:class:`indra.statements.Statement`
            A new list of Statements with updated Agent names
        """
        mapped_stmts = deepcopy(stmts)
        # All non-None agents of all copied statements, in order
        agents = (agent
                  for stmt in mapped_stmts
                  for agent in stmt.agent_list()
                  if agent is not None)
        for agent in agents:
            refs = agent.db_refs
            if refs.get('FPLX'):
                # A FamPlex ID is preferred for the name
                agent.name = refs.get('FPLX')
            elif refs.get('UP'):
                # Next best is the gene name from Uniprot; if there is
                # none, the agent is left as it is
                gene_name = uniprot_client.get_gene_name(
                    refs.get('UP'), web_fallback=False)
                if gene_name:
                    agent.name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        refs['HGNC'] = hgnc_id
        return mapped_stmts
Beispiel #36
0
 def _get_agent(self, ent_name, ent_type, id, database):
     """Return an Agent for the given SIGNOR entity.

     If `id` is a SIGNOR complex found in ``self.complex_map``, the first
     constituent Agent is returned with the remaining constituents attached
     as BoundConditions. Otherwise a single Agent is built whose grounding
     depends on the entry of ``_type_db_map`` for ``(ent_type, database)``.

     Parameters
     ----------
     ent_name : str
         Name of the entity as given in the source data. Used as the Agent
         name whenever no standardized name is found.
     ent_type : str
         Type of the entity, e.g. 'proteinfamily'.
     id : str
         Identifier of the entity in `database`.
     database : str
         Name of the database that `id` refers to.

     Returns
     -------
     Agent
         The Agent representing the entity.

     Raises
     ------
     ValueError
         If the entity has a grounding type other than 'UP', is not a
         protein family, and `database` is not one of the expected ones.
     """
     if database == 'SIGNOR' and id in self.complex_map:
         agents = self._get_complex_agents(id)
         # Return the first agent with the remaining agents as a bound
         # condition
         agent = agents[0]
         agent.bound_conditions = [BoundCondition(a, True)
                                   for a in agents[1:]]
         return agent

     gnd_type = _type_db_map[(ent_type, database)]
     if gnd_type == 'UP':
         up_id = id
         db_refs = {'UP': up_id}
         name = uniprot_client.get_gene_name(up_id)
         if name:
             hgnc_id = hgnc_client.get_hgnc_id(name)
             if hgnc_id:
                 db_refs['HGNC'] = hgnc_id
         else:
             # No gene name found: skip the HGNC lookup and use the name
             # from the source data instead of an Agent without a name.
             name = ent_name
     # Map SIGNOR protein families to FamPlex families
     elif ent_type == 'proteinfamily':
         # Keep the SIGNOR family ID in db_refs
         db_refs = {database: id}
         key = (database, id)
         # Use SIGNOR name unless we have a mapping in FamPlex
         name = ent_name
         famplex_id = famplex_map.get(key)
         if famplex_id is None:
             logger.info('Could not find %s in FamPlex map' %
                         str(key))
         else:
             db_refs['FPLX'] = famplex_id
             name = famplex_id
     # Other possible groundings are PUBCHEM, SIGNOR, etc.
     elif gnd_type is not None:
         if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase'):
             raise ValueError('Unexpected database %s' % database)
         if database == 'PUBCHEM' and id.startswith('CID:'):
             # We take off the CID: prefix plus fix an issue with
             # SIGNOR's format in which it leaves extra spaces around
             # the ID, as in 'CID: 923'
             id = id[4:].strip()
         db_refs = {gnd_type: id}
         name = ent_name
     # If no grounding, include as an untyped/ungrounded node
     else:
         name = ent_name
         db_refs = {}
     return Agent(name, db_refs=db_refs)
Beispiel #37
0
def fix_protein_grounding(agent):
    """Normalize the grounding of a protein Agent in place.

    Upper-cases all db_refs name spaces, makes sure there is a TEXT entry
    (falling back to the Agent's name), strips the isoform suffix from the
    UniProt ID and, if an HGNC ID can be found for the corresponding gene
    name, sets the Agent's name to the gene name and adds the HGNC ID.

    Parameters
    ----------
    agent : Agent
        The Agent whose ``db_refs`` and ``name`` are updated.
    """
    # Iterate over a snapshot of the items: the dict is modified inside the
    # loop, and changing a dict while iterating over it directly raises a
    # RuntimeError on Python 3.
    for db_ns, db_id in list(agent.db_refs.items()):
        agent.db_refs.pop(db_ns, None)
        agent.db_refs[db_ns.upper()] = db_id
    if not agent.db_refs.get('TEXT'):
        agent.db_refs['TEXT'] = agent.name
    up_id = agent.db_refs.get('UP')
    if not up_id:
        return
    # Keep only the part of the ID before the first dash
    up_id = up_id.split('-')[0]
    agent.db_refs['UP'] = up_id
    hgnc_symbol = uniprot_client.get_gene_name(up_id)
    # Without a gene name there is nothing to look up in HGNC
    if not hgnc_symbol:
        return
    hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
    if hgnc_id:
        agent.name = hgnc_symbol
        agent.db_refs['HGNC'] = hgnc_id
Beispiel #38
0
def get_genes_for_go_ids(go_ids, goa):
    """Return genes that are annotated with a given go ID or its children.

    Parameters
    ----------
    go_ids : iterable of str
        GO IDs of interest.
    goa : pandas.DataFrame
        Annotation table with a 'GO_ID' column and a 'DB_ID' column; the
        'DB_ID' values are passed to ``uniprot_client.get_gene_name``.

    Returns
    -------
    set of str
        Gene names of all entries annotated with one of the given GO IDs
        or one of their children. Entries without a gene name are skipped.
    """
    all_go_ids = set()
    for go_id in go_ids:
        children = bio_ontology.get_children('GO', go_id)
        all_go_ids.add(go_id)
        # Each child is a sequence whose second element is the child's ID
        all_go_ids.update(child[1] for child in children)
    annotated = goa[goa['GO_ID'].isin(all_go_ids)]
    gene_names = set()
    for up_id in sorted(set(annotated['DB_ID'])):
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            gene_names.add(gene_name)
    return gene_names
Beispiel #39
0
def _agent_from_id(db_id):
    """Return an Agent for an Ensembl protein ID or a UniProt ID.

    Parameters
    ----------
    db_id : str
        An ID starting with 'ENSP' is treated as an Ensembl protein ID;
        anything else is treated as a UniProt ID.

    Returns
    -------
    Agent or None
        The Agent for the ID, or None if no gene name is found for a
        UniProt ID.
    """
    # There are some Ensembl protein IDs which we currently can't normalize
    # to anything else (unlike ENSG), so the ID itself serves as the name.
    if db_id.startswith('ENSP'):
        return Agent(db_id, db_refs={'ENSEMBL': db_id})
    # All other entries are UniProt IDs
    name = uniprot_client.get_gene_name(db_id)
    if not name:
        return None
    db_refs = {'UP': db_id}
    hgnc_id = uniprot_client.get_hgnc_id(db_id)
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
    return Agent(name, db_refs=db_refs)
Beispiel #40
0
 def get_name(bpe):
     """Return a name for a BioPAX element.

     The HGNC name is used if the element has an HGNC ID, otherwise the
     gene name for its UniProt ID. The element's display name is used
     when neither ID is available or the name lookup yields None.
     """
     # FIXME Deal with case when HGNC entry is not name
     # Deal with case when multiple Uniprot IDs marked as
     # primary
     hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
     uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
     name = None
     if hgnc_id is not None:
         name = BiopaxProcessor._get_hgnc_name(hgnc_id)
     elif uniprot_id is not None:
         name = uniprot_client.get_gene_name(uniprot_id)
     # No ID at all, or the lookup found nothing
     if name is None:
         name = bpe.getDisplayName()
     return name
Beispiel #41
0
def get_antibody_map(data):
    """Return a dict mapping antibody names to their targets.

    Parameters
    ----------
    data : dict
        Must contain an 'antibody' table with 'Protein Data ID' and
        'UniProt ID' columns; the latter holds comma-separated UniProt IDs.

    Returns
    -------
    ab_map : dict
        For antibodies not covered by ``get_phospho_antibody_map``, a list
        with one Agent per UniProt ID; for the others, the entry from the
        phospho antibody map.
    """
    phos_ab_map = get_phospho_antibody_map()
    ab_map = {}
    for _, row in data['antibody'].iterrows():
        ab_name = row['Protein Data ID']
        # Phospho antibodies are taken from their own map below
        if ab_name in phos_ab_map:
            continue
        for upid in row['UniProt ID'].split(','):
            hgnc_symbol = uniprot_client.get_gene_name(upid)
            hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
            target = Agent(hgnc_symbol,
                           db_refs={'UP': upid, 'HGNC': hgnc_id})
            ab_map.setdefault(ab_name, []).append(target)
    ab_map.update(phos_ab_map)
    return ab_map
Beispiel #42
0
def get_all_enzymes():
    """Return gene names of human enzymes that are not kinases/phosphatases.

    Reads the ec-code OBO file from the ``.obo`` folder in the user's home
    directory, collects the UniProt IDs among its nodes, and keeps the gene
    names of the human ones that HGNC classifies neither as a kinase nor
    as a phosphatase.

    Returns
    -------
    enzymes : set
        The resulting set of gene names.
    """
    obo_path = os.path.join(str(Path.home()), '.obo/ec-code/ec-code.obo')
    if not os.path.exists(obo_path):
        # NOTE(review): presumably this call makes pyobo create the file
        # that is read right below -- confirm against the pyobo docs
        pyobo.get_id_name_mapping('ec-code')
    obo = obonet.read_obo(obo_path)
    # Drop the first 8 characters of each node name, i.e. 'uniprot' plus
    # one more character
    up_ids = {node[8:] for node in obo.nodes if node.startswith('uniprot')}
    human_up_ids = {up_id for up_id in up_ids
                    if uniprot_client.is_human(up_id)}
    gene_names = {uniprot_client.get_gene_name(up_id)
                  for up_id in human_up_ids}
    enzymes = {gene for gene in gene_names
               if not hgnc_client.is_kinase(gene)
               and not hgnc_client.is_phosphatase(gene)}
    logger.info(f'Filtered {len(enzymes)} enzymes in total')
    return enzymes
Beispiel #43
0
def get_all_gene_names(data, out_file='prior_genes.txt'):
    """Return all gene names corresponding to all ABs.

    Collects the gene names given for the antibodies and the gene names
    derived from their UniProt IDs, removes names without an HGNC ID, and
    adds the genes of unannotated antibodies, the drug targets and the
    RAS pathway genes. The result is also written to a file, one gene per
    line.

    Parameters
    ----------
    data : dict
        Must contain an 'antibody' table with 'Protein Data ID',
        'Gene Name' and 'UniProt ID' columns.
    out_file : str
        Path of the file the gene names are written to (UTF-8 encoded).

    Returns
    -------
    all_genes : list of str
        Sorted list of distinct gene names. Empty lookup results (None)
        are never included.
    """
    antibodies = data['antibody']
    data_filt = antibodies[pandas.notnull(antibodies['Protein Data ID'])]
    all_genes = set()
    invalid_genes = set()
    for gn, upid in zip(data_filt['Gene Name'], data_filt['UniProt ID']):
        # Some entries are lists of genes separated by commas
        # and we also strip off extra spaces
        names = [x.strip() for x in gn.split(',')]
        ids = [x.strip() for x in upid.split(',')]
        names_from_ids = [uniprot_client.get_gene_name(x) for x in ids]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid_genes.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            # A failed lookup may leave None in the list, which str.join
            # rejects, so convert explicitly for the report
            print('- Genes from uniprot IDs: %s' %
                  ','.join(str(n) for n in names_from_ids))
        # Add both the gene names and the gene names derived from UniProt
        # IDs, leaving out lookups that found nothing
        all_genes.update(names)
        all_genes.update(n for n in names_from_ids if n)
    # Finally remove the invalid gene names
    all_genes = list(all_genes.difference(invalid_genes))
    # Add the unannotated genes
    all_genes += get_unannotated_antibody_genes(data)
    # Add drug target genes
    for targets in get_drug_targets().values():
        all_genes += targets
    # Add other important genes, for now, the RAS pathway
    all_genes += get_ras227_genes()
    # The added lists may contain None for failed lookups; None can't be
    # sorted together with strings on Python 3 and is not a gene name
    all_genes = sorted({gene for gene in all_genes if gene})
    print('%d genes in total' % len(all_genes))
    with open(out_file, 'wb') as fh:
        for gene in all_genes:
            fh.write(('%s\n' % gene).encode('utf-8'))
    return all_genes
Beispiel #44
0
 def _initialize_node_agents(self):
     """Initialize internal dicts containing node information.

     Fills ``self._node_names`` and ``self._node_agents``, both keyed by
     node ID. Nodes with a UniProt alias are named after their HGNC name
     (or UniProt gene name if there is no HGNC ID); all other nodes keep
     their own name and are grounded to HGNC/UniProt via that name if
     possible. Nodes whose name is not a valid gene symbol only get an
     Agent if ``self.require_grounding`` is not set.
     """
     nodes = _get_dict_from_list('nodes', self.cx)
     invalid_genes = []
     for node in nodes:
         node_id = node['@id']
         cx_db_refs = self.get_aliases(node)
         node_name = node['n']
         up_id = cx_db_refs.get('UP')
         if up_id:
             db_refs = {'UP': up_id, 'TEXT': node_name}
             hgnc_id = uniprot_client.get_hgnc_id(up_id)
             if hgnc_id:
                 db_refs['HGNC'] = hgnc_id
                 gene_name = hgnc_client.get_hgnc_name(hgnc_id)
             else:
                 gene_name = uniprot_client.get_gene_name(up_id)
             agent = Agent(gene_name, db_refs=db_refs)
             self._node_names[node_id] = gene_name
             self._node_agents[node_id] = agent
             continue
         # No UniProt alias: treat the node name as a gene symbol
         self._node_names[node_id] = node_name
         hgnc_id = hgnc_client.get_hgnc_id(node_name)
         db_refs = {'TEXT': node_name}
         if hgnc_id:
             db_refs['HGNC'] = hgnc_id
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             # It's possible that a valid HGNC ID will not have a
             # Uniprot ID, as in the case of HOTAIR (HOX transcript
             # antisense RNA, HGNC:33510)
             if up_id:
                 db_refs['UP'] = up_id
             self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
         else:
             if not self.require_grounding:
                 self._node_agents[node_id] = Agent(node_name,
                                                    db_refs=db_refs)
             invalid_genes.append(node_name)
     if invalid_genes:
         verb = 'Skipped' if self.require_grounding else 'Included'
         logger.info('%s invalid gene symbols: %s' %
                     (verb, ', '.join(invalid_genes)))
Beispiel #45
0
 def _initialize_node_agents(self):
     """Initialize internal dicts containing node information.

     Fills ``self._node_names`` and ``self._node_agents``, both keyed by
     node ID. Nodes with a UniProt alias are named after the gene name
     for that UniProt ID; all other nodes keep their own name and are
     grounded to HGNC/UniProt via that name if possible. Nodes whose name
     is not a valid gene symbol only get an Agent if
     ``self.require_grounding`` is not set.
     """
     nodes = _get_dict_from_list('nodes', self.cx)
     invalid_genes = []
     for node in nodes:
         node_id = node['@id']
         cx_db_refs = self.get_aliases(node)
         up_id = cx_db_refs.get('UP')
         if up_id:
             gene_name = uniprot_client.get_gene_name(up_id)
             db_refs = {'UP': up_id}
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             # Only add an HGNC entry if there is an ID, so that db_refs
             # never carries a None grounding
             if hgnc_id:
                 db_refs['HGNC'] = hgnc_id
             db_refs['TEXT'] = gene_name
             agent = Agent(gene_name, db_refs=db_refs)
             self._node_names[node_id] = gene_name
             self._node_agents[node_id] = agent
             continue
         # No UniProt alias: treat the node name as a gene symbol
         node_name = node['n']
         self._node_names[node_id] = node_name
         hgnc_id = hgnc_client.get_hgnc_id(node_name)
         db_refs = {'TEXT': node_name}
         if not hgnc_id:
             if not self.require_grounding:
                 self._node_agents[node_id] = Agent(node_name,
                                                    db_refs=db_refs)
             invalid_genes.append(node_name)
         else:
             db_refs['HGNC'] = hgnc_id
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             # It's possible that a valid HGNC ID will not have a
             # Uniprot ID, as in the case of HOTAIR (HOX transcript
             # antisense RNA, HGNC:33510)
             if up_id:
                 db_refs['UP'] = up_id
             self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
     if invalid_genes:
         verb = 'Skipped' if self.require_grounding else 'Included'
         logger.info('%s invalid gene symbols: %s' %
                     (verb, ', '.join(invalid_genes)))
Beispiel #46
0
 def get_grounding(self):
     """Return the preferred grounding as a (name space, value) tuple.

     The order of preference is FPLX, HGNC, UP. For HGNC the HGNC name is
     returned as the value rather than the ID. A human UP entry is
     returned as ('HGNC', gene name) if a gene name is found; a non-human
     one as ('UP', ID). If a db_refs entry is a list, its first element
     is used. (None, None) is returned when no grounding can be given.
     """
     import indra.databases.hgnc_client as hgc
     import indra.databases.uniprot_client as upc

     def _first(ref):
         # A db_refs entry may be a list of IDs; take the first in that case
         return ref[0] if isinstance(ref, list) else ref

     fplx_id = self.db_refs.get('FPLX')
     if fplx_id:
         return ('FPLX', fplx_id)
     hgnc_id = self.db_refs.get('HGNC')
     if hgnc_id:
         return ('HGNC', hgc.get_hgnc_name(str(_first(hgnc_id))))
     up_id = self.db_refs.get('UP')
     if up_id:
         up_id = _first(up_id)
         if not upc.is_human(up_id):
             return ('UP', up_id)
         gene_name = upc.get_gene_name(up_id, web_fallback=False)
         if gene_name:
             return ('HGNC', gene_name)
     return (None, None)
Beispiel #47
0
 def _get_name_by_id(self, entity_id):
     entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
     name = entity_term.find("name")
     if name is None:
         warnings.warn('Entity without a name')
         return ''
     try:
         dbid = entity_term.attrib["dbid"]
     except:
         warnings.warn('No grounding information for %s' % name.text)
         return self._get_valid_component_name(name.text)
     dbids = dbid.split('|')
     hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
     up_ids = [i for i in dbids if i.startswith('UP')]
     #TODO: handle protein families like 14-3-3 with IDs like
     # XFAM:PF00244.15, FA:00007
     if hgnc_ids:
         if len(hgnc_ids) > 1:
             warnings.warn('%d HGNC IDs reported.' % len(hgnc_ids))
         hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
         hgnc_name = self._get_hgnc_name(hgnc_id)
         return self._get_valid_component_name(hgnc_name)
     elif up_ids:
         if len(hgnc_ids) > 1:
             warnings.warn('%d UniProt IDs reported.' % len(up_ids))
         up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
         up_rdf = up_client.query_protein(up_id)
         # First try to get HGNC name
         hgnc_name = up_client.get_hgnc_name(up_rdf)
         if hgnc_name is not None:
             return self._get_valid_component_name(hgnc_name)
         # Next, try to get the gene name
         gene_name = up_client.get_gene_name(up_rdf)
         if gene_name is not None:
             return self._get_valid_component_name(gene_name)
     # By default, return the text of the name tag
     name_txt = name.text.strip('|')
     return self._get_valid_component_name(name_txt)
Beispiel #48
0
    def _get_element_name(bpe):
        """Return a name for a BioPAX element.

        For proteins, the HGNC name is used if there is an HGNC ID,
        otherwise the gene name for the UniProt ID. In every other case,
        including a failed name lookup, the element's display name is
        used. Elements that are neither proteins, small molecules nor
        physical entities are additionally reported in the log.
        """
        name = None
        if _is_protein(bpe):
            hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
            uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
            if hgnc_id is not None:
                name = BiopaxProcessor._get_hgnc_name(hgnc_id)
            elif uniprot_id is not None:
                name = uniprot_client.get_gene_name(uniprot_id)
        elif not (_is_small_molecule(bpe) or _is_physical_entity(bpe)):
            logger.info('Unhandled entity type %s' %
                        bpe.getModelInterface().getName())
        # Anything not named above goes by its display name
        if name is None:
            name = bpe.getDisplayName()
        return name
Beispiel #49
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            #logger.warning('Skipping collection Agent.')
            return None

        # Find the name, uid and raw-text tags first and get their text
        # content if available
        uid_tag = ref.find("var/[@name='uid']")
        name_tag = ref.find("var/[@name='name']")
        text_tag = ref.find("var/[@name='raw-text']")
        if name_tag is not None and name_tag.text:
            name = name_tag.text
        else:
            name = None
        if uid_tag is not None and uid_tag.text:
            uid = uid_tag.text
        else:
            uid = None
        if text_tag is not None and text_tag.text:
            raw_text = text_tag.text
        else:
            raw_text = None

        # TODO: factor this out and reuse fix_agents
        db_refs = {}
        # Save raw text if available
        if raw_text:
            db_refs['TEXT'] = raw_text
        agent_name = raw_text
        # If we have a proper UID then we try to reconstruct an Agent from that
        if uid is not None and len(uid.split(':')) == 2:
            db_ns, db_id = uid.split(':')
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                db_refs[db_ns] = db_id
                db_refs['FPLX'] = be_id
                agent_name = be_id
            elif db_ns in ['UP', 'Uniprot']:
                db_refs['UP'] = db_id
                gene_name = uniprot_client.get_gene_name(db_id)
                if gene_name:
                    agent_name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif db_ns == 'NCIT':
                db_refs['NCIT'] = db_id
                target = ncit_map.get(db_id)
                if target:
                    db_refs[target[0]] = target[1]
                    if target[0] == 'HGNC':
                        up_id = hgnc_client.get_uniprot_id(target[1])
                        agent_name = hgnc_client.get_hgnc_name(target[1])
                        if up_id:
                            db_refs['UP'] = up_id
                    elif target[0] == 'UP':
                        agent_name = uniprot_client.get_gene_name(target[1])
                        if agent_name:
                            hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                            if hgnc_id:
                                db_refs['HGNC'] = hgnc_id
            elif db_ns == 'FA':
                db_refs['NXP'] = 'FA:' + db_id
            elif db_ns == 'XFAM':
                db_refs['PF'] = db_id.split('.')[0]
            elif db_ns == 'CHEBI':
                db_refs['CHEBI'] = 'CHEBI:' + db_id
            elif db_ns in ['GO', 'MESH', 'FPLX']:
                db_refs[db_ns] = db_id
            # Handle old BE mappings and add them as FPLX
            elif db_ns == 'BE':
                db_refs['FPLX'] = db_id
            elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
                db_refs[db_ns] = db_id
            else:
                logger.warning('Unknown database name space %s' % db_ns)
        if not agent_name:
            if raw_text is not None:
                agent_name = raw_text
            else:
                return None

        assert(agent_name)

        agent = Agent(agent_name, db_refs=db_refs)
        return agent
def test_get_gene_name_nonhuman():
    """Check the gene name returned for the non-human entry P31938."""
    looked_up = uniprot_client.get_gene_name('P31938')
    # The name keeps its original capitalization
    assert 'Map2k1' == looked_up
    assert unicode_strs(looked_up)
def test_get_gene_name_human():
    """Check the gene name returned for the human entry P00533."""
    looked_up = uniprot_client.get_gene_name('P00533')
    assert 'EGFR' == looked_up
    assert unicode_strs(looked_up)
Beispiel #52
0
 def map_agents(self, stmts, do_rename=True):
     """Return copies of the given statements with agents re-grounded.

     Each statement is deep-copied, and every agent that has a 'TEXT'
     entry in its db_refs is looked up by that text in ``self.gm``:

     - text not in the map: the agent is left untouched;
     - text mapped to None: the whole statement is filtered out;
     - text mapped to a db_refs dict: a copy of that dict replaces the
       agent's db_refs, after the UP / HGNC entries have been made
       consistent (the HGNC gene symbol from the map is replaced by the
       corresponding HGNC ID).

     Parameters
     ----------
     stmts : iterable
         Statements exposing ``agent_list()``; they are not modified.
     do_rename : bool
         If True, the agent name is set to the 'BE' entry if there is
         one, otherwise to the HGNC symbol from the map, otherwise to
         the gene name obtained from UniProt.

     Returns
     -------
     list
         The mapped copies of the statements that were not filtered out.

     Raises
     ------
     ValueError
         If an HGNC symbol in the map has no HGNC ID, if a UniProt ID
         given together with an HGNC symbol has no gene name, or if
         that gene name differs from the HGNC symbol.
     """
     mapped_stmts = []
     num_skipped = 0
     for stmt in stmts:
         # Work on a copy so that the input statements stay untouched
         mapped_stmt = deepcopy(stmt)
         skip_stmt = False
         for agent in mapped_stmt.agent_list():
             if agent is None or agent.db_refs.get('TEXT') is None:
                 continue
             agent_text = agent.db_refs.get('TEXT')
             # Look this string up in the grounding map
             # If not in the map, leave agent alone and continue
             try:
                 map_db_refs = self.gm[agent_text]
             except KeyError:
                 continue
             # If it's in the map but it maps to None, then filter out
             # this statement by skipping it
             if map_db_refs is None:
                 # Increase counter if this statement has not already
                 # been skipped via another agent
                 if not skip_stmt:
                     num_skipped += 1
                 logger.debug("Skipping %s" % agent_text)
                 skip_stmt = True
                 continue
             # Copy the entry so that the grounding map itself is never
             # modified by the standardization below
             map_db_refs = deepcopy(map_db_refs)
             gene_name = None
             up_id = map_db_refs.get('UP')
             hgnc_sym = map_db_refs.get('HGNC')
             if up_id and not hgnc_sym:
                 gene_name = uniprot_client.get_gene_name(up_id, False)
                 if gene_name:
                     hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                     if hgnc_id:
                         map_db_refs['HGNC'] = hgnc_id
             elif hgnc_sym and not up_id:
                 # Override the HGNC symbol entry from the grounding
                 # map with an HGNC ID
                 hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                 if not hgnc_id:
                     raise ValueError('No HGNC ID corresponding to gene '
                                      'symbol %s in grounding map.' %
                                      hgnc_sym)
                 map_db_refs['HGNC'] = hgnc_id
                 # Now get the Uniprot ID for the gene
                 up_id = hgnc_client.get_uniprot_id(hgnc_id)
                 if up_id:
                     map_db_refs['UP'] = up_id
             # If we have both, check the gene symbol ID against the
             # mapping from Uniprot
             elif up_id and hgnc_sym:
                 gene_name = uniprot_client.get_gene_name(up_id)
                 if not gene_name:
                     raise ValueError('No gene name found for Uniprot '
                                      'ID %s (expected %s)' %
                                      (up_id, hgnc_sym))
                 if gene_name != hgnc_sym:
                     raise ValueError('Gene name %s for Uniprot ID '
                                      '%s does not match HGNC '
                                      'symbol %s given in grounding '
                                      'map.' %
                                      (gene_name, up_id, hgnc_sym))
                 hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                 if not hgnc_id:
                     raise ValueError('No HGNC ID '
                                      'corresponding to gene '
                                      'symbol %s in grounding '
                                      'map.' % hgnc_sym)
                 # Store the ID rather than the symbol, as is done in
                 # the other branches, so 'HGNC' always holds an ID
                 map_db_refs['HGNC'] = hgnc_id
             # Assign the DB refs from the grounding map to the agent
             agent.db_refs = map_db_refs
             # Are we renaming right now?
             if do_rename:
                 # If there's a Bioentities ID, prefer that for the name
                 if agent.db_refs.get('BE'):
                     agent.name = agent.db_refs.get('BE')
                 # Get the HGNC symbol or gene name (retrieved above)
                 elif hgnc_sym is not None:
                     agent.name = hgnc_sym
                 elif gene_name is not None:
                     agent.name = gene_name
         # Check if we should skip the statement
         if not skip_stmt:
             mapped_stmts.append(mapped_stmt)
     logger.info('%s statements filtered out' % num_skipped)
     return mapped_stmts
Beispiel #53
0
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        Read keys: 'entityText' (used as the TEXT grounding and default
        name), 'entityId' (a list of dicts, each with 'source' and
        'idString', plus 'entityType' when the source is 'Unk'),
        'charStart' and 'charEnd'.

    Returns
    -------
    tuple
        ``(agent, (charStart, charEnd))``, or ``(None, None)`` if the
        entity carries more than one Entrez or more than one UniProt
        identifier.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    # Ambiguous gene/protein groundings cannot be resolved here, so
    # entities with several Entrez or UniProt IDs are dropped.
    ref_counts = Counter(entry['source'] for entry in
                         entity_info['entityId'])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entity_info['entityId']:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            gene_name = uniprot_client.get_gene_name(id_dict['idString'])
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the one inferred from UniProt here;
                        # refs['HGNC'] is the earlier, Entrez-derived one.
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs['EGID'])
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                # Deliberately best-effort: a malformed mutation string
                # is logged and skipped rather than failing the entity.
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
def test_get_gene_name_no_gene_name():
    """Check that P04434 yields no gene name, with or without web fallback."""
    for use_web in (False, True):
        looked_up = uniprot_client.get_gene_name('P04434',
                                                 web_fallback=use_web)
        assert looked_up is None
def test_get_gene_name_multiple_gene_names():
    """Check which single gene name is returned for the entry Q5VWM5."""
    looked_up = uniprot_client.get_gene_name('Q5VWM5')
    assert 'PRAMEF9' == looked_up
Beispiel #56
0
def get_participant(agent):
    """Build the participant dict describing an agent.

    A missing agent (None) is represented by ``get_generic('protein')``.
    Otherwise the dict holds the agent's text (its 'TEXT' db_ref, or its
    name if there is none), an identifier and entity type derived from
    the HGNC / UP / CHEBI / PFAM-DEF db_refs in that order of preference,
    and optional 'features' / 'not_features' lists built from the
    agent's bound conditions, modifications and mutations.
    """
    if agent is None:
        return get_generic('protein')

    refs = agent.db_refs
    label = refs.get('TEXT')
    participant = {'entity_text': [agent.name if label is None else label]}

    hgnc_id = refs.get('HGNC')
    uniprot_id = refs.get('UP')
    chebi_id = refs.get('CHEBI')
    pfam_def_ids = refs.get('PFAM-DEF')
    # An HGNC grounding wins over a directly given UniProt ID
    if hgnc_id:
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)

    if uniprot_id:
        mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        participant['identifier'] = \
            'PUBCHEM:%s' % chebi_client.get_pubchem_id(chebi_id)
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        # Parse all "NS:ID" members first so that a malformed entry
        # fails before any member is looked up
        members = [(ns, ident) for ns, ident in
                   (chunk.split(':') for chunk in pfam_def_ids.split('|'))]
        for ns, ident in members:
            # TODO: handle non-uniprot protein IDs here
            if ns != 'UP' or not ident:
                continue
            mnemonic = str(uniprot_client.get_mnemonic(ident))
            member_name = uniprot_client.get_gene_name(ident)
            participant['entities'].append({
                'entity_text': ['' if member_name is None else member_name],
                'identifier': 'UNIPROT:%s' % mnemonic,
                'entity_type': 'protein',
                })
    else:
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'

    present = []
    absent = []

    # Binding features
    for bound in agent.bound_conditions:
        binding = {
            'feature_type': 'binding_feature',
            # NOTE: get type and identifier for bound to protein
            'bound_to': {
                'entity_type': 'protein',
                'entity_text': [bound.agent.name],
                'identifier': '',
                },
            }
        (present if bound.is_bound else absent).append(binding)

    # Modification features
    for mod in agent.mods:
        modification = {
            'feature_type': 'modification_feature',
            'modification_type': mod.mod_type.lower(),
            }
        if mod.position is not None:
            modification['location'] = int(mod.position)
        if mod.residue is not None:
            modification['aa_code'] = mod.residue
        (present if mod.is_modified else absent).append(modification)

    # Mutation features are always reported as present
    for mutation in agent.mutations:
        change = {'feature_type': 'mutation_feature'}
        if mutation.residue_from is not None:
            change['from_aa'] = mutation.residue_from
        if mutation.residue_to is not None:
            change['to_aa'] = mutation.residue_to
        if mutation.position is not None:
            change['location'] = int(mutation.position)
        present.append(change)

    # Empty lists are left out of the result altogether
    if present:
        participant['features'] = present
    if absent:
        participant['not_features'] = absent
    return participant
Beispiel #57
0
def _fix_agent(agent):
    if agent is None:
        return
    # First we fix some name spaces
    db_refs_tmp = copy(agent.db_refs)
    for db_ns, db_id in agent.db_refs.items():
        # Change FA name space
        if db_ns == 'FA':
            db_refs_tmp.pop('FA', None)
            db_refs_tmp['NXPFA'] = db_id
        # Change IPR name space
        elif db_ns == 'IPR':
            db_refs_tmp.pop('IPR', None)
            db_refs_tmp['IP'] = db_id
        # Change XFAM name space
        elif db_ns == 'XFAM':
            db_refs_tmp.pop('XFAM', None)
            db_refs_tmp['PF'] = db_id.split('.')[0]
        elif db_ns == 'GO':
            if db_id.startswith('GO:'):
                db_refs_tmp['GO'] = db_id
            else:
                db_refs_tmp['GO'] = 'GO:' + db_id
        # Change PCID name space
        elif db_ns == 'PCID':
            db_refs_tmp.pop('PCID', None)
            db_refs_tmp['PUBCHEM'] = db_id
    agent.db_refs = db_refs_tmp
    # Check if we have a FPLX entry and handle old BE mappings
    if 'BE' in agent.db_refs:
        agent.db_refs['FPLX'] = agent.db_refs.pop('BE')
    be_id = agent.db_refs.get('FPLX')
    # Try to map to FPLX from NXP, IPR, PF, NCIT
    if not be_id:
        for db_ns, db_id in agent.db_refs.items():
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                break
    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]
    # Check what entries we have
    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')
    # FPLX takes precedence if we have it
    if be_id:
        agent.db_refs['FPLX'] = be_id
        agent.name = be_id
    elif hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
    elif up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
        # If it doesn't have a gene name, it's better to just
        # use the raw string name otherwise Sparser sets
        # has Uniprot IDs or mnemonics as the name
        else:
            name = agent.db_refs.get('TEXT', agent.name)
            agent.name = name
def test_get_gene_name_unreviewed():
    """Check the gene name for X6RK18 when the web fallback is disabled."""
    looked_up = uniprot_client.get_gene_name('X6RK18', web_fallback=False)
    assert 'EXO5' == looked_up
    assert unicode_strs(looked_up)
Beispiel #59
0
 def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
     """Make the UP / HGNC entries of map_db_refs consistent and assign it.

     The 'HGNC' entry of ``map_db_refs`` is expected to be a gene symbol;
     it is replaced in place by the corresponding HGNC ID, and a missing
     'UP' or 'HGNC' entry is filled in from the other one where possible.
     The same dict object is then set as ``agent.db_refs``. If
     ``do_rename`` is True the agent is renamed to the 'FPLX' entry, else
     the HGNC symbol, else the gene name obtained from UniProt.

     Raises ValueError if a lone HGNC symbol has no HGNC ID, or if a
     UniProt ID given together with a symbol has no gene name or a gene
     name different from the symbol.
     """
     uniprot_id = map_db_refs.get('UP')
     symbol = map_db_refs.get('HGNC')
     uniprot_gene = None

     if uniprot_id and symbol:
         # Both given: the symbol has to agree with what UniProt says
         uniprot_gene = uniprot_client.get_gene_name(uniprot_id)
         if not uniprot_gene:
             raise ValueError('No gene name found for Uniprot '
                              'ID %s (expected %s)' %
                              (uniprot_id, symbol))
         if uniprot_gene != symbol:
             raise ValueError('Gene name %s for Uniprot ID '
                              '%s does not match HGNC '
                              'symbol %s given in grounding '
                              'map.' %
                              (uniprot_gene, uniprot_id, symbol))
         resolved_id = hgnc_client.get_hgnc_id(symbol)
         if resolved_id:
             map_db_refs['HGNC'] = resolved_id
         else:
             # Only logged in this case; the symbol is left in place
             logger.error('No HGNC ID corresponding to gene '
                          'symbol %s in grounding map.' % symbol)
     elif uniprot_id:
         # Only UniProt given: try to derive the HGNC ID from it
         uniprot_gene = uniprot_client.get_gene_name(uniprot_id, False)
         if uniprot_gene:
             resolved_id = hgnc_client.get_hgnc_id(uniprot_gene)
             if resolved_id:
                 map_db_refs['HGNC'] = resolved_id
     elif symbol:
         # Only a symbol given: it must resolve to an HGNC ID
         resolved_id = hgnc_client.get_hgnc_id(symbol)
         if not resolved_id:
             raise ValueError('No HGNC ID corresponding to gene '
                              'symbol %s in grounding map.' %
                              symbol)
         map_db_refs['HGNC'] = resolved_id
         mapped_up = hgnc_client.get_uniprot_id(resolved_id)
         if mapped_up:
             map_db_refs['UP'] = mapped_up

     # Assign the DB refs from the grounding map to the agent
     agent.db_refs = map_db_refs
     if not do_rename:
         return
     family_id = agent.db_refs.get('FPLX')
     # A FamPlex ID is the preferred name, then the symbol from the map,
     # then the gene name looked up above
     if family_id:
         agent.name = family_id
     elif symbol is not None:
         agent.name = symbol
     elif uniprot_gene is not None:
         agent.name = uniprot_gene