def test_query_protein_deprecated():
    """Check that ``Q8NHX1`` can be queried and resolves to ``MAPK3``.

    The gene name lookup is exercised both with the default arguments and
    with ``web_fallback=False``; both must give the same symbol.
    """
    accession = 'Q8NHX1'
    expected_symbol = 'MAPK3'

    assert uniprot_client.query_protein(accession) is not None

    default_lookup = uniprot_client.get_gene_name(accession)
    assert default_lookup == expected_symbol

    no_fallback_lookup = uniprot_client.get_gene_name(accession,
                                                      web_fallback=False)
    assert no_fallback_lookup == expected_symbol
def map_peptide_to_human_ref(prot_id, prot_ns, peptide, site_pos):
    """Return a mapped site for a given peptide.

    Parameters
    ----------
    prot_id : str
        A Uniprot ID or HGNC gene symbol for the protein.
    prot_ns : str
        One of 'uniprot' or 'hgnc' indicating the type of ID given.
    peptide : str
        A string of amino acid symbols representing a peptide.
    site_pos : int
        A site position within the peptide. Note: site_pos is 1-indexed.

    Returns
    -------
    MappedSite
        The MappedSite object gives information on results of mapping the
        site. See :py:class:`protmapper.api.MappedSite` documentation for
        details.

    Raises
    ------
    ValueError
        If `prot_id` is None, if `prot_ns` is neither 'uniprot' nor 'hgnc',
        or if a Uniprot ID carries an isoform suffix other than ``-1``.
    """
    # Validate the protein ID and its namespace
    if prot_id is None:
        raise ValueError("prot_id must not be None.")
    if prot_ns not in ('uniprot', 'hgnc'):
        raise ValueError("prot_ns must be either 'uniprot' or 'hgnc' (for "
                         "HGNC symbols)")
    if prot_ns == 'uniprot':
        # Only a bare accession or an explicit "-1" isoform is accepted
        id_parts = prot_id.split('-')
        if len(id_parts) != 1 and id_parts[1] != '1':
            raise ValueError("Protein ID passed in appears to be a "
                             "non-reference isoform: %s" % prot_id)

    # Resolve the Uniprot ID; a missing one is reported as an error site
    up_id = _get_uniprot_id(prot_id, prot_ns)
    if up_id is None:
        assert prot_ns == 'hgnc' and prot_id is not None
        return MappedSite(None, None, None, None, gene_name=prot_id,
                          error_code='NO_UNIPROT_ID')

    # Look up the gene name, then the position of the site
    gene_name = uniprot_client.get_gene_name(up_id)
    mapped_pos = ProtMapper.map_peptide(up_id, peptide, site_pos)

    site = MappedSite(up_id=up_id, valid=None, orig_res=None, orig_pos=None,
                      error_code=None, description=None, gene_name=gene_name)
    if mapped_pos is None:
        site.valid = False
        return site

    site.valid = True
    site.mapped_id = up_id
    # site_pos is 1-indexed, so shift by one to index into the peptide
    site.mapped_res = peptide[site_pos - 1]
    site.mapped_pos = str(mapped_pos)
    return site
def get_subject(record) -> Optional[Agent]:
    """Build the subject agent for a record keyed by an Entrez gene ID.

    The first ``len("gene:")`` characters of ``record["id"]`` are dropped
    (the prefix itself is not checked) and the remainder is converted to a
    UniProt ID. Returns None, after a debug log message, when no UniProt ID
    is available.
    """
    prefix_length = len("gene:")
    ncbigene_id = record["id"][prefix_length:]

    uniprot_id = uniprot_client.get_id_from_entrez(ncbigene_id)
    if uniprot_id is not None:
        name = uniprot_client.get_gene_name(uniprot_id)
        db_refs = {
            "EGID": ncbigene_id,
            "UP": uniprot_id,
        }
        return get_standard_agent(name, db_refs)

    logger.debug(f"Could not convert ncbigene:{ncbigene_id} to UniProt")
    return None
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over NCI PID terms.

    One term is yielded per network. Each term gets a ``has_part``
    relationship to every HGNC gene found among the network's nodes, either
    directly by node name or through the node's ``uniprot:`` reference.
    """
    symbol_by_id = get_id_name_mapping("hgnc")
    id_by_symbol = {symbol: hgnc_id for hgnc_id, symbol in symbol_by_id.items()}

    for uuid, cx in iter_networks(force=force, use_tqdm=True):
        # The last "name" network attribute wins; None if there is none
        network_name = None
        for attribute in iterate_aspect(cx, "networkAttributes"):
            if attribute["n"] == "name":
                network_name = attribute["v"]

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=uuid, name=network_name),
        )

        members = set()
        for node in iterate_aspect(cx, "nodes"):
            label, curie = node["n"], node["r"]

            direct_id = id_by_symbol.get(label)
            if direct_id:
                members.add((direct_id, label))
                continue

            # Chemicals are not gene members of the pathway
            if curie.startswith(("CHEBI:", "cas:")):
                continue

            if not curie.startswith("uniprot:"):
                logger.debug(f"unmapped: {label}, {curie}")
                continue

            uniprot_id = curie[len("uniprot:"):]
            mapped_id = get_hgnc_id(uniprot_id)
            if mapped_id is None:
                # this only happens for proteins that seem to be virus related
                # TODO reinvestigate this later
                logger.debug(
                    "uniprot could not map %s/%s/%s to HGNC",
                    label,
                    curie,
                    get_gene_name(uniprot_id, web_fallback=False),
                )
                continue

            members.add((mapped_id, symbol_by_id[mapped_id]))

        for hgnc_id, hgnc_symbol in members:
            term.append_relationship(has_part, Reference("hgnc", hgnc_id, hgnc_symbol))
        yield term
def iter_terms() -> Iterable[Term]:
    """Iterate over NCI PID terms.

    One term is yielded per network. Each term gets a ``pathway_has_part``
    relationship to every HGNC gene found among the network's nodes, either
    directly by node name or through the node's ``uniprot:`` reference.
    """
    uniprot_prefix = 'uniprot:'
    symbol_by_id = get_id_name_mapping('hgnc')
    id_by_symbol = {symbol: hgnc_id for hgnc_id, symbol in symbol_by_id.items()}

    for uuid, cx in iter_networks(use_tqdm=True):
        # The last 'name' network attribute wins; None if there is none
        network_name = None
        for attribute in iterate_aspect(cx, 'networkAttributes'):
            if attribute['n'] == 'name':
                network_name = attribute['v']

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=uuid, name=network_name),
        )

        members = set()
        for node in iterate_aspect(cx, 'nodes'):
            label, curie = node['n'], node['r']

            direct_id = id_by_symbol.get(label)
            if direct_id:
                members.add((direct_id, label))
                continue

            # Chemicals are not gene members of the pathway
            if curie.startswith(('CHEBI:', 'cas:')):
                continue

            if not curie.startswith(uniprot_prefix):
                logger.debug(f'unmapped: {label}, {curie}')
                continue

            uniprot_id = curie[len(uniprot_prefix):]
            mapped_id = get_hgnc_id(uniprot_id)
            if mapped_id is None:
                # this only happens for proteins that seem to be virus related
                # TODO reinvestigate this later
                logger.debug(
                    'uniprot could not map %s/%s/%s to HGNC',
                    label,
                    curie,
                    get_gene_name(uniprot_id, web_fallback=False),
                )
                continue

            members.add((mapped_id, symbol_by_id[mapped_id]))

        for hgnc_id, hgnc_symbol in members:
            term.append_relationship(pathway_has_part, Reference('hgnc', hgnc_id, hgnc_symbol))
        yield term
def test_get_gene_name_multiple_gene_names():
    """``Q5VWM5`` should resolve to the single gene name ``PRAMEF9``."""
    assert uniprot_client.get_gene_name('Q5VWM5') == 'PRAMEF9'
def test_get_gene_name_no_gene_name():
    """``P04434`` should yield None with the web fallback off and then on."""
    for use_web in (False, True):
        found = uniprot_client.get_gene_name('P04434', web_fallback=use_web)
        assert found is None
def test_get_gene_name_unreviewed():
    """``X6RK18`` should resolve to ``EXO5`` without the web fallback."""
    assert uniprot_client.get_gene_name('X6RK18', web_fallback=False) == 'EXO5'
def test_get_gene_name_nonhuman():
    """``P31938`` should resolve to the gene name ``Map2k1``."""
    assert uniprot_client.get_gene_name('P31938') == 'Map2k1'
def test_get_gene_name_human():
    """``P00533`` should resolve to the gene name ``EGFR``."""
    assert uniprot_client.get_gene_name('P00533') == 'EGFR'
def test_get_gene_name_only_protein_name():
    """``P04377`` should resolve to ``Pseudoazurin``."""
    resolved = uniprot_client.get_gene_name('P04377')
    assert resolved == 'Pseudoazurin'
def test_more_gene_names_for_nonhuman():
    """Check gene names for ``P59632`` and ``P0DTD2`` without web fallback."""
    expected_by_accession = (
        ('P59632', '3a'),
        ('P0DTD2', '9b'),
    )
    for accession, expected in expected_by_accession:
        found = uniprot_client.get_gene_name(accession, web_fallback=False)
        assert found == expected
def _get_gene_name(protein_id: str, web_fallback: bool = True):
    """Forward to ``protmapper.uniprot_client.get_gene_name``.

    The protmapper import is deferred to call time rather than done at
    module import.
    """
    from protmapper import uniprot_client

    return uniprot_client.get_gene_name(protein_id, web_fallback=web_fallback)
def map_to_human_ref(self, prot_id, prot_ns, residue, position,
                     do_methionine_offset=True,
                     do_orthology_mapping=True,
                     do_isoform_mapping=True):
    """Check a site on a protein for validity and look for mappings.

    The site is first looked up in the instance cache, then verified
    against Uniprot. If it is not valid there, the following sources are
    tried in order, and the first one producing a mapping wins: the curated
    site map, a signal peptide offset, and then (only if PhosphoSite data is
    available) alternative isoforms, the mouse ortholog, the rat ortholog
    and a methionine (off-by-one) offset.

    Parameters
    ----------
    prot_id : str
        A Uniprot ID or HGNC gene symbol for the protein.
    prot_ns : str
        One of 'uniprot' or 'hgnc' indicating the type of ID given.
    residue : str
        Residue to map on the protein to check for validity and map.
    position : str
        Position of the residue to check for validity and map. It is
        converted with ``int()`` when offsets are applied.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after
        cleavage of the initial methionine. If True, checks the reference
        sequence for a known modification at 1 site position greater
        than the given one; if there exists such a site, creates the
        mapping. Default is True.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences (based on PhosphoSitePlus data). If a
        mouse/rat site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications
        in other human isoforms of the protein (based on PhosphoSitePlus
        data). If a site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.

    Returns
    -------
    MappedSite
        The MappedSite object gives information on results of mapping the
        site. See :py:class:`protmapper.api.MappedSite` documentation for
        details. Failures are reported through its ``error_code``
        ('NO_UNIPROT_ID', 'INVALID_SITE', 'UNIPROT_HTTP_NOT_FOUND',
        'UNIPROT_HTTP_OTHER' or 'UNIPROT_OTHER').

    Raises
    ------
    ValueError
        If `prot_id` is None or `prot_ns` is neither 'uniprot' nor 'hgnc'.
    """
    # Check the protein ID and namespace
    if prot_id is None:
        raise ValueError("prot_id must not be None.")
    if prot_ns not in ('uniprot', 'hgnc'):
        raise ValueError("prot_ns must be either 'uniprot' or 'hgnc' (for "
                         "HGNC symbols)")
    # Get Uniprot ID and gene name
    up_id = _get_uniprot_id(prot_id, prot_ns)
    # If an HGNC ID was given and the uniprot entry is not found, flag
    # as error
    if up_id is None:
        assert prot_ns == 'hgnc' and prot_id is not None
        return MappedSite(None, None, residue, position,
                          gene_name=prot_id, error_code='NO_UNIPROT_ID')
    # Make sure the sites are proper amino acids/positions
    # NOTE(review): valid_res and valid_pos are not used below; the call
    # only serves to raise InvalidSiteException for bad input.
    try:
        valid_res, valid_pos = _validate_site(residue, position)
    except InvalidSiteException as ex:
        return MappedSite(up_id, None, residue, position,
                          error_code='INVALID_SITE',
                          description=str(ex))
    # Get the gene name from Uniprot
    gene_name = uniprot_client.get_gene_name(up_id, web_fallback=False)
    # The cache is keyed on the site as given by the caller
    site_key = (up_id, residue, position)
    # First, check the cache to potentially avoid a costly sequence
    # lookup
    cached_site = self._cache.get(site_key)
    if cached_site is not None:
        return cached_site
    # If not cached, continue
    # Look up the residue/position in uniprot
    try:
        site_valid = uniprot_client.verify_location(up_id, residue,
                                                    position)
        error_code = None
    except HTTPError as ex:
        if ex.response.status_code == 404:
            error_code = 'UNIPROT_HTTP_NOT_FOUND'
        else:
            error_code = 'UNIPROT_HTTP_OTHER'
    except Exception as ex:
        error_code = 'UNIPROT_OTHER'
        logger.error(ex)
    if error_code:
        # Set error_code; valid will set to None, not True/False
        # Error results are returned without being stored in the cache
        mapped_site = MappedSite(up_id, None, residue, position,
                                 error_code=error_code)
        return mapped_site
    # It's a valid site
    if site_valid:
        mapped_site = MappedSite(up_id, True, residue, position,
                                 description='VALID',
                                 gene_name=gene_name)
        self._cache[site_key] = mapped_site
        return mapped_site
    # If it's not a valid site, check the site map first
    curated_site = self.site_map.get(site_key, None)
    # Manually mapped in the site map
    if curated_site is not None:
        mapped_res, mapped_pos, description = curated_site
        mapped_site = MappedSite(up_id, False, residue, position,
                                 mapped_id=up_id,
                                 mapped_res=mapped_res,
                                 mapped_pos=mapped_pos,
                                 description=description,
                                 gene_name=gene_name)
        self._cache[site_key] = mapped_site
        return mapped_site
    # There is no manual mapping, next we try to see if UniProt
    # reports a signal peptide that could be responsible for the position
    # being shifted
    signal_peptide = uniprot_client.get_signal_peptide(up_id, False)
    # If there is valid signal peptide information from UniProt
    if signal_peptide and signal_peptide.begin == 1 and \
            signal_peptide.end is not None:
        # Shift the position by the end position of the signal peptide
        offset_pos = str(int(position) + signal_peptide.end)
        # Check to see if the offset position is known to be phosphorylated
        mapped_site = self.get_psp_mapping(
            up_id, up_id, gene_name, residue, position, offset_pos,
            'SIGNAL_PEPTIDE_REMOVED')
        if mapped_site:
            return mapped_site
    # ...there's no manually curated site or signal peptide, so do mapping
    # via PhosphoSite if the data is available:
    human_prot = uniprot_client.is_human(up_id)
    if phosphosite_client.has_data():
        # First, look for other entries in phosphosite for this protein
        # where this sequence position is legit (i.e., other isoforms)
        if do_isoform_mapping and up_id and human_prot:
            mapped_site = self.get_psp_mapping(
                up_id, up_id, gene_name, residue, position, position,
                'INFERRED_ALTERNATIVE_ISOFORM')
            if mapped_site:
                return mapped_site
        # Try looking for rat or mouse sites
        if do_orthology_mapping and up_id and human_prot:
            # Get the mouse ID for this protein
            up_mouse = uniprot_client.get_mouse_id(up_id)
            # Get mouse sequence
            mapped_site = self.get_psp_mapping(
                up_id, up_mouse, gene_name, residue, position, position,
                'INFERRED_MOUSE_SITE')
            if mapped_site:
                return mapped_site
            # Try the rat sequence
            up_rat = uniprot_client.get_rat_id(up_id)
            mapped_site = self.get_psp_mapping(
                up_id, up_rat, gene_name, residue, position, position,
                'INFERRED_RAT_SITE')
            if mapped_site:
                return mapped_site
        # Check for methionine offset (off by one)
        if do_methionine_offset and up_id and human_prot:
            offset_pos = str(int(position) + 1)
            mapped_site = self.get_psp_mapping(
                up_id, up_id, gene_name, residue, position, offset_pos,
                'INFERRED_METHIONINE_CLEAVAGE')
            if mapped_site:
                return mapped_site
    # If we've gotten here, the entry is 1) not in the site map, and
    # 2) we either don't have PSP data or no mapping was found using PSP
    mapped_site = MappedSite(up_id, False, residue, position,
                             description='NO_MAPPING_FOUND',
                             gene_name=gene_name)
    self._cache[site_key] = mapped_site
    return mapped_site
def test_protein_name_no_ec_code():
    """``P84122`` should resolve to ``Thrombin``."""
    resolved = uniprot_client.get_gene_name('P84122')
    assert resolved == 'Thrombin'
def get_graph_from_cx(network_uuid: str, cx: CX) -> BELGraph:  # noqa: C901
    """Get a PID network from NDEx.

    The CX document is read aspect by aspect: network attributes become the
    graph metadata, nodes are translated to PyBEL DSL objects, and edges
    between translated nodes are added to the graph according to their
    interaction type. Nodes that cannot be translated are skipped (usually
    with a warning), and so are the edges touching them.

    :param network_uuid: The NDEx UUID of the network. It is used as an
        ``('ndex', network_uuid)`` citation for edges that have no
        ``citation`` edge attribute.
    :param cx: The CX content of the network.
    :return: A BEL graph built from the network.
    """
    metadata = {}
    for entry in iterate_aspect(cx, 'networkAttributes'):
        member_name = entry['n']
        if member_name in ('name', 'version', 'description'):
            metadata[member_name] = entry['v']

    graph = BELGraph(**metadata)

    id_to_type = {}
    id_to_members = {}
    id_to_alias = {}
    # TODO nodeAttributes have list of protein definitions for some things
    for entry in iterate_aspect(cx, 'nodeAttributes'):
        node_id = entry['po']
        member_name = entry['n']
        if member_name == 'type':
            id_to_type[node_id] = entry['v']
        elif member_name == 'alias':
            # Collected but not used further in this function
            id_to_alias[node_id] = entry['v']
        elif member_name == 'member':
            id_to_members[node_id] = entry['v']
        else:
            logger.warning(f'unhandled node attribute: {member_name}')

    # Edge ID -> list of citation values with the 'pubmed:' prefix cut off
    id_to_citations = {}
    for entry in iterate_aspect(cx, 'edgeAttributes'):
        if entry['n'] == 'citation':
            id_to_citations[entry['po']] = [
                x[len('pubmed:'):]
                for x in entry['v']
            ]

    # Node ID -> list of DSL objects (several for nodes with members)
    id_to_dsl = {}
    for node in iterate_aspect(cx, 'nodes'):
        node_id = node['@id']
        reference = node['r']
        if reference in MAPPING:
            id_to_dsl[node_id] = [MAPPING[reference]]
            continue

        if node_id in id_to_members:
            # A node may carry members without a type attribute; treat the
            # type as unknown (None) instead of failing with a KeyError.
            node_type = id_to_type.get(node_id)
            members = id_to_members[node_id]
            if node_type != 'proteinfamily':
                logger.warning(
                    f'unhandled node: {node_id} type={node_type} members={members}'
                )

            _rv = []
            for member in members:
                if not member.startswith('hgnc.symbol:'):
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                member_name = member[len('hgnc.symbol:'):]
                member_identifier = hgnc_name_to_id.get(member_name)
                if member_identifier is None:
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                _rv.append(
                    pybel.dsl.Protein(namespace='hgnc',
                                      identifier=member_identifier,
                                      name=member_name))
            id_to_dsl[node_id] = _rv
            continue

        if ':' not in reference:
            logger.warning(f'no curie: {node_id} {reference}')
            UNMAPPED.add(reference)
            continue

        # Split on the first colon only, so an identifier that itself
        # contains a colon does not raise on unpacking.
        prefix, identifier = reference.split(':', 1)
        if prefix in ('hprd', 'cas'):
            # Not handled yet; not sure what to do with these
            continue
        elif prefix == 'CHEBI':
            name = chebi_id_to_name[identifier]
            id_to_dsl[node_id] = [
                pybel.dsl.Abundance(namespace='chebi',
                                    identifier=identifier,
                                    name=name)
            ]
        elif prefix == 'uniprot':
            name = node['n']
            if name not in hgnc_name_to_id:
                name = get_gene_name(identifier)
                if name is None:
                    # Without a name there is nothing to look up in HGNC
                    logger.warning('could not map uniprot to name')
                    continue
            identifier = hgnc_name_to_id.get(name)
            if identifier is None:
                logger.warning(f'could not map HGNC symbol {name}')
                continue
            id_to_dsl[node_id] = [
                pybel.dsl.Protein(namespace='hgnc',
                                  identifier=identifier,
                                  name=name)
            ]
        else:
            logger.warning(f'unexpected prefix: {prefix}')
            continue

    for edge in iterate_aspect(cx, 'edges'):
        source_id, target_id = edge['s'], edge['t']
        # Edges touching a node that could not be translated are dropped
        if source_id not in id_to_dsl or target_id not in id_to_dsl:
            continue
        edge_type = edge['i']
        edge_id = edge['@id']

        sources = id_to_dsl[source_id]
        targets = id_to_dsl[target_id]
        citations = id_to_citations.get(edge_id, [('ndex', network_uuid)])
        # One BEL edge per (source, target, citation) combination
        for source, target, citation in product(sources, targets, citations):
            if edge_type == 'in-complex-with':
                graph.add_binds(source, target, citation=citation,
                                evidence=edge_id)
            elif edge_type == 'controls-phosphorylation-of':
                graph.add_regulates(
                    source,
                    target.with_variants(pybel.dsl.ProteinModification('Ph')),
                    citation=citation,
                    evidence=edge_id,
                )
            elif edge_type in {
                'controls-transport-of', 'controls-transport-of-chemical'
            }:
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    # object_modifier=pybel.dsl.translocation(),
                )
            elif edge_type == 'chemical-affects':
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    object_modifier=pybel.dsl.activity(),
                )
            elif edge_type in {
                'controls-expression-of', 'controls-production-of',
                'consumption-controlled-by', 'controls-state-change-of',
                'catalysis-precedes'
            }:
                graph.add_regulates(source, target, citation=citation,
                                    evidence=edge_id)
            elif edge_type == 'used-to-produce':
                graph.add_node_from_data(
                    pybel.dsl.Reaction(
                        reactants=source,
                        products=target,
                    ))
            elif edge_type == 'reacts-with':
                graph.add_binds(source, target, citation=citation,
                                evidence=edge_id)
                # graph.add_node_from_data(pybel.dsl.Reaction(
                #     reactants=[source, target],
                # ))
            else:
                logger.warning(
                    f'unhandled edge type: {source} {edge_type} {target}')

    return graph