def test_get_signal_peptide():
    """Check signal peptide lookup for a local hit and two misses.

    The result for 'P00533' is expected to expose ``begin`` and ``end``
    attributes; the other two identifiers are expected to yield ``None``.
    """
    # This entry is available locally in the resource file
    local_hit = uniprot_client.get_signal_peptide('P00533')
    assert local_hit.begin == 1, local_hit
    assert local_hit.end == 24, local_hit

    # 'P00534' requires a web lookup, and 'Q9H7H1' errors when doing the
    # web lookup; in both cases no signal peptide should be returned
    for up_id in ('P00534', 'Q9H7H1'):
        missing = uniprot_client.get_signal_peptide(up_id)
        assert missing is None, missing
def test_get_signal_peptide():
    """Check signal peptide lookup for a local hit and two misses.

    This variant unpacks the result as a ``(begin, end)`` pair.

    NOTE(review): another definition with this same name is present in
    this source and expects an object with ``begin``/``end`` attributes
    (or ``None``) instead of a pair -- confirm which return shape is the
    current one.
    """
    # This entry is available locally in the resource file
    begin_pos, end_pos = uniprot_client.get_signal_peptide('P00533')
    assert begin_pos == 1, begin_pos
    assert end_pos == 24, end_pos

    # 'P00534' requires a web lookup, and 'Q9H7H1' errors when doing the
    # web lookup; in both cases both positions should come back as None
    for up_id in ('P00534', 'Q9H7H1'):
        begin_pos, end_pos = uniprot_client.get_signal_peptide(up_id)
        assert begin_pos is None, begin_pos
        assert end_pos is None, end_pos
def map_to_human_ref(self, prot_id, prot_ns, residue, position,
                     do_methionine_offset=True,
                     do_orthology_mapping=True,
                     do_isoform_mapping=True):
    """Check a site on a protein for validity and look for a mapping.

    The site is first verified against the UniProt sequence. If it is not
    valid there, the following sources are tried in order and the first
    one yielding a mapping wins: the curated site map, a signal peptide
    offset, and then (only if PhosphoSitePlus data is available) other
    human isoforms, mouse/rat orthologs, and a methionine offset.

    Parameters
    ----------
    prot_id : str
        A Uniprot ID or HGNC gene symbol for the protein.
    prot_ns : str
        One of 'uniprot' or 'hgnc' indicating the type of ID given.
    residue : str
        Residue to map on the protein to check for validity and map.
    position : str
        Position of the residue to check for validity and map.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after
        cleavage of the initial methionine. If True, checks the reference
        sequence for a known modification at 1 site position greater
        than the given one; if there exists such a site, creates the
        mapping. Default is True.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences (based on PhosphoSitePlus data). If a
        mouse/rat site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications
        in other human isoforms of the protein (based on PhosphoSitePlus
        data). If a site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.

    Returns
    -------
    MappedSite
        The MappedSite object gives information on results of mapping the
        site. See :py:class:`protmapper.api.MappedSite` documentation for
        details. On failure the ``error_code`` is one of 'NO_UNIPROT_ID',
        'INVALID_SITE', 'UNIPROT_HTTP_NOT_FOUND', 'UNIPROT_HTTP_OTHER' or
        'UNIPROT_OTHER'. Otherwise the ``description`` is 'VALID', the
        description stored in the curated site map, the description set
        by ``get_psp_mapping``, or 'NO_MAPPING_FOUND'.

    Raises
    ------
    ValueError
        If `prot_id` is None or `prot_ns` is not 'uniprot' or 'hgnc'.
    """
    # Check the protein ID and namespace
    if prot_id is None:
        raise ValueError("prot_id must not be None.")
    if prot_ns not in ('uniprot', 'hgnc'):
        raise ValueError("prot_ns must be either 'uniprot' or 'hgnc' (for "
                         "HGNC symbols)")
    # Get Uniprot ID and gene name
    up_id = _get_uniprot_id(prot_id, prot_ns)
    # If an HGNC ID was given and the uniprot entry is not found, flag
    # as error
    if up_id is None:
        # Internal invariant: a missing ID is only expected for HGNC input
        assert prot_ns == 'hgnc' and prot_id is not None
        return MappedSite(None, None, residue, position,
                          gene_name=prot_id, error_code='NO_UNIPROT_ID')
    # Make sure the sites are proper amino acids/positions. The call is
    # made for its validation effect only; the original residue and
    # position are what get used (and reported) below.
    try:
        _validate_site(residue, position)
    except InvalidSiteException as ex:
        return MappedSite(up_id, None, residue, position,
                          error_code='INVALID_SITE', description=str(ex))
    # Get the gene name from Uniprot
    gene_name = uniprot_client.get_gene_name(up_id, web_fallback=False)
    site_key = (up_id, residue, position)
    # First, check the cache to potentially avoid a costly sequence
    # lookup
    cached_site = self._cache.get(site_key)
    if cached_site is not None:
        return cached_site
    # If not cached, continue
    # Look up the residue/position in uniprot
    error_code = None
    try:
        site_valid = uniprot_client.verify_location(up_id, residue,
                                                    position)
    except HTTPError as ex:
        # The exception is not guaranteed to carry a response object, so
        # read the status code defensively rather than letting the error
        # handler itself raise an AttributeError.
        status_code = getattr(getattr(ex, 'response', None),
                              'status_code', None)
        if status_code == 404:
            error_code = 'UNIPROT_HTTP_NOT_FOUND'
        else:
            error_code = 'UNIPROT_HTTP_OTHER'
    except Exception as ex:
        error_code = 'UNIPROT_OTHER'
        logger.error(ex)
    if error_code:
        # Set error_code; valid will set to None, not True/False.
        # This result is returned without being stored in the cache.
        return MappedSite(up_id, None, residue, position,
                          error_code=error_code)
    # It's a valid site
    if site_valid:
        mapped_site = MappedSite(up_id, True, residue, position,
                                 description='VALID',
                                 gene_name=gene_name)
        self._cache[site_key] = mapped_site
        return mapped_site
    # If it's not a valid site, check the site map first
    curated_site = self.site_map.get(site_key, None)
    # Manually mapped in the site map
    if curated_site is not None:
        mapped_res, mapped_pos, description = curated_site
        mapped_site = MappedSite(up_id, False, residue, position,
                                 mapped_id=up_id,
                                 mapped_res=mapped_res,
                                 mapped_pos=mapped_pos,
                                 description=description,
                                 gene_name=gene_name)
        self._cache[site_key] = mapped_site
        return mapped_site

    # There is no manual mapping, next we try to see if UniProt
    # reports a signal peptide that could be responsible for the position
    # being shifted
    signal_peptide = uniprot_client.get_signal_peptide(up_id, False)
    # If there is valid signal peptide information from UniProt
    if signal_peptide and signal_peptide.begin == 1 and \
            signal_peptide.end is not None:
        # Shift the position by the length of the removed signal peptide
        offset_pos = str(int(position) + signal_peptide.end)
        # Check to see if the offset position is known to be
        # phosphorylated
        mapped_site = self.get_psp_mapping(
            up_id, up_id, gene_name, residue, position,
            offset_pos, 'SIGNAL_PEPTIDE_REMOVED')
        if mapped_site:
            return mapped_site

    # ...there's no manually curated site or signal peptide, so do mapping
    # via PhosphoSite if the data is available:
    human_prot = uniprot_client.is_human(up_id)
    if phosphosite_client.has_data():
        # First, look for other entries in phosphosite for this protein
        # where this sequence position is legit (i.e., other isoforms)
        if do_isoform_mapping and up_id and human_prot:
            mapped_site = self.get_psp_mapping(
                up_id, up_id, gene_name, residue, position,
                position, 'INFERRED_ALTERNATIVE_ISOFORM')
            if mapped_site:
                return mapped_site
        # Try looking for rat or mouse sites
        if do_orthology_mapping and up_id and human_prot:
            # Get the mouse ID for this protein
            up_mouse = uniprot_client.get_mouse_id(up_id)
            # Get mouse sequence
            mapped_site = self.get_psp_mapping(
                up_id, up_mouse, gene_name, residue, position,
                position, 'INFERRED_MOUSE_SITE')
            if mapped_site:
                return mapped_site
            # Try the rat sequence
            up_rat = uniprot_client.get_rat_id(up_id)
            mapped_site = self.get_psp_mapping(
                up_id, up_rat, gene_name, residue, position,
                position, 'INFERRED_RAT_SITE')
            if mapped_site:
                return mapped_site
        # Check for methionine offset (off by one)
        if do_methionine_offset and up_id and human_prot:
            offset_pos = str(int(position) + 1)
            mapped_site = self.get_psp_mapping(
                up_id, up_id, gene_name, residue, position,
                offset_pos, 'INFERRED_METHIONINE_CLEAVAGE')
            if mapped_site:
                return mapped_site
    # If we've gotten here, the entry is 1) not in the site map, and
    # 2) we either don't have PSP data or no mapping was found using PSP
    mapped_site = MappedSite(up_id, False, residue, position,
                             description='NO_MAPPING_FOUND',
                             gene_name=gene_name)
    self._cache[site_key] = mapped_site
    return mapped_site