def _handle_identifier_not_name(*, concept, prefix, identifier) -> bool:
    """Assign ``concept[NAME]`` for a concept that has an identifier but no name.

    :param concept: The concept mapping; it is updated in place.
    :param prefix: The namespace prefix of the concept.
    :param identifier: The identifier of the concept within ``prefix``.
    :return: ``True`` if ``concept[NAME]`` was assigned, otherwise ``False``.
    """
    # Some namespaces are just too much of a problem at the moment to look up
    if prefix in SKIP:
        return False

    if prefix in NO_NAMES:
        # For these prefixes the identifier is copied into the name slot
        resolved_name = concept[IDENTIFIER]
    elif prefix == 'uniprot':
        resolved_name = get_mnemonic(identifier)
    else:
        try:
            names_by_id = get_id_name_mapping(prefix)
        except (NoOboFoundry, MissingOboBuild):
            return False

        if names_by_id is None:
            logger.warning('could not get names for prefix %s', prefix)
            return False

        resolved_name = names_by_id.get(identifier)
        if resolved_name is None:
            logger.warning('could not get name for %s:%s', prefix, identifier)
            return False

    concept[NAME] = resolved_name
    return True
def get_uniprot_id_names(hgnc_id: str) -> Iterable[Tuple[str, str]]:
    """Yield the UniProt identifier/mnemonic pairs for a given HGNC gene.

    :param hgnc_id: The HGNC identifier; it is converted with ``str()``
        before being looked up in ``hgnc_id_to_up``.
    :return: An iterable of ``(uniprot_id, mnemonic)`` pairs. Nothing is
        yielded (and a message is written via ``tqdm.write``) when the
        HGNC identifier is not in the mapping.
    """
    try:
        joined_uniprot_ids = hgnc_id_to_up[str(hgnc_id)]
    except KeyError:
        tqdm.write(f'could not find HGNC:{hgnc_id}')
    else:
        # The mapping value is a single string with ids separated by ', '
        for uniprot_id in joined_uniprot_ids.split(', '):
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            yield uniprot_id, mnemonic
def get_uniprot_id_names(hgnc_id: str) -> Iterable[Tuple[str, str]]:
    """Get all of the UniProt identifiers for a given gene.

    :param hgnc_id: The HGNC identifier; it is converted with ``str()``
        before being looked up in ``hgnc_id_to_up``.
    :return: An iterable of ``(uniprot_id, mnemonic)`` pairs.
    :raises KeyError: If the HGNC identifier is not in ``hgnc_id_to_up``.
        A diagnostic message is printed before the error is re-raised.
    """
    try:
        joined_uniprot_ids = hgnc_id_to_up[str(hgnc_id)]
    except KeyError:
        # Print one example entry so key type/format mismatches are easy to
        # spot. ``next(iter(...), None)`` peeks at the first item without
        # copying the whole mapping, and does not raise on an empty mapping
        # (which would otherwise hide the KeyError behind an IndexError).
        example = next(iter(hgnc_id_to_up.items()), None)
        if example is None:
            print(f'could not find {hgnc_id} ({type(hgnc_id)} in dict. The dict is empty')
        else:
            _k, _v = example
            print(f'could not find {hgnc_id} ({type(hgnc_id)} in dict. Example: {_k} ({type(_k)}), {_v} ({type(_v)})')
        raise

    # The mapping value is a single string with ids separated by ', '
    for _uniprot_id in joined_uniprot_ids.split(', '):
        yield _uniprot_id, uniprot_client.get_mnemonic(_uniprot_id)
def _process_interactor(s: str) -> Optional[Tuple[str, str, Optional[str]]]:
    """Convert a raw interactor string into a ``(prefix, identifier, name)`` triple.

    :param s: The raw interactor string, which starts with a source prefix
        such as ``uniprotkb:``, ``chebi:"CHEBI:``, ``chembl target:``,
        ``intact:`` or ``intenz:``.
    :return: A ``(prefix, identifier, name)`` triple where ``name`` may be
        ``None``, or ``None`` if the string's prefix is not handled. Unhandled
        prefixes are counted in ``_unhandled`` and logged once per string.
    """
    if s.startswith('uniprotkb:'):
        uniprot_id = s[len('uniprotkb:'):]
        try:
            entrez_id = get_entrez_id(uniprot_id)
        except Exception:
            # Any failure in the lookup falls back to the UniProt identifier
            entrez_id = None
        if not entrez_id:
            return 'uniprot', uniprot_id, get_mnemonic(uniprot_id)
        return 'ncbigene', entrez_id, pyobo.get_name('ncbigene', entrez_id)

    if s.startswith('chebi:"CHEBI:'):
        # Drop the prefix and the final character (presumably the closing quote)
        chebi_id = s[len('chebi:"CHEBI:'):-1]
        chebi_name = pyobo.get_name('chebi', chebi_id)
        return 'chebi', chebi_id, chebi_name

    if s.startswith('chembl target:'):
        # NOTE(review): the final character is dropped here as well, although
        # this prefix has no opening quote - confirm against the input data
        chembl_target_id = s[len('chembl target:'):-1]
        return 'chembl.target', chembl_target_id, None

    if s.startswith('intact:'):
        prefix = 'intact'
        identifier = s[len('intact:'):]

        # Prefer a ComplexPortal mapping, then a Reactome mapping
        complexportal_id = _map_complexportal(identifier)
        if complexportal_id is not None:
            return 'complexportal', complexportal_id, None

        reactome_id = _map_reactome(identifier)
        if reactome_id is not None:
            return 'reactome', reactome_id, None

        _unhandled[prefix] += 1
        logger.debug('could not find complexportal/reactome mapping for %s:%s', prefix, identifier)
        return prefix, identifier, None

    if s.startswith('intenz:'):
        return 'eccode', s[len('intenz:'):], None

    # Reference tally kept with this code (presumably prefix counts from a
    # previous run - verify before relying on it):
    #
    # Counter({'chebi': 9534, 'ensembl': 3156, 'refseq': 444,
    #          'ensemblgenomes': 439, 'ddbj/embl/genbank': 204, 'wwpdb': 163,
    #          'matrixdb': 102, 'reactome': 87, 'intenz': 43, 'signor': 15,
    #          'chembl target': 11, 'dip': 4, 'entrezgene/locuslink': 2,
    #          'protein ontology': 2, 'emdb': 2})
    unhandled_prefix = s.split(':')[0]
    _unhandled[unhandled_prefix] += 1
    if s not in _logged_unhandled:
        # Only warn the first time a given string is seen
        logger.warning('unhandled identifier: %s', s)
        _logged_unhandled.add(s)
    return None
def get_name(prefix: str, identifier: str) -> Optional[str]:
    """Look up the name of an entity from its prefix and identifier.

    :param prefix: The namespace prefix of the entity.
    :param identifier: The identifier of the entity within ``prefix``.
    :return: The name if one is found. ``None`` is returned when no
        (non-empty) identifier-to-name mapping is available for the prefix or
        when the mapping has no entry for the primary identifier.
    """
    if prefix == 'uniprot':
        # UniProt names are resolved through protmapper, imported lazily
        from protmapper import uniprot_client
        return uniprot_client.get_mnemonic(identifier)

    try:
        names_by_id = get_id_name_mapping(prefix)
    except NoOboFoundry:
        names_by_id = None

    if names_by_id:
        # Look up by the primary identifier rather than the one given
        return names_by_id.get(get_primary_identifier(prefix, identifier))

    logger.warning('unable to look up names for prefix %s', prefix)
    return None
def _handle_identifier_not_name(
    *,
    concept,
    prefix,
    identifier,
    skip_namespaces: Optional[Collection[str]] = None,
) -> bool:
    """Assign ``concept[NAME]`` for a concept that has an identifier but no name.

    :param concept: The concept mapping; it is updated in place.
    :param prefix: The namespace prefix of the concept.
    :param identifier: The identifier of the concept within ``prefix``.
    :param skip_namespaces: Optional prefixes for which ``True`` is returned
        immediately, without touching ``concept``.
    :return: ``True`` if ``concept[NAME]`` was assigned or the prefix is in
        ``skip_namespaces``; ``False`` otherwise.
    """
    # Some namespaces are just too much of a problem at the moment to look up
    if prefix in SKIP:
        return False

    if skip_namespaces and prefix in skip_namespaces:
        return True

    if prefix in NO_NAMES:
        # For these prefixes the identifier is copied into the name slot
        concept[NAME] = concept[IDENTIFIER]
        return True

    if prefix == 'uniprot':
        concept[NAME] = get_mnemonic(identifier)
        return True

    try:
        names_by_id = pyobo.api.names.get_id_name_mapping(prefix)
    except NoBuild:
        return False

    if names_by_id is None:
        logger.warning('could not get names for prefix "%s"', prefix)
        return False

    found_name = names_by_id.get(identifier)
    if found_name is None:
        logger.warning('could not get name for curie %s:%s', prefix, identifier)
        return False

    concept[NAME] = found_name
    return True
def _handle_name_and_not_identifier(
    *,
    concept,
    prefix,
    name,
    node=None,
    skip_namespaces: Optional[Collection[str]] = None,
) -> bool:
    """Fill in the namespace/identifier of a concept that only has a name.

    :param concept: The concept mapping; it is updated in place.
    :param prefix: The namespace prefix of the concept.
    :param name: The name of the concept within ``prefix``.
    :param node: Optional node mapping; if it has a ``KIND`` entry and the
        prefix is ``bel``, the kind selects the modification mapping to use.
    :param skip_namespaces: Optional prefixes for which ``True`` is returned
        immediately after the remapping check.
    :return: ``True`` if the concept was resolved (or its prefix is in
        ``skip_namespaces``); ``False`` otherwise.
    :raises ValueError: If the prefix is ``bel``, ``node`` has a ``KIND``
        entry, and the kind/name pair is in neither modification mapping.
    """
    # Explicit remappings take precedence over everything else
    new_prefix, new_identifier, new_name = _get_name_remapping(prefix, name)
    if new_prefix:
        concept[NAMESPACE] = new_prefix
        concept[IDENTIFIER] = new_identifier
        concept[NAME] = new_name
        return True

    # Some namespaces are just too much of a problem at the moment to look up
    if prefix in SKIP:
        return False

    if skip_namespaces and prefix in skip_namespaces:
        return True

    concept[NAMESPACE] = prefix

    if prefix in NO_NAMES:
        # For these prefixes the name is copied into the identifier slot
        concept[IDENTIFIER] = name
        return True

    if prefix == 'bel':
        if node is not None and KIND in node:
            kind = node[KIND]
            if kind == PMOD and name in pmod_mappings:
                # the 0th position xref is the preferred one (usually GO)
                bel_target = pmod_mappings[name]['xrefs'][0]
            elif kind == GMOD and name in gmod_mappings:
                bel_target = gmod_mappings[name]['xrefs'][0]
            else:
                raise ValueError(f'invalid kind: {kind}')
        elif name in activity_mapping:
            bel_target = activity_mapping[name]
        elif name in compartment_mapping:
            bel_target = compartment_mapping[name]
        else:
            logger.warning('could not figure out how to map bel ! "%s"', name)
            return False

        concept[NAMESPACE] = bel_target[NAMESPACE]
        concept[IDENTIFIER] = bel_target[IDENTIFIER]
        concept[NAME] = bel_target[NAME]
        return True

    if prefix == 'uniprot':
        # First, treat the given name as a mnemonic and look up its identifier
        uniprot_id = get_id_from_mnemonic(name)
        if uniprot_id is not None:
            concept[IDENTIFIER] = uniprot_id
            return True

        # Otherwise, assume identifier given as name and look up its mnemonic
        mnemonic = get_mnemonic(name, web_fallback=False)
        if mnemonic is not None:
            concept[IDENTIFIER] = name
            concept[NAME] = mnemonic
            return True

        logger.warning('could not interpret uniprot name: "%s"', name)
        return False

    try:
        ids_by_name = pyobo.api.names.get_name_id_mapping(prefix)
    except NoBuild as e:
        logger.warning('could not get namespace %s - %s', prefix, e)
        return False

    if ids_by_name is None:
        logger.warning('unhandled namespace in %s ! %s', prefix, name)
        return False

    found_identifier = ids_by_name.get(name)
    if found_identifier is None:
        logger.warning('could not find name "%s" in namespace "%s"', name, prefix)
        return False

    concept[IDENTIFIER] = found_identifier
    concept[NAME] = name
    return True
def test_get_mnemonic():
    """Check the mnemonic returned by ``uniprot_client`` for ``Q02750``."""
    uniprot_id, expected_mnemonic = 'Q02750', 'MP2K1_HUMAN'
    assert uniprot_client.get_mnemonic(uniprot_id) == expected_mnemonic
def get_psp_mapping(self, orig_id, query_id, gene_name, res, pos,
                    query_pos, mapping_code):
    """Map a site via PhosphoSitePlus, remapping by peptide if needed.

    The site ``(query_id, res, query_pos)`` is passed to
    ``phosphosite_client.map_to_human_site``. If a mapping comes back, the
    mapped ID is checked to be a UniProt ID (by looking up its mnemonic
    without web fallback) and the mapped site is verified against the
    UniProt sequence with ``uniprot_client.verify_location``.

    If the mapped site is valid, the result is described with the
    caller-provided `mapping_code`. If it is not valid, the position is
    recomputed from the PhosphoSitePlus motif with
    ``ProtMapper.map_peptide`` and the result is described with
    `REMAPPED_FROM_PSP_SEQUENCE`.

    Note that no `ISOFORM_SPECIFIC_SITE` code is assigned in this method.

    Only results carrying a mapped site are stored in ``self._cache`` (keyed
    by ``(orig_id, res, pos)``); the error results and ``None`` are returned
    without being cached.

    Parameters
    ----------
    orig_id : str
        Original Uniprot ID of the protein to be mapped.
    query_id : str
        Uniprot ID of the protein being queried for sites. This may differ
        from `orig_id` if the orthologous mouse or rat protein is being
        checked for sites.
    gene_name : str
        Gene name of the protein.
    res : str
        Residue of the site to be mapped.
    pos : str
        Position of the site to be mapped.
    query_pos : str
        Position being queried for a mapping. This differs from `pos` when
        off-by-one (methionine) errors are being checked.
    mapping_code : str
        Mapping code to apply in case of a successful mapping, e.g.
        `INFERRED_ALTERNATIVE_ISOFORM`, `INFERRED_MOUSE_SITE`, etc.

    Returns
    -------
    MappedSite or None
        MappedSite object containing the mapping (or an error code), or None
        if PhosphoSitePlus returned no mapping or the peptide remapping
        failed.
    """
    psp_result = phosphosite_client.map_to_human_site(query_id, res,
                                                      query_pos)
    # Nothing to do if PhosphoSitePlus has no mapping for the queried site
    if psp_result is None:
        return None

    # Remember the position reported by PSP; it is used as-is if it turns
    # out to be valid with respect to the reference sequence
    psp_pos = psp_result.mapped_pos

    # PSP sometimes returns a non-UP ID like NP_001184222 which we want
    # to control for here, we do that by looking up the mnemonic
    mnemonic = uniprot_client.get_mnemonic(psp_result.mapped_id,
                                           web_fallback=False)
    if not mnemonic:
        return MappedSite(orig_id, None, res, pos,
                          error_code='PSP_MAPPED_ID_NOT_UP')

    # At this point the ID is supposed to be valid UP, so check whether the
    # mapped residue is really at the mapped position in the sequence
    error_code = None
    try:
        site_valid = uniprot_client.verify_location(psp_result.mapped_id,
                                                    psp_result.mapped_res,
                                                    psp_result.mapped_pos)
    except HTTPError as ex:
        not_found = ex.response.status_code == 404
        error_code = ('UNIPROT_HTTP_NOT_FOUND' if not_found
                      else 'UNIPROT_HTTP_OTHER')
    except Exception as ex:
        error_code = 'UNIPROT_OTHER'
        logger.error(ex)

    if error_code is not None:
        # Set error_code; valid will set to None, not True/False
        return MappedSite(orig_id, None, res, pos, error_code=error_code)

    if site_valid:
        # The site from PSP is valid in the sequence, so we're done
        final_pos = psp_pos
        description = mapping_code
    else:
        # The site from PSP is invalid, attempt to re-map based on the seq
        zero_based_pos = ProtMapper.map_peptide(orig_id, psp_result.motif,
                                                psp_result.respos)
        # If the re-mapping fails, we give up
        if zero_based_pos is None:
            return None
        # Otherwise, switch to a 1-indexed position given as a string
        final_pos = str(zero_based_pos + 1)
        description = 'REMAPPED_FROM_PSP_SEQUENCE'

    mapped_site = MappedSite(orig_id, False, res, pos,
                             mapped_id=psp_result.mapped_id,
                             mapped_res=psp_result.mapped_res,
                             mapped_pos=final_pos,
                             description=description,
                             gene_name=gene_name)
    self._cache[(orig_id, res, pos)] = mapped_site
    return mapped_site
def _add_my_row(graph: BELGraph, row) -> None:
    """Add the BEL edges described by one row of the source table to the graph.

    One edge is added per PubMed identifier in the row, with that identifier
    as the citation and ``'From intact'`` as the evidence. Rows whose relation
    is not one of the handled values add nothing.

    :param graph: The BEL graph to which edges are added.
    :param row: A mapping-like row indexed by ``'relation'``, ``'source'``,
        ``'target'`` and ``'pubmed_ids'``. The ``'pubmed_ids'`` value is a
        string of identifiers delimited by ``'|'``.
    """
    relation = row['relation']
    source_uniprot_id = row['source']
    target_uniprot_id = row['target']
    pubmed_ids = row['pubmed_ids'].split('|')

    # Look the mnemonics up once; they are reused for both the protein and
    # the RNA nodes instead of being fetched again for every citation
    source_name = get_mnemonic(source_uniprot_id)
    target_name = get_mnemonic(target_uniprot_id)

    source = pybel.dsl.Protein(
        namespace='uniprot',
        identifier=source_uniprot_id,
        name=source_name,
    )
    target = pybel.dsl.Protein(
        namespace='uniprot',
        identifier=target_uniprot_id,
        name=target_name,
    )

    evidence = 'From intact'

    for pubmed_id in pubmed_ids:
        if relation == 'deubiquitination':
            target_ub = target.with_variants(
                pybel.dsl.ProteinModification('Ub'))
            graph.add_decreases(
                source,
                target_ub,
                citation=pubmed_id,
                evidence=evidence,
            )
        # NOTE(review): 'ubiqutination' and 'degratation' look misspelled;
        # they are left unchanged because they are matched against the
        # relation values in the data - confirm against the source table
        elif relation == 'ubiqutination':
            target_ub = target.with_variants(
                pybel.dsl.ProteinModification('Ub'))
            graph.add_increases(
                source,
                target_ub,
                citation=pubmed_id,
                evidence=evidence,
            )
        elif relation == 'degratation':
            graph.add_decreases(
                source,
                target,
                citation=pubmed_id,
                evidence=evidence,
            )
        elif relation == 'activates':
            graph.add_increases(
                source,
                target,
                citation=pubmed_id,
                evidence=evidence,
                object_modifier=pybel.dsl.activity(),
            )
        elif relation == 'co-expressed':
            # Co-expression is stated between the RNAs, not the proteins
            graph.add_correlation(
                pybel.dsl.Rna(
                    namespace='uniprot',
                    identifier=source_uniprot_id,
                    name=source_name,
                ),
                pybel.dsl.Rna(
                    namespace='uniprot',
                    identifier=target_uniprot_id,
                    name=target_name,
                ),
                citation=pubmed_id,
                evidence=evidence,
                annotations=dict(cell_line={'HEK2': True}),
            )