コード例 #1
0
def test_query_protein_deprecated():
    """Check that 'Q8NHX1' can be queried and resolves to 'MAPK3'.

    The gene name is looked up twice: once with the default arguments and
    once with the web fallback explicitly disabled.
    """
    accession = 'Q8NHX1'
    assert uniprot_client.query_protein(accession) is not None
    assert uniprot_client.get_gene_name(accession) == 'MAPK3'
    offline_name = uniprot_client.get_gene_name(accession,
                                                web_fallback=False)
    assert offline_name == 'MAPK3'
コード例 #2
0
ファイル: api.py プロジェクト: indralab/protmapper
    def map_peptide_to_human_ref(prot_id, prot_ns, peptide, site_pos):
        """Map a position inside a peptide onto the given protein.

        Parameters
        ----------
        prot_id : str
            A Uniprot ID or HGNC gene symbol for the protein.
        prot_ns : str
            One of 'uniprot' or 'hgnc' indicating the type of ID given.
        peptide : str
            A string of amino acid symbols representing a peptide.
        site_pos : int
            A site position within the peptide. Note: site_pos is 1-indexed.

        Returns
        -------
        MappedSite
            The MappedSite object gives information on results of mapping the
            site. See :py:class:`protmapper.api.MappedSite` documentation for
            details.

        Raises
        ------
        ValueError
            If ``prot_id`` is None, if ``prot_ns`` is not 'uniprot' or
            'hgnc', or if a Uniprot ID carries an isoform suffix other
            than ``-1``.
        """
        # Validate the identifier and its namespace before any lookup
        if prot_id is None:
            raise ValueError("prot_id must not be None.")
        if prot_ns not in ('uniprot', 'hgnc'):
            raise ValueError("prot_ns must be either 'uniprot' or 'hgnc' (for "
                             "HGNC symbols)")
        if prot_ns == 'uniprot':
            # Only a bare accession or an explicit "-1" suffix is accepted
            id_parts = prot_id.split('-')
            if len(id_parts) != 1 and id_parts[1] != '1':
                raise ValueError("Protein ID passed in appears to be a "
                                 "non-reference isoform: %s" % prot_id)
        # Resolve the Uniprot ID for the protein
        up_id = _get_uniprot_id(prot_id, prot_ns)
        if up_id is None:
            # Resolution is only expected to fail for HGNC symbols; report
            # the failure through the error code rather than raising
            assert prot_ns == 'hgnc' and prot_id is not None
            return MappedSite(None, None, None, None,
                              gene_name=prot_id, error_code='NO_UNIPROT_ID')
        # Look up the gene name, then attempt the peptide mapping
        gene_name = uniprot_client.get_gene_name(up_id)
        mapped_pos = ProtMapper.map_peptide(up_id, peptide, site_pos)
        result = MappedSite(up_id=up_id, valid=None, orig_res=None,
                            orig_pos=None, error_code=None, description=None,
                            gene_name=gene_name)
        # The site counts as valid exactly when a mapped position came back
        result.valid = mapped_pos is not None
        if result.valid:
            result.mapped_id = up_id
            # site_pos is 1-indexed, the peptide string is 0-indexed
            result.mapped_res = peptide[site_pos - 1]
            result.mapped_pos = str(mapped_pos)
        return result
コード例 #3
0
ファイル: processor.py プロジェクト: steppi/indra
 def get_subject(record) -> Optional[Agent]:
     """Build the subject agent for a record keyed by an NCBI gene ID.

     The leading ``len("gene:")`` characters of ``record["id"]`` are dropped
     to obtain the NCBI gene ID, which is converted to a UniProt ID. Returns
     None (after a debug log message) when no UniProt ID is found; otherwise
     returns the result of ``get_standard_agent`` called with the UniProt
     gene name and both identifiers.
     """
     entrez_id = record["id"][len("gene:") :]
     up_id = uniprot_client.get_id_from_entrez(entrez_id)
     if up_id is None:
         logger.debug(f"Could not convert ncbigene:{entrez_id} to UniProt")
         return None
     db_refs = {"EGID": entrez_id, "UP": up_id}
     gene_name = uniprot_client.get_gene_name(up_id)
     return get_standard_agent(gene_name, db_refs)
コード例 #4
0
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Yield one term per NCI PID network, linked to its HGNC genes.

    For every network, the term takes its name from the ``name`` network
    attribute. Each node is resolved to an HGNC gene either directly by its
    name or through its ``uniprot:`` reference; every resolved gene is
    attached to the term with a ``has_part`` relationship.

    :param force: forwarded unchanged to ``iter_networks``.
    """
    hgnc_id_to_name = get_id_name_mapping("hgnc")
    hgnc_name_to_id = {
        symbol: identifier for identifier, symbol in hgnc_id_to_name.items()
    }
    ignored_prefixes = ("CHEBI:", "cas:")
    uniprot_prefix = "uniprot:"

    for uuid, cx in iter_networks(force=force, use_tqdm=True):
        # The last "name" attribute wins if there are several
        network_name = None
        for attribute in iterate_aspect(cx, "networkAttributes"):
            if attribute["n"] == "name":
                network_name = attribute["v"]

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=uuid, name=network_name),
        )

        genes = set()
        for node in iterate_aspect(cx, "nodes"):
            node_name, reference = node["n"], node["r"]

            # Case 1: the node name is itself an HGNC symbol
            hgnc_id = hgnc_name_to_id.get(node_name)
            if hgnc_id:
                genes.add((hgnc_id, node_name))
                continue

            # Case 2: chemicals are not genes, skip them silently
            if reference.startswith(ignored_prefixes):
                continue

            # Case 3: anything that is not a UniProt reference is unmapped
            if not reference.startswith(uniprot_prefix):
                logger.debug(f"unmapped: {node_name}, {reference}")
                continue

            # Case 4: go through UniProt to reach HGNC
            uniprot_id = reference[len(uniprot_prefix):]
            hgnc_id = get_hgnc_id(uniprot_id)
            if hgnc_id is None:  # this only happens for proteins that seem to be virus related
                # TODO reinvestigate this later
                logger.debug(
                    "uniprot could not map %s/%s/%s to HGNC",
                    node_name,
                    reference,
                    get_gene_name(uniprot_id, web_fallback=False),
                )
                continue
            genes.add((hgnc_id, hgnc_id_to_name[hgnc_id]))

        for hgnc_id, hgnc_symbol in genes:
            gene_reference = Reference("hgnc", hgnc_id, hgnc_symbol)
            term.append_relationship(has_part, gene_reference)

        yield term
コード例 #5
0
ファイル: pid.py プロジェクト: ddomingof/pyobo
def iter_terms() -> Iterable[Term]:
    """Yield one term per NCI PID network, linked to its HGNC genes.

    For every network, the term takes its name from the ``name`` network
    attribute. Each node is resolved to an HGNC gene either directly by its
    name or through its ``uniprot:`` reference; every resolved gene is
    attached to the term with a ``pathway_has_part`` relationship.
    """
    hgnc_id_to_name = get_id_name_mapping('hgnc')
    hgnc_name_to_id = {
        symbol: identifier for identifier, symbol in hgnc_id_to_name.items()
    }
    ignored_prefixes = ('CHEBI:', 'cas:')
    uniprot_prefix = 'uniprot:'

    for uuid, cx in iter_networks(use_tqdm=True):
        # The last 'name' attribute wins if there are several
        network_name = None
        for attribute in iterate_aspect(cx, 'networkAttributes'):
            if attribute['n'] == 'name':
                network_name = attribute['v']

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=uuid, name=network_name),
        )

        genes = set()
        for node in iterate_aspect(cx, 'nodes'):
            node_name, reference = node['n'], node['r']

            # Case 1: the node name is itself an HGNC symbol
            hgnc_id = hgnc_name_to_id.get(node_name)
            if hgnc_id:
                genes.add((hgnc_id, node_name))
                continue

            # Case 2: chemicals are not genes, skip them silently
            if reference.startswith(ignored_prefixes):
                continue

            # Case 3: anything that is not a UniProt reference is unmapped
            if not reference.startswith(uniprot_prefix):
                logger.debug(f'unmapped: {node_name}, {reference}')
                continue

            # Case 4: go through UniProt to reach HGNC
            uniprot_id = reference[len(uniprot_prefix):]
            hgnc_id = get_hgnc_id(uniprot_id)
            if hgnc_id is None:  # this only happens for proteins that seem to be virus related
                # TODO reinvestigate this later
                logger.debug(
                    'uniprot could not map %s/%s/%s to HGNC',
                    node_name,
                    reference,
                    get_gene_name(uniprot_id, web_fallback=False),
                )
                continue
            genes.add((hgnc_id, hgnc_id_to_name[hgnc_id]))

        for hgnc_id, hgnc_symbol in genes:
            gene_reference = Reference('hgnc', hgnc_id, hgnc_symbol)
            term.append_relationship(pathway_has_part, gene_reference)

        yield term
コード例 #6
0
def test_get_gene_name_multiple_gene_names():
    """'Q5VWM5' is expected to resolve to the gene name 'PRAMEF9'."""
    assert uniprot_client.get_gene_name('Q5VWM5') == 'PRAMEF9'
コード例 #7
0
def test_get_gene_name_no_gene_name():
    """'P04434' must yield no gene name, without and then with web fallback."""
    for use_web in (False, True):
        looked_up = uniprot_client.get_gene_name('P04434',
                                                 web_fallback=use_web)
        assert looked_up is None
コード例 #8
0
def test_get_gene_name_unreviewed():
    """'X6RK18' resolves to 'EXO5' with the web fallback disabled."""
    assert uniprot_client.get_gene_name('X6RK18', web_fallback=False) == 'EXO5'
コード例 #9
0
def test_get_gene_name_nonhuman():
    """'P31938' is expected to resolve to the gene name 'Map2k1'."""
    assert uniprot_client.get_gene_name('P31938') == 'Map2k1'
コード例 #10
0
def test_get_gene_name_human():
    """'P00533' is expected to resolve to the gene name 'EGFR'."""
    assert uniprot_client.get_gene_name('P00533') == 'EGFR'
コード例 #11
0
def test_get_gene_name_only_protein_name():
    """'P04377' is expected to come back as 'Pseudoazurin'."""
    returned_name = uniprot_client.get_gene_name('P04377')
    assert returned_name == 'Pseudoazurin'
コード例 #12
0
def test_more_gene_names_for_nonhuman():
    """Check the expected gene names of two IDs with web fallback disabled."""
    expected_names = (
        ('P59632', '3a'),
        ('P0DTD2', '9b'),
    )
    for up_id, expected in expected_names:
        looked_up = uniprot_client.get_gene_name(up_id, web_fallback=False)
        assert looked_up == expected
コード例 #13
0
ファイル: pid.py プロジェクト: amanchoudhri/bio2bel
def _get_gene_name(protein_id: str, web_fallback: bool = True):
    """Delegate to ``protmapper.uniprot_client.get_gene_name``.

    The import is deferred to call time, so ``protmapper`` is only needed
    when this function is actually used.

    :param protein_id: identifier passed through as the first argument.
    :param web_fallback: passed through as the ``web_fallback`` keyword.
    :return: whatever the protmapper function returns.
    """
    from protmapper.uniprot_client import get_gene_name as lookup_gene_name

    gene_name = lookup_gene_name(protein_id, web_fallback=web_fallback)
    return gene_name
コード例 #14
0
ファイル: api.py プロジェクト: indralab/protmapper
    def map_to_human_ref(self, prot_id, prot_ns, residue, position,
                         do_methionine_offset=True,
                         do_orthology_mapping=True,
                         do_isoform_mapping=True):
        """Check a site on a protein for validity and look for mappings.

        The site is first checked against the UniProt sequence. If it is
        not valid there, the following sources are consulted in order, and
        the first one that yields a mapping wins: the curated site map, a
        UniProt signal-peptide offset, and finally (only if PhosphoSite
        data is available and the protein is human) alternative isoforms,
        mouse and rat orthologs, and a methionine-cleavage offset of one.

        Parameters
        ----------
        prot_id : str
            A Uniprot ID or HGNC gene symbol for the protein.
        prot_ns : str
            One of 'uniprot' or 'hgnc' indicating the type of ID given.
        residue : str
            Residue to map on the protein to check for validity and map.
        position : str
            Position of the residue to check for validity and map.
        do_methionine_offset : boolean
            Whether to check for off-by-one errors in site position (possibly)
            attributable to site numbering from mature proteins after
            cleavage of the initial methionine. If True, checks the reference
            sequence for a known modification at 1 site position greater
            than the given one; if there exists such a site, creates the
            mapping. Default is True.
        do_orthology_mapping : boolean
            Whether to check sequence positions for known modification sites
            in mouse or rat sequences (based on PhosphoSitePlus data). If a
            mouse/rat site is found that is linked to a site in the human
            reference sequence, a mapping is created. Default is True.
        do_isoform_mapping : boolean
            Whether to check sequence positions for known modifications
            in other human isoforms of the protein (based on PhosphoSitePlus
            data). If a site is found that is linked to a site in the human
            reference sequence, a mapping is created. Default is True.

        Returns
        -------
        MappedSite
            The MappedSite object gives information on results of mapping the
            site. See :py:class:`protmapper.api.MappedSite` documentation for
            details. Lookup failures are reported through the object's
            ``error_code`` ('NO_UNIPROT_ID', 'INVALID_SITE',
            'UNIPROT_HTTP_NOT_FOUND', 'UNIPROT_HTTP_OTHER' or
            'UNIPROT_OTHER') rather than by raising.

        Raises
        ------
        ValueError
            If ``prot_id`` is None or ``prot_ns`` is neither 'uniprot' nor
            'hgnc'.
        """
        # Check the protein ID and namespace
        if prot_id is None:
            raise ValueError("prot_id must not be None.")
        if prot_ns not in ('uniprot', 'hgnc'):
            raise ValueError("prot_ns must be either 'uniprot' or 'hgnc' (for "
                             "HGNC symbols)")
        # Get Uniprot ID and gene name
        up_id = _get_uniprot_id(prot_id, prot_ns)
        # If an HGNC ID was given and the uniprot entry is not found, flag
        # as error
        if up_id is None:
            # Internal invariant: a missing Uniprot ID is only expected when
            # the input was an HGNC symbol
            assert prot_ns == 'hgnc' and prot_id is not None
            return MappedSite(None, None, residue, position,
                              gene_name=prot_id, error_code='NO_UNIPROT_ID')
        # Make sure the sites are proper amino acids/positions
        try:
            # The returned values are not used below; the call is made for
            # the InvalidSiteException it raises on a bad residue/position
            valid_res, valid_pos = _validate_site(residue, position)
        except InvalidSiteException as ex:
            return MappedSite(up_id, None, residue, position,
                              error_code='INVALID_SITE',
                              description=str(ex))
        # Get the gene name from Uniprot
        gene_name = uniprot_client.get_gene_name(up_id, web_fallback=False)
        # The cache is keyed on the caller-supplied residue and position
        site_key = (up_id, residue, position)
        # First, check the cache to potentially avoid a costly sequence
        # lookup
        cached_site = self._cache.get(site_key)
        if cached_site is not None:
            return cached_site
        # If not cached, continue
        # Look up the residue/position in uniprot
        try:
            site_valid = uniprot_client.verify_location(up_id, residue,
                                                        position)
            error_code = None
        except HTTPError as ex:
            if ex.response.status_code == 404:
                error_code = 'UNIPROT_HTTP_NOT_FOUND'
            else:
                error_code = 'UNIPROT_HTTP_OTHER'
        except Exception as ex:
            # Any other failure is logged and reported as a generic error
            error_code = 'UNIPROT_OTHER'
            logger.error(ex)
        if error_code:
            # Set error_code; valid will set to None, not True/False
            # Note: error results are returned without being cached
            mapped_site = MappedSite(up_id, None, residue, position,
                                     error_code=error_code)
            return mapped_site
        # It's a valid site
        if site_valid:
            mapped_site = MappedSite(up_id, True, residue, position,
                                     description='VALID',
                                     gene_name=gene_name)
            self._cache[site_key] = mapped_site
            return mapped_site
        # If it's not a valid site, check the site map first
        curated_site = self.site_map.get(site_key, None)
        # Manually mapped in the site map
        if curated_site is not None:
            mapped_res, mapped_pos, description = curated_site
            mapped_site = MappedSite(up_id, False, residue, position,
                                     mapped_id=up_id,
                                     mapped_res=mapped_res,
                                     mapped_pos=mapped_pos,
                                     description=description,
                                     gene_name=gene_name)
            self._cache[site_key] = mapped_site
            return mapped_site

        # There is no manual mapping, next we try to see if UniProt
        # reports a signal peptide that could be responsible for the position
        # being shifted
        signal_peptide = uniprot_client.get_signal_peptide(up_id, False)
        # If there is valid signal peptide information from UniProt
        if signal_peptide and signal_peptide.begin == 1 and \
                signal_peptide.end is not None:
            # Shift the given position by the end of the signal peptide
            offset_pos = str(int(position) + signal_peptide.end)
            # Check to see if the offset position is known to be phosphorylated
            # Results from get_psp_mapping are not written to self._cache
            # here (any caching inside get_psp_mapping is not visible here)
            mapped_site = self.get_psp_mapping(
                                up_id, up_id, gene_name, residue, position,
                                offset_pos, 'SIGNAL_PEPTIDE_REMOVED')
            if mapped_site:
                return mapped_site
        # ...there's no manually curated site or signal peptide, so do mapping
        # via PhosphoSite if the data is available:
        human_prot = uniprot_client.is_human(up_id)
        if phosphosite_client.has_data():
            # First, look for other entries in phosphosite for this protein
            # where this sequence position is legit (i.e., other isoforms)
            if do_isoform_mapping and up_id and human_prot:
                mapped_site = self.get_psp_mapping(
                        up_id, up_id, gene_name, residue, position, position,
                        'INFERRED_ALTERNATIVE_ISOFORM')
                if mapped_site:
                    return mapped_site
            # Try looking for rat or mouse sites
            if do_orthology_mapping and up_id and human_prot:
                # Get the mouse ID for this protein
                up_mouse = uniprot_client.get_mouse_id(up_id)
                # Get mouse sequence
                mapped_site = self.get_psp_mapping(
                                    up_id, up_mouse, gene_name, residue,
                                    position, position, 'INFERRED_MOUSE_SITE')
                if mapped_site:
                    return mapped_site
                # Try the rat sequence
                up_rat = uniprot_client.get_rat_id(up_id)
                mapped_site = self.get_psp_mapping(
                                    up_id, up_rat, gene_name, residue, position,
                                    position, 'INFERRED_RAT_SITE')
                if mapped_site:
                    return mapped_site
            # Check for methionine offset (off by one)
            if do_methionine_offset and up_id and human_prot:
                offset_pos = str(int(position) + 1)
                mapped_site = self.get_psp_mapping(
                                    up_id, up_id, gene_name, residue, position,
                                    offset_pos, 'INFERRED_METHIONINE_CLEAVAGE')
                if mapped_site:
                    return mapped_site
        # If we've gotten here, the entry is 1) not in the site map, and
        # 2) we either don't have PSP data or no mapping was found using PSP
        mapped_site = MappedSite(up_id, False, residue, position,
                                 description='NO_MAPPING_FOUND',
                                 gene_name=gene_name)
        self._cache[site_key] = mapped_site
        return mapped_site
コード例 #15
0
def test_protein_name_no_ec_code():
    """'P84122' is expected to come back as plain 'Thrombin'."""
    returned_name = uniprot_client.get_gene_name('P84122')
    assert returned_name == 'Thrombin'
コード例 #16
0
def get_graph_from_cx(network_uuid: str, cx: CX) -> BELGraph:  # noqa: C901
    """Get a PID network from NDEx.

    The conversion runs in four passes over the CX aspects:

    1. ``networkAttributes`` supply the graph's name, version and
       description.
    2. ``nodeAttributes`` and ``edgeAttributes`` are indexed by the ID of
       the node/edge they belong to (type, alias, members, citations).
    3. ``nodes`` are translated to lists of PyBEL DSL objects; nodes that
       cannot be translated are logged and skipped.
    4. ``edges`` whose two endpoints were both translated are added to the
       graph, once per (source, target, citation) combination.

    :param network_uuid: identifier of the network; used as the fallback
        ``('ndex', network_uuid)`` citation for edges without citations.
    :param cx: the CX document, read through ``iterate_aspect``.
    :return: the populated BEL graph.
    """
    metadata = {}
    for entry in iterate_aspect(cx, 'networkAttributes'):
        member_name = entry['n']
        if member_name == 'name':
            metadata['name'] = entry['v']
        elif member_name == 'version':
            metadata['version'] = entry['v']
        elif member_name == 'description':
            metadata['description'] = entry['v']

    graph = BELGraph(**metadata)

    # Node attributes, each indexed by the ID of the node they describe
    id_to_type = {}
    id_to_members = {}
    id_to_alias = {}
    # TODO nodeAttributes have list of protein definitions for some things
    for entry in iterate_aspect(cx, 'nodeAttributes'):
        node_id = entry['po']
        member_name = entry['n']
        if member_name == 'type':
            id_to_type[node_id] = entry['v']
        elif member_name == 'alias':
            id_to_alias[node_id] = entry['v']
        elif member_name == 'member':
            id_to_members[node_id] = entry['v']
        else:
            logger.warning(f'unhandled node attribute: {member_name}')

    # Edge ID -> citation values with the leading len('pubmed:') characters
    # dropped
    id_to_citations = {}
    for entry in iterate_aspect(cx, 'edgeAttributes'):
        if entry['n'] == 'citation':
            id_to_citations[entry['po']] = [
                x[len('pubmed:'):] for x in entry['v']
            ]

    # Node ID -> list of DSL objects (a list because a node with members
    # expands to several proteins)
    id_to_dsl = {}
    for node in iterate_aspect(cx, 'nodes'):
        node_id = node['@id']
        reference = node['r']
        # Explicitly curated references take precedence over everything else
        if reference in MAPPING:
            id_to_dsl[node_id] = [MAPPING[reference]]
            continue
        if node_id in id_to_members:
            node_type = id_to_type[node_id]
            members = id_to_members[node_id]
            if node_type != 'proteinfamily':
                logger.warning(
                    f'unhandled node: {node_id} type={node_type} members={members}'
                )

            _rv = []
            for member in members:
                if not member.startswith('hgnc.symbol:'):
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                member_name = member[len('hgnc.symbol:'):]
                member_identifier = hgnc_name_to_id.get(member_name)
                if member_identifier is None:
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                _rv.append(
                    pybel.dsl.Protein(namespace='hgnc',
                                      identifier=member_identifier,
                                      name=member_name))
            id_to_dsl[node_id] = _rv
            continue
        if ':' not in reference:
            logger.warning(f'no curie: {node_id} {reference}')
            UNMAPPED.add(reference)
            continue
        # Split on the first colon only: a plain split(':') fails to unpack
        # into two names when the identifier part contains a colon itself,
        # which would abort the conversion of the whole network
        prefix, identifier = reference.split(':', 1)
        if prefix == 'hprd':
            # nodes.write(f'unhandled hprd:{identifier}')
            continue
        elif prefix == 'cas':
            # nodes.write(f'unhandled cas:{identifier}')
            continue  # not sure what to do with this
        elif prefix == 'CHEBI':
            name = chebi_id_to_name[identifier]
            id_to_dsl[node_id] = [
                pybel.dsl.Abundance(namespace='chebi',
                                    identifier=identifier,
                                    name=name)
            ]
        elif prefix == 'uniprot':
            # Prefer the node's own name when it is a known HGNC symbol,
            # otherwise fall back to the gene name looked up from UniProt
            name = node['n']
            if name not in hgnc_name_to_id:
                name = get_gene_name(identifier)
                if name is None:
                    logger.warning('could not map uniprot to name')
            identifier = hgnc_name_to_id.get(name)
            if identifier is None:
                logger.warning(f'could not map HGNC symbol {name}')
                continue
            id_to_dsl[node_id] = [
                pybel.dsl.Protein(namespace='hgnc',
                                  identifier=identifier,
                                  name=name)
            ]
        else:
            logger.warning(f'unexpected prefix: {prefix}')
            continue

    for edge in iterate_aspect(cx, 'edges'):
        source_id, target_id = edge['s'], edge['t']
        # Skip edges touching a node that could not be translated above
        if source_id not in id_to_dsl or target_id not in id_to_dsl:
            continue
        edge_type = edge['i']
        edge_id = edge['@id']

        sources = id_to_dsl[source_id]
        targets = id_to_dsl[target_id]
        # Without recorded citations, cite the NDEx network itself
        citations = id_to_citations.get(edge_id, [('ndex', network_uuid)])
        for source, target, citation in product(sources, targets, citations):
            if edge_type == 'in-complex-with':
                graph.add_binds(source,
                                target,
                                citation=citation,
                                evidence=edge_id)
            elif edge_type == 'controls-phosphorylation-of':
                graph.add_regulates(
                    source,
                    target.with_variants(pybel.dsl.ProteinModification('Ph')),
                    citation=citation,
                    evidence=edge_id,
                )
            elif edge_type in {
                    'controls-transport-of', 'controls-transport-of-chemical'
            }:
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    # object_modifier=pybel.dsl.translocation(),
                )
            elif edge_type == 'chemical-affects':
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    object_modifier=pybel.dsl.activity(),
                )
            elif edge_type in {
                    'controls-expression-of', 'controls-production-of',
                    'consumption-controlled-by', 'controls-state-change-of',
                    'catalysis-precedes'
            }:
                graph.add_regulates(source,
                                    target,
                                    citation=citation,
                                    evidence=edge_id)
            elif edge_type == 'used-to-produce':
                # Added as a reaction node, without citation or evidence
                graph.add_node_from_data(
                    pybel.dsl.Reaction(
                        reactants=source,
                        products=target,
                    ))
            elif edge_type == 'reacts-with':
                graph.add_binds(source,
                                target,
                                citation=citation,
                                evidence=edge_id)
                # graph.add_node_from_data(pybel.dsl.Reaction(
                #     reactants=[source, target],
                # ))

            else:
                logger.warning(
                    f'unhandled edge type: {source} {edge_type} {target}')

    return graph