Example 1
def main():
    """Import mappings from ComPath."""
    df = pd.read_csv(URL, sep="\t")
    df = df[df["relation"] == "skos:exactMatch"]
    df = df[~df["source prefix"].isin(BLACKLIST)]
    df = df[~df["target prefix"].isin(BLACKLIST)]
    df["type"] = "manual"
    df["source"] = "orcid:0000-0002-2046-6145"  # ComPath is courtesy of Uncle Daniel

    # TODO check that species are the same

    # Make sure nomenclature is correct
    df["source name"] = [
        name if prefix == "kegg.pathway" else pyobo.get_name(
            prefix, identifier) for prefix, identifier, name in tqdm(df[
                ["source prefix", "source identifier", "source name"]].values)
    ]
    df["target name"] = [
        name if prefix == "kegg.pathway" else pyobo.get_name(
            prefix, identifier) for prefix, identifier, name in tqdm(df[
                ["target prefix", "target identifier", "target name"]].values)
    ]
    df = df.drop_duplicates()
    mappings = (mapping for _, mapping in df.iterrows())
    append_true_mappings(mappings, sort=True)
Example 2
def _process_interactor(s: str) -> Optional[Tuple[str, str, Optional[str]]]:
    if s.startswith('uniprotkb:'):
        uniprot_id = s[len('uniprotkb:'):]
        try:
            ncbigene_id = get_entrez_id(uniprot_id)
        except Exception:
            ncbigene_id = None
        if ncbigene_id:
            return 'ncbigene', ncbigene_id, pyobo.get_name(
                'ncbigene', ncbigene_id)
        return 'uniprot', uniprot_id, get_mnemonic(uniprot_id)
    if s.startswith('chebi:"CHEBI:'):
        chebi_id = s[len('chebi:"CHEBI:'):-1]
        return 'chebi', chebi_id, pyobo.get_name('chebi', chebi_id)
    if s.startswith('chembl target:'):
        return 'chembl.target', s[len('chembl target:'):-1], None
    if s.startswith('intact:'):
        prefix, identifier = 'intact', s[len('intact:'):]

        complexportal_identifier = _map_complexportal(identifier)
        if complexportal_identifier is not None:
            return 'complexportal', complexportal_identifier, None

        reactome_identifier = _map_reactome(identifier)
        if reactome_identifier is not None:
            return 'reactome', reactome_identifier, None

        _unhandled[prefix] += 1
        logger.debug('could not find complexportal/reactome mapping for %s:%s',
                     prefix, identifier)
        return prefix, identifier, None
    if s.startswith('intenz:'):
        return 'eccode', s[len('intenz:'):], None
    """
    Counter({'chebi': 9534,
             'ensembl': 3156,
             'refseq': 444,
             'ensemblgenomes': 439,
             'ddbj/embl/genbank': 204,
             'wwpdb': 163,
             'matrixdb': 102,
             'reactome': 87,
             'intenz': 43,
             'signor': 15,
             'chembl target': 11,
             'dip': 4,
             'entrezgene/locuslink': 2,
             'protein ontology': 2,
             'emdb': 2})
    """
    _unhandled[s.split(':')[0]] += 1
    if s not in _logged_unhandled:
        logger.warning('unhandled identifier: %s', s)
        _logged_unhandled.add(s)
Example 3
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms for KEGG Genome."""
    errors = 0
    for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
        if kegg_genome.identifier in SKIP:
            continue
        term = Term.from_triple(
            prefix=KEGG_GENOME_PREFIX,
            identifier=kegg_genome.identifier,
            name=kegg_genome.name,
        )
        if kegg_genome.taxonomy_id is not None:
            taxonomy_name = pyobo.get_name("ncbitaxon",
                                           kegg_genome.taxonomy_id)
            if taxonomy_name is None:
                errors += 1
                logger.debug(
                    f"[{KEGG_GENOME_PREFIX}] could not find name for taxonomy:{kegg_genome.taxonomy_id}"
                )
            term.append_xref(
                Reference(
                    prefix="ncbitaxon",
                    identifier=kegg_genome.taxonomy_id,
                    name=taxonomy_name,
                ))
        yield term

    logger.info("[%s] unable to find %d taxonomy names in NCBI",
                KEGG_GENOME_PREFIX, errors)
Example 4
def iter_terms() -> Iterable[Term]:
    """Iterate over terms for KEGG Genome."""
    errors = 0
    for kegg_genome in iter_kegg_genomes():
        xrefs = []
        if kegg_genome.taxonomy_id is not None:
            taxonomy_name = pyobo.get_name('ncbitaxon', kegg_genome.taxonomy_id)
            if taxonomy_name is None:
                errors += 1
                tqdm.write(f'could not find name for taxonomy:{kegg_genome.taxonomy_id}')
            xrefs.append(Reference(
                prefix='ncbitaxon',
                identifier=kegg_genome.taxonomy_id,
                name=taxonomy_name,
            ))

        term = Term(
            reference=Reference(
                prefix='kegg.genome',
                identifier=kegg_genome.identifier,
                name=kegg_genome.name,
            ),
            xrefs=xrefs,
        )
        yield term

    logger.info('[%s] unable to find %d taxonomy names in NCBI', KEGG_GENOME_PREFIX, errors)
Example 5
    def get_drug_to_hgnc_symbols(self, cache=True, recalculate=False) -> Dict[str, List[str]]:
        """Get a dictionary of drug names to HGNC gene symbols."""
        if cache and not recalculate and os.path.exists(_dti_symbols_cache_path):
            log.debug('loading cached DTIs with gene symbols')
            with open(_dti_symbols_cache_path) as file:
                return json.load(file)

        drug_to_hgnc_ids = self.get_drug_to_hgnc_ids()
        rv = defaultdict(list)

        for drug, hgnc_ids in drug_to_hgnc_ids.items():
            for hgnc_id in hgnc_ids:
                hgnc_symbol = pyobo.get_name('hgnc', hgnc_id)

                if hgnc_symbol is None:
                    log.warning('could not map HGNC identifier: %s', hgnc_id)
                    continue

                rv[drug].append(hgnc_symbol)

        if cache:
            with open(_dti_symbols_cache_path, 'w') as file:
                log.info('dumping cached DTIs')
                json.dump(rv, file)

        return dict(rv)
Example 6
 def test_already_primary(self, _, __):
     """Test when you give a primary id."""
     primary_id = get_primary_identifier('go', '0003700')
     self.assertIsNotNone(primary_id)
     self.assertEqual('0003700', primary_id)
     name = get_name('go', '0003700')
     self.assertIsNotNone(name)
     self.assertEqual('DNA-binding transcription factor activity', name)
Example 7
 def test_get_primary(self, _, __):
     """Test upgrading an obsolete identifier."""
     primary_id = get_primary_identifier('go', '0001071')
     self.assertIsNotNone(primary_id)
     self.assertEqual('0003700', primary_id)
     name = get_name('go', '0001071')
     self.assertIsNotNone(name)
     self.assertEqual('DNA-binding transcription factor activity', name)
Example 8
 def test_already_primary(self, _, __):
     """Test when you give a primary id."""
     primary_id = get_primary_identifier("go", "0003700")
     self.assertIsNotNone(primary_id)
     self.assertEqual("0003700", primary_id)
     name = get_name("go", "0003700")
     self.assertIsNotNone(name)
     self.assertEqual("DNA-binding transcription factor activity", name)
Example 9
 def test_get_primary(self, _, __):
     """Test upgrading an obsolete identifier."""
     primary_id = get_primary_identifier("go", "0001071")
     self.assertIsNotNone(primary_id)
     self.assertEqual("0003700", primary_id)
     name = get_name("go", "0001071")
     self.assertIsNotNone(name)
     self.assertEqual("DNA-binding transcription factor activity", name)
Example 10
 def set_species(self, identifier: str, name: Optional[str] = None):
     """Append the from_species relation."""
     if name is None:
         import pyobo
         name = pyobo.get_name('ncbitaxon', identifier)
     self.append_relationship(
         from_species,
         Reference(prefix='ncbitaxon', identifier=identifier, name=name))
Example 11
def _enrich_graph_with_df(graph: pybel.BELGraph, df: pd.DataFrame) -> None:
    it = df[['ncbigene_id', 'source_name', 'target_id']].values
    for ncbigene_id, ncbi_name, go_id in it:
        graph.add_association(
            pybel.dsl.Protein('ncbigene', identifier=ncbigene_id, name=ncbi_name),
            pybel.dsl.BiologicalProcess('go', identifier=go_id, name=pyobo.get_name('go', go_id)),
            citation='',
            evidence='',
        )
Example 12
def get_graph_from_cx(network_uuid: str, cx: CX) -> BELGraph:  # noqa: C901
    """Get a PID network from NDEx."""
    metadata = {}
    for entry in iterate_aspect(cx, 'networkAttributes'):
        member_name = entry['n']
        if member_name == 'name':
            metadata['name'] = entry['v']
        elif member_name == 'version':
            metadata['version'] = entry['v']
        elif member_name == 'description':
            metadata['description'] = entry['v']

    graph = BELGraph(**metadata)

    id_to_type = {}
    id_to_members = {}
    id_to_alias = {}
    # TODO nodeAttributes have list of protein definitions for some things
    for entry in iterate_aspect(cx, 'nodeAttributes'):
        node_id = entry['po']
        member_name = entry['n']
        if member_name == 'type':
            id_to_type[node_id] = entry['v']
        elif member_name == 'alias':
            id_to_alias[node_id] = entry['v']
        elif member_name == 'member':
            id_to_members[node_id] = entry['v']
        else:
            logger.warning(f'unhandled node attribute: {member_name}')

    id_to_citations = {}
    for entry in iterate_aspect(cx, 'edgeAttributes'):
        if entry['n'] == 'citation':
            id_to_citations[entry['po']] = [
                x[len('pubmed:'):] for x in entry['v']
            ]

    id_to_dsl = {}
    for node in iterate_aspect(cx, 'nodes'):
        node_id = node['@id']
        reference = node['r']
        if reference in MAPPING:
            id_to_dsl[node_id] = [MAPPING[reference]]
            continue
        if node_id in id_to_members:
            node_type = id_to_type[node_id]
            members = id_to_members[node_id]
            if node_type != 'proteinfamily':
                logger.warning(
                    f'unhandled node: {node_id} type={node_type} members={members}'
                )

            _rv = []
            for member in members:
                if not member.startswith('hgnc.symbol:'):
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                member_name = member[len('hgnc.symbol:'):]
                member_identifier = _get_hgnc_id_from_name(member_name)
                if member_identifier is None:
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                _rv.append(
                    pybel.dsl.Protein(namespace='hgnc',
                                      identifier=member_identifier,
                                      name=member_name))
            id_to_dsl[node_id] = _rv
            continue
        if ':' not in reference:
            logger.warning(f'no curie: {node_id} {reference}')
            UNMAPPED.add(reference)
            continue
        prefix, identifier = reference.split(':')
        if prefix == 'hprd':
            # nodes.write(f'unhandled hprd:{identifier}')
            continue
        elif prefix == 'cas':
            # nodes.write(f'unhandled cas:{identifier}')
            continue  # not sure what to do with this
        elif prefix == 'CHEBI':
            name = get_name('chebi', identifier)
            id_to_dsl[node_id] = [
                pybel.dsl.Abundance(namespace='chebi',
                                    identifier=identifier,
                                    name=name)
            ]
        elif prefix == 'uniprot':
            name = node['n']
            hgnc_id = _get_hgnc_id_from_name(name)
            if hgnc_id:
                name = _get_gene_name(identifier)
                if name is None:
                    logger.warning('could not map uniprot to name')
            if identifier is None:
                logger.warning(f'could not map HGNC symbol {name}')
                continue
            id_to_dsl[node_id] = [
                pybel.dsl.Protein(namespace='hgnc',
                                  identifier=identifier,
                                  name=name)
            ]
        else:
            logger.warning(f'unexpected prefix: {prefix}')
            continue

    for edge in iterate_aspect(cx, 'edges'):
        source_id, target_id = edge['s'], edge['t']
        if source_id not in id_to_dsl or target_id not in id_to_dsl:
            continue
        edge_type = edge['i']
        edge_id = edge['@id']

        sources = id_to_dsl[source_id]
        targets = id_to_dsl[target_id]
        citations = id_to_citations.get(edge_id, [('ndex', network_uuid)])
        for source, target, citation in product(sources, targets, citations):
            if edge_type == 'in-complex-with':
                graph.add_binds(source,
                                target,
                                citation=citation,
                                evidence=edge_id)
            elif edge_type == 'controls-phosphorylation-of':
                graph.add_regulates(
                    source,
                    target.with_variants(pybel.dsl.ProteinModification('Ph')),
                    citation=citation,
                    evidence=edge_id,
                )
            elif edge_type in {
                    'controls-transport-of', 'controls-transport-of-chemical'
            }:
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    # object_modifier=pybel.dsl.translocation(),
                )
            elif edge_type == 'chemical-affects':
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    object_modifier=pybel.dsl.activity(),
                )
            elif edge_type in {
                    'controls-expression-of', 'controls-production-of',
                    'consumption-controlled-by', 'controls-state-change-of',
                    'catalysis-precedes'
            }:
                graph.add_regulates(source,
                                    target,
                                    citation=citation,
                                    evidence=edge_id)
            elif edge_type == 'used-to-produce':
                graph.add_node_from_data(
                    pybel.dsl.Reaction(
                        reactants=source,
                        products=target,
                    ))
            elif edge_type == 'reacts-with':
                graph.add_binds(source,
                                target,
                                citation=citation,
                                evidence=edge_id)
                # graph.add_node_from_data(pybel.dsl.Reaction(
                #     reactants=[source, target],
                # ))

            else:
                logger.warning(
                    f'unhandled edge type: {source} {edge_type} {target}')

    return graph
Example 13
def get_relations_df(use_sub_roles=False) -> pd.DataFrame:
    """Assemble the relations dataframe."""
    xrefs_df = get_xrefs_df()

    logger.info('loading famplex mapping')
    famplex_id_to_members = defaultdict(list)
    famplex_relations_df = pd.read_csv(FAMPLEX_RELATIONS_URL)
    for source_id, source_name, rel, target_db, target_name in famplex_relations_df.values:
        if source_id.lower() == 'hgnc' and rel == 'isa' and target_db.lower() == 'fplx':
            try:
                hgnc_id = hgnc_name_to_id[source_name]
            except KeyError:
                logger.warning(
                    f'Could not find {source_name} for fplx:{target_name}')
                continue
            famplex_id_to_members[target_name].append((hgnc_id, source_name))

    logger.info('getting enzyme classes')
    expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = get_ec2go()

    x = defaultdict(list)
    it = tqdm(
        xrefs_df.values,
        total=len(xrefs_df.index),
        desc='inferring over target hierarchies',
    )
    for source_db, source_id, _, modulation, target_type, target_db, target_id, target_name in it:
        if source_db != 'chebi':
            continue

        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append(
                (modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append(
                    (modulation, 'protein', 'uniprot', uniprot_id,
                     uniprot_name))

        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append(
                (modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(
                    target_id, []):
                x[source_db, source_id].append(
                    (modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append(
                        (modulation, 'protein', 'uniprot', uniprot_id,
                         uniprot_name))

        elif target_db == 'ec-code':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(
                    f'could not find children of {target_db}:{target_id}')
                continue

            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation,
                    target_type,
                    sub_target_db,
                    sub_target_id,
                    sub_target_name,
                ))

            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation,
                    'molecular function',
                    'go',
                    go_id,
                    go_name,
                ))

        else:
            x[source_db, source_id].append(
                (modulation, target_type, target_db, target_id, target_name))

    logger.info('inferring over role hierarchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }

    rows = []
    for (role_db, role_id), entries in tqdm(x.items(), desc='inferring over role hierarchies'):
        sub_role_curies = {(role_db, role_id)}

        if role_db == 'chebi' and use_sub_roles:
            sub_role_curies |= {
                pyobo.normalize_curie(c)
                for c in pyobo.get_subhierarchy(role_db, role_id)
            }

        for modulation, target_type, target_db, target_id, target_name in entries:
            chemical_curies = set(
                itt.chain.from_iterable(
                    db_to_role_to_chemical_curies[sub_role_db].get(
                        sub_role_id, [])
                    for sub_role_db, sub_role_id in sub_role_curies))
            if not chemical_curies:
                logger.debug('no inference for %s:%s', role_db, role_id)
                continue
            for chemical_db, chemical_id in chemical_curies:
                rows.append((
                    chemical_db,
                    chemical_id,
                    pyobo.get_name(chemical_db, chemical_id),
                    modulation,
                    target_type,
                    target_db,
                    target_id,
                    target_name,
                ))
    return pd.DataFrame(rows, columns=XREFS_COLUMNS)
Example 14
 def test_no_alts(self, _, __):
     """Test alternate behavior for nomenclature source with no alts."""
     primary_id = get_primary_identifier('ncbitaxon', '52818')
     self.assertEqual('52818', primary_id)
     self.assertEqual('Allamanda cathartica', get_name('ncbitaxon', '52818'))
Example 15
 def test_no_alts(self, _, __):
     """Test alternate behavior for nomenclature source with no alts."""
     primary_id = get_primary_identifier("ncbitaxon", "52818")
     self.assertEqual("52818", primary_id)
     self.assertEqual("Allamanda cathartica",
                      get_name("ncbitaxon", "52818"))
Example 16
def get_relations_df(use_sub_roles: bool = False, use_inferred: bool = True) -> pd.DataFrame:
    """Assemble the relations dataframe."""
    xrefs_df = get_xrefs_df()
    if not use_inferred:
        return xrefs_df

    famplex_id_to_members = _get_famplex()

    logger.info('getting enzyme classes')
    expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = expasy.get_ec2go()
    logger.info('ec2go has %d elements', len(ec2go))

    rows = list(xrefs_df.values)
    x = defaultdict(list)
    it = tqdm(
        rows,
        total=len(xrefs_df.index),
        desc='inferring over target hierarchies',
    )
    non_chebi_counter = 0
    for source_db, source_id, _source_name, modulation, target_type, target_db, target_id, target_name in it:
        if source_db != 'chebi':
            non_chebi_counter += 1
            continue

        if source_id.startswith(f'{source_db.upper()}:'):
            source_id = source_id[len(source_db) + 1:]
        if target_id.startswith(f'{target_db.upper()}:'):
            target_id = target_id[len(target_db) + 1:]

        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append((modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(target_id, []):
                x[source_db, source_id].append((modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'eccode':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(f'could not find children of {target_db}:{target_id}')
                continue

            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation, target_type, sub_target_db, sub_target_id, sub_target_name,
                ))

            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation, 'molecular function', 'go', go_id, go_name,
                ))

        else:
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))

    logger.info('x mapping: %d/%d', len(x), sum(map(len, x.values())))
    logger.info('skipped %d non-chebi source terms', non_chebi_counter)

    logger.info('inferring over role hierarchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }
    for (role_db, role_id), entries in tqdm(sorted(x.items()), desc='inferring over role hierarchies'):
        sub_role_curies = {(role_db, role_id)}

        if role_db == 'chebi' and use_sub_roles:
            sub_role_curies |= {
                pyobo.normalize_curie(c)
                for c in pyobo.get_subhierarchy(role_db, role_id)
            }

        chemical_curies = set(itt.chain.from_iterable(
            db_to_role_to_chemical_curies[sub_role_db].get(sub_role_id, [])
            for sub_role_db, sub_role_id in sub_role_curies
        ))
        if not chemical_curies:
            tqdm.write(f'no inference for {role_db}:{role_id} ! {pyobo.get_name(role_db, role_id)}')
            continue

        for modulation, target_type, target_db, target_id, target_name in entries:
            for chemical_db, chemical_id in chemical_curies:
                rows.append((
                    chemical_db, chemical_id, pyobo.get_name(chemical_db, chemical_id),
                    modulation, target_type, target_db, target_id, target_name,
                ))

    logger.info('inferred df has %d rows', len(rows))
    rv = pd.DataFrame(rows, columns=XREFS_COLUMNS)
    rv.sort_values(XREFS_COLUMNS, inplace=True)
    rv.drop_duplicates(inplace=True)
    return rv
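
All of the examples above share the same lookup pattern: pyobo.get_name(prefix, identifier) returns the label for an entity, or None when the identifier cannot be resolved, so callers guard against a missing name before using it. A minimal sketch of that pattern, with a purely illustrative CURIE:

import pyobo

# Look up a label by prefix and local identifier; ncbitaxon:9606 is used
# here only for illustration.
name = pyobo.get_name("ncbitaxon", "9606")
if name is None:
    # get_name returns None when the name cannot be resolved, which is why
    # the examples above all check for a missing name before using it.
    print("could not resolve ncbitaxon:9606")
else:
    print(name)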