def main():
    """Import mappings from ComPath.

    Reads the ComPath mappings table from ``URL``, keeps only the
    ``skos:exactMatch`` rows whose source and target prefixes are not in
    ``BLACKLIST``, annotates them as manually curated, refreshes the entity
    names and appends the result to the true mappings.
    """
    df = pd.read_csv(URL, sep="\t")
    keep = (
        (df["relation"] == "skos:exactMatch")
        & ~df["source prefix"].isin(BLACKLIST)
        & ~df["target prefix"].isin(BLACKLIST)
    )
    # Take an explicit copy so the column assignments below operate on an
    # independent frame rather than on a slice of the frame that was read.
    df = df[keep].copy()
    df["type"] = "manual"
    df["source"] = "orcid:0000-0002-2046-6145"  # ComPath is courtesy of Uncle Daniel
    # TODO check that species are the same

    # Make sure nomenclature is correct: names for everything except
    # kegg.pathway are replaced by the name that pyobo reports.
    for side in ("source", "target"):
        name_column = f"{side} name"
        columns = [f"{side} prefix", f"{side} identifier", name_column]
        df[name_column] = [
            name if prefix == "kegg.pathway" else pyobo.get_name(prefix, identifier)
            for prefix, identifier, name in tqdm(df[columns].values)
        ]

    df = df.drop_duplicates()
    mappings = (mapping for _, mapping in df.iterrows())
    append_true_mappings(mappings, sort=True)
def _process_interactor(s: str) -> Optional[Tuple[str, str, Optional[str]]]: if s.startswith('uniprotkb:'): uniprot_id = s[len('uniprotkb:'):] try: ncbigene_id = get_entrez_id(uniprot_id) except Exception: ncbigene_id = None if ncbigene_id: return 'ncbigene', ncbigene_id, pyobo.get_name( 'ncbigene', ncbigene_id) return 'uniprot', uniprot_id, get_mnemonic(uniprot_id) if s.startswith('chebi:"CHEBI:'): chebi_id = s[len('chebi:"CHEBI:'):-1] return 'chebi', chebi_id, pyobo.get_name('chebi', chebi_id) if s.startswith('chembl target:'): return 'chembl.target', s[len('chembl target:'):-1], None if s.startswith('intact:'): prefix, identifier = 'intact', s[len('intact:'):] complexportal_identifier = _map_complexportal(identifier) if complexportal_identifier is not None: return 'complexportal', complexportal_identifier, None reactome_identifier = _map_reactome(identifier) if reactome_identifier is not None: return 'reactome', reactome_identifier, None _unhandled[prefix] += 1 logger.debug('could not find complexportal/reactome mapping for %s:%s', prefix, identifier) return prefix, identifier, None if s.startswith('intenz:'): return 'eccode', s[len('intenz:'):], None """ Counter({'chebi': 9534, 'ensembl': 3156, 'refseq': 444, 'ensemblgenomes': 439, 'ddbj/embl/genbank': 204, 'wwpdb': 163, 'matrixdb': 102, 'reactome': 87, 'intenz': 43, 'signor': 15, 'chembl target': 11, 'dip': 4, 'entrezgene/locuslink': 2, 'protein ontology': 2, 'emdb': 2}) """ _unhandled[s.split(':')[0]] += 1 if s not in _logged_unhandled: logger.warning('unhandled identifier: %s', s) _logged_unhandled.add(s)
def iter_terms(version: str) -> Iterable[Term]:
    """Yield one term per KEGG Genome entry, skipping identifiers in ``SKIP``.

    :param version: The version passed through to :func:`iter_kegg_genomes`
    :yields: Terms that carry an ``ncbitaxon`` cross-reference whenever the
        genome has a taxonomy identifier. The number of taxonomy identifiers
        without a name is logged once iteration is exhausted.
    """
    missing_names = 0
    genomes = iter_kegg_genomes(version=version, desc="KEGG Genomes")
    for genome in genomes:
        if genome.identifier in SKIP:
            continue

        term = Term.from_triple(
            prefix=KEGG_GENOME_PREFIX,
            identifier=genome.identifier,
            name=genome.name,
        )

        taxonomy_id = genome.taxonomy_id
        if taxonomy_id is not None:
            taxonomy_name = pyobo.get_name("ncbitaxon", taxonomy_id)
            if taxonomy_name is None:
                missing_names += 1
                logger.debug(
                    f"[{KEGG_GENOME_PREFIX}] could not find name for taxonomy:{taxonomy_id}"
                )
            # The cross-reference is added even when no name was found
            taxonomy_reference = Reference(
                prefix="ncbitaxon",
                identifier=taxonomy_id,
                name=taxonomy_name,
            )
            term.append_xref(taxonomy_reference)

        yield term

    logger.info(
        "[%s] unable to find %d taxonomy names in NCBI",
        KEGG_GENOME_PREFIX,
        missing_names,
    )
def iter_terms() -> Iterable[Term]:
    """Yield one ``kegg.genome`` term per KEGG Genome entry.

    :yields: Terms whose cross-references contain the ``ncbitaxon`` entry of
        the genome, when it has a taxonomy identifier. The number of taxonomy
        identifiers without a name is logged once iteration is exhausted.
    """
    missing_names = 0
    for genome in iter_kegg_genomes():
        taxonomy_id = genome.taxonomy_id
        if taxonomy_id is None:
            xrefs = []
        else:
            taxonomy_name = pyobo.get_name('ncbitaxon', taxonomy_id)
            if taxonomy_name is None:
                missing_names += 1
                tqdm.write(f'could not find name for taxonomy:{taxonomy_id}')
            # The cross-reference is kept even when no name was found
            xrefs = [
                Reference(
                    prefix='ncbitaxon',
                    identifier=taxonomy_id,
                    name=taxonomy_name,
                ),
            ]

        genome_reference = Reference(
            prefix='kegg.genome',
            identifier=genome.identifier,
            name=genome.name,
        )
        yield Term(reference=genome_reference, xrefs=xrefs)

    logger.info(
        '[%s] unable to find %d taxonomy names in NCBI',
        KEGG_GENOME_PREFIX,
        missing_names,
    )
def get_drug_to_hgnc_symbols(self, cache=True, recalculate=False) -> Dict[str, List[str]]:
    """Build a mapping from each drug to the HGNC symbols of its targets.

    :param cache: If true, read the result from the cache file when it exists
        and write the freshly computed result back to it
    :param recalculate: If true, ignore an existing cache file
    :returns: A dictionary from drug to a list of HGNC gene symbols. HGNC
        identifiers without a symbol are logged and left out.
    """
    use_cached = cache and not recalculate and os.path.exists(_dti_symbols_cache_path)
    if use_cached:
        log.debug('loading cached DTIs with gene symbols')
        with open(_dti_symbols_cache_path) as file:
            return json.load(file)

    rv = defaultdict(list)
    for drug, hgnc_ids in self.get_drug_to_hgnc_ids().items():
        for hgnc_id in hgnc_ids:
            hgnc_symbol = pyobo.get_name('hgnc', hgnc_id)
            if hgnc_symbol is not None:
                rv[drug].append(hgnc_symbol)
            else:
                log.warning('could not map HGNC identifier: %s', hgnc_id)

    if cache:
        with open(_dti_symbols_cache_path, 'w') as file:
            log.info('dumping cached DTIs')
            json.dump(rv, file)

    return dict(rv)
def test_already_primary(self, _, __):
    """Check that a primary identifier maps to itself and resolves to its name."""
    prefix, identifier = 'go', '0003700'

    primary_id = get_primary_identifier(prefix, identifier)
    self.assertIsNotNone(primary_id)
    self.assertEqual('0003700', primary_id)

    name = get_name(prefix, identifier)
    self.assertIsNotNone(name)
    self.assertEqual('DNA-binding transcription factor activity', name)
def test_get_primary(self, _, __):
    """Check that an obsolete identifier is upgraded to its primary identifier."""
    prefix, obsolete_identifier = 'go', '0001071'

    primary_id = get_primary_identifier(prefix, obsolete_identifier)
    self.assertIsNotNone(primary_id)
    self.assertEqual('0003700', primary_id)

    # The name lookup is done with the obsolete identifier as well
    name = get_name(prefix, obsolete_identifier)
    self.assertIsNotNone(name)
    self.assertEqual('DNA-binding transcription factor activity', name)
def test_already_primary(self, _, __):
    """Check that a primary identifier maps to itself and resolves to its name."""
    prefix, identifier = "go", "0003700"

    primary_id = get_primary_identifier(prefix, identifier)
    self.assertIsNotNone(primary_id)
    self.assertEqual("0003700", primary_id)

    name = get_name(prefix, identifier)
    self.assertIsNotNone(name)
    self.assertEqual("DNA-binding transcription factor activity", name)
def test_get_primary(self, _, __):
    """Check that an obsolete identifier is upgraded to its primary identifier."""
    prefix, obsolete_identifier = "go", "0001071"

    primary_id = get_primary_identifier(prefix, obsolete_identifier)
    self.assertIsNotNone(primary_id)
    self.assertEqual("0003700", primary_id)

    # The name lookup is done with the obsolete identifier as well
    name = get_name(prefix, obsolete_identifier)
    self.assertIsNotNone(name)
    self.assertEqual("DNA-binding transcription factor activity", name)
def set_species(self, identifier: str, name: Optional[str] = None):
    """Attach a ``from_species`` relationship pointing at an NCBI Taxonomy entry.

    :param identifier: The ``ncbitaxon`` identifier of the species
    :param name: The name of the species. When omitted, it is looked up
        with :func:`pyobo.get_name`.
    """
    if name is None:
        # Imported at call time, as in the original implementation
        import pyobo

        name = pyobo.get_name('ncbitaxon', identifier)

    species = Reference(prefix='ncbitaxon', identifier=identifier, name=name)
    self.append_relationship(from_species, species)
def _enrich_graph_with_df(graph: pybel.BELGraph, df: pd.DataFrame) -> None:
    """Add one gene-to-GO association to the graph for each row of the dataframe.

    :param graph: The graph that receives the associations
    :param df: A dataframe with ``ncbigene_id``, ``source_name`` and
        ``target_id`` columns
    """
    columns = ['ncbigene_id', 'source_name', 'target_id']
    for ncbigene_id, ncbi_name, go_id in df[columns].values:
        protein = pybel.dsl.Protein('ncbigene', identifier=ncbigene_id, name=ncbi_name)
        process = pybel.dsl.BiologicalProcess(
            'go',
            identifier=go_id,
            name=pyobo.get_name('go', go_id),
        )
        # Citation and evidence are deliberately left as empty strings
        graph.add_association(protein, process, citation='', evidence='')
def get_graph_from_cx(network_uuid: str, cx: CX) -> BELGraph:  # noqa: C901
    """Get a PID network from NDEx.

    The CX document is processed aspect by aspect: network attributes become
    the graph metadata, node attributes and nodes are turned into lists of
    PyBEL DSL entities, and each edge is added to the graph once for every
    combination of source entity, target entity and citation.

    :param network_uuid: The NDEx network identifier. It is only used to
        build the fallback citation for edges that have no citation.
    :param cx: The CX content that is read through ``iterate_aspect``
    :returns: The graph assembled from the nodes and edges that could be handled
    """
    # Only the name, version and description attributes are kept as metadata
    metadata = {}
    for entry in iterate_aspect(cx, 'networkAttributes'):
        member_name = entry['n']
        if member_name == 'name':
            metadata['name'] = entry['v']
        elif member_name == 'version':
            metadata['version'] = entry['v']
        elif member_name == 'description':
            metadata['description'] = entry['v']

    graph = BELGraph(**metadata)

    # Per-node attributes, keyed by the node that the attribute belongs to.
    # NOTE(review): id_to_alias is filled but never read in this function.
    id_to_type = {}
    id_to_members = {}
    id_to_alias = {}
    # TODO nodeAttributes have list of protein definitions for some things
    for entry in iterate_aspect(cx, 'nodeAttributes'):
        node_id = entry['po']
        member_name = entry['n']
        if member_name == 'type':
            id_to_type[node_id] = entry['v']
        elif member_name == 'alias':
            id_to_alias[node_id] = entry['v']
        elif member_name == 'member':
            id_to_members[node_id] = entry['v']
        else:
            logger.warning(f'unhandled node attribute: {member_name}')

    # Edge identifier -> citation values with the first len('pubmed:')
    # characters removed (presumably every value starts with 'pubmed:' - the
    # prefix is not checked)
    id_to_citations = {}
    for entry in iterate_aspect(cx, 'edgeAttributes'):
        if entry['n'] == 'citation':
            id_to_citations[entry['po']] = [
                x[len('pubmed:'):] for x in entry['v']
            ]

    # Node identifier -> list of DSL entities. Nodes that can not be handled
    # get no entry, which makes the edge loop below skip their edges.
    id_to_dsl = {}
    for node in iterate_aspect(cx, 'nodes'):
        node_id = node['@id']
        reference = node['r']
        # A manually curated mapping takes precedence over everything else
        if reference in MAPPING:
            id_to_dsl[node_id] = [MAPPING[reference]]
            continue
        # Nodes with members are expanded into one protein per member
        if node_id in id_to_members:
            # NOTE(review): raises KeyError if a node has members but no type
            node_type = id_to_type[node_id]
            members = id_to_members[node_id]
            if node_type != 'proteinfamily':
                # Only a warning: the members are still processed below
                logger.warning(
                    f'unhandled node: {node_id} type={node_type} members={members}'
                )
            _rv = []
            for member in members:
                if not member.startswith('hgnc.symbol:'):
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                member_name = member[len('hgnc.symbol:'):]
                member_identifier = _get_hgnc_id_from_name(member_name)
                if member_identifier is None:
                    logger.warning(
                        f'unhandled member for node: {node_id} -> {member}')
                    continue
                _rv.append(
                    pybel.dsl.Protein(namespace='hgnc',
                                      identifier=member_identifier,
                                      name=member_name))
            id_to_dsl[node_id] = _rv
            continue
        if ':' not in reference:
            logger.warning(f'no curie: {node_id} {reference}')
            UNMAPPED.add(reference)
            continue
        # NOTE(review): raises ValueError if the reference contains more than
        # one colon
        prefix, identifier = reference.split(':')
        if prefix == 'hprd':
            # nodes.write(f'unhandled hprd:{identifier}')
            continue
        elif prefix == 'cas':
            # nodes.write(f'unhandled cas:{identifier}')
            continue  # not sure what to do with this
        elif prefix == 'CHEBI':
            name = get_name('chebi', identifier)
            id_to_dsl[node_id] = [
                pybel.dsl.Abundance(namespace='chebi',
                                    identifier=identifier,
                                    name=name)
            ]
        elif prefix == 'uniprot':
            # NOTE(review): this branch looks inconsistent and should be
            # checked against the intended behavior. hgnc_id is only used as
            # a condition, identifier comes from str.split() and therefore is
            # never None, and the resulting protein is placed in the 'hgnc'
            # namespace with the identifier taken from the uniprot reference.
            name = node['n']
            hgnc_id = _get_hgnc_id_from_name(name)
            if hgnc_id:
                name = _get_gene_name(identifier)
                if name is None:
                    logger.warning('could not map uniprot to name')
            if identifier is None:
                logger.warning(f'could not map HGNC symbol {name}')
                continue
            id_to_dsl[node_id] = [
                pybel.dsl.Protein(namespace='hgnc',
                                  identifier=identifier,
                                  name=name)
            ]
        else:
            logger.warning(f'unexpected prefix: {prefix}')
            continue

    for edge in iterate_aspect(cx, 'edges'):
        source_id, target_id = edge['s'], edge['t']
        # Skip edges that touch a node that could not be converted
        if source_id not in id_to_dsl or target_id not in id_to_dsl:
            continue
        edge_type = edge['i']
        edge_id = edge['@id']
        sources = id_to_dsl[source_id]
        targets = id_to_dsl[target_id]
        # Without a citation, fall back to a single ('ndex', network_uuid)
        # tuple. NOTE(review): the regular citations are strings while the
        # fallback is a tuple - confirm both are accepted downstream.
        citations = id_to_citations.get(edge_id, [('ndex', network_uuid)])
        # The edge identifier is used as the evidence of every statement
        for source, target, citation in product(sources, targets, citations):
            if edge_type == 'in-complex-with':
                graph.add_binds(source,
                                target,
                                citation=citation,
                                evidence=edge_id)
            elif edge_type == 'controls-phosphorylation-of':
                graph.add_regulates(
                    source,
                    target.with_variants(pybel.dsl.ProteinModification('Ph')),
                    citation=citation,
                    evidence=edge_id,
                )
            elif edge_type in {
                    'controls-transport-of', 'controls-transport-of-chemical'
            }:
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    # object_modifier=pybel.dsl.translocation(),
                )
            elif edge_type == 'chemical-affects':
                graph.add_regulates(
                    source,
                    target,
                    citation=citation,
                    evidence=edge_id,
                    object_modifier=pybel.dsl.activity(),
                )
            elif edge_type in {
                    'controls-expression-of', 'controls-production-of',
                    'consumption-controlled-by', 'controls-state-change-of',
                    'catalysis-precedes'
            }:
                graph.add_regulates(source,
                                    target,
                                    citation=citation,
                                    evidence=edge_id)
            elif edge_type == 'used-to-produce':
                # Added as a reaction node only; no citation or evidence is
                # attached in this case
                graph.add_node_from_data(
                    pybel.dsl.Reaction(
                        reactants=source,
                        products=target,
                    ))
            elif edge_type == 'reacts-with':
                graph.add_binds(source,
                                target,
                                citation=citation,
                                evidence=edge_id)
                # graph.add_node_from_data(pybel.dsl.Reaction(
                #     reactants=[source, target],
                # ))
            else:
                logger.warning(
                    f'unhandled edge type: {source} {edge_type} {target}')
    return graph
def get_relations_df(use_sub_roles=False) -> pd.DataFrame:
    """Assemble the relations dataframe.

    The cross-references whose source is a ChEBI role are first expanded over
    the target hierarchies (HGNC to UniProt, FamPlex to its HGNC members and
    their UniProt entries, enzyme classes to their children and GO terms).
    Afterwards, every expanded relation of a role is assigned to each
    chemical that is listed for that role.

    :param use_sub_roles: If true, the chemicals of the roles in the
        sub-hierarchy of each role are included as well
    :returns: A dataframe with the columns ``XREFS_COLUMNS``
    """
    xrefs_df = get_xrefs_df()

    logger.info('loading famplex mapping')
    famplex_id_to_members = defaultdict(list)
    famplex_relations_df = pd.read_csv(FAMPLEX_RELATIONS_URL)
    # The first column holds the source database, even though it has
    # historically been called source_id here
    for source_id, source_name, rel, target_db, target_name in famplex_relations_df.values:
        if source_id.lower() == 'hgnc' and rel == 'isa' and target_db.lower() == 'fplx':
            try:
                hgnc_id = hgnc_name_to_id[source_name]
            except KeyError:
                logger.warning(f'Could not find {source_name} for fplx:{target_name}')
                continue
            famplex_id_to_members[target_name].append((hgnc_id, source_name))

    logger.info('getting enzyme classes')
    _expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = get_ec2go()

    # (role database, role identifier) -> relations including the inferred ones
    x = defaultdict(list)
    it = tqdm(
        xrefs_df.values,
        total=len(xrefs_df.index),
        desc='inferring over target hierarchies',
    )
    for source_db, source_id, _, modulation, target_type, target_db, target_id, target_name in it:
        if source_db != 'chebi':
            continue

        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append((modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(target_id, []):
                x[source_db, source_id].append((modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'ec-code':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(f'could not find children of {target_db}:{target_id}')
                continue

            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation,
                    target_type,
                    sub_target_db,
                    sub_target_id,
                    sub_target_name,
                ))

            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation,
                    'molecular function',
                    'go',
                    go_id,
                    go_name,
                ))

        else:
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))

    logger.info('inferring over role hiearchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }

    rows = []
    for (role_db, role_id), entries in tqdm(x.items(), desc='inferring over role hierarchies'):
        sub_role_curies = {(role_db, role_id)}
        if role_db == 'chebi' and use_sub_roles:
            sub_role_curies |= {
                pyobo.normalize_curie(c)
                for c in pyobo.get_subhierarchy(role_db, role_id)
            }

        # The chemicals depend on the role only, so they are collected once
        # per role instead of once per relation of that role.
        chemical_curies = set(itt.chain.from_iterable(
            db_to_role_to_chemical_curies[sub_role_db].get(sub_role_id, [])
            for sub_role_db, sub_role_id in sub_role_curies
        ))
        if not chemical_curies:
            logger.debug('no inference for %s:%s', role_db, role_id)
            continue

        # Likewise, resolve the name of each chemical once per role
        chemicals = [
            (chemical_db, chemical_id, pyobo.get_name(chemical_db, chemical_id))
            for chemical_db, chemical_id in chemical_curies
        ]
        for modulation, target_type, target_db, target_id, target_name in entries:
            for chemical_db, chemical_id, chemical_name in chemicals:
                rows.append((
                    chemical_db,
                    chemical_id,
                    chemical_name,
                    modulation,
                    target_type,
                    target_db,
                    target_id,
                    target_name,
                ))

    return pd.DataFrame(rows, columns=XREFS_COLUMNS)
def test_no_alts(self, _, __):
    """Check the lookups for a nomenclature source without alternate identifiers."""
    prefix, identifier = 'ncbitaxon', '52818'

    primary_id = get_primary_identifier(prefix, identifier)
    self.assertEqual('52818', primary_id)

    name = get_name(prefix, identifier)
    self.assertEqual('Allamanda cathartica', name)
def test_no_alts(self, _, __):
    """Check the lookups for a nomenclature source without alternate identifiers."""
    prefix, identifier = "ncbitaxon", "52818"

    primary_id = get_primary_identifier(prefix, identifier)
    self.assertEqual("52818", primary_id)

    name = get_name(prefix, identifier)
    self.assertEqual("Allamanda cathartica", name)
def get_relations_df(use_sub_roles: bool = False, use_inferred: bool = True) -> pd.DataFrame:
    """Assemble the relations dataframe.

    :param use_sub_roles: If true, the chemicals of the roles in the
        sub-hierarchy of each role are included as well
    :param use_inferred: If false, the cross-references dataframe is returned
        as it is, without any inference
    :returns: A sorted, de-duplicated dataframe with the columns
        ``XREFS_COLUMNS`` that contains the original cross-references and
        the inferred relations
    """
    xrefs_df = get_xrefs_df()
    if not use_inferred:
        return xrefs_df

    famplex_id_to_members = _get_famplex()

    logger.info('getting enzyme classes')
    _expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = expasy.get_ec2go()
    logger.info('ec2go has %d elements', len(ec2go))

    # The original rows are kept and the inferred rows are appended to them
    rows = list(xrefs_df.values)

    # (role database, role identifier) -> relations including the inferred ones
    x = defaultdict(list)
    it = tqdm(
        rows,
        total=len(xrefs_df.index),
        desc='inferring over target hierarchies',
    )
    non_chebi_counter = 0
    for source_db, source_id, _source_name, modulation, target_type, target_db, target_id, target_name in it:
        if source_db != 'chebi':
            non_chebi_counter += 1
            continue

        # Strip a redundant upper-cased prefix such as "CHEBI:" from the identifiers
        if source_id.startswith(f'{source_db.upper()}:'):
            source_id = source_id[len(source_db) + 1:]
        if target_id.startswith(f'{target_db.upper()}:'):
            target_id = target_id[len(target_db) + 1:]

        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append((modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(target_id, []):
                x[source_db, source_id].append((modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'eccode':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(f'could not find children of {target_db}:{target_id}')
                continue

            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation,
                    target_type,
                    sub_target_db,
                    sub_target_id,
                    sub_target_name,
                ))

            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation,
                    'molecular function',
                    'go',
                    go_id,
                    go_name,
                ))

        else:
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))

    logger.info('x mapping: %d/%d', len(x), sum(map(len, x.values())))
    logger.info('skipped %d non-chebi source terms', non_chebi_counter)

    logger.info('inferring over role hiearchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }

    for (role_db, role_id), entries in tqdm(sorted(x.items()), desc='inferring over role hierarchies'):
        sub_role_curies = {(role_db, role_id)}
        if role_db == 'chebi' and use_sub_roles:
            sub_role_curies |= {
                pyobo.normalize_curie(c)
                for c in pyobo.get_subhierarchy(role_db, role_id)
            }

        chemical_curies = set(itt.chain.from_iterable(
            db_to_role_to_chemical_curies[sub_role_db].get(sub_role_id, [])
            for sub_role_db, sub_role_id in sub_role_curies
        ))
        if not chemical_curies:
            tqdm.write(f'no inference for {role_db}:{role_id} ! {pyobo.get_name(role_db, role_id)}')
            continue

        # Resolve the name of each chemical once per role instead of once
        # per (relation, chemical) pair.
        chemicals = [
            (chemical_db, chemical_id, pyobo.get_name(chemical_db, chemical_id))
            for chemical_db, chemical_id in chemical_curies
        ]
        for modulation, target_type, target_db, target_id, target_name in entries:
            for chemical_db, chemical_id, chemical_name in chemicals:
                rows.append((
                    chemical_db,
                    chemical_id,
                    chemical_name,
                    modulation,
                    target_type,
                    target_db,
                    target_id,
                    target_name,
                ))

    logger.info('inferred df has %d rows', len(rows))
    rv = pd.DataFrame(rows, columns=XREFS_COLUMNS)
    rv.sort_values(XREFS_COLUMNS, inplace=True)
    rv.drop_duplicates(inplace=True)
    return rv