def update_from_obo_library(
    cls,
    prefix: str,
    extension: str = "owl",
    **kwargs,
):
    prefix = prefix.lower()
    cache_path = get_resource_path(f"{prefix}.{extension}.pkl")
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as file:
            ontology = pickle.load(file)
    else:
        try:
            import pronto
        except ImportError:
            raise ImportError(
                "To use the INDRA OWL Client, you must first "
                "install Pronto with `pip install pronto`."
            )
        ontology = pronto.Ontology.from_obo_library(
            f"{prefix.upper()}.{extension}")
        with open(cache_path, "wb") as file:
            pickle.dump(ontology, file, protocol=pickle.HIGHEST_PROTOCOL)
    cls.update_resource(prefix=prefix, ontology=ontology, **kwargs)

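# Usage sketch (hedged: the enclosing class is not shown in this excerpt;
# "OwlClient" is a hypothetical name for it). The first call fetches the
# ontology from the OBO Library via pronto and pickles it; later calls load
# the cached pickle instead of downloading:
#
#     >>> OwlClient.update_from_obo_library("doid")
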
def _process_categories():
    """Collect protein category labels from multiple sources."""
    idg_df = pandas.read_csv('IDG_target_final.csv')
    tf_df = pandas.read_csv(get_resource_path('transcription_factors.csv'))
    pp_df = pandas.read_csv(get_resource_path('phosphatases.tsv'),
                            sep='\t', header=None)
    categories = {}
    # IDG targets: keyed by gene symbol, value is the IDG family
    for _, row in idg_df.iterrows():
        categories[row['gene']] = row['idgFamily']
    # Transcription factors: the symbol is in the second column
    for _, row in tf_df.iterrows():
        categories[row[1]] = 'Transcription factor'
    # Phosphatases: the symbol is in the first column (no header)
    for _, row in pp_df.iterrows():
        categories[row[0]] = 'Phosphatase'
    return categories

def _read_famplex_map():
    fname = get_resource_path('famplex_map.tsv')
    raw_map = read_unicode_csv(fname, '\t')
    m = {}
    for row in raw_map:
        m[(row[0], row[1])] = row[2]
    return m

def add_famplex_nodes(self):
    nodes = []
    for row in read_unicode_csv(get_resource_path(
            os.path.join('famplex', 'entities.csv')), delimiter=','):
        entity = row[0]
        nodes.append((self.label('FPLX', entity),
                      {'name': entity}))
    self.add_nodes_from(nodes)

def _build_chebi_map():
    fname = get_resource_path('bel_chebi_map.tsv')
    chebi_name_id = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        chebi_name = row[0]
        chebi_id = row[1]
        chebi_name_id[chebi_name] = chebi_id
    return chebi_name_id

def _build_famplex_map():
    fname = get_resource_path('famplex_map.tsv')
    bel_to_indra = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        namespace = row[0]
        entry = row[1]
        indra_name = row[2]
        if namespace == 'BEL':
            bel_to_indra[entry] = indra_name
    return bel_to_indra

def add_famplex_hierarchy(self):
    from indra.databases import hgnc_client
    edges = []
    for row in read_unicode_csv(get_resource_path(
            os.path.join('famplex', 'relations.csv')), delimiter=','):
        ns1, id1, rel, ns2, id2 = row
        # FamPlex relations reference HGNC symbols; convert to HGNC IDs
        if ns1 == 'HGNC':
            id1 = hgnc_client.get_hgnc_id(id1)
        edges.append((self.label(ns1, id1),
                      self.label(ns2, id2),
                      {'type': rel}))
    self.add_edges_from(edges)

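# Illustration with a hypothetical relations.csv row: a line like
# "HGNC,MAPK1,isa,FPLX,MAPK" first has its HGNC symbol converted to the
# corresponding HGNC ID, then yields an edge
# (self.label('HGNC', <id>), self.label('FPLX', 'MAPK'), {'type': 'isa'}).
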
def add_biomappings(self):
    biomappings_tsv = get_resource_path('biomappings.tsv')
    edges = []
    # Add each curated cross-reference in both directions
    for source_ns, source_id, _, target_ns, target_id, _ in \
            read_unicode_csv(biomappings_tsv, delimiter='\t'):
        edges.append((self.label(source_ns, source_id),
                      self.label(target_ns, target_id),
                      {'type': 'xref', 'source': 'biomappings'}))
        edges.append((self.label(target_ns, target_id),
                      self.label(source_ns, source_id),
                      {'type': 'xref', 'source': 'biomappings'}))
    self.add_edges_from(edges)

def read_selventa_resources():
    fname = get_resource_path('selventa_entries.tsv')
    csv_rows = read_unicode_csv(fname)
    selventa_lookup = {}
    # namespace, name -> ID, xrefs
    for namespace, sid, name, xrefs_str in csv_rows:
        if xrefs_str:
            xrefs_dict = {
                x.split(':', 1)[0]: x.split(':', 1)[1]
                for x in xrefs_str.split('|')
            }
        else:
            xrefs_dict = {}
        selventa_lookup[(namespace, name)] = (sid, xrefs_dict)
    return selventa_lookup

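# Worked example of the xref parsing above, using a hypothetical xrefs_str
# "CHEBI:CHEBI:15365|MESH:D001241": splitting each '|'-delimited entry on
# the first ':' only gives {'CHEBI': 'CHEBI:15365', 'MESH': 'D001241'}, so
# identifiers that themselves contain colons survive intact.
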
def add_lspci(self):
    lspci = read_unicode_csv(get_resource_path('lspci.tsv'),
                             delimiter='\t')
    nodes_to_add = []
    edges_to_add = []
    # Skip the header row
    next(lspci)
    for lspcid, name, members_str in lspci:
        label = self.label('LSPCI', lspcid)
        nodes_to_add.append((label,
                             {'name': name, 'type': 'small_molecule'}))
        members = [member.split(':', maxsplit=1)
                   for member in members_str.split('|')]
        edges_to_add += [(self.label(*member), label, {'type': 'isa'})
                         for member in members]
    self.add_nodes_from(nodes_to_add)
    self.add_edges_from(edges_to_add)

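# Row format sketch for lspci.tsv (hypothetical values): a row like
# ("18", "aspirin", "CHEBI:15365|CHEMBL:CHEMBL25") adds one LSPCI node and,
# for each '|'-delimited ns:id member, one 'isa' edge pointing to that node.
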
def update_resource(
    cls,
    directory,
    url,
    prefix,
    *args,
    remove_prefix=False,
    allowed_synonyms=None,
    allowed_external_ns=None,
    force: bool = False,
):
    """Write the OBO information to files in the given directory."""
    resource_path = get_resource_path(f'{prefix}.json')
    obo_path = os.path.join(directory, f'{prefix}.obo.pkl')
    if os.path.exists(obo_path) and not force:
        with open(obo_path, 'rb') as file:
            g = pickle.load(file)
    else:
        g = obonet.read_obo(url)
        with open(obo_path, 'wb') as file:
            pickle.dump(g, file)
    entries = OboClient.entries_from_graph(
        g,
        prefix=prefix,
        remove_prefix=remove_prefix,
        allowed_synonyms=allowed_synonyms,
        allowed_external_ns=allowed_external_ns,
    )
    entries = prune_standard(entries)

    def sort_key(x):
        val = x['id']
        if not remove_prefix:
            val = val.split(':')[1]
        try:
            val = int(val)
        except ValueError:
            pass
        return val

    entries = sorted(entries, key=sort_key)
    with open(resource_path, 'w') as file:
        json.dump(entries, file, indent=1, sort_keys=True)

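# Note on sort_key above: with the prefix retained, an entry such as
# {'id': 'GO:0000123'} (hypothetical) sorts by the integer 123; IDs whose
# local part is not numeric are left as strings and sort lexically.
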
def add_famplex_xrefs(self):
    edges = []
    include_refs = {'PF', 'IP', 'GO', 'NCIT', 'ECCODE', 'HGNC_GROUP',
                    'MESH'}
    for row in read_unicode_csv(get_resource_path('famplex_map.tsv'),
                                delimiter='\t'):
        ref_ns, ref_id, fplx_id = row
        if ref_ns not in include_refs:
            continue
        edges.append((self.label(ref_ns, ref_id),
                      self.label('FPLX', fplx_id),
                      {'type': 'xref', 'source': 'fplx'}))
        # We avoid FPLX->MESH mappings in this direction due to
        # species-specificity issues
        if ref_ns != 'MESH':
            edges.append((self.label('FPLX', fplx_id),
                          self.label(ref_ns, ref_id),
                          {'type': 'xref', 'source': 'fplx'}))
    self.add_edges_from(edges)

def update_by_prefix(
    cls,
    prefix: str,
    include_relations: bool = False,
    predicate: Optional[Callable[["pyobo.Term"], bool]] = None,
    indra_prefix: Optional[str] = None,
):
    """Update the JSON data by looking up the ontology through PyOBO."""
    import pyobo
    terms = iter(pyobo.get_ontology(prefix))
    if predicate:
        terms = filter(predicate, terms)
    terms = sorted(terms, key=attrgetter("identifier"))
    entries = [
        {
            'id': term.identifier,
            'name': term.name,
            'synonyms': [synonym.name for synonym in term.synonyms],
            'xrefs': [
                dict(namespace=xref.prefix, id=xref.identifier)
                for xref in term.xrefs
            ],
            'alt_ids': [alt_id.identifier for alt_id in term.alt_ids],
            'relations': _get_pyobo_rels(
                term,
                include_relations=include_relations,
            ),
        }
        for term in terms
    ]
    entries = prune_standard(entries)
    indra_prefix = prefix if not indra_prefix else indra_prefix
    resource_path = get_resource_path(f'{indra_prefix}.json')
    with open(resource_path, 'w') as file:
        json.dump(entries, fp=file, indent=1, sort_keys=True)

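# Usage sketch (hedged: the enclosing class is not shown in this excerpt;
# "PyOboClient" is assumed here, and the predicate is hypothetical).
# Restricting the update to a subset of terms might look like:
#
#     >>> PyOboClient.update_by_prefix(
#     ...     "efo",
#     ...     predicate=lambda term: term.identifier.startswith("00"),
#     ... )
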
def update_resource(
    cls,
    prefix: str,
    ontology: "pronto.Ontology",
    skip_obsolete: bool = True,
    remove_prefix: bool = False,
):
    prefix = prefix.lower()
    entries = cls.entries_from_ontology(
        prefix=prefix,
        ontology=ontology,
        skip_obsolete=skip_obsolete,
        remove_prefix=remove_prefix,
    )
    entries = prune_empty_entries(
        entries,
        {"synonyms", "xrefs", "alt_ids", "relations"},
    )
    entries = sorted(
        entries,
        key=itemgetter("id") if remove_prefix else _id_key,
    )
    resource_path = get_resource_path(f"{prefix}.json")
    with open(resource_path, "w") as file:
        json.dump(entries, file, indent=1, sort_keys=True)

def _read_hgnc_maps():
    hgnc_file = get_resource_path("hgnc_entries.tsv")
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8')
    hgnc_names = {}
    hgnc_ids = {}
    hgnc_withdrawn = []
    uniprot_ids = {}
    entrez_ids = {}
    entrez_ids_reverse = {}
    mouse_map = {}
    rat_map = {}
    prev_sym_map = {}
    ensembl_ids = {}
    ensembl_ids_reverse = {}
    hgnc_withdrawn_new_ids = {}
    gene_types = {}
    hgnc_to_enzymes = defaultdict(set)
    enzyme_to_hgncs = defaultdict(set)
    # Skip the header
    next(csv_rows)
    for row in csv_rows:
        hgnc_id = row[0][5:]
        hgnc_status = row[3]
        if hgnc_status in {'Approved', 'Entry Withdrawn'}:
            hgnc_name = row[1]
            hgnc_names[hgnc_id] = hgnc_name
            # Note that withdrawn entries don't overlap with approved
            # entries at this point so it's safe to add mappings for
            # withdrawn names
            hgnc_ids[hgnc_name] = hgnc_id
        elif hgnc_status == 'Symbol Withdrawn':
            descr = row[2]
            m = re.match(r'symbol withdrawn, see \[HGNC:(?: ?)(\d+)\]',
                         descr)
            new_id = m.groups()[0]
            hgnc_withdrawn.append(hgnc_id)
            hgnc_withdrawn_new_ids[hgnc_id] = new_id
        # UniProt
        uniprot_id = row[6]
        if uniprot_id:
            uniprot_ids[hgnc_id] = uniprot_id
        # Entrez
        entrez_id = row[5]
        if entrez_id:
            entrez_ids[hgnc_id] = entrez_id
            entrez_ids_reverse[entrez_id] = hgnc_id
        # Mouse
        mgi_id = row[7]
        if mgi_id:
            mgi_ids = mgi_id.split(', ')
            for mgi_id in mgi_ids:
                if mgi_id.startswith('MGI:'):
                    mgi_id = mgi_id[4:]
                mouse_map[mgi_id] = hgnc_id
        # Rat
        rgd_id = row[8]
        if rgd_id:
            rgd_ids = rgd_id.split(', ')
            for rgd_id in rgd_ids:
                if rgd_id.startswith('RGD:'):
                    rgd_id = rgd_id[4:]
                rat_map[rgd_id] = hgnc_id
        # Previous symbols
        prev_sym_entry = row[9]
        if prev_sym_entry:
            prev_syms = prev_sym_entry.split(', ')
            for prev_sym in prev_syms:
                # If we already mapped this previous symbol to another ID
                if prev_sym in prev_sym_map:
                    # If we already have a list here, we just extend it
                    if isinstance(prev_sym_map[prev_sym], list):
                        prev_sym_map[prev_sym].append(hgnc_id)
                    # Otherwise we create a list and start it with the two
                    # IDs we know the symbol is mapped to
                    else:
                        prev_sym_map[prev_sym] = [prev_sym_map[prev_sym],
                                                  hgnc_id]
                # Otherwise we just make a string entry here
                else:
                    prev_sym_map[prev_sym] = hgnc_id
        # Ensembl IDs
        ensembl_id = row[10]
        if ensembl_id:
            ensembl_ids[hgnc_id] = ensembl_id
            ensembl_ids_reverse[ensembl_id] = hgnc_id
        # Gene type
        gene_type = row[11]
        if gene_type:
            gene_types[hgnc_id] = gene_type
        # Enzyme (EC) codes
        enzyme_ids = row[12]
        if enzyme_ids:
            for enzyme_id in enzyme_ids.split(", "):
                hgnc_to_enzymes[hgnc_id].add(enzyme_id)
                enzyme_to_hgncs[enzyme_id].add(hgnc_id)
    # Point withdrawn IDs at the name of their replacement entry
    for old_id, new_id in hgnc_withdrawn_new_ids.items():
        hgnc_names[old_id] = hgnc_names[new_id]
    return (
        hgnc_names,
        hgnc_ids,
        hgnc_withdrawn,
        uniprot_ids,
        entrez_ids,
        entrez_ids_reverse,
        mouse_map,
        rat_map,
        prev_sym_map,
        ensembl_ids,
        ensembl_ids_reverse,
        gene_types,
        dict(hgnc_to_enzymes),
        dict(enzyme_to_hgncs),
    )

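# Unpacking sketch for the 14-tuple returned above (names mirror the locals
# in the function; order matters):
#
#     (hgnc_names, hgnc_ids, hgnc_withdrawn, uniprot_ids, entrez_ids,
#      entrez_ids_reverse, mouse_map, rat_map, prev_sym_map, ensembl_ids,
#      ensembl_ids_reverse, gene_types, hgnc_to_enzymes,
#      enzyme_to_hgncs) = _read_hgnc_maps()
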
def _load_pubchem_mesh_map():
    rows = read_unicode_csv(get_resource_path('pubchem_mesh_map.tsv'),
                            delimiter='\t')
    mappings = {row[0]: row[1] for row in rows}
    return mappings