def get_terms(force: bool = False) -> Iterable[Term]:
    """Get dictyBase gene terms.

    :param force: If true, re-download the source data.
    :yields: A :class:`Term` per gene, with product/synonym annotations,
        UniProt gene-product relationships, and the species set to
        *Dictyostelium discoideum*.
    """
    # DDB ID  DDB_G ID  Name  UniProt ID
    uniprot_mappings = multisetdict(
        ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
    )
    terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
    # GENE ID (DDB_G ID)  Gene Name  Synonyms  Gene products
    for identifier, name, synonyms, products in tqdm(terms.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
        )
        if products and pd.notna(products) and products != "unknown":
            for synonym in products.split(","):
                term.append_synonym(synonym.strip())
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym.strip()))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            # BUG FIX: the original test was ``uniprot_id not in {...}``, which
            # skipped every real UniProt accession and only let the placeholder
            # values "unknown"/"pseudogene" through. Skip missing/placeholder
            # values instead.
            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
                continue
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
        term.set_species(identifier="44689", name="Dictyostelium discoideum")
        yield term
def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
    """Load the FamPlex equivalences table as a mapping from identifier to references."""
    xrefs_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv"
    df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=",", dtype=str)

    # Normalize the NextProt family namespace label.
    df[0] = df[0].map(lambda prefix: {"NXP": "nextprot.family"}.get(prefix, prefix))
    # NextProt family identifiers carry a redundant "FA:" prefix; strip it.
    df[1] = [
        identifier[len("FA:"):] if prefix == "nextprot.family" else identifier
        for prefix, identifier in df[[0, 1]].values
    ]
    # Standardize prefixes and drop rows whose prefix could not be normalized
    # or that point to BEL (not a proper identifier namespace here).
    df[0] = df[0].map(normalize_prefix)
    df = df[df[0].notna()]
    df = df[df[0] != "bel"]
    return multidict(
        (famplex_id, Reference(xref_prefix, xref_identifier))
        for xref_prefix, xref_identifier, famplex_id in df.values
    )
def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version('biogrid')
    url = f'{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip'
    rv = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version)
    # Annotate each row with the NCBI taxonomy identifier for its organism name.
    rv['taxonomy_id'] = rv['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return rv
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over selventa chemical terms."""
    df = ensure_df(PREFIX, url=URL, skiprows=8, force=force)
    for identifier, label, xrefs in df[["ID", "LABEL", "XREF"]].values:
        term = Term.from_triple(PREFIX, identifier, label)
        # The xref column is pipe-delimited and may be missing entirely.
        if pd.notna(xrefs):
            for xref in xrefs.split("|"):
                term.append_xref(xref)
        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get terms.

    Yields Sequence Ontology parent terms first, then one term per
    S. pombe gene with chromosome, ortholog, gene-product, and synonym
    annotations.
    """
    # Build a mapping from PomBase gene identifier to the HGNC identifiers of
    # its human orthologs, resolved from HGNC symbols.
    orthologs_df = ensure_df(PREFIX, url=ORTHOLOGS_URL, force=force, header=None, version=version)
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for identifier, hgnc_symbols in orthologs_df.values:
        if hgnc_symbols == "NONE":  # sentinel meaning no human ortholog
            continue
        for hgnc_symbol in hgnc_symbols.split("|"):
            hgnc_id = hgnc_symbol_to_id.get(hgnc_symbol)
            if hgnc_id is not None:
                identifier_to_hgnc_ids[identifier].add(hgnc_id)
    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)
    # Column 6 holds the gene type; map each to its Sequence Ontology reference.
    so = {
        gtype: Reference.auto("SO", POMBASE_TO_SO[gtype])
        for gtype in sorted(df[df.columns[6]].unique())
    }
    # Yield the SO parent terms before any gene terms that reference them.
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)
    for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=name if pd.notna(name) else None,
        )
        # Chromosome values look like "chromosome_X"; keep only the suffix.
        term.append_property("chromosome", chromosome[len("chromosome_"):])
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
        for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        if uniprot_id and pd.notna(uniprot_id):
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))
        yield term
def ensure_list_pathways(version: str) -> Mapping[str, str]:
    """Ensure the KEGG Map (non species specific)."""
    pathway_df = ensure_df(
        KEGG_PATHWAY_PREFIX,
        url=f"{BASE}/list/pathway",
        name="pathway.tsv",
        version=version,
    )
    # Identifiers arrive as "path:mapXXXXX"; strip the "path:" prefix.
    rv = {}
    for key, value in pathway_df.values:
        rv[key[len("path:"):]] = value
    return rv
def _get_synonyms(version, force):
    """Download the FlyBase synonyms table (columns 0 and 2 only)."""
    url = (
        f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/"
        f"synonyms/fb_synonym_fb_{version}.tsv.gz"
    )
    return ensure_df(PREFIX, url=url, force=force, version=version, skiprows=4, usecols=[0, 2])
    # TODO use this
def get_premature_to_prefamily_df(version: str) -> pd.DataFrame:
    """Get premature miRNA to premature family dataframe."""
    return ensure_df(
        PREFIX,
        url=f'ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna_2_prefam.txt.gz',
        version=version,
        names=['premature_key', 'prefamily_key'],
        dtype=str,
    )
def _get_definitions(version: str, force: bool = False) -> Mapping[str, str]:
    """Map FlyBase gene identifiers to their automated gene summaries."""
    url = (
        f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/"
        f"genes/automated_gene_summaries.tsv.gz"
    )
    summaries_df = ensure_df(
        PREFIX, url=url, force=force, version=version, skiprows=2, header=None, usecols=[0, 1]
    )
    return dict(summaries_df.values)
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over selventa disease terms."""
    df = ensure_df(PREFIX, url=URL, skiprows=9, force=force)
    for identifier, label, synonyms, xrefs in df[["ID", "LABEL", "SYNONYMS", "XREF"]].values:
        term = Term.from_triple(PREFIX, identifier, label)
        # Both synonym and xref columns are pipe-delimited and may be missing.
        if pd.notna(synonyms):
            for synonym in synonyms.split("|"):
                term.append_synonym(synonym)
        if pd.notna(xrefs):
            for xref in xrefs.split("|"):
                term.append_xref(xref)
        yield term
def get_premature_family_df(version: str) -> pd.DataFrame:
    """Get premature family dataframe."""
    url = f"ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna_prefam.txt.gz"
    # Index by the internal prefamily key so joins against the mapping
    # table are straightforward.
    return ensure_df(
        PREFIX,
        url=url,
        version=version,
        names=["prefamily_key", "family_id", "family_name"],
        usecols=[0, 1, 2],
        index_col=0,
        dtype=str,
    )
def get_premature_to_prefamily_df(version: str, force: bool = False) -> pd.DataFrame:
    """Get premature miRNA to premature family dataframe."""
    return ensure_df(
        PREFIX,
        url=f"https://mirbase.org/ftp/{version}/database_files/mirna_2_prefam.txt.gz",
        version=version,
        names=["premature_key", "prefamily_key"],
        dtype=str,
        force=force,
    )
def get_premature_df(version: str, force: bool = False) -> pd.DataFrame:
    """Get premature miRNA dataframe."""
    return ensure_df(
        PREFIX,
        url=f"https://mirbase.org/ftp/{version}/database_files/mirna.txt.gz",
        version=version,
        names=["premature_key", "mirbase_id", "mirna_name"],
        usecols=[0, 1, 2],
        dtype=str,
        force=force,
    )
def get_premature_df(version: str) -> pd.DataFrame:
    """Get premature miRNA dataframe."""
    url = f'ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna.txt.gz'
    # Index by the internal premature key for downstream joins.
    return ensure_df(
        PREFIX,
        url=url,
        version=version,
        names=['premature_key', 'mirbase_id', 'mirna_name'],
        usecols=[0, 1, 2],
        index_col=0,
        dtype=str,
    )
def _get_names(version: str, force: bool = False) -> pd.DataFrame:
    """Download the expanded FlyBase gene/transcript/protein table (first five columns)."""
    url = f"{BASE_URL}/FB{version}/precomputed_files/genes/fbgn_fbtr_fbpp_expanded_fb_{version}.tsv.gz"
    # skiprows/skipfooter trim the FlyBase header and trailer lines.
    return ensure_df(
        PREFIX,
        url=url,
        force=force,
        version=version,
        skiprows=4,
        usecols=[0, 1, 2, 3, 4],
        skipfooter=1,
    )
def _get_organisms(version: str, force: bool = False) -> Mapping[str, str]:
    """Get mapping from abbreviation column to NCBI taxonomy ID column."""
    url = (
        f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/"
        f"species/organism_list_fb_{version}.tsv.gz"
    )
    organisms_df = ensure_df(
        PREFIX, url=url, force=force, version=version, skiprows=4, header=None, usecols=[2, 4]
    )
    # Rows missing either the abbreviation or the taxonomy ID are useless.
    organisms_df.dropna(inplace=True)
    return dict(organisms_df.values)
def get_chembl_compound_equivalences_raw(
    usecols=None, version: Optional[str] = None,
) -> pd.DataFrame:
    """Get the chemical representations raw dataframe."""
    if version is None:
        version = bioversions.get_version('chembl')
    url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{version}/chembl_{version}_chemreps.txt.gz'
    )
    return ensure_df(CHEMBL_COMPOUND_PREFIX, url=url, sep='\t', usecols=usecols)
def get_chunks(force: bool = False) -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version(PREFIX)
    # chunksize makes ensure_df return an iterator of frames rather than
    # loading the whole file at once.
    return ensure_df(
        PREFIX,
        url=URL,
        name="results.csv",
        force=force,
        version=version,
        sep=",",
        chunksize=CHUNKSIZE,
        usecols=[0, 1, 2, 3, 5],
    )
def get_df(version: str) -> pd.DataFrame:
    """Get a combine ComplexPortal dataframe."""
    url_base = f'ftp://ftp.ebi.ac.uk/pub/databases/intact/complex/{version}/complextab'
    # Download one per-species table each, then stack them vertically.
    species_dfs = []
    for ncbitaxonomy_id in SPECIES:
        species_df = ensure_df(
            PREFIX,
            url=f'{url_base}/{ncbitaxonomy_id}.tsv',
            version=version,
            na_values={'-'},
            header=0,
            dtype=str,
        )
        species_dfs.append(species_df)
    return pd.concat(species_dfs)
def _get_human_orthologs(version: str, force: bool = False) -> Mapping[str, Set[str]]:
    """Map FlyBase gene identifiers to the HGNC identifiers of their human orthologs."""
    url = (
        f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/"
        f"orthologs/dmel_human_orthologs_disease_fb_{version}.tsv.gz"
    )
    orthologs_df = ensure_df(
        PREFIX,
        url=url,
        force=force,
        version=version,
        skiprows=2,
        header=None,
        usecols=[0, 2],
        names=["flybase_id", "hgnc_id"],
    )
    return multisetdict(orthologs_df.values)
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    url = f"https://unmtid-shinyapps.net/download/DrugCentral/{version}/structures.smiles.tsv"
    df = ensure_df(PREFIX, url=url, version=version, force=force)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        # Skip records with incomplete structural information.
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        term = Term.from_triple(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name)
        term.append_xref(Reference(prefix="inchikey", identifier=inchi_key))
        term.append_property("smiles", smiles)
        term.append_property("inchi", inchi)
        # CAS registry numbers are optional.
        if pd.notna(cas):
            term.append_xref(Reference(prefix="cas", identifier=cas))
        yield term
def get_mature_df(version: str) -> pd.DataFrame:
    """Get mature miRNA dataframe."""
    url = f"ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna_mature.txt.gz"
    # Index by the internal mature key for downstream joins.
    columns = ["mature_key", "name", "previous", "mirbase.mature_id"]
    return ensure_df(
        PREFIX,
        url=url,
        version=version,
        names=columns,
        usecols=[0, 1, 2, 3],
        index_col=0,
        dtype=str,
    )
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over selventa family terms."""
    df = ensure_df(PREFIX, url=URL, skiprows=9, force=force)
    # First pass: build all terms so parents can be linked by identifier.
    terms = {}
    for identifier, label, synonyms in df[["ID", "LABEL", "SYNONYMS"]].values:
        term = Term.from_triple(PREFIX, identifier, label)
        if pd.notna(synonyms):
            for synonym in synonyms.split("|"):
                term.append_synonym(synonym)
        terms[identifier] = term
    # Second pass: parent identifiers carry a "SFAM:" prefix; strip it,
    # then wire up the hierarchy.
    df.PARENTS = df.PARENTS.map(lambda x: x[len("SFAM:"):], na_action="ignore")
    for child, parent in df.loc[df.PARENTS.notna(), ["ID", "PARENTS"]].values:
        if child == parent:
            continue  # wow...
        terms[child].append_parent(terms[parent])
    yield from terms.values()
def get_chembl_protein_equivalences(
    version: Optional[str] = None,
) -> pd.DataFrame:
    """Get ChEMBL protein equivalences."""
    if version is None:
        version = bioversions.get_version('chembl')
    url = f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}/chembl_uniprot_mapping.txt'
    df = ensure_df(
        CHEMBL_TARGET_PREFIX,
        url=url,
        sep='\t',
        usecols=[0, 1],
        names=[TARGET_ID, SOURCE_ID],  # switch around
    )
    # Annotate with the constant prefix/provenance columns, then select the
    # canonical xref column order.
    df.loc[:, SOURCE_PREFIX] = 'chembl.target'
    df.loc[:, TARGET_PREFIX] = 'uniprot'
    df.loc[:, PROVENANCE] = f'chembl{version}'
    return df[XREF_COLUMNS]
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    df = ensure_df(PREFIX, url=URL, version=version)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        # Skip records with incomplete structural information.
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        xrefs = [
            Reference(prefix="smiles", identifier=smiles),
            Reference(prefix="inchi", identifier=inchi),
            Reference(prefix="inchikey", identifier=inchi_key),
        ]
        # CAS registry numbers are optional.
        if pd.notna(cas):
            xrefs.append(Reference(prefix="cas", identifier=cas))
        reference = Reference(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name)
        yield Term(reference=reference, xrefs=xrefs)
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get CGNC terms.

    :param force: If true, re-download the source data.
    :yields: A :class:`Term` per chicken gene, with NCBI Gene and Ensembl
        xrefs and up to two synonyms, species set to *Gallus gallus*.
    """
    df = ensure_df(PREFIX, url=URL, name=f"{PREFIX}.tsv", force=force, header=0, names=HEADER)
    # Local name fixed from "synoynm_2"; lazy %-style args used in logging
    # calls instead of eager f-strings.
    for i, (cgnc_id, entrez_id, ensembl_id, name, synonym_1, synonym_2, _, _) in enumerate(df.values):
        if pd.isna(cgnc_id):
            logger.warning("row %s CGNC ID is none", i)
            continue
        # CGNC identifiers are expected to be integer-like; skip malformed rows.
        try:
            int(cgnc_id)
        except ValueError:
            logger.warning("row %s CGNC ID is not int-like: %s", i, cgnc_id)
            continue
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=cgnc_id,
            name=name,
        )
        term.set_species(identifier="9031", name="Gallus gallus")
        if entrez_id and pd.notna(entrez_id):
            term.append_xref(Reference(prefix="ncbigene", identifier=entrez_id))
        if pd.notna(ensembl_id):
            term.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))
        if synonym_1 and pd.notna(synonym_1):
            term.append_synonym(synonym_1)
        if synonym_2 and pd.notna(synonym_2):
            term.append_synonym(synonym_2)
        yield term
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Get terms.

    Yields Sequence Ontology parent terms first, then one term per ZFIN
    marker with alt ids, xrefs, gene-product, and ortholog annotations.
    """
    # Secondary (alternate) identifiers, grouped by the primary ZFIN id.
    alt_ids_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    primary_to_alt_ids = defaultdict(set)
    for alt_id, zfin_id in alt_ids_df.values:
        primary_to_alt_ids[zfin_id].add(alt_id)
    # Ortholog and mapping lookup tables, keyed by ZFIN identifier.
    # The usecols pairs pick (zfin_id, mapped identifier) from each file.
    human_orthologs = multisetdict(
        ensure_df(PREFIX, url=HUMAN_ORTHOLOGS, force=force, header=None, usecols=[0, 7], version=version).values
    )
    mouse_orthologs = multisetdict(
        ensure_df(PREFIX, url=MOUSE_ORTHOLOGS, force=force, header=None, usecols=[0, 5], version=version).values
    )
    fly_orthologs = multisetdict(
        ensure_df(PREFIX, url=FLY_ORTHOLOGS, force=force, header=None, usecols=[0, 5], version=version).values
    )
    entrez_mappings = dict(
        ensure_df(PREFIX, url=ENTREZ_MAPPINGS, force=force, header=None, usecols=[0, 3], version=version).values
    )
    uniprot_mappings = multidict(
        ensure_df(PREFIX, url=UNIPROT_MAPPINGS, force=force, header=None, usecols=[0, 3], version=version).values
    )
    df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # Sequence Ontology ids come as CURIEs ("SO:XXXXXXX"); keep the local part.
    df["sequence_ontology_id"] = df["sequence_ontology_id"].map(
        lambda x: x[len("SO:"):]
    )
    so = {
        sequence_ontology_id: Reference.auto(prefix="SO", identifier=sequence_ontology_id)
        for sequence_ontology_id in df["sequence_ontology_id"].unique()
    }
    # Yield the SO parent terms before any marker terms that reference them.
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)
    for identifier, name, definition, _entity_type, sequence_ontology_id in tqdm(df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            # Don't repeat the name as a definition.
            definition=definition if definition != name else None,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so[sequence_ontology_id])
        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for alt_id in primary_to_alt_ids[identifier]:
            term.append_alt(alt_id)
        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mouse_orthologs.get(identifier, []):
            # Mouse orthologs are full CURIEs; parsing may fail, so guard.
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if mouse_ortholog:
                term.append_relationship(orthologous, mouse_ortholog)
        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference("flybase", flybase_id))
        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex release (git tag) to download.
    :param force: If true, re-download the source data.
    :yields: A :class:`Term` per FamPlex entity, with definitions,
        xrefs, and isa/partof relationships in both directions.
    """
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
    entities_url = f"{base_url}/entities.csv"
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)
    relations_url = f"{base_url}/relations.csv"
    relations_df = ensure_df(
        PREFIX, url=relations_url, version=version, header=None, sep=",", dtype=str, force=force
    )
    definitions_url = f"{base_url}/descriptions.csv"
    definitions_df = ensure_df(
        PREFIX,
        url=definitions_url,
        version=version,
        header=None,
        sep=",",
        dtype=str,
        force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }
    id_xrefs = _get_xref_df(version)

    hgnc_name_to_id = get_name_id_mapping("hgnc")
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == "HGNC":
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s", PREFIX, h_name)
            h = Reference(prefix="hgnc", identifier=h_identifier, name=h_name)
        elif h_ns == "FPLX":
            h = Reference(prefix="fplx", identifier=h_name, name=h_name)
        elif h_ns == "UP":
            continue
        else:
            # BUG FIX: was a bare ``raise`` (RuntimeError outside an except
            # block) after ``logger.exception``; raise a real error instead.
            raise ValueError(f"unhandled head namespace: {h_ns}")
        if t_ns == "HGNC":
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s", PREFIX, t_name)
            t = Reference(prefix="hgnc", identifier=t_identifier, name=t_name)
        elif t_ns == "FPLX":
            t = Reference(prefix="fplx", identifier=t_name, name=t_name)
        elif t_ns == "UP":
            # BUG FIX: this branch previously tested ``h_ns == "UP"``, so
            # relations with a UniProt tail fell through to the bare raise.
            continue
        else:
            raise ValueError(f"unhandled tail namespace: {t_ns}")
        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity, ) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )
        for xref_reference in id_xrefs.get(entity, []):
            term.append_xref(xref_reference)
        # Outgoing edges: this entity is the subject.
        for r, t in out_edges.get(reference, []):
            if r == "isa" and t.prefix == "fplx":
                term.append_parent(t)
            elif r == "isa":
                term.append_relationship(is_a, t)
            elif r == "partof":
                term.append_relationship(part_of, t)
            else:
                logger.warning("unhandled relation %s", r)
        # Incoming edges: this entity is the object; invert the relation.
        for r, h in in_edges.get(reference, []):
            if r == "isa":
                term.append_relationship(has_member, h)
            elif r == "partof":
                term.append_relationship(has_part, h)
            else:
                logger.warning("unhandled relation %s", r)
        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex release (git tag) to download.
    :param force: If true, re-download the source data.
    :yields: A :class:`Term` per FamPlex entity, with definitions and
        isa/partof relationships in both directions.
    """
    base_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}'
    entities_url = f'{base_url}/entities.csv'
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)
    relations_url = f'{base_url}/relations.csv'
    relations_df = ensure_df(
        PREFIX, url=relations_url, version=version, header=None, sep=',', dtype=str, force=force
    )
    definitions_url = f'{base_url}/descriptions.csv'
    definitions_df = ensure_df(
        PREFIX,
        url=definitions_url,
        version=version,
        header=None,
        sep=',',
        dtype=str,
        force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    # TODO add xrefs
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == 'HGNC':
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, h_name)
            h = Reference(prefix='hgnc', identifier=h_identifier, name=h_name)
        elif h_ns == 'FPLX':
            h = Reference(prefix='fplx', identifier=h_name, name=h_name)
        elif h_ns == 'UP':
            continue
        else:
            # BUG FIX: was a bare ``raise`` (RuntimeError outside an except
            # block) after ``logger.exception``; raise a real error instead.
            raise ValueError(f'unhandled head namespace: {h_ns}')
        if t_ns == 'HGNC':
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, t_name)
            t = Reference(prefix='hgnc', identifier=t_identifier, name=t_name)
        elif t_ns == 'FPLX':
            t = Reference(prefix='fplx', identifier=t_name, name=t_name)
        elif t_ns == 'UP':
            # BUG FIX: this branch previously tested ``h_ns == 'UP'``, so
            # relations with a UniProt tail fell through to the bare raise.
            continue
        else:
            raise ValueError(f'unhandled tail namespace: {t_ns}')
        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for entity, in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )
        # Outgoing edges: this entity is the subject.
        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                logger.warning('unhandled relation %s', r)
        # Incoming edges: this entity is the object; invert the relation.
        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logger.warning('unhandled relation %s', r)
        yield term
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms in Rhea.

    Builds the master/directed reaction graph from the TSV exports, attaches
    database xrefs, then fills in names from the Rhea RDF dump.
    """
    terms = {}

    # Each row links a master (undirected) reaction to its left-to-right,
    # right-to-left, and bidirectional variants.
    directions = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv',
        version=version,
    )
    for master, lr, rl, bi in directions.values:
        terms[master] = Term(reference=Reference(PREFIX, master))
        terms[lr] = Term(reference=Reference(PREFIX, lr))
        terms[rl] = Term(reference=Reference(PREFIX, rl))
        terms[bi] = Term(reference=Reference(PREFIX, bi))

        terms[master].append_relationship(has_lr, terms[lr])
        terms[master].append_relationship(has_rl, terms[rl])
        terms[master].append_relationship(has_bi, terms[bi])
        terms[lr].append_parent(terms[master])
        terms[rl].append_parent(terms[master])
        terms[bi].append_parent(terms[master])

    # The relationships export only contains is_a edges; anything else is
    # unexpected and aborts the conversion.
    hierarchy = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv',
        version=version,
    )
    for source, relation, target in hierarchy.values:
        if relation != 'is_a':
            raise ValueError(f'RHEA unrecognized relation: {relation}')
        terms[source].append_parent(terms[target])

    # Cross-references to external reaction databases, one mapping file each.
    for xref_prefix, url in [
        ('ecocyc', 'rhea2ecocyc'),
        ('kegg.reaction', 'rhea2kegg_reaction'),
        ('reactome', 'rhea2reactome'),
        ('macie', 'rhea2macie'),
        ('metacyc', 'rhea2metacyc'),
    ]:
        xref_df = ensure_df(
            PREFIX,
            url=f'ftp://ftp.expasy.org/databases/rhea/tsv/{url}.tsv',
            version=version,
        )
        for rhea_id, _, _, xref_id in xref_df.values:
            if rhea_id not in terms:
                logger.warning('[%s] could not find %s:%s for xref %s:%s',
                               PREFIX, PREFIX, rhea_id, xref_prefix, xref_id)
                continue
            terms[rhea_id].append_xref(Reference(xref_prefix, xref_id))

    # TODO are EC codes equivalent?
    # TODO uniprot enabled by (RO:0002333)
    # TODO names?

    # Pull reaction labels out of the RDF dump to name the terms built above.
    url = 'ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz'
    graph = pystow.ensure_rdf('pyobo', 'raw', PREFIX, version, url=url)
    result = graph.query('''
    PREFIX rh:<http://rdf.rhea-db.org/>
    SELECT ?reaction ?reactionId ?reactionLabel WHERE {
        ?reaction rdfs:subClassOf rh:Reaction .
        ?reaction rh:id ?reactionId .
        ?reaction rdfs:label ?reactionLabel .
    }
    ''')
    for _, identifier, name in result:
        identifier = str(identifier)
        if identifier not in terms:
            # Reaction appears in the RDF dump but not in the TSV exports.
            logger.warning('isolated element in rdf: rhea:%s ! %s', identifier, name)
            continue
        terms[identifier].reference.name = name

    # TODO participants?
    yield from terms.values()