def get_terms(force: bool = False) -> Iterable[Term]:
    """Get terms.

    One term is yielded per row of ``gene_info.tsv``. Gene products and
    synonyms are attached as synonyms, UniProt accessions from
    ``uniprot_mappings.tsv`` are attached via ``has_gene_product``, and every
    term is annotated with the species *Dictyostelium discoideum* (44689).

    :param force: Forwarded to ``ensure_df`` for both tables.
    :yields: A term for each gene row.
    """
    # Values that stand in for "no accession" in the UniProt column.
    uniprot_placeholders = {"unknown", "pseudogene"}

    # Columns: DDB ID, DDB_G ID, Name, UniProt ID (only DDB_G ID and UniProt ID are read)
    # NOTE(review): both tables are requested from the same ``URL`` under
    # different file names - confirm the mapping table should not come from
    # a dedicated URL.
    uniprot_mappings = multisetdict(
        ensure_df(
            PREFIX,
            url=URL,
            force=force,
            name="uniprot_mappings.tsv",
            usecols=[1, 3],
        ).values
    )

    terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
    # Columns: GENE ID (DDB_G ID), Gene Name, Synonyms, Gene products
    for identifier, name, synonyms, products in tqdm(terms.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
        )
        if products and pd.notna(products) and products != "unknown":
            for synonym in products.split(","):
                term.append_synonym(synonym.strip())
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym.strip()))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            # Skip missing values and placeholders; the previous condition
            # (``not in``) was inverted and discarded every real accession.
            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in uniprot_placeholders:
                continue
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))

        term.set_species(identifier="44689", name="Dictyostelium discoideum")
        yield term
def _iter_map_terms(version: str) -> Iterable[Term]:
    """Yield a KEGG Pathway term for every entry returned by ``ensure_list_pathways``."""
    pathways = ensure_list_pathways(version=version)
    yield from (
        Term.from_triple(
            prefix=KEGG_PATHWAY_PREFIX,
            identifier=pathway_id,
            name=pathway_name,
        )
        for pathway_id, pathway_name in pathways.items()
    )
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms for KEGG Genome.

    Genomes whose identifier is in ``SKIP`` are ignored. When a genome has a
    taxonomy identifier, an ``ncbitaxon`` xref is attached; the xref is added
    even if no name could be looked up for it.
    """
    unnamed_taxa = 0
    genomes = iter_kegg_genomes(version=version, desc="KEGG Genomes")
    for genome in genomes:
        if genome.identifier in SKIP:
            continue

        term = Term.from_triple(
            prefix=KEGG_GENOME_PREFIX,
            identifier=genome.identifier,
            name=genome.name,
        )

        if genome.taxonomy_id is not None:
            taxonomy_name = pyobo.get_name("ncbitaxon", genome.taxonomy_id)
            if taxonomy_name is None:
                # Count the miss, but still attach the (nameless) xref below.
                unnamed_taxa += 1
                logger.debug(
                    f"[{KEGG_GENOME_PREFIX}] could not find name for taxonomy:{genome.taxonomy_id}"
                )
            taxon = Reference(
                prefix="ncbitaxon",
                identifier=genome.taxonomy_id,
                name=taxonomy_name,
            )
            term.append_xref(taxon)

        yield term

    logger.info("[%s] unable to find %d taxonomy names in NCBI", KEGG_GENOME_PREFIX, unnamed_taxa)
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over UniProt Terms.

    Every row of the reader is unpacked as (UniProt ID, name, taxonomy ID).
    """
    path = ensure(version)
    with open_reader(path) as reader:
        for row in reader:
            uniprot_id, name, taxonomy_id = row
            protein = Term.from_triple(prefix=PREFIX, identifier=uniprot_id, name=name)
            protein.set_species(taxonomy_id)
            yield protein
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get terms.

    First yields one bare term per Sequence Ontology reference that a gene
    type maps to (sorted by gene type), then one term per row of the names
    table with its parent type, synonym, human orthologs and species.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)
    missing_taxonomies = set()

    # Map each gene type found in the second column to its SO reference.
    gtype_to_so = {}
    gtype_column = names_df.columns[1]
    for gene_type in names_df[gtype_column].unique():
        so_id = GTYPE_TO_SO.get(gene_type)
        if so_id is not None:
            gtype_to_so[gene_type] = Reference.auto("SO", so_id)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s", gene_type
        )

    for gene_type in sorted(gtype_to_so):
        yield Term(reference=gtype_to_so[gene_type])

    for organism, gtype, identifier, symbol, name in tqdm(names_df.values):
        label = symbol if pd.notna(symbol) else None
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=label,
            definition=definitions.get(identifier),
        )

        if gtype and pd.notna(gtype) and gtype in gtype_to_so:
            term.append_parent(gtype_to_so[gtype])

        if pd.notna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if not hgnc_curie or pd.isna(hgnc_curie):
                continue
            ortholog = Reference.from_curie(hgnc_curie, auto=True)
            if ortholog is not None:
                term.append_relationship(orthologous, ortholog)
            else:
                tqdm.write(f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            # Report each unmapped organism abbreviation only once.
            if organism not in missing_taxonomies:
                tqdm.write(f"missing mapping for species abbreviation: {organism}")
                missing_taxonomies.add(organism)
        else:
            term.append_relationship(from_species, Reference(NCBITAXON_PREFIX, taxonomy_id))

        yield term

    if missing_taxonomies:
        tqdm.write(f"there were {len(missing_taxonomies)} missing taxa in flybase genes")
def _iter_genome_terms(
    *,
    list_pathway_path: str,
    link_pathway_path: str,
    kegg_genome: KEGGGenome,
) -> Iterable[Term]:
    """Yield the pathway terms listed for a single KEGG genome.

    Each non-empty line of the pathway list file is read as a tab-separated
    ``(pathway id, name)`` pair. Every resulting term is annotated by
    ``kegg_genome.annotate_term`` and linked via ``species_specific`` to the
    ``map``-prefixed pathway sharing its numeric code. Members returned by
    ``_get_link_pathway_map`` are then attached with ``has_part``.

    :param list_pathway_path: Path to the tab-separated pathway list file.
    :param link_pathway_path: Path handed to ``_get_link_pathway_map``.
    :param kegg_genome: Genome used to annotate each term and named in
        diagnostic messages.
    :yields: All pathway terms, in the order they appear in the list file.
    :raises ValueError: If a line does not split into exactly two fields, or
        a pathway identifier contains no digit.
    """
    path_prefix = "path:"
    terms = {}
    with open(list_pathway_path) as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no record; skipping avoids an unpack error.
                continue
            pathway_id, name = [part.strip() for part in line.split("\t")]
            # Only drop the prefix when it is really there, so identifiers
            # written without it are not truncated.
            if pathway_id.startswith(path_prefix):
                pathway_id = pathway_id[len(path_prefix):]

            terms[pathway_id] = term = Term.from_triple(
                prefix=KEGG_PATHWAY_PREFIX,
                identifier=pathway_id,
                name=name,
            )

            # Annotate species information
            kegg_genome.annotate_term(term)

            # Annotate the non-species specific code: everything from the
            # first digit onwards.
            code_start = next(
                (index for index, char in enumerate(pathway_id) if char.isnumeric()),
                None,
            )
            if code_start is None:
                raise ValueError(f"pathway identifier has no numeric code: {pathway_id}")
            pathway_code = pathway_id[code_start:]
            term.append_relationship(
                species_specific,
                Reference(prefix=KEGG_PATHWAY_PREFIX, identifier=f"map{pathway_code}"),
            )

    for pathway_id, protein_ids in _get_link_pathway_map(link_pathway_path).items():
        term = terms.get(pathway_id)
        if term is None:
            tqdm.write(f"could not find kegg.pathway:{pathway_id} for {kegg_genome.name}")
            continue
        for protein_id in protein_ids:
            term.append_relationship(
                has_part,
                Reference(
                    prefix=KEGG_GENES_PREFIX,
                    identifier=protein_id,
                ),
            )

    yield from terms.values()
def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
    """Iterate over UniProt Terms.

    The first row of the reader is discarded as a header; each remaining row
    is unpacked as (UniProt ID, name, taxonomy ID).
    """
    path = ensure(version=version, force=force)
    with open_reader(path) as reader:
        next(reader)  # discard the header row
        for row in tqdm(reader, desc="Mapping UniProt"):
            uniprot_id, name, taxonomy_id = row
            protein = Term.from_triple(prefix=PREFIX, identifier=uniprot_id, name=name)
            # TODO add gene encodes from relationship
            # TODO add description
            species = Reference(prefix=NCBITAXON_PREFIX, identifier=taxonomy_id)
            protein.append_relationship(from_species, species)
            yield protein
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over DrugCentral terms.

    Rows lacking a SMILES string, InChI or InChI key are logged and skipped.
    """
    url = f"https://unmtid-shinyapps.net/download/DrugCentral/{version}/structures.smiles.tsv"
    structures = ensure_df(PREFIX, url=url, version=version, force=force)
    for row in structures.values:
        smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas = row

        if any(pd.isna(value) for value in (smiles, inchi, inchi_key)):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue

        drug = Term.from_triple(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name)
        drug.append_xref(Reference(prefix="inchikey", identifier=inchi_key))
        for key, value in (("smiles", smiles), ("inchi", inchi)):
            drug.append_property(key, value)
        if not pd.isna(cas):
            drug.append_xref(Reference(prefix="cas", identifier=cas))
        yield drug
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get terms.

    First yields one bare term per Sequence Ontology reference used by the
    gene table (sorted by gene type), then one term per gene row annotated
    with chromosome, parent type, species, human orthologs, UniProt product
    and synonyms.
    """
    orthologs_df = ensure_df(PREFIX, url=ORTHOLOGS_URL, force=force, header=None, version=version)
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for gene_id, symbols in orthologs_df.values:
        if symbols == "NONE":
            continue
        for symbol in symbols.split("|"):
            hgnc_id = hgnc_symbol_to_id.get(symbol)
            if hgnc_id is None:
                continue
            identifier_to_hgnc_ids[gene_id].add(hgnc_id)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)

    # The seventh column holds the gene type; map each one to its SO reference.
    gene_types = sorted(df[df.columns[6]].unique())
    so = {}
    for gene_type in gene_types:
        so[gene_type] = Reference.auto("SO", POMBASE_TO_SO[gene_type])

    for gene_type in sorted(so):
        yield Term(reference=so[gene_type])

    chromosome_prefix_length = len("chromosome_")
    for row in tqdm(df.values):
        identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms = row
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=None if pd.isna(symbol) else symbol,
            definition=None if pd.isna(name) else name,
        )
        term.append_property("chromosome", chromosome[chromosome_prefix_length:])
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")

        for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))

        if uniprot_id and pd.notna(uniprot_id):
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))

        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))

        yield term
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over ChEMBL compounds.

    Runs ``QUERY`` and turns every fetched row into a term; SMILES and InChI
    become properties and the InChI key becomes an xref, each only when the
    value is truthy.
    """
    with chembl_downloader.connect(version=version) as conn:
        logger.info("using connection %s", conn)
        with closing(conn.cursor()) as cursor:
            logger.info("using cursor %s", cursor)
            cursor.execute(QUERY)
            rows = cursor.fetchall()
            for chembl_id, name, smiles, inchi, inchi_key in rows:
                # TODO add xrefs?
                compound = Term.from_triple(prefix=PREFIX, identifier=chembl_id, name=name)
                for key, value in (("smiles", smiles), ("inchi", inchi)):
                    if value:
                        compound.append_property(key, value)
                if inchi_key:
                    compound.append_xref(Reference("inchikey", inchi_key))
                yield compound
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get CGNC terms.

    Rows whose CGNC ID is missing or cannot be parsed by ``int`` are logged
    and skipped. Every yielded term is annotated with the species
    *Gallus gallus* (9031).
    """
    df = ensure_df(PREFIX, url=URL, name=f"{PREFIX}.tsv", force=force, header=0, names=HEADER)
    for i, row in enumerate(df.values):
        cgnc_id, entrez_id, ensembl_id, name, synonym_1, synonym_2, _, _ = row

        if pd.isna(cgnc_id):
            logger.warning(f"row {i} CGNC ID is none")
            continue

        try:
            int(cgnc_id)
        except ValueError:
            logger.warning(f"row {i} CGNC ID is not int-like: {cgnc_id}")
            continue

        gene = Term.from_triple(
            prefix=PREFIX,
            identifier=cgnc_id,
            name=name,
        )
        gene.set_species(identifier="9031", name="Gallus gallus")

        if entrez_id and pd.notna(entrez_id):
            gene.append_xref(Reference(prefix="ncbigene", identifier=entrez_id))
        if pd.notna(ensembl_id):
            gene.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))

        for synonym in (synonym_1, synonym_2):
            if synonym and pd.notna(synonym):
                gene.append_synonym(synonym)

        yield gene
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Get terms.

    First yields one bare term per Sequence Ontology reference used by the
    markers table (sorted by SO identifier), then one term per marker row
    annotated with species *Danio rerio* (7955), parent type, alternate
    identifiers, Entrez xref, UniProt products and human / mouse / fly
    orthologs.
    """

    def _load_pairs(url, usecols):
        """Fetch a headerless table and return the selected columns as rows."""
        return ensure_df(
            PREFIX,
            url=url,
            force=force,
            header=None,
            usecols=usecols,
            version=version,
        ).values

    alt_ids_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    primary_to_alt_ids = defaultdict(set)
    for alt_id, zfin_id in alt_ids_df.values:
        primary_to_alt_ids[zfin_id].add(alt_id)

    human_orthologs = multisetdict(_load_pairs(HUMAN_ORTHOLOGS, [0, 7]))
    mouse_orthologs = multisetdict(_load_pairs(MOUSE_ORTHOLOGS, [0, 5]))
    fly_orthologs = multisetdict(_load_pairs(FLY_ORTHOLOGS, [0, 5]))
    entrez_mappings = dict(_load_pairs(ENTREZ_MAPPINGS, [0, 3]))
    uniprot_mappings = multidict(_load_pairs(UNIPROT_MAPPINGS, [0, 3]))

    df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )

    # Drop the leading "SO:" from every value of the column.
    so_prefix_length = len("SO:")
    df["sequence_ontology_id"] = df["sequence_ontology_id"].map(
        lambda curie: curie[so_prefix_length:]
    )

    so = {}
    for so_id in df["sequence_ontology_id"].unique():
        so[so_id] = Reference.auto(prefix="SO", identifier=so_id)

    for so_id in sorted(so):
        yield Term(reference=so[so_id])

    for row in tqdm(df.values):
        identifier, name, definition, _entity_type, sequence_ontology_id = row
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            # A definition that merely repeats the name is dropped.
            definition=None if definition == name else definition,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so[sequence_ontology_id])

        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for alt_id in primary_to_alt_ids[identifier]:
            term.append_alt(alt_id)

        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))

        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))

        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))

        for mgi_curie in mouse_orthologs.get(identifier, []):
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if not mouse_ortholog:
                continue
            term.append_relationship(orthologous, mouse_ortholog)

        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference("flybase", flybase_id))

        yield term