def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name='ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}', 'ITIS.sqlite'),
                        sqlite_path,
                    )
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall()
        )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # parent '0' marks a top-level kingdom (no parent)
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
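
# The multidict helper isn't defined in this snippet. A minimal stand-in,
# assuming it only groups (key, value) pairs into a key-to-list mapping the
# way the child/parent usage above suggests:
from collections import defaultdict
from typing import Dict, Iterable, List, Tuple


def multidict(pairs: Iterable[Tuple[str, str]]) -> Dict[str, List[str]]:
    """Group (key, value) pairs into a mapping from each key to all of its values."""
    rv: Dict[str, List[str]] = defaultdict(list)
    for key, value in pairs:
        rv[key].append(value)
    return dict(rv)


# e.g. multidict([('2', '1'), ('3', '1'), ('3', '2')]) == {'2': ['1'], '3': ['1', '2']}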
def _ensure_conv_genome_helper(
    kegg_genome_id: str,
    target_database: str,
    *,
    version: str,
    error_on_missing: bool = False,
) -> Optional[str]:
    """Get the KEGG-external protein map for the given organism/database."""
    name = f"{kegg_genome_id}.tsv"
    try:
        rv = ensure_path(
            KEGG_GENES_PREFIX,
            f"conv_{target_database}",
            url=f"{BASE}/conv/{target_database}/{kegg_genome_id}",
            name=name,
            error_on_missing=error_on_missing,
            version=version,
        )
    except urllib.error.HTTPError:
        # Cache the failure as an empty file so the download is not retried.
        path_rv = prefix_directory_join(
            KEGG_GENES_PREFIX,
            f"conv_{target_database}",
            name=name,
            version=version,
        )
        with path_rv.open("w") as file:
            print(file=file)  # noqa: T201
        return path_rv.as_posix()
    except FileNotFoundError:
        return None
    else:
        return rv
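
# A usage sketch of the empty-file sentinel above. "hsa" (human) and "uniprot"
# are illustrative arguments, and "108.0" stands in for a real KEGG release;
# none of these values come from this module.
path = _ensure_conv_genome_helper("hsa", "uniprot", version="108.0")
if path is None:
    print("mapping file missing and error_on_missing was False")
else:
    with open(path) as file:
        rows = [line.split("\t") for line in file if line.strip()]
    # An empty file is the cached marker left behind by an earlier HTTPError,
    # so zero rows means KEGG has no conversion table for this organism.
    print(f"{len(rows)} protein mappings")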
def get_path(version: str) -> str:
    """Get the path to the extracted ChEMBL SQLite database."""
    url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{version}/chembl_{version}_sqlite.tar.gz'
    )
    path = ensure_path(PREFIX, url=url, version=version)
    name = f'chembl_{version}/chembl_{version}_sqlite/chembl_{version}.db'
    d = prefix_directory_join(PREFIX, version=version)
    op = os.path.join(d, name)
    if not os.path.exists(op):
        # Extract once; later calls find the cached database and skip this step.
        with tarfile.open(path, mode='r', encoding='utf-8') as tar_file:
            tar_file.extractall(d)
    return op
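
# A short usage sketch. '31' is an illustrative ChEMBL release number, and
# molecule_dictionary is one of the standard ChEMBL schema tables.
import sqlite3

db_path = get_path(version='31')
conn = sqlite3.connect(db_path)
try:
    (n_molecules,) = conn.execute('SELECT COUNT(*) FROM molecule_dictionary').fetchone()
    print(f'ChEMBL 31 contains {n_molecules} molecules')
finally:
    conn.close()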
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # parent "0" marks a top-level kingdom (no parent)
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
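
# A sketch of consuming the generator. "2023-08-04" stands in for a real ITIS
# release, and the Term attribute names (parents, reference) follow the
# constructor call above; they may differ in other versions of the library.
for term in iter_terms(version="2023-08-04"):
    for parent in term.parents:
        print(f"itis:{term.reference.identifier} is_a itis:{parent.identifier}")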
def _lookup(name: str) -> str:
    """Look up the NCBI taxonomy identifier for an organism name."""
    return get_name_id_mapping('ncbitaxon')[name]


def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version('biogrid')
    url = f'{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip'
    df = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version)
    df['taxonomy_id'] = df['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return df


@cached_mapping(
    path=prefix_directory_join(
        PREFIX, 'cache', 'xrefs', name='ncbigene.tsv', version=version_getter(PREFIX),
    ),
    header=['biogrid_id', 'ncbigene_id'],
)
def get_ncbigene_mapping() -> Mapping[str, str]:
    """Get BioGRID to NCBIGENE mapping.

    Is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene')
    """
    df = get_df()
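
# Downstream use, assuming the cached mapping is keyed by BioGRID identifier as
# the header above indicates (the body of get_ncbigene_mapping is truncated in
# this snippet, and '112315' is a hypothetical BioGRID identifier). Thanks to
# @cached_mapping, the first call builds and writes ncbigene.tsv and later
# calls just reload it.
mapping = get_ncbigene_mapping()
ncbigene_id = mapping.get('112315')
if ncbigene_id is not None:
    print(f'biogrid:112315 -> ncbigene:{ncbigene_id}')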
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Get HGNC terms."""
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    unhandled_entry_keys: typing.Counter[str] = Counter()
    unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values())
            if so_id
        },
        key=attrgetter("identifier"),
    )

    statuses = set()
    for entry in tqdm(entries, desc=f"Mapping {PREFIX}", unit="gene", unit_scale=True):
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )

        status = entry.pop("status")
        if status == "Approved":
            is_obsolete = False
        else:
            # Warn once per unhandled status, then treat every entry with it as obsolete.
            if status not in statuses:
                statuses.add(status)
                logger.warning("unhandled status for hgnc:%s: %s", identifier, status)
            is_obsolete = True

        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX, identifier=identifier, name=symbol),
            is_obsolete=is_obsolete,
        )

        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral", identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto("mirbase", mirbase_id),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to, Reference(prefix="snornabase", identifier=snornabase_id)
            )

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))

        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids, gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        for previous_symbol in entry.pop("previous_symbol", []):
            term.append_synonym(Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandle_locus_types[locus_type][identifier] = term
        term.append_property("locus_type", locus_type)
        term.append_property("locus_group", locus_group)
        term.set_species(identifier="9606", name="Homo sapiens")

        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name for hgnc_id, term in v.items()}
                for k, v in unhandle_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandle_locus_types.items()):
            t = tabulate(
                [
                    (
                        hgnc_id,
                        term.name,
                        term.is_obsolete,
                        term.link,
                        ", ".join(p.link for p in term.provenance if p.link),
                    )
                    for hgnc_id, term in sorted(v.items())
                ],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandle_locus_type_counter = Counter(
        {locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
    )
    logger.warning(
        "Unhandled locus types:\n%s", tabulate(unhandle_locus_type_counter.most_common())
    )
    logger.warning("Unhandled keys:\n%s", tabulate(unhandled_entry_keys.most_common()))
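
# A consumption sketch. "2024-01-01" follows the %Y-%m-01 format used above
# when no version is given; the resulting counts depend on the release.
from collections import Counter

terms = list(get_terms(version="2024-01-01"))
status_counts = Counter(bool(term.is_obsolete) for term in terms)
print(f"{status_counts[False]} current terms, {status_counts[True]} obsolete terms")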
return get_name_id_mapping("ncbitaxon")[name] def get_df() -> pd.DataFrame: """Get the BioGRID identifiers mapping dataframe.""" version = bioversions.get_version("biogrid") url = f"{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip" df = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version) df["taxonomy_id"] = df["ORGANISM_OFFICIAL_NAME"].map(_lookup) return df @cached_mapping( path=prefix_directory_join(PREFIX, "cache", "xrefs", name="ncbigene.tsv", version=version_getter(PREFIX)), header=["biogrid_id", "ncbigene_id"], ) def get_ncbigene_mapping() -> Mapping[str, str]: """Get BioGRID to NCBIGENE mapping. Is basically equivalent to: .. code-block:: python from pyobo import get_filtered_xrefs biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene') """ df = get_df()