def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f"{PREFIX} download"):
        try:
            path = ensure_path(
                PREFIX,
                "phenotype",
                version=version,
                url=f"https://www.gwascentral.org/phenotype/HGVPM{n}?format=json",
                name=f"HGVPM{n}.json",
                force=force,
            )
        except OSError as e:
            tqdm.write(f"{n}: {e}")
            continue
        with open(path) as file:
            j = json.load(file)
        description = j.get("description")
        if description is not None:
            description = description.strip().replace("\n", " ")
        term = Term(
            reference=Reference(PREFIX, j["identifier"], j["name"]),
            definition=description,
        )
        yield term
def _get_version() -> str:
    """Get the version of the current data."""
    zip_path = ensure_path(PREFIX, url=URL)
    with zipfile.ZipFile(zip_path) as zip_file:
        for x in zip_file.filelist:
            if x.filename.endswith('.sqlite'):
                # the archive member is named 'itisSqlite<version>/ITIS.sqlite',
                # so strip the prefix and suffix to recover the version string
                return x.filename[len('itisSqlite'):-len('/ITIS.sqlite')]
    raise ValueError('could not find a file with the version in it')
def get_path(version: str):
    """Get the path to the extracted ChEMBL SQLite database."""
    url = f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}/chembl_{version}_sqlite.tar.gz'
    path = ensure_path(PREFIX, url=url, version=version)
    name = f'chembl_{version}/chembl_{version}_sqlite/chembl_{version}.db'
    d = prefix_directory_join(PREFIX, version=version)
    op = os.path.join(d, name)
    if not os.path.exists(op):
        with tarfile.open(path, mode='r', encoding='utf-8') as tar_file:
            tar_file.extractall(d)
    return op
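
# A minimal usage sketch (hypothetical; not part of the module above). Once get_path() has
# extracted the dump, the returned file is an ordinary SQLite database that can be inspected
# with the standard library. The default version string "31" is only an illustrative placeholder.
def _list_chembl_tables(version: str = "31"):
    """List the table names in the extracted ChEMBL SQLite database (illustrative helper)."""
    import sqlite3
    from contextlib import closing

    with closing(sqlite3.connect(get_path(version))) as conn, closing(conn.cursor()) as cursor:
        # sqlite_master holds the schema; filter to tables and sort for stable output
        cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name")
        return [name for (name,) in cursor.fetchall()]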
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over UniProt PTM Terms."""
    path = ensure_path(PREFIX, url=URL, force=force)
    with open(path) as file:
        lines = list(file)
    it: Iterable[Tuple[str, str]] = ((line[:2], line[2:].strip()) for line in lines[47:-5])
    for i, (_, term_lines) in enumerate(itt.groupby(it, key=lambda p: p[0] == "//")):
        term = _parse(i, term_lines)
        if term:
            yield term
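
# A small self-contained sketch (illustrative only, with made-up record lines) of the
# record-splitting idiom used above: grouping on "is this line the '//' terminator" makes
# itertools.groupby alternate between separator groups and record groups, so every other group
# is one flat-file record. In the function above, _parse is presumably responsible for
# ignoring the separator groups.
def _demo_record_groupby():
    import itertools as itt

    lines = [
        ("ID", "FIRST"), ("DE", "first record"), ("//", ""),
        ("ID", "SECOND"), ("DE", "second record"), ("//", ""),
    ]
    records = [
        list(group)
        for is_separator, group in itt.groupby(lines, key=lambda p: p[0] == "//")
        if not is_separator
    ]
    assert [record[0][1] for record in records] == ["FIRST", "SECOND"]
    return records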
def iterate_terms(version: str) -> Iterable[Term]:
    """Iterate over GWAS Central Study terms."""
    path = ensure_path(PREFIX, url=URL, version=version)
    with tarfile.open(path) as tar_file:
        for tar_info in tar_file:
            if not tar_info.path.endswith(".xml"):
                continue
            with tar_file.extractfile(tar_info) as file:
                try:
                    tree = ElementTree.parse(file)
                except ElementTree.ParseError:
                    logger.warning("malformed XML in %s", tar_info.path)
                    continue
            yield _get_term_from_tree(tree)
def iterate_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over GWAS Central Study terms."""
    url = f"http://www.gwascentral.org/docs/GC_{version}.tar.gz"
    path = ensure_path(PREFIX, url=url, version=version, force=force)
    with tarfile.open(path) as tar_file:
        for tar_info in tar_file:
            if not tar_info.path.endswith(".xml"):
                continue
            with tar_file.extractfile(tar_info) as file:  # type:ignore
                try:
                    tree = ElementTree.parse(file)
                except ElementTree.ParseError:
                    logger.warning("malformed XML in %s", tar_info.path)
                    continue
            yield _get_term_from_tree(tree)
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name='ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}', 'ITIS.sqlite'),
                        sqlite_path,
                    )
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall()
        )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f'{PREFIX} download'):
        try:
            path = ensure_path(
                PREFIX,
                'phenotype',
                version=version,
                url=f'https://www.gwascentral.org/phenotype/HGVPM{n}?format=json',
                name=f'HGVPM{n}.json',
            )
        except OSError as e:
            tqdm.write(f'{n}: {e}')
            continue
        with open(path) as file:
            j = json.load(file)
        # guard against records that have no description to avoid an AttributeError
        description = j.get('description')
        if description is not None:
            description = description.strip().replace('\n', ' ')
        term = Term(
            reference=Reference(PREFIX, j['identifier'], j['name']),
            definition=description,
        )
        yield term
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
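
# A minimal usage sketch (hypothetical; not part of the module above): materialize just a
# handful of ITIS terms from the iter_terms variant directly above to spot-check the parent
# wiring without walking the whole taxonomy. The version argument is a placeholder.
def _preview_itis_terms(version: str, n: int = 5):
    from itertools import islice

    for term in islice(iter_terms(version=version), n):
        print(term.reference, [parent.identifier for parent in term.parents])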
def iter_terms(version: str, synonym_abb, autodownload: bool = False) -> Iterable[Term]:
    """Iterate over UMLS terms."""
    name = f"umls-{version}-mrconso.zip"
    url = f"https://download.nlm.nih.gov/umls/kss/{version}/{name}"
    if autodownload:
        # FIXME needs automated scrapy step where you put in user/password
        path = ensure_path(PREFIX, url=url, version=version)
    else:
        path = RAW_MODULE.get(PREFIX, version, name=name)
        if not path.exists():
            raise FileNotFoundError(
                f"UMLS needs to be downloaded manually still and moved to {path}. "
                f"See https://www.nlm.nih.gov/research/umls/index.html",
            )

    with zipfile.ZipFile(path) as zip_file:
        with zip_file.open("MRCONSO.RRF", mode="r") as file:
            it = tqdm(file, unit_scale=True, desc="[umls] parsing")
            lines = (line.decode("utf-8").strip().split("|") for line in it)
            for cui, cui_lines in itt.groupby(lines, key=operator.itemgetter(0)):
                df = pd.DataFrame(list(cui_lines), columns=RRF_COLUMNS)
                df = df[df["LAT - Language"] == "ENG"]
                idx = (
                    (df["ISPREF - is preferred"] == "Y")
                    & (df["TS - Term Status"] == "P")
                    & (df["STT - String Type"] == "PF")
                )
                pref_rows_df = df.loc[idx]
                if len(pref_rows_df.index) != 1:
                    it.write(f"no preferred term for umls:{cui}. got {len(pref_rows_df.index)}")
                    continue

                df["TTY - Term Type in Source"] = df["TTY - Term Type in Source"].map(
                    synonym_abb.__getitem__
                )

                _r = pref_rows_df.iloc[0]
                sdf = df[["SAB - source name", "CODE", "TTY - Term Type in Source", "STR"]]

                synonyms = []
                xrefs = []
                for source, identifier, synonym_type, synonym in sdf.values:
                    norm_source = normalize_prefix(source)
                    if norm_source is None or not identifier:
                        provenance = []
                    else:
                        ref = Reference(prefix=norm_source, identifier=identifier)
                        provenance = [ref]
                        xrefs.append(ref)
                    synonyms.append(
                        Synonym(
                            name=synonym,
                            provenance=provenance,
                            type=SynonymTypeDef.from_text(synonym_type),
                        )
                    )

                xrefs = sorted(
                    set(xrefs), key=lambda reference: (reference.prefix, reference.identifier)
                )

                term = Term(
                    reference=Reference(prefix=PREFIX, identifier=cui, name=_r["STR"]),
                    synonyms=synonyms,
                    xrefs=xrefs,
                )
                yield term
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Get HGNC terms."""
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    unhandled_entry_keys: typing.Counter[str] = Counter()
    unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values())
            if so_id
        },
        key=attrgetter("identifier"),
    )

    statuses = set()
    for entry in tqdm(entries, desc=f"Mapping {PREFIX}", unit="gene", unit_scale=True):
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )

        status = entry.pop("status")
        if status == "Approved":
            is_obsolete = False
        elif status not in statuses:
            statuses.add(status)
            logger.warning("UNHANDLED %s", status)
            is_obsolete = True
        else:
            raise ValueError(f"Unhandled status for hgnc:{identifier}: {status}")

        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX, identifier=identifier, name=symbol),
            is_obsolete=is_obsolete,
        )

        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral", identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto("mirbase", mirbase_id),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to,
                Reference(prefix="snornabase", identifier=snornabase_id),
            )

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))

        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids, gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        for previous_symbol in entry.pop("previous_symbol", []):
            term.append_synonym(Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandle_locus_types[locus_type][identifier] = term
        term.append_property("locus_type", locus_type)
        term.append_property("locus_group", locus_group)

        term.set_species(identifier="9606", name="Homo sapiens")

        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name for hgnc_id, term in v.items()}
                for k, v in unhandle_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandle_locus_types.items()):
            t = tabulate(
                [
                    (
                        hgnc_id,
                        term.name,
                        term.is_obsolete,
                        term.link,
                        ", ".join(p.link for p in term.provenance if p.link),
                    )
                    for hgnc_id, term in sorted(v.items())
                ],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandle_locus_type_counter = Counter(
        {locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
    )
    logger.warning(
        "Unhandled locus types:\n%s", tabulate(unhandle_locus_type_counter.most_common())
    )
    logger.warning("Unhandled keys:\n%s", tabulate(unhandled_entry_keys.most_common()))
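
# A minimal usage sketch (hypothetical; not part of the module above): run the full HGNC export
# via get_terms() and summarize how many terms were marked obsolete versus active by the status
# handling above.
def _summarize_hgnc_terms(version=None):
    from collections import Counter

    counts = Counter(term.is_obsolete for term in get_terms(version=version))
    print(f"active: {counts[False]}, obsolete: {counts[True]}")
    return counts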