def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
    """Get FamPlex equivalences as a mapping from FamPlex identifier to cross-references."""
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
    xrefs_url = f"{base_url}/equivalences.csv"
    xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=",", dtype=str)

    # Normalize nextprot families
    ns_remapping = {
        "NXP": "nextprot.family",
    }
    xrefs_df[0] = xrefs_df[0].map(lambda s: ns_remapping.get(s, s))
    # Strip the "FA:" banana from nextprot.family identifiers
    xrefs_df[1] = [
        xref_identifier if xref_prefix != "nextprot.family" else xref_identifier[len("FA:"):]
        for xref_prefix, xref_identifier in xrefs_df[[0, 1]].values
    ]

    # Keep only rows whose prefix could be normalized, and skip BEL entries
    xrefs_df[0] = xrefs_df[0].map(normalize_prefix)
    xrefs_df = xrefs_df[xrefs_df[0].notna()]
    xrefs_df = xrefs_df[xrefs_df[0] != "bel"]

    return multidict(
        (identifier, Reference(prefix=xref_prefix, identifier=xref_identifier))
        for xref_prefix, xref_identifier, identifier in xrefs_df.values
    )
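
# A minimal usage sketch for _get_xref_df above. The release tag "1.0.2" is a
# hypothetical placeholder, not a verified FamPlex version. Each value in the
# returned multidict is the list of Reference objects for one FamPlex entry.
def _print_famplex_xrefs(version: str = "1.0.2") -> None:
    for famplex_id, references in sorted(_get_xref_df(version).items()):
        print(famplex_id, ", ".join(reference.curie for reference in references))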
def get_grounder(
    prefix: Union[str, Iterable[str]],
    unnamed: Optional[Iterable[str]] = None,
    grounder_cls: Optional[Type[Grounder]] = None,
) -> Grounder:
    """Get a Gilda grounder for the given namespace."""
    unnamed = set() if unnamed is None else set(unnamed)
    if isinstance(prefix, str):
        prefix = [prefix]

    terms: List[gilda.term.Term] = []
    for p in prefix:
        try:
            p_terms = list(get_gilda_terms(p, identifiers_are_names=p in unnamed))
        except NoBuild:
            continue
        else:
            terms.extend(p_terms)

    terms = filter_out_duplicates(terms)
    terms_dict = multidict((term.norm_text, term) for term in terms)
    if grounder_cls is None:
        return Grounder(terms_dict)
    else:
        return grounder_cls(terms_dict)
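
# A minimal grounding sketch for the function above. The prefixes "mesh" and
# "chebi" are illustrative choices; any prefix with a Gilda term build works.
# Grounder.ground() is standard Gilda API and returns scored matches, best first.
def _demo_grounding() -> None:
    grounder = get_grounder(["mesh", "chebi"])
    for scored_match in grounder.ground("glucose"):
        term = scored_match.term
        print(term.db, term.id, term.entry_name, scored_match.score)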
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name='ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}', 'ITIS.sqlite'),
                        sqlite_path,
                    )
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall()
        )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # a parent of '0' marks a root (kingdom-level) term
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # a parent of "0" marks a root (kingdom-level) term
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
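
# A minimal sketch of consuming iter_terms above to build a child-to-parent
# index over the ITIS hierarchy. The version string is whatever ITIS release
# the caller has already resolved; no particular value is assumed.
def _build_parent_index(version: str) -> Mapping[str, List[str]]:
    index = defaultdict(list)
    for term in iter_terms(version=version):
        for parent in term.parents:
            index[term.identifier].append(parent.identifier)
    return dict(index)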
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Get ZFIN terms."""
    alt_ids_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    primary_to_alt_ids = defaultdict(set)
    for alt_id, zfin_id in alt_ids_df.values:
        primary_to_alt_ids[zfin_id].add(alt_id)

    human_orthologs = multisetdict(
        ensure_df(PREFIX, url=HUMAN_ORTHOLOGS, force=force, header=None, usecols=[0, 7], version=version).values
    )
    mouse_orthologs = multisetdict(
        ensure_df(PREFIX, url=MOUSE_ORTHOLOGS, force=force, header=None, usecols=[0, 5], version=version).values
    )
    fly_orthologs = multisetdict(
        ensure_df(PREFIX, url=FLY_ORTHOLOGS, force=force, header=None, usecols=[0, 5], version=version).values
    )
    entrez_mappings = dict(
        ensure_df(PREFIX, url=ENTREZ_MAPPINGS, force=force, header=None, usecols=[0, 3], version=version).values
    )
    uniprot_mappings = multidict(
        ensure_df(PREFIX, url=UNIPROT_MAPPINGS, force=force, header=None, usecols=[0, 3], version=version).values
    )

    df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # Strip the "SO:" prefix so only the local identifier remains
    df["sequence_ontology_id"] = df["sequence_ontology_id"].map(lambda x: x[len("SO:"):])
    so = {
        sequence_ontology_id: Reference.auto(prefix="SO", identifier=sequence_ontology_id)
        for sequence_ontology_id in df["sequence_ontology_id"].unique()
    }
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)
    for identifier, name, definition, _entity_type, sequence_ontology_id in tqdm(df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            definition=definition if definition != name else None,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so[sequence_ontology_id])
        # Entity type is redundant with the identifier
        # term.append_property("type", entity_type)
        for alt_id in primary_to_alt_ids[identifier]:
            term.append_alt(alt_id)
        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference(prefix="ncbigene", identifier=entrez_id))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mouse_orthologs.get(identifier, []):
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if mouse_ortholog:
                term.append_relationship(orthologous, mouse_ortholog)
        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference(prefix="flybase", identifier=flybase_id))
        yield term
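
# A minimal sketch of consuming get_terms above, counting ZFIN markers with at
# least one HGNC (human) ortholog. Term.relationships is the same mapping
# populated by append_relationship in the loop above.
def _count_human_orthologs(version: Optional[str] = None) -> int:
    return sum(
        any(reference.prefix == "hgnc" for reference in term.relationships.get(orthologous, []))
        for term in get_terms(version=version)
    )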
def get_grounder(prefix: str, url: Optional[str] = None) -> Grounder:
    """Get a Gilda grounder for the given namespace."""
    terms = list(get_gilda_terms(prefix, url=url))
    terms = filter_out_duplicates(terms)
    terms = multidict((term.norm_text, term) for term in terms)
    return Grounder(terms)
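
# A minimal sketch of the url override above; the OBO URL is a hypothetical
# placeholder for a custom export, not a real endpoint.
def _demo_url_override() -> None:
    grounder = get_grounder("go", url="https://example.org/go.obo")
    for scored_match in grounder.ground("apoptotic process"):
        print(scored_match.term.id, scored_match.score)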