def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited xref cell into (reference, note) pairs.

    Each entry looks like ``prefix:identifier(note)``. Entries missing the
    ``(note)`` part, or whose CURIE can not be parsed, are skipped with a
    warning rather than aborting the whole parse.

    :param s: The raw cell value; may be NaN (treated as no xrefs).
    :return: A list of parsed ``(Reference, note)`` tuples.
    """
    if pd.isna(s):
        return []
    rv = []
    for xref in s.split("|"):
        # Normalize the non-standard Protein Ontology prefix spellings.
        xref = xref.replace("protein ontology:PR:", "PR:")
        xref = xref.replace("protein ontology:PR_", "PR:")
        # FIX: split only at the FIRST "(" — the original ``xref.split("(")``
        # raised ValueError whenever the note itself contained a "(" (three or
        # more parts), mis-reporting a valid entry as "missing (" and dropping it.
        xref_curie, sep, note = xref.partition("(")
        if not sep:
            logger.warning("xref missing (: %s", xref)
            continue
        note = note.rstrip(")")
        try:
            reference = Reference.from_curie(xref_curie)
        except ValueError:
            logger.warning("can not parse CURIE: %s", xref)
            continue
        if reference is None:
            logger.warning("reference is None after parsing: %s", xref)
            continue
        rv.append((reference, note))
    return rv
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited xref cell into (reference, note) pairs.

    Entries have the shape ``prefix:identifier(note)``; malformed or
    unparsable entries are logged and skipped. NaN yields an empty list.
    """
    if pd.isna(s):
        return []
    pairs = []
    for raw in s.split('|'):
        # Canonicalize the Protein Ontology prefix variants up front.
        entry = (
            raw
            .replace('protein ontology:PR:', 'PR:')
            .replace('protein ontology:PR_', 'PR:')
        )
        try:
            curie, note = entry.split('(')
        except ValueError:
            logger.warning('xref missing (: %s', entry)
            continue
        note = note.rstrip(')')
        try:
            ref = Reference.from_curie(curie)
        except ValueError:
            logger.warning('can not parse CURIE: %s', entry)
            continue
        if ref is None:
            logger.warning('reference is None after parsing: %s', entry)
            continue
        pairs.append((ref, note))
    return pairs
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex release (git tag/ref) to download from GitHub.
    :param force: If true, re-download the source files even if cached.
    :yields: One term per FamPlex entity, carrying its definition/provenance
        and isa/partof relationships in both directions.
    """
    base_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}'

    entities_url = f'{base_url}/entities.csv'
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)

    relations_url = f'{base_url}/relations.csv'
    relations_df = ensure_df(
        PREFIX, url=relations_url, version=version, header=None, sep=',', dtype=str, force=force,
    )

    definitions_url = f'{base_url}/descriptions.csv'
    definitions_df = ensure_df(
        PREFIX, url=definitions_url, version=version, header=None, sep=',', dtype=str, force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    # TODO add xrefs
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == 'HGNC':
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, h_name)
            h = Reference(prefix='hgnc', identifier=h_identifier, name=h_name)
        elif h_ns == 'FPLX':
            h = Reference(prefix='fplx', identifier=h_name, name=h_name)
        elif h_ns == 'UP':
            continue
        else:
            # FIX: the original called logger.exception() outside an except
            # block and then did a bare ``raise`` with no active exception,
            # which surfaces as an unhelpful RuntimeError.
            raise ValueError(f'unhandled head namespace: {h_ns}')
        if t_ns == 'HGNC':
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, t_name)
            t = Reference(prefix='hgnc', identifier=t_identifier, name=t_name)
        elif t_ns == 'FPLX':
            t = Reference(prefix='fplx', identifier=t_name, name=t_name)
        elif t_ns == 'UP':
            # FIX: the original tested ``h_ns == 'UP'`` here (copy-paste bug),
            # so rows with a UniProt *target* fell through to the bare raise.
            continue
        else:
            raise ValueError(f'unhandled tail namespace: {t_ns}')
        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for entity, in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )
        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                # FIX: use the module logger (was the root ``logging`` module).
                logger.warning('unhandled relation %s', r)
        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logger.warning('unhandled relation %s', r)
        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex release (git tag/ref) to download from GitHub.
    :param force: If true, re-download the source files even if cached.
    :yields: One term per FamPlex entity, carrying its definition/provenance,
        xrefs, and isa/partof relationships in both directions.
    """
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"

    entities_url = f"{base_url}/entities.csv"
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)

    relations_url = f"{base_url}/relations.csv"
    relations_df = ensure_df(
        PREFIX, url=relations_url, version=version, header=None, sep=",", dtype=str, force=force,
    )

    definitions_url = f"{base_url}/descriptions.csv"
    definitions_df = ensure_df(
        PREFIX, url=definitions_url, version=version, header=None, sep=",", dtype=str, force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    id_xrefs = _get_xref_df(version)

    hgnc_name_to_id = get_name_id_mapping("hgnc")
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == "HGNC":
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s", PREFIX, h_name)
            h = Reference(prefix="hgnc", identifier=h_identifier, name=h_name)
        elif h_ns == "FPLX":
            h = Reference(prefix="fplx", identifier=h_name, name=h_name)
        elif h_ns == "UP":
            continue
        else:
            # FIX: the original called logger.exception() outside an except
            # block and then did a bare ``raise`` with no active exception,
            # which surfaces as an unhelpful RuntimeError.
            raise ValueError(f"unhandled head namespace: {h_ns}")
        if t_ns == "HGNC":
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s", PREFIX, t_name)
            t = Reference(prefix="hgnc", identifier=t_identifier, name=t_name)
        elif t_ns == "FPLX":
            t = Reference(prefix="fplx", identifier=t_name, name=t_name)
        elif t_ns == "UP":
            # FIX: the original tested ``h_ns == "UP"`` here (copy-paste bug),
            # so rows with a UniProt *target* fell through to the bare raise.
            continue
        else:
            raise ValueError(f"unhandled tail namespace: {t_ns}")
        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity, ) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )
        for xref_reference in id_xrefs.get(entity, []):
            term.append_xref(xref_reference)
        for r, t in out_edges.get(reference, []):
            if r == "isa" and t.prefix == "fplx":
                term.append_parent(t)
            elif r == "isa":
                term.append_relationship(is_a, t)
            elif r == "partof":
                term.append_relationship(part_of, t)
            else:
                # FIX: use the module logger (was the root ``logging`` module).
                logger.warning("unhandled relation %s", r)
        for r, h in in_edges.get(reference, []):
            if r == "isa":
                term.append_relationship(has_member, h)
            elif r == "partof":
                term.append_relationship(has_part, h)
            else:
                logger.warning("unhandled relation %s", r)
        yield term
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Get terms.

    Yields one placeholder term per referenced Sequence Ontology class,
    followed by one term per ZFIN marker with its alt ids, Entrez xref,
    UniProt gene products, and human/mouse/fly orthologs attached.
    """
    alts_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    alt_ids_by_primary = defaultdict(set)
    for secondary_id, primary_id in alts_df.values:
        alt_ids_by_primary[primary_id].add(secondary_id)

    # Ortholog and mapping tables: each keeps only the ZFIN id column plus
    # the relevant foreign-identifier column.
    human_orthologs = multisetdict(
        ensure_df(PREFIX, url=HUMAN_ORTHOLOGS, force=force, header=None,
                  usecols=[0, 7], version=version).values
    )
    mouse_orthologs = multisetdict(
        ensure_df(PREFIX, url=MOUSE_ORTHOLOGS, force=force, header=None,
                  usecols=[0, 5], version=version).values
    )
    fly_orthologs = multisetdict(
        ensure_df(PREFIX, url=FLY_ORTHOLOGS, force=force, header=None,
                  usecols=[0, 5], version=version).values
    )
    entrez_mappings = dict(
        ensure_df(PREFIX, url=ENTREZ_MAPPINGS, force=force, header=None,
                  usecols=[0, 3], version=version).values
    )
    uniprot_mappings = multidict(
        ensure_df(PREFIX, url=UNIPROT_MAPPINGS, force=force, header=None,
                  usecols=[0, 3], version=version).values
    )

    markers_df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # Strip the "SO:" prefix so the bare identifier can be used as a key.
    markers_df["sequence_ontology_id"] = markers_df["sequence_ontology_id"].map(
        lambda curie: curie[len("SO:"):]
    )
    so_references = {
        so_id: Reference.auto(prefix="SO", identifier=so_id)
        for so_id in markers_df["sequence_ontology_id"].unique()
    }
    # Emit the SO parent classes first, in identifier order.
    for _, so_reference in sorted(so_references.items()):
        yield Term(reference=so_reference)

    for identifier, name, definition, _entity_type, so_id in tqdm(markers_df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            # A definition that merely repeats the name carries no information.
            definition=definition if definition != name else None,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so_references[so_id])
        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for secondary_id in alt_ids_by_primary[identifier]:
            term.append_alt(secondary_id)
        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mouse_orthologs.get(identifier, []):
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if mouse_ortholog:
                term.append_relationship(orthologous, mouse_ortholog)
        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference("flybase", flybase_id))
        yield term