def _parse(i, lines: Iterable[Tuple[str, str]]) -> Optional[Term]:
    """Build a term from the key/value pairs of one entry.

    :param i: Accepted for the caller's convenience; not read here.
    :param lines: Pairs of a line key and the text that belongs to it.
    :returns: The term for the entry, or ``None`` if a ``//`` key is present.
    """
    grouped: DefaultDict[str, List[str]] = defaultdict(list)
    for code, content in lines:
        grouped[code].append(content)
    entry: Mapping[str, List[str]] = dict(grouped)

    if "//" in entry:
        return None

    identifier = entry["AC"][0]
    names = entry.get("ID")
    term = Term(
        reference=Reference(
            prefix="uniprot.ptm",
            identifier=identifier,
            name=names[0] if names else None,
        ),
    )

    # (head to look for, head to put in its place), applied in this order
    rewrites = (
        ("PSI-MOD; MOD:", "MOD; "),
        ("ChEBI; CHEBI:", "CHEBI; "),
    )
    for xref_line in entry.get("DR", []):
        xref_line = xref_line.rstrip(".")
        for old_head, new_head in rewrites:
            if xref_line.startswith(old_head):
                xref_line = new_head + xref_line[len(old_head):]

        xref = Reference.from_curie(xref_line.replace("; ", ":"))
        if not xref:
            tqdm.write(f"Failure on xref {xref_line}")
            continue
        term.append_xref(xref)
    return term
def iter_terms(version: str) -> Iterable[Term]:
    """Yield one term for each group of consecutive rows sharing a CUI.

    Rows are read from the file opened by ``open_umls``, split on ``|`` and
    grouped on their first field. Groups that do not have exactly one
    English row flagged ``ISPREF == "Y"``, ``TS == "P"`` and ``STT == "PF"``
    are reported on the progress bar and skipped.

    :param version: Passed through to ``open_umls``.
    """
    language_col = "LAT - Language"
    tty_col = "TTY - Term Type in Source"
    synonym_cols = ["SAB - source name", "CODE", tty_col, "STR"]

    with open_umls(version=version) as file:
        progress = tqdm(file, unit_scale=True, desc="[umls] parsing")
        rows = (raw.decode("utf-8").strip().split("|") for raw in progress)
        for cui, group in itt.groupby(rows, key=operator.itemgetter(0)):
            df = pd.DataFrame(list(group), columns=RRF_COLUMNS)
            df = df[df[language_col] == "ENG"]

            ispref_mask = df["ISPREF - is preferred"] == "Y"
            ts_mask = df["TS - Term Status"] == "P"
            stt_mask = df["STT - String Type"] == "PF"
            pref_rows_df = df.loc[ispref_mask & ts_mask & stt_mask]

            n_preferred = len(pref_rows_df.index)
            if n_preferred != 1:
                progress.write(
                    f"no preferred term for umls:{cui}. got {n_preferred}"
                )
                continue

            # unknown term-type abbreviations raise KeyError here
            df[tty_col] = df[tty_col].map(SYNONYM_ABB.__getitem__)
            preferred_row = pref_rows_df.iloc[0]

            synonyms = []
            collected_xrefs = []
            for source, identifier, synonym_type, synonym in df[synonym_cols].values:
                norm_source = normalize_prefix(source)
                provenance = []
                if norm_source is not None and identifier:
                    ref = Reference(prefix=norm_source, identifier=identifier)
                    provenance.append(ref)
                    collected_xrefs.append(ref)
                synonyms.append(
                    Synonym(
                        name=synonym,
                        provenance=provenance,
                        type=SynonymTypeDef.from_text(synonym_type),
                    )
                )

            unique_xrefs = list(set(collected_xrefs))
            unique_xrefs.sort(key=operator.attrgetter("prefix", "identifier"))

            yield Term(
                reference=Reference(
                    prefix=PREFIX, identifier=cui, name=preferred_row["STR"]
                ),
                synonyms=synonyms,
                xrefs=unique_xrefs,
            )
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology type terms, then one term per row of the names table.

    :param version: Forwarded to every loader helper.
    :param force: Forwarded to every loader helper.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)

    # Resolve every gene type found in the second column to an SO reference
    so = {}
    for gtype in names_df[names_df.columns[1]].unique():
        so_id = GTYPE_TO_SO.get(gtype)
        if so_id is None:
            logger.warning(
                "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
                gtype,
            )
            continue
        so[gtype] = Reference.auto("SO", so_id)

    for gtype in sorted(so):
        yield Term(reference=so[gtype])

    missing_taxonomies = set()
    for organism, gtype, identifier, symbol, name in tqdm(names_df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=definitions.get(identifier),
        )
        if gtype and pd.notna(gtype) and gtype in so:
            term.append_parent(so[gtype])
        if pd.notna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if not hgnc_curie or pd.isna(hgnc_curie):
                continue
            hgnc_ortholog = Reference.from_curie(hgnc_curie, auto=True)
            if hgnc_ortholog is not None:
                term.append_relationship(orthologous, hgnc_ortholog)
            else:
                tqdm.write(f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            # report each unmapped abbreviation only once
            if organism not in missing_taxonomies:
                tqdm.write(f"missing mapping for species abbreviation: {organism}")
                missing_taxonomies.add(organism)
        else:
            term.append_relationship(
                from_species, Reference(NCBITAXON_PREFIX, taxonomy_id)
            )
        yield term

    if missing_taxonomies:
        tqdm.write(
            f"there were {len(missing_taxonomies)} missing taxa in flybase genes"
        )
def get_reference(self) -> Reference:
    """Return a ``kegg.genome`` reference carrying this object's identifier and name."""
    identifier, name = self.identifier, self.name
    return Reference(prefix="kegg.genome", identifier=identifier, name=name)
def iter_terms() -> Iterable[Term]:
    """Yield one term per source entity that has at least one relationship.

    Rows with any missing value are dropped from the relations table. Each
    remaining row adds a relationship to the term of its source entity when
    ``_get_typedef`` knows a relation for the row's target database, target
    type and modulation. Terms that end up with no relationships are not
    yielded.
    """
    rows = get_relations_df().dropna().values
    # The total must describe the rows actually iterated; counting the
    # un-filtered frame would leave the bar short of 100% when rows drop out.
    it = tqdm(rows, total=len(rows), desc='mapping to OBO', unit_scale=True)

    ref_term = {}
    for (
        source_db, source_id, source_name, modulation,
        target_type, target_db, target_id, target_name,
    ) in it:
        source = Reference(source_db.upper(), source_id, source_name)
        term = ref_term.get(source)
        if term is None:
            term = ref_term[source] = Term(reference=source)

        typedef = _get_typedef(
            target_db=target_db, target_type=target_type, modulation=modulation,
        )
        if typedef is not None:
            term.append_relationship(
                typedef, Reference(target_db.upper(), target_id, target_name),
            )

    for term in ref_term.values():
        if len(term.relationships) > 0:
            yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology type terms, then one term per row of the gene table.

    :param version: Forwarded to ``ensure_df`` for both downloaded tables.
    :param force: Forwarded to ``ensure_df`` for both downloaded tables.
    :raises KeyError: If a gene type in the table has no entry in ``POMBASE_TO_SO``.
    """
    orthologs_df = ensure_df(
        PREFIX, url=ORTHOLOGS_URL, force=force, header=None, version=version
    )
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for identifier, hgnc_symbols in orthologs_df.values:
        if hgnc_symbols == "NONE":
            continue
        for hgnc_symbol in hgnc_symbols.split("|"):
            hgnc_id = hgnc_symbol_to_id.get(hgnc_symbol)
            if hgnc_id is not None:
                identifier_to_hgnc_ids[identifier].add(hgnc_id)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)
    so = {
        gtype: Reference.auto("SO", POMBASE_TO_SO[gtype])
        for gtype in sorted(df[df.columns[6]].unique())
    }
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)

    for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(
        df.values
    ):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=name if pd.notna(name) else None,
        )
        # keep only what follows the "chromosome_" head of the column value
        term.append_property("chromosome", chromosome[len("chromosome_"):])
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
        # Sets of strings iterate in a hash-dependent order that changes from
        # run to run; sorting keeps the emitted relationships reproducible.
        for hgnc_id in sorted(identifier_to_hgnc_ids.get(identifier, ())):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        if uniprot_id and pd.notna(uniprot_id):
            term.append_relationship(
                has_gene_product, Reference.auto("uniprot", uniprot_id)
            )
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))
        yield term
def test_extract_definition(self):
    """Check the text and references pulled out of several definition strings."""
    expected_text = "Test Text."
    quoted = f'"{expected_text}"'
    cases = [
        (quoted, []),
        (quoted + " []", []),
        (quoted + " [PMID:1234]", [Reference("pubmed", "1234")]),
        (
            quoted + " [PMID:1234, PMID:1235]",
            [Reference("pubmed", "1234"), Reference("pubmed", "1235")],
        ),
    ]
    for s, expected_references in cases:
        with self.subTest(s=s):
            result = _extract_definition(s, prefix="chebi", identifier="XXX")
            actual_text, actual_references = result
            self.assertEqual(expected_text, actual_text)
            self.assertEqual(expected_references, actual_references)
def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
    """Yield one term per row of the DepMap cell line table.

    :param version: Forwarded to ``ensure``.
    :param force: Forwarded to ``ensure``.
    """
    df = ensure(force=force, version=version)
    for int_column in ("WTSI_Master_Cell_ID", "COSMICID"):
        df[int_column] = df[int_column].map(_fix_mangled_int)

    columns = [
        "DepMap_ID",
        "cell_line_name",
        "stripped_cell_line_name",
        "alias",
        "COSMICID",
        "RRID",
        "WTSI_Master_Cell_ID",
        "Sanger_Model_ID",
    ]
    for row in df[columns].values:
        (
            identifier, name, sname, aliases,
            cosmic_id, cellosaurus_id, _wtsi_id, _sanger_id,
        ) = row
        term = Term.from_triple(PREFIX, identifier, name)
        if pd.notna(sname):
            term.append_synonym(sname)
        if pd.notna(aliases):
            for alias in (part.strip() for part in aliases.split(",")):
                # an alias equal to the name adds nothing
                if alias != name:
                    term.append_synonym(alias)
        if pd.notna(cosmic_id):
            term.append_xref(Reference("cosmic.cell", cosmic_id))
        if pd.notna(cellosaurus_id):
            term.append_xref(Reference("cellosaurus", cellosaurus_id))

        # WTSI stands for Wellcome Trust Sanger Institute
        # Not sure where this prefix goes
        # if pd.notna(wtsi_id):
        #    term.append_xref(Reference("sanger", wtsi_id))

        # Not sure what this is
        # if pd.notna(sanger_id):
        #    term.append_xref(Reference("sanger", sanger_id))

        # TODO There's lots of other great ontological information in here. Next time.
        yield term
def test_extract_definition_with_escapes(self):
    """Check that escaped double quotes inside a definition are unescaped."""
    raw = """"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]"""
    text, references = _extract_definition(
        raw, strict=True, prefix="chebi", identifier="XXX"
    )
    self.assertEqual(
        """The canonical 3' splice site has the sequence "AG".""", text
    )
    self.assertEqual([Reference("pubmed", "1234")], references)
def test_extract_synonym(self):
    """Check parsing of synonym strings with and without specificity and type."""
    iupac_name = SynonymTypeDef(id="IUPAC_NAME", name="IUPAC NAME")
    synonym_typedefs = {"IUPAC_NAME": iupac_name}

    def orphanet():
        # fresh list per case so the expected objects never share state
        return [Reference("orphanet", "93938")]

    cases = [
        (
            '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]',
            Synonym(
                name="LTEC I",
                specificity="EXACT",
                type=iupac_name,
                provenance=orphanet(),
            ),
        ),
        (
            '"LTEC I" EXACT [Orphanet:93938]',
            Synonym(name="LTEC I", specificity="EXACT", provenance=orphanet()),
        ),
        (
            '"LTEC I" [Orphanet:93938]',
            Synonym(name="LTEC I", specificity="EXACT", provenance=orphanet()),
        ),
        (
            '"LTEC I" []',
            Synonym(name="LTEC I", specificity="EXACT"),
        ),
    ]
    for s, synonym in cases:
        with self.subTest(s=s):
            parsed = _extract_synonym(
                s, synonym_typedefs, prefix="chebi", identifier="XXX"
            )
            self.assertEqual(synonym, parsed)
def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
    """Yield one term per row of the CCLE table, cross-referenced to DepMap when possible.

    :param version: Forwarded to ``ensure_df``.
    :param force: Forwarded to ``ensure_df``.
    """
    table = ensure_df(version=version, force=force)
    for identifier, depmap_id, name in table.values:
        # a missing name is passed on as None
        label = name if pd.notna(name) else None
        term = Term.from_triple(PREFIX, identifier, label)
        if pd.notna(depmap_id):
            term.append_xref(Reference("depmap", depmap_id))
        yield term
def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
    """Yield one term per data row of the UniProt table, linked to its taxon.

    :param version: Forwarded to ``ensure``.
    :param force: Forwarded to ``ensure``.
    """
    path = ensure(version=version, force=force)
    with open_reader(path) as reader:
        next(reader)  # discard the header row
        for uniprot_id, name, taxonomy_id in tqdm(reader, desc="Mapping UniProt"):
            term = Term.from_triple(prefix=PREFIX, identifier=uniprot_id, name=name)
            # TODO add gene encodes from relationship
            # TODO add description
            species = Reference(prefix=NCBITAXON_PREFIX, identifier=taxonomy_id)
            term.append_relationship(from_species, species)
            yield term
def _extract_references(s: str) -> Iterable[Reference]:
    """Yield a reference for each parseable CURIE in a comma-separated string.

    :param s: CURIEs separated by commas, with optional surrounding whitespace.
    :returns: References for the pieces that ``Reference.from_curie`` accepts.
    """
    for curie in s.split(','):
        # "a:1, b:2" splits into "a:1" and " b:2"; drop the padding so the
        # prefix is not parsed with a leading space
        curie = curie.strip()
        if not curie:
            # empty input or a trailing comma leaves nothing to parse
            continue
        reference = Reference.from_curie(curie)
        if reference is not None:
            yield reference
# Prefix strings for the KEGG genes, genome and pathway namespaces.
KEGG_GENES_PREFIX = "kegg.genes"
KEGG_GENOME_PREFIX = "kegg.genome"
KEGG_PATHWAY_PREFIX = "kegg.pathway"

# NOTE(review): presumably the root URL that KEGG REST requests are built
# from; its use is not visible in this excerpt.
BASE = "http://rest.kegg.jp"

# NOTE(review): presumably identifiers to be skipped during processing; the
# code that consults this set is not visible in this excerpt.
SKIP = {
    "T03333",
    "T03334",
    "T03356",
    "T03357",
    "T03358",
    "T03359",
}

# Relation "in KEGG taxon", declared with the reference of ``from_species``
# as its parent.
from_kegg_species = TypeDef(
    reference=Reference.default("inKeggTaxon", "in KEGG taxon"),
    parents=from_species.reference,
)


@dataclass
class KEGGGenome:
    """A data structure for a parsed line of the KEGG Genomes list."""

    # Required fields
    identifier: str
    name: str
    # Fields that may be None
    code: Optional[str]
    long_code: Optional[str]
    taxonomy_id: Optional[str]

    def annotate_term(self, term: Term) -> None:
# NOTE(review): the indented statements below are the tail of a generator
# whose header lies before this excerpt; they are kept unchanged.
        term = ref_term.get(source)
        if term is None:
            term = ref_term[source] = Term(reference=source)
        typedef = _get_typedef(target_db=target_db, target_type=target_type, modulation=modulation)
        if typedef is not None:
            term.append_relationship(typedef, Reference(target_db.upper(), target_id, target_name))
    # Only terms that received at least one relationship are yielded
    for term in ref_term.values():
        if len(term.relationships) > 0:
            yield term


# TODO fill out rest
# Maps (target database, target type, modulation) to the relation to use.
_typedefs: Mapping[Tuple[str, str, str], TypeDef] = {
    ('go', 'biological process', 'activator'): TypeDef(Reference('RO', '0002213', 'positively regulates')),
    ('go', 'biological process', 'inhibitor'): TypeDef(Reference('RO', '0002212', 'negatively regulates')),
}

# Combinations already reported as unhandled, so each is written only once.
_logged = set()


def _get_typedef(target_db, target_type, modulation) -> Optional[TypeDef]:
    """Look up the relation for a (target_db, target_type, modulation) combination.

    :returns: The matching entry of ``_typedefs``, or ``None`` when there is
        none. The first time an unknown combination is seen, a message is
        written through ``tqdm.write``.
    """
    t = (target_db, target_type, modulation)
    rv = _typedefs.get(t)
    if rv is not None:
        return rv
    if t not in _logged:
        _logged.add(t)
        tqdm.write(f'no strategy for: {target_db} {target_type} {modulation}')
def get_content() -> Tuple[List[Term], List[TypeDef]]:
    """Load the CONSO terms and type definitions from the tab-separated files.

    The files are read in this order: type definitions, authors, terms,
    synonyms, cross-references, relations. Terms named ``WITHDRAWN`` are not
    loaded. Relation rows that cannot be handled are reported with ``print``
    and skipped.

    :returns: A pair of (all loaded terms, all type definitions).
    :raises KeyError: If a row refers to a term, author or type definition
        that was not loaded.
    """
    with open(TYPEDEF_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        typedefs: Dict[str, TypeDef] = {
            identifier: TypeDef(
                reference=Reference(prefix=CONSO, identifier=identifier, name=name),
                namespace=namespace,
                xrefs=list(_extract_references(xrefs)),
                is_transitive=transitive == 'true',
                comment=comment,
            )
            for identifier, name, namespace, xrefs, transitive, comment in reader
        }

    typedefs.update(part_of=part_of, has_role=has_role)
    del typedefs['bel']

    with open(AUTHORS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        authors: Mapping[str, Reference] = {
            orcid_identifier: Reference(
                prefix='orcid',
                identifier=orcid_identifier,
                name=author,
            )
            for orcid_identifier, author in reader
        }

    with open(TERMS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        terms: Dict[str, Term] = {}
        for conso_id, author_key, name, namespace, references, description in reader:
            if name == 'WITHDRAWN':
                continue
            terms[conso_id] = Term(
                reference=Reference(
                    prefix=CONSO,
                    identifier=conso_id,
                    name=name,
                ),
                provenance=list(_extract_references(references)),
                namespace=namespace,
                definition=description,
            )
            terms[conso_id].relationships[typedefs['author']].append(
                authors[author_key])

    with open(SYNONYMS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        for conso_id, synonym, references, specificity in reader:
            # Provenance holds Reference objects, not raw CURIE strings, so
            # every piece is parsed; pieces that do not parse are dropped.
            provenance = []
            if references and references != '?':
                for curie in references.split(','):
                    curie = curie.strip()
                    if not curie:
                        continue
                    reference = Reference.from_curie(curie)
                    if reference is not None:
                        provenance.append(reference)
            # '?' marks an unspecified specificity; treat it as EXACT
            specificity = 'EXACT' if specificity == '?' else specificity
            terms[conso_id].synonyms.append(
                Synonym(synonym, specificity, provenance=provenance))

    with open(XREFS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        for conso_id, database, identifier in reader:
            # BEL entries are stored as a property instead of a cross-reference
            if database.lower() == 'bel':
                terms[conso_id].append_property('bel', identifier)
            else:
                terms[conso_id].append_xref(
                    Reference(prefix=database, identifier=identifier))

    with open(RELATIONS_PATH) as file:
        # number lines from 1 so messages match the file (header is line 1)
        reader = enumerate(csv.reader(file, delimiter='\t'), start=1)
        _ = next(reader)  # skip the header
        handled_relations = {'is_a'} | set(typedefs)
        for line, (source_ns, source_id, _source_name, relation, target_ns,
                   target_id, target_name) in reader:
            if relation not in handled_relations:
                print(
                    f'{RELATIONS_PATH} can not handle line {line} because unhandled relation: {relation}'
                )
                continue
            if source_ns != CONSO and target_ns != CONSO:
                print(
                    f'{RELATIONS_PATH}: skipping line {line} because neither entity is from {CONSO}'
                )
                continue
            if source_ns != CONSO:
                print(f'{RELATIONS_PATH} can not handle line {line} because of'
                      f' inverse relation definition to external identifier')
                continue
            target = Reference(prefix=target_ns, identifier=target_id,
                               name=target_name)
            if relation == 'is_a':
                terms[source_id].append_parent(target)
            else:
                terms[source_id].append_relationship(typedefs[relation], target)

    return list(terms.values()), list(typedefs.values())
def iter_terms(version: str, autodownload: bool = False) -> Iterable[Term]:
    """Yield one term for each group of consecutive ``MRCONSO.RRF`` rows sharing a CUI.

    Groups that do not have exactly one English row flagged
    ``ISPREF == 'Y'``, ``TS == 'P'`` and ``STT == 'PF'`` are reported on the
    progress bar and skipped.

    :param version: Used to build the archive name and download URL.
    :param autodownload: When true, obtain the archive through ``ensure_path``;
        otherwise expect it at the location given by ``RAW_MODULE``.
    :raises FileNotFoundError: If ``autodownload`` is false and the archive is absent.
    """
    name = f'umls-{version}-mrconso.zip'
    url = f'https://download.nlm.nih.gov/umls/kss/{version}/{name}'
    if not autodownload:
        path = RAW_MODULE.get(PREFIX, version, name)
        if not path.exists():
            raise FileNotFoundError(
                f'UMLS needs to be downloaded manually still and moved to {path}. '
                f'See https://www.nlm.nih.gov/research/umls/index.html',
            )
    else:
        # FIXME needs automated scrapy step where you put in user/password
        path = ensure_path(PREFIX, url=url, version=version)

    tty_col = 'TTY - Term Type in Source'
    synonym_cols = ['SAB - source name', 'CODE', tty_col, 'STR']

    with zipfile.ZipFile(path) as zip_file, zip_file.open('MRCONSO.RRF', mode='r') as file:
        progress = tqdm(file, unit_scale=True, desc='[umls] parsing')
        rows = (raw.decode('utf-8').strip().split('|') for raw in progress)
        for cui, group in itt.groupby(rows, key=operator.itemgetter(0)):
            df = pd.DataFrame(list(group), columns=RRF_COLUMNS)
            df = df[df['LAT - Language'] == 'ENG']

            ispref_mask = df['ISPREF - is preferred'] == 'Y'
            ts_mask = df['TS - Term Status'] == 'P'
            stt_mask = df['STT - String Type'] == 'PF'
            pref_rows_df = df.loc[ispref_mask & ts_mask & stt_mask]

            n_preferred = len(pref_rows_df.index)
            if n_preferred != 1:
                progress.write(
                    f'no preferred term for umls:{cui}. got {n_preferred}'
                )
                continue

            # unknown term-type abbreviations raise KeyError here
            df[tty_col] = df[tty_col].map(synonym_abb.__getitem__)
            preferred_row = pref_rows_df.iloc[0]

            synonyms = []
            collected_xrefs = []
            for source, identifier, synonym_type, synonym in df[synonym_cols].values:
                norm_source = normalize_prefix(source)
                provenance = []
                if norm_source is not None and identifier:
                    ref = Reference(prefix=norm_source, identifier=identifier)
                    provenance.append(ref)
                    collected_xrefs.append(ref)
                synonyms.append(
                    Synonym(
                        name=synonym,
                        provenance=provenance,
                        type=SynonymTypeDef.from_text(synonym_type),
                    )
                )

            unique_xrefs = list(set(collected_xrefs))
            unique_xrefs.sort(key=operator.attrgetter('prefix', 'identifier'))

            yield Term(
                reference=Reference(
                    prefix=PREFIX, identifier=cui, name=preferred_row['STR']
                ),
                synonyms=synonyms,
                xrefs=unique_xrefs,
            )
# Prefix strings for the KEGG genes, genome and pathway namespaces.
KEGG_GENES_PREFIX = 'kegg.genes'
KEGG_GENOME_PREFIX = 'kegg.genome'
KEGG_PATHWAY_PREFIX = 'kegg.pathway'

# NOTE(review): presumably the root URL that KEGG REST requests are built
# from; its use is not visible in this excerpt.
BASE = 'http://rest.kegg.jp'

# NOTE(review): presumably identifiers to be skipped during processing; the
# code that consults this set is not visible in this excerpt.
SKIP = {
    'T03333',
    'T03334',
    'T03356',
    'T03357',
    'T03358',
    'T03359',
}

# Relation 'in KEGG taxon', declared with the reference of ``from_species``
# as its parent.
from_kegg_species = TypeDef(
    reference=Reference.default('inKeggTaxon', 'in KEGG taxon'),
    parents=from_species.reference,
)


@dataclass
class KEGGGenome:
    """A data structure for a parsed line of the KEGG Genomes list."""

    # Required fields
    identifier: str
    name: str
    # Fields that may be None
    code: Optional[str]
    long_code: Optional[str]
    taxonomy_id: Optional[str]

    def annotate_term(self, term: Term) -> None: