Beispiel #1
0
def _parse(i, lines: Iterable[Tuple[str, str]]) -> Optional[Term]:
    """Build a term from the (key, value) pairs of a single record.

    :param i: Not used by this function.
    :param lines: Pairs of line key and line value; values sharing a key are
        collected in the order they appear.
    :returns: ``None`` when any pair uses the key ``//``; otherwise a term whose
        identifier is the first ``AC`` value, whose name is the first ``ID``
        value (if any), and whose xrefs come from the ``DR`` values.
    :raises KeyError: If the record has no ``AC`` key.
    """
    collected: DefaultDict[str, List[str]] = defaultdict(list)
    for code, content in lines:
        collected[code].append(content)
    record: Mapping[str, List[str]] = dict(collected)

    if "//" in record:
        return None

    names = record.get("ID")
    term = Term(
        reference=Reference(
            prefix="uniprot.ptm",
            identifier=record["AC"][0],
            name=names[0] if names else None,
        ),
    )

    # (leading text as written in the record, text it is replaced with)
    rewrites = (
        ("PSI-MOD; MOD:", "MOD; "),
        ("ChEBI; CHEBI:", "CHEBI; "),
    )
    for entry in record.get("DR", []):
        entry = entry.rstrip(".")
        for old_head, new_head in rewrites:
            if entry.startswith(old_head):
                entry = new_head + entry[len(old_head):]

        # "PREFIX; identifier" becomes the CURIE "PREFIX:identifier"
        xref = Reference.from_curie(entry.replace("; ", ":"))
        if not xref:
            tqdm.write(f"Failure on xref {entry}")
            continue
        term.append_xref(xref)
    return term
Beispiel #2
0
def iter_terms(version: str) -> Iterable[Term]:
    """Yield one term per UMLS concept that has a single preferred English row.

    Rows are read from the file opened by ``open_umls`` and grouped by their
    first field (the CUI), so consecutive rows with the same CUI form one
    concept. Concepts that do not have exactly one preferred English row are
    reported on the progress bar and skipped.

    :param version: Version passed through to ``open_umls``.
    :yields: A term per concept carrying its English synonyms and the sorted,
        de-duplicated references built from the synonyms' sources.
    """
    with open_umls(version=version) as file:
        progress = tqdm(file, unit_scale=True, desc="[umls] parsing")
        records = (raw.decode("utf-8").strip().split("|") for raw in progress)
        for cui, group in itt.groupby(records, key=operator.itemgetter(0)):
            df = pd.DataFrame(list(group), columns=RRF_COLUMNS)
            df = df[df["LAT - Language"] == "ENG"]

            ispref_is_y = df["ISPREF - is preferred"] == "Y"
            status_is_p = df["TS - Term Status"] == "P"
            string_type_is_pf = df["STT - String Type"] == "PF"
            preferred_df = df.loc[(ispref_is_y & status_is_p & string_type_is_pf, )]
            preferred_count = len(preferred_df.index)
            if preferred_count != 1:
                progress.write(
                    f"no preferred term for umls:{cui}. got {preferred_count}"
                )
                continue

            # Raises KeyError for a term type missing from SYNONYM_ABB
            df["TTY - Term Type in Source"] = df[
                "TTY - Term Type in Source"].map(SYNONYM_ABB.__getitem__)

            preferred_name = preferred_df.iloc[0]["STR"]
            synonym_columns = [
                "SAB - source name", "CODE", "TTY - Term Type in Source", "STR"
            ]

            synonyms = []
            xrefs = []
            for source, identifier, synonym_type, synonym in df[synonym_columns].values:
                prefix = normalize_prefix(source)
                if prefix is not None and identifier:
                    xref = Reference(prefix=prefix, identifier=identifier)
                    xrefs.append(xref)
                    provenance = [xref]
                else:
                    # Unknown source or empty code: keep the synonym, drop the reference
                    provenance = []
                synonyms.append(
                    Synonym(
                        name=synonym,
                        provenance=provenance,
                        type=SynonymTypeDef.from_text(synonym_type),
                    ))

            xrefs = sorted(set(xrefs),
                           key=operator.attrgetter("prefix", "identifier"))

            yield Term(
                reference=Reference(prefix=PREFIX,
                                    identifier=cui,
                                    name=preferred_name),
                synonyms=synonyms,
                xrefs=xrefs,
            )
Beispiel #3
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology gene-type terms, then one term per FlyBase gene row.

    :param version: Version passed to each of the data-loading helpers.
    :param force: Passed to each of the data-loading helpers as ``force``.
    :yields: First a term per mapped gene type (sorted by gene type), then a
        term per row of the names table with parent, synonym, human ortholog
        and species annotations where available.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)
    unmapped_organisms = set()

    # Gene type (second column of the names table) -> Sequence Ontology reference
    so = {}
    for gtype in names_df[names_df.columns[1]].unique():
        so_id = GTYPE_TO_SO.get(gtype)
        if so_id is not None:
            so[gtype] = Reference.auto("SO", so_id)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
            gtype)

    for gtype in sorted(so):
        yield Term(reference=so[gtype])

    for organism, gtype, identifier, symbol, name in tqdm(names_df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=definitions.get(identifier),
        )
        if gtype and pd.notna(gtype) and gtype in so:
            term.append_parent(so[gtype])
        if pd.notna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if hgnc_curie and not pd.isna(hgnc_curie):
                ortholog = Reference.from_curie(hgnc_curie, auto=True)
                if ortholog is not None:
                    term.append_relationship(orthologous, ortholog)
                else:
                    tqdm.write(
                        f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            # Report each unmapped species abbreviation only once
            if organism not in unmapped_organisms:
                tqdm.write(f"missing mapping for species abbreviation: {organism}")
                unmapped_organisms.add(organism)
        else:
            term.append_relationship(from_species,
                                     Reference(NCBITAXON_PREFIX, taxonomy_id))
        yield term

    if unmapped_organisms:
        tqdm.write(
            f"there were {len(unmapped_organisms)} missing taxa in flybase genes"
        )
Beispiel #4
0
 def get_reference(self) -> Reference:
     """Build the ``kegg.genome`` reference for this object.

     :returns: A reference using this object's ``identifier`` and ``name``.
     """
     reference = Reference(prefix="kegg.genome", identifier=self.identifier, name=self.name)
     return reference
Beispiel #5
0
def iter_terms() -> Iterable[Term]:
    """Yield one term per source entity that has at least one mapped relation.

    Rows of the relations table that contain any missing value are ignored.
    Each remaining row adds a relationship to the term of its source entity
    when ``_get_typedef`` knows a relation for the row's target database,
    target type and modulation.

    :yields: Terms with at least one relationship, in order of first appearance
        of their source entity.
    """
    df = get_relations_df()
    # Filter once and size the progress bar from the rows that are actually
    # iterated; using the unfiltered length overstated the total.
    rows = df.dropna().values
    it = tqdm(rows, total=len(rows), desc='mapping to OBO', unit_scale=True)
    ref_term = {}
    for source_db, source_id, source_name, modulation, target_type, target_db, target_id, target_name in it:
        source = Reference(source_db.upper(), source_id, source_name)
        term = ref_term.get(source)
        if term is None:
            term = ref_term[source] = Term(reference=source)

        typedef = _get_typedef(target_db=target_db, target_type=target_type, modulation=modulation)
        if typedef is not None:
            term.append_relationship(typedef, Reference(target_db.upper(), target_id, target_name))

    for term in ref_term.values():
        if len(term.relationships) > 0:
            yield term
Beispiel #6
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology gene-type terms, then one term per gene row.

    :param version: Version passed to ``ensure_df`` for both tables.
    :param force: Passed to ``ensure_df`` as ``force``.
    :yields: First a term per gene type (sorted by gene type), then a term per
        row of the gene table with chromosome, parent, species, human ortholog,
        gene product and synonym annotations.
    :raises KeyError: If a gene type has no entry in ``POMBASE_TO_SO``.
    """
    orthologs_df = ensure_df(
        PREFIX, url=ORTHOLOGS_URL, force=force, header=None, version=version,
    )
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for identifier, hgnc_symbols in orthologs_df.values:
        # The literal "NONE" marks a gene without orthologs
        if hgnc_symbols != "NONE":
            for hgnc_symbol in hgnc_symbols.split("|"):
                hgnc_id = hgnc_symbol_to_id.get(hgnc_symbol)
                if hgnc_id is not None:
                    identifier_to_hgnc_ids[identifier].add(hgnc_id)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)

    # Gene type (seventh column) -> Sequence Ontology reference
    so = {}
    for gtype in sorted(df[df.columns[6]].unique()):
        so[gtype] = Reference.auto("SO", POMBASE_TO_SO[gtype])
    for gtype in sorted(so):
        yield Term(reference=so[gtype])

    chromosome_prefix_length = len("chromosome_")
    for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(
            df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=name if pd.notna(name) else None,
        )
        term.append_property("chromosome", chromosome[chromosome_prefix_length:])
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
        for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
            hgnc_reference = Reference.auto("hgnc", hgnc_id)
            term.append_relationship(orthologous, hgnc_reference)
        if uniprot_id and pd.notna(uniprot_id):
            uniprot_reference = Reference.auto("uniprot", uniprot_id)
            term.append_relationship(has_gene_product, uniprot_reference)
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))
        yield term
Beispiel #7
0
    def test_extract_definition(self):
        """Check definition extraction with no, empty, one, and two references."""
        expected_text = "Test Text."
        quoted = f'"{expected_text}"'

        cases = [
            (quoted, []),
            (f'{quoted} []', []),
            (f'{quoted} [PMID:1234]', [Reference("pubmed", "1234")]),
            (
                f'{quoted} [PMID:1234, PMID:1235]',
                [Reference("pubmed", pmid) for pmid in ("1234", "1235")],
            ),
        ]
        for s, expected_references in cases:
            with self.subTest(s=s):
                result = _extract_definition(s, prefix="chebi", identifier="XXX")
                actual_text, actual_references = result
                self.assertEqual(expected_text, actual_text)
                self.assertEqual(expected_references, actual_references)
Beispiel #8
0
def iter_terms(version: Optional[str] = None,
               force: bool = False) -> Iterable[Term]:
    """Yield one term per DepMap cell line row.

    :param version: Passed to ``ensure`` as ``version``.
    :param force: Passed to ``ensure`` as ``force``.
    :yields: A term per row with the stripped name and aliases as synonyms and
        the COSMIC and RRID columns as xrefs when present.
    """
    df = ensure(force=force, version=version)
    for column in ("WTSI_Master_Cell_ID", "COSMICID"):
        df[column] = df[column].map(_fix_mangled_int)

    selected = df[[
        "DepMap_ID",
        "cell_line_name",
        "stripped_cell_line_name",
        "alias",
        "COSMICID",
        "RRID",
        "WTSI_Master_Cell_ID",
        "Sanger_Model_ID",
    ]]
    # The last two columns (WTSI = Wellcome Trust Sanger Institute master cell
    # identifier, and the Sanger model identifier) are read but not emitted:
    # it is not yet clear which prefix they belong to.
    # TODO There's lots of other great ontological information in here. Next time.
    for identifier, name, sname, aliases, cosmic_id, cellosaurus_id, _wtsi_id, _sanger_id in selected.values:
        term = Term.from_triple(PREFIX, identifier, name)
        if pd.notna(sname):
            term.append_synonym(sname)
        if pd.notna(aliases):
            for alias in map(str.strip, aliases.split(",")):
                # An alias equal to the primary name adds nothing
                if alias != name:
                    term.append_synonym(alias)
        if pd.notna(cosmic_id):
            term.append_xref(Reference("cosmic.cell", cosmic_id))
        if pd.notna(cellosaurus_id):
            term.append_xref(Reference("cellosaurus", cellosaurus_id))
        yield term
Beispiel #9
0
 def test_extract_definition_with_escapes(self):
     """Check that escaped quotes inside a definition are unescaped on extraction."""
     expected_text = """The canonical 3' splice site has the sequence "AG"."""
     expected_references = [Reference("pubmed", "1234")]
     s = """"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]"""
     result = _extract_definition(s, strict=True, prefix="chebi", identifier="XXX")
     actual_text, actual_references = result
     self.assertEqual(expected_text, actual_text)
     self.assertEqual(expected_references, actual_references)
Beispiel #10
0
    def test_extract_synonym(self):
        """Check synonym parsing with and without a type, specificity, and provenance."""
        iupac_name = SynonymTypeDef(id="IUPAC_NAME", name="IUPAC NAME")
        synonym_typedefs = {"IUPAC_NAME": iupac_name}

        typed = Synonym(
            name="LTEC I",
            specificity="EXACT",
            type=iupac_name,
            provenance=[Reference("orphanet", "93938")],
        )
        explicit_specificity = Synonym(
            name="LTEC I",
            specificity="EXACT",
            provenance=[Reference("orphanet", "93938")],
        )
        implicit_specificity = Synonym(
            name="LTEC I",
            specificity="EXACT",
            provenance=[Reference("orphanet", "93938")],
        )
        bare = Synonym(name="LTEC I", specificity="EXACT")

        cases = [
            (typed, '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]'),
            (explicit_specificity, '"LTEC I" EXACT [Orphanet:93938]'),
            (implicit_specificity, '"LTEC I" [Orphanet:93938]'),
            (bare, '"LTEC I" []'),
        ]
        for expected, s in cases:
            with self.subTest(s=s):
                actual = _extract_synonym(
                    s,
                    synonym_typedefs,
                    prefix="chebi",
                    identifier="XXX",
                )
                self.assertEqual(expected, actual)
Beispiel #11
0
def iter_terms(version: Optional[str] = None,
               force: bool = False) -> Iterable[Term]:
    """Iterate over CCLE Cells.

    :param version: Passed to ``ensure_df`` as ``version``.
    :param force: Passed to ``ensure_df`` as ``force``.
    :yields: A term per row, with a ``depmap`` xref when the row has a DepMap
        identifier. Missing names are passed on as ``None``.
    """
    df = ensure_df(version=version, force=force)
    for identifier, depmap_id, name in df.values:
        # ``pd.isnull`` is an alias of ``pd.isna``, so a single check suffices
        if pd.isna(name):
            name = None
        term = Term.from_triple(PREFIX, identifier, name)
        if pd.notna(depmap_id):
            term.append_xref(Reference("depmap", depmap_id))
        yield term
Beispiel #12
0
def iter_terms(version: Optional[str] = None,
               force: bool = False) -> Iterable[Term]:
    """Yield one UniProt term per data row, each linked to its species.

    :param version: Passed to ``ensure`` as ``version``.
    :param force: Passed to ``ensure`` as ``force``.
    :yields: A term per row with a ``from_species`` relationship to the row's
        taxonomy identifier.
    """
    # TODO add gene encodes from relationship
    # TODO add description
    source = ensure(version=version, force=force)
    with open_reader(source) as reader:
        next(reader)  # discard the header row
        rows = tqdm(reader, desc="Mapping UniProt")
        for uniprot_id, name, taxonomy_id in rows:
            term = Term.from_triple(prefix=PREFIX, identifier=uniprot_id, name=name)
            species = Reference(prefix=NCBITAXON_PREFIX, identifier=taxonomy_id)
            term.append_relationship(from_species, species)
            yield term
Beispiel #13
0
def _extract_references(s: str) -> Iterable[Reference]:
    """Yield a reference for each comma-separated CURIE in ``s`` that parses.

    Whitespace around each entry is removed before parsing, so lists written
    as ``"a:1, b:2"`` are handled, and empty entries (including an empty
    ``s``) are skipped.

    :param s: Comma-separated CURIEs.
    :yields: The references for which ``Reference.from_curie`` returned a value.
    """
    for curie in s.split(','):
        curie = curie.strip()
        if not curie:
            continue
        reference = Reference.from_curie(curie)
        if reference is not None:
            yield reference
Beispiel #14
0
# Prefixes of the KEGG resources.
KEGG_GENES_PREFIX = "kegg.genes"
KEGG_GENOME_PREFIX = "kegg.genome"
KEGG_PATHWAY_PREFIX = "kegg.pathway"

# Root URL of the KEGG REST service.
BASE = "http://rest.kegg.jp"

# Identifiers to skip.
# NOTE(review): presumably KEGG genome identifiers that cannot be processed -
# confirm against the code that reads SKIP.
SKIP = {
    "T03333", "T03334",
    "T03356", "T03357", "T03358", "T03359",
}

# Relation typedef "in KEGG taxon" (default reference ``inKeggTaxon``), declared
# with the reference of ``from_species`` as its parent.
# NOTE(review): ``parents`` receives a single reference rather than a list -
# confirm that TypeDef accepts this.
from_kegg_species = TypeDef(
    reference=Reference.default("inKeggTaxon", "in KEGG taxon"),
    parents=from_species.reference,
)


@dataclass
class KEGGGenome:
    """A data structure for a parsed line of the KEGG Genomes list."""

    identifier: str
    name: str
    code: Optional[str]
    long_code: Optional[str]
    taxonomy_id: Optional[str]

    def annotate_term(self, term: Term) -> None:
Beispiel #15
0
        term = ref_term.get(source)
        if term is None:
            term = ref_term[source] = Term(reference=source)

        typedef = _get_typedef(target_db=target_db, target_type=target_type, modulation=modulation)
        if typedef is not None:
            term.append_relationship(typedef, Reference(target_db.upper(), target_id, target_name))

    for term in ref_term.values():
        if len(term.relationships) > 0:
            yield term


# Relation to use for a (target database, target type, modulation) triple.
# TODO fill out rest
_typedefs: Mapping[Tuple[str, str, str], TypeDef] = {
    ('go', 'biological process', 'activator'): TypeDef(
        Reference('RO', '0002213', 'positively regulates'),
    ),
    ('go', 'biological process', 'inhibitor'): TypeDef(
        Reference('RO', '0002212', 'negatively regulates'),
    ),
}

# Triples already reported as unhandled, so each one is reported only once.
_logged = set()


def _get_typedef(target_db, target_type, modulation) -> Optional[TypeDef]:
    """Look up the relation for a target database, target type and modulation.

    :returns: The matching entry of ``_typedefs``, or ``None`` when there is
        none. The first miss for a given triple is reported via ``tqdm.write``;
        later misses for the same triple are silent.
    """
    key = (target_db, target_type, modulation)
    typedef = _typedefs.get(key)
    if typedef is None and key not in _logged:
        _logged.add(key)
        tqdm.write(f'no strategy for: {target_db} {target_type} {modulation}')
    return typedef
Beispiel #16
0
def get_content() -> Tuple[List[Term], List[TypeDef]]:
    """Load the CONSO terms and type definitions from the tab-separated resource files.

    The files are read in this order: type definitions, authors, terms,
    synonyms, cross-references, relations. The first row of every file is
    treated as a header and skipped.

    :returns: A pair of (terms, type definitions), each in insertion order.
    :raises KeyError: If a row refers to a term, author or relation key that
        was not loaded (for example a synonym row whose term was skipped as
        ``WITHDRAWN``), or if the type definition file has no ``bel`` row.
    """
    # Type definitions, keyed by their identifier
    with open(TYPEDEF_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        typedefs: Dict[str, TypeDef] = {
            identifier: TypeDef(
                reference=Reference(prefix=CONSO,
                                    identifier=identifier,
                                    name=name),
                namespace=namespace,
                xrefs=list(_extract_references(xrefs)),
                is_transitive=transitive == 'true',
                comment=comment,
            )
            for identifier, name, namespace, xrefs, transitive, comment in
            reader
        }
        # Add the externally defined part_of/has_role typedefs and drop the 'bel' entry
        typedefs.update(part_of=part_of, has_role=has_role)
        del typedefs['bel']

    # Authors, keyed by ORCID identifier
    with open(AUTHORS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        authors: Mapping[str, Reference] = {
            orcid_identifier: Reference(
                prefix='orcid',
                identifier=orcid_identifier,
                name=author,
            )
            for orcid_identifier, author in reader
        }

    # Terms, keyed by CONSO identifier; rows named 'WITHDRAWN' are not loaded
    with open(TERMS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header

        terms: Dict[str, Term] = {}
        for conso_id, author_key, name, namespace, references, description in reader:
            if name == 'WITHDRAWN':
                continue
            terms[conso_id] = Term(
                reference=Reference(
                    prefix=CONSO,
                    identifier=conso_id,
                    name=name,
                ),
                provenance=list(_extract_references(references)),
                namespace=namespace,
                definition=description,
            )
            # Link the term to its author through the 'author' typedef
            terms[conso_id].relationships[typedefs['author']].append(
                authors[author_key])

    # Synonyms; a '?' in the references or specificity column means "not given"
    with open(SYNONYMS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        for conso_id, synonym, references, specificity in reader:
            # NOTE(review): provenance is passed here as a list of plain strings,
            # whereas term provenance above goes through _extract_references -
            # confirm Synonym accepts strings.
            references = ([r.strip() for r in references.split(',')]
                          if references and references != '?' else [])
            specificity = ('EXACT' if specificity == '?' else specificity)
            terms[conso_id].synonyms.append(
                Synonym(synonym, specificity, provenance=references))

    # Cross-references; rows for the 'bel' database are stored as a property instead
    with open(XREFS_PATH) as file:
        reader = csv.reader(file, delimiter='\t')
        _ = next(reader)  # skip the header
        for conso_id, database, identifier in reader:
            if database.lower() == 'bel':
                terms[conso_id].append_property('bel', identifier)
            else:
                terms[conso_id].append_xref(
                    Reference(prefix=database, identifier=identifier))

    # Relations; numbering starts at 1 so the header is line 1 in the messages below
    with open(RELATIONS_PATH) as file:
        reader = enumerate(csv.reader(file, delimiter='\t'), start=1)
        _ = next(reader)  # skip the header
        handled_relations = {'is_a'} | set(typedefs)
        for line, (source_ns, source_id, _source_name, relation, target_ns,
                   target_id, target_name) in reader:
            if relation not in handled_relations:
                print(
                    f'{RELATIONS_PATH} can not handle line {line} because unhandled relation: {relation}'
                )
                continue

            if source_ns != CONSO and target_ns != CONSO:
                print(
                    f'{RELATIONS_PATH}: skipping line {line} because neither entity is from {CONSO}'
                )
                continue

            # Only relations whose source is a CONSO term can be attached to a term
            if source_ns != CONSO:
                print(f'{RELATIONS_PATH} can not handle line {line} because of'
                      f' inverse relation definition to external identifier')
                continue

            target = Reference(prefix=target_ns,
                               identifier=target_id,
                               name=target_name)
            # 'is_a' becomes a parent; every other relation uses its typedef
            if relation == 'is_a':
                terms[source_id].append_parent(target)
            else:
                terms[source_id].append_relationship(typedefs[relation],
                                                     target)

    return list(terms.values()), list(typedefs.values())
Beispiel #17
0
def iter_terms(version: str, autodownload: bool = False) -> Iterable[Term]:
    """Yield one term per UMLS concept that has a single preferred English row.

    Rows are read from ``MRCONSO.RRF`` inside the versioned zip archive and
    grouped by their first field (the CUI), so consecutive rows with the same
    CUI form one concept. Concepts that do not have exactly one preferred
    English row are reported on the progress bar and skipped.

    :param version: UMLS version used in the archive name and download URL.
    :param autodownload: If true, obtain the archive with ``ensure_path``;
        otherwise expect it at the path given by ``RAW_MODULE``.
    :yields: A term per concept carrying its English synonyms and the sorted,
        de-duplicated references built from the synonyms' sources.
    :raises FileNotFoundError: If ``autodownload`` is false and the archive is
        not present.
    """
    name = f'umls-{version}-mrconso.zip'
    url = f'https://download.nlm.nih.gov/umls/kss/{version}/{name}'
    if autodownload:
        # FIXME needs automated scrapy step where you put in user/password
        path = ensure_path(PREFIX, url=url, version=version)
    else:
        path = RAW_MODULE.get(PREFIX, version, name)
        if not path.exists():
            raise FileNotFoundError(
                f'UMLS needs to be downloaded manually still and moved to  {path}. '
                f'See https://www.nlm.nih.gov/research/umls/index.html', )

    with zipfile.ZipFile(path) as zip_file, zip_file.open('MRCONSO.RRF', mode='r') as file:
        progress = tqdm(file, unit_scale=True, desc='[umls] parsing')
        records = (raw.decode('utf-8').strip().split('|') for raw in progress)
        for cui, group in itt.groupby(records, key=operator.itemgetter(0)):
            df = pd.DataFrame(list(group), columns=RRF_COLUMNS)
            df = df[df['LAT - Language'] == 'ENG']

            ispref_is_y = df['ISPREF - is preferred'] == 'Y'
            status_is_p = df['TS - Term Status'] == 'P'
            string_type_is_pf = df['STT - String Type'] == 'PF'
            preferred_df = df.loc[(ispref_is_y & status_is_p & string_type_is_pf, )]
            preferred_count = len(preferred_df.index)
            if preferred_count != 1:
                progress.write(
                    f'no preferred term for umls:{cui}. got {preferred_count}'
                )
                continue

            # Raises KeyError for a term type missing from synonym_abb
            df['TTY - Term Type in Source'] = df[
                'TTY - Term Type in Source'].map(synonym_abb.__getitem__)

            preferred_name = preferred_df.iloc[0]['STR']
            synonym_columns = [
                'SAB - source name', 'CODE', 'TTY - Term Type in Source',
                'STR'
            ]

            synonyms = []
            xrefs = []
            for source, identifier, synonym_type, synonym in df[synonym_columns].values:
                prefix = normalize_prefix(source)
                if prefix is not None and identifier:
                    xref = Reference(prefix=prefix, identifier=identifier)
                    xrefs.append(xref)
                    provenance = [xref]
                else:
                    # Unknown source or empty code: keep the synonym, drop the reference
                    provenance = []
                synonyms.append(
                    Synonym(
                        name=synonym,
                        provenance=provenance,
                        type=SynonymTypeDef.from_text(synonym_type),
                    ))

            xrefs = sorted(set(xrefs),
                           key=operator.attrgetter('prefix', 'identifier'))

            yield Term(
                reference=Reference(prefix=PREFIX,
                                    identifier=cui,
                                    name=preferred_name),
                synonyms=synonyms,
                xrefs=xrefs,
            )
Beispiel #18
0
# Prefixes of the KEGG resources.
KEGG_GENES_PREFIX = 'kegg.genes'
KEGG_GENOME_PREFIX = 'kegg.genome'
KEGG_PATHWAY_PREFIX = 'kegg.pathway'

# Root URL of the KEGG REST service.
BASE = 'http://rest.kegg.jp'

# Identifiers to skip.
# NOTE(review): presumably KEGG genome identifiers that cannot be processed -
# confirm against the code that reads SKIP.
SKIP = {
    'T03333', 'T03334',
    'T03356', 'T03357', 'T03358', 'T03359',
}

# Relation typedef "in KEGG taxon" (default reference ``inKeggTaxon``), declared
# with the reference of ``from_species`` as its parent.
# NOTE(review): ``parents`` receives a single reference rather than a list -
# confirm that TypeDef accepts this.
from_kegg_species = TypeDef(
    reference=Reference.default('inKeggTaxon', 'in KEGG taxon'),
    parents=from_species.reference,
)


@dataclass
class KEGGGenome:
    """A data structure for a parsed line of the KEGG Genomes list."""

    identifier: str
    name: str
    code: Optional[str]
    long_code: Optional[str]
    taxonomy_id: Optional[str]

    def annotate_term(self, term: Term) -> None: