Example #1
def iter_terms() -> Iterable[Term]:
    """Iterate over terms for KEGG Genome."""
    errors = 0
    for kegg_genome in iter_kegg_genomes():
        xrefs = []
        if kegg_genome.taxonomy_id is not None:
            taxonomy_name = pyobo.get_name('ncbitaxon', kegg_genome.taxonomy_id)
            if taxonomy_name is None:
                errors += 1
                tqdm.write(f'could not find name for taxonomy:{kegg_genome.taxonomy_id}')
            xrefs.append(Reference(
                prefix='ncbitaxon',
                identifier=kegg_genome.taxonomy_id,
                name=taxonomy_name,
            ))

        term = Term(
            reference=Reference(
                prefix='kegg.genome',
                identifier=kegg_genome.identifier,
                name=kegg_genome.name,
            ),
            xrefs=xrefs,
        )
        yield term

    logger.info('[%s] unable to find %d taxonomy names in NCBI', KEGG_GENOME_PREFIX, errors)
Example #2
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    name = _find_text(tree, "name")
    description = _find_text(tree, "description")
    if description:
        description = description.strip().replace("\n", " ")
    identifier = _find_text(tree, "identifier")
    if identifier is None:
        raise ValueError("missing identifier")
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall("experiments"):
        experiment_name = _find_text(experiment, "name")
        experiment_identifier = _find_text(experiment, "identifier")
        if experiment_identifier is None:
            continue
        term.append_relationship(
            has_part,
            Reference(
                "gwascentral.experiment",
                identifier=experiment_identifier,
                name=experiment_name,
            ),
        )
    return term
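The _find_text helper used above isn't shown in this example. A minimal sketch of what it presumably does, wrapping findtext() and normalizing missing or empty text to None (an assumption based on how it is called here):

from typing import Optional


def _find_text(element, tag: str) -> Optional[str]:
    """Return the stripped text of the first matching child element, or None."""
    # assumption: empty strings are treated the same as a missing tag
    text = element.findtext(tag)
    if text is None:
        return None
    text = text.strip()
    return text or None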
Example #3
def _iter_genome_terms(
    *,
    list_pathway_path: str,
    link_pathway_path: str,
    kegg_genome: KEGGGenome,
) -> Iterable[Term]:
    terms = {}
    with open(list_pathway_path) as file:
        list_pathway_lines = [line.strip() for line in file]
    for line in list_pathway_lines:
        pathway_id, name = [part.strip() for part in line.split("\t")]
        pathway_id = pathway_id[len("path:"):]

        terms[pathway_id] = term = Term.from_triple(
            prefix=KEGG_PATHWAY_PREFIX,
            identifier=pathway_id,
            name=name,
        )

        # Annotate species information
        kegg_genome.annotate_term(term)

        # Annotate the non-species specific code
        _start = min(i for i, e in enumerate(pathway_id) if e.isnumeric())
        pathway_code = pathway_id[_start:]
        term.append_relationship(
            species_specific,
            Reference(prefix=KEGG_PATHWAY_PREFIX,
                      identifier=f"map{pathway_code}"),
        )

    for pathway_id, protein_ids in _get_link_pathway_map(
            link_pathway_path).items():
        term = terms.get(pathway_id)
        if term is None:
            tqdm.write(
                f"could not find kegg.pathway:{pathway_id} for {kegg_genome.name}"
            )
            continue
        for protein_id in protein_ids:
            term.append_relationship(
                has_part,
                Reference(
                    prefix=KEGG_GENES_PREFIX,
                    identifier=protein_id,
                ),
            )

    yield from terms.values()
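For reference, the species-code stripping above works like this (hypothetical identifier, not taken from the source):

# hypothetical KEGG pathway identifier for a human pathway
pathway_id = "hsa00010"
_start = min(i for i, e in enumerate(pathway_id) if e.isnumeric())  # first digit at index 3
assert pathway_id[_start:] == "00010"
# so the species-agnostic reference becomes kegg.pathway:map00010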
Example #4
def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
    xrefs_url = f"{base_url}/equivalences.csv"
    xrefs_df = ensure_df(PREFIX,
                         url=xrefs_url,
                         version=version,
                         header=None,
                         sep=",",
                         dtype=str)

    # Normalize nextprot families
    ns_remapping = {
        "NXP": "nextprot.family",
    }
    xrefs_df[0] = xrefs_df[0].map(lambda s: ns_remapping.get(s, s))
    xrefs_df[1] = [
        xref_identifier
        if xref_prefix != "nextprot.family" else xref_identifier[len("FA:"):]
        for xref_prefix, xref_identifier in xrefs_df[[0, 1]].values
    ]

    xrefs_df[0] = xrefs_df[0].map(normalize_prefix)
    xrefs_df = xrefs_df[xrefs_df[0].notna()]
    xrefs_df = xrefs_df[xrefs_df[0] != "bel"]
    return multidict(
        (identifier, Reference(xref_prefix, xref_identifier))
        for xref_prefix, xref_identifier, identifier in xrefs_df.values)
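Several of these examples rely on a multidict helper that isn't shown. A minimal sketch of what its usages imply, grouping key/value pairs into a dict of lists (an assumption, not the library's actual implementation):

from collections import defaultdict
from typing import DefaultDict, Iterable, List, Tuple, TypeVar

K = TypeVar("K")
V = TypeVar("V")


def multidict(pairs: Iterable[Tuple[K, V]]) -> DefaultDict[K, List[V]]:
    """Group (key, value) pairs, keeping every value seen for a repeated key."""
    rv: DefaultDict[K, List[V]] = defaultdict(list)
    for key, value in pairs:
        rv[key].append(value)
    return rv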
Example #5
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms for KEGG Genome."""
    errors = 0
    for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
        if kegg_genome.identifier in SKIP:
            continue
        term = Term.from_triple(
            prefix=KEGG_GENOME_PREFIX,
            identifier=kegg_genome.identifier,
            name=kegg_genome.name,
        )
        if kegg_genome.taxonomy_id is not None:
            taxonomy_name = pyobo.get_name("ncbitaxon",
                                           kegg_genome.taxonomy_id)
            if taxonomy_name is None:
                errors += 1
                logger.debug(
                    f"[{KEGG_GENOME_PREFIX}] could not find name for taxonomy:{kegg_genome.taxonomy_id}"
                )
            term.append_xref(
                Reference(
                    prefix="ncbitaxon",
                    identifier=kegg_genome.taxonomy_id,
                    name=taxonomy_name,
                ))
        yield term

    logger.info("[%s] unable to find %d taxonomy names in NCBI",
                KEGG_GENOME_PREFIX, errors)
Example #6
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f"{PREFIX} download"):
        try:
            path = ensure_path(
                PREFIX,
                "phenotype",
                version=version,
                url=f"https://www.gwascentral.org/phenotype/HGVPM{n}?format=json",
                name=f"HGVPM{n}.json",
                force=force,
            )
        except OSError as e:
            tqdm.write(f"{n}: {e}")
            continue
        with open(path) as file:
            j = json.load(file)

        description = j.get("description")
        if description is not None:
            description = description.strip().replace("\n", " ")
        term = Term(
            reference=Reference(PREFIX, j["identifier"], j["name"]),
            definition=description,
        )
        yield term
Example #7
    def test_extract_definition(self):
        """Test extracting a definition."""
        expected_text = "Test Text."

        for s, expected_references in [
            (f'"{expected_text}"', []),
            (f'"{expected_text}" []', []),
            (f'"{expected_text}" [PMID:1234]', [Reference('pubmed', '1234')]),
            (f'"{expected_text}" [PMID:1234, PMID:1235]',
             [Reference('pubmed', '1234'),
              Reference('pubmed', '1235')]),
        ]:
            with self.subTest(s=s):
                actual_text, actual_references = _extract_definition(
                    s, prefix='chebi', identifier='XXX')
                self.assertEqual(expected_text, actual_text)
                self.assertEqual(expected_references, actual_references)
Example #8
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    url = f"https://unmtid-shinyapps.net/download/DrugCentral/{version}/structures.smiles.tsv"
    df = ensure_df(PREFIX, url=url, version=version, force=force)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        term = Term.from_triple(prefix=PREFIX,
                                identifier=drugcentral_id,
                                name=drugcentral_name)
        term.append_xref(Reference(prefix="inchikey", identifier=inchi_key))
        term.append_property("smiles", smiles)
        term.append_property("inchi", inchi)
        if pd.notna(cas):
            term.append_xref(Reference(prefix="cas", identifier=cas))
        yield term
Example #9
    def test_extract_definition_with_escapes(self):
        """Test extracting a definition with escapes in it."""
        expected_text = '''The canonical 3' splice site has the sequence "AG".'''
        s = '''"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]'''
        actual_text, actual_references = _extract_definition(
            s, strict=True, prefix='chebi', identifier='XXX')
        self.assertEqual(expected_text, actual_text)
        self.assertEqual([Reference('pubmed', '1234')], actual_references)
Example #10
    def test_extract_definition_with_escapes(self):
        """Test extracting a definition with escapes in it."""
        expected_text = """The canonical 3' splice site has the sequence "AG"."""
        s = """"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]"""
        actual_text, actual_references = _extract_definition(
            s, strict=True, prefix="chebi", identifier="XXX")
        self.assertEqual(expected_text, actual_text)
        self.assertEqual([Reference("pubmed", "1234")], actual_references)
Example #11
    def test_extract_definition(self):
        """Test extracting a definition."""
        expected_text = "Test Text."

        for s, expected_references in [
            (f'"{expected_text}"', []),
            (f'"{expected_text}" []', []),
            (f'"{expected_text}" [PMID:1234]', [Reference("pubmed", "1234")]),
            (
                f'"{expected_text}" [PMID:1234, PMID:1235]',
                [Reference("pubmed", "1234"),
                 Reference("pubmed", "1235")],
            ),
        ]:
            with self.subTest(s=s):
                actual_text, actual_references = _extract_definition(
                    s, prefix="chebi", identifier="XXX")
                self.assertEqual(expected_text, actual_text)
                self.assertEqual(expected_references, actual_references)
Example #12
def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df()

    df['aliases'] = df['aliases'].map(lambda s: s.split('|')
                                      if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values,
              total=len(slim_df.index),
              desc=f'mapping {PREFIX}')
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(
                    f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX,
                                identifier=complexportal_id,
                                name=name),
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
Example #13
    def test_extract_synonym(self):
        """Test extracting synonym strings."""
        iupac_name = SynonymTypeDef(id="IUPAC_NAME", name="IUPAC NAME")
        synonym_typedefs = {
            "IUPAC_NAME": iupac_name,
        }

        for synonym, s in [
            (
                Synonym(
                    name="LTEC I",
                    specificity="EXACT",
                    type=iupac_name,
                    provenance=[Reference("orphanet", "93938")],
                ),
                '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]',
            ),
            (
                Synonym(name="LTEC I",
                        specificity="EXACT",
                        provenance=[Reference("orphanet", "93938")]),
                '"LTEC I" EXACT [Orphanet:93938]',
            ),
            (
                Synonym(name="LTEC I",
                        specificity="EXACT",
                        provenance=[Reference("orphanet", "93938")]),
                '"LTEC I" [Orphanet:93938]',
            ),
            (
                Synonym(name="LTEC I", specificity="EXACT"),
                '"LTEC I" []',
            ),
        ]:
            with self.subTest(s=s):
                self.assertEqual(
                    synonym,
                    _extract_synonym(s,
                                     synonym_typedefs,
                                     prefix="chebi",
                                     identifier="XXX"),
                )
Example #14
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for xref in s.split('|'):
        entity_id, note = xref.split('(')
        note = note.rstrip(')')
        prefix, identifier = entity_id.split(':', 1)
        rv.append((Reference(prefix=prefix, identifier=identifier), note))
    return rv
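A quick usage sketch for _parse_xrefs (the input string is hypothetical, following the prefix:identifier(note) format the parser expects):

# hypothetical ComplexPortal-style cross-reference string
refs = _parse_xrefs('pubmed:12345(see-also)|go:0005634(identity)')
# -> [(Reference(prefix='pubmed', identifier='12345'), 'see-also'),
#     (Reference(prefix='go', identifier='0005634'), 'identity')]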
Example #15
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    df = ensure_df(PREFIX, url=URL, version=version)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        xrefs = [
            Reference(prefix="smiles", identifier=smiles),
            Reference(prefix="inchi", identifier=inchi),
            Reference(prefix="inchikey", identifier=inchi_key),
        ]

        if pd.notna(cas):
            xrefs.append(Reference(prefix="cas", identifier=cas))

        yield Term(
            reference=Reference(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name),
            xrefs=xrefs,
        )
Example #16
    def test_extract_synonym(self):
        """Test extracting synonym strings."""
        iupac_name = SynonymTypeDef(id='IUPAC_NAME', name='IUPAC NAME')
        synonym_typedefs = {
            'IUPAC_NAME': iupac_name,
        }

        for synonym, s in [
            (
                Synonym(
                    name='LTEC I',
                    specificity='EXACT',
                    type=iupac_name,
                    provenance=[Reference('orphanet', '93938')],
                ),
                '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]',
            ),
            (
                Synonym(name='LTEC I',
                        specificity='EXACT',
                        provenance=[Reference('orphanet', '93938')]),
                '"LTEC I" EXACT [Orphanet:93938]',
            ),
            (
                Synonym(name='LTEC I',
                        specificity='EXACT',
                        provenance=[Reference('orphanet', '93938')]),
                '"LTEC I" [Orphanet:93938]',
            ),
            (
                Synonym(name='LTEC I', specificity='EXACT'),
                '"LTEC I" []',
            ),
        ]:
            with self.subTest(s=s):
                self.assertEqual(
                    synonym,
                    _extract_synonym(s,
                                     synonym_typedefs,
                                     prefix='chebi',
                                     identifier='XXX'))
Example #17
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    name = tree.find('name').text
    description = tree.find('description').text
    if description:
        description = description.strip().replace('\n', ' ')
    identifier = tree.find('identifier').text
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall('experiments'):
        experiment_name = experiment.find('name').text
        experiment_id = experiment.find('identifier').text
        term.append_relationship(
            has_part,
            Reference(
                'gwascentral.experiment',
                identifier=experiment_id,
                name=experiment_name,
            ))
    return term
Example #18
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    name = tree.find("name").text
    description = tree.find("description").text
    if description:
        description = description.strip().replace("\n", " ")
    identifier = tree.find("identifier").text
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall("experiments"):
        experiment_name = experiment.find("name").text
        experiment_id = experiment.find("identifier").text
        term.append_relationship(
            has_part,
            Reference(
                "gwascentral.experiment",
                identifier=experiment_id,
                name=experiment_name,
            ),
        )
    return term
Example #19
def _parse_members(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for member in s.split('|'):
        entity_id, count = member.split('(')
        count = count.rstrip(')')
        if ':' in entity_id:
            prefix, identifier = entity_id.split(':', 1)
        else:
            prefix, identifier = 'uniprot', entity_id
        rv.append((Reference(prefix=prefix, identifier=identifier), count))
    return rv
Example #20
def _parse_members(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for member in s.split("|"):
        entity_id, count = member.split("(")
        count = count.rstrip(")")
        if ":" in entity_id:
            prefix, identifier = entity_id.split(":", 1)
        else:
            prefix, identifier = "uniprot", entity_id
        rv.append((Reference(prefix=prefix, identifier=identifier), count))
    return rv
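As with _parse_xrefs, a usage sketch for _parse_members (hypothetical input; a bare identifier without a prefix defaults to uniprot):

# hypothetical ComplexPortal members string with stoichiometry counts
members = _parse_members("P12345(2)|chebi:15422(1)")
# -> [(Reference(prefix="uniprot", identifier="P12345"), "2"),
#     (Reference(prefix="chebi", identifier="15422"), "1")]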
Example #21
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get CGNC terms."""
    df = ensure_df(PREFIX,
                   url=URL,
                   name=f"{PREFIX}.tsv",
                   force=force,
                   header=0,
                   names=HEADER)
    for i, (cgnc_id, entrez_id, ensembl_id, name, synonym_1, synonym_2, _,
            _) in enumerate(df.values):
        if pd.isna(cgnc_id):
            logger.warning(f"row {i} CGNC ID is none")
            continue

        try:
            int(cgnc_id)
        except ValueError:
            logger.warning(f"row {i} CGNC ID is not int-like: {cgnc_id}")
            continue

        term = Term.from_triple(
            prefix=PREFIX,
            identifier=cgnc_id,
            name=name,
        )
        term.set_species(identifier="9031", name="Gallus gallus")
        if entrez_id and pd.notna(entrez_id):
            term.append_xref(Reference(prefix="ncbigene",
                                       identifier=entrez_id))
        if pd.notna(ensembl_id):
            term.append_xref(Reference(prefix="ensembl",
                                       identifier=ensembl_id))
        if synonym_1 and pd.notna(synonym_1):
            term.append_synonym(synonym_1)
        if synonym_2 and pd.notna(synonym_2):
            term.append_synonym(synonym_2)
        yield term
Example #22
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX,
                                        name='ITIS.sqlite',
                                        version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}',
                                     'ITIS.sqlite'), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX,
                                       identifier=str(identifier),
                                       name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
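The LONGNAMES_QUERY and HIERARCHY_QUERY constants aren't shown. Plausible sketches against the ITIS SQLite schema, inferred from how the cursors unpack the rows (assumptions, not the source's actual constants):

# assumption: ITIS's longnames table maps each TSN to its complete name
LONGNAMES_QUERY = "SELECT tsn, completename FROM longnames"
# assumption: taxonomic_units carries the child -> parent TSN hierarchy
HIERARCHY_QUERY = "SELECT tsn, parent_tsn FROM taxonomic_units"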
Example #23
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over ChEMBL compound's names."""
    op = get_path(version=version)
    logger.info('opening connection to %s', op)
    with closing(sqlite3.connect(op)) as conn:
        logger.info('using connection %s', conn)
        with closing(conn.cursor()) as cursor:
            logger.info('using cursor %s', cursor)
            cursor.execute(QUERY)
            for chembl_id, name in cursor.fetchall():
                # TODO add xrefs to smiles, inchi, inchikey here
                xrefs = []
                yield Term(
                    reference=Reference(prefix=PREFIX,
                                        identifier=chembl_id,
                                        name=name),
                    xrefs=xrefs,
                )
Example #24
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over ChEMBL compounds."""
    with chembl_downloader.connect(version=version) as conn:
        logger.info("using connection %s", conn)
        with closing(conn.cursor()) as cursor:
            logger.info("using cursor %s", cursor)
            cursor.execute(QUERY)
            for chembl_id, name, smiles, inchi, inchi_key in cursor.fetchall():
                # TODO add xrefs?
                term = Term.from_triple(prefix=PREFIX,
                                        identifier=chembl_id,
                                        name=name)
                if smiles:
                    term.append_property("smiles", smiles)
                if inchi:
                    term.append_property("inchi", inchi)
                if inchi_key:
                    term.append_xref(Reference("inchikey", inchi_key))
                yield term
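The QUERY constant isn't shown in this example. A plausible sketch against ChEMBL's public schema that would yield the five columns the loop unpacks (an assumption, not the source's actual query):

# assumption: joins molecule_dictionary to compound_structures on molregno
QUERY = """
SELECT md.chembl_id, md.pref_name, cs.canonical_smiles, cs.standard_inchi, cs.standard_inchi_key
FROM molecule_dictionary md
LEFT JOIN compound_structures cs ON md.molregno = cs.molregno
"""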
Example #25
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f'{PREFIX} download'):
        try:
            path = ensure_path(
                PREFIX,
                'phenotype',
                version=version,
                url=f'https://www.gwascentral.org/phenotype/HGVPM{n}?format=json',
                name=f'HGVPM{n}.json',
            )
        except OSError as e:
            tqdm.write(f'{n}: {e}')
            continue
        with open(path) as file:
            j = json.load(file)
        term = Term(
            reference=Reference(PREFIX, j['identifier'], j['name']),
            definition=j['description'].strip().replace('\n', ' '),
        )
        yield term
Example #26
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example #27
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, URL)
    sqlite_path = prefix_directory_join(PREFIX, 'itisSqlite043020',
                                        'ITIS.sqlite')
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(get_prefix_directory(PREFIX))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX,
                                       identifier=str(identifier),
                                       name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example #28
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms in Rhea."""
    terms = {}

    directions = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv',
        version=version)
    for master, lr, rl, bi in directions.values:
        terms[master] = Term(reference=Reference(PREFIX, master))
        terms[lr] = Term(reference=Reference(PREFIX, lr))
        terms[rl] = Term(reference=Reference(PREFIX, rl))
        terms[bi] = Term(reference=Reference(PREFIX, bi))

        terms[master].append_relationship(has_lr, terms[lr])
        terms[master].append_relationship(has_rl, terms[rl])
        terms[master].append_relationship(has_bi, terms[bi])
        terms[lr].append_parent(terms[master])
        terms[rl].append_parent(terms[master])
        terms[bi].append_parent(terms[master])

    hierarchy = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv',
        version=version)
    for source, relation, target in hierarchy.values:
        if relation != 'is_a':
            raise ValueError(f'RHEA unrecognized relation: {relation}')
        terms[source].append_parent(terms[target])

    for xref_prefix, url in [
        ('ecocyc', 'rhea2ecocyc'),
        ('kegg.reaction', 'rhea2kegg_reaction'),
        ('reactome', 'rhea2reactome'),
        ('macie', 'rhea2macie'),
        ('metacyc', 'rhea2metacyc'),
    ]:
        xref_df = ensure_df(
            PREFIX,
            url=f'ftp://ftp.expasy.org/databases/rhea/tsv/{url}.tsv',
            version=version)
        for rhea_id, _, _, xref_id in xref_df.values:
            if rhea_id not in terms:
                logger.warning('[%s] could not find %s:%s for xref %s:%s',
                               PREFIX, PREFIX, rhea_id, xref_prefix, xref_id)
                continue
            terms[rhea_id].append_xref(Reference(xref_prefix, xref_id))

    # TODO are EC codes equivalent?
    # TODO uniprot enabled by (RO:0002333)
    # TODO names?

    url = 'ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz'
    graph = pystow.ensure_rdf('pyobo', 'raw', PREFIX, version, url=url)
    result = graph.query('''
    PREFIX rh:<http://rdf.rhea-db.org/>
    SELECT ?reaction ?reactionId ?reactionLabel WHERE {
      ?reaction rdfs:subClassOf rh:Reaction .
      ?reaction rh:id ?reactionId .
      ?reaction rdfs:label ?reactionLabel .
    }
    ''')
    for _, identifier, name in result:
        identifier = str(identifier)
        if identifier not in terms:
            logger.warning('isolated element in rdf: rhea:%s ! %s', identifier,
                           name)
            continue
        terms[identifier].reference.name = name

    # TODO participants?

    yield from terms.values()
Example #29
# -*- coding: utf-8 -*-
"""Converter for Rhea."""

import logging
from typing import Iterable

import bioversions
import pystow

from pyobo.struct import Obo, Reference, Term, TypeDef
from pyobo.utils.path import ensure_df

logger = logging.getLogger(__name__)
PREFIX = 'rhea'

has_lr = TypeDef(Reference(PREFIX, 'has_lr_reaction'))
has_rl = TypeDef(Reference(PREFIX, 'has_rl_reaction'))
has_bi = TypeDef(Reference(PREFIX, 'has_bi_reaction'))


def get_obo() -> Obo:
    """Get Rhea as OBO."""
    version = bioversions.get_version(PREFIX)
    return Obo(
        ontology=PREFIX,
        name='Rhea',
        iter_terms=iter_terms,
        iter_terms_kwargs=dict(version=version),
        data_version=version,
        auto_generated_by=f'bio2obo:{PREFIX}',
        typedefs=[has_lr, has_bi, has_rl],
    )
Example #30
def get_terms(version: str) -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df(version=version)
    df.rename(
        inplace=True,
        columns={
            "Aliases for complex": "aliases",
            "Identifiers (and stoichiometry) of molecules in complex": "members",
            "Taxonomy identifier": "taxonomy_id",
            "Cross references": "xrefs",
            "Description": "definition",
            "Recommended name": "name",
            "#Complex ac": "complexportal_id",
        },
    )

    df["aliases"] = df["aliases"].map(lambda s: s.split("|") if pd.notna(s) else [])
    df["members"] = df["members"].map(_parse_members)
    df["xrefs"] = df["xrefs"].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping("ncbitaxon")
    df["taxonomy_name"] = df["taxonomy_id"].map(taxonomy_id_to_name.get)

    slim_df = df[
        [
            "complexportal_id",
            "name",
            "definition",
            "aliases",
            "xrefs",
            "taxonomy_id",
            "taxonomy_name",
            "members",
        ]
    ]
    it = tqdm(slim_df.values, total=len(slim_df.index), desc=f"mapping {PREFIX}")
    unhandled_xref_type = set()
    for (
        complexportal_id,
        name,
        definition,
        aliases,
        xrefs,
        taxonomy_id,
        taxonomy_name,
        members,
    ) in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == "identity":
                _xrefs.append(reference)
            elif note == "see-also" and reference.prefix == "pubmed":
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term