Esempio n. 1
0
def get_terms(force: bool = False) -> Iterable[Term]:
    """Yield one term per row of the gene information table.

    Each term gets the gene name, synonyms taken from the "Gene products" and
    "Synonyms" columns, a ``has_gene_product`` relationship for every usable
    UniProt identifier mapped to the gene, and the species
    *Dictyostelium discoideum* (NCBI taxonomy 44689).

    :param force: Forwarded unchanged to :func:`ensure_df` for both tables.
    :yields: Terms in the row order of the gene information table.
    """
    # Placeholder values that appear in the UniProt column instead of a real
    # accession; they must never be turned into UniProt references.
    uniprot_placeholders = {"unknown", "pseudogene"}

    # DDB ID	DDB_G ID	Name	UniProt ID
    # NOTE(review): this table is requested with the same ``URL`` as the gene
    # information table below, only under a different cached file name --
    # confirm that this is intended.
    uniprot_mappings = multisetdict(
        ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
    )

    terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
    # GENE ID (DDB_G ID)	Gene Name	Synonyms	Gene products
    for identifier, name, synonyms, products in tqdm(terms.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
        )
        # Gene products are recorded as plain-string synonyms.
        if products and pd.notna(products) and products != "unknown":
            for synonym in products.split(","):
                term.append_synonym(synonym.strip())
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym.strip()))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            # Skip empty, missing and placeholder values. The previous check
            # used ``not in`` here, which discarded every real accession and
            # kept only the placeholders.
            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in uniprot_placeholders:
                continue
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))

        term.set_species(identifier="44689", name="Dictyostelium discoideum")
        yield term
Esempio n. 2
0
def _iter_map_terms(version: str) -> Iterable[Term]:
    """Yield a KEGG pathway term for every entry of ``ensure_list_pathways``."""
    pathways = ensure_list_pathways(version=version)
    for map_id, map_name in pathways.items():
        map_term = Term.from_triple(
            prefix=KEGG_PATHWAY_PREFIX, identifier=map_id, name=map_name
        )
        yield map_term
Esempio n. 3
0
def iter_terms(version: str) -> Iterable[Term]:
    """Yield a term for every KEGG genome that is not listed in ``SKIP``.

    When a genome carries a taxonomy identifier, an ``ncbitaxon``
    cross-reference is attached; its name is looked up through
    ``pyobo.get_name`` and may be ``None``. Failed lookups are counted and
    the total is logged once the genomes are exhausted.
    """
    missing_names = 0
    for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
        if kegg_genome.identifier in SKIP:
            continue

        term = Term.from_triple(
            prefix=KEGG_GENOME_PREFIX,
            identifier=kegg_genome.identifier,
            name=kegg_genome.name,
        )

        taxonomy_id = kegg_genome.taxonomy_id
        if taxonomy_id is None:
            # Nothing to cross-reference for this genome.
            yield term
            continue

        taxonomy_name = pyobo.get_name("ncbitaxon", taxonomy_id)
        if taxonomy_name is None:
            missing_names += 1
            logger.debug(
                f"[{KEGG_GENOME_PREFIX}] could not find name for taxonomy:{kegg_genome.taxonomy_id}"
            )
        # The cross-reference is added even when no name could be resolved.
        taxonomy_xref = Reference(
            prefix="ncbitaxon",
            identifier=taxonomy_id,
            name=taxonomy_name,
        )
        term.append_xref(taxonomy_xref)
        yield term

    logger.info("[%s] unable to find %d taxonomy names in NCBI",
                KEGG_GENOME_PREFIX, missing_names)
Esempio n. 4
0
def iter_terms(version: str) -> Iterable[Term]:
    """Yield a term for each (UniProt ID, name, taxonomy ID) row of the reader."""
    source_path = ensure(version)
    with open_reader(source_path) as rows:
        for row in rows:
            uniprot_id, name, taxonomy_id = row
            protein = Term.from_triple(
                prefix=PREFIX, identifier=uniprot_id, name=name
            )
            protein.set_species(taxonomy_id)
            yield protein
Esempio n. 5
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology parent terms followed by one term per gene row.

    Gene types without an entry in ``GTYPE_TO_SO`` are reported with a
    warning and get no parent. Species abbreviations that cannot be mapped to
    a taxonomy identifier are reported once each, and summarised at the end.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)
    missing_taxonomies = set()

    # Resolve every distinct gene type (second column) to an SO reference.
    gtype_column = names_df.columns[1]
    so_by_gtype = {}
    for gene_type in names_df[gtype_column].unique():
        so_id = GTYPE_TO_SO.get(gene_type)
        if so_id is not None:
            so_by_gtype[gene_type] = Reference.auto("SO", so_id)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
            gene_type)

    # Emit the SO terms first so they precede the genes that point at them.
    for _gene_type, so_reference in sorted(so_by_gtype.items()):
        yield Term(reference=so_reference)

    for row in tqdm(names_df.values):
        organism, gtype, identifier, symbol, name = row
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=definitions.get(identifier),
        )
        if gtype and pd.notna(gtype):
            if gtype in so_by_gtype:
                term.append_parent(so_by_gtype[gtype])
        if pd.notna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if hgnc_curie and not pd.isna(hgnc_curie):
                hgnc_ortholog = Reference.from_curie(hgnc_curie, auto=True)
                if hgnc_ortholog is not None:
                    term.append_relationship(orthologous, hgnc_ortholog)
                else:
                    tqdm.write(
                        f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            # Report each unmapped abbreviation only the first time it is seen.
            if organism not in missing_taxonomies:
                tqdm.write(f"missing mapping for species abbreviation: {organism}")
                missing_taxonomies.add(organism)
        else:
            species = Reference(NCBITAXON_PREFIX, taxonomy_id)
            term.append_relationship(from_species, species)
        yield term

    if missing_taxonomies:
        tqdm.write(
            f"there were {len(missing_taxonomies)} missing taxa in flybase genes"
        )
Esempio n. 6
0
def _iter_genome_terms(
    *,
    list_pathway_path: str,
    link_pathway_path: str,
    kegg_genome: KEGGGenome,
) -> Iterable[Term]:
    """Yield the pathway terms of one KEGG genome.

    The pathway list file is read as tab-separated ``identifier<TAB>name``
    lines. Each pathway becomes a term that is annotated by
    ``kegg_genome.annotate_term`` and linked through ``species_specific`` to
    the corresponding ``map`` pathway. Afterwards every pathway/protein pair
    from ``_get_link_pathway_map`` adds a ``has_part`` relationship.

    :param list_pathway_path: Path to the pathway list file.
    :param link_pathway_path: Path handed to ``_get_link_pathway_map``.
    :param kegg_genome: Genome whose pathways are being processed.
    :yields: The pathway terms, in the order they appear in the list file.
    :raises ValueError: If a non-blank line does not contain exactly two
        tab-separated fields, or a pathway identifier contains no digit.
    """
    path_prefix = "path:"
    terms = {}
    with open(list_pathway_path) as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no pathway and would break the unpacking.
                continue
            pathway_id, name = [part.strip() for part in line.split("\t")]
            # Strip the prefix only when it is there; slicing unconditionally
            # would chop the start off identifiers that come without it.
            if pathway_id.startswith(path_prefix):
                pathway_id = pathway_id[len(path_prefix):]

            terms[pathway_id] = term = Term.from_triple(
                prefix=KEGG_PATHWAY_PREFIX,
                identifier=pathway_id,
                name=name,
            )

            # Annotate species information
            kegg_genome.annotate_term(term)

            # Annotate the non-species specific code: everything from the
            # first numeric character onwards.
            _start = min(i for i, e in enumerate(pathway_id) if e.isnumeric())
            pathway_code = pathway_id[_start:]
            term.append_relationship(
                species_specific,
                Reference(prefix=KEGG_PATHWAY_PREFIX,
                          identifier=f"map{pathway_code}"),
            )

    for pathway_id, protein_ids in _get_link_pathway_map(
            link_pathway_path).items():
        term = terms.get(pathway_id)
        if term is None:
            tqdm.write(
                f"could not find kegg.pathway:{pathway_id} for {kegg_genome.name}"
            )
            continue
        for protein_id in protein_ids:
            term.append_relationship(
                has_part,
                Reference(
                    prefix=KEGG_GENES_PREFIX,
                    identifier=protein_id,
                ),
            )

    yield from terms.values()
Esempio n. 7
0
def iter_terms(version: Optional[str] = None,
               force: bool = False) -> Iterable[Term]:
    """Yield a term with a ``from_species`` relationship for every data row."""
    source_path = ensure(version=version, force=force)
    with open_reader(source_path) as reader:
        next(reader)  # discard the header row
        progress = tqdm(reader, desc="Mapping UniProt")
        for uniprot_id, name, taxonomy_id in progress:
            protein = Term.from_triple(
                prefix=PREFIX,
                identifier=uniprot_id,
                name=name,
            )
            # TODO add gene encodes from relationship
            # TODO add description
            species = Reference(prefix=NCBITAXON_PREFIX, identifier=taxonomy_id)
            protein.append_relationship(from_species, species)
            yield protein
Esempio n. 8
0
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield a term for every structure row that has SMILES, InChI and InChIKey.

    Rows missing any of the three are logged with a warning and skipped. A
    CAS cross-reference is added only when the CAS column is present.
    """
    url = f"https://unmtid-shinyapps.net/download/DrugCentral/{version}/structures.smiles.tsv"
    df = ensure_df(PREFIX, url=url, version=version, force=force)
    for row in df.values:
        smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas = row
        if any(pd.isna(value) for value in (smiles, inchi, inchi_key)):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue

        drug = Term.from_triple(
            prefix=PREFIX,
            identifier=drugcentral_id,
            name=drugcentral_name,
        )
        drug.append_xref(Reference(prefix="inchikey", identifier=inchi_key))
        drug.append_property("smiles", smiles)
        drug.append_property("inchi", inchi)
        if pd.notna(cas):
            cas_xref = Reference(prefix="cas", identifier=cas)
            drug.append_xref(cas_xref)
        yield drug
Esempio n. 9
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology parent terms followed by one term per gene row.

    Human orthologs are read as ``|``-separated HGNC symbols and resolved to
    HGNC identifiers through the name-to-identifier mapping from ``pyobo``;
    symbols that cannot be resolved are dropped silently.
    """
    orthologs_df = ensure_df(
        PREFIX, url=ORTHOLOGS_URL, force=force, header=None, version=version
    )
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for gene_id, hgnc_symbols in orthologs_df.values:
        # The literal "NONE" marks a gene without orthologs.
        if hgnc_symbols != "NONE":
            for hgnc_symbol in hgnc_symbols.split("|"):
                resolved = hgnc_symbol_to_id.get(hgnc_symbol)
                if resolved is not None:
                    identifier_to_hgnc_ids[gene_id].add(resolved)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)

    # Map each gene type (seventh column) to its SO reference.
    gtype_column = df.columns[6]
    so = {}
    for gene_type in sorted(df[gtype_column].unique()):
        so[gene_type] = Reference.auto("SO", POMBASE_TO_SO[gene_type])
    for _gene_type, so_reference in sorted(so.items()):
        yield Term(reference=so_reference)

    for row in tqdm(df.values):
        identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms = row
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=name if pd.notna(name) else None,
        )
        # Drop the leading "chromosome_" from the chromosome label.
        chromosome_label = chromosome[len("chromosome_"):]
        term.append_property("chromosome", chromosome_label)
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")

        for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
            human_gene = Reference.auto("hgnc", hgnc_id)
            term.append_relationship(orthologous, human_gene)

        if uniprot_id and pd.notna(uniprot_id):
            protein = Reference.auto("uniprot", uniprot_id)
            term.append_relationship(has_gene_product, protein)

        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))
        yield term
Esempio n. 10
0
def iter_terms(version: str) -> Iterable[Term]:
    """Yield a term for every row returned by ``QUERY``.

    SMILES and InChI are stored as properties and the InChIKey as a
    cross-reference; each is attached only when its value is truthy.
    """
    with chembl_downloader.connect(version=version) as conn:
        logger.info("using connection %s", conn)
        with closing(conn.cursor()) as cursor:
            logger.info("using cursor %s", cursor)
            cursor.execute(QUERY)
            rows = cursor.fetchall()
            for row in rows:
                chembl_id, name, smiles, inchi, inchi_key = row
                # TODO add xrefs?
                compound = Term.from_triple(
                    prefix=PREFIX, identifier=chembl_id, name=name
                )
                for key, value in (("smiles", smiles), ("inchi", inchi)):
                    if value:
                        compound.append_property(key, value)
                if inchi_key:
                    compound.append_xref(Reference("inchikey", inchi_key))
                yield compound
Esempio n. 11
0
def get_terms(force: bool = False) -> Iterable[Term]:
    """Yield a *Gallus gallus* term for every row with an int-like CGNC ID.

    Rows whose CGNC identifier is missing or cannot be passed to ``int`` are
    logged with a warning and skipped. The last two columns are ignored.
    """
    df = ensure_df(
        PREFIX,
        url=URL,
        name=f"{PREFIX}.tsv",
        force=force,
        header=0,
        names=HEADER,
    )
    for i, row in enumerate(df.values):
        cgnc_id, entrez_id, ensembl_id, name, synonym_1, synonym_2, _, _ = row
        if pd.isna(cgnc_id):
            logger.warning(f"row {i} CGNC ID is none")
            continue

        # Only the convertibility is checked; the original value is kept.
        try:
            int(cgnc_id)
        except ValueError:
            logger.warning(f"row {i} CGNC ID is not int-like: {cgnc_id}")
            continue

        gene = Term.from_triple(prefix=PREFIX, identifier=cgnc_id, name=name)
        gene.set_species(identifier="9031", name="Gallus gallus")
        if entrez_id and pd.notna(entrez_id):
            entrez_xref = Reference(prefix="ncbigene", identifier=entrez_id)
            gene.append_xref(entrez_xref)
        if pd.notna(ensembl_id):
            ensembl_xref = Reference(prefix="ensembl", identifier=ensembl_id)
            gene.append_xref(ensembl_xref)
        for synonym in (synonym_1, synonym_2):
            if synonym and pd.notna(synonym):
                gene.append_synonym(synonym)
        yield gene
Esempio n. 12
0
def get_terms(force: bool = False,
              version: Optional[str] = None) -> Iterable[Term]:
    """Yield Sequence Ontology parent terms followed by one term per marker row.

    Each marker term is given the species *Danio rerio*, its SO parent, any
    alternative identifiers, an ``ncbigene`` cross-reference, UniProt gene
    products, and human, mouse and fly orthologs where the lookup tables
    contain them.
    """

    def _header_less_columns(url, usecols):
        # Load the selected columns of a header-less table as an array of rows.
        return ensure_df(
            PREFIX,
            url=url,
            force=force,
            header=None,
            usecols=usecols,
            version=version,
        ).values

    alt_ids_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    primary_to_alt_ids = defaultdict(set)
    for alt_id, zfin_id in alt_ids_df.values:
        primary_to_alt_ids[zfin_id].add(alt_id)

    human_orthologs = multisetdict(_header_less_columns(HUMAN_ORTHOLOGS, [0, 7]))
    mouse_orthologs = multisetdict(_header_less_columns(MOUSE_ORTHOLOGS, [0, 5]))
    fly_orthologs = multisetdict(_header_less_columns(FLY_ORTHOLOGS, [0, 5]))
    entrez_mappings = dict(_header_less_columns(ENTREZ_MAPPINGS, [0, 3]))
    uniprot_mappings = multidict(_header_less_columns(UNIPROT_MAPPINGS, [0, 3]))

    df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # Drop the leading "SO:" so that only the local identifier remains.
    so_prefix_length = len("SO:")
    df["sequence_ontology_id"] = df["sequence_ontology_id"].map(
        lambda curie: curie[so_prefix_length:])

    so = {}
    for so_id in df["sequence_ontology_id"].unique():
        so[so_id] = Reference.auto(prefix="SO", identifier=so_id)
    for _so_id, so_reference in sorted(so.items()):
        yield Term(reference=so_reference)

    for row in tqdm(df.values):
        identifier, name, definition, _entity_type, sequence_ontology_id = row
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            # A definition that merely repeats the name is left out.
            definition=definition if definition != name else None,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so[sequence_ontology_id])
        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for alt_id in primary_to_alt_ids[identifier]:
            term.append_alt(alt_id)

        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))

        for uniprot_id in uniprot_mappings.get(identifier, []):
            protein = Reference.auto("uniprot", uniprot_id)
            term.append_relationship(has_gene_product, protein)

        for hgnc_id in human_orthologs.get(identifier, []):
            human_gene = Reference.auto("hgnc", hgnc_id)
            term.append_relationship(orthologous, human_gene)

        for mgi_curie in mouse_orthologs.get(identifier, []):
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if mouse_ortholog:
                term.append_relationship(orthologous, mouse_ortholog)

        for flybase_id in fly_orthologs.get(identifier, []):
            fly_gene = Reference("flybase", flybase_id)
            term.append_relationship(orthologous, fly_gene)

        yield term