Example 1
import os
import shutil
import sqlite3
import zipfile
from contextlib import closing
from typing import Iterable

# Term, Reference, multidict, ensure_path, prefix_directory_join,
# _get_version, and the constants (PREFIX, URL, LONGNAMES_QUERY,
# HIERARCHY_QUERY) come from the surrounding module.
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX,
                                        name='ITIS.sqlite',
                                        version=version)
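    # On first use, extract the bundled SQLite database from the downloaded zip.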
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}',
                                     'ITIS.sqlite'), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

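    # Build a Reference (identifier plus name) for every record returned by
    # the long-names query.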
    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX,
                                       identifier=str(identifier),
                                       name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

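    # Assemble one Term per taxon, skipping the placeholder parent '0'.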
    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example 2
import urllib.error
from typing import Optional

# ensure_path, prefix_directory_join, BASE, and KEGG_GENES_PREFIX come from
# the surrounding module.
def _ensure_conv_genome_helper(
    kegg_genome_id: str,
    target_database: str,
    *,
    version: str,
    error_on_missing: bool = False,
) -> Optional[str]:
    """Get the KEGG-external protein map for the given organism/database."""
    name = f"{kegg_genome_id}.tsv"
    try:
        rv = ensure_path(
            KEGG_GENES_PREFIX,
            f"conv_{target_database}",
            url=f"{BASE}/conv/{target_database}/{kegg_genome_id}",
            name=name,
            error_on_missing=error_on_missing,
            version=version,
        )
    except urllib.error.HTTPError:
        path_rv = prefix_directory_join(
            KEGG_GENES_PREFIX,
            f"conv_{target_database}",
            name=name,
            version=version,
        )
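        # The conversion table is unavailable upstream, so write a blank
        # placeholder file; later calls find it in the cache and skip the
        # failing download.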
        with path_rv.open("w") as file:
            print(file=file)  # noqa: T201
        return path_rv.as_posix()
    except FileNotFoundError:
        return None
    else:
        return rv
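
A minimal call sketch; the organism code "hsa" and target database "uniprot"
are illustrative values for KEGG's conv endpoint, and the version string is
whatever the caller pins:

path = _ensure_conv_genome_helper("hsa", "uniprot", version="106.0")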
Example 3
import os
import tarfile

# ensure_path, prefix_directory_join, and PREFIX come from the surrounding module.
def get_path(version: str) -> str:
    """Get the path to the extracted ChEMBL SQLite database."""
    url = f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}/chembl_{version}_sqlite.tar.gz'
    path = ensure_path(PREFIX, url=url, version=version)
    name = f'chembl_{version}/chembl_{version}_sqlite/chembl_{version}.db'
    d = prefix_directory_join(PREFIX, version=version)
    op = os.path.join(d, name)
    if not os.path.exists(op):
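        # Extract the tarball once; subsequent calls reuse the unpacked file.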
        with tarfile.open(path, mode='r', encoding='utf-8') as tar_file:
            tar_file.extractall(d)
    return op
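
A minimal usage sketch; the release number '31' is only illustrative, and
PREFIX comes from the surrounding module:

import sqlite3

conn = sqlite3.connect(get_path('31'))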
Example 4
# Imports and module-level helpers as in Example 1.
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
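
A minimal consumption sketch; the version string is an illustrative release
label:

for term in iter_terms(version="2023-11-30"):
    print(term.reference.identifier, term.reference.name)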
Example 5
import bioversions
import pandas as pd
from typing import Mapping

# get_name_id_mapping, ensure_df, cached_mapping, version_getter, and the
# constants (PREFIX, BASE_URL) come from the surrounding module.
def _lookup(name):
    return get_name_id_mapping('ncbitaxon')[name]


def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version('biogrid')
    url = f'{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip'
    df = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version)
    df['taxonomy_id'] = df['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return df


@cached_mapping(
    path=prefix_directory_join(PREFIX,
                               'cache',
                               'xrefs',
                               name='ncbigene.tsv',
                               version=version_getter(PREFIX)),
    header=['biogrid_id', 'ncbigene_id'],
)
def get_ncbigene_mapping() -> Mapping[str, str]:
    """Get BioGRID to NCBIGENE mapping.

    This is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene')
    """
    df = get_df()
Example 6
import datetime
import json
import logging
import typing
from collections import Counter, defaultdict
from operator import attrgetter
from typing import DefaultDict, Dict, Iterable, Optional

from tabulate import tabulate
from tqdm import tqdm

logger = logging.getLogger(__name__)

# Term, Reference, Synonym, the relation and synonym-type constants, and the
# module-level data (PREFIX, DEFINITIONS_URL_FMT, LOCUS_TYPE_TO_SO,
# gene_xrefs, ensure_path, prefix_directory_join) come from the surrounding
# module.
def get_terms(version: Optional[str] = None,
              force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Get HGNC terms."""
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    unhandled_entry_keys: typing.Counter[str] = Counter()
    unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

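    # Emit the SO locus-type terms first so that parent classes exist before
    # the gene terms that reference them.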
    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values()) if so_id
        },
        key=attrgetter("identifier"),
    )

    statuses = set()
    for entry in tqdm(entries,
                      desc=f"Mapping {PREFIX}",
                      unit="gene",
                      unit_scale=True):
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )
        status = entry.pop("status")
        if status == "Approved":
            is_obsolete = False
        elif status not in statuses:
            statuses.add(status)
            logger.warning("UNHANDLED %s", status)
            is_obsolete = True
        else:
            raise ValueError(
                f"Unhandled status for hgnc:{identifier}: {status}")

        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX,
                                identifier=identifier,
                                name=symbol),
            is_obsolete=is_obsolete,
        )

        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral",
                              identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto(
                    "mirbase",
                    mirbase_id,
                ),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to,
                Reference(prefix="snornabase", identifier=snornabase_id))

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(
                    f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(
                    f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(
                    Reference(prefix=xref_prefix,
                              identifier=str(xref_identifier)))

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(
                Reference(prefix="pubmed", identifier=str(pubmed_id)))

        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids,
                                                   gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(
                Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        for previous_symbol in entry.pop("previous_symbol", []):
            term.append_synonym(
                Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(
                Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandle_locus_types[locus_type][identifier] = term
            term.append_property("locus_type", locus_type)
            term.append_property("locus_group", locus_group)

        term.set_species(identifier="9606", name="Homo sapiens")

        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    with open(prefix_directory_join(PREFIX, name="unhandled.json"),
              "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name
                    for hgnc_id, term in v.items()}
                for k, v in unhandle_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandle_locus_types.items()):
            t = tabulate(
                [(
                    hgnc_id,
                    term.name,
                    term.is_obsolete,
                    term.link,
                    ", ".join(p.link for p in term.provenance if p.link),
                ) for hgnc_id, term in sorted(v.items())],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandle_locus_type_counter = Counter(
        {locus_type: len(d)
         for locus_type, d in unhandle_locus_types.items()})
    logger.warning("Unhandled locus types:\n%s",
                   tabulate(unhandle_locus_type_counter.most_common()))
    logger.warning("Unhandled keys:\n%s",
                   tabulate(unhandled_entry_keys.most_common()))
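
A minimal driver sketch; get_terms defaults the version to the first day of
the current month, so the explicit date below is only illustrative:

terms = list(get_terms(version="2023-09-01"))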