Example #1
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f"{PREFIX} download"):
        try:
            path = ensure_path(
                PREFIX,
                "phenotype",
                version=version,
                url=f"https://www.gwascentral.org/phenotype/HGVPM{n}?format=json",
                name=f"HGVPM{n}.json",
                force=force,
            )
        except OSError as e:
            tqdm.write(f"{n}: {e}")
            continue
        with open(path) as file:
            j = json.load(file)

        description = j.get("description")
        if description is not None:
            description = description.strip().replace("\n", " ")
        term = Term(
            reference=Reference(PREFIX, j["identifier"], j["name"]),
            definition=description,
        )
        yield term
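
A minimal driver sketch for the function above, assuming the same module-level PREFIX and pyobo-style helpers are in scope; the version string is a hypothetical placeholder, not a real GWAS Central release.

for term in iter_terms(version="jan2021"):  # hypothetical version string
    print(term)
    break  # one term is enough for a smoke test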
Example #2
def _get_version() -> str:
    """Get the version of the current data."""
    zip_path = ensure_path(PREFIX, url=URL)
    with zipfile.ZipFile(zip_path) as zip_file:
        for x in zip_file.filelist:
            if x.filename.endswith('.sqlite'):
                return x.filename[len('itisSqlite'):-len('/ITIS.sqlite')]
    raise ValueError('could not find a file with the version in it')
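
The version is recovered purely by slicing the archive member name, so the logic can be checked in isolation; a standalone sketch with a made-up filename:

# Made-up member name in the shape _get_version() expects.
filename = 'itisSqlite043021/ITIS.sqlite'
version = filename[len('itisSqlite'):-len('/ITIS.sqlite')]
assert version == '043021'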
Example #3
def get_path(version: str) -> str:
    """Get the path to the extracted ChEMBL SQLite database."""
    url = f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}/chembl_{version}_sqlite.tar.gz'
    path = ensure_path(PREFIX, url=url, version=version)
    name = f'chembl_{version}/chembl_{version}_sqlite/chembl_{version}.db'
    d = prefix_directory_join(PREFIX, version=version)
    op = os.path.join(d, name)
    if not os.path.exists(op):
        with tarfile.open(path, mode='r', encoding='utf-8') as tar_file:
            tar_file.extractall(d)
    return op
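
Once extracted, the returned path is an ordinary SQLite file, so it can be opened with the standard library; a hedged usage sketch (the release number is illustrative):

import sqlite3

db_path = get_path(version='31')  # illustrative release number
with sqlite3.connect(db_path) as conn:
    # Count the tables as a cheap sanity check on the extraction.
    (n_tables,) = conn.execute(
        "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table'"
    ).fetchone()
    print(f'found {n_tables} tables')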
Example #4
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over UniProt PTM Terms."""
    path = ensure_path(PREFIX, url=URL, force=force)
    with open(path) as file:
        lines = list(file)
    # Skip the fixed-size document header and footer, then split each line
    # into its two-letter line code and the remaining content.
    it: Iterable[Tuple[str, str]] = ((line[:2], line[2:].strip()) for line in lines[47:-5])
    # Records are separated by "//" lines, so group on that marker.
    for i, (_, term_lines) in enumerate(itt.groupby(it, key=lambda p: p[0] == "//")):
        term = _parse(i, term_lines)
        if term:
            yield term
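
The groupby call alternates between separator runs (key True) and record runs (key False), which is why _parse can return a falsy value for separator groups that then gets skipped. A self-contained sketch of that mechanism on toy rows:

import itertools as itt

# Toy rows in the shape the parser sees: (line code, content) pairs,
# with "//" rows acting as record separators.
rows = [
    ("ID", "Phosphoserine."),
    ("AC", "PTM-0000"),  # toy accession
    ("//", ""),
    ("ID", "N6-acetyllysine."),
    ("AC", "PTM-0001"),  # toy accession
    ("//", ""),
]
for is_separator, group in itt.groupby(rows, key=lambda p: p[0] == "//"):
    if not is_separator:
        print(list(group))  # one list per record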
Example #5
def iterate_terms(version: str) -> Iterable[Term]:
    """Iterate over GWAS Central Study terms."""
    path = ensure_path(PREFIX, url=URL, version=version)
    with tarfile.open(path) as tar_file:
        for tar_info in tar_file:
            if not tar_info.path.endswith(".xml"):
                continue
            with tar_file.extractfile(tar_info) as file:
                try:
                    tree = ElementTree.parse(file)
                except ElementTree.ParseError:
                    logger.warning("malformed XML in %s", tar_info.path)
                    continue
            yield _get_term_from_tree(tree)
Example #6
def iterate_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over GWAS Central Study terms."""
    url = f"http://www.gwascentral.org/docs/GC_{version}.tar.gz"
    path = ensure_path(PREFIX, url=url, version=version, force=force)
    with tarfile.open(path) as tar_file:
        for tar_info in tar_file:
            if not tar_info.path.endswith(".xml"):
                continue
            with tar_file.extractfile(tar_info) as file:  # type:ignore
                try:
                    tree = ElementTree.parse(file)
                except ElementTree.ParseError:
                    logger.warning("malformed XML in %s", tar_info.path)
                    continue
            yield _get_term_from_tree(tree)
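
The same tar-streaming pattern in a self-contained sketch: build a tiny in-memory archive, then parse its XML members directly from the tar stream, mirroring the extractfile / ElementTree.parse flow above.

import io
import tarfile
from xml.etree import ElementTree

# Write a one-member archive into memory.
buffer = io.BytesIO()
with tarfile.open(fileobj=buffer, mode="w:gz") as tar_file:
    payload = b"<study><name>demo</name></study>"
    tar_info = tarfile.TarInfo(name="study_1.xml")
    tar_info.size = len(payload)
    tar_file.addfile(tar_info, io.BytesIO(payload))

# Read it back the same way iterate_terms() does, without extracting to disk.
buffer.seek(0)
with tarfile.open(fileobj=buffer, mode="r:gz") as tar_file:
    for tar_info in tar_file:
        if not tar_info.name.endswith(".xml"):
            continue
        with tar_file.extractfile(tar_info) as file:
            tree = ElementTree.parse(file)
        print(tree.findtext("name"))  # -> demo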
Example #7
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX,
                                        name='ITIS.sqlite',
                                        version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}',
                                     'ITIS.sqlite'), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX,
                                       identifier=str(identifier),
                                       name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
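
multidict here is assumed to collect the (child, parent) hierarchy rows into a one-to-many mapping; a minimal stand-in with those assumed semantics:

from collections import defaultdict
from typing import Dict, Iterable, List, Tuple

def multidict(pairs: Iterable[Tuple[str, str]]) -> Dict[str, List[str]]:
    """Collect (key, value) pairs into a dict of lists (assumed semantics)."""
    rv = defaultdict(list)
    for key, value in pairs:
        rv[key].append(value)
    return dict(rv)

assert multidict([('2', '1'), ('3', '1'), ('4', '3')]) == {
    '2': ['1'], '3': ['1'], '4': ['3'],
}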
Example #8
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f'{PREFIX} download'):
        try:
            path = ensure_path(
                PREFIX,
                'phenotype',
                version=version,
                url=f'https://www.gwascentral.org/phenotype/HGVPM{n}?format=json',
                name=f'HGVPM{n}.json',
            )
        except OSError as e:
            tqdm.write(f'{n}: {e}')
            continue
        with open(path) as file:
            j = json.load(file)
        description = j.get('description')
        term = Term(
            reference=Reference(PREFIX, j['identifier'], j['name']),
            definition=description.strip().replace('\n', ' ') if description else None,
        )
        yield term
Example #9
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example #10
def iter_terms(version: str, synonym_abb, autodownload: bool = False) -> Iterable[Term]:
    """Iterate over UMLS terms."""
    name = f"umls-{version}-mrconso.zip"
    url = f"https://download.nlm.nih.gov/umls/kss/{version}/{name}"
    if autodownload:
        # FIXME needs automated scrapy step where you put in user/password
        path = ensure_path(PREFIX, url=url, version=version)
    else:
        path = RAW_MODULE.get(PREFIX, version, name=name)
        if not path.exists():
            raise FileNotFoundError(
                f"UMLS must be downloaded manually and moved to {path}. "
                f"See https://www.nlm.nih.gov/research/umls/index.html"
            )

    with zipfile.ZipFile(path) as zip_file:
        with zip_file.open("MRCONSO.RRF", mode="r") as file:
            it = tqdm(file, unit_scale=True, desc="[umls] parsing")
            lines = (line.decode("utf-8").strip().split("|") for line in it)
            for cui, cui_lines in itt.groupby(lines,
                                              key=operator.itemgetter(0)):
                df = pd.DataFrame(list(cui_lines), columns=RRF_COLUMNS)
                df = df[df["LAT - Language"] == "ENG"]
                # Keep only the single preferred name row for this concept.
                idx = (
                    (df["ISPREF - is preferred"] == "Y")
                    & (df["TS - Term Status"] == "P")
                    & (df["STT - String Type"] == "PF")
                )
                pref_rows_df = df.loc[idx]
                if len(pref_rows_df.index) != 1:
                    it.write(
                        f"no preferred term for umls:{cui}. got {len(pref_rows_df.index)}"
                    )
                    continue

                df["TTY - Term Type in Source"] = df[
                    "TTY - Term Type in Source"].map(synonym_abb.__getitem__)

                _r = pref_rows_df.iloc[0]
                sdf = df[[
                    "SAB - source name", "CODE", "TTY - Term Type in Source",
                    "STR"
                ]]

                synonyms = []
                xrefs = []
                for source, identifier, synonym_type, synonym in sdf.values:
                    norm_source = normalize_prefix(source)
                    if norm_source is None or not identifier:
                        provenance = []
                    else:
                        ref = Reference(prefix=norm_source,
                                        identifier=identifier)
                        provenance = [ref]
                        xrefs.append(ref)
                    synonyms.append(
                        Synonym(
                            name=synonym,
                            provenance=provenance,
                            type=SynonymTypeDef.from_text(synonym_type),
                        ))

                xrefs = sorted(set(xrefs),
                               key=lambda reference:
                               (reference.prefix, reference.identifier))

                term = Term(
                    reference=Reference(prefix=PREFIX,
                                        identifier=cui,
                                        name=_r["STR"]),
                    synonyms=synonyms,
                    xrefs=xrefs,
                )
                yield term
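
The MRCONSO pass relies on rows for a single CUI being contiguous in the file, so itt.groupby with operator.itemgetter(0) yields one block per concept. A toy sketch of that grouping on pipe-delimited lines (the rows are illustrative, not real MRCONSO records):

import itertools as itt
import operator

raw = [
    "C0000001|ENG|...|aspirin",
    "C0000001|ENG|...|acetylsalicylic acid",
    "C0000002|ENG|...|caffeine",
]
rows = (line.split("|") for line in raw)
for cui, cui_rows in itt.groupby(rows, key=operator.itemgetter(0)):
    print(cui, "->", len(list(cui_rows)))  # one line per CUI with its row count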
Example #11
def get_terms(version: Optional[str] = None,
              force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Get HGNC terms."""
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    unhandled_entry_keys: typing.Counter[str] = Counter()
    unhandled_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values()) if so_id
        },
        key=attrgetter("identifier"),
    )

    statuses = set()
    for entry in tqdm(entries,
                      desc=f"Mapping {PREFIX}",
                      unit="gene",
                      unit_scale=True):
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )
        status = entry.pop("status")
        if status == "Approved":
            is_obsolete = False
        else:
            if status not in statuses:
                # Warn only the first time each unhandled status appears.
                statuses.add(status)
                logger.warning("UNHANDLED status: %s", status)
            is_obsolete = True

        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX,
                                identifier=identifier,
                                name=symbol),
            is_obsolete=is_obsolete,
        )

        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral",
                              identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto(
                    "mirbase",
                    mirbase_id,
                ),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to,
                Reference(prefix="snornabase", identifier=snornabase_id))

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(
                    f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(
                    f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(
                    Reference(prefix=xref_prefix,
                              identifier=str(xref_identifier)))

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(
                Reference(prefix="pubmed", identifier=str(pubmed_id)))

        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids,
                                                   gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(
                Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        for previous_symbol in entry.pop("previous_symbol", []):
            term.append_synonym(
                Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(
                Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandled_locus_types[locus_type][identifier] = term
            term.append_property("locus_type", locus_type)
            term.append_property("locus_group", locus_group)

        term.set_species(identifier="9606", name="Homo sapiens")

        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    with open(prefix_directory_join(PREFIX, name="unhandled.json"),
              "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name
                    for hgnc_id, term in v.items()}
                for k, v in unhandled_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandled_locus_types.items()):
            t = tabulate(
                [(
                    hgnc_id,
                    term.name,
                    term.is_obsolete,
                    term.link,
                    ", ".join(p.link for p in term.provenance if p.link),
                ) for hgnc_id, term in sorted(v.items())],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandled_locus_type_counter = Counter(
        {locus_type: len(d) for locus_type, d in unhandled_locus_types.items()}
    )
    logger.warning("Unhandled locus types:\n%s",
                   tabulate(unhandled_locus_type_counter.most_common()))
    logger.warning("Unhandled keys:\n%s",
                   tabulate(unhandled_entry_keys.most_common()))