Example #1
def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
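    """Get FamPlex equivalences as a mapping from FamPlex identifier to cross-references."""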
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
    xrefs_url = f"{base_url}/equivalences.csv"
    xrefs_df = ensure_df(PREFIX,
                         url=xrefs_url,
                         version=version,
                         header=None,
                         sep=",",
                         dtype=str)

    # Normalize nextprot families
    ns_remapping = {
        "NXP": "nextprot.family",
    }
    xrefs_df[0] = xrefs_df[0].map(lambda s: ns_remapping.get(s, s))
    xrefs_df[1] = [
        xref_identifier
        if xref_prefix != "nextprot.family" else xref_identifier[len("FA:"):]
        for xref_prefix, xref_identifier in xrefs_df[[0, 1]].values
    ]

    xrefs_df[0] = xrefs_df[0].map(normalize_prefix)
    xrefs_df = xrefs_df[xrefs_df[0].notna()]
    xrefs_df = xrefs_df[xrefs_df[0] != "bel"]
    return multidict(
        (identifier, Reference(xref_prefix, xref_identifier))
        for xref_prefix, xref_identifier, identifier in xrefs_df.values)
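Usage sketch (not from the source): the release tag is hypothetical, and iterating the result assumes pyobo Reference objects expose a curie attribute.

xrefs = _get_xref_df("1.0.0")  # hypothetical FamPlex release tag
for famplex_id, references in xrefs.items():
    print(famplex_id, [reference.curie for reference in references])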
Example #2
def get_grounder(
    prefix: Union[str, Iterable[str]],
    unnamed: Optional[Iterable[str]] = None,
    grounder_cls: Optional[Type[Grounder]] = None,
) -> Grounder:
    """Get a Gilda grounder for the given namespace."""
    unnamed = set() if unnamed is None else set(unnamed)
    if isinstance(prefix, str):
        prefix = [prefix]

    terms: List[gilda.term.Term] = []
    for p in prefix:
        try:
            p_terms = list(
                get_gilda_terms(p, identifiers_are_names=p in unnamed))
        except NoBuild:
            continue
        else:
            terms.extend(p_terms)
    terms = filter_out_duplicates(terms)
    terms_dict = multidict((term.norm_text, term) for term in terms)
    if grounder_cls is None:
        return Grounder(terms_dict)
    else:
        return grounder_cls(terms_dict)
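Usage sketch (not from the source): the prefixes are illustrative, and reading the matches assumes gilda's ScoredMatch objects expose term and score.

grounder = get_grounder(["hgnc", "mgi"])  # illustrative prefixes
for match in grounder.ground("TP53"):
    print(match.term.db, match.term.id, match.score)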
Example #3
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX,
                                        name='ITIS.sqlite',
                                        version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}',
                                     'ITIS.sqlite'), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX,
                                       identifier=str(identifier),
                                       name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # a parent TSN of 0 marks a root (kingdom)
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example #4
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
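This revision of Example #3 locates the SQLite file by its archive path instead of a hard-coded itisSqlite{version} directory. Usage sketch (not from the source); the version string is hypothetical:

for term in iter_terms(version="043021"):  # hypothetical ITIS release
    print(term.curie, term.name)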
Example #5
def get_terms(force: bool = False,
              version: Optional[str] = None) -> Iterable[Term]:
    """Get terms."""
    alt_ids_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    primary_to_alt_ids = defaultdict(set)
    for alt_id, zfin_id in alt_ids_df.values:
        primary_to_alt_ids[zfin_id].add(alt_id)

    human_orthologs = multisetdict(
        ensure_df(PREFIX,
                  url=HUMAN_ORTHOLOGS,
                  force=force,
                  header=None,
                  usecols=[0, 7],
                  version=version).values)
    mouse_orthologs = multisetdict(
        ensure_df(PREFIX,
                  url=MOUSE_ORTHOLOGS,
                  force=force,
                  header=None,
                  usecols=[0, 5],
                  version=version).values)
    fly_orthologs = multisetdict(
        ensure_df(PREFIX,
                  url=FLY_ORTHOLOGS,
                  force=force,
                  header=None,
                  usecols=[0, 5],
                  version=version).values)
    entrez_mappings = dict(
        ensure_df(PREFIX,
                  url=ENTREZ_MAPPINGS,
                  force=force,
                  header=None,
                  usecols=[0, 3],
                  version=version).values)
    uniprot_mappings = multidict(
        ensure_df(PREFIX,
                  url=UNIPROT_MAPPINGS,
                  force=force,
                  header=None,
                  usecols=[0, 3],
                  version=version).values)

    df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    df["sequence_ontology_id"] = df["sequence_ontology_id"].map(
        lambda x: x[len("SO:"):])
    so = {
        sequence_ontology_id: Reference.auto(prefix="SO",
                                             identifier=sequence_ontology_id)
        for sequence_ontology_id in df["sequence_ontology_id"].unique()
    }
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)
    for identifier, name, definition, _entity_type, sequence_ontology_id in tqdm(
            df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            definition=definition if definition != name else None,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so[sequence_ontology_id])
        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for alt_id in primary_to_alt_ids[identifier]:
            term.append_alt(alt_id)
        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(has_gene_product,
                                     Reference.auto("uniprot", uniprot_id))
        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(orthologous,
                                     Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mouse_orthologs.get(identifier, []):
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if mouse_ortholog:
                term.append_relationship(orthologous, mouse_ortholog)
        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(orthologous,
                                     Reference("flybase", flybase_id))

        yield term
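Usage sketch (not from the source): calling with the defaults assumes PyOBO can resolve the latest ZFIN version when version is None.

terms = list(get_terms())
print(f"{len(terms)} ZFIN terms")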
Example #6
def get_grounder(prefix, url: Optional[str] = None) -> Grounder:
    """Get a Gilda grounder for the given namespace."""
    terms = list(get_gilda_terms(prefix, url=url))
    terms = filter_out_duplicates(terms)
    terms_dict = multidict((term.norm_text, term) for term in terms)
    return Grounder(terms_dict)
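Compared with Example #2 above, this earlier variant grounds a single prefix with an optional url override and omits the multi-prefix input, the unnamed-namespace handling, and the injectable grounder class.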