コード例 #1
0
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from an XML tree.

    The ``name``, ``description`` and ``identifier`` texts are looked up with
    ``_find_text``; each ``experiments`` element that has an identifier is
    attached to the term through a ``has_part`` relationship.

    :param tree: The XML tree to read from.
    :returns: The term built from the tree.
    :raises ValueError: If ``_find_text`` gives ``None`` for the identifier.
    """
    name = _find_text(tree, "name")

    definition = _find_text(tree, "description")
    if definition:
        # Collapse the description onto a single line.
        definition = definition.strip().replace("\n", " ")

    identifier = _find_text(tree, "identifier")
    if identifier is None:
        raise ValueError

    term = Term(reference=Reference(PREFIX, identifier, name), definition=definition)

    for element in tree.findall("experiments"):
        part_name = _find_text(element, "name")
        part_identifier = _find_text(element, "identifier")
        if part_identifier is not None:
            part = Reference(
                "gwascentral.experiment",
                identifier=part_identifier,
                name=part_name,
            )
            term.append_relationship(has_part, part)

    return term
コード例 #2
0
ファイル: complexportal.py プロジェクト: ddomingof/pyobo
def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms.

    :yields: One term per row of the data frame returned by ``get_df``. Each
        term carries its aliases as synonyms, its ``identity`` cross-references
        as xrefs, its PubMed ``see-also`` cross-references as provenance, its
        species, and a ``has_part`` relationship for every member.
    """
    df = get_df()

    # Aliases are pipe-delimited; a missing cell becomes an empty list.
    df['aliases'] = df['aliases'].map(lambda s: s.split('|') if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values,
              total=len(slim_df.index),
              desc=f'mapping {PREFIX}')
    # Remember which (note, prefix) combinations were already reported so each
    # one is only logged once.
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug('unhandled xref type: %s / %s', note, reference.prefix)
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX,
                                identifier=complexportal_id,
                                name=name),
            # A missing definition is NaN (a float), which has no ``strip``.
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        # The second item of each member pair is not used here.
        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
コード例 #3
0
ファイル: gwascentral_study.py プロジェクト: shunsunsun/pyobo
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from an XML tree.

    Child elements are read with ``findtext`` so that an absent ``name`` or
    ``description`` element results in ``None`` instead of an
    ``AttributeError`` on ``None.text``.

    :param tree: The XML tree to read the ``name``, ``description``,
        ``identifier`` and ``experiments`` elements from.
    :returns: The term, with a ``has_part`` relationship for each experiment
        that has an identifier.
    :raises ValueError: If the tree has no ``identifier`` text.
    """
    # ``findtext`` gives None for a missing element and '' for an empty one;
    # ``or None`` folds both into None, which is what ``.text`` gave for an
    # empty element.
    name = tree.findtext('name') or None
    description = tree.findtext('description') or None
    if description:
        # Collapse the description onto a single line.
        description = description.strip().replace('\n', ' ')
    identifier = tree.findtext('identifier')
    if not identifier:
        raise ValueError('XML tree is missing the text of its "identifier" element')
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall('experiments'):
        experiment_id = experiment.findtext('identifier')
        if not experiment_id:
            # Without an identifier no usable reference can be built.
            continue
        term.append_relationship(
            has_part,
            Reference(
                'gwascentral.experiment',
                identifier=experiment_id,
                name=experiment.findtext('name') or None,
            ))
    return term
コード例 #4
0
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from an XML tree.

    Child elements are read with ``findtext`` so that an absent ``name`` or
    ``description`` element results in ``None`` instead of an
    ``AttributeError`` on ``None.text``.

    :param tree: The XML tree to read the ``name``, ``description``,
        ``identifier`` and ``experiments`` elements from.
    :returns: The term, with a ``has_part`` relationship for each experiment
        that has an identifier.
    :raises ValueError: If the tree has no ``identifier`` text.
    """
    # ``findtext`` gives None for a missing element and "" for an empty one;
    # ``or None`` folds both into None, which is what ``.text`` gave for an
    # empty element.
    name = tree.findtext("name") or None
    description = tree.findtext("description") or None
    if description:
        # Collapse the description onto a single line.
        description = description.strip().replace("\n", " ")
    identifier = tree.findtext("identifier")
    if not identifier:
        raise ValueError("XML tree is missing the text of its 'identifier' element")
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall("experiments"):
        experiment_id = experiment.findtext("identifier")
        if not experiment_id:
            # Without an identifier no usable reference can be built.
            continue
        term.append_relationship(
            has_part,
            Reference(
                "gwascentral.experiment",
                identifier=experiment_id,
                name=experiment.findtext("name") or None,
            ),
        )
    return term
コード例 #5
0
ファイル: complexportal.py プロジェクト: rpatil524/pyobo
def get_terms(version: str) -> Iterable[Term]:
    """Get ComplexPortal terms.

    :param version: The version passed on to ``get_df``.
    :yields: One term per row of the data frame, carrying its aliases as
        synonyms, its ``identity`` cross-references as xrefs, its PubMed
        ``see-also`` cross-references as provenance, its species, and a
        ``has_part`` relationship for every member.
    """
    # Source column header -> short column name used below.
    column_renames = {
        "Aliases for complex": "aliases",
        "Identifiers (and stoichiometry) of molecules in complex": "members",
        "Taxonomy identifier": "taxonomy_id",
        "Cross references": "xrefs",
        "Description": "definition",
        "Recommended name": "name",
        "#Complex ac": "complexportal_id",
    }
    columns = [
        "complexportal_id",
        "name",
        "definition",
        "aliases",
        "xrefs",
        "taxonomy_id",
        "taxonomy_name",
        "members",
    ]

    def _split_aliases(cell):
        """Split a pipe-delimited cell; a missing cell gives an empty list."""
        if pd.notna(cell):
            return cell.split("|")
        return []

    df = get_df(version=version)
    df.rename(columns=column_renames, inplace=True)

    df["aliases"] = df["aliases"].map(_split_aliases)
    df["members"] = df["members"].map(_parse_members)
    df["xrefs"] = df["xrefs"].map(_parse_xrefs)

    species_names = get_id_name_mapping("ncbitaxon")
    df["taxonomy_name"] = df["taxonomy_id"].map(species_names.get)

    slim_df = df[columns]
    rows = tqdm(slim_df.values, total=len(slim_df.index), desc=f"mapping {PREFIX}")

    # (note, prefix) pairs already reported, so each is only logged once.
    reported = set()
    for row in rows:
        (
            complexportal_id,
            name,
            definition,
            aliases,
            xrefs,
            taxonomy_id,
            taxonomy_name,
            members,
        ) = row

        identity_xrefs = []
        publications = []
        for reference, note in xrefs:
            if note == "identity":
                identity_xrefs.append(reference)
                continue
            if note == "see-also" and reference.prefix == "pubmed":
                publications.append(reference)
                continue
            pair = (note, reference.prefix)
            if pair not in reported:
                logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
                reported.add(pair)

        if pd.notna(definition):
            cleaned_definition = definition.strip()
        else:
            cleaned_definition = None

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            definition=cleaned_definition,
            synonyms=[Synonym(name=alias) for alias in aliases],
            xrefs=identity_xrefs,
            provenance=publications,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        # The second item of each member pair is not used here.
        for member, _ in members:
            term.append_relationship(has_part, member)

        yield term
コード例 #6
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex git reference used to build the download URLs
        and passed on to ``ensure_df``.
    :param force: Passed on to ``ensure_df``.
    :yields: One term per row of ``entities.csv``, with its definition and
        provenance from ``descriptions.csv`` and the relationships derived
        from ``relations.csv``.
    :raises ValueError: If a relation uses a namespace other than ``HGNC``,
        ``FPLX`` or ``UP``.
    """
    base_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}'

    entities_url = f'{base_url}/entities.csv'
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)

    relations_url = f'{base_url}/relations.csv'
    relations_df = ensure_df(PREFIX, url=relations_url, version=version, header=None, sep=',', dtype=str, force=force)

    definitions_url = f'{base_url}/descriptions.csv'
    definitions_df = ensure_df(
        PREFIX, url=definitions_url, version=version, header=None, sep=',', dtype=str, force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    # TODO add xrefs
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')

    def _get_reference(namespace, name):
        """Turn a namespace/name pair from a relation into a reference.

        Returns ``None`` for the ``UP`` namespace, whose relations are skipped.
        """
        if namespace == 'HGNC':
            identifier = hgnc_name_to_id.get(name)
            if identifier is None:
                # The reference is still created (with no identifier), as before.
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, name)
            return Reference(prefix='hgnc', identifier=identifier, name=name)
        if namespace == 'FPLX':
            return Reference(prefix='fplx', identifier=name, name=name)
        if namespace == 'UP':
            return None
        # A bare ``raise`` here has no active exception to re-raise, so raise
        # an explicit, descriptive error instead.
        raise ValueError(f'[{PREFIX}] unhandled namespace in relation: {namespace}')

    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        h = _get_reference(h_ns, h_name)
        if h is None:
            continue
        # The tail is resolved from ``t_ns`` (the original re-tested ``h_ns``
        # in the UP branch, so UP tails hit the error branch).
        t = _get_reference(t_ns, t_name)
        if t is None:
            continue

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity,) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )

        # Relations in which this entity is the head.
        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                logger.warning('unhandled relation %s', r)

        # Relations in which this entity is the tail, recorded as the inverse.
        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logger.warning('unhandled relation %s', r)
        yield term
コード例 #7
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex git reference used to build the download URLs
        and passed on to ``ensure_df`` and ``_get_xref_df``.
    :param force: Passed on to ``ensure_df``.
    :yields: One term per row of ``entities.csv``, with its definition and
        provenance from ``descriptions.csv``, its cross-references from
        ``_get_xref_df`` and the relationships derived from ``relations.csv``.
    :raises ValueError: If a relation uses a namespace other than ``HGNC``,
        ``FPLX`` or ``UP``.
    """
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"

    entities_url = f"{base_url}/entities.csv"
    entities_df = ensure_df(PREFIX,
                            url=entities_url,
                            version=version,
                            dtype=str,
                            force=force)

    relations_url = f"{base_url}/relations.csv"
    relations_df = ensure_df(PREFIX,
                             url=relations_url,
                             version=version,
                             header=None,
                             sep=",",
                             dtype=str,
                             force=force)

    definitions_url = f"{base_url}/descriptions.csv"
    definitions_df = ensure_df(
        PREFIX,
        url=definitions_url,
        version=version,
        header=None,
        sep=",",
        dtype=str,
        force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    id_xrefs = _get_xref_df(version)

    hgnc_name_to_id = get_name_id_mapping("hgnc")

    def _get_reference(namespace, name):
        """Turn a namespace/name pair from a relation into a reference.

        Returns ``None`` for the ``UP`` namespace, whose relations are skipped.
        """
        if namespace == "HGNC":
            identifier = hgnc_name_to_id.get(name)
            if identifier is None:
                # The reference is still created (with no identifier), as before.
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s",
                    PREFIX, name)
            return Reference(prefix="hgnc", identifier=identifier, name=name)
        if namespace == "FPLX":
            return Reference(prefix="fplx", identifier=name, name=name)
        if namespace == "UP":
            return None
        # A bare ``raise`` here has no active exception to re-raise, so raise
        # an explicit, descriptive error instead.
        raise ValueError(
            f"[{PREFIX}] unhandled namespace in relation: {namespace}")

    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        h = _get_reference(h_ns, h_name)
        if h is None:
            continue
        # The tail is resolved from ``t_ns`` (the original re-tested ``h_ns``
        # in the UP branch, so UP tails hit the error branch).
        t = _get_reference(t_ns, t_name)
        if t is None:
            continue

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity, ) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)]
            if definition is not None else None,
        )

        for xref_reference in id_xrefs.get(entity, []):
            term.append_xref(xref_reference)

        # Relations in which this entity is the head.
        for r, t in out_edges.get(reference, []):
            if r == "isa" and t.prefix == "fplx":
                term.append_parent(t)
            elif r == "isa":
                term.append_relationship(is_a, t)
            elif r == "partof":
                term.append_relationship(part_of, t)
            else:
                logger.warning("unhandled relation %s", r)

        # Relations in which this entity is the tail, recorded as the inverse.
        for r, h in in_edges.get(reference, []):
            if r == "isa":
                term.append_relationship(has_member, h)
            elif r == "partof":
                term.append_relationship(has_part, h)
            else:
                logger.warning("unhandled relation %s", r)
        yield term
コード例 #8
0
def get_terms(version: Optional[str] = None,
              force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Get HGNC terms.

    First yields one term for every non-empty Sequence Ontology identifier in
    ``LOCUS_TYPE_TO_SO``, then one term per entry of the downloaded HGNC JSON
    file. Because this is a generator, the ``unhandled.json`` and
    ``unhandled.md`` reports and the summary log messages that follow the main
    loop are only produced once the iterator has been fully consumed.

    :param version: The version used to format the download URL and passed on
        to ``ensure_path``. Defaults to the first day of the current month,
        formatted as ``YYYY-MM-01``.
    :param force: Passed on to ``ensure_path``.
    :yields: The Sequence Ontology terms followed by the gene terms.
    :raises ValueError: If a status other than ``Approved`` is encountered
        more than once.
    """
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    # Counts the entry keys that are left over after everything known was popped.
    unhandled_entry_keys: typing.Counter[str] = Counter()
    # locus type -> {HGNC identifier -> term} for locus types without a SO mapping.
    unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

    # Emit the Sequence Ontology terms that genes are later given as parents.
    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values()) if so_id
        },
        key=attrgetter("identifier"),
    )

    statuses = set()
    for entry in tqdm(entries,
                      desc=f"Mapping {PREFIX}",
                      unit="gene",
                      unit_scale=True):
        # Known keys are popped from the entry as they are handled, so that
        # whatever remains at the end can be counted as unhandled.
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )
        status = entry.pop("status")
        # NOTE(review): a non-Approved status is tolerated (with a warning) the
        # first time it is seen and raises on any later occurrence - confirm
        # that this is intended.
        if status == "Approved":
            is_obsolete = False
        elif status not in statuses:
            statuses.add(status)
            logger.warning("UNHANDLED %s", status)
            is_obsolete = True
        else:
            raise ValueError(
                f"Unhandled status for hgnc:{identifier}: {status}")

        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX,
                                identifier=identifier,
                                name=symbol),
            is_obsolete=is_obsolete,
        )

        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        # Each item may itself hold several comma-separated identifiers.
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral",
                              identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto(
                    "mirbase",
                    mirbase_id,
                ),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to,
                Reference(prefix="snornabase", identifier=snornabase_id))

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(
                    f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(
                    f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            # A value that is not a list is treated as a single identifier.
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(
                    Reference(prefix=xref_prefix,
                              identifier=str(xref_identifier)))

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(
                Reference(prefix="pubmed", identifier=str(pubmed_id)))

        # Group identifiers and labels are paired up positionally.
        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids,
                                                   gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(
                Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        for previous_symbol in entry.pop("previous_symbol", []):
            term.append_synonym(
                Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(
                Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            # No mapping for this locus type: fall back to the generic parent
            # and keep the raw locus annotations as properties.
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandle_locus_types[locus_type][identifier] = term
            term.append_property("locus_type", locus_type)
            term.append_property("locus_group", locus_group)

        # The name literal had been mangled to "H**o sapiens".
        term.set_species(identifier="9606", name="Homo sapiens")

        # Anything still left in the entry was not handled above.
        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    # Report the genes whose locus type has no Sequence Ontology mapping.
    with open(prefix_directory_join(PREFIX, name="unhandled.json"),
              "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name
                    for hgnc_id, term in v.items()}
                for k, v in unhandle_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandle_locus_types.items()):
            t = tabulate(
                [(
                    hgnc_id,
                    term.name,
                    term.is_obsolete,
                    term.link,
                    ", ".join(p.link for p in term.provenance if p.link),
                ) for hgnc_id, term in sorted(v.items())],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandle_locus_type_counter = Counter(
        {locus_type: len(d)
         for locus_type, d in unhandle_locus_types.items()})
    logger.warning("Unhandled locus types:\n%s",
                   tabulate(unhandle_locus_type_counter.most_common()))
    logger.warning("Unhandled keys:\n%s",
                   tabulate(unhandled_entry_keys.most_common()))
コード例 #9
0
def get_terms(force: bool = False,
              version: Optional[str] = None) -> Iterable[Term]:
    """Get RGD terms.

    :param force: Passed on to ``ensure_df``.
    :param version: Passed on to ``ensure_df``.
    :yields: One term per row of the genes data frame, with synonyms from the
        old names and symbols, cross-references and relationships from the
        columns listed in ``namespace_to_column``, PubMed provenance and the
        species set to Rattus norvegicus.
    """
    df = ensure_df(
        PREFIX,
        url=GENES_URL,
        sep="\t",
        header=0,
        comment="#",
        dtype=str,
        force=force,
        version=version,
        quoting=3,
        error_bad_lines=False,
    )
    rows = tqdm(
        df.iterrows(),
        total=len(df.index),
        desc=f"Mapping {PREFIX}",
        unit_scale=True,
    )
    for _, row in rows:
        # Prefer the name as the definition and fall back to the description.
        definition = None
        for column in ("NAME", "GENE_DESC"):
            if pd.notna(row[column]):
                definition = row[column]
                break

        reference = Reference(
            prefix=PREFIX,
            identifier=row["GENE_RGD_ID"],
            name=row["SYMBOL"],
        )
        term = Term(reference=reference, definition=definition)

        # Former names and symbols are semicolon-delimited.
        for column, synonym_type in (
            ("OLD_NAME", old_name_type),
            ("OLD_SYMBOL", old_symbol_type),
        ):
            cell = row[column]
            if cell and pd.notna(cell):
                for part in cell.split(";"):
                    term.append_synonym(Synonym(name=part, type=synonym_type))

        for prefix, key in namespace_to_column:
            # After ``str``, a missing cell reads as the text "nan", which is
            # filtered out per identifier below.
            joined = str(row[key])
            if not (joined and pd.notna(joined)):
                continue
            for xref_id in joined.split(";"):
                if xref_id == "nan":
                    continue
                if prefix == "uniprot":
                    term.append_relationship(
                        has_gene_product,
                        Reference.auto(prefix=prefix, identifier=xref_id),
                    )
                elif prefix != "ensembl":
                    term.append_xref(Reference(prefix=prefix, identifier=xref_id))
                elif xref_id.startswith(("ENSMUSG", "ENSRNOG")):
                    # second one is reverse strand
                    term.append_xref(Reference(prefix=prefix, identifier=xref_id))
                elif xref_id.startswith("ENSMUST"):
                    term.append_relationship(
                        transcribes_to,
                        Reference(prefix=prefix, identifier=xref_id),
                    )
                elif xref_id.startswith("ENSMUSP"):
                    term.append_relationship(
                        has_gene_product,
                        Reference(prefix=prefix, identifier=xref_id),
                    )
                else:
                    logger.warning("[%s] unhandled xref ensembl:%s", PREFIX, xref_id)

        pubmed_cell = row["CURATED_REF_PUBMED_ID"]
        if pubmed_cell and pd.notna(pubmed_cell):
            for pubmed_id in str(pubmed_cell).split(";"):
                term.append_provenance(
                    Reference(prefix="pubmed", identifier=pubmed_id))

        term.set_species(identifier="10116", name="Rattus norvegicus")
        yield term