コード例 #1
0
ファイル: bengo.py プロジェクト: shunsunsun/pyobo
def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph:
    """Build one directed graph combining xrefs with per-resource hierarchies.

    First, every row of the dataframe returned by
    ``ensure_inspector_javert_df()`` becomes an edge between two CURIE-style
    nodes (``namespace:identifier``), annotated with ``relation='xref'`` and
    the row's provenance. Then, for each bioregistry prefix that is neither
    deprecated nor listed in ``SKIP``, the edges (including their data) of
    the hierarchy requested with ``has_member`` and ``part_of`` relations
    are merged into the same graph.

    :param use_tqdm: If true, wrap the prefix iteration in a tqdm progress bar.
    :returns: The combined directed graph.
    """
    graph = nx.DiGraph()

    xrefs_df = ensure_inspector_javert_df()
    for row in xrefs_df.values:
        source_ns, source_id, target_ns, target_id, provenance = row
        graph.add_edge(
            f'{source_ns}:{source_id}',
            f'{target_ns}:{target_id}',
            relation='xref',
            provenance=provenance,
        )

    logger.info('getting hierarchies')
    prefixes = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        prefixes = tqdm(prefixes, desc='Entries')
    for prefix in prefixes:
        if not (bioregistry.is_deprecated(prefix) or prefix in SKIP):
            if use_tqdm:
                # Show which prefix is currently being processed.
                prefixes.set_postfix({'prefix': prefix})

            hierarchy = get_hierarchy(
                prefix,
                include_has_member=True,
                include_part_of=True,
            )
            graph.add_edges_from(hierarchy.edges(data=True))

    # TODO include translates_to, transcribes_to, and has_variant

    return graph
コード例 #2
0
ファイル: metaregistry.py プロジェクト: shunsunsun/pyobo
def get_prefix_to_miriam_prefix() -> Mapping[str, Tuple[str, str]]:
    """Map bioregistry prefixes to their MIRIAM prefix and embedding flag.

    Only entries that carry a ``miriam`` record containing a ``prefix`` key
    are included.

    :returns: A dictionary from bioregistry prefix to a pair of the MIRIAM
        prefix and the entry's ``namespaceEmbeddedInLui`` value.
    """
    rv = {}
    for prefix, entry in bioregistry.read_bioregistry().items():
        if 'miriam' not in entry:
            continue
        miriam = entry['miriam']
        if 'prefix' not in miriam:
            continue
        rv[prefix] = (miriam['prefix'], miriam['namespaceEmbeddedInLui'])
    return rv
コード例 #3
0
ファイル: metaregistry.py プロジェクト: shunsunsun/pyobo
def get_curated_urls() -> Mapping[str, str]:
    """Collect the custom download URL of every prefix that declares one.

    :returns: A dictionary from bioregistry prefix to the value of the
        entry's ``download`` key, for entries that have that key.
    """
    registry = bioregistry.read_bioregistry()
    #: URLs of resources that weren't listed in OBO Foundry properly
    return {
        prefix: entry['download']
        for prefix, entry in registry.items()
        if 'download' in entry
    }
コード例 #4
0
ファイル: metaregistry.py プロジェクト: shunsunsun/pyobo
def get_not_available_as_obo():
    """Get the set of prefixes flagged as not being available as OBO.

    A prefix is included when its bioregistry entry has a
    ``not_available_as_obo`` key whose value is truthy.

    :returns: A set of bioregistry prefixes.
    """
    unavailable = set()
    for prefix, entry in bioregistry.read_bioregistry().items():
        #: Prefixes that have been manually annotated as not being available in OBO
        if 'not_available_as_obo' in entry and entry['not_available_as_obo']:
            unavailable.add(prefix)
    return unavailable
コード例 #5
0
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_pyobo: bool = False,
    strict: bool = True,
) -> Iterable[Tuple[str, X]]:
    """Apply ``f`` to every bioregistry prefix and yield the successful results.

    Prefixes are visited in sorted order. A prefix is skipped when it is in
    ``SKIP``, sorts before ``skip_below``, or (with ``skip_pyobo``) has a
    nomenclature plugin. A ``NoBuild`` raised by ``f`` silently skips the prefix.

    :param f: A function that takes a prefix and gives back something that will be used by an outer function.
    :param use_tqdm: If true, show a tqdm progress bar over the prefixes.
    :param skip_below: If given, skip prefixes that compare less than this string.
    :param skip_pyobo: If true, skip prefixes for which ``has_nomenclature_plugin`` is true.
    :param strict: If true, will raise exceptions and crash the program instead of only logging them.
    :yields: Pairs of a prefix and the value ``f`` returned for it.
    :raises HTTPError: If the resource could not be downloaded
    :raises URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    prefixes = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        prefixes = tqdm(prefixes)
    for prefix in prefixes:
        # The guards are evaluated left to right, so the plugin lookup only
        # happens for prefixes that survived the cheaper checks.
        if (
            prefix in SKIP
            or (skip_below is not None and prefix < skip_below)
            or (skip_pyobo and has_nomenclature_plugin(prefix))
        ):
            continue
        if use_tqdm:
            prefixes.set_postfix({'prefix': prefix})
        try:
            result = f(prefix)
        except NoBuild:
            continue
        except urllib.error.HTTPError as e:
            logger.warning('[%s] HTTP %s: unable to download %s', prefix,
                           e.getcode(), e.geturl())
            if strict:
                raise
            continue
        except urllib.error.URLError:
            logger.warning('[%s] unable to download', prefix)
            if strict:
                raise
            continue
        except ValueError as e:
            if not _is_xml(e):
                logger.warning('[%s] error while parsing: %s', prefix, e)
            else:
                # The parser was handed an XML page rather than the expected
                # resource, i.e. nothing is available for this prefix.
                logger.info(
                    'no resource available for %s. See http://www.obofoundry.org/ontology/%s',
                    prefix, prefix)
            if strict:
                raise e
            continue
        yield prefix, result
コード例 #6
0
ファイル: wikidata.py プロジェクト: fossabot/pyobo
def iterate_wikidata_dfs(*, use_tqdm: bool = True) -> Iterable[pd.DataFrame]:
    """Yield one Wikidata xref dataframe per bioregistry prefix with a Wikidata property.

    Prefixes are processed in sorted order. ``pubmed``, ``pmc`` and ``orcid``
    are skipped, and a prefix whose results cannot be decoded as JSON is
    logged and skipped.

    :param use_tqdm: If true, show a tqdm progress bar over the properties.
    :yields: The dataframe returned by ``get_wikidata_df`` for each prefix.
    """
    too_many = {'pubmed', 'pmc', 'orcid'}

    wikidata_properties = {}
    for registry_prefix, entry in bioregistry.read_bioregistry().items():
        if 'wikidata' in entry and 'property' in entry['wikidata']:
            wikidata_properties[registry_prefix] = entry['wikidata']['property']
    # wikidata_properties.update(get_wikidata_properties())

    pairs = sorted(wikidata_properties.items())
    if use_tqdm:
        pairs = tqdm(pairs, desc='Wikidata properties')
    for prefix, wikidata_property in pairs:
        if prefix in too_many:
            continue  # too many
        try:
            yield get_wikidata_df(prefix, wikidata_property)
        except json.decoder.JSONDecodeError as e:
            logger.warning('[%s] Problem decoding results from %s: %s', prefix, wikidata_property, e)
コード例 #7
0
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, X]]:
    """Apply ``f`` to every bioregistry prefix and yield the successful results.

    Prefixes are visited in sorted order. A ``NoBuild`` raised by ``f``
    silently skips the prefix. Download errors are only re-raised in strict
    mode when the prefix is not deprecated.

    :param f: A function that takes a prefix and gives back something that will be used by an outer function.
    :param use_tqdm: If true, use the tqdm progress bar
    :param skip_below: If given, skip sources whose names are less than this (used for iterative curation)
    :param skip_pyobo: If true, skip sources implemented in PyOBO
    :param skip_set: A pre-defined blacklist to skip
    :param strict: If true, will raise exceptions and crash the program instead of only logging them.
    :param kwargs: Keyword arguments passed to ``f``.
    :yields: A prefix and the result of the callable ``f``

    :raises TypeError: If a type error is raised, it gets re-raised
    :raises urllib.error.HTTPError: If the resource could not be downloaded
    :raises urllib.error.URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    prefixes = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        prefixes = tqdm(prefixes, disable=None, desc='Resources')
    for prefix in prefixes:
        if use_tqdm:
            prefixes.set_postfix({'prefix': prefix})

        # Announced skips: default skip set first, then the caller's.
        if prefix in SKIP:
            tqdm.write(f'skipping {prefix} because in default skip set')
            continue
        if skip_set and prefix in skip_set:
            tqdm.write(f'skipping {prefix} because in skip set')
            continue
        # Silent skips.
        if (
            (skip_below is not None and prefix < skip_below)
            or (skip_pyobo and has_nomenclature_plugin(prefix))
        ):
            continue

        try:
            result = f(prefix, **kwargs)
        except NoBuild:
            continue
        except urllib.error.HTTPError as e:
            logger.warning('[%s] HTTP %s: unable to download %s', prefix,
                           e.getcode(), e.geturl())
            if strict and not bioregistry.is_deprecated(prefix):
                raise
            continue
        except urllib.error.URLError:
            logger.warning('[%s] unable to download', prefix)
            if strict and not bioregistry.is_deprecated(prefix):
                raise
            continue
        except MissingPrefix as e:
            logger.warning('[%s] missing prefix: %s', prefix, e)
            if strict:
                raise e
            continue
        except ValueError as e:
            if not _is_xml(e):
                logger.exception('[%s] error while parsing: %s', prefix,
                                 e.__class__)
            else:
                # The parser was handed an XML page rather than the expected
                # resource, i.e. nothing is available for this prefix.
                logger.info(
                    'no resource available for %s. See http://www.obofoundry.org/ontology/%s',
                    prefix, prefix)
            if strict:
                raise e
            continue
        except TypeError as e:
            logger.exception('TypeError on %s', prefix)
            if strict:
                raise e
            continue
        yield prefix, result
コード例 #8
0
ファイル: legacy_loader.py プロジェクト: shunsunsun/pyobo
def load(
    load_all: bool,
    load_resources: bool = False,
    load_names: bool = False,
    load_alts: bool = False,
    load_xrefs: bool = True,
    load_synonyms: bool = False,
    reset: bool = False,
) -> None:
    """Load the database.

    Tables are created if necessary (after being dropped when ``reset`` is
    set), then each selected data set is inserted and committed, and finally
    the row count of every table is logged.

    :param load_all: If true, load every data set regardless of the individual flags.
    :param load_resources: If true, insert a ``Resource`` row for each
        non-deprecated bioregistry prefix that is not already in the table.
    :param load_names: If true, bulk-load the names file into the ``Reference`` table.
    :param load_alts: If true, load the alternative identifiers file into the ``Alt`` table.
    :param load_xrefs: If true, bulk-load the xrefs file into the ``Xref`` table.
    :param load_synonyms: If true, bulk-load the synonyms file into the ``Synonym`` table.
    :param reset: If true, drop all tables before creating them.
    """
    if reset:
        drop_all()
    create_all()

    if load_resources or load_all:
        existing_prefixes = {resource.prefix for resource in Resource.query.all()}

        for prefix, entry in tqdm(bioregistry.read_bioregistry().items(),
                                  desc='loading resources'):
            if bioregistry.is_deprecated(prefix):
                continue
            if prefix in existing_prefixes:
                continue
            session.add(Resource(
                prefix=prefix,
                name=entry['name'],
                pattern=bioregistry.get_pattern(prefix),
            ))
        session.commit()

    # NOTE: these paths are resolved regardless of which flags are set,
    # matching the original behavior.
    ooh_na_na_path = ensure_ooh_na_na()
    synonyms_path = ensure_synonyms()
    xrefs_path = ensure_inspector_javert()

    if load_alts or load_all:
        alts_path = ensure_alts()
        alts_df = pd.read_csv(alts_path, sep='\t',
                              dtype=str)  # prefix, alt, identifier
        logger.info('inserting %d alt identifiers', len(alts_df.index))
        alts_df.to_sql(name=Alt.__tablename__,
                       con=engine,
                       if_exists='append',
                       index=False)
        logger.info('committing alt identifier')
        session.commit()
        logger.info('done committing alt identifiers')

    for label, path, table, columns, checker in [
        ('names', ooh_na_na_path, Reference, None, load_names),
        ('synonyms', synonyms_path, Synonym, ['prefix', 'identifier',
                                              'name'], load_synonyms),
        ('xrefs', xrefs_path, Xref,
         ['prefix', 'identifier', 'xref_prefix', 'xref_identifier',
          'source'], load_xrefs),
    ]:
        if not checker and not load_all:
            continue
        logger.info('beginning insertion of %s', label)
        # A raw DBAPI connection is needed for the low-level COPY below
        # (presumably psycopg2, given ``copy_expert`` - confirm against the
        # engine configuration). It must be closed explicitly, otherwise every
        # loaded data set leaks one connection.
        conn = engine.raw_connection()
        try:
            logger.info('inserting with low-level copy of %s from: %s', label,
                        path)
            if columns:
                joined_columns = ', '.join(columns)
                logger.info('corresponding to columns: %s', joined_columns)
                column_clause = f' ({joined_columns})'
            else:
                column_clause = ''

            with conn.cursor() as cursor, gzip.open(path) as file:
                # The header row is skipped by the HEADER option of COPY.
                sql = f'''COPY {table.__tablename__}{column_clause} FROM STDIN WITH CSV HEADER DELIMITER E'\\t' QUOTE E'\\b';'''
                logger.info('running SQL: %s', sql)
                cursor.copy_expert(sql=sql, file=file)

            logger.info('committing %s', label)
            conn.commit()
            logger.info('done committing %s', label)
        finally:
            conn.close()

    logger.info(f'number resources loaded: {Resource.query.count():,}')
    logger.info(f'number references loaded: {Reference.query.count():,}')
    logger.info(f'number alts loaded: {Alt.query.count():,}')
    logger.info(f'number synonyms loaded: {Synonym.query.count():,}')
    logger.info(f'number xrefs loaded: {Xref.query.count():,}')
コード例 #9
0
ファイル: metaregistry.py プロジェクト: shunsunsun/pyobo
def _get_map(registry: str) -> Mapping[str, str]:
    """Map bioregistry prefixes to their prefix in the given registry.

    :param registry: The key under which a bioregistry entry stores its
        record for the registry of interest.
    :returns: A dictionary from bioregistry prefix to the ``prefix`` value of
        that record, for every entry that has the key.
    """
    mapping = {}
    for prefix, entry in bioregistry.read_bioregistry().items():
        if registry in entry:
            mapping[prefix] = entry[registry]['prefix']
    return mapping
コード例 #10
0
 def key(self) -> str:
     """Get the OBO Foundry key.

     The key is the ``prefix`` stored in the ``obofoundry`` record of the
     bioregistry entry identified by ``self.bioregistry_id``.
     """
     entry = bioregistry.read_bioregistry()[self.bioregistry_id]
     return entry['obofoundry']['prefix']
コード例 #11
0
import logging
from typing import Iterable, Mapping, Optional, Type, Union

import bioregistry
from bioregistry.external.ols import get_ols
from bioregistry.resolve import _clean_version, get_name

from bioversions.utils import Getter, VersionType

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Bioregistry identifier -> OLS prefix, for every entry that has an ``ols``
# record. Built once, when the module is imported.
bioregistry_id_to_ols_id = {
    prefix: entry['ols']['prefix']
    for prefix, entry in bioregistry.read_bioregistry().items()
    if 'ols' in entry
}


def _get_version_type(bioregistry_id) -> Optional[VersionType]:
    """Determine the version type from a bioregistry entry's OLS metadata.

    :param bioregistry_id: The identifier passed to ``bioregistry.get``.
    :returns: ``VersionType.date`` if the entry has a truthy
        ``ols_version_date_format``; otherwise the ``VersionType`` member
        named by ``ols_version_type`` if that is truthy. In the remaining
        case a warning is logged and the visible code returns nothing
        explicitly.
    """
    # NOTE(review): assumes ``bioregistry.get`` returns a dict-like entry; a
    # missing entry would make the ``.get`` calls below fail - confirm.
    ols_entry = bioregistry.get(bioregistry_id)
    ols_version_type = ols_entry.get('ols_version_type')
    ols_version_date_format = ols_entry.get('ols_version_date_format')
    # A date format takes precedence over an explicit version type.
    if ols_version_date_format:
        return VersionType.date
    elif ols_version_type:
        # Look up the enum member whose name matches the stored string.
        return getattr(VersionType, ols_version_type)
    else:
        logger.warning('[%s] missing version type', bioregistry_id)