Code example #1
    def test_curies(self):
        """Test correct prefixes and identifiers."""
        registry = dict(bioregistry.read_registry())
        registry['decopath'] = {
        }  # TODO decopath needs a real resource and an entry in the bioregistry

        miriam_patterns = {
            k: re.compile(entry['miriam']['pattern'])
            for k, entry in registry.items() if 'miriam' in entry
        }

        dataframes = {
            getter.__name__.removeprefix('get_').removesuffix('_df'): getter()
            for _, getter in DATA
        }

        rows = ['Source Resource', 'Source ID', 'Target Resource', 'Target ID']
        for name, df in dataframes.items():
            with self.subTest(name=name):
                for i, (source_prefix, source_id, target_prefix,
                        target_id) in enumerate(df[rows].values):
                    self.assertIn(source_prefix, registry.keys())
                    self.assertNotEqual(source_prefix, 'kegg')
                    self.assertFalse(
                        bioregistry.is_deprecated(source_prefix),
                        msg=f'[{name}, row {i}] deprecated source prefix: {source_prefix}',
                    )
                    if source_regex := miriam_patterns.get(source_prefix):
                        self.assertRegex(
                            source_id,
                            source_regex,
                            msg=f'[{name}, row {i}] source prefix: {source_prefix}',
                        )
                    self.assertIn(target_prefix, registry.keys())
                    self.assertNotEqual(target_prefix, 'kegg')
                    self.assertFalse(
                        bioregistry.is_deprecated(target_prefix),
                        msg=f'[{name}, row {i}] deprecated target prefix: {target_prefix}',
                    )
                    if target_regex := miriam_patterns.get(target_prefix):
                        self.assertRegex(
                            target_id,
                            target_regex,
                            msg=f'[{name}, row {i}] target prefix: {target_prefix}',
                        )
Code example #2
def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph:
    """Make a super graph containing is_a, part_of, and xref relationships."""
    rv = nx.DiGraph()

    df = ensure_inspector_javert_df()
    for source_ns, source_id, target_ns, target_id, provenance in df.values:
        rv.add_edge(
            f"{source_ns}:{source_id}",
            f"{target_ns}:{target_id}",
            relation="xref",
            provenance=provenance,
        )

    logger.info("getting hierarchies")
    it = sorted(bioregistry.read_registry())
    if use_tqdm:
        it = tqdm(it, desc="Entries")
    for prefix in it:
        if bioregistry.is_deprecated(prefix) or prefix in SKIP:
            continue
        if use_tqdm:
            it.set_postfix({"prefix": prefix})

        hierarchy = get_hierarchy(prefix, include_has_member=True, include_part_of=True)
        rv.add_edges_from(hierarchy.edges(data=True))

    # TODO include translates_to, transcribes_to, and has_variant

    return rv
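
A minimal sketch of how the graph returned above might be inspected; the filtering shown is illustrative and assumes only what the function itself sets (xref edges are added with relation="xref", hierarchy edges keep whatever data get_hierarchy attaches):

graph = bens_magical_ontology(use_tqdm=False)

# Keep only the cross-reference edges added from the Inspector Javert table.
xref_edges = [
    (u, v)
    for u, v, data in graph.edges(data=True)
    if data.get("relation") == "xref"
]
print(f"{len(xref_edges):,} xref edges of {graph.number_of_edges():,} total edges")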
Code example #3
File: ui.py Project: pagreene/bioregistry
def resource(prefix: str):
    """Serve the a Bioregistry entry page."""
    prefix = _normalize_prefix_or_404(prefix, '.' + resource.__name__)
    if not isinstance(prefix, str):
        return prefix
    example = bioregistry.get_example(prefix)
    return render_template(
        'resource.html',
        prefix=prefix,
        name=bioregistry.get_name(prefix),
        example=example,
        mappings=_get_resource_mapping_rows(prefix),
        synonyms=bioregistry.get_synonyms(prefix),
        homepage=bioregistry.get_homepage(prefix),
        pattern=bioregistry.get_pattern(prefix),
        version=bioregistry.get_version(prefix),
        has_terms=bioregistry.has_terms(prefix),
        obo_download=bioregistry.get_obo_download(prefix),
        owl_download=bioregistry.get_owl_download(prefix),
        namespace_in_lui=bioregistry.namespace_in_lui(prefix),
        deprecated=bioregistry.is_deprecated(prefix),
        contact=bioregistry.get_email(prefix),
        banana=bioregistry.get_banana(prefix),
        description=bioregistry.get_description(prefix),
        providers=None if example is None else _get_resource_providers(
            prefix, example),
    )
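
The excerpt above omits how the view is registered; a minimal, hypothetical wiring might look like the sketch below. The app object, route, and trimmed-down keyword set are assumptions for illustration, not taken from the project:

from flask import Flask, render_template

import bioregistry

app = Flask(__name__)


@app.route('/registry/<prefix>')  # hypothetical route
def resource_minimal(prefix: str):
    """Serve a stripped-down Bioregistry entry page (illustrative only)."""
    return render_template(
        'resource.html',
        prefix=prefix,
        name=bioregistry.get_name(prefix),
        deprecated=bioregistry.is_deprecated(prefix),
    )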
Code example #4
def iter_cached_obo() -> Iterable[Tuple[str, str]]:
    """Iterate over cached OBO paths."""
    for prefix in os.listdir(RAW_DIRECTORY):
        if (
            prefix in GLOBAL_SKIP
            or has_no_download(prefix)
            or bioregistry.is_deprecated(prefix)
        ):
            continue
        d = os.path.join(RAW_DIRECTORY, prefix)
        if not os.path.isdir(d):
            continue
        for x in os.listdir(d):
            if x.endswith(".obo"):
                p = os.path.join(d, x)
                yield prefix, p
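
A minimal sketch of consuming the iterator above, for example to count how many cached OBO files exist per prefix (purely illustrative):

from collections import Counter

obo_counts = Counter()
for prefix, path in iter_cached_obo():
    # each item is a (prefix, path to a cached .obo file) pair
    obo_counts[prefix] += 1

for prefix, count in obo_counts.most_common():
    print(f"{prefix}\t{count}")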
Code example #5
File: test_data.py Project: pagreene/bioregistry
    def test_name_expansions(self):
        """Test that default names are not capital acronyms."""
        for prefix in bioregistry.read_registry():
            if bioregistry.is_deprecated(prefix):
                continue
            entry = bioregistry.get(prefix)
            if 'name' in entry:
                continue
            name = bioregistry.get_name(prefix)
            if prefix == name.lower() and name.upper() == name:
                with self.subTest(prefix=prefix):
                    self.fail(msg=f'{prefix} acronym ({name}) is not expanded')

            if '.' in prefix and prefix.split('.')[0] == name.lower():
                with self.subTest(prefix=prefix):
                    self.fail(msg=f'{prefix} acronym ({name}) is not expanded')
Code example #6
File: test_data.py Project: pagreene/bioregistry
    def test_no_redundant_acronym(self):
        """Test that there is no redundant acronym in the name.

        For example, "Amazon Standard Identification Number (ASIN)" is a problematic
        name for prefix "asin".
        """
        for prefix in bioregistry.read_registry():
            if bioregistry.is_deprecated(prefix):
                continue
            entry = bioregistry.get(prefix)
            if 'name' in entry:
                continue
            name = bioregistry.get_name(prefix)

            try:
                _, rest = name.rstrip(')').rsplit('(', 1)
            except ValueError:
                continue
            if rest.lower() == prefix.lower():
                with self.subTest(prefix=prefix):
                    self.fail(msg=f'{prefix} has redundant acronym in name "{name}"')
Code example #7
def load(
    load_all: bool,
    load_resources: bool = False,
    load_names: bool = False,
    load_alts: bool = False,
    load_xrefs: bool = True,
    load_synonyms: bool = False,
    reset: bool = False,
) -> None:
    """Load the database."""
    if reset:
        drop_all()
    create_all()

    if load_resources or load_all:
        prefix_to_resource: Dict[str, Resource] = {}
        prefixes = {resource.prefix for resource in Resource.query.all()}

        for prefix, entry in tqdm(bioregistry.read_registry().items(),
                                  desc="loading resources"):
            if bioregistry.is_deprecated(prefix):
                continue
            if prefix in prefixes:
                continue
            prefix_to_resource[prefix] = resource_model = Resource(
                prefix=prefix,
                name=entry["name"],
                pattern=bioregistry.get_pattern(prefix),
            )
            session.add(resource_model)
        session.commit()

    ooh_na_na_path = ensure_ooh_na_na()
    synonyms_path = ensure_synonyms()
    xrefs_path = ensure_inspector_javert()

    if load_alts or load_all:
        alts_path = ensure_alts()
        alts_df = pd.read_csv(alts_path, sep="\t", dtype=str)  # prefix, alt, identifier
        logger.info("inserting %d alt identifiers", len(alts_df.index))
        alts_df.to_sql(name=Alt.__tablename__, con=engine, if_exists="append", index=False)
        logger.info("committing alt identifiers")
        session.commit()
        logger.info("done committing alt identifiers")

    for label, path, table, columns, checker in [
        ("names", ooh_na_na_path, Reference, None, load_names),
        ("synonyms", synonyms_path, Synonym, ["prefix", "identifier", "name"], load_synonyms),
        (
            "xrefs",
            xrefs_path,
            Xref,
            ["prefix", "identifier", "xref_prefix", "xref_identifier", "source"],
            load_xrefs,
        ),
    ]:
        if not checker and not load_all:
            continue
        logger.info("beginning insertion of %s", label)
        conn = engine.raw_connection()
        logger.info("inserting with low-level copy of %s from: %s", label,
                    path)
        if columns:
            columns = ", ".join(columns)
            logger.info("corresponding to columns: %s", columns)
            columns = f" ({columns})"
        else:
            columns = ""

        with conn.cursor() as cursor, gzip.open(path) as file:
            # next(file)  # skip the header
            sql = f"""COPY {table.__tablename__}{columns} FROM STDIN WITH CSV HEADER DELIMITER E'\\t' QUOTE E'\\b';"""
            logger.info("running SQL: %s", sql)
            cursor.copy_expert(sql=sql, file=file)

        logger.info("committing %s", label)
        conn.commit()
        logger.info("done committing %s", label)

    logger.info(f"number resources loaded: {Resource.query.count():,}")
    logger.info(f"number references loaded: {Reference.query.count():,}")
    logger.info(f"number alts loaded: {Alt.query.count():,}")
    logger.info(f"number synonyms loaded: {Synonym.query.count():,}")
    logger.info(f"number xrefs loaded: {Xref.query.count():,}")
Code example #8
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_below_inclusive: bool = True,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, X]]:
    """Yield all mappings extracted from each database given.

    :param f: A function that takes a prefix and gives back something that will be used by an outer function.
    :param use_tqdm: If true, use the tqdm progress bar
    :param skip_below: If given, skip sources whose prefixes sort before this one (used for iterative curation)
    :param skip_pyobo: If true, skip sources implemented in PyOBO
    :param skip_set: A pre-defined blacklist to skip
    :param strict: If true, will raise exceptions and crash the program instead of logging them.
    :param kwargs: Keyword arguments passed to ``f``.
    :yields: A prefix and the result of the callable ``f``

    :raises TypeError: If a type error is raised, it gets re-raised
    :raises urllib.error.HTTPError: If the resource could not be downloaded
    :raises urllib.error.URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    prefixes = list(
        _prefixes(
            skip_set=skip_set,
            skip_below=skip_below,
            skip_pyobo=skip_pyobo,
            skip_below_inclusive=skip_below_inclusive,
        ))
    prefix_it = tqdm(prefixes,
                     disable=not use_tqdm,
                     desc=f"Building with {f.__name__}()",
                     unit="resource")
    for prefix in prefix_it:
        prefix_it.set_postfix(prefix=prefix)
        try:
            yv = f(prefix, **kwargs)  # type:ignore
        except urllib.error.HTTPError as e:
            logger.warning("[%s] HTTP %s: unable to download %s", prefix,
                           e.getcode(), e.geturl())
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except urllib.error.URLError:
            logger.warning("[%s] unable to download", prefix)
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except MissingPrefix as e:
            logger.warning("[%s] missing prefix: %s", prefix, e)
            if strict and not bioregistry.is_deprecated(prefix):
                raise e
        except subprocess.CalledProcessError:
            logger.warning("[%s] ROBOT was unable to convert OWL to OBO",
                           prefix)
        except UnhandledFormat as e:
            logger.warning("[%s] %s", prefix, e)
        except ValueError as e:
            if _is_xml(e):
                # this means that it tried doing parsing on an xml page
                logger.info(
                    "no resource available for %s. See http://www.obofoundry.org/ontology/%s",
                    prefix,
                    prefix,
                )
            else:
                logger.exception("[%s] got exception %s while parsing", prefix,
                                 e.__class__.__name__)
        except TypeError as e:
            logger.exception("[%s] got exception %s while parsing", prefix,
                             e.__class__.__name__)
            if strict:
                raise e
        else:
            yield prefix, yv
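
A minimal usage sketch; the choice of get_hierarchy (borrowed from code example #2) as the per-prefix callable is illustrative:

# strict=False downgrades download and parsing errors to log messages,
# so a single broken resource does not abort the whole run.
hierarchies = dict(iter_helper_helper(get_hierarchy, strict=False))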
Code example #9
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, X]]:
    """Yield all mappings extracted from each database given.

    :param f: A function that takes a prefix and gives back something that will be used by an outer function.
    :param use_tqdm: If true, use the tqdm progress bar
    :param skip_below: If given, skip sources whose prefixes sort before this one (used for iterative curation)
    :param skip_pyobo: If true, skip sources implemented in PyOBO
    :param skip_set: A pre-defined blacklist to skip
    :param strict: If true, will raise exceptions and crash the program instead of logging them.
    :param kwargs: Keyword arguments passed to ``f``.
    :yields: A prefix and the result of the callable ``f``

    :raises TypeError: If a type error is raised, it gets re-raised
    :raises urllib.error.HTTPError: If the resource could not be downloaded
    :raises urllib.error.URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    it = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        it = tqdm(it, disable=None, desc='Resources')
    for prefix in it:
        if use_tqdm:
            it.set_postfix({'prefix': prefix})
        if prefix in SKIP:
            tqdm.write(f'skipping {prefix} because in default skip set')
            continue
        if skip_set and prefix in skip_set:
            tqdm.write(f'skipping {prefix} because in skip set')
            continue
        if skip_below is not None and prefix < skip_below:
            continue
        if skip_pyobo and has_nomenclature_plugin(prefix):
            continue
        try:
            yv = f(prefix, **kwargs)
        except NoBuild:
            continue
        except urllib.error.HTTPError as e:
            logger.warning('[%s] HTTP %s: unable to download %s', prefix,
                           e.getcode(), e.geturl())
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except urllib.error.URLError:
            logger.warning('[%s] unable to download', prefix)
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except MissingPrefix as e:
            logger.warning('[%s] missing prefix: %s', prefix, e)
            if strict:
                raise e
        except ValueError as e:
            if _is_xml(e):
                # this means that it tried doing parsing on an xml error page
                logger.info(
                    'no resource available for %s. See http://www.obofoundry.org/ontology/%s',
                    prefix, prefix)
            else:
                logger.exception('[%s] error while parsing: %s', prefix,
                                 e.__class__)
            if strict:
                raise e
        except TypeError as e:
            logger.exception('TypeError on %s', prefix)
            if strict:
                raise e
        else:
            yield prefix, yv
Code example #10
def _iter_metadata(**kwargs):
    for prefix, data in iter_helper_helper(get_metadata, **kwargs):
        version = data["version"]
        tqdm.write(f"[{prefix}] using version {version}")
        yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
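
A minimal sketch of consuming the generator above, e.g. to tabulate version metadata with pandas; the column names are illustrative:

import pandas as pd

metadata_df = pd.DataFrame(
    _iter_metadata(use_tqdm=False),
    columns=["prefix", "version", "date", "deprecated"],
)
print(metadata_df.head())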