Example #1
0
def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph:
    """Build one directed graph that merges xrefs with per-prefix hierarchies.

    Every row of the Inspector Javert dataframe becomes an ``xref`` edge that
    carries its provenance. Then, for each registry prefix that is neither
    deprecated nor in ``SKIP``, the edges of its hierarchy (requested with
    ``has_member`` and ``part_of`` included) are copied into the same graph.

    :param use_tqdm: If true, show a progress bar over the registry prefixes.
    :returns: The combined directed graph.
    """
    graph = nx.DiGraph()

    # Each row unpacks to (source prefix, source id, target prefix, target id, provenance).
    xrefs_df = ensure_inspector_javert_df()
    for source_ns, source_id, target_ns, target_id, provenance in xrefs_df.values:
        source_curie = f"{source_ns}:{source_id}"
        target_curie = f"{target_ns}:{target_id}"
        graph.add_edge(source_curie, target_curie, relation="xref", provenance=provenance)

    logger.info("getting hierarchies")
    prefixes = sorted(bioregistry.read_registry())
    if use_tqdm:
        prefixes = tqdm(prefixes, desc="Entries")
    for prefix in prefixes:
        if bioregistry.is_deprecated(prefix) or prefix in SKIP:
            continue
        if use_tqdm:
            # Only prefixes that are actually processed show up in the bar's postfix.
            prefixes.set_postfix({"prefix": prefix})

        prefix_hierarchy = get_hierarchy(
            prefix,
            include_has_member=True,
            include_part_of=True,
        )
        graph.add_edges_from(prefix_hierarchy.edges(data=True))

    # TODO include translates_to, transcribes_to, and has_variant

    return graph
Example #2
0
    def test_url_auto(self):
        """Test that Identifiers.org answers with a redirect for example identifiers.

        A prefix is checked only when it is not in ``IDOT_BROKEN``, has an
        example identifier, has at least one of the ``example``, ``banana`` or
        ``pattern`` keys in its entry, and yields an Identifiers.org URL.
        The request is made without following redirects and must return 302.
        """
        curated_keys = ('example', 'banana', 'pattern')
        for prefix, entry in bioregistry.read_registry().items():
            if prefix in IDOT_BROKEN:
                continue
            identifier = bioregistry.get_example(prefix)
            if identifier is None:
                continue
            if not any(key in entry for key in curated_keys):
                continue

            url = get_identifiers_org_url(prefix, identifier)
            if url is None:
                continue

            print(prefix)
            with self.subTest(prefix=prefix, identifier=identifier):
                # Checking that the URL starts with the MIRIAM prefix is deliberately
                # left out: the CURIE generation often throws away the prefix.
                res = self.session.get(url, allow_redirects=False)
                header = dedent(f'''\
                Prefix: {prefix}
                Identifier: {identifier}
                URL: {url}
                Text: ''')
                wrapped_text = fill(res.text, 70, subsequent_indent='      ')
                self.assertEqual(302, res.status_code, msg='\n' + header + wrapped_text)
Example #3
0
    def test_unique_keys(self):
        """Test that no two prefixes collapse to the same normalized form.

        Sorting by the normalized key puts any colliding prefixes next to each
        other, so comparing neighbours is enough to detect a duplicate.
        """
        ordered = sorted(bioregistry.read_registry(), key=norm)
        for first, second in pairwise(ordered):
            with self.subTest(a=first, b=second):
                self.assertNotEqual(norm(first), norm(second))
Example #4
0
def _prefixes(
    skip_below: Optional[str] = None,
    skip_below_inclusive: bool = True,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
) -> Iterable[str]:
    """Yield, in sorted order, the registry prefixes that pass every filter.

    :param skip_below: If given, prefixes sorting before this value are skipped.
    :param skip_below_inclusive: If true, ``skip_below`` itself is still yielded;
        if false, ``skip_below`` is skipped as well.
    :param skip_pyobo: If true, skip prefixes that have a nomenclature plugin.
    :param skip_set: Extra prefixes to skip (a message is written for each).
    :yields: Prefixes whose resource has own terms and has either a
        nomenclature plugin or a download.
    """
    for prefix, resource in sorted(bioregistry.read_registry().items()):
        if resource.no_own_terms:
            continue
        if prefix in SKIP:
            tqdm.write(f"skipping {prefix} because in default skip set")
            continue
        if skip_set and prefix in skip_set:
            tqdm.write(f"skipping {prefix} because in skip set")
            continue
        if skip_below is not None:
            if skip_below_inclusive:
                before_cutoff = prefix < skip_below
            else:
                before_cutoff = prefix <= skip_below
            if before_cutoff:
                continue

        has_pyobo = has_nomenclature_plugin(prefix)
        has_download = resource.has_download()
        if skip_pyobo and has_pyobo:
            continue
        if not (has_pyobo or has_download):
            # Nothing to load the resource from.
            continue
        yield prefix
Example #5
0
def _no_download() -> Set[str]:
    """Get the set of prefixes that have neither an OBO nor an OWL download."""
    missing = set()
    for prefix in bioregistry.read_registry():
        if bioregistry.get_obo_download(prefix) is not None:
            continue
        if bioregistry.get_owl_download(prefix) is not None:
            continue
        missing.add(prefix)
    return missing
Example #6
0
 def test_bioregistry_ids(self):
     """Test that every getter's Bioregistry identifier is a registry prefix.

     Getters whose ``bioregistry_id`` is ``None`` are not checked.
     """
     known_prefixes = set(bioregistry.read_registry())
     annotated_getters = (
         getter
         for getter in get_getters()
         if getter.bioregistry_id is not None
     )
     for getter in annotated_getters:
         with self.subTest(name=getter.name):
             self.assertIn(getter.bioregistry_id, known_prefixes)
Example #7
0
 def test_homepage_http(self):
     """Test that every homepage that is present starts with ``http``."""
     for prefix in bioregistry.read_registry():
         homepage = bioregistry.get_homepage(prefix)
         if homepage is not None and not homepage.startswith('http'):
             with self.subTest(prefix=prefix):
                 self.fail(msg=f'malformed homepage: {homepage}')
Example #8
0
 def test_email(self):
     """Test that every contact email that is present matches ``EMAIL_RE``.

     The email is looked up under the ``contact`` key, with ``obofoundry`` and
     ``ols`` passed as the sources to consult.
     """
     sources = ('obofoundry', 'ols')
     for prefix in bioregistry.read_registry():
         email = _get_prefix_key(prefix, 'contact', sources)
         if email is not None and not EMAIL_RE.match(email):
             with self.subTest(prefix=prefix):
                 self.fail(msg=f'bad email: {email}')
Example #9
0
    def test_synonyms(self):
        """Test that no synonym collides with the normalized form of another prefix."""
        registry = bioregistry.read_registry()
        norm_prefixes = {norm(prefix) for prefix in registry}

        for key, entry in registry.items():
            synonyms = entry.get('synonyms', [])
            for synonym in synonyms:
                with self.subTest(key=key, synonym=synonym):
                    # A synonym may coincide with its own entry's normalized prefix,
                    # so that one value is removed before the membership check.
                    # NOTE(review): the synonym itself is compared un-normalized
                    # against normalized prefixes - confirm this is intended.
                    other_prefixes = norm_prefixes - {norm(key)}
                    self.assertNotIn(synonym, other_prefixes)
Example #10
0
def upload():
    """Generate a CX graph of the Bioregistry and upload it to NDEx.

    The graph has one node per external registry (metaprefix) and one node per
    resource (prefix). A ``provides`` edge links a resource to each resource it
    provides for, and a ``listed`` edge links a resource to each external
    registry whose metaprefix appears as a key in the resource's entry.
    """
    builder = NiceCXBuilder()
    builder.set_name('Bioregistry')
    builder.add_network_attribute(
        'description',
        'An integrative meta-registry of biological databases, ontologies, and nomenclatures',
    )
    builder.add_network_attribute('author', 'Charles Tapley Hoyt')
    builder.set_context({
        'bioregistry.registry': 'https://bioregistry.io/metaregistry/',
        'bioregistry': 'https://bioregistry.io/registry/',
    })

    metaregistry = bioregistry.read_metaregistry()
    registry = bioregistry.read_registry()

    # Registry nodes are created before resource nodes, as in the original ordering.
    registry_nodes = {}
    for metaprefix in metaregistry:
        registry_nodes[metaprefix] = make_registry_node(builder, metaprefix)
    resource_nodes = {}
    for prefix in registry:
        resource_nodes[prefix] = make_resource_node(builder, prefix)

    for prefix, entry in registry.items():
        source_node = resource_nodes[prefix]

        # Who does it provide for? A single string is treated as a one-element list.
        provides = bioregistry.resolve.get_provides_for(prefix)
        if isinstance(provides, str):
            provides = [provides]
        for target in provides or []:
            builder.add_edge(
                source=source_node,
                target=resource_nodes[target],
                interaction='provides',
            )

        # Which registries does it map to?
        for metaprefix in metaregistry:
            if metaprefix in entry:
                builder.add_edge(
                    source=source_node,
                    target=registry_nodes[metaprefix],
                    interaction='listed',
                )

    # NOTE(review): credentials are sent to a plain-http server URL - confirm
    # whether the NDEx client/server supports https here.
    network = builder.get_nice_cx()
    network.update_to(
        uuid=NDEX_UUID,
        server='http://public.ndexbio.org',
        username=pystow.get_config('ndex', 'username'),
        password=pystow.get_config('ndex', 'password'),
    )
Example #11
0
def home():
    """Render the homepage with an example link and the sizes of the three data sets."""
    example_prefix = 'chebi'
    example_identifier = '138488'
    example_url = _get_bioregistry_link(example_prefix, example_identifier)

    registry_size = len(bioregistry.read_registry())
    metaregistry_size = len(bioregistry.read_metaregistry())
    collections_size = len(bioregistry.read_collections())

    return render_template(
        'home.html',
        example_url=example_url,
        example_prefix=example_prefix,
        example_identifier=example_identifier,
        registry_size=registry_size,
        metaregistry_size=metaregistry_size,
        collections_size=collections_size,
    )
Example #12
0
    def test_curies(self):
        """Test that every xref row uses valid prefixes and identifiers.

        For both the source and the target side of each row, the prefix must be
        in the registry, must not be ``kegg``, and must not be deprecated. When
        the prefix has a MIRIAM pattern, the identifier must match it.
        """
        registry = dict(bioregistry.read_registry())
        registry['decopath'] = {
        }  # TODO decopath needs a real resource and an entry in the bioregistry

        miriam_patterns = {
            k: re.compile(entry['miriam']['pattern'])
            for k, entry in registry.items() if 'miriam' in entry
        }

        # Key each dataframe by its getter's name without the get_/_df affixes.
        dataframes = {
            getter.__name__.removeprefix('get_').removesuffix('_df'): getter()
            for _, getter in DATA
        }

        rows = ['Source Resource', 'Source ID', 'Target Resource', 'Target ID']
        for name, df in dataframes.items():
            with self.subTest(name=name):
                for i, (source_prefix, source_id, target_prefix,
                        target_id) in enumerate(df[rows].values):
                    # Both sides of the xref get the same checks, source first.
                    sides = (
                        ('source', source_prefix, source_id),
                        ('target', target_prefix, target_id),
                    )
                    for role, prefix, identifier in sides:
                        where = f'[{name}, row {i}]'
                        # .keys() keeps the failure output to the prefixes only.
                        self.assertIn(
                            prefix,
                            registry.keys(),
                            msg=f'{where} unknown {role} prefix: {prefix}',
                        )
                        self.assertNotEqual(
                            prefix,
                            'kegg',
                            msg=f'{where} {role} prefix should not be kegg',
                        )
                        self.assertFalse(
                            bioregistry.is_deprecated(prefix),
                            msg=f'{where} deprecated {role} prefix: {prefix}',
                        )
                        pattern = miriam_patterns.get(prefix)
                        if pattern is not None:
                            self.assertRegex(
                                identifier,
                                pattern,
                                msg=f'{where} {role} prefix: {prefix}',
                            )
Example #13
0
def resources():
    """Serve the Bioregistry page with one table row per registry prefix."""
    rows = []
    for prefix in bioregistry.read_registry():
        rows.append({
            'prefix': prefix,
            'name': bioregistry.get_name(prefix),
            'example': bioregistry.get_example(prefix),
            'homepage': bioregistry.get_homepage(prefix),
            'pattern': bioregistry.get_pattern(prefix),
            'namespace_in_lui': bioregistry.namespace_in_lui(prefix),
            'banana': bioregistry.get_banana(prefix),
            'description': bioregistry.get_description(prefix),
        })
    return render_template('resources.html', rows=rows)
Example #14
0
    def __init__(self):
        """Instantiate the aligner.

        Loads the internal registry, fetches the external registry through the
        class-level ``getter`` (called with ``getter_kwargs`` if any), collects
        the external prefixes already mapped in the internal registry, and then
        runs the alignment.
        """
        self.internal_registry = read_registry()
        # Looked up on the class so the getter is called without ``self``.
        fetch_external = self.__class__.getter
        self.external_registry = fetch_external(**(self.getter_kwargs or {}))
        self.skip_external = self.get_skip()

        # Get all of the pre-curated mappings from the Bioregistry
        curated = {}
        for bioregistry_id, bioregistry_entry in self.internal_registry.items():
            if self.key not in bioregistry_entry:
                continue
            external_id = bioregistry_entry[self.key]['prefix']
            curated[external_id] = bioregistry_id
        self.external_id_to_bioregistry_id = curated

        # Run lexical alignment
        self._align()
Example #15
0
    def test_name_expansions(self):
        """Test that default names are not capital acronyms.

        Only non-deprecated entries without their own ``name`` key are checked.
        A name fails when it is an all-caps spelling of the prefix, or when its
        lowercase form equals the part of a dotted prefix before the first dot.
        """
        for prefix in bioregistry.read_registry():
            if bioregistry.is_deprecated(prefix):
                continue
            if 'name' in bioregistry.get(prefix):
                continue

            name = bioregistry.get_name(prefix)
            lowered = name.lower()
            is_bare_acronym = prefix == lowered and name.upper() == name
            is_root_acronym = '.' in prefix and prefix.split('.')[0] == lowered

            # Each violated rule is reported as its own failure, in this order.
            for violated in (is_bare_acronym, is_root_acronym):
                if violated:
                    with self.subTest(prefix=prefix):
                        self.fail(msg=f'{prefix} acronym ({name}) is not expanded')
Example #16
0
def iterate_wikidata_dfs(*, use_tqdm: bool = True) -> Iterable[pd.DataFrame]:
    """Iterate over WikiData xref dataframes.

    :param use_tqdm: If false, the progress bar is disabled.
    :yields: One dataframe per prefix that has a Wikidata property, in sorted
        prefix order. Prefixes whose results cannot be decoded as JSON are
        logged and skipped.
    """
    wikidata_properties = {}
    for prefix, entry in bioregistry.read_registry().items():
        if entry.wikidata and "prefix" in entry.wikidata:
            wikidata_properties[prefix] = entry.wikidata["prefix"]
    # wikidata_properties.update(get_wikidata_properties())

    too_many = {"pubmed", "pmc", "orcid", "inchi", "smiles"}
    progress = tqdm(
        sorted(wikidata_properties.items()),
        disable=not use_tqdm,
        desc="Wikidata properties",
    )
    for prefix, wikidata_property in progress:
        if prefix in too_many:
            continue
        progress.set_postfix({"prefix": prefix})
        try:
            yield get_wikidata_df(prefix, wikidata_property)
        except json.decoder.JSONDecodeError as e:
            logger.warning(
                "[%s] Problem decoding results from %s: %s", prefix, wikidata_property, e
            )
Example #17
0
 def test_banana(self):
     """Test that entries curated with a new banana are resolved properly.

     For each entry with a ``banana`` key, the example identifier must exist
     and its Identifiers.org URL must answer 302 when redirects are not followed.
     """
     for prefix, entry in bioregistry.read_registry().items():
         banana = entry.get('banana')
         # Skip entries without a banana, and prefixes for which
         # identifiers.org is broken.
         if banana is None or prefix in IDOT_BROKEN:
             continue
         context = {
             'prefix': prefix,
             'banana': banana,
             'pattern': bioregistry.get_pattern(prefix),
         }
         with self.subTest(**context):
             identifier = bioregistry.get_example(prefix)
             self.assertIsNotNone(identifier)
             url = bioregistry.resolve_identifier.get_identifiers_org_url(
                 prefix,
                 identifier,
             )
             res = self.session.get(url, allow_redirects=False)
             self.assertEqual(302, res.status_code, msg=f'failed with URL: {url}')
Example #18
0
    def test_no_redundant_acronym(self):
        """Test that there is no redundant acronym in the name.

        For example, "Amazon Standard Identification Number (ASIN)" is a problematic
        name for prefix "asin".

        Only non-deprecated entries without their own ``name`` key are checked.
        """
        for prefix in bioregistry.read_registry():
            if bioregistry.is_deprecated(prefix):
                continue
            entry = bioregistry.get(prefix)
            if 'name' in entry:
                continue
            name = bioregistry.get_name(prefix)

            # Take whatever follows the last "(" once trailing ")" are stripped;
            # a name without any "(" raises ValueError on unpacking and is fine.
            try:
                _, rest = name.rstrip(')').rsplit('(', 1)
            except ValueError:
                continue
            if rest.lower() == prefix.lower():
                with self.subTest(prefix=prefix):
                    self.fail(msg=f'{prefix} has redundant acronym in name "{name}"')
Example #19
0
def main(url: str, local: bool):
    """Test the resolution API against every prefix's example identifier.

    A 302 response counts as success. Any other response is logged with a
    best-guess reason and makes the process exit with status 1, except a 404
    for a prefix/identifier pair that has no providers.

    :param url: Base URL of the API; a trailing slash is removed.
    :param local: If true, ``url`` is replaced by ``http://localhost:5000``.
    """
    url = url.rstrip('/')
    if local:
        url = 'http://localhost:5000'
    click.echo(f'Testing resolution API on {url}')
    failure = False
    prefixes = tqdm(bioregistry.read_registry())

    for prefix in prefixes:
        identifier = bioregistry.get_example(prefix)
        if identifier is None:
            continue
        prefixes.set_postfix({'prefix': prefix})
        req_url = f'{url}/{prefix}:{identifier}'
        res = requests.get(req_url, allow_redirects=False)
        log = partial(_log, req_url=req_url)
        if res.status_code == 302:  # redirect
            continue
        elif res.status_code != 404:
            # The message is taken from the fourth line of the body, with the
            # surrounding <p>...</p> removed. Fall back to the whole body when
            # the response is shorter than that instead of raising IndexError.
            lines = res.text.splitlines()
            if len(lines) > 3:
                text = lines[3][len('<p>'):-len('</p>')]
            else:
                text = res.text.strip()
            log(f'HTTP {res.status_code}: {res.reason} {text}', fg='red')
        elif not bioregistry.get_providers(prefix, identifier):
            continue
        elif '/' in identifier or SLASH_URL_ENCODED in identifier:
            log('contains slash 🎩 🎸', fg='red')
        elif not bioregistry.validate(prefix, identifier):
            pattern = bioregistry.get_pattern(prefix)
            if bioregistry.get_banana(prefix):
                log(f'banana {pattern} 🍌', fg='red')
            else:
                log(f'invalid example does not match pattern {pattern}',
                    fg='red')
        else:
            log('404 unknown issue', fg='red')

        # Every branch that did not ``continue`` above has logged a problem.
        failure = True

    sys.exit(1 if failure else 0)
Example #20
0
    def test_lui(self):
        """Test the LUI makes sense (spoilers, they don't).

        For each checked prefix, the pattern must start with the upper-cased
        MIRIAM prefix, optionally preceded by ``^``.

        Discussion is ongoing at:

        - https://github.com/identifiers-org/identifiers-org.github.io/issues/151
        """
        garbage_patterns = {'ark', 'obi'}
        for prefix in bioregistry.read_registry():
            if not bioregistry.namespace_in_lui(prefix):
                continue
            if bioregistry.get_banana(prefix):
                continue  # rewrite rules are applied to prefixes with bananas
            if prefix in garbage_patterns:
                continue  # these patterns on identifiers.org are garb
            with self.subTest(prefix=prefix):
                re_pattern = bioregistry.get_pattern(prefix)
                miriam_prefix = bioregistry.get_identifiers_org_prefix(prefix)
                expected = miriam_prefix.upper()
                accepted_starts = (f'^{expected}', expected)
                self.assertTrue(
                    re_pattern.startswith(accepted_starts),
                    msg=f'{prefix} pattern: {re_pattern}',
                )
Example #21
0
def iter_ols_getters() -> Iterable[Type[Getter]]:
    """Iterate over OLS getters, skipping prefixes for which none can be made."""
    # ``map`` is lazy, so each getter is still built only when it is requested.
    candidates = map(make_ols_getter, bioregistry.read_registry())
    for getter in candidates:
        if getter is None:
            continue
        yield getter
Example #22
0
def load(
    load_all: bool,
    load_resources: bool = False,
    load_names: bool = False,
    load_alts: bool = False,
    load_xrefs: bool = True,
    load_synonyms: bool = False,
    reset: bool = False,
) -> None:
    """Load the database.

    :param load_all: If true, load every table regardless of the other flags.
    :param load_resources: Load non-deprecated registry entries that are not
        already in the resources table.
    :param load_names: Bulk-load the names file into the references table.
    :param load_alts: Append the alternative identifiers file to its table.
    :param load_xrefs: Bulk-load the xrefs file.
    :param load_synonyms: Bulk-load the synonyms file.
    :param reset: If true, drop all tables before (re)creating them.

    The names, synonyms and xrefs paths are resolved through their
    ``ensure_*`` helpers on every call, even when the table is not loaded.
    """
    if reset:
        drop_all()
    create_all()

    if load_resources or load_all:
        existing_prefixes = {resource.prefix for resource in Resource.query.all()}

        for prefix, entry in tqdm(bioregistry.read_registry().items(),
                                  desc="loading resources"):
            if bioregistry.is_deprecated(prefix):
                continue
            if prefix in existing_prefixes:
                continue
            resource_model = Resource(
                prefix=prefix,
                name=entry["name"],
                pattern=bioregistry.get_pattern(prefix),
            )
            session.add(resource_model)
        session.commit()

    ooh_na_na_path = ensure_ooh_na_na()
    synonyms_path = ensure_synonyms()
    xrefs_path = ensure_inspector_javert()

    if load_alts or load_all:
        alts_path = ensure_alts()
        alts_df = pd.read_csv(alts_path, sep="\t",
                              dtype=str)  # prefix, alt, identifier
        logger.info("inserting %d alt identifiers", len(alts_df.index))
        alts_df.to_sql(name=Alt.__tablename__,
                       con=engine,
                       if_exists="append",
                       index=False)
        logger.info("committing alt identifier")
        session.commit()
        logger.info("done committing alt identifiers")

    # (label, gzipped TSV path, target table, explicit column list or None, enabled flag)
    bulk_loads = [
        ("names", ooh_na_na_path, Reference, None, load_names),
        ("synonyms", synonyms_path, Synonym, ["prefix", "identifier",
                                              "name"], load_synonyms),
        (
            "xrefs",
            xrefs_path,
            Xref,
            [
                "prefix", "identifier", "xref_prefix", "xref_identifier",
                "source"
            ],
            load_xrefs,
        ),
    ]
    for label, path, table, columns, checker in bulk_loads:
        if not checker and not load_all:
            continue
        logger.info("beginning insertion of %s", label)
        logger.info("inserting with low-level copy of %s from: %s", label,
                    path)
        if columns:
            joined_columns = ", ".join(columns)
            logger.info("corresponding to columns: %s", joined_columns)
            column_clause = f" ({joined_columns})"
        else:
            column_clause = ""

        conn = engine.raw_connection()
        try:
            with conn.cursor() as cursor, gzip.open(path) as file:
                # The header row is skipped by the HEADER option of COPY.
                sql = f"""COPY {table.__tablename__}{column_clause} FROM STDIN WITH CSV HEADER DELIMITER E'\\t' QUOTE E'\\b';"""
                logger.info("running SQL: %s", sql)
                cursor.copy_expert(sql=sql, file=file)

            logger.info("committing %s", label)
            conn.commit()
            logger.info("done committing %s", label)
        finally:
            # Always release the raw connection, including when the copy fails.
            conn.close()

    logger.info(f"number resources loaded: {Resource.query.count():,}")
    logger.info(f"number references loaded: {Reference.query.count():,}")
    logger.info(f"number alts loaded: {Alt.query.count():,}")
    logger.info(f"number synonyms loaded: {Synonym.query.count():,}")
    logger.info(f"number xrefs loaded: {Xref.query.count():,}")
# -*- coding: utf-8 -*-
"""Make the curation list."""

import os

import click
import yaml

import bioregistry
from bioregistry.constants import DOCS_DATA

# All (prefix, entry) pairs of the registry in sorted order. This is evaluated
# once at import time and shared by the helpers below.
items = sorted(bioregistry.read_registry().items())


def _g(predicate):
    """Collect prefix/name records for the registry entries accepted by ``predicate``.

    ``predicate`` is called with the prefix and its entry, in the order of the
    module-level ``items`` list.
    """
    selected = []
    for bioregistry_id, bioregistry_entry in items:
        if not predicate(bioregistry_id, bioregistry_entry):
            continue
        selected.append({
            'prefix': bioregistry_id,
            'name': bioregistry.get_name(bioregistry_id),
        })
    return selected


@click.command()
def curation():
    """Make curation list."""

    def _lacks_wikidata_database(prefix, entry):
        """Return true if the entry has no Wikidata database."""
        return entry.get('wikidata', {}).get('database') is None

    def _lacks_pattern(prefix, entry):
        """Return true if no pattern is available for the prefix."""
        return bioregistry.get_pattern(prefix) is None

    def _lacks_format_url(prefix, entry):
        """Return true if no format URL is available for the prefix."""
        return bioregistry.get_format(prefix) is None

    # NOTE(review): none of these lists is used in the visible part of the
    # function - presumably they are written out further down; confirm.
    missing_wikidata_database = _g(_lacks_wikidata_database)
    missing_pattern = _g(_lacks_pattern)
    missing_format_url = _g(_lacks_format_url)
Example #24
0
 def setUp(self) -> None:
     """Load the registry so each test can read it from ``self.registry``."""
     registry = bioregistry.read_registry()
     self.registry = registry
Example #25
0
def resources():
    """Return the entire Bioregistry as a JSON response."""
    registry = bioregistry.read_registry()
    return jsonify(registry)
Example #26
0
)
from bioregistry.constants import DOCS_IMG
from bioregistry.external import (
    get_biolink,
    get_bioportal,
    get_go,
    get_miriam,
    get_n2t,
    get_ncbi,
    get_obofoundry,
    get_ols,
    get_prefix_commons,
    get_wikidata_registry,
)

# Load the registry once at import time.
# NOTE(review): the module-level name ``bioregistry`` is bound to the value
# returned by read_registry(), not to the ``bioregistry`` package that the
# imports above pull from - confirm no code below expects the package here.
bioregistry = read_registry()

LICENSES = {
    'None':
    None,
    'license':
    None,
    'unspecified':
    None,
    # CC-BY (4.0)
    'CC-BY 4.0':
    'CC-BY',
    'CC BY 4.0':
    'CC-BY',
    'https://creativecommons.org/licenses/by/4.0/':
    'CC-BY',