def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph:
    """Build one directed graph combining xrefs with per-prefix hierarchies.

    Each row of the Inspector Javert dataframe becomes an edge between two
    ``namespace:identifier`` strings, labelled ``relation="xref"`` and carrying
    the row's provenance. Then the hierarchy of every Bioregistry prefix that
    is neither deprecated nor in ``SKIP`` is merged in, edge data included.

    :param use_tqdm: Wrap the prefix iteration in a progress bar.
    :returns: The combined graph.
    """
    graph = nx.DiGraph()

    # Cross-references: one edge per dataframe row.
    xrefs_df = ensure_inspector_javert_df()
    for row in xrefs_df.values:
        source_ns, source_id, target_ns, target_id, provenance = row
        source_curie = f"{source_ns}:{source_id}"
        target_curie = f"{target_ns}:{target_id}"
        graph.add_edge(source_curie, target_curie, relation="xref", provenance=provenance)

    logger.info("getting hierarchies")
    prefixes = sorted(bioregistry.read_registry())
    if use_tqdm:
        prefixes = tqdm(prefixes, desc="Entries")
    for prefix in prefixes:
        skipped = bioregistry.is_deprecated(prefix) or prefix in SKIP
        if skipped:
            continue
        if use_tqdm:
            # Show which prefix is currently being processed.
            prefixes.set_postfix({"prefix": prefix})
        hierarchy = get_hierarchy(
            prefix,
            include_has_member=True,
            include_part_of=True,
        )
        graph.add_edges_from(hierarchy.edges(data=True))

    # TODO include translates_to, transcribes_to, and has_variant
    return graph
def test_url_auto(self):
    """Check that identifiers.org answers each generated URL with a redirect.

    A prefix is skipped when it is in ``IDOT_BROKEN``, has no example
    identifier, has none of the ``example``/``banana``/``pattern`` keys in its
    entry, or yields no identifiers.org URL. For the rest, a GET without
    following redirects must return HTTP 302.
    """
    registry = bioregistry.read_registry()
    for prefix, entry in registry.items():
        if prefix in IDOT_BROKEN:
            continue
        identifier = bioregistry.get_example(prefix)
        if identifier is None:
            continue
        if not any(key in entry for key in ('example', 'banana', 'pattern')):
            continue
        url = get_identifiers_org_url(prefix, identifier)
        if url is None:
            continue

        print(prefix)
        with self.subTest(prefix=prefix, identifier=identifier):
            # The following tests don't work because the CURIE generation often throws away the prefix.
            # miriam_prefix = bioregistry.get_identifiers_org_prefix(prefix)
            # self.assertIsNotNone(miriam_prefix)
            # self.assertTrue(
            #     url.startswith(f'https://identifiers.org/{miriam_prefix}:'),
            #     msg=f"bad prefix for {prefix}. Expected {miriam_prefix} in {url}",
            # )
            response = self.session.get(url, allow_redirects=False)
            # Failure report: what was requested, followed by the wrapped response body.
            header = dedent(f'''\
            Prefix: {prefix}
            Identifier: {identifier}
            URL: {url}
            Text: ''')
            body = fill(response.text, 70, subsequent_indent=' ')
            self.assertEqual(302, response.status_code, msg='\n' + header + body)
def test_unique_keys(self):
    """Check that no two prefixes share the same normalized form.

    Prefixes are sorted by their normalized form so that any collision ends
    up in adjacent positions, then each neighbouring pair is compared.
    """
    registry = bioregistry.read_registry()
    ordered_prefixes = sorted(registry, key=norm)
    for a, b in pairwise(ordered_prefixes):
        with self.subTest(a=a, b=b):
            self.assertNotEqual(norm(a), norm(b))
def _prefixes(
    skip_below: Optional[str] = None,
    skip_below_inclusive: bool = True,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
) -> Iterable[str]:
    """Yield registry prefixes, in sorted order, that pass every filter.

    :param skip_below: If given, prefixes sorting before this value are dropped.
    :param skip_below_inclusive: When true, ``skip_below`` itself is kept;
        when false, ``skip_below`` is dropped as well.
    :param skip_pyobo: Drop prefixes for which a nomenclature plugin exists.
    :param skip_set: Extra prefixes to drop (a message is written for each).
    :yields: Prefixes whose resource has its own terms, is not in ``SKIP`` or
        ``skip_set``, and has either a nomenclature plugin or a download.
    """
    registry_items = sorted(bioregistry.read_registry().items())
    for prefix, resource in registry_items:
        if resource.no_own_terms:
            continue
        if prefix in SKIP:
            tqdm.write(f"skipping {prefix} because in default skip set")
            continue
        if skip_set and prefix in skip_set:
            tqdm.write(f"skipping {prefix} because in skip set")
            continue
        if skip_below is not None:
            # Strict comparison keeps the cutoff prefix itself; non-strict drops it.
            before_cutoff = prefix < skip_below if skip_below_inclusive else prefix <= skip_below
            if before_cutoff:
                continue
        has_pyobo = has_nomenclature_plugin(prefix)
        has_download = resource.has_download()
        if (skip_pyobo and has_pyobo) or not (has_pyobo or has_download):
            continue
        yield prefix
def _no_download() -> Set[str]:
    """Collect the prefixes that have neither an OBO nor an OWL download.

    :returns: The set of such prefixes.
    """
    missing: Set[str] = set()
    for prefix in bioregistry.read_registry():
        if bioregistry.get_obo_download(prefix) is not None:
            continue
        if bioregistry.get_owl_download(prefix) is not None:
            continue
        missing.add(prefix)
    return missing
def test_bioregistry_ids(self):
    """Check that every getter's Bioregistry identifier is a registry key.

    Getters without a ``bioregistry_id`` are ignored.
    """
    known_prefixes = set(bioregistry.read_registry())
    for getter in get_getters():
        bioregistry_id = getter.bioregistry_id
        if bioregistry_id is not None:
            with self.subTest(name=getter.name):
                self.assertIn(bioregistry_id, known_prefixes)
def test_homepage_http(self):
    """Check that every homepage that is set begins with ``http``."""
    for prefix in bioregistry.read_registry():
        homepage = bioregistry.get_homepage(prefix)
        # Only a homepage that exists and lacks the expected start is a failure.
        if homepage is not None and not homepage.startswith('http'):
            with self.subTest(prefix=prefix):
                self.fail(msg=f'malformed homepage: {homepage}')
def test_email(self):
    """Check that every contact email that is set matches ``EMAIL_RE``.

    The contact is looked up through ``_get_prefix_key`` with the
    ``obofoundry`` and ``ols`` sources.
    """
    for prefix in bioregistry.read_registry():
        email = _get_prefix_key(prefix, 'contact', ('obofoundry', 'ols'))
        if email is not None and not EMAIL_RE.match(email):
            with self.subTest(prefix=prefix):
                self.fail(msg=f'bad email: {email}')
def test_synonyms(self):
    """Check that no synonym equals the normalized form of another prefix.

    A synonym may coincide with the normalized form of its own key, so that
    one value is removed from the comparison set.
    """
    registry = bioregistry.read_registry()
    norm_prefixes = {norm(prefix) for prefix in registry}
    for key, entry in registry.items():
        synonyms = entry.get('synonyms', [])
        for synonym in synonyms:
            with self.subTest(key=key, synonym=synonym):
                other_prefixes = norm_prefixes - {norm(key)}
                self.assertNotIn(synonym, other_prefixes)
def upload():
    """Build the Bioregistry network as CX and send it to NDEx.

    One node is made per metaregistry entry and one per registry entry. A
    ``provides`` edge links a resource to each resource it provides for, and a
    ``listed`` edge links a resource to every metaregistry whose key appears
    in that resource's entry. The result is pushed with ``update_to`` to the
    network identified by ``NDEX_UUID``, using credentials read via pystow.
    """
    builder = NiceCXBuilder()
    builder.set_name('Bioregistry')
    builder.add_network_attribute(
        'description',
        'An integrative meta-registry of biological databases, ontologies, and nomenclatures',
    )
    builder.add_network_attribute('author', 'Charles Tapley Hoyt')
    builder.set_context({
        'bioregistry.registry': 'https://bioregistry.io/metaregistry/',
        'bioregistry': 'https://bioregistry.io/registry/',
    })

    metaregistry = bioregistry.read_metaregistry()
    registry = bioregistry.read_registry()

    # Nodes are created for registries first, then for resources.
    registry_nodes = {}
    for metaprefix in metaregistry:
        registry_nodes[metaprefix] = make_registry_node(builder, metaprefix)
    resource_nodes = {}
    for prefix in registry:
        resource_nodes[prefix] = make_resource_node(builder, prefix)

    for prefix, entry in registry.items():
        # Who does it provide for? A bare string counts as a single target.
        provides = bioregistry.resolve.get_provides_for(prefix)
        if isinstance(provides, str):
            targets = [provides]
        else:
            targets = provides or []
        for target in targets:
            builder.add_edge(
                source=resource_nodes[prefix],
                target=resource_nodes[target],
                interaction='provides',
            )

        # Which registries does it map to?
        for metaprefix in metaregistry:
            if metaprefix in entry:
                builder.add_edge(
                    source=resource_nodes[prefix],
                    target=registry_nodes[metaprefix],
                    interaction='listed',
                )

    network = builder.get_nice_cx()
    # NOTE(review): username/password are passed alongside a plain-http server
    # URL - confirm whether an https endpoint should be used instead.
    network.update_to(
        uuid=NDEX_UUID,
        server='http://public.ndexbio.org',
        username=pystow.get_config('ndex', 'username'),
        password=pystow.get_config('ndex', 'password'),
    )
def home():
    """Render ``home.html`` with an example link and registry statistics.

    The template receives a fixed example (prefix, identifier and its link)
    plus the sizes of the registry, the metaregistry and the collections.
    """
    example_prefix = 'chebi'
    example_identifier = '138488'
    context = dict(
        example_url=_get_bioregistry_link(example_prefix, example_identifier),
        example_prefix=example_prefix,
        example_identifier=example_identifier,
        registry_size=len(bioregistry.read_registry()),
        metaregistry_size=len(bioregistry.read_metaregistry()),
        collections_size=len(bioregistry.read_collections()),
    )
    return render_template('home.html', **context)
def test_curies(self):
    """Check the prefixes and identifiers on both sides of every mapping row.

    For each dataframe produced by the getters in ``DATA``, the source and
    the target of every row must use a prefix that is in the registry, is not
    ``kegg`` and is not deprecated. If the prefix has a MIRIAM pattern, the
    identifier must match it.
    """
    registry = dict(bioregistry.read_registry())
    registry['decopath'] = {}  # TODO decopath needs a real resource and an entry in the bioregistry

    miriam_patterns = {}
    for key, entry in registry.items():
        if 'miriam' in entry:
            miriam_patterns[key] = re.compile(entry['miriam']['pattern'])

    # Name each dataframe after its getter, e.g. ``get_foo_df`` -> ``foo``.
    dataframes = {}
    for _, getter in DATA:
        label = getter.__name__.removeprefix('get_').removesuffix('_df')
        dataframes[label] = getter()

    def check_side(prefix, identifier, deprecated_msg, pattern_msg):
        """Run the shared assertions for one side (source or target) of a row."""
        self.assertIn(prefix, registry.keys())
        self.assertNotEqual(prefix, 'kegg')
        self.assertFalse(bioregistry.is_deprecated(prefix), msg=deprecated_msg)
        if regex := miriam_patterns.get(prefix):
            self.assertRegex(identifier, regex, msg=pattern_msg)

    columns = ['Source Resource', 'Source ID', 'Target Resource', 'Target ID']
    for name, df in dataframes.items():
        with self.subTest(name=name):
            for i, row in enumerate(df[columns].values):
                source_prefix, source_id, target_prefix, target_id = row
                check_side(
                    source_prefix,
                    source_id,
                    f'[{name}, row {i}] deprecated source prefix: {source_prefix}',
                    f'[{name}, row {i}] source prefix: {source_prefix}',
                )
                check_side(
                    target_prefix,
                    target_id,
                    f'[{name}, row {i}] deprecated target prefix: {target_prefix}',
                    f'[{name}, row {i}] target prefix: {target_prefix}',
                )
def resources():
    """Render ``resources.html`` with one summary row per registry prefix.

    Each row holds the prefix together with its name, example, homepage,
    pattern, namespace-in-LUI flag, banana and description.
    """
    rows = []
    for prefix in bioregistry.read_registry():
        row = {
            'prefix': prefix,
            'name': bioregistry.get_name(prefix),
            'example': bioregistry.get_example(prefix),
            'homepage': bioregistry.get_homepage(prefix),
            'pattern': bioregistry.get_pattern(prefix),
            'namespace_in_lui': bioregistry.namespace_in_lui(prefix),
            'banana': bioregistry.get_banana(prefix),
            'description': bioregistry.get_description(prefix),
        }
        rows.append(row)
    return render_template('resources.html', rows=rows)
def __init__(self):
    """Load both registries, index existing mappings, then run the alignment.

    The internal registry comes from ``read_registry``; the external one from
    the class-level ``getter`` called with ``getter_kwargs`` (if any).
    """
    self.internal_registry = read_registry()
    getter_kwargs = self.getter_kwargs or {}
    self.external_registry = self.__class__.getter(**getter_kwargs)
    self.skip_external = self.get_skip()

    # Get all of the pre-curated mappings from the Bioregistry: external
    # prefix -> Bioregistry id, for entries that carry this aligner's key.
    curated = {}
    for bioregistry_id, bioregistry_entry in self.internal_registry.items():
        if self.key not in bioregistry_entry:
            continue
        external_id = bioregistry_entry[self.key]['prefix']
        curated[external_id] = bioregistry_id
    self.external_id_to_bioregistry_id = curated

    # Run lexical alignment
    self._align()
def test_name_expansions(self):
    """Check that default names are not just the prefix written as an acronym.

    Only non-deprecated entries without their own ``name`` key are examined.
    A failure is reported when the name is all upper case and equals the
    prefix case-insensitively, and also when the part of a dotted prefix
    before the first ``.`` equals the lower-cased name.
    """
    for prefix in bioregistry.read_registry():
        if bioregistry.is_deprecated(prefix):
            continue
        if 'name' in bioregistry.get(prefix):
            continue
        name = bioregistry.get_name(prefix)
        lowered = name.lower()

        is_bare_acronym = prefix == lowered and name.upper() == name
        if is_bare_acronym:
            with self.subTest(prefix=prefix):
                self.fail(msg=f'{prefix} acronym ({name}) is not expanded')

        is_dotted_acronym = '.' in prefix and prefix.split('.')[0] == lowered
        if is_dotted_acronym:
            with self.subTest(prefix=prefix):
                self.fail(msg=f'{prefix} acronym ({name}) is not expanded')
def iterate_wikidata_dfs(*, use_tqdm: bool = True) -> Iterable[pd.DataFrame]:
    """Yield one xref dataframe per prefix that has a Wikidata property.

    The property is taken from the ``prefix`` key of each registry entry's
    ``wikidata`` data. A handful of prefixes are skipped as too large, and a
    JSON decoding error for one prefix is logged as a warning rather than raised.

    :param use_tqdm: Show a progress bar while iterating.
    :yields: The dataframe returned by ``get_wikidata_df`` for each prefix.
    """
    wikidata_properties = {}
    for prefix, entry in bioregistry.read_registry().items():
        wikidata = entry.wikidata
        if wikidata and "prefix" in wikidata:
            wikidata_properties[prefix] = wikidata["prefix"]
    # wikidata_properties.update(get_wikidata_properties())

    too_many = {"pubmed", "pmc", "orcid", "inchi", "smiles"}
    it = tqdm(
        sorted(wikidata_properties.items()),
        disable=not use_tqdm,
        desc="Wikidata properties",
    )
    for prefix, wikidata_property in it:
        if prefix in too_many:
            continue  # too many
        it.set_postfix({"prefix": prefix})
        try:
            yield get_wikidata_df(prefix, wikidata_property)
        except json.decoder.JSONDecodeError as e:
            logger.warning(
                "[%s] Problem decoding results from %s: %s", prefix, wikidata_property, e
            )
def test_banana(self):
    """Check that entries with a curated banana still resolve on identifiers.org.

    Entries without a ``banana`` key and prefixes in ``IDOT_BROKEN`` are
    skipped. Each remaining prefix must have an example identifier, and its
    identifiers.org URL must answer with HTTP 302 when redirects are not followed.
    """
    for prefix, entry in bioregistry.read_registry().items():
        banana = entry.get('banana')
        if banana is None:
            continue
        if prefix in IDOT_BROKEN:
            continue  # identifiers.org is broken for these prefixes

        context = dict(
            prefix=prefix,
            banana=banana,
            pattern=bioregistry.get_pattern(prefix),
        )
        with self.subTest(**context):
            identifier = bioregistry.get_example(prefix)
            self.assertIsNotNone(identifier)
            url = bioregistry.resolve_identifier.get_identifiers_org_url(prefix, identifier)
            response = self.session.get(url, allow_redirects=False)
            self.assertEqual(302, response.status_code, msg=f'failed with URL: {url}')
def test_no_redundant_acronym(self):
    """Check that a default name does not end with its own prefix in parentheses.

    For example, "Amazon Standard Identification Number (ASIN)" is a
    problematic name for prefix "asin". Only non-deprecated entries without
    their own ``name`` key are examined.
    """
    for prefix in bioregistry.read_registry():
        if bioregistry.is_deprecated(prefix):
            continue
        if 'name' in bioregistry.get(prefix):
            continue
        name = bioregistry.get_name(prefix)
        # Take whatever follows the last "(", ignoring trailing ")" characters.
        _, paren, rest = name.rstrip(')').rpartition('(')
        if not paren:
            continue  # the name has no parenthetical part
        if rest.lower() == prefix.lower():
            with self.subTest(prefix=prefix):
                self.fail(msg=f'{prefix} has redundany acronym in name "{name}"')
def main(url: str, local: bool):
    """Request every example CURIE from the resolution API and report failures.

    A 302 response counts as success. A 404 for a prefix/identifier without
    any providers is ignored. Every other outcome is logged in red with a
    guess at the cause, and makes the process exit with status 1.

    :param url: Base URL of the API; trailing slashes are removed.
    :param local: If true, use ``http://localhost:5000`` instead of ``url``.
    """
    url = url.rstrip('/')
    if local:
        url = 'http://localhost:5000'
    click.echo(f'Testing resolution API on {url}')

    failure = False
    prefixes = tqdm(bioregistry.read_registry())
    for prefix in prefixes:
        identifier = bioregistry.get_example(prefix)
        if identifier is None:
            continue
        prefixes.set_postfix({'prefix': prefix})
        req_url = f'{url}/{prefix}:{identifier}'
        res = requests.get(req_url, allow_redirects=False)
        log = partial(_log, req_url=req_url)

        status = res.status_code
        if status == 302:  # redirect
            continue
        if status == 404 and not bioregistry.get_providers(prefix, identifier):
            continue

        # Everything that reaches this point is a failure; work out which kind.
        failure = True
        if status != 404:
            # Take the fourth line of the response body and strip its <p>...</p> wrapper.
            text = res.text.splitlines()[3][len('<p>'):-len('</p>')]
            log(f'HTTP {res.status_code}: {res.reason} {text}', fg='red')
        elif '/' in identifier or SLASH_URL_ENCODED in identifier:
            log('contains slash 🎩 🎸', fg='red')
        elif not bioregistry.validate(prefix, identifier):
            pattern = bioregistry.get_pattern(prefix)
            if bioregistry.get_banana(prefix):
                log(f'banana {pattern} 🍌', fg='red')
            else:
                log(f'invalid example does not match pattern {pattern}', fg='red')
        else:
            log('404 unknown issue', fg='red')

    return sys.exit(1 if failure else 0)
def test_lui(self):
    """Test the LUI makes sense (spoilers, they don't).

    For prefixes whose namespace is embedded in the LUI, the pattern must
    start with the upper-cased identifiers.org prefix, optionally preceded by
    ``^``. Prefixes with a banana, and ``ark``/``obi``, are skipped.

    Discussion is ongoing at:

    - https://github.com/identifiers-org/identifiers-org.github.io/issues/151
    """
    garbage_patterns = {'ark', 'obi'}  # these patterns on identifiers.org are garb
    for prefix in bioregistry.read_registry():
        if not bioregistry.namespace_in_lui(prefix):
            continue
        # rewrite rules are applied to prefixes with bananas
        if bioregistry.get_banana(prefix) or prefix in garbage_patterns:
            continue
        with self.subTest(prefix=prefix):
            re_pattern = bioregistry.get_pattern(prefix)
            miriam_prefix = bioregistry.get_identifiers_org_prefix(prefix)
            expected = miriam_prefix.upper()
            self.assertTrue(
                re_pattern.startswith((f'^{expected}', expected)),
                msg=f'{prefix} pattern: {re_pattern}',
            )
def iter_ols_getters() -> Iterable[Type[Getter]]:
    """Yield an OLS getter for each registry prefix that can produce one.

    Prefixes for which ``make_ols_getter`` returns ``None`` are skipped.
    """
    # ``map`` is lazy, so each getter is only built when it is requested.
    candidates = map(make_ols_getter, bioregistry.read_registry())
    yield from (getter for getter in candidates if getter is not None)
def load(
    load_all: bool,
    load_resources: bool = False,
    load_names: bool = False,
    load_alts: bool = False,
    load_xrefs: bool = True,
    load_synonyms: bool = False,
    reset: bool = False,
) -> None:
    """Load the database.

    :param load_all: Load every table, regardless of the individual flags.
    :param load_resources: Insert a ``Resource`` row for each non-deprecated
        Bioregistry prefix that is not already in the table.
    :param load_names: Bulk-load the names file into the ``Reference`` table.
    :param load_alts: Append the alternative identifiers file to the ``Alt`` table.
    :param load_xrefs: Bulk-load the xrefs file into the ``Xref`` table.
    :param load_synonyms: Bulk-load the synonyms file into the ``Synonym`` table.
    :param reset: Drop all tables before (re-)creating them.
    """
    if reset:
        drop_all()
    create_all()

    if load_resources or load_all:
        prefix_to_resource: Dict[str, Resource] = {}
        # Prefixes that already have a row, so they are not inserted twice.
        prefixes = {resource.prefix for resource in Resource.query.all()}
        for prefix, entry in tqdm(bioregistry.read_registry().items(), desc="loading resources"):
            if bioregistry.is_deprecated(prefix):
                continue
            if prefix in prefixes:
                continue
            prefix_to_resource[prefix] = resource_model = Resource(
                prefix=prefix,
                name=entry["name"],
                pattern=bioregistry.get_pattern(prefix),
            )
            session.add(resource_model)
        session.commit()

    # These paths are resolved unconditionally, even if the matching flag is off.
    ooh_na_na_path = ensure_ooh_na_na()
    synonyms_path = ensure_synonyms()
    xrefs_path = ensure_inspector_javert()

    if load_alts or load_all:
        alts_path = ensure_alts()
        alts_df = pd.read_csv(alts_path, sep="\t", dtype=str)  # prefix, alt, identifier
        logger.info("inserting %d alt identifiers", len(alts_df.index))
        alts_df.to_sql(name=Alt.__tablename__, con=engine, if_exists="append", index=False)
        logger.info("committing alt identifier")
        session.commit()
        logger.info("done committing alt identifiers")

    # Each tuple: log label, gzipped source path, target table, explicit column
    # list (or None to name no columns in the COPY), and the flag enabling it.
    for label, path, table, columns, checker in [
        ("names", ooh_na_na_path, Reference, None, load_names),
        ("synonyms", synonyms_path, Synonym, ["prefix", "identifier", "name"], load_synonyms),
        (
            "xrefs",
            xrefs_path,
            Xref,
            ["prefix", "identifier", "xref_prefix", "xref_identifier", "source"],
            load_xrefs,
        ),
    ]:
        if not checker and not load_all:
            continue
        logger.info("beginning insertion of %s", label)
        # NOTE(review): this raw connection is not closed in this function - confirm
        # whether that is intentional.
        conn = engine.raw_connection()
        logger.info("inserting with low-level copy of %s from: %s", label, path)
        # Turn the column list into the " (a, b, c)" fragment of the COPY statement.
        if columns:
            columns = ", ".join(columns)
            logger.info("corresponding to columns: %s", columns)
            columns = f" ({columns})"
        else:
            columns = ""

        with conn.cursor() as cursor, gzip.open(path) as file:
            # next(file)  # skip the header
            # The statement declares a header row, a tab delimiter and a backspace
            # quote character (presumably to effectively disable quoting - confirm).
            sql = f"""COPY 
{table.__tablename__}{columns} FROM STDIN WITH CSV HEADER DELIMITER E'\\t' QUOTE E'\\b';"""
            logger.info("running SQL: %s", sql)
            # The opened gzip file object is handed to the cursor as the COPY input.
            cursor.copy_expert(sql=sql, file=file)

        logger.info("committing %s", label)
        conn.commit()
        logger.info("done committing %s", label)

    # Final row counts for every table touched above.
    logger.info(f"number resources loaded: {Resource.query.count():,}")
    logger.info(f"number references loaded: {Reference.query.count():,}")
    logger.info(f"number alts loaded: {Alt.query.count():,}")
    logger.info(f"number synonyms loaded: {Synonym.query.count():,}")
    logger.info(f"number xrefs loaded: {Xref.query.count():,}")
# -*- coding: utf-8 -*- """Make the curation list.""" import os import click import yaml import bioregistry from bioregistry.constants import DOCS_DATA items = sorted(bioregistry.read_registry().items()) def _g(predicate): return [{ 'prefix': bioregistry_id, 'name': bioregistry.get_name(bioregistry_id), } for bioregistry_id, bioregistry_entry in items if predicate(bioregistry_id, bioregistry_entry)] @click.command() def curation(): """Make curation list.""" missing_wikidata_database = _g(lambda prefix, entry: entry.get( 'wikidata', {}).get('database') is None) missing_pattern = _g( lambda prefix, entry: bioregistry.get_pattern(prefix) is None) missing_format_url = _g( lambda prefix, entry: bioregistry.get_format(prefix) is None)
def setUp(self) -> None:
    """Read the Bioregistry and keep it on ``self.registry`` for the tests."""
    registry = bioregistry.read_registry()
    self.registry = registry
def resources():
    """Return the whole Bioregistry as a JSON response."""
    registry = bioregistry.read_registry()
    return jsonify(registry)
) from bioregistry.constants import DOCS_IMG from bioregistry.external import ( get_biolink, get_bioportal, get_go, get_miriam, get_n2t, get_ncbi, get_obofoundry, get_ols, get_prefix_commons, get_wikidata_registry, ) bioregistry = read_registry() LICENSES = { 'None': None, 'license': None, 'unspecified': None, # CC-BY (4.0) 'CC-BY 4.0': 'CC-BY', 'CC BY 4.0': 'CC-BY', 'https://creativecommons.org/licenses/by/4.0/': 'CC-BY',