def resource(prefix: str):
    """Serve a Bioregistry entry page.

    :param prefix: The prefix of the resource to display. It is normalized
        first; on failure, the 404 response from the normalizer is returned.
    :return: A rendered resource page, or a 404 response object.
    """
    # FIX: docstring typo ("Serve the a Bioregistry entry page").
    # _normalize_prefix_or_404 returns either the normalized prefix (str)
    # or a ready-made 404 response object, which is passed through as-is.
    prefix = _normalize_prefix_or_404(prefix, '.' + resource.__name__)
    if not isinstance(prefix, str):
        return prefix
    example = bioregistry.get_example(prefix)
    return render_template(
        'resource.html',
        prefix=prefix,
        name=bioregistry.get_name(prefix),
        example=example,
        mappings=_get_resource_mapping_rows(prefix),
        synonyms=bioregistry.get_synonyms(prefix),
        homepage=bioregistry.get_homepage(prefix),
        pattern=bioregistry.get_pattern(prefix),
        version=bioregistry.get_version(prefix),
        has_terms=bioregistry.has_terms(prefix),
        obo_download=bioregistry.get_obo_download(prefix),
        owl_download=bioregistry.get_owl_download(prefix),
        namespace_in_lui=bioregistry.namespace_in_lui(prefix),
        deprecated=bioregistry.is_deprecated(prefix),
        contact=bioregistry.get_email(prefix),
        banana=bioregistry.get_banana(prefix),
        description=bioregistry.get_description(prefix),
        # Providers can only be looked up when an example identifier exists.
        providers=None if example is None else _get_resource_providers(prefix, example),
    )
def warnings():
    """Make warnings list."""
    # Curated patterns that disagree with the MIRIAM-provided pattern.
    miriam_pattern_wrong = []
    for prefix, entry in items:
        if 'miriam' not in entry or 'pattern' not in entry:
            continue
        if entry['pattern'] == entry['miriam']['pattern']:
            continue
        miriam_pattern_wrong.append(dict(
            prefix=prefix,
            name=bioregistry.get_name(prefix),
            correct=entry['pattern'],
            miriam=entry['miriam']['pattern'],
        ))

    # Entries that override MIRIAM's namespaceEmbeddedInLui flag.
    miriam_embedding_rewrites = []
    for prefix, entry in items:
        if 'namespace.embedded' not in entry:
            continue
        miriam_embedding_rewrites.append(dict(
            prefix=prefix,
            name=bioregistry.get_name(prefix),
            pattern=bioregistry.get_pattern(prefix),
            correct=entry['namespace.embedded'],
            miriam=entry['miriam']['namespaceEmbeddedInLui'],
        ))

    # When are namespace rewrites required?
    miriam_prefix_rewrites = []
    for prefix, entry in items:
        if 'namespace.rewrite' not in entry:
            continue
        miriam_prefix_rewrites.append(dict(
            prefix=prefix,
            name=bioregistry.get_name(prefix),
            pattern=bioregistry.get_pattern(prefix),
            correct=entry['namespace.rewrite'],
        ))

    with open(os.path.join(DOCS_DATA, 'warnings.yml'), 'w') as file:
        yaml.safe_dump(
            {
                'wrong_patterns': miriam_pattern_wrong,
                'embedding_rewrites': miriam_embedding_rewrites,
                'prefix_rewrites': miriam_prefix_rewrites,
            },
            file,
        )
def check_valid_prefix_id(self, prefix, identifier):
    """Check the prefix/identifier pair is valid.

    :param prefix: The prefix to check.
    :param identifier: The local unique identifier to check against the
        prefix's pattern.
    :raises InvalidIdentifier: If the identifier does not match the
        applicable pattern.
    :raises InvalidPrefix: If the prefix is neither locally curated nor
        registered in the Bioregistry.
    """
    if prefix in self.entries:
        # A locally-curated entry takes precedence; check its own pattern.
        entry = self.entries[prefix]
        if not re.match(entry["pattern"], identifier):
            raise InvalidIdentifier(prefix, identifier)
    elif bioregistry.get_resource(prefix) is None:
        raise InvalidPrefix(prefix)
    elif bioregistry.get_pattern(prefix) is not None and not bioregistry.validate(prefix, identifier):
        # BUG FIX: the original only ran the check when get_pattern() was
        # None (i.e. when no validation is possible) and raised when
        # validate() returned truthy — rejecting *valid* identifiers.
        # Now the identifier is checked exactly when a pattern exists,
        # and a failed validation raises.
        raise InvalidIdentifier(prefix, identifier)
def resources():
    """Serve the Bioregistry page."""
    def _row(prefix: str) -> dict:
        # Collect the metadata shown for a single registry entry.
        return dict(
            prefix=prefix,
            name=bioregistry.get_name(prefix),
            example=bioregistry.get_example(prefix),
            homepage=bioregistry.get_homepage(prefix),
            pattern=bioregistry.get_pattern(prefix),
            namespace_in_lui=bioregistry.namespace_in_lui(prefix),
            banana=bioregistry.get_banana(prefix),
            description=bioregistry.get_description(prefix),
        )

    rows = [_row(prefix) for prefix in bioregistry.read_registry()]
    return render_template('resources.html', rows=rows)
def make_resource_node(cx: NiceCXBuilder, prefix: str) -> int:
    """Generate a CX node for a resource."""
    node = cx.add_node(
        name=bioregistry.get_name(prefix),
        represents=f'bioregistry.resource:{prefix}',
    )
    # Attach each optional piece of metadata only when it is available.
    optional_attributes = (
        ('homepage', bioregistry.get_homepage(prefix)),
        ('description', bioregistry.get_description(prefix)),
        ('pattern', bioregistry.get_pattern(prefix)),
    )
    for key, value in optional_attributes:
        if value:
            cx.add_node_attribute(node, key, value)
    # TODO add more
    return node
def curation():
    """Make curation list."""
    # Prefixes whose Wikidata record lacks a database annotation.
    missing_wikidata_database = _g(
        lambda prefix, entry: entry.get('wikidata', {}).get('database') is None
    )
    # Prefixes lacking a regular-expression pattern.
    missing_pattern = _g(lambda prefix, entry: bioregistry.get_pattern(prefix) is None)
    # Prefixes lacking a URL format string.
    missing_format_url = _g(lambda prefix, entry: bioregistry.get_format(prefix) is None)
    # Prefixes lacking an example identifier.
    missing_example = _g(lambda prefix, entry: bioregistry.get_example(prefix) is None)

    payload = {
        'wikidata': missing_wikidata_database,
        'pattern': missing_pattern,
        'formatter': missing_format_url,
        'example': missing_example,
    }
    with open(os.path.join(DOCS_DATA, 'curation.yml'), 'w') as file:
        yaml.safe_dump(payload, file)
def test_banana(self):
    """Test that entries curated with a new banana are resolved properly."""
    registry = bioregistry.read_registry()
    for prefix, entry in registry.items():
        banana = entry.get('banana')
        if banana is None:
            # Only entries with an explicitly curated banana are relevant.
            continue
        if prefix in IDOT_BROKEN:
            # identifiers.org is broken for these prefixes
            continue
        subtest_kwargs = dict(
            prefix=prefix,
            banana=banana,
            pattern=bioregistry.get_pattern(prefix),
        )
        with self.subTest(**subtest_kwargs):
            identifier = bioregistry.get_example(prefix)
            self.assertIsNotNone(identifier)
            url = bioregistry.resolve_identifier.get_identifiers_org_url(prefix, identifier)
            res = self.session.get(url, allow_redirects=False)
            self.assertEqual(302, res.status_code, msg=f'failed with URL: {url}')
def main(url: str, local: bool):
    """Test the API."""
    url = url.rstrip('/')
    if local:
        url = 'http://localhost:5000'
    click.echo(f'Testing resolution API on {url}')
    failure = False
    prefixes = tqdm(bioregistry.read_registry())
    for prefix in prefixes:
        identifier = bioregistry.get_example(prefix)
        if identifier is None:
            # No example identifier means there is nothing to resolve.
            continue
        prefixes.set_postfix({'prefix': prefix})
        req_url = f'{url}/{prefix}:{identifier}'
        res = requests.get(req_url, allow_redirects=False)
        log = partial(_log, req_url=req_url)
        if res.status_code == 302:  # redirect
            continue
        if res.status_code != 404:
            # Unexpected status: pull the human-readable message out of the
            # HTML error page body.
            text = res.text.splitlines()[3][len('<p>'):-len('</p>')]
            log(f'HTTP {res.status_code}: {res.reason} {text}', fg='red')
            failure = True
            continue
        # From here on the status code is 404; diagnose why.
        if not bioregistry.get_providers(prefix, identifier):
            continue
        if '/' in identifier or SLASH_URL_ENCODED in identifier:
            log('contains slash 🎩 🎸', fg='red')
        elif not bioregistry.validate(prefix, identifier):
            pattern = bioregistry.get_pattern(prefix)
            if bioregistry.get_banana(prefix):
                log(f'banana {pattern} 🍌', fg='red')
            else:
                log(f'invalid example does not match pattern {pattern}', fg='red')
        else:
            log('404 unknown issue', fg='red')
        failure = True
    return sys.exit(1 if failure else 0)
def test_lui(self):
    """Test the LUI makes sense (spoilers, they don't).

    Discussion is ongoing at:

    - https://github.com/identifiers-org/identifiers-org.github.io/issues/151
    """
    for prefix in bioregistry.read_registry():
        if not bioregistry.namespace_in_lui(prefix):
            continue
        if bioregistry.get_banana(prefix):
            continue  # rewrite rules are applied to prefixes with bananas
        if prefix in {'ark', 'obi'}:
            continue  # these patterns on identifiers.org are garb
        with self.subTest(prefix=prefix):
            re_pattern = bioregistry.get_pattern(prefix)
            miriam_prefix = bioregistry.get_identifiers_org_prefix(prefix)
            upper = miriam_prefix.upper()
            # The pattern should begin with the upper-cased MIRIAM prefix,
            # optionally anchored at the start.
            self.assertTrue(
                re_pattern.startswith((f'^{upper}', upper)),
                msg=f'{prefix} pattern: {re_pattern}',
            )
def resolve(prefix: str, identifier: Optional[str] = None):
    """Resolve a CURIE.

    The following things can make a CURIE unable to resolve:

    1. The prefix is not registered with the Bioregistry
    2. The prefix has a validation pattern and the identifier does not match it
    3. There are no providers available for the URL
    """  # noqa:DAR101,DAR201
    norm_prefix = bioregistry.normalize_prefix(prefix)
    if norm_prefix is None:
        return render_template('resolve_missing_prefix.html', prefix=prefix, identifier=identifier), 404
    if identifier is None:
        return redirect(url_for('.' + resource.__name__, prefix=norm_prefix))
    # CONSISTENCY FIX: all registry lookups below now use the normalized
    # prefix. The original mixed the raw and normalized prefix; the
    # bioregistry getters normalize internally, so behavior is unchanged,
    # but this removes the redundant re-normalization and inconsistency.
    # The error templates still receive the user-supplied prefix so the
    # pages reflect what was actually requested.
    pattern = bioregistry.get_pattern(norm_prefix)
    if pattern and not bioregistry.validate(norm_prefix, identifier):
        return render_template(
            'resolve_invalid_identifier.html',
            prefix=prefix,
            identifier=identifier,
            pattern=pattern,
        ), 404
    url = bioregistry.get_link(norm_prefix, identifier, use_bioregistry_io=False)
    if not url:
        return render_template('resolve_missing_providers.html', prefix=prefix, identifier=identifier), 404
    try:  # TODO remove any garbage characters?
        return redirect(url)
    except ValueError:  # headers could not be constructed
        return render_template('resolve_disallowed_identifier.html', prefix=prefix, identifier=identifier), 404
def test_collections(self):
    """Check collections have minimal metadata and correct prefixes."""
    for key, collection in sorted(bioregistry.read_collections().items()):
        with self.subTest(key=key):
            # Collection keys are seven-digit zero-padded numbers.
            self.assertRegex(key, '^\\d{7}$')
            self.assertIn('name', collection)
            self.assertIn('author', collection)
            author = collection['author']
            self.assertIn('name', author)
            self.assertIn('orcid', author)
            self.assertRegex(author['orcid'], bioregistry.get_pattern('orcid'))
            self.assertIn('description', collection)
            resources = collection['resources']
            # Every listed prefix must exist in the registry.
            unregistered = {p for p in resources if p not in self.registry}
            self.assertEqual(set(), unregistered)
            # No prefix may be listed more than once.
            repeated = {p for p, count in Counter(resources).items() if 1 < count}
            self.assertEqual(set(), repeated, msg='Duplicates found')
def load(
    load_all: bool,
    load_resources: bool = False,
    load_names: bool = False,
    load_alts: bool = False,
    load_xrefs: bool = True,
    load_synonyms: bool = False,
    reset: bool = False,
) -> None:
    """Load the database.

    :param load_all: If true, every loader below runs regardless of its
        individual flag.
    :param load_resources: Load the Bioregistry resource table.
    :param load_names: Bulk-load the names (Reference) table.
    :param load_alts: Bulk-load alternative identifiers.
    :param load_xrefs: Bulk-load cross-references.
    :param load_synonyms: Bulk-load synonyms.
    :param reset: Drop and re-create all tables before loading.
    """
    if reset:
        drop_all()
        create_all()
    if load_resources or load_all:
        prefix_to_resource: Dict[str, Resource] = {}
        # Prefixes already in the database are skipped below.
        prefixes = {resource.prefix for resource in Resource.query.all()}
        for prefix, entry in tqdm(bioregistry.read_registry().items(), desc="loading resources"):
            if bioregistry.is_deprecated(prefix):
                continue
            if prefix in prefixes:
                continue
            prefix_to_resource[prefix] = resource_model = Resource(
                prefix=prefix,
                name=entry["name"],
                pattern=bioregistry.get_pattern(prefix),
            )
            session.add(resource_model)
        session.commit()

    # Paths to the pre-built bulk data files.
    ooh_na_na_path = ensure_ooh_na_na()
    synonyms_path = ensure_synonyms()
    xrefs_path = ensure_inspector_javert()

    if load_alts or load_all:
        alts_path = ensure_alts()
        alts_df = pd.read_csv(alts_path, sep="\t", dtype=str)  # prefix, alt, identifier
        logger.info("inserting %d alt identifiers", len(alts_df.index))
        # Append via pandas' to_sql rather than the COPY path used below.
        alts_df.to_sql(name=Alt.__tablename__, con=engine, if_exists="append", index=False)
        logger.info("committing alt identifier")
        session.commit()
        logger.info("done committing alt identifiers")

    # Each tuple: (human-readable label, gzipped TSV path, ORM table,
    # explicit column list or None for all columns, per-table enable flag).
    for label, path, table, columns, checker in [
        ("names", ooh_na_na_path, Reference, None, load_names),
        ("synonyms", synonyms_path, Synonym, ["prefix", "identifier", "name"], load_synonyms),
        (
            "xrefs",
            xrefs_path,
            Xref,
            [
                "prefix", "identifier", "xref_prefix", "xref_identifier", "source"
            ],
            load_xrefs,
        ),
    ]:
        if not checker and not load_all:
            continue
        logger.info("beginning insertion of %s", label)
        # Use a raw DBAPI connection so the driver's COPY support is available.
        conn = engine.raw_connection()
        logger.info("inserting with low-level copy of %s from: %s", label, path)
        if columns:
            # Build the optional "(col1, col2, ...)" clause for the COPY.
            columns = ", ".join(columns)
            logger.info("corresponding to columns: %s", columns)
            columns = f" ({columns})"
        else:
            columns = ""
        with conn.cursor() as cursor, gzip.open(path) as file:
            # next(file)  # skip the header
            # NOTE(review): the COPY statement intentionally keeps the
            # newline after "COPY "; HEADER handles the header row, and the
            # backspace QUOTE effectively disables quoting for the TSV.
            sql = f"""COPY 
{table.__tablename__}{columns} FROM STDIN WITH CSV HEADER DELIMITER E'\\t' QUOTE E'\\b';"""
            logger.info("running SQL: %s", sql)
            cursor.copy_expert(sql=sql, file=file)
        logger.info("committing %s", label)
        conn.commit()
        logger.info("done committing %s", label)

    logger.info(f"number resources loaded: {Resource.query.count():,}")
    logger.info(f"number references loaded: {Reference.query.count():,}")
    logger.info(f"number alts loaded: {Alt.query.count():,}")
    logger.info(f"number synonyms loaded: {Synonym.query.count():,}")
    logger.info(f"number xrefs loaded: {Xref.query.count():,}")