def test_identifiers_mapping(self):
    """Check that each INDRA identifier-mapping prefix normalizes in the Bioregistry."""
    too_generic = {'CTD', 'NONCODE', 'NCBI'}  # these aren't specific enough
    mappings = indra.databases.identifiers.identifiers_mappings
    for prefix, target in mappings.items():
        if prefix not in too_generic:
            with self.subTest(prefix=prefix):
                normalized = bioregistry.normalize_prefix(prefix)
                self.assertIsNotNone(normalized, msg=f'should be {target}')
def relations(prefix: str, relation: str, target: str, force: bool, no_strict: bool, summarize: bool):
    """Page through the relations for entities in the given namespace."""
    # ``no_strict`` is an opt-out flag; the data getters take the positive form.
    strict = not no_strict

    if relation is None:
        # No relation filter: show (or summarize) all relations for the prefix.
        relations_df = get_relations_df(prefix, force=force, strict=strict)
        if summarize:
            # Summaries are the value counts of the dataframe's third column.
            click.echo(relations_df[relations_df.columns[2]].value_counts())
        else:
            echo_df(relations_df)
        return

    curie = normalize_curie(relation)
    if curie[1] is None:  # that's the identifier
        click.secho(f"not valid curie, assuming local to {prefix}", fg="yellow")
        curie = prefix, relation

    if target is None:
        # Filtering by relation is only implemented together with a target prefix.
        raise NotImplementedError(f"can not filter by target prefix {target}")

    target = bioregistry.normalize_prefix(target)
    relations_df = get_filtered_relations_df(
        prefix, relation=curie, force=force, strict=strict, target=target,
    )
    # Display the filtered result, mirroring what the unfiltered branch does;
    # without this the dataframe would be computed and silently discarded.
    echo_df(relations_df)
def test_url_prefixes(self):
    """Check that each INDRA custom URL prefix (outside NON_BIOLOGY) normalizes in the Bioregistry."""
    candidates = (
        url_prefix
        for url_prefix in indra.databases.identifiers.url_prefixes
        if url_prefix not in NON_BIOLOGY
    )
    for prefix in candidates:
        with self.subTest(prefix=prefix):
            normalized = bioregistry.normalize_prefix(prefix)
            self.assertIsNotNone(normalized)
def _help(self, expected, d, msg=None):
    """Process the concept ``d`` and assert it ends up equal to ``expected``.

    The expected namespace must first normalize in the Bioregistry.
    """
    namespace = expected[NAMESPACE]
    self.assertIsNotNone(
        bioregistry.normalize_prefix(namespace),
        msg='Unrecognized namespace',
    )

    # The concept is wrapped in a node dictionary because _process_concept
    # receives both; the result is read back from the node afterwards.
    node = {CONCEPT: d}
    _process_concept(concept=node[CONCEPT], node=node)
    self.assertEqual(expected, node[CONCEPT], msg=msg)
def main(force: bool):
    """Fill in missing descriptions using definitions from cross-referenced resources."""
    if force:
        # Ask PyOBO to reload the definitions of every prioritized resource.
        for prefix in tqdm(PRIORITY_LIST, desc='reloading resources'):
            tqdm.write(f'reloading {prefix}')
            pyobo.get_id_definition_mapping(prefix, force=True)

    # Existing rows are (entity, source, description) triples.
    description_rows = [tuple(row) for row in load_descriptions()]
    descriptions = {e: d for e, _source, d in description_rows}

    # entity identifier -> {normalized xref prefix -> xref identifier}
    xrefs = defaultdict(dict)
    unnorm = set()
    for xref_ns, xref_id, fplx_id in load_equivalences():
        norm_xref_ns = bioregistry.normalize_prefix(xref_ns)
        if norm_xref_ns is None:
            # Report each namespace that can't be normalized only once.
            if xref_ns not in unnorm:
                print('unnormalized ns', xref_ns)
                unnorm.add(xref_ns)
            continue
        xrefs[fplx_id][norm_xref_ns] = xref_id

    entities = load_entities()
    missing_description = set(entities) - set(descriptions)
    print(f'{len(descriptions)} have descriptions')
    print(f'{len(missing_description)} missing descriptions')

    for fplx_id in missing_description:
        entity_xrefs = xrefs.get(fplx_id)
        if not entity_xrefs:
            continue
        if list(entity_xrefs) == ['bel']:
            # skip famplexes with only a BEL reference since these don't have any meaningful
            # lookup, but would be worth curating by hand.
            continue
        # Take the first definition found, trying resources in priority order.
        for prefix in PRIORITY_LIST:
            identifier = entity_xrefs.get(prefix)
            if not identifier:
                continue
            definition = pyobo.get_definition(prefix, identifier)
            if definition:
                description_rows.append((fplx_id, f'{prefix}:{identifier}', definition))
                break
        else:
            # No prioritized resource yielded a definition for this entity.
            exr = {k: v for k, v in entity_xrefs.items() if k not in {'bel'}}
            print(f'Did not get for {fplx_id} with xrefs {exr}')

    description_rows = sorted(description_rows)
    for path in PATH, DESCRIPTIONS_PATH:
        # newline='' lets the csv writer control line endings itself; without
        # it, platforms that translate newlines would turn '\r\n' into '\r\r\n'.
        with open(path, 'w', newline='') as file:
            writer = csv.writer(
                file,
                delimiter=',',
                lineterminator='\r\n',
                quoting=csv.QUOTE_MINIMAL,
                quotechar='"',
            )
            writer.writerows(description_rows)
def validate():
    """Validate identifiers."""
    df = get_xrefs_df()
    # The source and target columns go through exactly the same checks, so a
    # single loop handles both; all source rows are checked before any target
    # row. A ValueError is raised on the first prefix that differs from its
    # normalized form or the first identifier that fails validation.
    for side in ('source', 'target'):
        columns = [f'{side}_db', f'{side}_id']
        for i, (prefix, identifier) in df[columns].iterrows():
            norm_prefix = bioregistry.normalize_prefix(prefix)
            if prefix != norm_prefix:
                raise ValueError(
                    f'invalid {side} prefix: {prefix} should be {norm_prefix}')
            if not bioregistry.validate(prefix, identifier):
                raise ValueError(
                    f'[line {i}] Invalid {side} curie: {prefix}:{identifier} for pattern {bioregistry.get_pattern(prefix)}',
                )
def normalize_prefix(prefix: str, *, curie=None, xref=None, strict: bool = True) -> Optional[str]:
    """Return the Bioregistry-normalized prefix, or ``None`` when there is none.

    :param prefix: The namespace to normalize.
    :param curie: The CURIE in which the prefix was encountered, if any.
    :param xref: The cross-reference being processed, recorded for bookkeeping.
    :param strict: Whether an unhandled prefix should raise.
    :returns: The normalized prefix, or ``None`` if normalization failed and
        the failure was tolerated.
    :raises MissingPrefix: If the prefix can't be normalized, ``strict`` is
        true, and ``curie`` is given and starts with neither ``obo:`` nor
        ``UBERON:``.
    """
    resolved = bioregistry.normalize_prefix(prefix)
    if resolved is not None:
        return resolved

    # Nothing more can be decided without a CURIE, and obo: CURIEs are ignored.
    if curie is None or curie.startswith('obo:'):
        return None

    if curie.startswith('UBERON:'):
        # uberon has tons of xrefs to anatomical features. skip them
        UBERON_UNHANDLED[prefix].append((curie, xref))
        return None

    if strict:
        raise MissingPrefix(prefix=prefix, curie=curie, xref=xref)
    return None
def get_remapping(force: bool = False) -> Mapping[Tuple[str, str], Tuple[str, str, str]]:
    """Build a mapping from (normalized prefix, identifier) pairs to famplex triples.

    Each value is ``('fplx', source_id, source_id)``. Rows whose namespace
    can't be normalized are logged and left out.
    """
    remapping = {}
    for xref_ns, xref_id, famplex_id in _get_famplex_df(force=force).values:
        if xref_ns.lower() == 'medscan':
            # MEDSCAN is proprietary and Ben said to skip using these identifiers
            continue

        norm_ns = normalize_prefix(xref_ns)
        if norm_ns is None:
            logger.warning('could not remap %s', xref_ns)
            continue

        remapping[norm_ns, xref_id] = ('fplx', famplex_id, famplex_id)
    return remapping
def xrefs(prefix: str, target: str, force: bool, no_strict: bool):
    """Page through xrefs for the given namespace to the second given namespace."""
    strict = not no_strict

    if not target:
        # No target namespace: show the complete xrefs dataframe.
        echo_df(get_xrefs_df(prefix, force=force, strict=strict))
        return

    norm_target = bioregistry.normalize_prefix(target)
    pairs = get_filtered_xrefs(prefix, norm_target, force=force, strict=strict)
    # One tab-separated "identifier<TAB>xref" line per entry, sent to the pager.
    lines = [f"{identifier}\t{_xref}" for identifier, _xref in pairs.items()]
    click.echo_via_pager("\n".join(lines))
def test_resolve(self):
    """Check that a range of prefix spellings normalize to the expected prefix."""
    # query spelling -> expected normalized prefix
    expected_by_query = {
        'ncbitaxon': 'ncbitaxon',
        'NCBITaxon': 'ncbitaxon',
        'taxonomy': 'ncbitaxon',
        'SCOMP': 'bel',
        'SFAM': 'bel',
        'ec-code': 'eccode',
        'EC_CODE': 'eccode',
        'chembl.compound': 'chembl.compound',
        'chemblcompound': 'chembl.compound',
        'chembl': 'chembl',
    }
    for query, expected in expected_by_query.items():
        with self.subTest(query=query):
            self.assertEqual(expected, bioregistry.normalize_prefix(query))
def _align(self):
    """Attach each matching external registry entry to the internal registry."""
    for external_id, external_entry in self.external_registry.items():
        if external_id in self.skip_external:
            continue

        # Prefer an already-known mapping; otherwise try a lexical match.
        matched_id = self.external_id_to_bioregistry_id.get(external_id)
        if matched_id is None:
            matched_id = normalize_prefix(external_id)
        if matched_id is None:
            # Neither lookup produced a match, so there is nothing to align.
            continue

        prepared = self.prepare_external(external_id, external_entry)
        prepared['prefix'] = external_id
        self.internal_registry[matched_id][self.key] = prepared
        self.external_id_to_bioregistry_id[external_id] = matched_id
def normalize_prefix(prefix: str, *, curie=None, xref=None, strict: bool = True) -> Optional[str]:
    """Return the Bioregistry-normalized prefix, or ``None`` when there is none.

    :param prefix: The namespace to normalize.
    :param curie: The CURIE in which the prefix was encountered, if any.
    :param xref: The cross-reference being processed, recorded for bookkeeping.
    :param strict: Whether an unhandled prefix should raise.
    :returns: The normalized prefix, or ``None`` if normalization failed and
        the failure was tolerated.
    :raises MissingPrefix: If the prefix can't be normalized, ``strict`` is
        true, and ``curie`` is given and starts with none of ``obo:``,
        ``http``, ``urn:`` or ``UBERON:``.
    """
    resolved = bioregistry.normalize_prefix(prefix)
    if resolved is not None:
        return resolved

    # Without a CURIE nothing more can be decided; obo:, http* and urn:
    # CURIEs are ignored without complaint.
    if curie is None or curie.startswith(("obo:", "http", "urn:")):
        return None

    if curie.startswith("UBERON:"):
        # uberon has tons of xrefs to anatomical features. skip them
        UBERON_UNHANDLED[prefix].append((curie, xref))
        return None

    if strict:
        raise MissingPrefix(prefix=prefix, curie=curie, xref=xref)
    return None
def test_remap_scomp(self, *_):
    """Test remapping SCOMP to FPLX."""
    self.assertIsNotNone(bioregistry.normalize_prefix('BEL'))

    scomp_name = 'gamma Secretase Complex'
    self.assertIn(
        ('bel', scomp_name),
        _NAME_REMAPPING,
        msg='name remapping is not populated properly',
    )

    expected = {
        NAMESPACE: 'fplx',
        NAME: 'Gamma_secretase',
        IDENTIFIER: 'Gamma_secretase',
    }
    given = {
        NAMESPACE: 'SCOMP',
        NAME: scomp_name,
    }
    self._help(expected, given)
def __post_init__(self):
    """Validate the ontology prefix and fill in name, version and provenance defaults."""
    ontology = self.ontology
    # The prefix must already be in the form the Bioregistry normalizes it to.
    if ontology != bioregistry.normalize_prefix(ontology):
        raise BioregistryError(ontology)

    # The type ignores are because of the hack where we override the
    # class variables in the instance
    if self.name is None:
        self.name = bioregistry.get_name(ontology)  # type:ignore

    if not self.data_version:
        # Prefer the static version; otherwise look the version up.
        self.data_version = self.static_version or self._get_version()

    if not self.dynamic_version:
        version = self.data_version
        if version is None:
            raise ValueError(f"{ontology} is missing data_version")
        if "/" in version:
            raise ValueError(
                f"{ontology} has a slash in version: {version}"
            )

    if self.auto_generated_by is None:
        self.auto_generated_by = f"bio2obo:{ontology}"  # type:ignore
def resolve(prefix: str, identifier: Optional[str] = None):
    """Resolve a CURIE.

    The following things can make a CURIE unable to resolve:

    1. The prefix is not registered with the Bioregistry
    2. The prefix has a validation pattern and the identifier does not match it
    3. There are no providers available for the URL
    """  # noqa:DAR101,DAR201
    def _not_found(template_name: str, **context):
        # Every failure page gets the prefix/identifier as given and a 404 status.
        page = render_template(template_name, prefix=prefix, identifier=identifier, **context)
        return page, 404

    norm_prefix = bioregistry.normalize_prefix(prefix)
    if norm_prefix is None:
        return _not_found('resolve_missing_prefix.html')

    if identifier is None:
        # Only a prefix was given: redirect to the `resource` endpoint for it.
        endpoint = '.' + resource.__name__
        return redirect(url_for(endpoint, prefix=norm_prefix))

    pattern = bioregistry.get_pattern(prefix)
    if pattern and not bioregistry.validate(prefix, identifier):
        return _not_found('resolve_invalid_identifier.html', pattern=pattern)

    url = bioregistry.get_link(prefix, identifier, use_bioregistry_io=False)
    if not url:
        return _not_found('resolve_missing_providers.html')

    try:
        # TODO remove any garbage characters?
        response = redirect(url)
    except ValueError:  # headers could not be constructed
        return _not_found('resolve_disallowed_identifier.html')
    return response
def get_identifier(
        namespace: str,
        name: str) -> Union[Tuple[str, None, str], Tuple[str, str, str]]:
    """Look up the identifier for a name in a namespace.

    :param namespace: The namespace (prefix) the name belongs to.
    :param name: The name to look up.
    :returns: A ``(namespace, identifier, name)`` triple. The identifier is
        ``None`` when no identifier could be found for the name.
    :raises ValueError: If the namespace can't be normalized.
    """
    # SFAM/SCOMP names are looked up in the BEL-to-FamPlex table.
    if namespace in {'SFAM', 'SCOMP'}:
        return 'fplx', bel_fplx.get(name), name

    # SCHEM/CHEBI names are grounded against ChEBI; fall back to the given
    # namespace when grounding returns no prefix.
    if namespace in {'SCHEM', 'CHEBI'}:
        prefix, identifier, name = pyobo.ground('chebi', name)
        return prefix or namespace, identifier, name

    norm_namespace = bioregistry.normalize_prefix(namespace)
    if norm_namespace is None:
        raise ValueError(f'could not normalize {namespace}')
    namespace = norm_namespace

    # Namespaces already known to be unavailable are not retried.
    if namespace in MISSING_NAMESPACE:
        return namespace, None, name

    try:
        name_id_mapping = pyobo.get_name_id_mapping(namespace)
    except Exception:
        # Best effort: any failure to load the namespace is remembered and
        # reported as "no identifier". ``Exception`` (not a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        logger.info('missing namespace: %s', namespace)
        MISSING_NAMESPACE.add(namespace)
        return namespace, None, name

    if not name_id_mapping:
        if namespace not in MISSING_NAMESPACE:
            logger.info('empty namespace: %s', namespace)
            MISSING_NAMESPACE.add(namespace)
        return namespace, None, name

    identifier = name_id_mapping.get(name)
    if identifier:
        return namespace, identifier, name

    # Log each missing (namespace, name) pair only once.
    if (namespace, name) not in MISSING:
        MISSING.add((namespace, name))
        logger.debug('missing lookup for %s ! %s', namespace, name)
    return namespace, None, name
from famplex.load import load_descriptions, load_entities, load_equivalences from famplex.locations import DESCRIPTIONS_PATH HERE = os.path.abspath(os.path.dirname(__file__)) PATH = os.path.abspath(os.path.join(HERE, os.pardir, os.pardir, 'descriptions.csv')) PRIORITY_LIST = [ 'HGNC_GROUP', 'go', 'mesh', 'PF', 'reactome', 'eccode', 'interpro', ] PRIORITY_LIST = [bioregistry.normalize_prefix(prefix) for prefix in PRIORITY_LIST] @click.command() @verbose_option @click.option('--force', is_flag=True) def main(force: bool): if force: for prefix in tqdm(PRIORITY_LIST, desc='reloading resources'): tqdm.write(f'reloading {prefix}') pyobo.get_id_definition_mapping(prefix, force=True) description_rows = [tuple(row) for row in load_descriptions()] descriptions = {e: d for e, _source, d in description_rows} xrefs = defaultdict(dict)
def test_non_registry(self):
    """Check that each INDRA non-registry prefix normalizes in the Bioregistry."""
    non_registry_prefixes = indra.databases.identifiers.non_registry
    for prefix in non_registry_prefixes:
        with self.subTest(prefix=prefix):
            normalized = bioregistry.normalize_prefix(prefix)
            self.assertIsNotNone(normalized)
def has_no_download(prefix: str) -> bool:
    """Return whether the normalized prefix is listed by ``_no_download()``.

    A prefix that can't be normalized is reported as ``False``.
    """
    normalized = bioregistry.normalize_prefix(prefix)
    if normalized is None:
        return False
    return normalized in _no_download()
def not_available_as_obo(prefix: str) -> bool:
    """Return whether the normalized prefix is listed by ``get_not_available_as_obo()``.

    A prefix that can't be normalized is reported as ``False``.
    """
    normalized = bioregistry.normalize_prefix(prefix)
    if normalized is None:
        return False
    return normalized in get_not_available_as_obo()