예제 #1
0
 def test_identifiers_mapping(self):
     """Check that INDRA's identifier-mapping prefixes normalize in the Bioregistry."""
     # These prefixes aren't specific enough to check, so they are skipped.
     too_generic = {'CTD', 'NONCODE', 'NCBI'}
     mappings = indra.databases.identifiers.identifiers_mappings
     for prefix, target in mappings.items():
         if prefix not in too_generic:
             with self.subTest(prefix=prefix):
                 normalized = bioregistry.normalize_prefix(prefix)
                 self.assertIsNotNone(normalized, msg=f'should be {target}')
예제 #2
0
def relations(prefix: str, relation: str, target: str, force: bool,
              no_strict: bool, summarize: bool):
    """Page through the relations for entities in the given namespace."""
    strict = not no_strict

    # No relation requested: show every relation, or only the value counts
    # of the third column when summarizing.
    if relation is None:
        all_relations = get_relations_df(prefix, force=force, strict=strict)
        if not summarize:
            echo_df(all_relations)
            return
        third_column = all_relations.columns[2]
        click.echo(all_relations[third_column].value_counts())
        return

    reference = normalize_curie(relation)
    if reference[1] is None:  # that's the identifier
        # Fall back to treating the relation as local to the queried prefix.
        click.secho(f"not valid curie, assuming local to {prefix}",
                    fg="yellow")
        reference = prefix, relation

    if target is None:
        raise NotImplementedError(
            f"can not filter by target prefix {target}")

    target = bioregistry.normalize_prefix(target)
    # NOTE(review): the filtered frame is computed but not echoed within
    # this block -- confirm whether output is expected to follow.
    relations_df = get_filtered_relations_df(
        prefix,
        relation=reference,
        force=force,
        strict=strict,
        target=target,
    )
예제 #3
0
 def test_url_prefixes(self):
     """Check that INDRA's custom URL prefixes normalize in the Bioregistry."""
     # Prefixes listed in NON_BIOLOGY are excluded from the check.
     checked_prefixes = (
         prefix
         for prefix in indra.databases.identifiers.url_prefixes
         if prefix not in NON_BIOLOGY
     )
     for prefix in checked_prefixes:
         with self.subTest(prefix=prefix):
             normalized = bioregistry.normalize_prefix(prefix)
             self.assertIsNotNone(normalized)
예제 #4
0
 def _help(self, expected, d, msg=None):
     """Run ``_process_concept`` on ``d`` and compare the result to ``expected``.

     Both dictionaries are wrapped under the ``CONCEPT`` key first, and the
     expected namespace must be recognized by the Bioregistry.
     """
     expected_node = {CONCEPT: expected}
     actual_node = {CONCEPT: d}

     expected_namespace = expected_node[CONCEPT][NAMESPACE]
     self.assertIsNotNone(
         bioregistry.normalize_prefix(expected_namespace),
         msg='Unrecognized namespace',
     )

     # The comparison reads the concept back from the node afterwards, so
     # _process_concept presumably updates it in place.
     _process_concept(concept=actual_node[CONCEPT], node=actual_node)
     self.assertEqual(expected_node[CONCEPT], actual_node[CONCEPT], msg=msg)
예제 #5
0
def main(force: bool):
    """Fill in missing FamPlex descriptions from cross-referenced resources.

    The existing description rows are loaded, and every entity without a
    description is looked up through its equivalences: the first prefix in
    ``PRIORITY_LIST`` that yields a definition wins. The combined, sorted rows
    are then written to both ``PATH`` and ``DESCRIPTIONS_PATH``.

    :param force: If true, first call ``pyobo.get_id_definition_mapping`` with
        ``force=True`` for every prefix in ``PRIORITY_LIST``.
    """
    if force:
        for prefix in tqdm(PRIORITY_LIST, desc='reloading resources'):
            tqdm.write(f'reloading {prefix}')
            pyobo.get_id_definition_mapping(prefix, force=True)

    description_rows = [tuple(row) for row in load_descriptions()]
    descriptions = {e: d for e, _source, d in description_rows}

    # fplx_id -> {normalized xref prefix -> xref identifier}
    xrefs = defaultdict(dict)
    unnorm = set()  # prefixes already reported as unnormalizable
    for xref_ns, xref_id, fplx_id in load_equivalences():
        norm_xref_ns = bioregistry.normalize_prefix(xref_ns)
        if norm_xref_ns is None:
            # Report each unknown prefix once, then skip the equivalence.
            if xref_ns not in unnorm:
                print('unnormalized ns', xref_ns)
                unnorm.add(xref_ns)
            continue
        xrefs[fplx_id][norm_xref_ns] = xref_id

    entities = load_entities()
    missing_description = set(entities) - set(descriptions)
    print(f'{len(descriptions)} have descriptions')
    print(f'{len(missing_description)} missing descriptions')

    for fplx_id in missing_description:
        # .get() avoids inserting empty entries into the defaultdict
        entity_xrefs = xrefs.get(fplx_id)
        if not entity_xrefs:
            continue
        if list(entity_xrefs) == ['bel']:
            # skip famplexes with only a BEL reference since these don't have any meaningful
            # lookup, but would be worth curating by hand.
            continue

        for prefix in PRIORITY_LIST:
            identifier = entity_xrefs.get(prefix)
            if not identifier:
                continue
            definition = pyobo.get_definition(prefix, identifier)
            if definition:
                description_rows.append((fplx_id, f'{prefix}:{identifier}', definition))
                break
        else:
            # No prefix in the priority list produced a definition.
            exr = {k: v for k, v in entity_xrefs.items() if k not in {'bel'}}
            print(f'Did not get for {fplx_id} with xrefs {exr}')

    description_rows = sorted(description_rows)

    for path in PATH, DESCRIPTIONS_PATH:
        # newline='' is required by the csv module; without it the explicit
        # '\r\n' terminator is translated to '\r\r\n' on some platforms.
        with open(path, 'w', newline='') as file:
            writer = csv.writer(
                file, delimiter=',', lineterminator='\r\n',
                quoting=csv.QUOTE_MINIMAL,
                quotechar='"',
            )
            writer.writerows(description_rows)
예제 #6
0
def _validate_side(df, side: str) -> None:
    """Validate the ``{side}_db``/``{side}_id`` columns of the xrefs dataframe.

    :param df: The dataframe returned by ``get_xrefs_df``.
    :param side: Either ``'source'`` or ``'target'``.
    :raises ValueError: If a prefix is not already in its normalized form, or
        if ``bioregistry.validate`` rejects the prefix/identifier pair.
    """
    columns = [f'{side}_db', f'{side}_id']
    for i, (prefix, identifier) in df[columns].iterrows():
        norm_prefix = bioregistry.normalize_prefix(prefix)
        if prefix != norm_prefix:
            raise ValueError(
                f'[line {i}] invalid {side} prefix: {prefix} should be {norm_prefix}')
        if not bioregistry.validate(prefix, identifier):
            raise ValueError(
                f'[line {i}] Invalid {side} curie: {prefix}:{identifier} '
                f'for pattern {bioregistry.get_pattern(prefix)}',
            )


def validate():
    """Validate identifiers.

    All source rows are checked before any target row.

    :raises ValueError: On the first invalid prefix or identifier found.
    """
    df = get_xrefs_df()
    for side in ('source', 'target'):
        _validate_side(df, side)
예제 #7
0
def normalize_prefix(prefix: str, *, curie=None, xref=None, strict: bool = True) -> Optional[str]:
    """Return the Bioregistry-normalized prefix, or ``None`` if there is none.

    :param prefix: The prefix to normalize.
    :param curie: The CURIE in which the prefix was found, if any.
    :param xref: The xref in which the prefix was found, if any.
    :param strict: If true, raise for prefixes that can not be normalized
        and are not otherwise skipped.
    :raises MissingPrefix: If the prefix is unknown, ``strict`` is true, and
        the CURIE is neither missing, ``obo:``-prefixed, nor ``UBERON:``-prefixed.
    """
    normalized = bioregistry.normalize_prefix(prefix)
    if normalized is not None:
        return normalized

    # With no CURIE for context, or an ``obo:`` one, a miss is ignored.
    if curie is None or curie.startswith('obo:'):
        return None

    if curie.startswith('UBERON:'):
        # uberon has tons of xrefs to anatomical features. record and skip them
        UBERON_UNHANDLED[prefix].append((curie, xref))
        return None

    if strict:
        raise MissingPrefix(prefix=prefix, curie=curie, xref=xref)
    return None
예제 #8
0
파일: famplex.py 프로젝트: fossabot/pyobo
def get_remapping(force: bool = False) -> Mapping[Tuple[str, str], Tuple[str, str, str]]:
    """Build a mapping from (prefix, identifier) pairs to FamPlex triples.

    :param force: Passed through to ``_get_famplex_df``.
    :returns: A dictionary keyed by normalized prefix and target identifier,
        whose values are ``('fplx', source_id, source_id)``.
    """
    remapping = {}
    for row in _get_famplex_df(force=force).values:
        target_ns, target_id, source_id = row
        if target_ns.lower() == 'medscan':
            # MEDSCAN is proprietary and Ben said to skip using these identifiers
            continue
        remapped_prefix = normalize_prefix(target_ns)
        if remapped_prefix is not None:
            remapping[remapped_prefix, target_id] = 'fplx', source_id, source_id
        else:
            logger.warning('could not remap %s', target_ns)
    return remapping
예제 #9
0
def xrefs(prefix: str, target: str, force: bool, no_strict: bool):
    """Page through xrefs for the given namespace to the second given namespace."""
    strict = not no_strict

    # Without a target, show the full xrefs table for the namespace.
    if not target:
        echo_df(get_xrefs_df(prefix, force=force, strict=strict))
        return

    target = bioregistry.normalize_prefix(target)
    filtered_xrefs = get_filtered_xrefs(prefix, target, force=force, strict=strict)
    # One tab-separated "identifier<TAB>xref" line per entry.
    lines = (
        f"{identifier}\t{_xref}"
        for identifier, _xref in filtered_xrefs.items()
    )
    click.echo_via_pager("\n".join(lines))
예제 #10
0
 def test_resolve(self):
     """Check that synonyms and case variants normalize to the expected prefix."""
     # query -> prefix that the Bioregistry is expected to return for it
     expected_by_query = {
         'ncbitaxon': 'ncbitaxon',
         'NCBITaxon': 'ncbitaxon',
         'taxonomy': 'ncbitaxon',
         'SCOMP': 'bel',
         'SFAM': 'bel',
         'ec-code': 'eccode',
         'EC_CODE': 'eccode',
         'chembl.compound': 'chembl.compound',
         'chemblcompound': 'chembl.compound',
         'chembl': 'chembl',
     }
     for query, expected in expected_by_query.items():
         with self.subTest(query=query):
             self.assertEqual(expected, bioregistry.normalize_prefix(query))
예제 #11
0
    def _align(self):
        """Attach matching external registry entries to the internal registry.

        An external identifier is matched through the existing
        ``external_id_to_bioregistry_id`` mapping first, then through
        ``normalize_prefix``. Matches are stored under ``self.key`` and
        recorded back into the mapping.
        """
        for external_id, external_entry in self.external_registry.items():
            if external_id in self.skip_external:
                continue

            known_id = self.external_id_to_bioregistry_id.get(external_id)
            # try to lookup with lexical match when there is no known mapping
            bioregistry_id = normalize_prefix(external_id) if known_id is None else known_id
            if bioregistry_id is None:
                continue  # no match was found

            record = self.prepare_external(external_id, external_entry)
            record['prefix'] = external_id
            self.internal_registry[bioregistry_id][self.key] = record
            self.external_id_to_bioregistry_id[external_id] = bioregistry_id
예제 #12
0
def normalize_prefix(prefix: str, *, curie=None, xref=None, strict: bool = True) -> Optional[str]:
    """Return the Bioregistry-normalized prefix, or ``None`` if there is none.

    :param prefix: The prefix to normalize.
    :param curie: The CURIE in which the prefix was found, if any.
    :param xref: The xref in which the prefix was found, if any.
    :param strict: If true, raise for prefixes that can not be normalized
        and are not otherwise skipped.
    :raises MissingPrefix: If the prefix is unknown, ``strict`` is true, and
        the CURIE does not fall under one of the skipped forms below.
    """
    normalized = bioregistry.normalize_prefix(prefix)
    if normalized is not None:
        return normalized

    if curie is None:
        return None
    # CURIEs starting with "obo:", "http" or "urn:" are skipped without error.
    if curie.startswith(("obo:", "http", "urn:")):
        return None

    if curie.startswith("UBERON:"):
        # uberon has tons of xrefs to anatomical features. record and skip them
        UBERON_UNHANDLED[prefix].append((curie, xref))
    elif strict:
        raise MissingPrefix(prefix=prefix, curie=curie, xref=xref)
    return None
예제 #13
0
 def test_remap_scomp(self, *_):
     """Test remapping SCOMP to FPLX."""
     self.assertIsNotNone(bioregistry.normalize_prefix('BEL'))

     remapping_key = ('bel', 'gamma Secretase Complex')
     self.assertIn(
         remapping_key,
         _NAME_REMAPPING,
         msg='name remapping is not populated properly',
     )

     expected = {
         NAMESPACE: 'fplx',
         NAME: 'Gamma_secretase',
         IDENTIFIER: 'Gamma_secretase',
     }
     concept = {
         NAMESPACE: 'SCOMP',
         NAME: 'gamma Secretase Complex',
     }
     self._help(expected, concept)
예제 #14
0
 def __post_init__(self):
     """Validate the ontology prefix and fill in unset metadata.

     :raises BioregistryError: If ``ontology`` is not its own normalized prefix.
     :raises ValueError: If versioning is not dynamic and the data version is
         missing or contains a slash.
     """
     if bioregistry.normalize_prefix(self.ontology) != self.ontology:
         raise BioregistryError(self.ontology)

     # The type ignores are because of the hack where we override the
     # class variables in the instance
     if self.name is None:
         self.name = bioregistry.get_name(self.ontology)  # type:ignore

     # Prefer the static version and only compute one when it is unset.
     if not self.data_version:
         self.data_version = self.static_version or self._get_version()

     if not self.dynamic_version:
         if self.data_version is None:
             raise ValueError(f"{self.ontology} is missing data_version")
         if "/" in self.data_version:
             raise ValueError(f"{self.ontology} has a slash in version: {self.data_version}")

     if self.auto_generated_by is None:
         self.auto_generated_by = f"bio2obo:{self.ontology}"  # type:ignore
예제 #15
0
파일: ui.py 프로젝트: pagreene/bioregistry
def resolve(prefix: str, identifier: Optional[str] = None):
    """Redirect a CURIE to a provider URL, or render a 404 page.

    A 404 page is rendered in any of these cases:

    1. The prefix is not registered with the Bioregistry
    2. The prefix has a validation pattern and the identifier does not match it
    3. There are no providers available for the URL
    4. The redirect to the provider URL could not be constructed

    When no identifier is given, the request is redirected to the resource
    page of the normalized prefix instead.
    """  # noqa:DAR101,DAR201

    def _not_found(template: str, **extra):
        """Render an error template for this prefix/identifier with status 404."""
        page = render_template(template, prefix=prefix, identifier=identifier, **extra)
        return page, 404

    norm_prefix = bioregistry.normalize_prefix(prefix)
    if norm_prefix is None:
        return _not_found('resolve_missing_prefix.html')
    if identifier is None:
        endpoint = '.' + resource.__name__
        return redirect(url_for(endpoint, prefix=norm_prefix))

    pattern = bioregistry.get_pattern(prefix)
    if pattern and not bioregistry.validate(prefix, identifier):
        return _not_found('resolve_invalid_identifier.html', pattern=pattern)

    url = bioregistry.get_link(prefix, identifier, use_bioregistry_io=False)
    if not url:
        return _not_found('resolve_missing_providers.html')

    try:
        # TODO remove any garbage characters?
        return redirect(url)
    except ValueError:  # headers could not be constructed
        return _not_found('resolve_disallowed_identifier.html')
예제 #16
0
def get_identifier(
        namespace: str,
        name: str) -> Union[Tuple[str, None, str], Tuple[str, str, str]]:
    """Look up the identifier for a name in the given namespace.

    :param namespace: The namespace (prefix) the name belongs to.
    :param name: The name to look up.
    :returns: A ``(namespace, identifier, name)`` triple in which the
        identifier is ``None`` when no lookup succeeded. ``SFAM``/``SCOMP``
        names are looked up in ``bel_fplx`` and returned under ``fplx``;
        ``SCHEM``/``CHEBI`` names are grounded with ``pyobo.ground``.
    :raises ValueError: If the namespace can not be normalized.
    """
    if namespace in {'SFAM', 'SCOMP'}:
        return 'fplx', bel_fplx.get(name), name
    if namespace in {'SCHEM', 'CHEBI'}:
        prefix, identifier, name = pyobo.ground('chebi', name)
        return prefix or namespace, identifier, name

    norm_namespace = bioregistry.normalize_prefix(namespace)
    if norm_namespace is None:
        raise ValueError(f'could not normalize {namespace}')
    namespace = norm_namespace

    # Namespaces that already failed are not retried.
    if namespace in MISSING_NAMESPACE:
        return namespace, None, name
    try:
        name_id_mapping = pyobo.get_name_id_mapping(namespace)
    except Exception:
        # Best-effort lookup: any failure marks the namespace as missing.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        logger.info('missing namespace: %s', namespace)
        logger.debug('lookup failure for %s', namespace, exc_info=True)
        MISSING_NAMESPACE.add(namespace)
        return namespace, None, name

    if not name_id_mapping:
        if namespace not in MISSING_NAMESPACE:
            logger.info('empty namespace: %s', namespace)
            MISSING_NAMESPACE.add(namespace)
        return namespace, None, name

    identifier = name_id_mapping.get(name)
    if identifier:
        return namespace, identifier, name

    # Log each missing (namespace, name) pair only once.
    if (namespace, name) not in MISSING:
        MISSING.add((namespace, name))
        logger.debug('missing lookup for %s ! %s', namespace, name)
    return namespace, None, name
예제 #17
0
from famplex.load import load_descriptions, load_entities, load_equivalences
from famplex.locations import DESCRIPTIONS_PATH

# Absolute path of the directory containing this module.
HERE = os.path.abspath(os.path.dirname(__file__))
# descriptions.csv sits two directory levels above this module.
PATH = os.path.abspath(
    os.path.join(HERE, os.pardir, os.pardir, 'descriptions.csv'),
)

# Prefixes consulted for definitions, stored in their Bioregistry-normalized
# form (presumably listed from most to least preferred).
PRIORITY_LIST = [
    bioregistry.normalize_prefix(raw_prefix)
    for raw_prefix in (
        'HGNC_GROUP',
        'go',
        'mesh',
        'PF',
        'reactome',
        'eccode',
        'interpro',
    )
]


@click.command()
@verbose_option
@click.option('--force', is_flag=True)
def main(force: bool):
    # Documented with comments rather than a docstring, since click would
    # show a docstring as the command's help text.
    #
    # With --force, refresh the definition mapping of every priority prefix.
    if force:
        progress = tqdm(PRIORITY_LIST, desc='reloading resources')
        for resource_prefix in progress:
            tqdm.write(f'reloading {resource_prefix}')
            pyobo.get_id_definition_mapping(resource_prefix, force=True)

    # Existing rows as (entity, source, description) tuples.
    description_rows = list(map(tuple, load_descriptions()))
    descriptions = {}
    for entity, _source, description in description_rows:
        descriptions[entity] = description
    xrefs = defaultdict(dict)
예제 #18
0
 def test_non_registry(self):
     """Check that every INDRA non-registry prefix normalizes in the Bioregistry."""
     for indra_prefix in indra.databases.identifiers.non_registry:
         with self.subTest(prefix=indra_prefix):
             normalized = bioregistry.normalize_prefix(indra_prefix)
             self.assertIsNotNone(normalized)
예제 #19
0
def has_no_download(prefix: str) -> bool:
    """Return whether the normalized prefix is listed by ``_no_download()``.

    Prefixes that can not be normalized yield ``False``.
    """
    normalized = bioregistry.normalize_prefix(prefix)
    if normalized is None:
        return False
    return normalized in _no_download()
예제 #20
0
def not_available_as_obo(prefix: str) -> bool:
    """Return whether the normalized prefix is listed by ``get_not_available_as_obo()``.

    Prefixes that can not be normalized yield ``False``.
    """
    normalized = bioregistry.normalize_prefix(prefix)
    if normalized is None:
        return False
    return normalized in get_not_available_as_obo()