def get_specific_chebi_id(chebi_ids, name):
    """Return the most appropriate ChEBI ID for a set of IDs and a name.

    NOTE: this is factored out mainly so that results can be cached; the
    chebi_ids argument must be a frozenset for the caching to work.
    """
    # A manual override for this name always takes precedence
    override = manual_chebi_map.get(name)
    if override:
        return override
    # Map secondary IDs onto their primaries, dropping the occasional
    # invalid ID whose primary lookup fails (i.e., returns None)
    primaries = set()
    for cid in chebi_ids:
        primary = chebi_client.get_primary_id(cid)
        if primary is not None:
            primaries.add(primary)
    # Generic IDs are never useful for grounding, so discard them
    candidates = primaries - generic_chebi_ids
    # Prefer a candidate whose standard ChEBI name matches the query name
    # (case-insensitively)
    for cand in candidates:
        cand_name = chebi_client.get_chebi_name_from_id(cand)
        if cand_name and name.lower() == cand_name.lower():
            return cand
    # Otherwise, distill the candidates down to the most specific ID
    # based on the ChEBI hierarchy
    return chebi_client.get_specific_id(candidates)
def fix_id_standards(db_ns, db_id):
    """Normalize an identifier to the conventions of its namespace.

    ChEBI IDs get a CHEBI: prefix added if missing and are then mapped to
    their primary ID; HGNC IDs have a redundant HGNC: prefix stripped.
    Other namespaces pass through unchanged.
    """
    if db_ns == 'CHEBI':
        prefixed = db_id if db_id.startswith('CHEBI:') else f'CHEBI:{db_id}'
        db_id = chebi_client.get_primary_id(prefixed)
    elif db_ns == 'HGNC' and db_id.startswith('HGNC:'):
        db_id = db_id[len('HGNC:'):]
    return db_ns, db_id
def generate_chebi_terms():
    """Generate grounding Terms from ChEBI standard names and synonyms.

    Standard names are read from the local chebi_entries.tsv resource.
    Synonyms are read from the names_3star.tsv flat file which is not in
    version control; if it is missing locally, it is downloaded on the fly
    from the EBI FTP site.

    Returns
    -------
    list of Term
        Terms for ChEBI names (status 'name') and synonyms
        (status 'synonym').
    """
    fname = os.path.join(indra_resources, 'chebi_entries.tsv')
    logger.info('Loading %s' % fname)
    terms = []
    for row in read_csv(fname, header=True, delimiter='\t'):
        db = 'CHEBI'
        # Renamed from `id` to avoid shadowing the builtin
        db_id = 'CHEBI:' + row['CHEBI_ID']
        name = row['NAME']
        term = Term(normalize(name), name, db, db_id, name, 'name', 'chebi')
        terms.append(term)
    logger.info('Loaded %d terms' % len(terms))
    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    fname = os.path.join(indra_resources, 'names_3star.tsv')
    if not os.path.exists(fname):
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
            'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.'
                    % chebi_url)
        df = pd.read_csv(chebi_url, sep='\t')
        rows = (row for _, row in df.iterrows())
    else:
        rows = read_csv(fname, header=True, delimiter='\t')
    added = set()
    for row in rows:
        # Map to primary ID; skip rows whose ID cannot be resolved
        chebi_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not chebi_id:
            logger.info('Could not get valid CHEBI ID for %s'
                        % row['COMPOUND_ID'])
            continue
        db = 'CHEBI'
        # Renamed from `id` to avoid shadowing the builtin
        db_id = 'CHEBI:%s' % chebi_id
        name = str(row['NAME'])
        chebi_name = \
            chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
        if chebi_name is None:
            logger.info('Could not get valid name for %s' % chebi_id)
            continue
        # Deduplicate on the full argument tuple so identical synonym
        # entries produce only one Term
        term_args = (normalize(name), name, db, db_id, chebi_name,
                     'synonym', 'chebi')
        if term_args in added:
            continue
        else:
            term = Term(*term_args)
            terms.append(term)
            added.add(term_args)
    logger.info('Loaded %d terms' % len(terms))
    return terms
def generate_chebi_terms():
    """Generate grounding Terms for ChEBI names and synonyms.

    Standard names come from the ChEBI OBO export via _generate_obo_terms.
    Synonyms are read from the names_3star.tsv flat file (not in version
    control); if it is missing locally, it is downloaded on the fly from
    the EBI FTP site.

    Returns
    -------
    list of Term
        OBO-derived name Terms plus synonym Terms (status 'synonym').
    """
    # We can get standard names directly from the OBO
    terms = _generate_obo_terms('chebi', ignore_mappings=True, map_to_ns={})
    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    fname = os.path.join(indra_resources, 'names_3star.tsv')
    if not os.path.exists(fname):
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
            'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.'
                    % chebi_url)
        df = pd.read_csv(chebi_url, sep='\t')
        rows = (row for _, row in df.iterrows())
    else:
        rows = read_csv(fname, header=True, delimiter='\t')
    added = set()
    for row in rows:
        # Resolve to the primary ChEBI ID; skip unresolvable entries
        chebi_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not chebi_id:
            logger.info('Could not get valid CHEBI ID for %s'
                        % row['COMPOUND_ID'])
            continue
        db = 'CHEBI'
        name = str(row['NAME'])
        chebi_name = \
            chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
        if chebi_name is None:
            logger.info('Could not get valid name for %s' % chebi_id)
            continue
        # We skip entries of the form Glu-Lys with synonyms like EK since
        # they are highly ambiguous with other acronyms, and are unlikely
        # to be used in practice.
        if is_aa_sequence(chebi_name) and re.match(r'(^[A-Z-]+$)', name):
            continue
        # Deduplicate on the full argument tuple so identical synonym
        # entries yield only one Term
        term_args = (normalize(name), name, db, chebi_id, chebi_name,
                     'synonym', 'chebi')
        if term_args in added:
            continue
        else:
            term = Term(*term_args)
            terms.append(term)
            added.add(term_args)
    logger.info('Loaded %d terms' % len(terms))
    return terms
def sanitize_chebi_ids(chebi_ids, name):
    """Clean up a collection of ChEBI IDs and choose the best grounding.

    IDs are prefixed with CHEBI: if needed and mapped to their primary
    form; if more than one distinct primary ID remains, the most specific
    one for the given name is selected.
    """
    # Ensure every ID carries the CHEBI: prefix
    prefixed = set()
    for cid in chebi_ids:
        prefixed.add(cid if cid.startswith('CHEBI:') else 'CHEBI:%s' % cid)
    primaries = {chebi_client.get_primary_id(cid) for cid in prefixed}
    # Failed primary-ID lookups yield None; eliminate those here
    primaries.discard(None)
    if not primaries:
        return []
    if len(primaries) == 1:
        return list(primaries)
    return get_specific_chebi_id(frozenset(primaries), name)
def _get_primary_id_wrapper(chebi_id):
    """Return the primary ChEBI ID without its 'CHEBI:' prefix."""
    # NOTE(review): assumes get_primary_id never returns None for the
    # inputs used here — confirm; a None would raise TypeError on slicing.
    primary = get_primary_id(chebi_id)
    return primary[6:]
def test_chebi_to_primary():
    """Check secondary-to-primary mapping and primary-ID passthrough."""
    # A known secondary ID should map to its primary equivalent
    secondary, expected_primary = 'CHEBI:6281', 'CHEBI:17490'
    assert chebi_client.get_primary_id(secondary) == expected_primary
    # An ID that is already primary should map to itself
    already_primary = 'CHEBI:161680'
    assert chebi_client.get_primary_id(already_primary) == already_primary