def validate(db_ns, db_id):
    """Return whether an identifier is valid, allowing HGNC symbols.

    For the 'HGNC' namespace a non-numeric identifier is treated as a gene
    name and is accepted when ``get_hgnc_id`` can resolve it; every other
    case is delegated to ``validate_id``.
    """
    # Only HGNC entries that are not purely digits go through name lookup
    looks_like_hgnc_name = db_ns == 'HGNC' and not db_id.isdigit()
    if looks_like_hgnc_name:
        resolved = get_hgnc_id(db_id)
        return resolved is not None
    return validate_id(db_ns, db_id)
def id_url(ag):
    """Return an identifiers.org URL for the top-priority valid grounding.

    Namespaces are tried in the order given by ``link_namespace_order``. The
    first namespace present in ``ag.db_refs`` whose identifier passes
    ``validate_id`` determines the result.

    Parameters
    ----------
    ag :
        Object exposing a ``db_refs`` mapping from namespace to an
        identifier, or to a list of identifiers.

    Returns
    -------
    The value of ``get_identifiers_url(db_name, db_id)`` for the first valid
    grounding, or None if no namespace yields a valid identifier.
    """
    # TODO: we should add handling for UPPRO here, however, that would require
    # access to UniProt client resources in the context of the DB REST API
    # which could be problematic
    for db_name in link_namespace_order:
        if db_name not in ag.db_refs:
            continue
        db_id = ag.db_refs[db_name]
        # Handle a special case where a list of IDs is given
        if isinstance(db_id, list):
            # An empty list has no usable ID; skip it instead of raising
            # IndexError on the [0] lookup below
            if not db_id:
                continue
            db_id = db_id[0]
            # For WM the first entry is itself indexed to get the ID
            # (presumably an (id, score) pair -- confirm against producer)
            if db_name == 'WM':
                db_id = db_id[0]
        # We can add more name spaces here if there are issues
        if db_name in {'CHEBI'}:
            db_id = ensure_prefix('CHEBI', db_id)
        # Here we validate IDs to make sure we don't surface invalid
        # links.
        if not validate_id(db_name, db_id):
            logger.debug('Invalid grounding encountered: %s:%s',
                         db_name, db_id)
            continue
        # Finally, we return a valid identifiers.org URL
        return get_identifiers_url(db_name, db_id)
    # No namespace produced a valid identifier
    return None
def update_mesh_supplementary_names():
    """Update MeSH ID to name mappings for supplementary terms.

    Downloads the 2021 MeSH supplementary records file into ``path`` if it
    is not already there, parses it, and writes two tab-separated files in
    ``path``: ``mesh_supp_id_label_mappings.tsv`` with one row per
    supplemental record (UI, name, term-name string, mapped-to descriptor
    UIs) and ``mesh_cas_mappings.tsv`` with (UI, registry number) pairs for
    records that have exactly one registry number that validates as a CAS
    identifier.
    """
    supp_url = ('ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/'
                'xmlmesh/supp2021.gz')
    supp_path = os.path.join(path, 'mesh_supp2021.gz')
    if not os.path.exists(supp_path):
        logging.info('Download MeSH supplement from %s', supp_url)
        urlretrieve(supp_url, supp_path)
        logging.info('Done downloading MeSH supplement')
    with gzip.open(supp_path) as supp_file:
        logging.info('Parsing MeSH supplement')
        supp_et = ET.parse(supp_file)

    # Kept as a function-local import, but executed once rather than once
    # per matching record inside the loop below
    from indra.statements import validate

    supp_rows = []
    reg_number_mappings = []
    for record in supp_et.iterfind('SupplementalRecord'):
        uid = record.find('SupplementalRecordUI').text
        name = record.find('SupplementalRecordName/String').text
        mapped_to_terms = record.findall('HeadingMappedToList/HeadingMappedTo/'
                                         'DescriptorReferredTo/DescriptorUI')
        # Descriptor UIs may carry a '*' marker, which is stripped here
        mapped_to = ','.join(term.text.replace('*', '')
                             for term in mapped_to_terms)
        term_name_str = _get_term_name_str(record, name)
        reg_number_tags = record.findall('ConceptList/Concept/RegistryNumber')
        # Only an unambiguous (single) registry number is considered
        if len(reg_number_tags) == 1:
            reg_number = reg_number_tags[0].text
            if validate.validate_id('CAS', reg_number):
                reg_number_mappings.append([uid, reg_number])
        supp_rows.append((uid, name, term_name_str, mapped_to))

    fname = os.path.join(path, 'mesh_supp_id_label_mappings.tsv')
    write_unicode_csv(fname, supp_rows, delimiter='\t')
    fname = os.path.join(path, 'mesh_cas_mappings.tsv')
    write_unicode_csv(fname, reg_number_mappings, delimiter='\t')
def _make_agent(self, hprd_id, refseq_id=None): if hprd_id is None or hprd_id is nan: return None # Get the basic info (HGNC name/symbol, Entrez ID) from the # ID mappings dataframe try: egid = self.id_df.loc[hprd_id].EGID except KeyError: logger.info('HPRD ID %s not found in mappings table.' % hprd_id) return None if not egid: logger.info('No Entrez ID for HPRD ID %s' % hprd_id) return None # Get the HGNC ID hgnc_id = hgnc_client.get_hgnc_from_entrez(egid) # If we couldn't get an HGNC ID for the Entrez ID, this means that # the Entrez ID has been discontinued or replaced. if not hgnc_id: self.no_hgnc_for_egid.update(egid) return None # Get the (possibly updated) HGNC Symbol hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) assert hgnc_name is not None # See if we can get a Uniprot ID from the HGNC symbol--if there is # a RefSeq ID we wil also try to use it to get an isoform specific # UP ID, but we will have this one to fall back on. But if we can't # get one here, then we skip the Statement up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id) if not up_id_from_hgnc: self.no_up_for_hgnc.update((egid, hgnc_name, hgnc_id)) return None # If we have provided the RefSeq ID, it's because we need to make # sure that we are getting the right isoform-specific ID (for sequence # positions of PTMs). Here we try to get the Uniprot ID from the # Refseq->UP mappings in the protmapper.uniprot_client. 
if refseq_id is not None: if not validate_id('REFSEQ_PROT', refseq_id): if validate_id('NCBIPROTEIN', refseq_id): refseq_ns = 'NCBIPROTEIN' else: refseq_ns = None else: refseq_ns = 'REFSEQ_PROT' if refseq_ns == 'REFSEQ_PROT': # Get the Uniprot IDs from the uniprot client up_ids = uniprot_client.get_ids_from_refseq(refseq_id, reviewed_only=True) else: up_ids = [] # Nothing for this RefSeq ID (quite likely because the RefSeq # ID is obsolete; take the UP ID from HGNC if len(up_ids) == 0: self.no_up_for_refseq.update(refseq_id) up_id = up_id_from_hgnc # More than one reviewed entry--no thanks, we'll take the one # from HGNC instead elif len(up_ids) > 1: self.many_ups_for_refseq.update(refseq_id) up_id = up_id_from_hgnc # We got a unique, reviewed UP entry for the RefSeq ID else: up_id = up_ids[0] # If it's the canonical isoform, strip off the '-1' if up_id.endswith('-1'): up_id = up_id.split('-')[0] # For completeness, get the Refseq ID from the HPRD ID table else: refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN if not validate_id('REFSEQ_PROT', refseq_id): if validate_id('NCBIPROTEIN', refseq_id): refseq_ns = 'NCBIPROTEIN' else: refseq_ns = None else: refseq_ns = 'REFSEQ_PROT' up_id = up_id_from_hgnc # Make db_refs, return Agent db_refs = {} if hgnc_id: db_refs['HGNC'] = hgnc_id if up_id: if ',' in up_id: pass elif '-' in up_id: up_base = up_id.split('-')[0] db_refs['UP'] = up_base db_refs['UPISO'] = up_id else: db_refs['UP'] = up_id if egid: db_refs['EGID'] = egid if refseq_ns and refseq_id: db_refs[refseq_ns] = refseq_id return Agent(hgnc_name, db_refs=db_refs)