def validate(db_ns, db_id):
    """Return whether ``db_id`` is acceptable for the namespace ``db_ns``.

    Every namespace is checked with ``validate_id``, with one exception: an
    HGNC entry that is not purely numeric is treated as a gene name and is
    accepted when ``get_hgnc_id`` finds an ID for it.
    """
    # Anything that is not a non-numeric HGNC entry takes the generic path.
    # The namespace test comes first so that ``db_id.isdigit()`` is only
    # evaluated for HGNC entries.
    if db_ns != 'HGNC' or db_id.isdigit():
        return validate_id(db_ns, db_id)
    # HGNC entry given by name rather than by numeric ID
    return get_hgnc_id(db_id) is not None
Exemple #2
0
def id_url(ag):
    """Return an identifiers.org URL for the first valid grounding of ``ag``.

    Namespaces are tried in the order given by ``link_namespace_order``.
    Groundings that fail ``validate_id`` are logged at debug level and
    skipped. ``None`` is returned when no namespace yields a valid ID.
    """
    # TODO: we should add handling for UPPRO here, however, that would require
    # access to UniProt client resources in the context of the DB REST API
    # which could be problematic
    for namespace in link_namespace_order:
        refs = ag.db_refs
        if namespace not in refs:
            continue
        entry = refs[namespace]
        if isinstance(entry, list):
            # Special case: a list of IDs is given, use the first one. For
            # WM the first element is itself indexable and its first item
            # is the ID.
            identifier = entry[0]
            if namespace == 'WM':
                identifier = identifier[0]
        else:
            identifier = entry
        # More name spaces can be added here if there are issues
        if namespace in {'CHEBI'}:
            identifier = ensure_prefix('CHEBI', identifier)
        # Only surface links for IDs that validate; the first valid one wins
        if validate_id(namespace, identifier):
            return get_identifiers_url(namespace, identifier)
        logger.debug('Invalid grounding encountered: %s:%s' %
                     (namespace, identifier))
    return None
Exemple #3
0
def update_mesh_supplementary_names():
    """Rebuild the MeSH supplementary-record resource tables.

    The 2021 supplementary record archive is downloaded into ``path`` unless
    a file of that name already exists there. The archive is then parsed and
    two tab-separated files are written into ``path``:

    - ``mesh_supp_id_label_mappings.tsv``: one row per supplemental record
      with its UI, name, term-name string (from ``_get_term_name_str``) and a
      comma-separated list of the descriptor UIs it maps to, with ``*``
      characters removed.
    - ``mesh_cas_mappings.tsv``: record UI and registry number for records
      that carry exactly one registry number which validates as a CAS ID.
    """
    archive_url = ('ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/'
                   'xmlmesh/supp2021.gz')
    archive_path = os.path.join(path, 'mesh_supp2021.gz')
    # An existing local copy is reused; the download only happens once
    if not os.path.exists(archive_path):
        logging.info('Download MeSH supplement from %s', archive_url)
        urlretrieve(archive_url, archive_path)
        logging.info('Done downloading MeSH supplement')
    with gzip.open(archive_path) as archive:
        logging.info('Parsing MeSH supplement')
        tree = ET.parse(archive)

    descriptor_xpath = ('HeadingMappedToList/HeadingMappedTo/'
                        'DescriptorReferredTo/DescriptorUI')
    label_rows = []
    cas_rows = []
    for rec in tree.iterfind('SupplementalRecord'):
        uid = rec.find('SupplementalRecordUI').text
        name = rec.find('SupplementalRecordName/String').text
        mapped_to = ','.join(node.text.replace('*', '')
                             for node in rec.findall(descriptor_xpath))
        term_name_str = _get_term_name_str(rec, name)
        label_rows.append((uid, name, term_name_str, mapped_to))

        # Only records with exactly one registry number are considered for
        # the CAS mapping table
        registry_nodes = rec.findall('ConceptList/Concept/RegistryNumber')
        if len(registry_nodes) != 1:
            continue
        registry_number = registry_nodes[0].text
        from indra.statements import validate
        if validate.validate_id('CAS', registry_number):
            cas_rows.append([uid, registry_number])

    label_fname = os.path.join(path, 'mesh_supp_id_label_mappings.tsv')
    write_unicode_csv(label_fname, label_rows, delimiter='\t')
    cas_fname = os.path.join(path, 'mesh_cas_mappings.tsv')
    write_unicode_csv(cas_fname, cas_rows, delimiter='\t')
Exemple #4
0
 def _make_agent(self, hprd_id, refseq_id=None):
     """Build an Agent for an HPRD protein ID, or None if it can't be mapped.

     Parameters
     ----------
     hprd_id :
         Key into ``self.id_df``. ``None`` and NaN yield ``None``.
     refseq_id : optional
         RefSeq protein ID used to look for an isoform-specific UniProt ID.
         When omitted, the ``REFSEQ_PROTEIN`` value for ``hprd_id`` in
         ``self.id_df`` is used for the db_refs only.

     Returns
     -------
     Agent or None
         An Agent named by the HGNC symbol, with HGNC, UP (and UPISO for
         isoform IDs), EGID and, when the RefSeq ID validates, a
         REFSEQ_PROT or NCBIPROTEIN entry in its db_refs. ``None`` when the
         HPRD ID is missing from the table, has no Entrez ID, or no HGNC or
         UniProt ID can be obtained.

     Notes
     -----
     Mapping problems are recorded on ``self.no_hgnc_for_egid``,
     ``self.no_up_for_hgnc``, ``self.no_up_for_refseq`` and
     ``self.many_ups_for_refseq``.
     """
     def _tally(tracker, item):
         # Record ``item`` as exactly ONE entry. Calling
         # ``tracker.update(item)`` directly iterates over ``item``, so a
         # string ID would be recorded character by character and a tuple
         # element by element.
         # NOTE(review): the tracker container types are not visible from
         # this method, so both list-like (append) and set/Counter-like
         # (update) trackers are supported -- confirm against __init__.
         if hasattr(tracker, 'append'):
             tracker.append(item)
         else:
             tracker.update([item])

     def _refseq_namespace(candidate):
         # Try REFSEQ_PROT first, then NCBIPROTEIN; None if neither
         # validates.
         if validate_id('REFSEQ_PROT', candidate):
             return 'REFSEQ_PROT'
         if validate_id('NCBIPROTEIN', candidate):
             return 'NCBIPROTEIN'
         return None

     # ``is nan`` only matches the one imported NaN object; the
     # self-inequality test also catches any other NaN float.
     if hprd_id is None or hprd_id is nan or \
             (isinstance(hprd_id, float) and hprd_id != hprd_id):
         return None
     # Get the basic info (HGNC name/symbol, Entrez ID) from the
     # ID mappings dataframe
     try:
         egid = self.id_df.loc[hprd_id].EGID
     except KeyError:
         logger.info('HPRD ID %s not found in mappings table.', hprd_id)
         return None
     if not egid:
         logger.info('No Entrez ID for HPRD ID %s', hprd_id)
         return None
     # Get the HGNC ID
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     # If we couldn't get an HGNC ID for the Entrez ID, this means that
     # the Entrez ID has been discontinued or replaced.
     if not hgnc_id:
         _tally(self.no_hgnc_for_egid, egid)
         return None
     # Get the (possibly updated) HGNC Symbol
     hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
     assert hgnc_name is not None
     # See if we can get a Uniprot ID from the HGNC symbol--if there is
     # a RefSeq ID we will also try to use it to get an isoform specific
     # UP ID, but we will have this one to fall back on. But if we can't
     # get one here, then we skip the Statement
     up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
     if not up_id_from_hgnc:
         _tally(self.no_up_for_hgnc, (egid, hgnc_name, hgnc_id))
         return None
     # If we have provided the RefSeq ID, it's because we need to make
     # sure that we are getting the right isoform-specific ID (for sequence
     # positions of PTMs). Here we try to get the Uniprot ID from the
     # Refseq->UP mappings in the protmapper.uniprot_client.
     if refseq_id is not None:
         refseq_ns = _refseq_namespace(refseq_id)
         if refseq_ns == 'REFSEQ_PROT':
             # Get the Uniprot IDs from the uniprot client
             up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
                                                         reviewed_only=True)
         else:
             up_ids = []
         # Nothing for this RefSeq ID (quite likely because the RefSeq
         # ID is obsolete); take the UP ID from HGNC
         if len(up_ids) == 0:
             _tally(self.no_up_for_refseq, refseq_id)
             up_id = up_id_from_hgnc
         # More than one reviewed entry--no thanks, we'll take the one
         # from HGNC instead
         elif len(up_ids) > 1:
             _tally(self.many_ups_for_refseq, refseq_id)
             up_id = up_id_from_hgnc
         # We got a unique, reviewed UP entry for the RefSeq ID
         else:
             up_id = up_ids[0]
             # If it's the canonical isoform, strip off the '-1'
             if up_id.endswith('-1'):
                 up_id = up_id.split('-')[0]
     # For completeness, get the Refseq ID from the HPRD ID table
     else:
         refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
         refseq_ns = _refseq_namespace(refseq_id)
         up_id = up_id_from_hgnc
     # Make db_refs, return Agent
     db_refs = {}
     if hgnc_id:
         db_refs['HGNC'] = hgnc_id
     if up_id:
         if ',' in up_id:
             # A comma-separated value is not a single UniProt ID; leave
             # UP out of the db_refs
             pass
         elif '-' in up_id:
             # Isoform ID: keep the base entry as UP and the full ID as
             # UPISO
             up_base = up_id.split('-')[0]
             db_refs['UP'] = up_base
             db_refs['UPISO'] = up_id
         else:
             db_refs['UP'] = up_id
     if egid:
         db_refs['EGID'] = egid
     if refseq_ns and refseq_id:
         db_refs[refseq_ns] = refseq_id
     return Agent(hgnc_name, db_refs=db_refs)