Example #1
0
 def synonyms2chembl(self, synonyms, like=True):
     """
     Look up ChEMBL IDs for compound synonyms; the outcome is stored in
     ``self.result`` (nothing is returned).

     First two queries are sent: one built from ``self.comp_syn`` with the
     quoted synonym list, and one built from ``self.comp_rec`` matching the
     ``name`` field with ``IN (...)``. Rows are expected to provide the
     keys ``syn`` and ``chembl_id``; rows are assigned to the input
     synonyms by case-insensitive comparison of ``syn``.

     If ``like`` is true, every synonym which produced no row above and
     does not consist of digits only is searched once more with
     ``LIKE "%synonym%"`` on the ``name`` and on the ``key`` field, and
     the shortest non-empty list of IDs is kept.

     :param synonyms:
         Synonym strings. Iterated more than once, so it must be a
         re-iterable collection (e.g. a list), not a generator.
     :param like:
         Whether to fall back to the substring search described above.

     After the call ``self.result`` maps synonyms (spelled as in the
     input) to their ChEMBL IDs passed through ``common.uniqList``.
     """
     self.result = {}
     syn_lower = {s.lower(): s for s in synonyms}
     # NOTE(review): synonyms are interpolated into the SQL text without
     # escaping, so a synonym containing a double quote breaks the query.
     # Confirm that the input is trusted or escape it in the MySQL wrapper.
     syn_lst = ','.join('"%s"' % syn for syn in synonyms)
     synq = self.comp_syn % syn_lst
     recq = self.comp_rec % ('name', ' IN (%s)' % syn_lst)
     synqid = self.mysql.get_qid(synq)
     recqid = self.mysql.get_qid(recq)
     self.mysql.send_query(synq, silent=True)
     self.mysql.send_query(recq, silent=True)
     self.mysql.wait_results([synqid, recqid])
     self.mysql_ready()
     for r in chain(self.mysql.get_result(synqid),
                    self.mysql.get_result(recqid)):
         # NOTE(review): raises KeyError if the lower-cased `syn` value of
         # a row is not among the lower-cased input synonyms.
         syn = syn_lower[r['syn'].lower()]
         # the key is created even if the row carries no ChEMBL ID
         hits = self.result.setdefault(syn, [])
         if r['chembl_id'] is not None:
             hits.append(r['chembl_id'])
     if like:
         like_results = {}
         notfound = [
             n for n in set(synonyms) - set(self.result)
             if not n.isdigit()
         ]
         qids = {}
         for field in ['name', 'key']:
             for syn in notfound:
                 q = self.comp_rec % (field, ' LIKE "%%%s%%"' % syn)
                 qid = self.mysql.get_qid(q)
                 qids[qid] = syn
                 self.mysql.send_query(q, silent=True)
         # a real list, like in the first `wait_results` call above;
         # `dict.keys()` is only a view on Python 3
         self.mysql.wait_results(list(qids))
         self.mysql_ready()
         for qid, syn in iteritems(qids):
             this_result = [
                 r['chembl_id'] for r in self.mysql.get_result(qid)
                 if r['chembl_id'] is not None
             ]
             # one list of IDs per query (i.e. per searched field)
             like_results.setdefault(syn, []).append(this_result)
         for syn, results in iteritems(like_results):
             # choosing the shortest returned list of ChEMBL IDs
             if len(results) > 0 and syn not in self.result:
                 results = [common.uniqList(r) for r in results]
                 # keep `x` if `y` is empty or `x` is non-empty and
                 # shorter than `y`; otherwise continue with `y`
                 self.result[syn] = reduce(
                     lambda x, y: x
                     if len(y) == 0 or 0 < len(x) < len(y) else y,
                     results)
     self.result = {
         k: common.uniqList(v) for k, v in iteritems(self.result)
     }
Example #2
0
    def map_names(self,
                  names,
                  nameType,
                  targetNameType,
                  ncbi_tax_id=None,
                  strict=False,
                  silent=True):
        """
        Translate several identifiers in one call.

        Every element of ``names`` is handed to `map_name` together with
        the other arguments unchanged; the individual results are chained
        one after the other and the whole sequence is passed through
        ``common.uniqList``, whose return value is returned.
        """

        # all names are translated first, the results are merged afterwards
        per_name = [
            self.map_name(name,
                          nameType,
                          targetNameType,
                          ncbi_tax_id=ncbi_tax_id,
                          strict=strict,
                          silent=silent)
            for name in names
        ]

        return common.uniqList(itertools.chain.from_iterable(per_name))
Example #3
0
 def write_set(self, id_list, setname, id_type, map_ids=True):
     """
     Store a set of identifiers in ``self.sets`` under the key ``setname``.

     If ``map_ids`` is true, each element of ``id_list`` is translated by
     ``self.mapper.map_name`` from the ID type ``self.ids[id_type]`` to
     ``self.target_id``; the results are flattened and made unique by the
     helpers in ``common`` before being stored. Otherwise the elements of
     ``id_list`` are stored as they are and ``id_type`` is not used.
     """
     if map_ids:
         translated = (
             self.mapper.map_name(n, self.ids[id_type], self.target_id)
             for n in id_list
         )
         members = set(common.uniqList(common.flatList(translated)))
     else:
         members = set(id_list)
     self.sets[setname] = members
Example #4
0
def _records_to_frame(records, columns):
    """
    Build a data frame from a list of tuples and label its columns.

    An empty ``records`` yields an empty frame which still carries
    ``columns``: ``DataFrame.from_records([])`` has no columns at all,
    so assigning the labels to it would raise ``ValueError``.
    """

    if len(records) == 0:
        return pd.DataFrame(columns=columns)

    frame = pd.DataFrame.from_records(records)
    frame.columns = columns

    return frame


def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    Records already present in the cache file are reused, the missing
    ones are requested by ``dataio.get_pubmeds``; the cache file is
    rewritten only if it gained new records.

    :param pp:
        ``pypath.PyPath`` object
    :param cachefile:
        Path of the pickle file used as cache. If ``None``, the value of
        ``settings.get('pubmed_cache')`` is used.
    :param htp_threshold:
        The number of interactions for one reference
        above the study considered to be high-throughput.
        References listed in ``pp.htp[htp_threshold]['htrefs']`` are
        left out. If ``None``, no reference is excluded.
    :return:
        Tuple of two data frames. ``points`` has one row for each
        edge, resource and reference with the columns ``database``,
        ``pmid``, ``year``, ``journal`` and ``eid``. ``earliest`` has one
        row for each edge and resource holding the earliest year, with
        the columns ``database``, ``none``, ``year``, ``none`` and
        ``eid``. Both are empty if no reference has a publication date.
    """

    if cachefile is None:

        cachefile = settings.get('pubmed_cache')

    # references to ignore; stays empty if no threshold is in use
    htrefs = frozenset()

    if htp_threshold is not None:
        pp.htp_stats()
        htrefs = pp.htp[htp_threshold]['htrefs']

    # always a set, so the membership tests below are cheap
    pubmeds = set(common.uniqList(
        common.flatList([[r.pmid for r in e['references']]
                         for e in pp.graph.es]))) - htrefs

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}
    if os.path.exists(cachefile):
        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)
        # NOTE: unpickling can execute arbitrary code, the cache file
        # must come from a trusted location
        with open(cachefile, 'rb') as fp:
            pmdata = pickle.load(fp)

    missing = list(pubmeds - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = dataio.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    if len(pmdata) > cached_pubmeds_len:
        # report saving only if the cache is really written
        sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)
        with open(cachefile, 'wb') as fp:
            pickle.dump(pmdata, fp)

    # the cache may hold records which are not relevant for this network
    pmdata = {
        pmid: record for pmid, record in pmdata.items()
        if pmid in pubmeds
    }

    points = []
    earliest = []

    for e in pp.graph.es:

        for s, rs in iteritems(e['refs_by_source']):

            pms = [
                r.pmid for r in rs
                if r.pmid not in htrefs
                and r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]
            if len(pms) > 0:
                # the first 4 characters of `pubdate` are read as the year
                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                earliest.append((s, 0, min(yrs), '', e.index))
                for pm, yr in zip(pms, yrs):
                    points.append((s, pm, yr, pmdata[pm]['source'], e.index))

    points = _records_to_frame(
        common.uniqList(points),
        ['database', 'pmid', 'year', 'journal', 'eid'])
    earliest = _records_to_frame(
        common.uniqList(earliest),
        ['database', 'none', 'year', 'none', 'eid'])

    return points, earliest
Example #5
0
    def map_name(self,
                 name,
                 nameType,
                 targetNameType,
                 ncbi_tax_id=None,
                 strict=False,
                 silent=True):
        r"""
        Convert a single identifier from one ID type to another.

        This function should be used to convert individual IDs.
        It takes care of everything, you don't need to think
        about the details. How it works: it looks up the dictionaries
        between the original and the target ID type; if nothing is
        found, it retries with the upper case and (except for UniProt
        type inputs) the lower case form of the name.
        If the original name is a genesymbol which still could not be
        found, it takes an attempt with the alternative gene names
        (``genesymbol-syn``). If that fails too and ``strict`` is
        ``False``, the name with ``1`` appended is tried, and as the
        last attempt only the first 5 characters of the gene symbol
        are matched (``genesymbol5``). For ``mir-mat-name`` inputs
        without result the ``mir-name`` table is tried.
        If the target name type is uniprot, then it converts all the
        ACs to primary. Then, for the Trembl IDs it looks up the
        preferred gene names, and finds Swissprot IDs with the same
        preferred gene name; finally everything not matching
        ``self.reup`` is dropped.

        @name : str
            The original name which shall be converted.
        @nameType : str or list or tuple
            The type of the name. If a list or tuple of types is given,
            the name is translated from each of them and the results
            are merged.
            Available by default:
            - genesymbol (gene name)
            - entrez (Entrez Gene ID \[#\])
            - refseqp (NCBI RefSeq Protein ID \[NP\_\*|XP\_\*\])
            - ensp (Ensembl protein ID \[ENSP\*\])
            - enst (Ensembl transcript ID \[ENST\*\])
            - ensg (Ensembl genomic DNA ID \[ENSG\*\])
            - hgnc (HGNC ID \[HGNC:#\])
            - gi (GI number \[#\])
            - embl (DDBJ/EMBL/GeneBank CDS accession)
            - embl_id (DDBJ/EMBL/GeneBank accession)
            To use other IDs, you need to define the input method
            and load the table before calling :py:func:Mapper.map_name().
        @targetNameType : str
            The ID type to translate to.
        @ncbi_tax_id : int
            NCBI Taxonomy ID of the organism; it is passed through
            ``self.get_tax_id`` before use.
        @strict : bool
            If ``True``, the approximate gene symbol lookups described
            above are skipped.
        @silent : bool
            Not used by this method itself, only handed over to the
            recursive calls when ``nameType`` is a list or tuple.

        Returns the mapped names passed through ``common.uniqList``;
        if ``nameType`` and ``targetNameType`` are the same and not
        ``uniprot``, a list with ``name`` alone.
        """

        ncbi_tax_id = self.get_tax_id(ncbi_tax_id)

        if isinstance(nameType, (list, tuple)):
            mappedNames = []
            for nt in nameType:
                # keyword arguments are essential here: given
                # positionally, `strict` would be taken as `ncbi_tax_id`
                # and `silent` as `strict`
                mappedNames += self.map_name(name,
                                             nt,
                                             targetNameType,
                                             ncbi_tax_id=ncbi_tax_id,
                                             strict=strict,
                                             silent=silent)
            return common.uniqList(mappedNames)

        if nameType == targetNameType:
            if targetNameType != 'uniprot':
                return [name]
            else:
                # UniProt IDs still go through the clean-up below
                mappedNames = [name]
        elif nameType.startswith('refseq'):
            mappedNames = self.map_refseq(name,
                                          nameType,
                                          targetNameType,
                                          ncbi_tax_id=ncbi_tax_id,
                                          strict=strict)
        else:
            mappedNames = self._map_name(name, nameType, targetNameType,
                                         ncbi_tax_id)

        # fallbacks, each attempted only if nothing has been found so far
        if not len(mappedNames):
            mappedNames = self._map_name(name.upper(), nameType,
                                         targetNameType, ncbi_tax_id)
        if not len(mappedNames) and \
            nameType not in ('uniprot', 'trembl', 'uniprot-sec'):
            mappedNames = self._map_name(name.lower(), nameType,
                                         targetNameType, ncbi_tax_id)
        if not len(mappedNames) and nameType == 'genesymbol':
            mappedNames = self._map_name(name, 'genesymbol-syn',
                                         targetNameType, ncbi_tax_id)
            if not strict and not len(mappedNames):
                mappedNames = self._map_name('%s1' % name, 'genesymbol',
                                             targetNameType, ncbi_tax_id)
                if not len(mappedNames):
                    mappedNames = self._map_name(name, 'genesymbol5',
                                                 targetNameType, ncbi_tax_id)

        if not len(mappedNames) and nameType == 'mir-mat-name':

            mappedNames = self._map_name(name, 'mir-name', targetNameType,
                                         ncbi_tax_id)

        if targetNameType == 'uniprot':
            orig = mappedNames
            mappedNames = self.primary_uniprot(mappedNames)
            mappedNames = self.trembl_swissprot(mappedNames, ncbi_tax_id)
            # keep track of the IDs replaced by the two steps above
            if len(set(orig) - set(mappedNames)) > 0:
                self.uniprot_mapped.append((orig, mappedNames))
            mappedNames = [u for u in mappedNames if self.reup.match(u)]
        return common.uniqList(mappedNames)
Example #6
0
 def cleanDict(self, mapping):
     """
     Pass every value of ``mapping`` through ``common.uniqList``.

     The dictionary is modified in place and the same object is
     returned.
     """
     deduplicated = {
         key: common.uniqList(values) for key, values in iteritems(mapping)
     }
     # only existing keys are overwritten, none is added or removed
     mapping.update(deduplicated)
     return mapping