def synonyms2chembl(self, synonyms, like=True):
    """
    Looks up ChEMBL IDs for a list of compound synonyms and stores the
    result in `self.result` as a dict of synonym -> list of ChEMBL IDs.

    First runs two exact queries (synonym table and compound record by
    name); if `like` is True, synonyms still unmatched are retried with
    SQL LIKE patterns on the `name` and `key` fields.

    @synonyms : list
        Compound synonyms (names) to translate.
    @like : bool
        Whether to fall back to substring (LIKE) matching for synonyms
        not found by exact match.
    """
    self.result = {}
    # map lowercased synonym back to the caller's original spelling,
    # so results are keyed by the input strings
    syn_lower = dict(zip([s.lower() for s in synonyms], synonyms))
    # comma-separated, double-quoted list spliced into the SQL templates
    # NOTE(review): synonyms are interpolated into SQL without escaping —
    # assumes they contain no double quotes; confirm upstream sanitization
    syn_lst = ','.join(['"%s"' % syn for syn in synonyms])
    synq = self.comp_syn % syn_lst
    recq = self.comp_rec % ('name', ' IN (%s)' % syn_lst)
    # obtain query IDs, dispatch both queries asynchronously, then block
    # until both have completed
    synqid = self.mysql.get_qid(synq)
    recqid = self.mysql.get_qid(recq)
    self.mysql.send_query(synq, silent=True)
    self.mysql.send_query(recq, silent=True)
    self.mysql.wait_results([synqid, recqid])
    self.mysql_ready()
    # merge rows from both exact queries; rows with NULL chembl_id still
    # register the synonym (with an empty list) as "seen"
    for r in chain(self.mysql.get_result(synqid),
                   self.mysql.get_result(recqid)):
        syn = syn_lower[r['syn'].lower()]
        if syn not in self.result:
            self.result[syn] = []
        if r['chembl_id'] is not None:
            self.result[syn].append(r['chembl_id'])
    if like:
        like_results = {}
        # retry only synonyms with no exact hit; purely numeric strings
        # are skipped as LIKE patterns would match too broadly
        notfound = [
            n for n in list(set(synonyms) - set(self.result.keys()))
            if not n.isdigit()
        ]
        qids = {}
        trds = []  # NOTE(review): unused; likely leftover from a threaded version
        # one LIKE query per (field, synonym) pair; qids maps the query
        # ID back to the synonym it belongs to
        for field in ['name', 'key']:
            for syn in notfound:
                q = self.comp_rec % (field, ' LIKE "%%%s%%"' % syn)
                qid = self.mysql.get_qid(q)
                qids[qid] = syn
                self.mysql.send_query(q, silent=True)
        self.mysql.wait_results(qids.keys())
        self.mysql_ready()
        # collect each query's non-NULL ChEMBL IDs as one candidate list
        # per synonym (so each synonym has up to two lists: name and key)
        for qid, syn in iteritems(qids):
            res = self.mysql.get_result(qid)
            this_result = []
            for r in res:
                if r['chembl_id'] is not None:
                    this_result.append(r['chembl_id'])
            if syn not in like_results:
                like_results[syn] = []
            like_results[syn].append(this_result)
        for syn, results in iteritems(like_results):
            # choosing the shortest returned list of ChEMBL IDs
            if len(results) > 0 and syn not in self.result:
                results = [common.uniqList(r) for r in results]
                # keep the shortest non-empty candidate list
                # NOTE(review): relies on `and` binding tighter than `or`;
                # verify the intended grouping is
                # len(y)==0 or (len(x)<len(y) and len(x)>0)
                self.result[syn] = reduce(
                    lambda x, y: x if len(y) == 0 or len(x) < len(y) and len(x) > 0 else y,
                    results)
    # final deduplication of every ID list
    self.result = dict([(k, common.uniqList(v))
                        for k, v in iteritems(self.result)])
def map_names(self, names, nameType, targetNameType,
              ncbi_tax_id=None, strict=False, silent=True):
    """
    Batch version of `map_name`: translates every ID in `names` and
    returns the unique union of all mapped identifiers.
    """
    mapped = []
    for one_name in names:
        mapped.extend(
            self.map_name(one_name, nameType, targetNameType,
                          ncbi_tax_id=ncbi_tax_id,
                          strict=strict,
                          silent=silent))
    return common.uniqList(mapped)
def write_set(self, id_list, setname, id_type, map_ids=True):
    """
    Stores `id_list` as a set under `self.sets[setname]`, optionally
    translating each identifier to the target ID type via the mapper.
    """
    if map_ids:
        translated = (
            self.mapper.map_name(n, self.ids[id_type], self.target_id)
            for n in id_list
        )
        members = set(common.uniqList(common.flatList(translated)))
    else:
        members = set(id_list)
    self.sets[setname] = members
def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    :param pp: ``pypath.PyPath`` object
    :param cachefile: Path of the pickle cache of PubMed records;
        defaults to the ``pubmed_cache`` setting.
    :param htp_threshold: The number of interactions for one reference
        above which the study is considered to be high-throughput
        (such references are excluded).
    :return: Tuple of two ``pandas.DataFrame``: per-reference records
        (database, pmid, year, journal, edge index) and the earliest
        publication year per (database, edge).
    """
    if cachefile is None:
        cachefile = settings.get('pubmed_cache')

    if htp_threshold is not None:
        pp.htp_stats()

    # all PubMed IDs referenced by any edge of the network
    pubmeds = common.uniqList(
        common.flatList([[r.pmid for r in e['references']]
                         for e in pp.graph.es]))

    if htp_threshold is not None:
        pubmeds = set(pubmeds) - pp.htp[htp_threshold]['htrefs']

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}

    if os.path.exists(cachefile):
        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)
        # BUGFIX: the original passed an anonymous open() to pickle.load /
        # pickle.dump and never closed the handle; use context managers
        with open(cachefile, 'rb') as fp:
            pmdata = pickle.load(fp)

    missing = list(set(pubmeds) - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = dataio.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)
    # only rewrite the cache if anything new arrived
    if len(pmdata) > cached_pubmeds_len:
        with open(cachefile, 'wb') as fp:
            pickle.dump(pmdata, fp)

    # drop cached records not referenced by this network
    pmdata = dict(i for i in pmdata.items() if i[0] in pubmeds)

    points = []
    earliest = []

    for e in pp.graph.es:
        for s, rs in iteritems(e['refs_by_source']):
            # usable references: below the HTP threshold, present in the
            # downloaded metadata, and carrying a publication date
            pms = [
                r.pmid for r in rs
                if (htp_threshold is None or
                    r.pmid not in pp.htp[htp_threshold]['htrefs']) and
                r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]
            if len(pms) > 0:
                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                earliest.append((s, 0, min(yrs), '', e.index))
                for pm in pms:
                    points.append((s, pm, int(pmdata[pm]['pubdate'][:4]),
                                   pmdata[pm]['source'], e.index))

    points = common.uniqList(points)
    earliest = common.uniqList(earliest)

    points = pd.DataFrame.from_records(points)
    earliest = pd.DataFrame.from_records(earliest)
    points.columns = ['database', 'pmid', 'year', 'journal', 'eid']
    earliest.columns = ['database', 'none', 'year', 'none', 'eid']

    return points, earliest
def map_name(self, name, nameType, targetNameType,
             ncbi_tax_id=None, strict=False, silent=True):
    r"""
    This function should be used to convert individual IDs.
    It takes care about everything, you don't need to think
    on the details. How does it work: looks up dictionaries
    between the original and target ID type, if doesn't find,
    attempts to load from the predefined inputs.
    If the original name is genesymbol, first it looks up
    among the preferred gene names from UniProt, if not
    found, it takes an attempt with the alternative gene
    names. If the gene symbol still couldn't be found, and
    strict = False, the last attempt only the first 5 chara-
    cters of the gene symbol matched. If the target name
    type is uniprot, then it converts all the ACs to primary.
    Then, for the Trembl IDs it looks up the preferred gene
    names, and find Swissprot IDs with the same preferred
    gene name.

    @name : str
        The original name which shall be converted.
    @nameType : str
        The type of the name. Available by default:
        - genesymbol (gene name)
        - entrez (Entrez Gene ID \[#\])
        - refseqp (NCBI RefSeq Protein ID \[NP\_\*|XP\_\*\])
        - ensp (Ensembl protein ID \[ENSP\*\])
        - enst (Ensembl transcript ID \[ENST\*\])
        - ensg (Ensembl genomic DNA ID \[ENSG\*\])
        - hgnc (HGNC ID \[HGNC:#\])
        - gi (GI number \[#\])
        - embl (DDBJ/EMBL/GeneBank CDS accession)
        - embl_id (DDBJ/EMBL/GeneBank accession)
        To use other IDs, you need to define the input method
        and load the table before calling
        :py:func:Mapper.map_name().
    """
    ncbi_tax_id = self.get_tax_id(ncbi_tax_id)

    # a list of name types: take the union of mappings over all of them
    if type(nameType) is list:
        mappedNames = []
        for nt in nameType:
            # BUGFIX: the original passed `strict` and `silent`
            # positionally, so `strict` landed in the `ncbi_tax_id`
            # parameter and `silent` in `strict`; pass by keyword.
            mappedNames += self.map_name(name, nt, targetNameType,
                                         ncbi_tax_id=ncbi_tax_id,
                                         strict=strict,
                                         silent=silent)
        return common.uniqList(mappedNames)

    if nameType == targetNameType:
        if targetNameType != 'uniprot':
            # identity mapping, nothing to do
            return [name]
        else:
            # uniprot -> uniprot still goes through primary/Swissprot
            # normalization below
            mappedNames = [name]
    elif nameType.startswith('refseq'):
        mappedNames = self.map_refseq(name, nameType, targetNameType,
                                      ncbi_tax_id=ncbi_tax_id,
                                      strict=strict)
    else:
        mappedNames = self._map_name(name, nameType,
                                     targetNameType, ncbi_tax_id)

    # fall back to case variants (UniProt-like IDs are case sensitive,
    # so lowercase is not attempted for those)
    if not len(mappedNames):
        mappedNames = self._map_name(name.upper(), nameType,
                                     targetNameType, ncbi_tax_id)

    if not len(mappedNames) and \
            nameType not in set(['uniprot', 'trembl', 'uniprot-sec']):
        mappedNames = self._map_name(name.lower(), nameType,
                                     targetNameType, ncbi_tax_id)

    # gene symbols: try synonyms, then (non-strict only) the "<name>1"
    # convention and finally the first-5-characters table
    if not len(mappedNames) and nameType == 'genesymbol':
        mappedNames = self._map_name(name, 'genesymbol-syn',
                                     targetNameType, ncbi_tax_id)
        if not strict and not len(mappedNames):
            mappedNames = self._map_name('%s1' % name, 'genesymbol',
                                         targetNameType, ncbi_tax_id)
            if not len(mappedNames):
                mappedNames = self._map_name(name, 'genesymbol5',
                                             targetNameType, ncbi_tax_id)

    if not len(mappedNames) and nameType == 'mir-mat-name':
        mappedNames = self._map_name(name, 'mir-name',
                                     targetNameType, ncbi_tax_id)

    if targetNameType == 'uniprot':
        orig = mappedNames
        # convert secondary ACs to primary, Trembl to Swissprot where
        # a shared preferred gene name exists
        mappedNames = self.primary_uniprot(mappedNames)
        mappedNames = self.trembl_swissprot(mappedNames, ncbi_tax_id)
        if len(set(orig) - set(mappedNames)) > 0:
            self.uniprot_mapped.append((orig, mappedNames))
        # keep only strings that look like valid UniProt ACs
        mappedNames = [u for u in mappedNames if self.reup.match(u)]

    return common.uniqList(mappedNames)
def cleanDict(self, mapping):
    """
    Deduplicates every value list of `mapping` in place and returns
    the same dictionary.
    """
    for key in list(mapping.keys()):
        mapping[key] = common.uniqList(mapping[key])
    return mapping