def validate(self, build_config='mygene_allspecies', n=10):
    '''Validate merged genedoc, currently for ES backend only.'''
    import random
    import itertools
    import pyes
    from pprint import pprint

    self.load_build_config(build_config)
    last_build = self._build_config['build'][-1]
    print "Last build record:"
    pprint(last_build)
    #assert last_build['target_backend'] == 'es', '"validate" currently works for "es" backend only'
    target_name = last_build['target']
    self.validate_src_collections()
    self.prepare_target(target_name=target_name)
    print "Validating..."
    target_cnt = self.target.count()
    stats_cnt = last_build['stats']['total_genes']
    if target_cnt == stats_cnt:
        print "OK [total count={}]".format(target_cnt)
    else:
        print "Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, stats_cnt)

    if n > 0:
        for src in self._build_config['sources']:
            print "\nSrc:", src
            # if 'id_type' in self.src_master[src] and self.src_master[src]['id_type'] != 'entrez_gene':
            #     print "skipped."
            #     continue
            cnt = self.src[src].count()
            # check the last n docs plus a random window of n docs
            fdr1 = doc_feeder(self.src[src], step=10000, s=cnt - n)
            rand_s = random.randint(0, cnt - n)
            fdr2 = doc_feeder(self.src[src], step=n, s=rand_s, e=rand_s + n)
            _first_exception = True
            for doc in itertools.chain(fdr1, fdr2):
                _id = doc['_id']
                try:
                    es_doc = self.target.get_from_id(_id)
                except pyes.exceptions.NotFoundException:
                    if _first_exception:
                        print
                        _first_exception = False
                    print _id, 'not found.'
                    continue
                for k in doc:
                    if src == 'entrez_homologene' and k == 'taxid':
                        # there is an occasional known error for taxid in homologene data
                        continue
                    assert es_doc.get(k, None) == doc[k], (_id, k, es_doc.get(k, None), doc[k])
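# A hypothetical usage sketch for validate() above; the builder class name and
# construction are assumptions, not the actual API:
#
#   builder = DataBuilder()                            # assumed builder class
#   builder.validate(build_config='mygene_allspecies', n=10)
#
# This cross-checks the merged ES index against each source collection, sampling
# the last n docs plus a random window of n docs per source.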
def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True,
                            update=False, allow_upsert=True):
    from utils.mongo import doc_feeder

    def rate_control(cnt, t):
        delay = 0
        if t > 90:
            delay = 30
        elif t > 60:
            delay = 10
        if delay:
            print("\tPausing for {}s...".format(delay), end='')
            time.sleep(delay)
            print("done.")

    src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
    if bulk:
        if update:
            # input doc will update the existing one;
            # if allow_upsert, create a new one if it does not exist
            res = self.update_docs(src_docs, upsert=allow_upsert)
        else:
            # input doc will overwrite the existing one
            res = self.index_bulk(src_docs)
        if len(res[1]) > 0:
            print("Error: {} docs failed indexing.".format(len(res[1])))
        return res[0]
    else:
        cnt = 0
        for doc in src_docs:
            self.index(doc)
            cnt += 1
            if verbose:
                print(cnt, ':', doc['_id'])
        return cnt
def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True):
    def rate_control(cnt, t):
        delay = 0
        if t > 90:
            delay = 30
        elif t > 60:
            delay = 10
        if delay:
            print("\tPausing for {}s...".format(delay), end='')
            time.sleep(delay)
            print("done.")

    src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
    if bulk:
        res = self.index_bulk(src_docs)
        if len(res[1]) > 0:
            print("Error: {} docs failed indexing.".format(len(res[1])))
        return res[0]
    else:
        cnt = 0
        for doc in src_docs:
            self.index(doc)
            cnt += 1
            if verbose:
                print(cnt, ':', doc['_id'])
        return cnt
def do_index(doc_li, index_name, doc_type, step=1000, update=True, verbose=True):
    for doc_batch in doc_feeder(doc_li, step=step, verbose=verbose):
        _index_doc_batch(doc_batch, index_name, doc_type, update=update)
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']
    print "\t{}\trecords will be added.".format(len(changes['add']))
    print "\t{}\trecords will be deleted.".format(len(changes['delete']))
    print "\t{}\trecords will be updated.".format(len(changes['update']))
    print
    print '\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name, sync_src.name)
    print '\tsync_target:\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name)

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print "Adding {} new records...".format(len(changes['add']))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['delete']) > 0:
            print "Deleting {} old records...".format(len(changes['delete']))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['update']) > 0:
            print "Updating {} existing records...".format(len(changes['update']))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        print '=' * 20
        print 'Finished. [{}]'.format(timesofar(t00))
def do_index_from_collection(collection, index_name, doc_type, skip, step=10000, update=True):
    from utils.mongo import doc_feeder
    for doc_batch in doc_feeder(collection, step=step, s=skip, inbatch=True):
        _index_doc_batch(doc_batch, index_name, doc_type, update=update)
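# _index_doc_batch is referenced by the indexing helpers above but not defined
# in this section. Below is a minimal sketch of one possible implementation,
# assuming the elasticsearch-py client and its bulk helper; the client handle
# and upsert semantics here are assumptions, not the project's actual code.
def _index_doc_batch(doc_batch, index_name, doc_type, update=True):
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk
    es = Elasticsearch()          # assumed default localhost:9200 connection
    actions = []
    for doc in doc_batch:
        doc = dict(doc)
        _id = doc.pop('_id')      # _id is bulk-action metadata, not doc body
        if update:
            # partial update; create the doc if it does not exist yet
            actions.append({'_op_type': 'update', '_index': index_name,
                            '_type': doc_type, '_id': _id,
                            'doc': doc, 'doc_as_upsert': True})
        else:
            # plain index op: overwrite any existing doc with the same _id
            actions.append({'_op_type': 'index', '_index': index_name,
                            '_type': doc_type, '_id': _id, '_source': doc})
    return bulk(es, actions)      # returns (success_count, error_list)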
def validate_src(self, collection, return_false=False, return_none=False,
                 return_true=False, verbose=False, flag_invalid=False):
    '''Validate hgvs ids from a src collection.'''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)

    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three possible keys: True, False and None
        for k in return_dict:
            if return_dict[k]:
                out[k] = []

    # initialize the counts
    cnt_d = {True: 0, False: 0, None: 0}

    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid is False and flag_invalid:
            # flag on the resolved collection object, since "collection"
            # may have been passed in as a name string
            _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)

    # print out the counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t{0}".format(cnt_d[None]))

    out['summary'] = cnt_d
    return out
def _merge_sequential(self, collection, geneid_set, step=100000, idmapping_d=None):
    for doc in doc_feeder(self.src[collection], step=step):
        _id = doc['_id']
        if idmapping_d:
            _id = idmapping_d.get(_id, None) or _id
        for __id in alwayslist(_id):
            # there could be cases that idmapping returns multiple entrez_gene ids
            __id = str(__id)
            if __id in geneid_set:
                doc.pop('_id', None)
                doc.pop('taxid', None)
                # target_collection.update({'_id': __id}, {'$set': doc},
                #                          manipulate=False,
                #                          upsert=False)  # ,safe=True)
                self.target.update(__id, doc)
def _merge_parallel_ipython(self, collection, geneid_set, step=100000, idmapping_d=None):
    from IPython.parallel import Client, require
    rc = Client()
    dview = rc[:]
    #dview = rc.load_balanced_view()
    dview.block = False
    target_collection = self.target.target_collection
    # push connection parameters to the engines
    dview['server'] = target_collection.database.connection.host
    dview['port'] = target_collection.database.connection.port
    dview['database'] = target_collection.database.name
    dview['collection_name'] = target_collection.name

    def partition(lst, n):
        # split lst into n chunks of near-equal size,
        # e.g. partition(range(5), 2) -> [[0, 1, 2], [3, 4]]
        q, r = divmod(len(lst), n)
        indices = [q * i + min(i, r) for i in xrange(n + 1)]
        return [lst[indices[i]:indices[i + 1]] for i in xrange(n)]

    @require('mongokit', 'time')
    def worker(doc_li):
        conn = mongokit.Connection(server, port)
        target_collection = conn[database][collection_name]
        print "len(doc_li): {}".format(len(doc_li))
        t0 = time.time()
        for doc in doc_li:
            __id = doc.pop('_id')
            doc.pop('taxid', None)
            target_collection.update({'_id': __id}, {'$set': doc},
                                     manipulate=False, upsert=False)  # ,safe=True)
        print 'Done. [%.1fs]' % (time.time() - t0)

    for doc in doc_feeder(self.src[collection], step=step):
        _id = doc['_id']
        if idmapping_d:
            _id = idmapping_d.get(_id, None) or _id
        for __id in alwayslist(_id):
            # there could be cases that idmapping returns multiple entrez_gene ids
            __id = str(__id)
            if __id in geneid_set:
                doc['_id'] = __id
                self.doc_queue.append(doc)
                if len(self.doc_queue) >= step:
                    #dview.scatter('doc_li', self.doc_queue)
                    #dview.apply_async(worker)
                    dview.map_async(worker, partition(self.doc_queue, len(rc.ids)))
                    self.doc_queue = []
                    print "!",
def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True,
                            update=False, allow_upsert=True):
    from utils.mongo import doc_feeder

    def rate_control(cnt, t):
        delay = 0
        if t > 90:
            delay = 30
        elif t > 60:
            delay = 10
        if delay:
            print("\tPausing for {}s...".format(delay), end='')
            time.sleep(delay)
            print("done.")

    src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
    if bulk:
        if update:
            # input doc will update the existing one;
            # if allow_upsert, create a new one if it does not exist
            res = self.update_docs(src_docs, upsert=allow_upsert)
        else:
            # input doc will overwrite the existing one
            res = self.index_bulk(src_docs)
        if len(res[1]) > 0:
            print("Error: {} docs failed indexing.".format(len(res[1])))
            file_name = collection + '_es_error.pyobj'
            dump(res, file_name)
        return res[0]
    else:
        cnt = 0
        for doc in src_docs:
            self.index(doc)
            cnt += 1
            if verbose:
                print(cnt, ':', doc['_id'])
        return cnt
def validate_src(self, collection, return_false=False, return_none=False,
                 return_true=False, verbose=False, flag_invalid=False,
                 generator=False):   # "generator" is accepted but unused in this version
    '''Validate hgvs ids from a src collection.'''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)

    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three possible keys: True, False and None
        for k in return_dict:
            if return_dict[k]:
                out[k] = []

    # initialize the counts
    cnt_d = {True: 0, False: 0, None: 0}

    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid is False and flag_invalid:
            # flag on the resolved collection object, since "collection"
            # may have been passed in as a name string
            _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)

    # print out the counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t{0}".format(cnt_d[None]))

    out['summary'] = cnt_d
    return out
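# A hypothetical usage sketch for validate_src() above (the validator class
# and collection name are assumptions):
#
#   validator = VariantValidator()                      # assumed class
#   res = validator.validate_src('dbsnp', return_false=True)
#   print(res['summary'])    # e.g. {True: 9500, False: 400, None: 100}
#   bad_ids = res[False]     # invalid hgvs ids collected via return_false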
def _merge_parallel(self, collection, geneid_set, step=100000, idmapping_d=None):
    from multiprocessing import Process, Queue
    NUMBER_OF_PROCESSES = 8

    input_queue = Queue()
    input_queue.conn_pool = []

    def worker(q, target):
        while True:
            doc = q.get()
            if doc == 'STOP':
                break
            __id = doc.pop('_id')
            doc.pop('taxid', None)
            target.update(__id, doc)
            # target_collection.update({'_id': __id}, {'$set': doc},
            #                          manipulate=False,
            #                          upsert=False)  # ,safe=True)

    # Start worker processes
    for i in range(NUMBER_OF_PROCESSES):
        Process(target=worker, args=(input_queue, self.target)).start()

    for doc in doc_feeder(self.src[collection], step=step):
        _id = doc['_id']
        if idmapping_d:
            _id = idmapping_d.get(_id, None) or _id
        for __id in alwayslist(_id):
            # there could be cases that idmapping returns multiple entrez_gene ids
            __id = str(__id)
            if __id in geneid_set:
                doc['_id'] = __id
                input_queue.put(doc)

    # Tell child processes to stop
    for i in range(NUMBER_OF_PROCESSES):
        input_queue.put('STOP')
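# Note that _merge_parallel above fires and forgets its workers: it queues one
# 'STOP' sentinel per process but never joins them. A minimal variant with
# explicit cleanup (a sketch, not the original behavior) would keep the
# Process handles and join after sending the sentinels:
#
#   procs = [Process(target=worker, args=(input_queue, self.target))
#            for _ in range(NUMBER_OF_PROCESSES)]
#   for p in procs:
#       p.start()
#   ...feed input_queue as above...
#   for _ in procs:
#       input_queue.put('STOP')
#   for p in procs:
#       p.join()   # block until every worker has drained its share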
def do_index_from_collection_0(collection, index_name, doc_type, skip, step=10000, update=True):
    from utils.mongo import doc_feeder
    for doc_batch in doc_feeder(collection, step=step, s=skip, inbatch=True):
        _index_doc_batch(doc_batch, index_name, doc_type, update=update)
def make_genedoc_root(self):
    if not self._entrez_geneid_d:
        self._load_entrez_geneid_d()

    if 'ensembl_gene' in self._build_config['gene_root']:
        self._load_ensembl2entrez_li()
        ensembl2entrez = self._idmapping_d_cache['ensembl_gene']

    if "species" in self._build_config:
        _query = {'taxid': {'$in': self._build_config['species']}}
    elif "species_to_exclude" in self._build_config:
        _query = {'taxid': {'$nin': self._build_config['species_to_exclude']}}
    else:
        _query = None

    geneid_set = []
    species_set = set()
    # initialize all counters so the stats dict below is always well-defined,
    # even when a given gene root source is not configured
    cnt_total_entrez_genes = 0
    cnt_total_species = 0
    cnt_total_ensembl_genes = 0
    cnt_matching_ensembl_genes = 0
    cnt_ensembl_only_genes = 0

    if "entrez_gene" in self._build_config['gene_root']:
        for doc_li in doc_feeder(self.src['entrez_gene'], inbatch=True, step=self.step, query=_query):
            #target_collection.insert(doc_li, manipulate=False, check_keys=False)
            self.target.insert(doc_li)
            geneid_set.extend([doc['_id'] for doc in doc_li])
            species_set |= set([doc['taxid'] for doc in doc_li])
        cnt_total_entrez_genes = len(geneid_set)
        cnt_total_species = len(species_set)
        print '# of entrez Gene IDs in total: %d' % cnt_total_entrez_genes
        print '# of species in total: %d' % cnt_total_species

    if "ensembl_gene" in self._build_config['gene_root']:
        for doc_li in doc_feeder(self.src['ensembl_gene'], inbatch=True, step=self.step, query=_query):
            _doc_li = []
            for _doc in doc_li:
                cnt_total_ensembl_genes += 1
                ensembl_id = _doc['_id']
                entrez_gene = ensembl2entrez.get(ensembl_id, None)
                if entrez_gene is None:
                    # this is an Ensembl-only gene
                    _doc_li.append(_doc)
                    cnt_ensembl_only_genes += 1
                    geneid_set.append(_doc['_id'])
            if _doc_li:
                #target_collection.insert(_doc_li, manipulate=False, check_keys=False)
                self.target.insert(_doc_li)
        cnt_matching_ensembl_genes = cnt_total_ensembl_genes - cnt_ensembl_only_genes
        print '# of ensembl Gene IDs in total: %d' % cnt_total_ensembl_genes
        print '# of ensembl Gene IDs match entrez Gene IDs: %d' % cnt_matching_ensembl_genes
        print '# of ensembl Gene IDs DO NOT match entrez Gene IDs: %d' % cnt_ensembl_only_genes

    geneid_set = set(geneid_set)
    print '# of total Root Gene IDs: %d' % len(geneid_set)

    _stats = {'total_entrez_genes': cnt_total_entrez_genes,
              'total_species': cnt_total_species,
              'total_ensembl_genes': cnt_total_ensembl_genes,
              'total_ensembl_genes_mapped_to_entrez': cnt_matching_ensembl_genes,
              'total_ensembl_only_genes': cnt_ensembl_only_genes,
              'total_genes': len(geneid_set)}
    self._stats = _stats
    self._src_version = self.get_src_version()
    self.log_src_build({'stats': _stats, 'src_version': self._src_version})
    return geneid_set