def doc_feeder(self, index_type=None, index_name=None, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
    conn = self.conn
    index_name = index_name or self.ES_INDEX_NAME
    doc_type = index_type or self.ES_INDEX_TYPE
    n = self.count(query=query)['count']
    cnt = 0
    t0 = time.time()
    if verbose:
        print('\ttotal docs: {}'.format(n))

    _kwargs = kwargs.copy()
    _kwargs.update(dict(size=step, index=index_name, doc_type=doc_type))
    res = helpers.scan(conn, query=query, scroll=scroll, **_kwargs)
    t1 = time.time()
    for doc in res:
        if verbose and cnt % step == 0:
            if cnt != 0:
                print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
            print('\t{}-{}...'.format(cnt+1, min(cnt+step, n)), end='')
            t1 = time.time()
        yield doc
        cnt += 1
    if verbose:
        print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
        print("Finished! [{}]".format(timesofar(t0)))
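# Every example in this file relies on a small timing helper, timesofar(t0), imported
# from the project's common utils. A minimal sketch of such a helper is shown here for
# reference only; the exact formatting in the real codebase may differ (this sketch is
# an assumption, not the original implementation).
import time

def timesofar(t0):
    """Return the elapsed wall-clock time since t0 as a short human-readable string."""
    t = time.time() - t0
    h, rem = divmod(t, 3600)
    m, s = divmod(rem, 60)
    parts = []
    if h:
        parts.append('%dh' % h)
    if m:
        parts.append('%dm' % m)
    parts.append('%.1fs' % s)
    return ':'.join(parts)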
def load_contig(contig):
    '''Save one CADD contig into the "cadd" mongodb collection.
       contig should be a chromosome/contig name understood by the tabix index.
    '''
    # if CADD_INPUT == "exome":
    #     CADD_INPUT = exome
    tabix = pysam.Tabixfile(whole_genome)
    src_db = get_src_db()
    target_coll = src_db["cadd"]
    t0 = time.time()
    cnt = 0
    docs = (doc for doc in fetch_generator(tabix, contig))
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
        cnt += 1
        if len(doc_list) == 100:
            target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
            doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    if doc_list:
        target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
def get_genome_in_bit(chr_fa_folder):
    '''Encode each chromosome fasta sequence into a bitarray, and store them
       in a dictionary with chr numbers as keys.

       chr_fa_folder is the folder holding all gzipped fasta files; they can be
       downloaded from the NCBI FTP site:
       ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/
           chr<i>.fa.gz (e.g. chr1.fa.gz)
    '''
    chr_bit_d = {}
    chr_range = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
    t0 = time.time()
    for i in chr_range:
        t1 = time.time()
        #file_name = 'hs_ref_GRCh37.p5_chr{}.fa.gz'.format(i)
        file_name = 'chr{}.fa.gz'.format(i)
        print("Loading {}...".format(file_name), end='')
        file_name = os.path.join(chr_fa_folder, file_name)
        with open_anyfile(file_name) as seq_f:
            seq_f.readline()   # skip header
            seq_bit = bitarray()
            for line in seq_f:
                line = line.rstrip('\n')
                line_bit = nuc_to_bit(line)
                seq_bit += line_bit
            chr_bit_d.update({i: seq_bit})
        print("done.[{}]".format(timesofar(t1)))
    print('=' * 20)
    print("Finished. [{}]".format(timesofar(t0)))
    return chr_bit_d
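# The function above depends on nuc_to_bit(), which converts one line of fasta sequence
# into a bitarray. Its real implementation is not shown in this file; the sketch below is
# only a plausible stand-in, assuming a fixed-width 4-bit code per base (the encoding
# table here is hypothetical, not the project's actual one).
from bitarray import bitarray

_NUC_CODE = {c: bitarray(format(i, '04b'))
             for i, c in enumerate('ACGTN')}   # hypothetical 4-bit code per base

def nuc_to_bit(seq):
    """Encode a nucleotide string into a bitarray, 4 bits per base."""
    bits = bitarray()
    for base in seq.upper():
        bits += _NUC_CODE.get(base, _NUC_CODE['N'])   # unknown bases fall back to 'N'
    return bits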
def redo_parse_gbff(path):
    '''Call this function manually to re-start the parsing step and set src_dump.
       This is used when main() breaks at the parsing step, so that parsing can be
       re-started after the fix.
    '''
    #mark the download starts
    src_dump = get_src_dump()
    t0 = time.time()
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def doc_feeder(collection, step=1000, s=None, e=None, inbatch=False, query=None, batch_callback=None, fields=None):
    '''An iterator for returning docs in a collection, with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(collection, query={'taxid': {'$in': [9606, 10090, 10116]}})
       batch_callback is a callback function as fn(cnt, t), called after every batch.
       fields is an optional parameter passed to find to restrict the fields to return.
    '''
    src = get_src_db()
    if type(collection) == str:
        cur = src[collection].find()
    else:
        cur = collection.find()
    n = cur.count()
    s = s or 0
    e = e or n
    print('Retrieving {} documents from database "{}".'.format(n, collection))
    t0 = time.time()
    if inbatch:
        doc_li = []
    cnt = 0
    t1 = time.time()
    try:
        if s:
            cur.skip(s)
            cnt = s
            print("Skipping {} documents.".format(s))
        if e:
            cur.limit(e - (s or 0))
        cur.batch_size(step)
        print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        for doc in cur:
            if inbatch:
                doc_li.append(doc)
            else:
                yield doc
            cnt += 1
            if cnt % step == 0:
                if inbatch:
                    yield doc_li
                    doc_li = []
                print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
                if batch_callback:
                    batch_callback(cnt, time.time() - t1)
                if cnt < e:
                    t1 = time.time()
                    print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        if inbatch and doc_li:
            #Important: need to yield the last batch here
            yield doc_li
        #print 'Done.[%s]' % timesofar(t1)
        print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("=" * 20)
        print('Finished.[total time: {}]'.format(timesofar(t0)))
    finally:
        cur.close()
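# A hypothetical way this mongodb doc_feeder might be called from ingestion code.
# The collection name 'entrez_gene' is made up for illustration; the query mirrors
# the example given in the docstring above.
src_db = get_src_db()
human_query = {'taxid': {'$in': [9606, 10090, 10116]}}

# stream single documents
for doc in doc_feeder(src_db['entrez_gene'], step=1000, query=human_query):
    print(doc['_id'])

# or stream them in batches of up to 1000 documents at a time
for batch in doc_feeder(src_db['entrez_gene'], step=1000, inbatch=True, query=human_query):
    print(len(batch))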
def doc_feeder(self, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
    q = query if query else {'query': {'match_all': {}}}
    _q_cnt = self.count(q=q, raw=True)
    n = _q_cnt['count']
    n_shards = _q_cnt['_shards']['total']
    assert n_shards == _q_cnt['_shards']['successful']
    _size = int(step / n_shards)
    assert _size * n_shards == step
    cnt = 0
    t0 = time.time()
    if verbose:
        print('\ttotal docs: {}'.format(n))
        t1 = time.time()

    res = self._es.search(self._index, self._doc_type, body=q,
                          size=_size, search_type='scan', scroll=scroll, **kwargs)
    # double check initial scroll request returns no hits
    assert len(res['hits']['hits']) == 0

    while 1:
        if verbose:
            t1 = time.time()
            if cnt < n:
                print('\t{}-{}...'.format(cnt + 1, min(cnt + step, n)), end='')
        res = self._es.scroll(res['_scroll_id'], scroll=scroll)
        if len(res['hits']['hits']) == 0:
            break
        else:
            for doc in res['hits']['hits']:
                _doc = doc.get('_source', {})
                # "_id" field is not stored by default
                # so it may not be returned in _source
                _doc.setdefault("_id", doc["_id"])
                yield _doc
                cnt += 1
            if verbose:
                print('done.[%.1f%%,%s]' % (min(cnt, n) * 100. / n, timesofar(t1)))

    if verbose:
        print("Finished! [{}]".format(timesofar(t0)))
    assert cnt == n, "Error: scroll query terminated early [{}, {}], please retry.\nLast response:\n{}".format(cnt, n, res)
def doc_feeder(collection, step=1000, s=None, e=None, inbatch=False, query=None, batch_callback=None, fields=None):
    '''An iterator for returning docs in a collection, with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(collection, query={'taxid': {'$in': [9606, 10090, 10116]}})
       batch_callback is a callback function as fn(cnt, t), called after every batch.
       fields is an optional parameter passed to find to restrict the fields to return.
    '''
    src = get_src_db()
    cur = src[collection].find()
    n = cur.count()
    s = s or 0
    e = e or n
    print('Retrieving {} documents from database "{}".'.format(n, collection))
    t0 = time.time()
    if inbatch:
        doc_li = []
    cnt = 0
    t1 = time.time()
    try:
        if s:
            cur.skip(s)
            cnt = s
            print("Skipping {} documents.".format(s))
        if e:
            cur.limit(e - (s or 0))
        cur.batch_size(step)
        print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        for doc in cur:
            if inbatch:
                doc_li.append(doc)
            else:
                yield doc
            cnt += 1
            if cnt % step == 0:
                if inbatch:
                    yield doc_li
                    doc_li = []
                print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
                if batch_callback:
                    batch_callback(cnt, time.time() - t1)
                if cnt < e:
                    t1 = time.time()
                    print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        if inbatch and doc_li:
            #Important: need to yield the last batch here
            yield doc_li
        #print 'Done.[%s]' % timesofar(t1)
        print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("=" * 20)
        print('Finished.[total time: {}]'.format(timesofar(t0)))
    finally:
        cur.close()
def main(self, index, collection, diff_filepath, validate=False, wait=60):
    self._index = index
    self._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_list = self.add(source_collection, diff['add'])
    delete_list = self.delete(collection, diff['delete'])
    update_list = self.update(diff['update'])
    t00 = time()
    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    bulk(self._es, add_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    bulk(self._es, delete_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    bulk(self._es, update_list)
    print("Done. [{}]".format(timesofar(t0)))
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        self._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print("Processing %d-%d documents..." % (i + 1, min(i + step, n)))
        _ids = id_list[i:i + step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print('Done.[%.1f%%,%s]' % (i * 100. / n, timesofar(t1)))
    print("=" * 20)
    print('Finished.[total time: %s]' % timesofar(t0))
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print "Processing %d-%d documents..." % (i + 1, min(i + step, n)),
        _ids = id_list[i:i+step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print 'Done.[%.1f%%,%s]' % (i*100./n, timesofar(t1))
    print "="*20
    print 'Finished.[total time: %s]' % timesofar(t0)
def load_source(collection_name, src_module=None, src_data=None, inbatch=True, new_collection=True):
    '''Save src data into a mongodb collection.
       If src_module is provided, src_data = src_module.load_data();
       otherwise src_data is used directly and should be an iterable.
       If new_collection is True, the target collection is required to be empty.
    '''
    src_db = get_src_db()
    target_coll = src_db[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return

    t0 = time.time()
    cnt = 0
    if src_module:
        src_data = src_module.load_data()
    if src_data:
        doc_list = []
        for doc in src_data:
            cnt += 1
            if not inbatch:
                target_coll.insert(doc, manipulate=False, check_keys=False, w=0)
            else:
                doc_list.append(doc)
                if len(doc_list) == 100:
                    target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
                    doc_list = []
            if cnt % 100000 == 0:
                print(cnt, timesofar(t0))
        if doc_list:
            target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
        print("successfully loaded %s into mongodb" % collection_name)
        print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
    else:
        print("Error: no src data to load.")
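# load_source() is typically handed either a data-source module exposing load_data(),
# or a plain iterable/generator of documents. A hypothetical call with a stand-in
# generator (collection name and document contents are made up for illustration):
def _demo_docs():
    for i in range(5):
        yield {'_id': 'doc%d' % i, 'value': i}

load_source('demo_temp_collection', src_data=_demo_docs(), new_collection=True)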
def apply_changes(self, changes, verify=True, noconfirm=False):
    if verify:
        self.pre_verify_changes(changes)

    if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
        print("Aborted.")
        return -1
    #src = self.get_source_collection(changes)
    step = self.step
    _db = get_target_db()
    source_col = _db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocESBackend(self)
    _timestamp = changes['timestamp']

    def _add_docs(ids):
        i = 0
        for _ids in iter_n(ids, step):
            t1 = time.time()
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
                i += 1
            target.insert(_doc_li)
            print('\t{}\t{}'.format(i, timesofar(t1)))

    t0 = time.time()
    if changes['add']:
        print("Adding {} new docs...".format(len(changes['add'])))
        t00 = time.time()
        _add_docs(changes['add'])
        print("done. [{}]".format(timesofar(t00)))
    if changes['delete']:
        print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
        t00 = time.time()
        target.remove_from_ids(changes['delete'], step=step)
        print("done. [{}]".format(timesofar(t00)))
    if changes['update']:
        print("Updating {} existing docs...".format(len(changes['update'])))
        t00 = time.time()
        ids = [x['_id'] for x in changes['update']]
        _add_docs(ids)
        print("done. [{}]".format(timesofar(t00)))

    target.finalize()

    print("\n")
    print("Finished.", timesofar(t0))
def merge(self, step=100000, restart_at=0):
    t0 = time.time()
    self.validate_src_collections()
    self.log_building_start()
    try:
        if self.using_ipython_cluster:
            self._merge_ipython_cluster(step=step)
        else:
            self._merge_local(step=step, restart_at=restart_at)

        if self.target.name == 'es':
            print "Updating metadata...",
            self.update_mapping_meta()

        t1 = round(time.time() - t0, 0)
        t = timesofar(t0)
        self.log_src_build({'status': 'success',
                            'time': t,
                            'time_in_s': t1,
                            'timestamp': datetime.now()})

    finally:
        #do a simple validation here
        if getattr(self, '_stats', None):
            print "Validating..."
            target_cnt = self.target.count()
            if target_cnt == self._stats['total_genes']:
                print "OK [total count={}]".format(target_cnt)
            else:
                print "Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, self._stats['total_genes'])

        if self.merge_logging:
            sys.stdout.close()
def load_x(idx, fieldname, cvt_fn=None):
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2, 19, idx))    # GeneID    Ensembl(Gene)    target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)

    ensembl2geneid = list2dict(list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']), 0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)
    fn = lambda value: {fieldname: sorted(value) if type(value) is types.ListType else value}
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
def run2():
    from databuild.esbuilder import ESIndexerBase
    esb = ESIndexerBase()
    doc_d = build(sources)
    t0 = time.time()
    esb.build_index(doc_d)
    print 'Done[%s]' % timesofar(t0)
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):
    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    print "\t# nodes in use: {}".format(len(lview.targets or rc.ids))
    lview.block = False

    print "\t# of tasks: {}".format(len(task_list))
    print "\tsubmitting...",
    job = lview.map_async(worker, task_list)
    print "done."
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print "Aborted, all submitted jobs are cancelled."
        else:
            print "Aborted, but your jobs are still running on the cluster."
        return

    if len(job.result) != len(task_list):
        print "WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result), len(task_list))
    print "\ttotal time: {}".format(timesofar(t0))

    if shutdown_ipengines_after_done:
        print "\tshutting down all ipengine nodes...",
        lview.shutdown()
        print 'Done.'
    return job.result
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'entrez_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)
    sys.stderr = sys.stdout

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        download(DATA_FOLDER, no_confirm=no_confirm)
        t_download = timesofar(t0)
        t1 = time.time()
        #mark parsing starts
        src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
        parse_gbff(DATA_FOLDER)
        t_parsing = timesofar(t1)
        t_total = timesofar(t0)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def load(self, genedoc_d=None, update_data=True, update_master=True, test=False, step=10000):
    if not self.temp_collection:
        self.make_temp_collection()

    self.temp_collection.drop()    # drop all existing records just in case.

    if update_data:
        genedoc_d = genedoc_d or self.load_genedoc()

        print("Uploading to the DB...", end='')
        t0 = time.time()
        # for doc in self.doc_iterator(genedoc_d, batch=False):
        #     if not test:
        #         doc.save()
        for doc_li in self.doc_iterator(genedoc_d, batch=True, step=step):
            if not test:
                self.temp_collection.insert(doc_li, manipulate=False, check_keys=False)
        print('Done[%s]' % timesofar(t0))
        self.switch_collection()

        if getattr(self, 'ENTREZ_GENEDOC_ROOT', False):
            print('Uploading "geneid_d" to GridFS...', end='')
            t0 = time.time()
            geneid_d = self.get_geneid_d()
            dump2gridfs(geneid_d, self.__collection__ + '__geneid_d.pyobj', self.db)
            print('Done[%s]' % timesofar(t0))
        if getattr(self, 'ENSEMBL_GENEDOC_ROOT', False):
            print('Uploading "mapping2entrezgene" to GridFS...', end='')
            t0 = time.time()
            x2entrezgene_list = self.get_mapping_to_entrez()
            dump2gridfs(x2entrezgene_list, self.__collection__ + '__2entrezgene_list.pyobj', self.db)
            print('Done[%s]' % timesofar(t0))

    if update_master:
        # update src_master collection
        if not test:
            _doc = {"_id": unicode(self.__collection__),
                    "name": unicode(self.__collection__),
                    "timestamp": datetime.datetime.now()}
            for attr in ['ENTREZ_GENEDOC_ROOT', 'ENSEMBL_GENEDOC_ROOT', 'id_type']:
                if hasattr(self, attr):
                    _doc[attr] = getattr(self, attr)
            if hasattr(self, 'get_mapping'):
                _doc['mapping'] = getattr(self, 'get_mapping')()

            conn.GeneDocSourceMaster(_doc).save()
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print "\t{}\trecords will be added.".format(len(changes['add']))
    print "\t{}\trecords will be deleted.".format(len(changes['delete']))
    print "\t{}\trecords will be updated.".format(len(changes['update']))
    print
    print '\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name, sync_src.name)
    print '\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name)

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print "Adding {} new records...".format(len(changes['add']))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['delete']) > 0:
            print "Deleting {} old records...".format(len(changes['delete']))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['update']) > 0:
            print "Updating {} existing records...".format(len(changes['update']))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))
        print '=' * 20
        print 'Finished. [{}]'.format(timesofar(t00))
def _db_upload(self, doc_li, step=10000, verbose=True):
    import time
    from utils.common import timesofar
    from utils.dataload import list2dict, list_itemcnt, listsort

    output = []
    t0 = time.time()
    for i in range(0, len(doc_li), step):
        output.extend(self.target_db.update(doc_li[i:i + step]))
        if verbose:
            print('\t%d-%d Done [%s]...' % (i + 1, min(i + step, len(doc_li)), timesofar(t0)))

    res = list2dict(list_itemcnt([x[0] for x in output]), 0)
    print("Done![%s, %d OK, %d Error]" % (timesofar(t0), res.get(True, 0), res.get(False, 0)))
    res = listsort(list_itemcnt([x[2].args[0] for x in output if x[0] is False]), 1, reverse=True)
    print('\n'.join(['\t%s\t%d' % x for x in res[:10]]))
    if len(res) > 10:
        print("\t%d lines omitted..." % (len(res) - 10))
def main(daemon=False):
    running_processes = {}
    while 1:
        src_to_update_li = check_mongo()
        if src_to_update_li:
            print '\nDispatcher: found pending jobs ', src_to_update_li
            for src_to_update in src_to_update_li:
                if src_to_update not in running_processes:
                    mark_upload_started(src_to_update)
                    p = dispatch(src_to_update)
                    src_dump.update({'_id': src_to_update}, {"$set": {"upload.pid": p.pid}})
                    p.t0 = time.time()
                    running_processes[src_to_update] = p

        jobs_finished = []
        if running_processes:
            print 'Dispatcher: {} active job(s)'.format(len(running_processes))
            print get_process_info(running_processes)

        for src in running_processes:
            p = running_processes[src]
            returncode = p.poll()
            if returncode is not None:
                t1 = round(time.time() - p.t0, 0)
                d = {
                    'upload.returncode': returncode,
                    'upload.timestamp': datetime.now(),
                    'upload.time_in_s': t1,
                    'upload.time': timesofar(p.t0),
                    'upload.logfile': p.logfile,
                }
                if returncode == 0:
                    print 'Dispatcher: {} finished successfully with code {} (time: {}s)'.format(src, returncode, t1)
                    d['upload.status'] = "success"
                else:
                    print 'Dispatcher: {} failed with code {} (time: {}s)'.format(src, returncode, t1)
                    d['upload.status'] = "failed"

                mark_upload_done(src, d)
                jobs_finished.append(src)
                p.log_f.close()
            else:
                p.log_f.flush()

        for src in jobs_finished:
            del running_processes[src]

        if running_processes:
            time.sleep(10)
        else:
            if daemon:
                #continue monitor src_dump collection
                print '\b' * 50,
                for i in range(100):
                    print '\b' * 2 + [unichr(8212), '\\', '|', '/'][i % 4],
                    time.sleep(0.1)
            else:
                break
def _add_docs(ids):
    i = 0
    for _ids in iter_n(ids, step):
        t1 = time.time()
        _doc_li = src.mget_from_ids(_ids)
        for _doc in _doc_li:
            _doc['_timestamp'] = _timestamp
            i += 1
        target.insert(_doc_li)
        print('\t{}\t{}'.format(i, timesofar(t1)))
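# _add_docs() above (like apply_changes() earlier) relies on an iter_n(iterable, n)
# helper that yields chunks of n items at a time. A minimal sketch of such a helper,
# assuming this is roughly what the project's version does:
from itertools import islice

def iter_n(iterable, n):
    """Yield successive lists of up to n items from iterable."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk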
def doc_feeder(self, step=1000, s=None, e=None, inbatch=False, query=None, **kwargs):
    '''An iterator for returning docs in an ES index with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(query='taxid:9606')
       Other parameters can be passed via "**kwargs": fields, from, size etc.
    '''
    if query:
        q = StringQuery(query)
    else:
        q = MatchAllQuery()
    raw_res = None

    cnt = 0
    t0 = time.time()
    while 1:
        t1 = time.time()
        if raw_res is None:
            raw_res = self.conn.search_raw(q, self._index, self._doc_type,
                                           start=s, size=step, scan=True,
                                           scroll='5m', **kwargs)
            n = raw_res['hits']['total']
            print 'Retrieving %d documents from index "%s/%s".' % (n, self._index, self._doc_type)
        else:
            raw_res = self.conn.search_scroll(raw_res._scroll_id, scroll='5m')
        hits_cnt = len(raw_res['hits']['hits'])
        if hits_cnt == 0:
            break
        else:
            print "Processing %d-%d documents..." % (cnt + 1, cnt + hits_cnt),
            res = self._cleaned_res(raw_res)
            if inbatch:
                yield res
            else:
                for hit in res:
                    yield hit
            cnt += hits_cnt
            print 'Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1))
            if e and cnt > e:
                break
    print "=" * 20
    print 'Finished.[total docs: %s, total time: %s]' % (cnt, timesofar(t0))
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    src_dump = get_src_dump()
    print("Checking latest mart_version:\t", end=' ')
    mart_version = chk_latest_mart_version()
    print(mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
def build(sources, batch=True):
    entrez_root = ENTREZ_ROOT in sources
    ensembl_root = ENSEMBL_ROOT in sources

    print "Preparing root nodes...",
    t0 = time.time()
    if entrez_root and ensembl_root:
        root_nodes = merge_root_nodes()
    elif entrez_root:
        root_nodes = list(get_src(ENTREZ_ROOT).find())
    elif ensembl_root:
        root_nodes = list(get_src(ENSEMBL_ROOT).find())
    else:
        raise ValueError, "You need at least one source with root nodes."
    print 'Done[%s, %s]' % (len(root_nodes), timesofar(t0))

    print "Merging other sources with root nodes...",
    t0 = time.time()
    _sources = copy.copy(sources)
    if entrez_root:
        _sources.remove(ENTREZ_ROOT)
    if ensembl_root:
        _sources.remove(ENSEMBL_ROOT)
    src_collections = [get_src(src) for src in _sources]

    out_d = {}
    if not batch:
        for _id in root_nodes:
            vli = [root_nodes[_id]]
            for sc in src_collections:
                v = sc.get_from_id(_id)
                if v:
                    vli.append(v)
            v_merged = dict_attrmerge(vli)
            out_d[_id] = v_merged
    else:
        for doc_d in _doc_feeder(src_collections, root_nodes.keys(), step=10000, asdict=True):
            _id, vli = doc_d.items()[0]
            vli = [root_nodes[_id]] + [v for v in vli if v]
            v_merged = dict_attrmerge(vli)
            out_d[_id] = v_merged
    print 'Done[%s, %s]' % (len(out_d), timesofar(t0))

    return out_d
def load_source(collection_name, src_module=None, src_data=None, inbatch=True, new_collection=True, step=1000):
    '''Save src data into a mongodb collection.
       If src_module is provided, src_data = src_module.load_data();
       otherwise src_data is used directly and should be an iterable.
       If new_collection is True, the target collection is required to be empty.
    '''
    src_db = get_src_db()
    target_coll = src_db[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return

    t0 = time.time()
    cnt = 0
    if src_module:
        src_data = src_module.load_data()
    if src_data:
        doc_list = []
        for doc in src_data:
            cnt += 1
            if not inbatch:
                try:
                    target_coll.insert_one(doc)
                except:
                    print('One duplicate id exists, id is {}'.format(doc['_id']))
                    continue
            else:
                doc_list.append(doc)
                if len(doc_list) == step:
                    target_coll.insert_many(doc_list)
                    doc_list = []
            if cnt % 100000 == 0:
                print(cnt, timesofar(t0))
        if doc_list:
            target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
        print("successfully loaded %s into mongodb" % collection_name)
        print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
    else:
        print("Error: no src data to load.")
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv

    t0 = time.time()
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    bdr.using_ipython_cluster = use_parallel
    bdr.merge()

    print "Finished.", timesofar(t0)
def load_ucsc_exons():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    print "Found {} species folders.".format(len(species_li))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print species, '...'
        gene2exons.update(load_exons_for_species(species))
    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
    return gene2exons
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv
    noconfirm = '-b' in sys.argv

    if config == 'clean':
        clean_target_collection()
    else:
        t0 = time.time()
        build_index(config, use_parallel=use_parallel, noconfirm=noconfirm)
        print "Finished.", timesofar(t0)
def upload(docs, collection):
    '''Do the actual upload of docs to the db.'''
    print 'Uploading to DB...',
    t0 = time.time()
    if type(docs) is types.DictType:
        doc_li = docs.values()
    else:
        doc_li = docs
    db = get_db()    # database for merged data
    coll = db[collection]
    for i in range(0, len(doc_li), 10000):
        coll.insert(doc_li[i:i + 10000])
    print 'Done[%s]' % timesofar(t0)
def parse_vcf(vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    t0 = time.time()
    compressed = vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)   # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(rec)
        if by_id:
            # one hgvs id, one doc
            if doc['_id']:
                if isinstance(doc['_id'], list):
                    for i, _id in enumerate(doc['_id']):
                        _doc = copy.copy(doc)
                        _doc['alt'] = doc['alt'][i]
                        _doc[POS_KEY] = doc[POS_KEY][i]
                        _doc['_id'] = _id
                        yield _doc
                        cnt_2 += 1
                        if verbose:
                            print(_doc['rsid'], '\t', _doc['_id'])
                else:
                    yield doc
                    cnt_2 += 1
                    if verbose:
                        print(doc['rsid'], '\t', doc['_id'])
            else:
                cnt_3 += 1
        else:
            # one rsid, one doc
            if doc['_id']:
                yield doc
                cnt_2 += 1
                if verbose:
                    print(doc['rsid'], '\t', doc['_id'])
            else:
                cnt_3 += 1
        cnt_1 += 1
    print("Done. [{}]".format(timesofar(t0)))
    print("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))
def doc_feeder0(collection, step=1000, s=None, e=None, inbatch=False):
    '''An iterator for returning docs in a collection, with batch query.'''
    n = collection.count()
    s = s or 1
    e = e or n
    print 'Found %d documents in database "%s".' % (n, collection.name)
    for i in range(s - 1, e + 1, step):
        print "Processing %d-%d documents..." % (i + 1, i + step),
        t0 = time.time()
        res = collection.find(skip=i, limit=step, timeout=False)
        if inbatch:
            yield res
        else:
            for doc in res:
                yield doc
        print 'Done.[%s]' % timesofar(t0)
def handle_genedoc_merged(self, **kwargs):
    for config in ('mygene', 'mygene_allspecies'):
        t0 = time.time()
        p = Popen(['python', '-m', 'databuild.sync', config, '-p', '-b'], cwd=src_path)
        returncode = p.wait()
        t = timesofar(t0)
        if returncode == 0:
            msg = 'Dispatcher: "{}" syncer finished successfully with code {} (time: {})'.format(config, returncode, t)
        else:
            msg = 'Dispatcher: "{}" syncer failed with code {} (time: {})'.format(config, returncode, t)
        print(msg)
        if hipchat_msg:
            msg += '<a href="http://su07:8000/log/sync/{}">sync log</a>'.format(config)
            hipchat_msg(msg, message_format='html')

        assert returncode == 0, "Subprocess failed. Check error above."
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    src_dump = get_src_dump()
    (file_name, release) = get_newest_release()
    doc = src_dump.find_one({'_id': 'clinvar'})
    if new_release_available(doc['release']):
        data_file = os.path.join(doc['data_folder'], file_name)
        if os.path.exists(data_file):
            print("No newer file found. Abort now.")
            return

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            return

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'clinvar_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    # mark the download starts
    doc = {
        '_id': 'clinvar',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'release': release,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()

    try:
        download_ftp_file(no_confirm)
    finally:
        sys.stdout.close()

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'clinvar'}, {'$set': _updates})
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    print("Checking latest refseq release:\t", end='')
    refseq_release = get_refseq_release()
    print(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'refseq_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
def load_collection(database, input_file_list, collection_name):
    """
    :param database: mongodb url
    :param input_file_list: variant docs, path to file
    :param collection_name: annotation source name
    """
    conn = pymongo.MongoClient(database)
    db = conn.variantdoc
    posts = db[collection_name]
    t1 = time.time()
    cnt = 0
    input_file_list = getFileList()
    for doc in load_data(input_file_list):
        posts.insert(doc, manipulate=False, check_keys=False, w=0)
        cnt += 1
        if cnt % 100000 == 0:
            print cnt, timesofar(t1)
    print "successfully loaded %s into mongodb" % collection_name
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()

    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except:
            pass
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # add flush and refresh
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except:
        pass

    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result