def set_chrom(self, batch_size, job_manager):
    """
    Post-merge step: run chrom_worker over every batch of documents in the
    target collection to check/set the 'chrom' field, collect discrepancies
    ("missing" / "disagreed") and per-root-key counts, then store those counts
    as build metadata.

    :param batch_size: number of _ids per worker job
    :param job_manager: hub job manager providing defer_to_process()
    :return: dict with "missing" and "disagreed" _id lists
    """
    # divide & conquer... build batches
    jobs = []
    total = self.target_backend.count()
    btotal = math.ceil(total / batch_size)
    bnum = 1
    cnt = 0
    results = {"missing": [], "disagreed": []}
    # per-root-key document counts, merged from each worker's results
    root_keys = {}
    # grab ids only, so we can get more and fill queue for each step
    # each round, fill the queue to make sure every cpu slots are always working
    id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
    self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" % \
            (self.target_backend.target_collection.name, id_batch_size, batch_size))
    for big_doc_ids in id_feeder(self.target_backend.target_collection, batch_size=id_batch_size, logger=self.logger):
        for doc_ids in iter_n(big_doc_ids, batch_size):
            # give control back to the event loop (id fetching is blocking)
            yield from asyncio.sleep(0.1)
            cnt += len(doc_ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = "post-merge (chrom)"
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal, (cnt / total * 100.))
            self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" % \
                    (bnum, btotal, cnt, total, (cnt / total * 100.)))
            job = yield from job_manager.defer_to_process(pinfo,
                    partial(chrom_worker, self.target_backend.target_name, doc_ids))

            def processed(f, results, batch_num):
                # done-callback: merge one worker's result into shared accumulators
                try:
                    fres = f.result()
                    results["missing"].extend(fres["missing"])
                    results["disagreed"].extend(fres["disagreed"])
                    # merge root key counts
                    rk = fres["root_keys"]
                    for k in rk:
                        root_keys.setdefault(k, 0)
                        root_keys[k] += rk[k]
                    self.logger.info("chrom batch #%d, done" % batch_num)
                except Exception as e:
                    import traceback
                    self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" % \
                            (batch_num, e, traceback.format_exc()))
                    raise
            job.add_done_callback(partial(processed, results=results, batch_num=bnum))
            jobs.append(job)
            bnum += 1
    self.logger.info("%d jobs created for merging step" % len(jobs))
    if jobs:
        yield from asyncio.gather(*jobs)
        self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" %
                (len(results["missing"]), len(results["disagreed"])))
        if results["missing"] or results["disagreed"]:
            fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,
                    datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
            # BUGFIX: use a context manager so the file handle is always closed
            # (was: pickle.dump(results, open(fn, "wb")) — leaked the handle)
            with open(fn, "wb") as fh:
                pickle.dump(results, fh)
        # now store metadata
        # assumes chrom_worker always reports an "_id" root key count — TODO confirm
        root_keys["total"] = root_keys.pop("_id")
        self.logger.info("Root keys: %s" % root_keys)
        src_build = self.source_backend.build
        src_build.update({'_id': self.target_backend.target_name}, {"$set": {"_meta.stats": root_keys}})
    return results
def set_chrom(self, batch_size, job_manager):
    """
    Post-merge step: run chrom_worker over batches of documents from the
    target collection and collect 'chrom' discrepancies. Returns a dict with
    "missing" and "disagreed" _id lists; discrepancies, if any, are also
    pickled to a timestamped file.
    """
    # divide & conquer... build batches
    jobs = []
    total = self.target_backend.count()
    btotal = math.ceil(total/batch_size)
    bnum = 1
    cnt = 0
    results = {"missing" : [], "disagreed" : []}
    # grab ids only, so we can get more and fill queue for each step
    # each round, fill the queue to make sure every cpu slots are always working
    id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
    self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" % \
            (self.target_backend.target_collection.name, id_batch_size, batch_size))
    for big_doc_ids in id_feeder(self.target_backend.target_collection, batch_size=id_batch_size, logger=self.logger):
        for doc_ids in iter_n(big_doc_ids,batch_size):
            # give control back to the event loop between batches
            yield from asyncio.sleep(0.1)
            cnt += len(doc_ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = "post-merge (chrom)"
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum,btotal,(cnt/total*100.))
            self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" % \
                    (bnum,btotal,cnt,total,(cnt/total*100.)))
            # NOTE(review): treats doc_ids as full documents here, while other
            # set_chrom variants pass doc_ids straight to chrom_worker —
            # presumably id_feeder yields docs in this code path; verify.
            ids = [doc["_id"] for doc in doc_ids]
            job = yield from job_manager.defer_to_process(pinfo, partial(chrom_worker, self.target_backend.target_name, ids))
            def processed(f,results, batch_num):
                # done-callback: merge one worker's result into shared accumulators
                try:
                    fres = f.result()
                    results["missing"].extend(fres["missing"])
                    results["disagreed"].extend(fres["disagreed"])
                    self.logger.info("chrom batch #%d, done" % batch_num)
                except Exception as e:
                    import traceback
                    self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" % \
                            (batch_num, e, traceback.format_exc()))
                    raise
            job.add_done_callback(partial(processed, results=results, batch_num=bnum))
            jobs.append(job)
            bnum += 1
    self.logger.info("%d jobs created for merging step" % len(jobs))
    if jobs:
        yield from asyncio.gather(*jobs)
        self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" % (len(results["missing"]), len(results["disagreed"])))
        if results["missing"] or results["disagreed"]:
            fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
            pickle.dump(results,open(fn,"wb"))
    return results
def do_snpeff(self, batch_size=SNPEFF_BATCH_SIZE, force=False, force_use_cache=False):
    """
    Annotate documents from this source's collection with snpEff data,
    storing results through an UpsertStorage into the snpeff collection.

    :param batch_size: number of hgvs ids annotated per snpEff run
    :param force: when True, re-annotate documents even if they already
                  have snpeff annotations
    :param force_use_cache: passed to id_feeder to reuse its cached id list
    """
    self.logger.info(
        "Updating snpeff information from source '%s' (collection:%s)" %
        (self.fullname, self.collection_name))
    # select Snpeff uploader to get collection name and src_dump _id
    version = self.__class__.__metadata__["assembly"]
    snpeff_class = getattr(snpeff_upload, "Snpeff%sUploader" % version.capitalize())
    snpeff_main_source = snpeff_class.main_source
    snpeff_doc = self.src_dump.find_one({"_id": snpeff_main_source})
    assert snpeff_doc, "No snpeff information found, has it been dumped & uploaded ?"
    snpeff_dir = snpeff_doc["download"]["data_folder"]
    # -q: when there's an update, there's a message on stderr....
    cmd = "java -Xmx4g -jar %s/snpEff/snpEff.jar -t -noStats -noExpandIUB %s" % (
        snpeff_dir, version)
    # genome files are in "data_folder"/../data
    genomes = glob.glob(
        os.path.join(snpeff_dir, "..", "data", "%s_genome.*" % version))
    assert len(genomes) == 1, \
        "Expected only one genome files for '%s', got: %s" % (version, genomes)
    genome = genomes[0]
    annotator = snpeff_parser.SnpeffAnnotator(cmd, logger=self.logger)
    vcf_builder = snpeff_parser.VCFConstruct(genome, logger=self.logger)
    storage = UpsertStorage(None, snpeff_class.name, self.logger)
    col = self.db[self.collection_name]
    total = math.ceil(col.count() / batch_size)
    cnt = 0
    to_process = []

    def process(ids):
        # annotate one batch of hgvs ids and upsert the results
        self.logger.info("%d documents to annotate" % len(ids))
        hgvs_vcfs = vcf_builder.build_vcfs(ids)
        # merge "vcf" and snpeff annotations keys when possible
        # (if no snpeff data, we keep 'vcf' data)
        for annot in annotator.annotate(hgvs_vcfs):
            hgvs_vcfs[annot["_id"]].update(annot)
        # trim if sequence is too big
        for _id in hgvs_vcfs:
            vcf = hgvs_vcfs[_id]
            for k in ["alt", "ref"]:
                if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                    msg = "...(trimmed)"
                    vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
            hgvs_vcfs[_id] = vcf
        data = annotate_start_end(hgvs_vcfs, version)
        howmany = storage.process(data, batch_size)
        if howmany:
            # we need to update some metadata info about snpeff b/c data has changed
            # so cache could be invalid
            self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
            mongo.invalidate_cache(snpeff_class.name)

    def _flush_if_full():
        # run annotation once we have at least a full batch accumulated
        nonlocal to_process
        self.logger.debug("Batch filled %d out of %d" % (len(to_process), batch_size))
        if len(to_process) >= batch_size:
            process(to_process)
            to_process = []

    for ids in id_feeder(col, batch_size=batch_size, logger=self.logger,
                         force_use=force_use_cache):
        cnt += 1
        self.logger.debug("Processing batch %s/%s [%.1f]" %
                          (cnt, total, (cnt / total * 100)))
        if not force:
            # don't re-compute annotations if already there
            for subids in iter_n(ids, 10000):
                cur = storage.temp_collection.find({'_id': {'$in': subids}}, {'_id': 1})
                already_ids = [d["_id"] for d in list(cur)]
                newids = list(set(subids).difference(set(already_ids)))
                if len(subids) != len(newids):
                    self.logger.debug("%d documents already have snpeff annotations, skip them" % \
                            (len(subids) - len(newids)))
                to_process.extend(newids)
                _flush_if_full()
        else:
            # BUGFIX: previously `to_process = ids` replaced the accumulator on
            # every batch and never flushed inside the loop, so with force=True
            # only the last batch ever got annotated. Accumulate and flush
            # exactly like the non-force path.
            to_process.extend(ids)
            _flush_if_full()
    # for potential remainings
    if to_process:
        process(to_process)
def rebuild(col):
    """Force a rebuild of the cached id list for *col* by draining the feeder."""
    # force_build comes from the enclosing scope; we only care about the
    # side effect of id_feeder regenerating its cache, not the ids themselves.
    feeder = mongo.id_feeder(col, batch_size=10000, logger=config.logger,
                             force_build=force_build)
    for _ in feeder:
        pass  # just iterate
def merge_source(self, src_name, batch_size=100000, ids=None, job_manager=None):
    """
    Merge documents from source collection "src_name" into the target
    collection, one process job per batch of _ids.

    :param src_name: source collection name to merge from
    :param batch_size: number of documents per merger job
    :param ids: optional explicit list of _ids to merge (skips id_feeder)
    :param job_manager: hub job manager (required despite the default)
    :return: {src_name: number_of_merged_documents}
    :raises Exception: if any batch fails
    """
    # it's actually not optional
    assert job_manager
    _query = self.generate_document_query(src_name)
    # Note: no need to check if there's an existing document with _id (we want to merge only
    # with an existing document). If the document doesn't exist then the update() call will
    # silently fail. That being said... if no root documents, then there won't be any previously
    # inserted documents, and this update() would just do nothing. So if no root docs, then
    # upsert (update or insert, but do something)
    defined_root_sources = self.get_root_document_sources()
    upsert = not defined_root_sources or src_name in defined_root_sources
    if not upsert:
        self.logger.debug(
            "Documents from source '%s' will be stored only if a previous document exists with same _id"
            % src_name)
    jobs = []
    total = self.source_backend[src_name].count()
    btotal = math.ceil(total / batch_size)
    bnum = 1
    cnt = 0
    got_error = False
    # grab ids only, so we can get more, let's say 10 times more
    id_batch_size = batch_size * 10
    # BUGFIX: id_provider used to be computed twice (the if/else below was
    # immediately overwritten by an `ids and [ids] or id_feeder(...)` line);
    # the dead duplicate has been removed, keeping the logger-aware feeder.
    if ids:
        self.logger.info(
            "Merging '%s' specific list of _ids, create merger job with batch_size=%d"
            % (src_name, batch_size))
        id_provider = [ids]
    else:
        self.logger.info(
            "Fetch _ids from '%s' with batch_size=%d, and create merger job with batch_size=%d"
            % (src_name, id_batch_size, batch_size))
        id_provider = id_feeder(self.source_backend[src_name],
                                batch_size=id_batch_size,
                                logger=self.logger)
    for big_doc_ids in id_provider:
        for doc_ids in iter_n(big_doc_ids, batch_size):
            # try to put some async here to give control back
            # (but everybody knows it's a blocking call: doc_feeder)
            yield from asyncio.sleep(0.1)
            cnt += len(doc_ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = src_name
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal, (cnt / total * 100))
            self.logger.info("Creating merger job #%d/%d, to process '%s' %d/%d (%.1f%%)" % \
                    (bnum, btotal, src_name, cnt, total, (cnt / total * 100.)))
            job = yield from job_manager.defer_to_process(
                pinfo,
                partial(merger_worker, self.source_backend[src_name].name,
                        self.target_backend.target_name, doc_ids,
                        self.get_mapper_for_source(src_name, init=False),
                        upsert, bnum))

            def batch_merged(f, batch_num):
                # done-callback: a merger worker returns an int count on success
                nonlocal got_error
                if type(f.result()) != int:
                    got_error = Exception(
                        "Batch #%s failed while merging source '%s' [%s]" %
                        (batch_num, src_name, f.result()))
            job.add_done_callback(partial(batch_merged, batch_num=bnum))
            jobs.append(job)
            bnum += 1
            # raise error as soon as we know
            if got_error:
                raise got_error
    self.logger.info("%d jobs created for merging step" % len(jobs))
    tasks = asyncio.gather(*jobs)

    def done(f):
        nonlocal got_error
        # BUGFIX: `cnt` must be declared nonlocal, otherwise the sum below
        # bound a local and the method returned the raw document count
        # instead of the actual number of merged records.
        nonlocal cnt
        if None in f.result():
            got_error = Exception("Some batches failed")
            return
        # compute overall inserted/updated records
        cnt = sum(f.result())
    tasks.add_done_callback(done)
    yield from tasks
    if got_error:
        raise got_error
    else:
        return {"%s" % src_name: cnt}
def index(self, target_name, index_name, job_manager, steps=["index", "post"], batch_size=10000, ids=None, mode="index"):
    """
    Build an index named "index_name" with data from collection
    "target_collection".
    "ids" can be passed to selectively index documents.
    "mode" can have the following values:
    - 'purge': will delete index if it exists
    - 'resume': will use existing index and add documents. "ids" can be
      passed as a list of missing IDs, or, if not pass, ES will be queried
      to identify which IDs are missing for each batch in order to complete
      the index.
    - None (default): will create a new index, assuming it doesn't already exist
    """
    assert job_manager
    # check what to do
    if type(steps) == str:
        steps = [steps]
    self.target_name = target_name
    self.index_name = index_name
    self.setup_log()
    self.load_build()
    got_error = False
    cnt = 0
    if "index" in steps:
        _db = mongo.get_target_db()
        target_collection = _db[target_name]
        _mapping = self.get_mapping()
        _extra = self.get_index_creation_settings()
        # partially instantiated indexer instance for process workers
        partial_idxer = partial(ESIndexer,
                                doc_type=self.doc_type,
                                index=index_name,
                                es_host=self.host,
                                step=batch_size,
                                number_of_shards=self.num_shards,
                                number_of_replicas=self.num_replicas)
        # instantiate one here for index creation
        es_idxer = partial_idxer()
        if es_idxer.exists_index():
            if mode == "purge":
                es_idxer.delete_index()
            elif mode != "resume":
                raise IndexerException(
                    "Index already '%s' exists, (use mode='purge' to auto-delete it or mode='resume' to add more documents)"
                    % index_name)
        if mode != "resume":
            es_idxer.create_index({self.doc_type: _mapping}, _extra)
        jobs = []
        total = target_collection.count()
        btotal = math.ceil(total / batch_size)
        bnum = 1
        if ids:
            self.logger.info(
                "Indexing from '%s' with specific list of _ids, create indexer job with batch_size=%d"
                % (target_name, batch_size))
            id_provider = [ids]
        else:
            self.logger.info(
                "Fetch _ids from '%s', and create indexer job with batch_size=%d"
                % (target_name, batch_size))
            id_provider = id_feeder(target_collection, batch_size=batch_size, logger=self.logger)
        for ids in id_provider:
            yield from asyncio.sleep(0.0)
            cnt += len(ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = self.target_name
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal, (cnt / total * 100))
            self.logger.info("Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)" % \
                    (bnum, btotal, target_name, cnt, total, (cnt / total * 100.)))
            job = yield from job_manager.defer_to_process(
                pinfo,
                partial(indexer_worker, self.target_name, ids, partial_idxer, bnum, mode))

            def batch_indexed(f, batch_num):
                # done-callback: an indexer worker returns (count, errors) on success
                nonlocal got_error
                res = f.result()
                if type(res) != tuple or type(res[0]) != int:
                    got_error = Exception(
                        "Batch #%s failed while indexing collection '%s' [result:%s]" %
                        (batch_num, self.target_name, repr(f.result())))
            job.add_done_callback(partial(batch_indexed, batch_num=bnum))
            jobs.append(job)
            bnum += 1
            # raise error as soon as we know
            if got_error:
                raise got_error
        self.logger.info("%d jobs created for indexing step" % len(jobs))
        tasks = asyncio.gather(*jobs)

        def done(f):
            nonlocal got_error
            # BUGFIX: without `nonlocal cnt`, the sum below bound a local and
            # the method returned the raw progress count instead of the number
            # of actually indexed records.
            nonlocal cnt
            if None in f.result():
                got_error = Exception("Some batches failed")
                return
            # compute overall inserted/updated records
            # returned values looks like [(num,[]),(num,[]),...]
            cnt = sum([val[0] for val in f.result()])
            self.logger.info("Index '%s' successfully created" % index_name,
                             extra={"notify": True})
        tasks.add_done_callback(done)
        yield from tasks
    if "post" in steps:
        self.logger.info("Running post-index process for index '%s'" % index_name)
        pinfo = self.get_pinfo()
        pinfo["step"] = "post_index"
        # for some reason (like maintaining object's state between pickling),
        # we can't use process there. Need to use thread to maintain that state
        # without building an unmaintainable monster
        job = yield from job_manager.defer_to_thread(
            pinfo,
            partial(self.post_index, target_name, index_name, job_manager,
                    steps=steps, batch_size=batch_size, ids=ids, mode=mode))

        def posted(f):
            # BUGFIX: `nonlocal got_error` was missing, so a post-index failure
            # only raised inside the callback (swallowed by the event loop) and
            # the final `if got_error` check never saw it.
            nonlocal got_error
            try:
                res = f.result()
                self.logger.info(
                    "Post-index process done for index '%s': %s" % (index_name, res))
            except Exception as e:
                got_error = e
                self.logger.error(
                    "Post-index process failed for index '%s': %s" % (index_name, e),
                    extra={"notify": True})
                raise
        job.add_done_callback(posted)
        yield from asyncio.gather(job)  # consume future
    if got_error:
        raise got_error
    else:
        return {"%s" % self.target_name: cnt}
def do():
    """
    Inner coroutine driving one inspection run: registers status, fans out
    inspect_data jobs per id batch (optionally sampled/limited), merges the
    per-batch results per mode, computes metadata/ES mapping, then registers
    success/failure. Relies on closure variables from the enclosing method
    (data_provider, registerer_obj, backend_provider, sample, limit, kwargs,
    started_at, t0, got_error, ...).
    """
    yield from asyncio.sleep(0.0)
    nonlocal mode
    pinfo = {
        "category": INSPECTOR_CATEGORY,
        "source": "%s" % repr(data_provider),
        "step": "",
        "description": ""
    }
    # register begin of inspection (differ slightly depending on type)
    if data_provider_type == "source":
        registerer_obj.register_status("inspecting", subkey="inspect")
    elif data_provider_type == "build":
        registerer_obj.register_status("inspecting",
                                       transient=True,
                                       init=True,
                                       job={"step": "inspect"})
    self.logger.info(
        "Running inspector on %s (type:%s,data_provider:%s)" %
        (repr(data_provider), data_provider_type, backend_provider))
    if sample is not None:
        self.logger.info(
            "Sample set to %s, inspect only a subset of data", sample)
    if limit is None:
        self.logger.info("Inspecting all the documents")
    else:
        nonlocal batch_size
        # adjust batch_size so we inspect only "limit" docs if batch is smaller than the limit
        if batch_size > limit:
            batch_size = limit
        self.logger.info("Inspecting only %s documents", limit)
    # make it pickleable
    if data_provider_type == "source":
        # because register_obj is also used to fetch data, it has to be unprepare() for pickling
        registerer_obj.unprepare()
    else:
        # NOTE: do not unprepare() the builder, we'll loose the target name
        # (it's be randomly generated again) and we won't be able to register results
        pass
    cnt = 0
    doccnt = 0
    jobs = []
    # normalize mode param and prepare global results
    if type(mode) == str:
        mode = [mode]
    converters, mode = btinspect.get_converters(mode)
    inspected = {}
    for m in mode:
        inspected.setdefault(m, {})
    backend = create_backend(backend_provider).target_collection
    for ids in id_feeder(backend, batch_size=batch_size):
        if sample is not None:
            # random sampling: skip whole batches with probability (1 - sample)
            if random.random() > sample:
                continue
        cnt += 1
        # doccnt counts batch_size per kept batch; the last batch may be
        # smaller so this is an upper bound used only for the limit check
        doccnt += batch_size
        if limit and doccnt > limit:
            break
        pinfo["description"] = "batch #%s" % cnt

        def batch_inspected(bnum, i, f):
            # done-callback: merge one worker's per-mode records into `inspected`
            nonlocal inspected
            nonlocal got_error
            nonlocal mode
            try:
                res = f.result()
                for m in mode:
                    inspected[m] = btinspect.merge_record(
                        inspected[m], res[m], m)
            except Exception as e:
                got_error = e
                self.logger.error(
                    "Error while inspecting data from batch #%s: %s" %
                    (bnum, e))
                raise
        pre_mapping = "mapping" in mode
        # we want to generate intermediate mapping so we can merge
        # all maps later and then generate the ES mapping from there
        self.logger.info("Creating inspect worker for batch #%s" % cnt)
        job = yield from self.job_manager.defer_to_process(
            pinfo,
            partial(inspect_data,
                    backend_provider,
                    ids,
                    mode=mode,
                    pre_mapping=pre_mapping,
                    **kwargs))
        job.add_done_callback(partial(batch_inspected, cnt, ids))
        jobs.append(job)
    yield from asyncio.gather(*jobs)
    # compute metadata (they were skipped before)
    for m in mode:
        if m == "mapping":
            try:
                inspected["mapping"] = es.generate_es_mapping(
                    inspected["mapping"])
                # metadata for mapping only once generated
                inspected = btinspect.compute_metadata(
                    inspected, m)
            except es.MappingError as e:
                # keep the intermediate mapping and the errors for reporting
                inspected["mapping"] = {
                    "pre-mapping": inspected["mapping"],
                    "errors": e.args[1]
                }
        else:
            inspected = btinspect.compute_metadata(inspected, m)
    # just potential converters
    btinspect.run_converters(inspected, converters)

    def fully_inspected(res):
        # finalize: stringify results and register success/failure status
        nonlocal got_error
        try:
            res = btinspect.stringify_inspect_doc(res)
            _map = {"results": res}
            _map["data_provider"] = repr(data_provider)
            _map["started_at"] = started_at
            _map["duration"] = timesofar(t0)
            # when inspecting with "stats" mode, we can get huge number but mongo
            # can't store more than 2^64, make sure to get rid of big nums there
            def clean_big_nums(k, v):
                # TODO: same with float/double? seems mongo handles more there ?
                if isinstance(v, int) and v > 2**64:
                    return k, math.nan
                else:
                    return k, v
            dict_traverse(_map, clean_big_nums)
            # register begin of inspection (differ slightly depending on type)
            if "mapping" in mode and "errors" in res[
                    "mapping"] and "pre-mapping" in res["mapping"]:
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               inspect=_map)
                got_error = InspectorError(
                    res["mapping"]["errors"])
            else:
                if data_provider_type == "source":
                    registerer_obj.register_status(
                        "success", subkey="inspect", inspect=_map)
                elif data_provider_type == "build":
                    registerer_obj.register_status(
                        "success",
                        job={"step": "inspect"},
                        build={"inspect": _map})
        except Exception as e:
            self.logger.exception(
                "Error while inspecting data: %s" % e)
            got_error = e
            if data_provider_type == "source":
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               err=repr(e))
            elif data_provider_type == "build":
                registerer_obj.register_status(
                    "failed", job={"err": repr(e)})
    fully_inspected(inspected)
    if data_provider_type is None:
        return
    if got_error:
        raise got_error
def do():
    """
    Inner coroutine driving one inspection run: registers status, fans out
    inspect_data jobs per id batch, merges the per-batch results per mode,
    computes metadata/ES mapping, then registers success/failure. Relies on
    closure variables from the enclosing method (data_provider,
    registerer_obj, backend_provider, kwargs, started_at, t0, got_error, ...).
    """
    yield from asyncio.sleep(0.0)
    nonlocal mode
    pinfo = {
        "category": INSPECTOR_CATEGORY,
        "source": "%s" % repr(data_provider),
        "step": "",
        "description": ""
    }
    # register begin of inspection (differ slightly depending on type)
    if data_provider_type == "source":
        registerer_obj.register_status("inspecting", subkey="inspect")
    elif data_provider_type == "build":
        registerer_obj.register_status("inspecting",
                                       transient=True,
                                       init=True,
                                       job={"step": "inspect"})
    self.logger.info("Running inspector on %s (type:%s,data_provider:%s)" % \
            (repr(data_provider),data_provider_type,backend_provider))
    # make it pickleable
    if data_provider_type == "source":
        # because register_obj is also used to fetch data, it has to be unprepare() for pickling
        registerer_obj.unprepare()
    else:
        # NOTE: do not unprepare() the builder, we'll loose the target name
        # (it's be randomly generated again) and we won't be able to register results
        pass
    cnt = 0
    jobs = []
    # normalize mode param and prepare global results
    if type(mode) == str:
        mode = [mode]
    inspected = {}
    for m in mode:
        inspected.setdefault(m, {})
    backend = create_backend(backend_provider).target_collection
    for ids in id_feeder(backend, batch_size=batch_size):
        cnt += 1
        pinfo["description"] = "batch #%s" % cnt

        def batch_inspected(bnum, i, f):
            # done-callback: merge one worker's per-mode records into `inspected`
            nonlocal inspected
            nonlocal got_error
            nonlocal mode
            try:
                res = f.result()
                for m in mode:
                    inspected[m] = btinspect.merge_record(
                        inspected[m], res[m], m)
            except Exception as e:
                got_error = e
                self.logger.error(
                    "Error while inspecting data from batch #%s: %s" %
                    (bnum, e))
                raise
        pre_mapping = "mapping" in mode
        # we want to generate intermediate mapping so we can merge
        # all maps later and then generate the ES mapping from there
        self.logger.info("Creating inspect worker for batch #%s" % cnt)
        job = yield from self.job_manager.defer_to_process(
            pinfo,
            partial(inspect_data,
                    backend_provider,
                    ids,
                    mode=mode,
                    pre_mapping=pre_mapping,
                    **kwargs))
        job.add_done_callback(partial(batch_inspected, cnt, ids))
        jobs.append(job)
    yield from asyncio.gather(*jobs)
    # compute metadata (they were skipped before)
    for m in mode:
        if m == "mapping":
            try:
                inspected["mapping"] = es.generate_es_mapping(
                    inspected["mapping"])
                # metadata for mapping only once generated
                inspected = btinspect.compute_metadata(
                    inspected, m)
            except es.MappingError as e:
                # keep the intermediate mapping and the errors for reporting
                inspected["mapping"] = {
                    "pre-mapping": inspected["mapping"],
                    "errors": e.args[1]
                }
        else:
            inspected = btinspect.compute_metadata(inspected, m)

    def fully_inspected(res):
        # finalize: stringify results and register success/failure status
        nonlocal got_error
        try:
            res = btinspect.stringify_inspect_doc(res)
            _map = {"results": res}
            _map["data_provider"] = repr(data_provider)
            _map["started_at"] = started_at
            _map["duration"] = timesofar(t0)
            # register begin of inspection (differ slightly depending on type)
            if "mapping" in mode and "errors" in res[
                    "mapping"] and "pre-mapping" in res["mapping"]:
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               inspect=_map)
                got_error = InspectorError(
                    res["mapping"]["errors"])
            else:
                if data_provider_type == "source":
                    registerer_obj.register_status(
                        "success", subkey="inspect", inspect=_map)
                elif data_provider_type == "build":
                    registerer_obj.register_status(
                        "success",
                        job={"step": "inspect"},
                        build={"inspect": _map})
        except Exception as e:
            self.logger.exception(
                "Error while inspecting data: %s" % e)
            got_error = e
            if data_provider_type == "source":
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               err=repr(e))
            elif data_provider_type == "build":
                registerer_obj.register_status(
                    "failed", job={"err": repr(e)})
    fully_inspected(inspected)
    if data_provider_type is None:
        return
    if got_error:
        raise got_error
def index(self, target_name, index_name, job_manager, steps=["index", "post"], batch_size=10000, ids=None, mode="index", worker=None):
    """
    Build an index named "index_name" with data from collection
    "target_collection".
    "ids" can be passed to selectively index documents.
    "mode" can have the following values:
    - 'purge': will delete index if it exists
    - 'resume': will use existing index and add documents. "ids" can be passed as a list of missing IDs,
      or, if not pass, ES will be queried to identify which IDs are missing for each batch in
      order to complete the index.
    - 'merge': will merge data with existing index' documents, used when populated several distinct times
      (cold/hot merge for instance)
    - None (default): will create a new index, assuming it doesn't already exist
    """
    assert job_manager
    # check what to do
    if type(steps) == str:
        steps = [steps]
    self.target_name = target_name
    self.index_name = index_name
    self.load_build()
    self.setup_log()
    # select proper index worker according to mode:
    if worker is None:
        # none specified, choose correct one
        if mode == "merge":
            worker = merge_index_worker
        else:
            worker = new_index_worker
    got_error = False
    cnt = 0
    if "index" in steps:
        self.register_status("indexing",
                             transient=True,
                             init=True,
                             job={"step": "index"})
        assert self.build_doc.get("backend_url")
        target_collection = create_backend(
            self.build_doc["backend_url"]).target_collection
        backend_url = self.build_doc["backend_url"]
        _mapping = self.get_mapping()
        _extra = self.get_index_creation_settings()
        # partially instantiated indexer instance for process workers
        partial_idxer = partial(ESIndexer,
                                doc_type=self.doc_type,
                                index=index_name,
                                es_host=self.host,
                                step=batch_size,
                                number_of_shards=self.num_shards,
                                number_of_replicas=self.num_replicas,
                                **self.kwargs)
        # instantiate one here for index creation
        es_idxer = partial_idxer()
        if es_idxer.exists_index():
            if mode == "purge":
                es_idxer.delete_index()
            elif mode not in ["resume", "merge"]:
                msg = "Index already '%s' exists, (use mode='purge' to auto-delete it or mode='resume' to add more documents)" % index_name
                self.register_status("failed", job={"err": msg})
                raise IndexerException(msg)
        if mode not in ["resume", "merge"]:
            try:
                es_idxer.create_index({self.doc_type: _mapping}, _extra)
            except Exception as e:
                self.logger.exception("Failed to create index")
                self.register_status("failed", job={"err": repr(e)})
                raise

        def clean_ids(ids):
            # drop _ids ES can't index (non-string or longer than ES6's limit)
            # can't use a generator, it's going to be pickled
            cleaned = []
            for _id in ids:
                if type(_id) != str:
                    self.logger.warning(
                        "_id '%s' has invalid type (!str), skipped",
                        repr(_id))
                    continue
                if len(_id) > 512:  # this is an ES6 limitation
                    self.logger.warning("_id is too long: '%s'", _id)
                    continue
                cleaned.append(_id)
            return cleaned
        jobs = []
        total = target_collection.count()
        btotal = math.ceil(total / batch_size)
        bnum = 1
        if ids:
            self.logger.info(
                "Indexing from '%s' with specific list of _ids, create indexer job with batch_size=%d",
                target_name, batch_size)
            id_provider = iter_n(ids, batch_size)
        else:
            self.logger.info(
                "Fetch _ids from '%s', and create indexer job with batch_size=%d",
                target_name, batch_size)
            id_provider = id_feeder(target_collection,
                                    batch_size=batch_size,
                                    logger=self.logger)
        for ids in id_provider:
            # give control back to the event loop between batches
            yield from asyncio.sleep(0.0)
            origcnt = len(ids)
            ids = clean_ids(ids)
            newcnt = len(ids)
            if origcnt != newcnt:
                self.logger.warning(
                    "%d document(s) can't be indexed and will be skipped (invalid _id)",
                    origcnt - newcnt)
            # progress count
            cnt += len(ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = self.target_name
            try:
                descprogress = cnt / total * 100
            except ZeroDivisionError:
                descprogress = 0.0
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal,
                                                        descprogress)
            self.logger.info(
                "Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)",
                bnum, btotal, backend_url, cnt, total, descprogress)
            job = yield from job_manager.defer_to_process(
                pinfo,
                partial(indexer_worker, backend_url, ids, partial_idxer,
                        bnum, mode, worker))

            def batch_indexed(f, batch_num):
                # done-callback: an indexer worker returns (count, errors) on success
                nonlocal got_error
                try:
                    res = f.result()
                    if type(res) != tuple or type(res[0]) != int:
                        got_error = Exception(
                            "Batch #%s failed while indexing collection '%s' [result:%s]"
                            % (batch_num, self.target_name, repr(res)))
                except Exception as e:
                    got_error = e
                    self.logger.exception("Batch indexed error")
                    return
            job.add_done_callback(partial(batch_indexed, batch_num=bnum))
            jobs.append(job)
            bnum += 1
            # raise error as soon as we know
            if got_error:
                self.register_status("failed", job={"err": repr(got_error)})
                raise got_error
        self.logger.info("%d jobs created for indexing step", len(jobs))
        tasks = asyncio.gather(*jobs)

        def done(f):
            nonlocal got_error
            if None in f.result():
                # NOTE(review): failed batches reset got_error to None here,
                # which looks like it silently ignores the failure — confirm
                # this is intentional (other variants raise an Exception).
                got_error = None
                return
            # compute overall inserted/updated records
            # returned values looks like [(num,[]),(num,[]),...]
            # NOTE(review): `cnt` is assigned without `nonlocal`, so the final
            # return value is the progress count, not this sum — confirm.
            cnt = sum((val[0] for val in f.result()))
            self.register_status("success",
                                 job={"step": "index"},
                                 index={"count": cnt})
            if total != cnt:
                # raise error if counts don't match, but index is still created,
                # fully registered in case we want to use it anyways
                err = "Merged collection has %d documents but %d have been indexed (check logs for more)" % (
                    total, cnt)
                raise IndexerException(err)
            self.logger.info(
                "Index '%s' successfully created using merged collection %s",
                index_name, target_name,
                extra={"notify": True})
        tasks.add_done_callback(done)
        yield from tasks
    if "post" in steps:
        self.logger.info("Running post-index process for index '%s'",
                         index_name)
        self.register_status("indexing",
                             transient=True,
                             init=True,
                             job={"step": "post-index"})
        pinfo = self.get_pinfo()
        pinfo["step"] = "post_index"
        # for some reason (like maintaining object's state between pickling).
        # we can't use process there. Need to use thread to maintain that state without
        # building an unmaintainable monster
        job = yield from job_manager.defer_to_thread(
            pinfo,
            partial(self.post_index,
                    target_name,
                    index_name,
                    job_manager,
                    steps=steps,
                    batch_size=batch_size,
                    ids=ids,
                    mode=mode))

        def posted(f):
            # done-callback: record post-index outcome in hub status
            nonlocal got_error
            try:
                res = f.result()
                self.logger.info(
                    "Post-index process done for index '%s': %s",
                    index_name, res)
                self.register_status("indexing",
                                     job={"step": "post-index"})
            except Exception as e:
                got_error = e
                self.logger.exception(
                    "Post-index process failed for index '%s':",
                    index_name,
                    extra={"notify": True})
                return
        job.add_done_callback(posted)
        yield from job  # consume future
    if got_error:
        self.register_status("failed", job={"err": repr(got_error)})
        raise got_error
    else:
        self.register_status("success")
        return {"%s" % self.index_name: cnt}
def set_chrom(self, batch_size, job_manager):
    """
    Post-merge step (coroutine): walk every document of the merged target
    collection in batches of `batch_size` _ids and run `chrom_worker` on each
    batch in a separate process via `job_manager`.

    Results from all workers are merged into:
      - results["missing"] / results["disagreed"]: a count plus up to
        MAX_CHROM_EX example entries each (presumably docs with no 'chrom'
        value resp. conflicting ones — exact semantics live in chrom_worker,
        TODO confirm);
      - root_keys: per-root-key document counts, stored afterwards in the
        build document under "_meta.stats".

    If any discrepancy was found, the full `results` dict is pickled to a
    timestamped file in the current directory. Returns `results`.
    """
    # divide & conquer... build batches
    jobs = []
    total = self.target_backend.count()
    btotal = math.ceil(total/batch_size)
    bnum = 1   # batch number (1-based), used in logs and callbacks
    cnt = 0    # number of _ids dispatched so far, for progress reporting
    results = {
        "missing" : {
            "count": 0,
            "examples": []
        },
        "disagreed" : {
            "count": 0,
            "examples": []
        }
    }
    root_keys = {}
    # grab ids only, so we can get more and fill queue for each step
    # each round, fill the queue to make sure every cpu slots are always working
    id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
    self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" % \
            (self.target_backend.target_collection.name, id_batch_size, batch_size))
    for big_doc_ids in id_feeder(self.target_backend.target_collection, batch_size=id_batch_size, logger=self.logger):
        for doc_ids in iter_n(big_doc_ids,batch_size):
            # yield from asyncio.sleep(0.1)  # comment this out, this raised an error in py3.8
            cnt += len(doc_ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = "post-merge (chrom)"
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum,btotal,(cnt/total*100.))
            self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" % \
                    (bnum,btotal,cnt,total,(cnt/total*100.)))
            job = yield from job_manager.defer_to_process(pinfo, partial(chrom_worker, self.target_backend.target_name, doc_ids))
            # Callback run when a worker batch completes; merges the worker's
            # findings into the shared `results` and `root_keys` dicts.
            # `results` and `batch_num` are bound per-job via partial() below,
            # avoiding the late-binding-closure pitfall for those two names.
            def processed(f,results, batch_num):
                try:
                    fres = f.result()
                    for errtype in ("missing","disagreed"):
                        if fres[errtype]:
                            results[errtype]["count"] += len(fres[errtype])
                            # keep only a bounded number of examples so the
                            # pickled report can't grow unboundedly
                            if len(results[errtype]["examples"]) < self.__class__.MAX_CHROM_EX:
                                results[errtype]["examples"].extend(fres[errtype])
                    # merge root key counts
                    rk = fres["root_keys"]
                    for k in rk:
                        root_keys.setdefault(k,0)
                        root_keys[k] += rk[k]
                    self.logger.info("chrom batch #%d, done" % batch_num)
                except Exception as e:
                    import traceback
                    self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" % \
                            (batch_num, e, traceback.format_exc()))
                    raise
            job.add_done_callback(partial(processed, results=results, batch_num=bnum))
            jobs.append(job)
            bnum += 1
    self.logger.info("%d jobs created for merging step" % len(jobs))
    if jobs:
        yield from asyncio.gather(*jobs)
        self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" % (results["missing"]["count"], results["disagreed"]["count"]))
        if results["missing"]["count"] or results["disagreed"]["count"]:
            fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
            # NOTE(review): file handle from open() is never closed explicitly
            # (relies on GC); consider a `with` block.
            pickle.dump(results,open(fn,"wb"))
    # now store metadata
    # NOTE(review): if no job ran (empty collection), root_keys is empty and
    # .pop("_id") raises KeyError — verify this path cannot happen in practice.
    root_keys["total"] = root_keys.pop("_id")
    self.logger.info("Root keys: %s" % root_keys)
    src_build = self.source_backend.build
    src_build.update({'_id': self.target_backend.target_name},{"$set":{"_meta.stats":root_keys}})
    return results
def rebuild(col):
    """
    Force a (re)build/validation of the _id cache for collection *col* by
    exhausting the id_feeder cursor — iterating it is what (re)builds the
    cache as a side effect.

    NOTE(review): `force_build` is a free variable here (not a parameter),
    so this function presumably lives inside a scope that defines it —
    confirm before moving it.
    """
    cur = mongo.id_feeder(col, batch_size=10000, logger=config.logger,
                          force_build=force_build)
    # Just drain the cursor; the original built a throwaway list of every
    # _id batch ([i for i in cur]) which wastes memory for no benefit.
    for _ in cur:
        pass
def do_index(self, job_manager, batch_size, ids, mode, **kwargs):
    """
    Coroutine: index documents from a MongoDB collection into Elasticsearch,
    one process job per batch of `batch_size` _ids.

    ids: explicit list of _ids to index; when falsy, _ids are streamed from
         the whole collection via id_feeder.
    mode: passed through to the `dispatch` worker (semantics defined there —
          TODO confirm).
    Returns {"count": total, "created_at": <aware datetime>}.
    Raises the first batch exception as soon as it is observed, after
    cancelling all still-pending jobs (fail fast).
    """
    client = MongoClient(**self.mongo_client_args)
    database = client[self.mongo_database_name]
    collection = database[self.mongo_collection_name]
    if ids:
        self.logger.info(("Indexing from '%s' with specific list of _ids, "
                          "create indexer job with batch_size=%d."),
                         self.mongo_collection_name, batch_size)
        # use user provided ids in batch
        id_provider = iter_n(ids, batch_size)
    else:
        self.logger.info(("Fetch _ids from '%s', and create "
                          "indexer job with batch_size=%d."),
                         self.mongo_collection_name, batch_size)
        # use ids from the target mongodb collection in batch
        id_provider = id_feeder(collection, batch_size, logger=self.logger)
    jobs = []  # asyncio.Future(s)
    error = None  # the first Exception
    # NOTE(review): collection.count() is deprecated/removed in recent
    # pymongo versions (count_documents is the replacement) — depends on the
    # pymongo version pinned by this project, verify.
    total = len(ids) if ids else collection.count()
    schedule = Schedule(total, batch_size)

    # Done-callback for each batch job: accumulate progress, record the
    # first failure so the scheduling loop below can abort early.
    def batch_finished(future):
        nonlocal error
        try:
            schedule.finished += future.result()
        except Exception as exc:
            self.logger.warning(exc)
            error = exc

    # NOTE(review): the loop variable deliberately(?) shadows the `ids`
    # parameter — harmless here since the parameter was already consumed
    # into id_provider, but worth renaming someday.
    for batch_num, ids in zip(schedule, id_provider):
        yield from asyncio.sleep(0.0)  # yield to the event loop so callbacks run
        # when one batch failed, and job scheduling has not completed,
        # stop scheduling and cancel all on-going jobs, to fail quickly.
        if error:
            for job in jobs:
                if not job.done():
                    job.cancel()
            raise error
        self.logger.info(schedule)
        pinfo = self.pinfo.get_pinfo(
            schedule.suffix(self.mongo_collection_name))
        job = yield from job_manager.defer_to_process(
            pinfo, dispatch, self.mongo_client_args,
            self.mongo_database_name, self.mongo_collection_name,
            self.es_client_args, self.es_blkidx_args, self.es_index_name,
            ids, mode, batch_num)
        job.add_done_callback(batch_finished)
        jobs.append(job)

    self.logger.info(schedule)
    yield from asyncio.gather(*jobs)
    schedule.completed()
    self.logger.notify(schedule)
    return {"count": total, "created_at": datetime.now().astimezone()}
def export_ids(col_name):
    """
    Export all _ids from collection named col_name.
    If col_name refers to a build where a cold_collection is defined,
    will also extract _ids and sort/uniq them to have the full list of _ids
    of the actual merged (cold+hot) collection.
    Output file is stored in DATA_EXPORT_FOLDER/ids,
    defaulting to <DATA_ARCHIVE_ROOT>/export/ids. Output filename is
    returned at the end, if successful.
    """
    def _discard_partial(path):
        # Best-effort removal of an empty/half-written output file.
        # (Bugfix: the original used `try: os.unlink(...) finally: pass`,
        # which swallows nothing — a failing unlink would mask the real error.)
        try:
            os.unlink(path)
        except OSError:
            pass

    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig, "DATA_EXPORT_FOLDER", None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT, "export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER, "ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)

    build = get_src_build().find_one({"_id": col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config", {}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name, col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]

    # first iterate over all _ids. This will potentially update underlying _id cache if it's not
    # valid anymore, so we're sure to work with latest data. If cache is valid, this is pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col, validate_only=True):
        pass
    # now accessing cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold, validate_only=True):
            pass
        # now accessing cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)

    outfn = os.path.join(ids_export_folder, "%s_ids.xz" % col_name)
    # NOTE: can't use anyfile to open cache files and send _id through pipes
    # because it would load _id in memory (unless using hacks) so use cat (and
    # existing uncompressing ones, like gzcat/xzcat/...) to fully run the pipe
    # on the shell
    if cold:
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        # (Bugfix: original message printed coldext twice, hiding the hot extension.)
        assert colext == coldext, "Hot and Cold _id cache are compressed differently (%s and %s), it should be the same" % (colext, coldext)
        comp = colext.replace(".", "")
        supportedcomps = ["xz", "gz", ""]  # no compression allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp, supportedcomps)
        # open output only after validation so a failed assert doesn't leak a handle
        fout = anyfile(outfn, "wb")
        try:
            # IDs sent to pipe's input (sort) then compress it (xz)
            pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache], stdout=subprocess.PIPE)
            psort = subprocess.Popen(["sort", "-u"], stdin=pcat.stdout, stdout=subprocess.PIPE, universal_newlines=True)
            pcat.stdout.close()  # will raise end of pipe error when finished
            if comp:
                pcomp = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
            else:
                # just print stdin to stdout
                pcomp = subprocess.Popen(["tee"], stdin=psort.stdout, stdout=fout)
            psort.stdout.close()
            try:
                logging.info("Running pipe to compute list of unique _ids")
                (out, err) = pcomp.communicate()  # run the pipe! (blocking)
                if err:
                    raise Exception(err)
            except Exception as e:
                logging.error("Error while running pipe to export _ids: %s" % e)
                # make sure to clean empty or half processed files
                _discard_partial(outfn)
                raise
        finally:
            # (Bugfix: original never closed fout.)
            fout.close()
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache, outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean empty or half processed files
            _discard_partial(outfn)
            raise
    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
def diff_cols(self, old_db_col_names, new_db_col_names, batch_size=100000,
              steps=["count", "content", "mapping"], mode=None, exclude=[]):
    """
    Compare new with old collections and produce diff files. Root keys
    can be excluded from comparison with "exclude" parameter.

    *_db_col_names can be:
      1. a collection name (as a string) assuming they are in the target
         database.
      2. tuple with 2 elements, the first one is then either "source" or
         "target" to respectively specify src or target database, and the
         second element is the collection name.
      3. tuple with 3 elements (URI,db,collection), looking like:
         ("mongodb://*****:*****@host","dbname","collection"), allowing to
         specify any connection on any server
    steps: 'count' will count the root keys for every documents in new
           collection (to check number of docs from datasources).
           'content' will perform diff on actual content.
           'mapping' will perform diff on ES mappings (if target collection
           involved)
    mode: 'purge' will remove any existing files for this comparison.

    NOTE(review): `steps` and `exclude` are mutable default arguments —
    harmless as long as callers never mutate them, but fragile.
    """
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    # check what to do
    if type(steps) == str:
        steps = [steps]
    diff_folder = generate_diff_folder(old_db_col_names, new_db_col_names)
    if os.path.exists(diff_folder):
        if mode == "purge" and os.path.exists(diff_folder):
            rmdashfr(diff_folder)
        else:
            raise FileExistsError(
                "Found existing files in '%s', use mode='purge'" % diff_folder)
    if not os.path.exists(diff_folder):
        os.makedirs(diff_folder)
    # create metadata file storing info about how we created the diff
    # and some summary data
    diff_stats = {
        "update": 0,
        "add": 0,
        "delete": 0,
        "mapping_changed": False
    }
    metadata = {
        "diff": {
            "type": self.diff_type,
            "func": self.diff_func.__name__,
            "version": "%s.%s" % (old.version, new.version),
            "stats": diff_stats,  # ref to diff_stats (updated in-place below)
            "files": [],
            # when "new" is a target collection:
            "mapping_file": None,
            "info": {
                "generated_on": str(datetime.now()),
                "exclude": exclude,
                "steps": steps,
                "mode": mode,
                "batch_size": batch_size
            }
        },
        "old": {
            "backend": old_db_col_names,
            "version": old.version
        },
        "new": {
            "backend": new_db_col_names,
            "version": new.version
        },
        # when "new" is a target collection:
        "_meta": {},
        "build_config": {},
    }
    if isinstance(
            new, DocMongoBackend
    ) and new.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
        build_doc = get_src_build().find_one(
            {"_id": new.target_collection.name})
        if not build_doc:
            raise DifferException("Collection '%s' has no corresponding build document" % \
                    new.target_collection.name)
        metadata["_meta"] = build_doc.get("_meta", {})
        metadata["build_config"] = build_doc.get("build_config")
    # dump it here for minimum information, in case we don't go further
    json.dump(metadata,
              open(os.path.join(diff_folder, "metadata.json"), "w"),
              indent=True)
    got_error = False
    if "mapping" in steps:

        # Worker (runs in a thread): compute the jsondiff between the two
        # builds' ES mappings, dump it to mapping.pyobj, return a summary.
        def diff_mapping(old, new, diff_folder):
            summary = {}
            old_build = get_src_build().find_one(
                {"_id": old.target_collection.name})
            new_build = get_src_build().find_one(
                {"_id": new.target_collection.name})
            if old_build and new_build:
                # mapping diff always in jsondiff
                mapping_diff = jsondiff(old_build["mapping"],
                                        new_build["mapping"])
                if mapping_diff:
                    file_name = os.path.join(diff_folder, "mapping.pyobj")
                    dump(mapping_diff, file_name)
                    md5 = md5sum(file_name)
                    summary["mapping_file"] = {
                        "name": os.path.basename(file_name),
                        "md5sum": md5
                    }
            else:
                self.logger.info("Neither '%s' nor '%s' have mappings associated to them, skip" % \
                        (old.target_collection.name,new.target_collection.name))
            return summary

        # Done-callback: validate the mapping diff and record it in metadata.
        def mapping_diffed(f):
            res = f.result()
            if res.get("mapping_file"):
                nonlocal got_error
                # check mapping differences: only "add" ops are allowed, as any others actions would be
                # ignored by ES once applied (you can't update/delete elements of an existing mapping)
                mf = os.path.join(diff_folder, res["mapping_file"]["name"])
                ops = loadobj(mf)
                for op in ops:
                    if op["op"] != "add":
                        err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] + \
                            " only 'add' operations are allowed. You can still produce the " + \
                            "diff by removing 'mapping' from 'steps' arguments. " + \
                            "Ex: steps=['count','content']. Diff operation was: %s" % op)
                        got_error = err
                metadata["diff"]["mapping_file"] = mf
                diff_stats["mapping_changed"] = True
            self.logger.info(
                "Diff file containing mapping differences generated: %s" %
                res.get("mapping_file"))

        pinfo = {
            "category": "diff",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "step": "mapping: old vs new",
            "description": ""
        }
        job = yield from self.job_manager.defer_to_thread(
            pinfo, partial(diff_mapping, old, new, diff_folder))
        job.add_done_callback(mapping_diffed)
        yield from job
        if got_error:
            raise got_error
    if "count" in steps:
        cnt = 0
        pinfo = {
            "category": "diff",
            "step": "count",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "description": ""
        }
        self.logger.info("Counting root keys in '%s'" % new.target_name)
        diff_stats["root_keys"] = {}
        jobs = []
        data_new = id_feeder(new, batch_size=batch_size)
        for id_list in data_new:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt
            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo, partial(diff_worker_count, id_list, new_db_col_names,
                               cnt))
            jobs.append(job)

        # Gather callback: merge per-batch root-key counts into diff_stats.
        def counted(f):
            root_keys = {}
            # merge the counts
            for d in f.result():
                for k in d:
                    root_keys.setdefault(k, 0)
                    root_keys[k] += d[k]
            self.logger.info("root keys count: %s" % root_keys)
            diff_stats["root_keys"] = root_keys

        tasks = asyncio.gather(*jobs)
        tasks.add_done_callback(counted)
        yield from tasks
        self.logger.info(
            "Finished counting keys in the new collection: %s" %
            diff_stats["root_keys"])
    if "content" in steps:
        # NOTE(review): `skip` is assigned but never used in this block.
        skip = 0
        cnt = 0
        jobs = []
        pinfo = {
            "category": "diff",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "step": "content: new vs old",
            "description": ""
        }
        data_new = id_feeder(new, batch_size=batch_size)
        selfcontained = "selfcontained" in self.diff_type
        # Pass 1: new vs old — detect updated and added documents.
        for id_list_new in data_new:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt

            def diffed(f):
                res = f.result()
                diff_stats["update"] += res["update"]
                diff_stats["add"] += res["add"]
                if res.get("diff_file"):
                    metadata["diff"]["files"].append(res["diff_file"])
                self.logger.info("(Updated: {}, Added: {})".format(
                    res["update"], res["add"]))

            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo, partial(diff_worker_new_vs_old, id_list_new,
                               old_db_col_names, new_db_col_names, cnt,
                               diff_folder, self.diff_func, exclude,
                               selfcontained))
            job.add_done_callback(diffed)
            jobs.append(job)
        yield from asyncio.gather(*jobs)
        self.logger.info(
            "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
            .format(diff_stats["update"], diff_stats["add"]))
        data_old = id_feeder(old, batch_size=batch_size)
        jobs = []
        pinfo["step"] = "content: old vs new"
        # Pass 2: old vs new — detect deleted documents.
        for id_list_old in data_old:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt

            def diffed(f):
                res = f.result()
                diff_stats["delete"] += res["delete"]
                if res.get("diff_file"):
                    metadata["diff"]["files"].append(res["diff_file"])
                self.logger.info("(Deleted: {})".format(res["delete"]))

            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo, partial(diff_worker_old_vs_new, id_list_old,
                               new_db_col_names, cnt, diff_folder))
            job.add_done_callback(diffed)
            jobs.append(job)
        yield from asyncio.gather(*jobs)
        self.logger.info(
            "Finished calculating diff for the old collection. Total number of docs deleted: {}"
            .format(diff_stats["delete"]))
    self.logger.info(
        "Summary: (Updated: {}, Added: {}, Deleted: {}, Mapping changed: {})"
        .format(diff_stats["update"], diff_stats["add"],
                diff_stats["delete"], diff_stats["mapping_changed"]))
    # pickle again with potentially more information (diff_stats)
    json.dump(metadata,
              open(os.path.join(diff_folder, "metadata.json"), "w"),
              indent=True)
    strargs = "[old=%s,new=%s,steps=%s,diff_stats=%s]" % (
        old_db_col_names, new_db_col_names, steps, diff_stats)
    self.logger.info("success %s" % strargs, extra={"notify": True})
    return diff_stats
def do_snpeff(self, batch_size=SNPEFF_BATCH_SIZE, force=False, force_use_cache=False):
    """
    Annotate documents of this source's collection with snpEff data.

    Batches of _ids are turned into VCF entries, run through the snpEff
    command-line tool (via SnpeffAnnotator), trimmed when ref/alt sequences
    exceed MAX_REF_ALT_LEN, and upserted into the snpeff target collection.

    force: when False, _ids that already exist in the snpeff temp collection
           are skipped.
    force_use_cache: passed to id_feeder as force_use (cache usage policy —
           semantics defined in id_feeder, TODO confirm).
    """
    self.logger.info("Updating snpeff information from source '%s' (collection:%s)" % (self.fullname,self.collection_name))
    # select Snpeff uploader to get collection name and src_dump _id
    version = self.__class__.__metadata__["assembly"]
    snpeff_class = getattr(snpeff_upload,"Snpeff%sUploader" % version.capitalize())
    snpeff_main_source = snpeff_class.main_source
    snpeff_doc = self.src_dump.find_one({"_id" : snpeff_main_source})
    assert snpeff_doc, "No snpeff information found, has it been dumped & uploaded ?"
    snpeff_dir = snpeff_doc["data_folder"]
    # -q: when there's an update, there's a message on stderr....
    cmd = "java -Xmx4g -jar %s/snpEff/snpEff.jar -t -noStats -noExpandIUB %s" % (snpeff_dir,version)
    # genome files are in "data_folder"/../data
    genomes = glob.glob(os.path.join(snpeff_dir,"..","data","%s_genome.*" % version))
    assert len(genomes) == 1, "Expected only one genome files for '%s', got: %s" % (version,genomes)
    genome = genomes[0]
    annotator = snpeff_parser.SnpeffAnnotator(cmd,logger=self.logger)
    vcf_builder = snpeff_parser.VCFConstruct(genome,logger=self.logger)
    storage = UpsertStorage(None,snpeff_class.name,self.logger)
    col = self.db[self.collection_name]
    # NOTE(review): col.count() is deprecated/removed in recent pymongo
    # (count_documents is the replacement) — depends on pinned version.
    total = math.ceil(col.count()/batch_size)
    cnt = 0
    to_process = []

    # Annotate one accumulated batch of _ids and upsert the results.
    def process(ids):
        self.logger.info("%d documents to annotate" % len(ids))
        hgvs_vcfs = vcf_builder.build_vcfs(ids)
        # merge "vcf" and snpeff annotations keys when possible
        # (if no snpeff data, we keep 'vcf' data)
        for annot in annotator.annotate(hgvs_vcfs):
            hgvs_vcfs[annot["_id"]].update(annot)
        # trim if sequence is too big
        for _id in hgvs_vcfs:
            vcf = hgvs_vcfs[_id]
            for k in ["alt","ref"]:
                if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                    msg = "...(trimmed)"
                    vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
            hgvs_vcfs[_id] = vcf
        data = annotate_start_end(hgvs_vcfs,version)
        howmany = storage.process(data, batch_size)
        if howmany:
            # we need to update some metadata info about snpeff b/c data has changed
            # so cache could be invalid
            self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
            mongo.invalidate_cache(snpeff_class.name)

    for ids in id_feeder(col, batch_size=batch_size, logger=self.logger, force_use=force_use_cache):
        cnt += 1
        self.logger.debug("Processing batch %s/%s [%.1f]" % (cnt,total,(cnt/total*100)))
        # don't re-compute annotations if already there
        if not force:
            for subids in iter_n(ids,10000):
                cur = storage.temp_collection.find({'_id' : {'$in' : subids}},{'_id':1})
                already_ids = [d["_id"] for d in list(cur)]
                newids = list(set(subids).difference(set(already_ids)))
                if len(subids) != len(newids):
                    self.logger.debug("%d documents already have snpeff annotations, skip them" % \
                            (len(subids) - len(newids)))
                to_process.extend(newids)
                self.logger.debug("Batch filled %d out of %d" % (len(to_process),batch_size))
                if not (len(to_process) >= batch_size):
                    # can fill more...
                    continue
                process(to_process)
                to_process = []
        else:
            # NOTE(review): with force=True, to_process is *overwritten* on
            # every iteration, so only the last batch from id_feeder reaches
            # process() (via the remainder call below). Looks like it should
            # be extend + flush — verify intent.
            to_process = ids
    # for potential remainings
    if to_process:
        process(to_process)