def update2(self, id_patchs, collection, source_collection):
    from utils import backend
    _es = backend.GeneDocESBackend(self._esi)
    _db = backend.GeneDocMongoDBBackend(source_collection)
    for _id_chunk in iter_n(id_patchs, 100):
        es_docs = _es.mget_from_ids(_id_chunk, step=100)
        db_docs = _db.mget_from_ids(_id_chunk)
        es_docs = dict([(doc['_id'], doc) for doc in es_docs])
        db_docs = dict([(doc['_id'], doc) for doc in db_docs])
        for _id in es_docs:
            doc_es = es_docs[_id]
            doc_mongo = db_docs.get(_id, None)
            if doc_mongo:
                doc_es[collection] = doc_mongo[collection]
            else:
                print('id does not exist in mongodb collection:', _id)
            doc_es.pop('_id', None)
            es_info = {
                '_op_type': 'index',
                '_index': self._index,
                '_type': self._doc_type,
                '_id': _id,
                '_source': doc_es,
            }
            yield es_info
def add(self, collection, ids):
    # compare the id list with the current index, getting a true/false existence indicator per id
    cnt_update = 0
    cnt_create = 0
    for ids_chunk in iter_n(ids, 100):
        id_list_all = self._esi.mexists(ids_chunk, verbose=False)
        for _id, _exists in id_list_all:
            _doc = self._src[collection].find_one({'_id': _id})
            _doc.pop('_id')
            if _exists:
                # case one: this id already exists in the current index, so just update it
                es_info = {
                    '_op_type': 'update',
                    '_index': self._index,
                    '_type': self._doc_type,
                    '_id': _id,
                    'doc': _doc,
                }
                cnt_update += 1
            else:
                # case two: this id does not exist in the current index, so create a new document
                es_info = {
                    '_op_type': 'create',
                    '_index': self._index,
                    '_type': self._doc_type,
                    '_id': _id,
                    '_source': _doc,
                }
                cnt_create += 1
            yield es_info
    print('items updated: ', cnt_update)
    print('items newly created: ', cnt_create)
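# Usage sketch (an assumption, not part of the original code): update2() and add()
# above are generators yielding Elasticsearch bulk action dicts ('_op_type',
# '_index', '_type', '_id', ...). They would typically be consumed with
# elasticsearch-py's bulk helper. `es_client`, `indexer` and `ids` below are
# hypothetical placeholders for an ES client, an instance of the class these
# methods belong to, and an id list.
from elasticsearch import Elasticsearch, helpers

es_client = Elasticsearch()                        # assumed reachable cluster
actions = indexer.add('genedoc_collection', ids)   # lazy generator of bulk actions
ok, errors = helpers.bulk(es_client, actions)      # streams the generator, no need to materialize it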
def doc_iterator(self, genedoc_d, batch=True, step=10000):
    if isinstance(genedoc_d, types.GeneratorType) and batch:
        for doc_li in iter_n(genedoc_d, n=step):
            yield doc_li
    else:
        if batch:
            doc_li = []
            i = 0
        for _id, doc in genedoc_d.items():
            doc['_id'] = _id
            _doc = copy.copy(self)
            _doc.clear()
            _doc.update(doc)
            # if validate:
            #     _doc.validate()
            if batch:
                doc_li.append(_doc)
                i += 1
                if i % step == 0:
                    yield doc_li
                    doc_li = []
            else:
                yield _doc
        if batch:
            yield doc_li
def set_chrom(self, batch_size, job_manager):
    # divide & conquer... build batches
    jobs = []
    total = self.target_backend.count()
    btotal = math.ceil(total / batch_size)
    bnum = 1
    cnt = 0
    results = {"missing": [], "disagreed": []}
    root_keys = {}
    # grab ids only, so we can fetch more and fill the queue for each step;
    # each round, fill the queue to make sure every cpu slot is always working
    id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
    self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" %
                     (self.target_backend.target_collection.name, id_batch_size, batch_size))
    for big_doc_ids in id_feeder(self.target_backend.target_collection,
                                 batch_size=id_batch_size, logger=self.logger):
        for doc_ids in iter_n(big_doc_ids, batch_size):
            yield from asyncio.sleep(0.1)
            cnt += len(doc_ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = "post-merge (chrom)"
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal, (cnt / total * 100.))
            self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" %
                             (bnum, btotal, cnt, total, (cnt / total * 100.)))
            job = yield from job_manager.defer_to_process(
                pinfo, partial(chrom_worker, self.target_backend.target_name, doc_ids))

            def processed(f, results, batch_num):
                try:
                    fres = f.result()
                    results["missing"].extend(fres["missing"])
                    results["disagreed"].extend(fres["disagreed"])
                    # merge root key counts
                    rk = fres["root_keys"]
                    for k in rk:
                        root_keys.setdefault(k, 0)
                        root_keys[k] += rk[k]
                    self.logger.info("chrom batch #%d, done" % batch_num)
                except Exception as e:
                    import traceback
                    self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" %
                                      (batch_num, e, traceback.format_exc()))
                    raise

            job.add_done_callback(partial(processed, results=results, batch_num=bnum))
            jobs.append(job)
            bnum += 1
    self.logger.info("%d jobs created for merging step" % len(jobs))
    if jobs:
        yield from asyncio.gather(*jobs)
        self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" %
                         (len(results["missing"]), len(results["disagreed"])))
        if results["missing"] or results["disagreed"]:
            fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,
                                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
            pickle.dump(results, open(fn, "wb"))
        # now store metadata
        root_keys["total"] = root_keys.pop("_id")
        self.logger.info("Root keys: %s" % root_keys)
        src_build = self.source_backend.build
        src_build.update({'_id': self.target_backend.target_name},
                         {"$set": {"_meta.stats": root_keys}})
    return results
def _add_docs(ids):
    i = 0
    for _ids in iter_n(ids, step):
        t1 = time.time()
        _doc_li = src.mget_from_ids(_ids)
        for _doc in _doc_li:
            _doc['_timestamp'] = _timestamp
            i += 1
        target.insert(_doc_li)
        print('\t{}\t{}'.format(i, timesofar(t1)))
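# Reference sketch (an assumption): every snippet in this file relies on an iter_n()
# helper that splits any iterable into fixed-size chunks, optionally yielding a
# running count alongside each chunk (post_dump() below passes with_cnt=True).
# A minimal equivalent could look like this; the project's actual helper may differ
# in details such as the chunk type.
import itertools

def iter_n(iterable, n, with_cnt=False):
    it = iter(iterable)
    cnt = 0
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        cnt += len(chunk)
        yield (chunk, cnt) if with_cnt else chunk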
def post_dump(self, *args, **kwargs):
    self.logger.info("Merging JSON documents in '%s'" % self.new_data_folder)
    # we'll merge 100 files together, that's 100,000 documents. That way we don't end up
    # with one huge file, nor with thousands of small ones. We also drop the (now useless) metadata
    parts = glob.iglob(os.path.join(self.new_data_folder, "molecule.part*"))
    for chunk, cnt in iter_n(parts, self.__class__.CHUNK_MERGE_SIZE, with_cnt=True):
        outfile = os.path.join(self.new_data_folder, "molecule.%s.json" % cnt)
        merged_data = {"molecules": []}
        for f in chunk:
            data = json.load(open(f))
            merged_data["molecules"].extend(data["molecules"])
        json.dump(merged_data, open(outfile, "w"))
        self.logger.info("Merged %s files" % cnt)
    # now we can delete the parts
    self.logger.info("Deleting part files")
    parts = glob.iglob(os.path.join(self.new_data_folder, "molecule.part*"))
    for f in parts:
        os.remove(f)
    self.logger.info("Post-dump merge done")
def verify_ids(doc_iter, index, doc_type, step=100000):
    '''Verify how many docs from the input iterator/list overlap with existing docs.'''
    es = get_es()
    q = {'query': {'ids': {"values": []}}}
    total_cnt = 0
    found_cnt = 0
    out = []
    for doc_batch in iter_n(doc_iter, n=step):
        id_li = [doc['_id'] for doc in doc_batch]
        # id_li = [doc['_id'].replace('chr', '') for doc in doc_batch]
        q['query']['ids']['values'] = id_li
        xres = es.search(index=index, doc_type=doc_type, body=q, _source=False)
        found_cnt += xres['hits']['total']
        total_cnt += len(id_li)
        print(xres['hits']['total'], found_cnt, total_cnt)
        out.extend([x['_id'] for x in xres['hits']['hits']])
    return out
def do_snpeff(self, batch_size=SNPEFF_BATCH_SIZE, force=False, force_use_cache=False):
    self.logger.info("Updating snpeff information from source '%s' (collection:%s)" %
                     (self.fullname, self.collection_name))
    # select the Snpeff uploader to get the collection name and src_dump _id
    version = self.__class__.__metadata__["assembly"]
    snpeff_class = getattr(snpeff_upload, "Snpeff%sUploader" % version.capitalize())
    snpeff_main_source = snpeff_class.main_source
    snpeff_doc = self.src_dump.find_one({"_id": snpeff_main_source})
    assert snpeff_doc, "No snpeff information found, has it been dumped & uploaded ?"
    snpeff_dir = snpeff_doc["data_folder"]
    # -q: when there's an update, there's a message on stderr....
    cmd = "java -Xmx4g -jar %s/snpEff/snpEff.jar -t -noStats -noExpandIUB %s" % (snpeff_dir, version)
    # genome files are in "data_folder"/../data
    genomes = glob.glob(os.path.join(snpeff_dir, "..", "data", "%s_genome.*" % version))
    assert len(genomes) == 1, "Expected only one genome file for '%s', got: %s" % (version, genomes)
    genome = genomes[0]
    annotator = snpeff_parser.SnpeffAnnotator(cmd, logger=self.logger)
    vcf_builder = snpeff_parser.VCFConstruct(genome, logger=self.logger)
    storage = UpsertStorage(None, snpeff_class.name, self.logger)
    col = self.db[self.collection_name]
    total = math.ceil(col.count() / batch_size)
    cnt = 0
    to_process = []

    def process(ids):
        self.logger.info("%d documents to annotate" % len(ids))
        hgvs_vcfs = vcf_builder.build_vcfs(ids)
        # merge "vcf" and snpeff annotation keys when possible
        # (if there's no snpeff data, we keep the 'vcf' data)
        for annot in annotator.annotate(hgvs_vcfs):
            hgvs_vcfs[annot["_id"]].update(annot)
        # trim if the sequence is too big
        for _id in hgvs_vcfs:
            vcf = hgvs_vcfs[_id]
            for k in ["alt", "ref"]:
                if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                    msg = "...(trimmed)"
                    vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
            hgvs_vcfs[_id] = vcf
        data = annotate_start_end(hgvs_vcfs, version)
        howmany = storage.process(data, batch_size)
        if howmany:
            # data has changed, so cached metadata about the snpeff collection
            # could now be invalid
            self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
            mongo.invalidate_cache(snpeff_class.name)

    for ids in id_feeder(col, batch_size=batch_size, logger=self.logger, force_use=force_use_cache):
        cnt += 1
        self.logger.debug("Processing batch %s/%s [%.1f]" % (cnt, total, (cnt / total * 100)))
        if not force:
            # don't re-compute annotations if they're already there
            for subids in iter_n(ids, 10000):
                cur = storage.temp_collection.find({'_id': {'$in': subids}}, {'_id': 1})
                already_ids = [d["_id"] for d in list(cur)]
                newids = list(set(subids).difference(set(already_ids)))
                if len(subids) != len(newids):
                    self.logger.debug("%d documents already have snpeff annotations, skip them" %
                                      (len(subids) - len(newids)))
                to_process.extend(newids)
                self.logger.debug("Batch filled %d out of %d" % (len(to_process), batch_size))
                if not (len(to_process) >= batch_size):
                    # can fill more...
                    continue
                process(to_process)
                to_process = []
        else:
            to_process = ids
    # process potential remainders
    if to_process:
        process(to_process)
def id_feeder(col, batch_size=1000, build_cache=True, logger=logging,
              force_use=False, force_build=False):
    """Return an iterator over all _ids in collection "col".

    Search for a valid cache file if available, otherwise fall back to a doc_feeder
    on that collection. A valid cache is a cache file that is newer than the collection.
    "db" can be "target" or "src".
    "build_cache" True will build a cache file as _ids are fetched, if no cache file was found.
    "force_use" True will use any existing cache file and won't check whether it's valid or not.
    "force_build" True will build a new cache even if the current one exists and is valid.
    """
    src_db = get_src_db()
    ts = None
    found_meta = True

    if isinstance(col, DocMongoBackend):
        col = col.target_collection

    try:
        if col.database.name == config.DATA_TARGET_DATABASE:
            info = src_db["src_build"].find_one({"_id": col.name})
            if not info:
                logger.warning("Can't find information for target collection '%s'" % col.name)
            else:
                ts = info["started_at"].timestamp()
        elif col.database.name == config.DATA_SRC_DATABASE:
            info = src_db["src_dump"].find_one({
                "$where": "function() {if(this.upload) {for(var index in this.upload.jobs) {if(this.upload.jobs[index].step == \"%s\") return this;}}}" % col.name
            })
            if not info:
                logger.warning("Can't find information for source collection '%s'" % col.name)
            else:
                ts = info["upload"]["jobs"][col.name]["started_at"].timestamp()
        else:
            logging.warning("Can't find metadata for collection '%s' (not a target, not a source collection)" % col)
            found_meta = False
            build_cache = False
    except KeyError:
        logger.warning("Couldn't find timestamp in database for '%s'" % col.name)
    except Exception as e:
        logger.info("%s is not a mongo collection, _id cache won't be built (error: %s)" % (col, e))
        build_cache = False

    # try to find a cache file
    use_cache = False
    cache_file = None
    cache_format = getattr(config, "CACHE_FORMAT", None)
    if found_meta and getattr(config, "CACHE_FOLDER", None):
        cache_file = get_cache_filename(col.name)
        try:
            # size of an empty file differs depending on compression
            empty_size = {None: 0, "xz": 32, "gzip": 25, "bz2": 14}
            if force_build:
                logger.warning("Force building cache file")
                use_cache = False
            # check size, delete if invalid
            elif os.path.getsize(cache_file) <= empty_size.get(cache_format, 32):
                logger.warning("Cache file exists but is empty, delete it")
                os.remove(cache_file)
            elif force_use:
                use_cache = True
                logger.info("Force using cache file")
            else:
                mt = os.path.getmtime(cache_file)
                if ts and mt >= ts:
                    use_cache = True
                else:
                    logger.info("Cache is too old, discard it")
        except FileNotFoundError:
            pass

    if use_cache:
        logger.debug("Found valid cache file for '%s': %s" % (col.name, cache_file))
        with open_compressed_file(cache_file) as cache_in:
            if cache_format:
                iocache = io.TextIOWrapper(cache_in)
            else:
                iocache = cache_in
            for ids in iter_n(iocache, batch_size):
                yield [_id.strip() for _id in ids if _id.strip()]
    else:
        logger.debug("No cache file found (or invalid) for '%s', use doc_feeder" % col.name)
        cache_out = None
        cache_temp = None
        if getattr(config, "CACHE_FOLDER", None) and config.CACHE_FOLDER and build_cache:
            if not os.path.exists(config.CACHE_FOLDER):
                os.makedirs(config.CACHE_FOLDER)
            cache_temp = "%s._tmp_" % cache_file
            # clean aborted cache file generation
            for tmpcache in glob.glob(os.path.join(config.CACHE_FOLDER, "%s*" % cache_temp)):
                logger.info("Removing aborted cache file '%s'" % tmpcache)
                os.remove(tmpcache)
            # use a temp file and rename once done
            cache_temp = "%s%s" % (cache_temp, get_random_string())
            cache_out = get_compressed_outfile(cache_temp, compress=cache_format)
            logger.info("Building cache file '%s'" % cache_temp)
        else:
            logger.info("Can't build cache, cache not allowed or no cache folder")
            build_cache = False

        if isinstance(col, Collection):
            doc_feeder_func = partial(doc_feeder, col, step=batch_size, inbatch=True, fields={"_id": 1})
        elif isinstance(col, DocMongoBackend):
            doc_feeder_func = partial(doc_feeder, col.target_collection, step=batch_size, inbatch=True, fields={"_id": 1})
        elif isinstance(col, DocESBackend):
            # get_id_list directly returns the _id; wrap it to match what the other
            # doc_feeder_func variants return, i.e. yield batches of {"_id": ...} dicts
            def wrap_id():
                ids = []
                for _id in col.get_id_list(step=batch_size):
                    ids.append({"_id": _id})
                    if len(ids) >= batch_size:
                        yield ids
                        ids = []
                if ids:
                    yield ids
            doc_feeder_func = partial(wrap_id)
        else:
            raise Exception("Unknown backend %s" % col)

        for doc_ids in doc_feeder_func():
            doc_ids = [_doc["_id"] for _doc in doc_ids]
            if build_cache:
                strout = "\n".join(doc_ids) + "\n"
                if cache_format:
                    # assuming binary format (b/c compressed)
                    cache_out.write(strout.encode())
                else:
                    cache_out.write(strout)
            yield doc_ids

        if build_cache:
            cache_out.close()
            cache_final = os.path.splitext(cache_temp)[0]
            os.rename(cache_temp, cache_final)
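# Usage sketch (an assumption, not part of the original module): id_feeder() is a
# generator yielding batches of _id strings, so callers loop over batches rather
# than individual ids, optionally re-splitting them with iter_n() as set_chrom()
# does above. `my_collection` and `do_something_with` are hypothetical placeholders.
for big_batch in id_feeder(my_collection, batch_size=10000):
    for ids in iter_n(big_batch, 1000):
        # each `ids` is a list of up to 1000 _id strings
        do_something_with(ids)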