def process(ids):
    self.logger.info("%d documents to annotate" % len(ids))
    hgvs_vcfs = vcf_builder.build_vcfs(ids)
    # merge "vcf" and snpeff annotation keys when possible
    # (if no snpeff data, we keep the 'vcf' data)
    for annot in annotator.annotate(hgvs_vcfs):
        hgvs_vcfs[annot["_id"]].update(annot)
    # trim if sequence is too big
    for _id in hgvs_vcfs:
        vcf = hgvs_vcfs[_id]
        for k in ["alt", "ref"]:
            if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                msg = "...(trimmed)"
                vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
        hgvs_vcfs[_id] = vcf
    data = annotate_start_end(hgvs_vcfs, version)
    howmany = storage.process(data, batch_size)
    if howmany:
        # we need to update some metadata info about snpeff because data has changed,
        # so the cache could be invalid
        self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
        mongo.invalidate_cache(snpeff_class.name)
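# The ref/alt trimming above caps stored sequences at MAX_REF_ALT_LEN characters,
# marker included. Minimal standalone sketch of that rule; the default length and
# the trim_seq helper are illustrative, not part of the original code.
def trim_seq(seq, max_len=20, msg="...(trimmed)"):
    """Return `seq` truncated so the result, marker included, is at most `max_len` chars."""
    if len(seq) > max_len:
        return seq[:max_len - len(msg)] + msg
    return seq

# trim_seq("A" * 50) -> 'AAAAAAAA...(trimmed)' (exactly 20 characters)
# trim_seq("ACGT")   -> 'ACGT' (unchanged)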
def process(ids, bnum):
    self.logger.info("%d documents to annotate" % len(ids))
    hgvs_vcfs = vcf_builder.build_vcfs(ids)
    # merge "vcf" and snpeff annotation keys when possible
    # (if no snpeff data, we keep the 'vcf' data)
    for annot in annotator.annotate(hgvs_vcfs):
        hgvs_vcfs[annot["_id"]].update(annot)
    # trim if sequence is too big
    for _id in hgvs_vcfs:
        vcf = hgvs_vcfs[_id]
        for k in ["alt", "ref"]:
            if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                msg = "...(trimmed)"
                vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
        hgvs_vcfs[_id] = vcf
    data = annotate_start_end(hgvs_vcfs, version)
    try:
        howmany = storage.process(data, batch_size)
    except Exception:
        # batch failed: rebuild the data (it was a generator, we don't know where
        # it stopped) and retry documents one by one
        data = annotate_start_end(hgvs_vcfs, version)
        howmany = 0
        for doc in data:
            try:
                howmany += storage.process([doc], batch_size=1)
            except Exception as e:
                self.logger.exception(
                    "Couldn't annotate document _id '%s', skip it: %s" % (doc["_id"], e))
    if howmany:
        # we need to update some metadata info about snpeff because data has changed,
        # so the cache could be invalid
        self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
        mongo.invalidate_cache(snpeff_class.name)
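# The try/except above is a batch-then-retry pattern: `data` is a generator, so a
# failed bulk insert leaves it partially consumed; the batch is therefore rebuilt
# and documents are retried one by one, skipping only those that still fail.
# Self-contained sketch of the same pattern; `build_batch` and `store` are
# hypothetical stand-ins for annotate_start_end(...) and storage.process(...).
import logging

def store_with_fallback(build_batch, store, logger=logging.getLogger(__name__)):
    """Store a whole batch if possible, otherwise rebuild it and store doc by doc.

    `build_batch` is a zero-argument callable returning a fresh iterable of docs;
    `store` takes a list of docs and returns how many were stored.
    """
    try:
        return store(list(build_batch()))
    except Exception:
        stored = 0
        for doc in build_batch():  # rebuild: the first iterable may be exhausted
            try:
                stored += store([doc])
            except Exception as e:
                logger.exception("Couldn't store document _id '%s', skip it: %s",
                                 doc.get("_id"), e)
        return stored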
def sync_mongo_jsondiff_worker(diff_file, old_db_col_names, new_db_col_names,
                               batch_size, cnt, force=False, selfcontained=False,
                               metadata={}, debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if the diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains whole documents, no mongo query needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process/doc_iterator would require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)
    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming the patch was already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)
    # we potentially modified the "old" collection, so invalidate the cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
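# The update step relies on the jsonpatch package: applying a patch returns a new
# document with the RFC 6902 operations applied, and jsonpatch.JsonPatchConflict is
# raised when a patch no longer matches the document (which the worker above treats
# as "already applied" and counts as skipped).
# Small illustration; the documents below are made up for the example.
import jsonpatch

def _jsonpatch_example():
    old_doc = {"_id": "chr1:g.12345A>T", "cadd": {"phred": 10.0}}
    new_doc = {"_id": "chr1:g.12345A>T", "cadd": {"phred": 12.5}, "snpeff": {"ann": []}}
    # build an RFC 6902 patch describing how to go from old_doc to new_doc
    patch = jsonpatch.make_patch(old_doc, new_doc)
    # applying it to the old document yields the new one
    assert patch.apply(old_doc) == new_doc
    # a patch that no longer applies (e.g. removing a missing key) raises a conflict
    try:
        jsonpatch.apply_patch({}, [{"op": "remove", "path": "/snpeff"}])
    except jsonpatch.JsonPatchConflict:
        print("patch no longer applies, skipping")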