def test_smalldoc(self):
    left = {'a': [9, 8, 3], 'b': 'B', 'c': {'1': 1, '2': 2, '3': 3}}
    right = {'c': {'5': 5, '4': 4, '1': 1}, 'B': 'capitalB', 'a': [1, 2, 3, 4, 5], 'b': 'bbb'}
    patch = jsondiff.make(left, right)
    new_right = jsonpatch.apply_patch(left, patch)
    new_new_right = jsonpatch.apply_patch(new_right, patch, ignore_conflicts=True, verify=True)
    eq_(right, new_new_right)
def test_object(self):
    left = {"c": {"1": 1, "2": 2, "3": 3}}
    right = {"c": {"1": 1, "4": 4, "5": 5}}
    patch = jsondiff.make(left, right)
    new_right = jsonpatch.apply_patch(left, patch)
    eq_(right, new_right)
    # patch contains "add" and "remove" ops, so it cannot be re-patched that easily...
    # use ignore_conflicts and verify
    new_new_right = jsonpatch.apply_patch(new_right, patch, ignore_conflicts=True, verify=True)
    eq_(right, new_new_right)
def test_scalar(self):
    left = {"one": 1, "ONE": "111"}
    right = {"two": 2, "TWO": "222"}
    patch = jsondiff.make(left, right)
    new_right = jsonpatch.apply_patch(left, patch)
    eq_(right, new_right)
    # do it again: it's a "remove"/"add" op, so we need to ignore
    # conflicts but make sure the result is the one we expect
    new_new_right = jsonpatch.apply_patch(new_right, patch, ignore_conflicts=True, verify=True)
    eq_(right, new_new_right)
def test_array(self):
    left = {"a": [1, 2, 3]}
    right = {"a": [1, 2, 3, 4, 5]}
    patch = jsondiff.make(left, right)
    new_right = jsonpatch.apply_patch(left, patch)
    assert right == new_right
    # do it again, it's a "replace" op so it can be re-patched safely
    new_new_right = jsonpatch.apply_patch(new_right, patch)
    assert right == new_new_right
    # smaller list on right
    left = {"a": [1, 2, 3, 4, 5]}
    right = {"a": [6, 7]}
    patch = jsondiff.make(left, right)
    new_right = jsonpatch.apply_patch(left, patch)
    assert right == new_right
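The tests above go through a project-local jsondiff/jsonpatch pair; the ignore_conflicts and verify keywords are assumed to belong to that local apply_patch variant rather than to the standard package. As a minimal standalone sketch of the same diff-then-patch round trip, here is the equivalent using the stock python-json-patch API (make_patch/apply_patch); values are illustrative only.

import jsonpatch

def roundtrip_sketch():
    left = {"a": [1, 2, 3], "b": "B"}
    right = {"a": [1, 2, 3, 4], "b": "bbb", "c": 1}
    patch = jsonpatch.make_patch(left, right)            # list of RFC 6902 operations
    assert jsonpatch.apply_patch(left, patch) == right   # returns a new doc, left is untouched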
def sync_es_for_update(indexer, diffupdates, batch_size, res):
    batch = []
    ids = [p["_id"] for p in diffupdates]
    iterids_bcnt = iter_n(ids, batch_size, True)
    for batchids, bcnt in iterids_bcnt:
        for i, doc in enumerate(indexer.get_docs(batchids)):
            # recompute correct index in diff["update"], since we split it in batches
            diffidx = i + bcnt - len(batchids)  # len(batchids) is not == batch_size for the last one...
            try:
                patch_info = diffupdates[diffidx]  # same order as what's returned by get_docs()...
                assert patch_info["_id"] == doc["_id"], "%s != %s" % (
                    patch_info["_id"], doc["_id"])  # ... but just make sure
                newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
                if newdoc == doc:
                    # already applied
                    logging.warning("_id '%s' already synced" % doc["_id"])
                    res["skipped"] += 1
                    continue
                batch.append(newdoc)
            except jsonpatch.JsonPatchConflict as e:
                # assuming already applied
                logging.warning("_id '%s' already synced? JsonPatchError: %s" % (doc["_id"], e))
                res["skipped"] += 1
                continue
            if len(batch) >= batch_size:
                res["updated"] += indexer.index_bulk(batch, batch_size)[0]
                batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]
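A minimal sketch of the index bookkeeping used above. It assumes iter_n(ids, batch_size, True) yields (chunk, cumulative_count) pairs where the count includes the current chunk, which is what the diffidx arithmetic implies; iter_n_sketch below is a stand-in for illustration only.

def iter_n_sketch(seq, n):
    # stand-in for iter_n(seq, n, True): yields (chunk, cumulative count so far)
    total = 0
    for start in range(0, len(seq), n):
        chunk = seq[start:start + n]
        total += len(chunk)
        yield chunk, total

ids = ["doc%d" % i for i in range(7)]
for batchids, bcnt in iter_n_sketch(ids, 3):
    for i, _id in enumerate(batchids):
        # bcnt - len(batchids) is the offset of the current chunk, so diffidx
        # is the absolute position of this doc in the full diffupdates list
        diffidx = i + bcnt - len(batchids)
        assert ids[diffidx] == _id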
def test_bigdoc(self):
    v2_path = join(dirname(__file__), "v2.json")
    v3_path = join(dirname(__file__), "v3.json")
    v2 = json.load(open(v2_path))
    v3 = json.load(open(v3_path))
    patch = jsondiff.make(v2, v3)
    new_v3 = jsonpatch.apply_patch(v2, patch)
    assert v3 == new_v3
def update_mapping():
    diffm = os.path.join(diff_folder, diff_mapping_file)
    ops = loadobj(diffm)
    mapping = indexer.get_mapping()
    # we should have the same doc type declared in the mapping
    mapping[doc_type]["properties"] = jsonpatch.apply_patch(
        mapping[doc_type]["properties"], ops)
    res = indexer.update_mapping(mapping)
    return res
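A hypothetical illustration of the mapping patch step, assuming the ops loaded from the diff file form a plain RFC 6902 operation list targeting the mapping's "properties" sub-document (field names and types below are made up):

current_properties = {"gene": {"type": "keyword"}}
ops = [
    {"op": "add", "path": "/taxid", "value": {"type": "integer"}},  # new field
    {"op": "remove", "path": "/gene"},                              # dropped field
]
patched = jsonpatch.apply_patch(current_properties, ops)
# patched == {"taxid": {"type": "integer"}}; this is what gets written
# back into mapping[doc_type]["properties"] before update_mapping()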
def sync_es_for_update(diff_file, indexer, diffupdates, batch_size, res, debug):
    batch = []
    ids = [p["_id"] for p in diffupdates]
    iterids_bcnt = iter_n(ids, batch_size, True)
    for batchids, bcnt in iterids_bcnt:
        try:
            for i, doc in enumerate(indexer.get_docs(batchids)):
                # recompute correct index in diff["update"], since we split it in batches
                diffidx = i + bcnt - len(batchids)  # len(batchids) is not == batch_size for the last one...
                try:
                    patch_info = diffupdates[diffidx]  # same order as what's returned by get_docs()...
                    assert patch_info["_id"] == doc["_id"], "%s != %s" % (
                        patch_info["_id"], doc["_id"])  # ... but just make sure
                    newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
                    if newdoc == doc:
                        # already applied
                        logging.warning("_id '%s' already synced" % doc["_id"])
                        res["skipped"] += 1
                        continue
                    batch.append(newdoc)
                except jsonpatch.JsonPatchConflict as e:
                    # assuming already applied
                    logging.warning("_id '%s' already synced? JsonPatchError: %s" % (doc["_id"], e))
                    res["skipped"] += 1
                    continue
                if len(batch) >= batch_size:
                    res["updated"] += indexer.index_bulk(batch, batch_size)[0]
                    batch = []
            if batch:
                res["updated"] += indexer.index_bulk(batch, batch_size)[0]
        except Exception as e:
            if debug:
                logging.error(
                    "From diff file '%s', %d IDs couldn't be synced because: %s" %
                    (diff_file, len(batchids), e))
                pickfile = "batch_sync_updater_%s_%s.pickle" % (
                    bcnt, os.path.basename(diff_file))
                logging.error("IDs pickled in '%s'" % pickfile)
                pickle.dump(batchids, open(pickfile, "wb"))
            raise
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={},
                               debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use generator otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)
    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)
    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    if not force and diff.get("synced", {}).get("es") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": doc["_id"], "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise
    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
def test_bigdoc(self):
    v2 = json.load(open("v2.json"))
    v3 = json.load(open("v3.json"))
    patch = jsondiff.make(v2, v3)
    new_v3 = jsonpatch.apply_patch(v2, patch)
    eq_(v3, new_v3)