Example #1
 def test_smalldoc(self):
     left = {'a': [9, 8, 3], 'b': 'B', 'c': {'1': 1, '2': 2, '3': 3}}
     right = {'c': {'5': 5, '4': 4, '1': 1}, 'B': 'capitalB', 'a': [1, 2, 3, 4, 5], 'b': 'bbb'}
     patch = jsondiff.make(left, right)
     new_right = jsonpatch.apply_patch(left, patch)
     new_new_right = jsonpatch.apply_patch(new_right, patch, ignore_conflicts=True, verify=True)
     eq_(right, new_new_right)
Example #2
 def test_object(self):
     left = {"c": {"1": 1, "2": 2, "3": 3}}
     right = {"c": {"1": 1, "4": 4, "5": 5}}
     patch = jsondiff.make(left, right)
     new_right = jsonpatch.apply_patch(left, patch)
     eq_(right, new_right)
     # patch contains "add" and "remove" ops, so it cannot be re-patched that easily...
     # use ignore and verify
     new_new_right = jsonpatch.apply_patch(new_right, patch, ignore_conflicts=True, verify=True)
     eq_(right, new_new_right)
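
For reference, assuming jsondiff.make emits standard RFC 6902-style operations (as the "add"/"remove" comment above suggests), the patch for the Example #2 objects would look roughly like the hypothetical sketch below; the exact output is not verified here.

# Hypothetical patch shape for left -> right above (illustration only)
patch = [
    {"op": "remove", "path": "/c/2"},
    {"op": "remove", "path": "/c/3"},
    {"op": "add", "path": "/c/4", "value": 4},
    {"op": "add", "path": "/c/5", "value": 5},
]
# Under standard JSON Patch semantics this turns left into right, but
# re-applying it to right would conflict without ignore_conflicts=True.
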
Example #3
 def test_scalar(self):
     left = {"one": 1, "ONE": "111"}
     right = {"two": 2, "TWO": "222"}
     patch = jsondiff.make(left, right)
     new_right = jsonpatch.apply_patch(left, patch)
     eq_(right, new_right)
     # do it again, it's a "remove"/"add" op, so we need to ignore
     # conflicts but make sure the result is the one we expect
     new_new_right = jsonpatch.apply_patch(new_right, patch, ignore_conflicts=True, verify=True)
     eq_(right, new_new_right)
Example #4
 def test_array(self):
     left = {"a": [1, 2, 3]}
     right = {"a": [1, 2, 3, 4, 5]}
     patch = jsondiff.make(left, right)
     new_right = jsonpatch.apply_patch(left, patch)
     assert right == new_right
     # do it again, it's a "replace" op so it can be re-patched safely
     new_new_right = jsonpatch.apply_patch(new_right, patch)
     assert right == new_new_right
     # smaller list on right
     left = {"a": [1, 2, 3, 4, 5]}
     right = {"a": [6, 7]}
     patch = jsondiff.make(left, right)
     new_right = jsonpatch.apply_patch(left, patch)
     assert right == new_right
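
The first half of Example #4 notes that the array diff is a "replace" op, which is why applying the patch twice is safe; a hypothetical patch of that shape is idempotent under plain JSON Patch semantics:

# Illustration only: a whole-list "replace" can be re-applied safely
patch = [{"op": "replace", "path": "/a", "value": [1, 2, 3, 4, 5]}]
doc = {"a": [1, 2, 3]}
once = jsonpatch.apply_patch(doc, patch)
twice = jsonpatch.apply_patch(once, patch)
assert once == twice == {"a": [1, 2, 3, 4, 5]}
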
Example #5
def sync_es_for_update(indexer, diffupdates, batch_size, res):
    batch = []
    ids = [p["_id"] for p in diffupdates]
    iterids_bcnt = iter_n(ids, batch_size, True)
    for batchids, bcnt in iterids_bcnt:
        for i, doc in enumerate(indexer.get_docs(batchids)):
            # recompute correct index in diff["update"], since we split it in batches
            diffidx = i + bcnt - len(batchids)  # len(batchids) may not equal batch_size for the last batch...
            try:
                patch_info = diffupdates[diffidx]  # same order as what's returned by get_docs()...
                assert patch_info["_id"] == doc["_id"], "%s != %s" % (
                    patch_info["_id"], doc["_id"])  # ... but just make sure
                newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
                if newdoc == doc:
                    # already applied
                    logging.warning("_id '%s' already synced" % doc["_id"])
                    res["skipped"] += 1
                    continue
                batch.append(newdoc)
            except jsonpatch.JsonPatchConflict as e:
                # assuming already applied
                logging.warning("_id '%s' already synced? JsonPatchConflict: %s"
                                % (doc["_id"], e))
                res["skipped"] += 1
                continue
            if len(batch) >= batch_size:
                res["updated"] += indexer.index_bulk(batch, batch_size)[0]
                batch = []
        if batch:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
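
The diffidx arithmetic in Example #5 only works if iter_n(ids, batch_size, True) yields each batch together with a cumulative count. Below is a minimal sketch of such a helper, written here as an assumption for illustration rather than the actual biothings implementation:

# iter_n_sketch: with with_cnt=True, yields (chunk, cumulative_count), so that
# i + bcnt - len(batchids) recovers the absolute index of a doc in `ids`
def iter_n_sketch(seq, n, with_cnt=False):
    cnt = 0
    for start in range(0, len(seq), n):
        chunk = seq[start:start + n]
        cnt += len(chunk)
        yield (chunk, cnt) if with_cnt else chunk

# list(iter_n_sketch([10, 20, 30, 40, 50], 2, True))
# -> [([10, 20], 2), ([30, 40], 4), ([50], 5)]
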
Example #6
 def test_bigdoc(self):
     v2_path = join(dirname(__file__), "v2.json")
     v3_path = join(dirname(__file__), "v3.json")
     v2 = json.load(open(v2_path))
     v3 = json.load(open(v3_path))
     patch = jsondiff.make(v2, v3)
     new_v3 = jsonpatch.apply_patch(v2, patch)
     assert v3 == new_v3
Example #7
 def update_mapping():
     diffm = os.path.join(diff_folder, diff_mapping_file)
     ops = loadobj(diffm)
     mapping = indexer.get_mapping()
     # we should have the same doc type declared in the mapping
     mapping[doc_type]["properties"] = jsonpatch.apply_patch(
         mapping[doc_type]["properties"], ops)
     res = indexer.update_mapping(mapping)
     return res
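
Example #7 patches an Elasticsearch mapping's "properties" with ops loaded from a diff mapping file. Assuming those ops follow the same RFC 6902 convention as the document patches, a hypothetical input (field names and types invented for illustration) could look like this:

# Hypothetical ops adding one field mapping and removing another
ops = [
    {"op": "add", "path": "/new_field", "value": {"type": "keyword"}},
    {"op": "remove", "path": "/obsolete_field"},
]
properties = {"obsolete_field": {"type": "integer"}, "kept_field": {"type": "text"}}
properties = jsonpatch.apply_patch(properties, ops)
# -> {"kept_field": {"type": "text"}, "new_field": {"type": "keyword"}}
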
Example #8
def sync_es_for_update(diff_file, indexer, diffupdates, batch_size, res,
                       debug):
    batch = []
    ids = [p["_id"] for p in diffupdates]
    iterids_bcnt = iter_n(ids, batch_size, True)
    for batchids, bcnt in iterids_bcnt:
        try:
            for i, doc in enumerate(indexer.get_docs(batchids)):
                # recompute correct index in diff["update"], since we split it in batches
                diffidx = i + bcnt - len(batchids)  # len(batchids) may not equal batch_size for the last batch...
                try:
                    patch_info = diffupdates[diffidx]  # same order as what's returned by get_docs()...
                    assert patch_info["_id"] == doc["_id"], "%s != %s" % (
                        patch_info["_id"], doc["_id"]
                    )  # ... but just make sure
                    newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
                    if newdoc == doc:
                        # already applied
                        logging.warning("_id '%s' already synced" % doc["_id"])
                        res["skipped"] += 1
                        continue
                    batch.append(newdoc)
                except jsonpatch.JsonPatchConflict as e:
                    # assuming already applied
                    logging.warning("_id '%s' already synced? JsonPatchConflict: %s"
                                    % (doc["_id"], e))
                    res["skipped"] += 1
                    continue
                if len(batch) >= batch_size:
                    res["updated"] += indexer.index_bulk(batch, batch_size)[0]
                    batch = []
            if batch:
                res["updated"] += indexer.index_bulk(batch, batch_size)[0]
        except Exception as e:
            if debug:
                logging.error(
                    "From diff file '%s', %d IDs couldn't be synced because: %s"
                    % (diff_file, len(batchids), e))
                pickfile = "batch_sync_updater_%s_%s.pickle" % (
                    bcnt, os.path.basename(diff_file))
                logging.error("IDs pickled in '%s'" % pickfile)
                pickle.dump(batchids, open(pickfile, "wb"))
            raise
Example #9
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={},
                               debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name,
                            logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process/doc_iterator would require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
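
From the way Example #9 consumes diff_file, the pickled diff object presumably has roughly the following shape (field names taken from the code, values invented for illustration):

# Sketch of the diff structure implied by the worker above
diff = {
    "source": "new_collection_name",   # must match the "new" backend's collection name
    "add": ["id1", "id2"],             # _ids to fetch from "new" (full docs when selfcontained)
    "delete": ["id3"],                 # _ids to remove from "old"
    "update": [                        # per-document JSON patches applied to "old" docs
        {"_id": "id4", "patch": [{"op": "replace", "path": "/field", "value": "newval"}]},
    ],
}
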
Example #10
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    if not force and diff.get("synced", {}).get("es") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs,
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({
                        "_id": doc["_id"],
                        "file": diff_file,
                        "error": e
                    })
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise

    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
Example #11
 def test_bigdoc(self):
     v2 = json.load(open("v2.json"))
     v3 = json.load(open("v3.json"))
     patch = jsondiff.make(v2, v3)
     new_v3 = jsonpatch.apply_patch(v2, patch)
     eq_(v3, new_v3)