Example #1
        def analyze(diff_file, detailed):
            data = loadobj(diff_file)
            sources[data["source"]] = 1
            if detailed:
                # TODO: if self-contained, no db connection needed
                new_col = create_backend(metadata["new"]["backend"])
                old_col = create_backend(metadata["old"]["backend"])
            if len(adds["ids"]) < max_reported_ids:
                if detailed:
                    # look for which root keys were added in new collection
                    for _id in data["add"]:
                        # selfcontained = dict for whole doc (see TODO above)
                        if type(_id) == dict:
                            _id = _id["_id"]
                        doc = new_col.get_from_id(_id)
                        rkeys = sorted(doc.keys())
                        adds["ids"].append([_id, rkeys])
                else:
                    if data["add"] and type(data["add"][0]) == dict:
                        adds["ids"].extend([d["_id"] for d in data["add"]])
                    else:
                        adds["ids"].extend(data["add"])
            adds["count"] += len(data["add"])
            if len(dels["ids"]) < max_reported_ids:
                if detailed:
                    # look for which root keys were deleted in old collection
                    for _id in data["delete"]:
                        doc = old_col.get_from_id(_id)
                        rkeys = sorted(doc.keys())
                        dels["ids"].append([_id, rkeys])
                else:
                    dels["ids"].extend(data["delete"])
            dels["count"] += len(data["delete"])
            for up in data["update"]:
                for patch in up["patch"]:
                    details = update_details[patch["op"]].setdefault(
                        patch["path"], {"count": 0, "ids": []})
                    if len(details["ids"]) < max_reported_ids:
                        details["ids"].append(up["_id"])
                    details["count"] += 1
            update_details["count"] += len(data["update"])

            assert len(sources) == 1, \
                "Should have one datasource from diff files, got: %s" % list(sources)
Example #2
def diff_worker_old_vs_new(id_list_old, new_db_col_names, batch_num,
                           diff_folder):
    new = create_backend(new_db_col_names)
    docs_common = new.mget_from_ids(id_list_old)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_old = list(set(id_list_old) - set(ids_common))
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'delete': id_in_old,
        'add': [],
        'update': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    summary = {"add": 0, "update": 0, "delete": len(id_in_old)}
    if len(id_in_old) != 0:
        dump(_result, file_name)
        # compute md5 so users can check integrity once downloaded
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }

    return summary
Example #3
def diff_worker_count(id_list, db_col_names, batch_num):
    col = create_backend(db_col_names)
    docs = col.mget_from_ids(id_list)
    res = {}
    for doc in docs:
        for k in doc:
            res.setdefault(k, 0)
            res[k] += 1
    return res
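Each call to diff_worker_count() returns root-key counts for one batch of documents; the counted() callback in Example #8 merges them by summing per key. A minimal, self-contained sketch of that merge step (the per-batch results below are made up):

from collections import Counter

# Hypothetical per-batch results as returned by diff_worker_count()
batch_results = [
    {"_id": 1000, "symbol": 980, "taxid": 1000},
    {"_id": 500, "symbol": 490},
]

root_keys = Counter()
for res in batch_results:
    root_keys.update(res)          # sum counts key by key
print(dict(root_keys))             # {'_id': 1500, 'symbol': 1470, 'taxid': 1000}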
Example #4
def diff_worker_new_vs_old(id_list_new,
                           old_db_col_names,
                           new_db_col_names,
                           batch_num,
                           diff_folder,
                           diff_func,
                           exclude=[],
                           selfcontained=False):
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    docs_common = old.mget_from_ids(id_list_new)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_new = list(set(id_list_new) - set(ids_common))
    _updates = []
    if len(ids_common) > 0:
        _updates = diff_func(old, new, list(ids_common), exclude_attrs=exclude)
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'add': id_in_new,
        'update': _updates,
        'delete': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    if selfcontained:
        _result["add"] = new.mget_from_ids(id_in_new)
    summary = {"add": len(id_in_new), "update": len(_updates), "delete": 0}
    if len(_updates) != 0 or len(id_in_new) != 0:
        dump(_result, file_name)
        # compute md5 so users can check integrity once downloaded
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }

    return summary
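In Example #8, diff_cols() runs this worker once per batch of ids taken from the new collection and merges the returned summaries into diff_stats. A hedged sketch of a single direct invocation, where backend names, ids, the folder path and my_jsondiff_func are all placeholders (the diff function is only assumed to follow the signature used above):

# Hypothetical direct call; in practice diff_cols() defers this to a process pool.
summary = diff_worker_new_vs_old(
    id_list_new=["id1", "id2", "id3"],      # one batch of ids from the new collection
    old_db_col_names="old_collection",      # any create_backend() notation
    new_db_col_names="new_collection",
    batch_num=1,
    diff_folder="/tmp/diff_old_vs_new",     # placeholder path
    diff_func=my_jsondiff_func,             # assumed signature: f(old, new, ids, exclude_attrs=[])
    exclude=["_timestamp"],
    selfcontained=False)
# summary -> {"add": ..., "update": ..., "delete": 0} plus a "diff_file" entry when something changed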
Example #5
    def sync_cols(self,
                  diff_folder,
                  batch_size=10000,
                  mode=None,
                  force=False,
                  target_backend=None,
                  steps=["mapping", "content", "meta"]):
        """
        Sync a collection with diff files located in diff_folder. This folder contains a metadata.json file which
        describes the collections involved: "old" is the collection/index to be synced, "new" is the collection
        that should be obtained once all diff files are applied (not used, just informative).
        If target_backend is specified (using bt.databuild.backend.create_backend() notation), it replaces "old"
        (that is, the collection/index being synced).
        """
        got_error = False
        cnt = 0
        jobs = []
        meta = json.load(open(os.path.join(diff_folder, "metadata.json")))
        diff_type = self.diff_type
        selfcontained = "selfcontained" in meta["diff"]["type"]
        # first try to use what's been passed explicitly,
        # then default to what's in config (tuple will be used for the create_backend() call),
        # or use what we have in the diff metadata
        old_db_col_names = target_backend or \
            (btconfig.ES_HOST,btconfig.ES_INDEX_NAME,btconfig.ES_DOC_TYPE) or \
            meta["old"]["backend"]
        new_db_col_names = meta["new"]["backend"]
        diff_mapping_file = meta["diff"]["mapping_file"]
        pinfo = {
            "category": "sync",
            "source": "%s -> %s" % (old_db_col_names, new_db_col_names),
            "step": "",
            "description": ""
        }
        summary = {}
        if "mapping" in steps and self.target_backend == "es":
            if diff_mapping_file:
                # old_db_col_names is actually the index name in that case
                index_name = old_db_col_names
                doc_type = meta["build_config"]["doc_type"]
                indexer = create_backend(old_db_col_names).target_esidxer
                pinfo["step"] = "mapping"
                pinfo["description"] = diff_mapping_file

                def update_mapping():
                    diffm = os.path.join(diff_folder, diff_mapping_file)
                    ops = loadobj(diffm)
                    mapping = indexer.get_mapping()
                    # we should have the same doc type declared in the mapping
                    mapping[doc_type]["properties"] = jsonpatch.apply_patch(
                        mapping[doc_type]["properties"], ops)
                    res = indexer.update_mapping(mapping)
                    return res

                job = yield from self.job_manager.defer_to_thread(
                    pinfo, partial(update_mapping))

                def updated(f):
                    nonlocal got_error
                    try:
                        res = f.result()
                        self.logger.info("Mapping updated on index '%s'" %
                                         index_name)
                        summary["mapping_updated"] = True
                    except Exception as e:
                        self.logger.error(
                            "Failed to update mapping on index '%s': %s" %
                            (index_name, e))
                        got_error = e

                job.add_done_callback(updated)
                yield from job

            if got_error:
                self.logger.error("Failed to update mapping on index '%s': %s" % \
                    (old_db_col_names, got_error),extra={"notify":True})
                raise got_error

        if "content" in steps:
            if selfcontained:
                # selfcontained is a worker param; strip it to get the base diff format
                diff_type = diff_type.replace("-selfcontained", "")
            diff_files = [
                os.path.join(diff_folder, e["name"])
                for e in meta["diff"]["files"]
            ]
            total = len(diff_files)
            self.logger.info("Syncing %s to %s using diff files in '%s'" %
                             (old_db_col_names, new_db_col_names, diff_folder))
            pinfo["step"] = "content"
            for diff_file in diff_files:
                cnt += 1
                pinfo["description"] = "file %s (%s/%s)" % (diff_file, cnt,
                                                            total)
                worker = getattr(sys.modules[self.__class__.__module__],"sync_%s_%s_worker" % \
                        (self.target_backend,diff_type))
                self.logger.info(
                    "Creating sync worker %s for file %s (%s/%s)" %
                    (worker.__name__, diff_file, cnt, total))
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(worker, diff_file, old_db_col_names,
                            new_db_col_names, batch_size, cnt, force,
                            selfcontained, meta))
                jobs.append(job)

            def synced(f):
                nonlocal got_error
                try:
                    for d in f.result():
                        for k in d:
                            summary.setdefault(k, 0)
                            summary[k] += d[k]
                except Exception as e:
                    got_error = e
                    raise

            tasks = asyncio.gather(*jobs)
            tasks.add_done_callback(synced)
            yield from tasks
            if got_error:
                self.logger.error("Failed to sync collection from %s to %s using diff files in '%s': %s" % \
                    (old_db_col_names, new_db_col_names, diff_folder, got_error),extra={"notify":True})
                raise got_error

        if "meta" in steps and self.target_backend == "es":
            # old_db_col_names is actually the index name in that case
            index_name = old_db_col_names[1]
            doc_type = meta["build_config"]["doc_type"]
            indexer = create_backend(old_db_col_names).target_esidxer
            new_meta = meta["_meta"]
            pinfo["step"] = "metadata"

            def update_metadata():
                res = indexer.update_mapping_meta({"_meta": new_meta})
                return res

            job = yield from self.job_manager.defer_to_thread(
                pinfo, partial(update_metadata))

            def updated(f):
                nonlocal got_error
                try:
                    res = f.result()
                    self.logger.info("Metadata updated on index '%s': %s" %
                                     (index_name, res))
                    summary["metadata_updated"] = True
                except Exception as e:
                    self.logger.error(
                        "Failed to update metadata on index '%s': %s" %
                        (index_name, e))
                    got_error = e

            job.add_done_callback(updated)
            yield from job

            if got_error:
                self.logger.error("Failed to update metadata on index '%s': %s" % \
                    (old_db_col_names, got_error),extra={"notify":True})
                raise got_error

        self.logger.info("Succesfully synced index %s to reach collection %s using diff files in '%s': %s" % \
                (old_db_col_names, new_db_col_names, diff_folder,summary),extra={"notify":True})

        return summary
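sync_cols() is written as a yield from-style coroutine driven by a job manager, so it is normally scheduled on an event loop rather than called directly. A minimal driver sketch, assuming `syncer` is an already-configured instance of the class defining sync_cols() (decorated as an asyncio coroutine, as its yield from style suggests) and that the folder path is a placeholder:

import asyncio

# Hypothetical driver; `syncer` must already carry job_manager, logger, diff_type, etc.
loop = asyncio.get_event_loop()
summary = loop.run_until_complete(
    syncer.sync_cols(
        diff_folder="/data/diffs/old-vs-new",   # folder holding metadata.json + *.pyobj diff files
        batch_size=10000,
        target_backend=None,                    # or a create_backend() notation to override "old"
        steps=["mapping", "content", "meta"]))
print(summary)   # e.g. {"mapping_updated": True, ...} plus merged per-file counters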
Example #6
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if this diff file was already synced
    if not force and diff.get("synced", {}).get("es") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs,
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({
                        "_id": doc["_id"],
                        "file": diff_file,
                        "error": e
                    })
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise

    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][
                i]  # same order as what's return by get_doc()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assume the patch was already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
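The update branch above relies on jsonpatch both to apply patches and to detect work that has already been done: an unchanged result or a JsonPatchConflict is counted as "skipped". A minimal, standalone illustration of that check (the document and patch are made up):

import jsonpatch

doc = {"_id": "x", "count": 1}
patch = [{"op": "add", "path": "/newkey", "value": 42}]

newdoc = jsonpatch.apply_patch(doc, patch)
if newdoc == doc:
    print("patch already applied, skip")     # not the case here
else:
    print("apply update:", newdoc)           # {'_id': 'x', 'count': 1, 'newkey': 42}

# a "remove" op on a missing path raises JsonPatchConflict, which the worker
# also treats as "already applied"
try:
    jsonpatch.apply_patch(doc, [{"op": "remove", "path": "/missing"}])
except jsonpatch.JsonPatchConflict:
    print("conflict -> assume already applied, skip")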
Example #7
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={}):
    """Worker to sync data between a new and an old mongo collection"""
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name,
                            logging)
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if this diff file was already synced
    if not force and diff.get("synced", {}).get("mongo") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, not mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process()/doc_iterator() would require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assume the patch was already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("mongo", True)
    dump(diff, diff_file)
    return res
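Both sync workers above make reruns safe by writing a "synced" flag back into the diff file once it has been applied, then skipping the whole file next time unless force=True. A small sketch of that guard in isolation, reusing the dump()/loadobj() helpers from the examples (the file path and force value are placeholders):

# Hypothetical illustration of the rerun guard shared by both sync workers.
force = False
diff_file = "/tmp/diffs/1.pyobj"            # placeholder path
diff = loadobj(diff_file)

if not force and diff.get("synced", {}).get("mongo") == True:
    pass                                    # whole file counted as "skipped"
else:
    # ... apply add / update / delete ...
    diff.setdefault("synced", {})["mongo"] = True
    dump(diff, diff_file)                   # persist the flag for the next run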
Example #8
    def diff_cols(self,
                  old_db_col_names,
                  new_db_col_names,
                  batch_size=100000,
                  steps=["count", "content", "mapping"],
                  mode=None,
                  exclude=[]):
        """
        Compare new with old collections and produce diff files. Root keys can be excluded from
        comparison with the "exclude" parameter.
        *_db_col_names can be:
         1. a collection name (as a string), assuming it lives
            in the target database.
         2. a tuple with 2 elements, the first one being either "source" or "target"
            to respectively specify the src or target database, and the second element being
            the collection name.
         3. a tuple with 3 elements (URI,db,collection), looking like:
            ("mongodb://*****:*****@host","dbname","collection"), allowing to specify
            any connection on any server
        steps: 'count' will count the root keys for every document in the new collection
               (to check the number of docs from datasources).
               'content' will perform a diff on actual content.
               'mapping' will perform a diff on ES mappings (if a target collection is involved).
        mode: 'purge' will remove any existing files for this comparison.
        """
        new = create_backend(new_db_col_names)
        old = create_backend(old_db_col_names)
        # check what to do
        if type(steps) == str:
            steps = [steps]

        diff_folder = generate_diff_folder(old_db_col_names, new_db_col_names)

        if os.path.exists(diff_folder):
            if mode == "purge" and os.path.exists(diff_folder):
                rmdashfr(diff_folder)
            else:
                raise FileExistsError(
                    "Found existing files in '%s', use mode='purge'" %
                    diff_folder)
        if not os.path.exists(diff_folder):
            os.makedirs(diff_folder)

        # create metadata file storing info about how we created the diff
        # and some summary data
        diff_stats = {
            "update": 0,
            "add": 0,
            "delete": 0,
            "mapping_changed": False
        }
        metadata = {
            "diff": {
                "type": self.diff_type,
                "func": self.diff_func.__name__,
                "version": "%s.%s" % (old.version, new.version),
                "stats": diff_stats,  # ref to diff_stats
                "files": [],
                # when "new" is a target collection:
                "mapping_file": None,
                "info": {
                    "generated_on": str(datetime.now()),
                    "exclude": exclude,
                    "steps": steps,
                    "mode": mode,
                    "batch_size": batch_size
                }
            },
            "old": {
                "backend": old_db_col_names,
                "version": old.version
            },
            "new": {
                "backend": new_db_col_names,
                "version": new.version
            },
            # when "new" is a target collection:
            "_meta": {},
            "build_config": {},
        }
        if isinstance(new, DocMongoBackend) and \
                new.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
            build_doc = get_src_build().find_one(
                {"_id": new.target_collection.name})
            if not build_doc:
                raise DifferException("Collection '%s' has no corresponding build document" % \
                        new.target_collection.name)
            metadata["_meta"] = build_doc.get("_meta", {})
            metadata["build_config"] = build_doc.get("build_config")

        # dump it here for minimum information, in case we don't go further
        json.dump(metadata,
                  open(os.path.join(diff_folder, "metadata.json"), "w"),
                  indent=True)

        got_error = False
        if "mapping" in steps:

            def diff_mapping(old, new, diff_folder):
                summary = {}
                old_build = get_src_build().find_one(
                    {"_id": old.target_collection.name})
                new_build = get_src_build().find_one(
                    {"_id": new.target_collection.name})
                if old_build and new_build:
                    # mapping diff always in jsondiff
                    mapping_diff = jsondiff(old_build["mapping"],
                                            new_build["mapping"])
                    if mapping_diff:
                        file_name = os.path.join(diff_folder, "mapping.pyobj")
                        dump(mapping_diff, file_name)
                        md5 = md5sum(file_name)
                        summary["mapping_file"] = {
                            "name": os.path.basename(file_name),
                            "md5sum": md5
                        }
                else:
                    self.logger.info("Neither '%s' nor '%s' have mappings associated to them, skip" % \
                            (old.target_collection.name,new.target_collection.name))
                return summary

            def mapping_diffed(f):
                res = f.result()
                if res.get("mapping_file"):
                    nonlocal got_error
                    # check mapping differences: only "add" ops are allowed, as any other actions would be
                    # ignored by ES once applied (you can't update/delete elements of an existing mapping)
                    mf = os.path.join(diff_folder, res["mapping_file"]["name"])
                    ops = loadobj(mf)
                    for op in ops:
                        if op["op"] != "add":
                            err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] + \
                                " only 'add' operations are allowed. You can still produce the " + \
                                "diff by removing 'mapping' from 'steps' arguments. " + \
                                "Ex: steps=['count','content']. Diff operation was: %s" % op)
                            got_error = err
                    metadata["diff"]["mapping_file"] = mf
                    diff_stats["mapping_changed"] = True
                self.logger.info(
                    "Diff file containing mapping differences generated: %s" %
                    res.get("mapping_file"))

            pinfo = {
                "category": "diff",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "step": "mapping: old vs new",
                "description": ""
            }
            job = yield from self.job_manager.defer_to_thread(
                pinfo, partial(diff_mapping, old, new, diff_folder))
            job.add_done_callback(mapping_diffed)
            yield from job
            if got_error:
                raise got_error

        if "count" in steps:
            cnt = 0
            pinfo = {
                "category": "diff",
                "step": "count",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "description": ""
            }

            self.logger.info("Counting root keys in '%s'" % new.target_name)
            diff_stats["root_keys"] = {}
            jobs = []
            data_new = id_feeder(new, batch_size=batch_size)
            for id_list in data_new:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt
                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_count, id_list, new_db_col_names, cnt))
                jobs.append(job)

            def counted(f):
                root_keys = {}
                # merge the counts
                for d in f.result():
                    for k in d:
                        root_keys.setdefault(k, 0)
                        root_keys[k] += d[k]
                self.logger.info("root keys count: %s" % root_keys)
                diff_stats["root_keys"] = root_keys

            tasks = asyncio.gather(*jobs)
            tasks.add_done_callback(counted)
            yield from tasks
            self.logger.info(
                "Finished counting keys in the new collection: %s" %
                diff_stats["root_keys"])

        if "content" in steps:
            skip = 0
            cnt = 0
            jobs = []
            pinfo = {
                "category": "diff",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "step": "content: new vs old",
                "description": ""
            }
            data_new = id_feeder(new, batch_size=batch_size)
            selfcontained = "selfcontained" in self.diff_type
            for id_list_new in data_new:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt

                def diffed(f):
                    res = f.result()
                    diff_stats["update"] += res["update"]
                    diff_stats["add"] += res["add"]
                    if res.get("diff_file"):
                        metadata["diff"]["files"].append(res["diff_file"])
                    self.logger.info("(Updated: {}, Added: {})".format(
                        res["update"], res["add"]))

                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_new_vs_old, id_list_new,
                            old_db_col_names, new_db_col_names, cnt,
                            diff_folder, self.diff_func, exclude,
                            selfcontained))
                job.add_done_callback(diffed)
                jobs.append(job)
            yield from asyncio.gather(*jobs)
            self.logger.info(
                "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
                .format(diff_stats["update"], diff_stats["add"]))

            data_old = id_feeder(old, batch_size=batch_size)
            jobs = []
            pinfo["step"] = "content: old vs new"
            for id_list_old in data_old:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt

                def diffed(f):
                    res = f.result()
                    diff_stats["delete"] += res["delete"]
                    if res.get("diff_file"):
                        metadata["diff"]["files"].append(res["diff_file"])
                    self.logger.info("(Deleted: {})".format(res["delete"]))

                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_old_vs_new, id_list_old,
                            new_db_col_names, cnt, diff_folder))
                job.add_done_callback(diffed)
                jobs.append(job)
            yield from asyncio.gather(*jobs)
            self.logger.info(
                "Finished calculating diff for the old collection. Total number of docs deleted: {}"
                .format(diff_stats["delete"]))

        self.logger.info(
            "Summary: (Updated: {}, Added: {}, Deleted: {}, Mapping changed: {})"
            .format(diff_stats["update"], diff_stats["add"],
                    diff_stats["delete"], diff_stats["mapping_changed"]))

        # dump metadata again, now with more information (diff_stats)
        json.dump(metadata,
                  open(os.path.join(diff_folder, "metadata.json"), "w"),
                  indent=True)
        strargs = "[old=%s,new=%s,steps=%s,diff_stats=%s]" % (
            old_db_col_names, new_db_col_names, steps, diff_stats)
        self.logger.info("success %s" % strargs, extra={"notify": True})
        return diff_stats
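The docstring above lists three equivalent notations for *_db_col_names. A short, hedged sketch of how a diff run might be started with each of them, where `differ` is assumed to be an instance of the class defining diff_cols() and all collection, database and host names are placeholders:

# 1. plain collection name, assumed to live in the target database
coro = differ.diff_cols("mynews_20240101", "mynews_20240201")

# 2. ("source"/"target", collection) tuple to pick the database
coro = differ.diff_cols(("target", "mynews_20240101"),
                        ("target", "mynews_20240201"))

# 3. full (URI, db, collection) tuple for an arbitrary server
coro = differ.diff_cols(
    ("mongodb://*****:*****@host", "dbname", "mynews_20240101"),
    ("mongodb://*****:*****@host", "dbname", "mynews_20240201"),
    steps=["count", "content", "mapping"],
    mode="purge",
    exclude=["_timestamp"])

# like sync_cols(), diff_cols() is a yield from-style coroutine, so `coro`
# still has to be driven by the job manager / event loop.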