def analyze(diff_file, detailed):
    # note: sources, adds, dels, update_details, max_reported_ids and metadata
    # are expected to be defined in the enclosing scope (this function aggregates
    # information over a set of diff files)
    data = loadobj(diff_file)
    sources[data["source"]] = 1
    if detailed:
        # TODO: if self-contained, no db connection needed
        new_col = create_backend(metadata["new"]["backend"])
        old_col = create_backend(metadata["old"]["backend"])
    if len(adds["ids"]) < max_reported_ids:
        if detailed:
            # look for which root keys were added in new collection
            for _id in data["add"]:
                # selfcontained = dict for whole doc (see TODO above)
                if type(_id) == dict:
                    _id = _id["_id"]
                doc = new_col.get_from_id(_id)
                rkeys = sorted(doc.keys())
                adds["ids"].append([_id, rkeys])
        else:
            if data["add"] and type(data["add"][0]) == dict:
                adds["ids"].extend([d["_id"] for d in data["add"]])
            else:
                adds["ids"].extend(data["add"])
    adds["count"] += len(data["add"])
    if len(dels["ids"]) < max_reported_ids:
        if detailed:
            # look for which root keys were deleted in old collection
            for _id in data["delete"]:
                doc = old_col.get_from_id(_id)
                rkeys = sorted(doc.keys())
                dels["ids"].append([_id, rkeys])
        else:
            dels["ids"].extend(data["delete"])
    dels["count"] += len(data["delete"])
    for up in data["update"]:
        for patch in up["patch"]:
            update_details[patch["op"]].setdefault(patch["path"], {"count": 0, "ids": []})
            if len(update_details[patch["op"]][patch["path"]]["ids"]) < max_reported_ids:
                update_details[patch["op"]][patch["path"]]["ids"].append(up["_id"])
            update_details[patch["op"]][patch["path"]]["count"] += 1
    update_details["count"] += len(data["update"])

    assert len(sources) == 1, \
        "Should have one datasource from diff files, got: %s" % [s for s in sources]

def diff_worker_old_vs_new(id_list_old, new_db_col_names, batch_num, diff_folder):
    new = create_backend(new_db_col_names)
    docs_common = new.mget_from_ids(id_list_old)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_old = list(set(id_list_old) - set(ids_common))
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'delete': id_in_old,
        'add': [],
        'update': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    summary = {"add": 0, "update": 0, "delete": len(id_in_old)}
    if len(id_in_old) != 0:
        dump(_result, file_name)
        # compute md5 so when downloaded, users can check integrity
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }
    return summary

def diff_worker_count(id_list, db_col_names, batch_num):
    col = create_backend(db_col_names)
    docs = col.mget_from_ids(id_list)
    res = {}
    for doc in docs:
        for k in doc:
            res.setdefault(k, 0)
            res[k] += 1
    return res

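
# Hedged sketch: diff_worker_count() returns one {root_key: count} dict per batch.
# The merge below mirrors what the "counted" callback in diff_cols() does when it
# combines batch results; the helper name is illustrative and not used by the pipeline.
def _merge_root_key_counts_sketch(per_batch_counts):
    """Merge per-batch root-key counts into a single {key: total} dict."""
    merged = {}
    for batch in per_batch_counts:
        for key, count in batch.items():
            merged[key] = merged.get(key, 0) + count
    return merged
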
def diff_worker_new_vs_old(id_list_new, old_db_col_names, new_db_col_names,
                           batch_num, diff_folder, diff_func, exclude=[],
                           selfcontained=False):
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    docs_common = old.mget_from_ids(id_list_new)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_new = list(set(id_list_new) - set(ids_common))
    _updates = []
    if len(ids_common) > 0:
        _updates = diff_func(old, new, list(ids_common), exclude_attrs=exclude)
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'add': id_in_new,
        'update': _updates,
        'delete': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    if selfcontained:
        # "add" then contains whole documents, not just ids
        _result["add"] = new.mget_from_ids(id_in_new)
    summary = {"add": len(id_in_new), "update": len(_updates), "delete": 0}
    if len(_updates) != 0 or len(id_in_new) != 0:
        dump(_result, file_name)
        # compute md5 so when downloaded, users can check integrity
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }
    return summary

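
# Hedged sketch of the per-batch ".pyobj" diff payload dumped by the two workers
# above and read back by the sync_*_worker functions below. Field values are
# illustrative only; "update" entries carry JSON-Patch operations as produced by
# the configured diff_func.
def _example_diff_payload_sketch():
    return {
        "add": ["id1", "id2"],   # new ids (whole documents when selfcontained=True)
        "delete": [],            # ids only present in the old collection
        "update": [
            {"_id": "id3",
             "patch": [{"op": "replace", "path": "/somefield", "value": "new value"}]},
        ],
        "source": "target_collection_name",
        "timestamp": get_timestamp(),
    }
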
def sync_cols(self, diff_folder, batch_size=10000, mode=None, force=False,
              target_backend=None, steps=["mapping", "content", "meta"]):
    """
    Sync a collection with diff files located in diff_folder. This folder
    contains a metadata.json file which describes the different collections
    involved: "old" is the collection/index to be synced, "new" is the
    collection that should be obtained once all diff files are applied
    (not used, just informative). If target_backend is set (in
    bt.databuild.backend.create_backend() notation), it will replace "old"
    (that is, the one being synced).
    """
    got_error = False
    cnt = 0
    jobs = []
    meta = json.load(open(os.path.join(diff_folder, "metadata.json")))
    diff_type = self.diff_type
    selfcontained = "selfcontained" in meta["diff"]["type"]
    # first try to use what's been passed explicitly,
    # then default to what's in config (tuple will be used for create_backend() call),
    # or use what we have in the diff metadata
    old_db_col_names = target_backend or \
        (btconfig.ES_HOST, btconfig.ES_INDEX_NAME, btconfig.ES_DOC_TYPE) or \
        meta["old"]["backend"]
    new_db_col_names = meta["new"]["backend"]
    diff_mapping_file = meta["diff"]["mapping_file"]
    pinfo = {
        "category": "sync",
        "source": "%s -> %s" % (old_db_col_names, new_db_col_names),
        "step": "",
        "description": ""
    }
    summary = {}
    if "mapping" in steps and self.target_backend == "es":
        if diff_mapping_file:
            # old_db_col_names is actually the index name in that case
            index_name = old_db_col_names
            doc_type = meta["build_config"]["doc_type"]
            indexer = create_backend(old_db_col_names).target_esidxer
            pinfo["step"] = "mapping"
            pinfo["description"] = diff_mapping_file

            def update_mapping():
                diffm = os.path.join(diff_folder, diff_mapping_file)
                ops = loadobj(diffm)
                mapping = indexer.get_mapping()
                # we should have the same doc type declared in the mapping
                mapping[doc_type]["properties"] = jsonpatch.apply_patch(
                    mapping[doc_type]["properties"], ops)
                res = indexer.update_mapping(mapping)
                return res

            job = yield from self.job_manager.defer_to_thread(
                pinfo, partial(update_mapping))

            def updated(f):
                nonlocal got_error
                try:
                    res = f.result()
                    self.logger.info("Mapping updated on index '%s'" % index_name)
                    summary["mapping_updated"] = True
                except Exception as e:
                    self.logger.error("Failed to update mapping on index '%s': %s" %
                                      (index_name, e))
                    got_error = e

            job.add_done_callback(updated)
            yield from job

        if got_error:
            self.logger.error("Failed to update mapping on index '%s': %s" %
                              (old_db_col_names, got_error), extra={"notify": True})
            raise got_error

    if "content" in steps:
        if selfcontained:
            # selfcontained is a worker param, isolate the diff format
            diff_type = diff_type.replace("-selfcontained", "")
        diff_files = [
            os.path.join(diff_folder, e["name"]) for e in meta["diff"]["files"]
        ]
        total = len(diff_files)
        self.logger.info("Syncing %s to %s using diff files in '%s'" %
                         (old_db_col_names, new_db_col_names, diff_folder))
        pinfo["step"] = "content"
        for diff_file in diff_files:
            cnt += 1
            pinfo["description"] = "file %s (%s/%s)" % (diff_file, cnt, total)
            worker = getattr(sys.modules[self.__class__.__module__],
                             "sync_%s_%s_worker" % (self.target_backend, diff_type))
            self.logger.info("Creating sync worker %s for file %s (%s/%s)" %
                             (worker.__name__, diff_file, cnt, total))
            job = yield from self.job_manager.defer_to_process(
                pinfo, partial(worker, diff_file, old_db_col_names,
                               new_db_col_names, batch_size, cnt, force,
                               selfcontained, meta))
            jobs.append(job)

        def synced(f):
            nonlocal got_error
            try:
                for d in f.result():
                    for k in d:
                        summary.setdefault(k, 0)
                        summary[k] += d[k]
            except Exception as e:
                got_error = e
                raise

        tasks = asyncio.gather(*jobs)
        tasks.add_done_callback(synced)
        yield from tasks
        if got_error:
            self.logger.error("Failed to sync collection from %s to %s using diff files in '%s': %s" %
                              (old_db_col_names, new_db_col_names, diff_folder, got_error),
                              extra={"notify": True})
            raise got_error

    if "meta" in steps and self.target_backend == "es":
        # old_db_col_names is actually the index name in that case
        index_name = old_db_col_names[1]
        doc_type = meta["build_config"]["doc_type"]
        indexer = create_backend(old_db_col_names).target_esidxer
        new_meta = meta["_meta"]
        pinfo["step"] = "metadata"

        def update_metadata():
            res = indexer.update_mapping_meta({"_meta": new_meta})
            return res

        job = yield from self.job_manager.defer_to_thread(
            pinfo, partial(update_metadata))

        def updated(f):
            nonlocal got_error
            try:
                res = f.result()
                self.logger.info("Metadata updated on index '%s': %s" %
                                 (index_name, res))
                summary["metadata_updated"] = True
            except Exception as e:
                self.logger.error("Failed to update metadata on index '%s': %s" %
                                  (index_name, e))
                got_error = e

        job.add_done_callback(updated)
        yield from job

        if got_error:
            self.logger.error("Failed to update metadata on index '%s': %s" %
                              (old_db_col_names, got_error), extra={"notify": True})
            raise got_error

    self.logger.info("Successfully synced index %s to reach collection %s using diff files in '%s': %s" %
                     (old_db_col_names, new_db_col_names, diff_folder, summary),
                     extra={"notify": True})
    return summary

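
# Hedged sketch of the worker-lookup convention used in sync_cols() above: the
# worker function name is derived from the target backend and the diff type
# (with the "-selfcontained" suffix stripped first). This helper only computes
# the name; sync_cols() resolves it against this module with getattr().
def _sync_worker_name_sketch(target_backend, diff_type):
    diff_type = diff_type.replace("-selfcontained", "")
    return "sync_%s_%s_worker" % (target_backend, diff_type)

# e.g. _sync_worker_name_sketch("es", "jsondiff-selfcontained") -> "sync_es_jsondiff_worker"
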
def sync_es_jsondiff_worker(diff_file, es_config, new_db_col_names, batch_size,
                            cnt, force=False, selfcontained=False, metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    if not force and diff.get("synced", {}).get("es") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": doc["_id"], "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise

    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res

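
# Hedged sketch of the batching behavior the workers rely on: iter_n() (a
# biothings utility) is assumed here to yield successive chunks of at most n
# items from any iterable. This is a minimal approximation for illustration,
# not the actual implementation.
def _iter_n_sketch(iterable, n):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == n:
            yield batch
            batch = []
    if batch:
        yield batch
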
def sync_mongo_jsondiff_worker(diff_file, old_db_col_names, new_db_col_names,
                               batch_size, cnt, force=False, selfcontained=False,
                               metadata={}):
    """Worker to sync data between a new and an old mongo collection"""
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    if not force and diff.get("synced", {}).get("mongo") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("mongo", True)
    dump(diff, diff_file)
    return res

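
# Hedged, self-contained illustration of the JSON-Patch step used by both sync
# workers above: jsonpatch.apply_patch() returns a new document with the listed
# operations applied. Document and patch values below are made up for the example.
def _apply_patch_example_sketch():
    doc = {"_id": "1", "taxid": 9606}
    patch = [{"op": "replace", "path": "/taxid", "value": 10090}]
    return jsonpatch.apply_patch(doc, patch)  # -> {"_id": "1", "taxid": 10090}
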
def diff_cols(self, old_db_col_names, new_db_col_names, batch_size=100000,
              steps=["count", "content", "mapping"], mode=None, exclude=[]):
    """
    Compare new with old collections and produce diff files. Root keys can be
    excluded from the comparison with the "exclude" parameter.

    *_db_col_names can be:
      1. a collection name (as a string), assuming it is in the target database.
      2. a tuple with 2 elements: the first one is either "source" or "target",
         to respectively specify the src or target database, and the second
         element is the collection name.
      3. a tuple with 3 elements (URI, db, collection), looking like:
         ("mongodb://*****:*****@host", "dbname", "collection"), allowing to
         specify any connection on any server.

    steps: 'count' will count the root keys for every document in the new
           collection (to check the number of docs from datasources).
           'content' will perform the diff on actual content.
           'mapping' will perform the diff on ES mappings (if a target
           collection is involved).
    mode: 'purge' will remove any existing files for this comparison.
    """
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    # check what to do
    if type(steps) == str:
        steps = [steps]
    diff_folder = generate_diff_folder(old_db_col_names, new_db_col_names)

    if os.path.exists(diff_folder):
        if mode == "purge" and os.path.exists(diff_folder):
            rmdashfr(diff_folder)
        else:
            raise FileExistsError(
                "Found existing files in '%s', use mode='purge'" % diff_folder)
    if not os.path.exists(diff_folder):
        os.makedirs(diff_folder)

    # create metadata file storing info about how we created the diff
    # and some summary data
    diff_stats = {
        "update": 0,
        "add": 0,
        "delete": 0,
        "mapping_changed": False
    }
    metadata = {
        "diff": {
            "type": self.diff_type,
            "func": self.diff_func.__name__,
            "version": "%s.%s" % (old.version, new.version),
            "stats": diff_stats,  # ref to diff_stats
            "files": [],
            # when "new" is a target collection:
            "mapping_file": None,
            "info": {
                "generated_on": str(datetime.now()),
                "exclude": exclude,
                "steps": steps,
                "mode": mode,
                "batch_size": batch_size
            }
        },
        "old": {
            "backend": old_db_col_names,
            "version": old.version
        },
        "new": {
            "backend": new_db_col_names,
            "version": new.version
        },
        # when "new" is a target collection:
        "_meta": {},
        "build_config": {},
    }
    if isinstance(new, DocMongoBackend) and \
            new.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
        build_doc = get_src_build().find_one({"_id": new.target_collection.name})
        if not build_doc:
            raise DifferException("Collection '%s' has no corresponding build document" %
                                  new.target_collection.name)
        metadata["_meta"] = build_doc.get("_meta", {})
        metadata["build_config"] = build_doc.get("build_config")

    # dump it here for minimum information, in case we don't go further
    json.dump(metadata,
              open(os.path.join(diff_folder, "metadata.json"), "w"),
              indent=True)

    got_error = False
    if "mapping" in steps:

        def diff_mapping(old, new, diff_folder):
            summary = {}
            old_build = get_src_build().find_one({"_id": old.target_collection.name})
            new_build = get_src_build().find_one({"_id": new.target_collection.name})
            if old_build and new_build:
                # mapping diff is always in jsondiff
                mapping_diff = jsondiff(old_build["mapping"], new_build["mapping"])
                if mapping_diff:
                    file_name = os.path.join(diff_folder, "mapping.pyobj")
                    dump(mapping_diff, file_name)
                    md5 = md5sum(file_name)
                    summary["mapping_file"] = {
                        "name": os.path.basename(file_name),
                        "md5sum": md5
                    }
            else:
                self.logger.info("Neither '%s' nor '%s' have mappings associated to them, skip" %
                                 (old.target_collection.name, new.target_collection.name))
            return summary

        def mapping_diffed(f):
            nonlocal got_error
            res = f.result()
            if res.get("mapping_file"):
                # check mapping differences: only "add" ops are allowed, as any other
                # operations would be ignored by ES once applied (you can't update/delete
                # elements of an existing mapping)
                mf = os.path.join(diff_folder, res["mapping_file"]["name"])
                ops = loadobj(mf)
                for op in ops:
                    if op["op"] != "add":
                        err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] +
                                              " only 'add' operations are allowed. You can still produce the " +
                                              "diff by removing 'mapping' from 'steps' arguments. " +
                                              "Ex: steps=['count','content']. Diff operation was: %s" % op)
                        got_error = err
                metadata["diff"]["mapping_file"] = mf
                diff_stats["mapping_changed"] = True
                self.logger.info("Diff file containing mapping differences generated: %s" %
                                 res.get("mapping_file"))

        pinfo = {
            "category": "diff",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "step": "mapping: old vs new",
            "description": ""
        }
        job = yield from self.job_manager.defer_to_thread(
            pinfo, partial(diff_mapping, old, new, diff_folder))
        job.add_done_callback(mapping_diffed)
        yield from job
        if got_error:
            raise got_error

    if "count" in steps:
        cnt = 0
        pinfo = {
            "category": "diff",
            "step": "count",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "description": ""
        }
        self.logger.info("Counting root keys in '%s'" % new.target_name)
        diff_stats["root_keys"] = {}
        jobs = []
        data_new = id_feeder(new, batch_size=batch_size)
        for id_list in data_new:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt
            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo, partial(diff_worker_count, id_list, new_db_col_names, cnt))
            jobs.append(job)

        def counted(f):
            root_keys = {}
            # merge the counts
            for d in f.result():
                for k in d:
                    root_keys.setdefault(k, 0)
                    root_keys[k] += d[k]
            self.logger.info("root keys count: %s" % root_keys)
            diff_stats["root_keys"] = root_keys

        tasks = asyncio.gather(*jobs)
        tasks.add_done_callback(counted)
        yield from tasks
        self.logger.info("Finished counting keys in the new collection: %s" %
                         diff_stats["root_keys"])

    if "content" in steps:
        skip = 0
        cnt = 0
        jobs = []
        pinfo = {
            "category": "diff",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "step": "content: new vs old",
            "description": ""
        }
        data_new = id_feeder(new, batch_size=batch_size)
        selfcontained = "selfcontained" in self.diff_type
        for id_list_new in data_new:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt

            def diffed(f):
                res = f.result()
                diff_stats["update"] += res["update"]
                diff_stats["add"] += res["add"]
                if res.get("diff_file"):
                    metadata["diff"]["files"].append(res["diff_file"])
                self.logger.info("(Updated: {}, Added: {})".format(
                    res["update"], res["add"]))

            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo, partial(diff_worker_new_vs_old, id_list_new,
                               old_db_col_names, new_db_col_names, cnt,
                               diff_folder, self.diff_func, exclude,
                               selfcontained))
            job.add_done_callback(diffed)
            jobs.append(job)
        yield from asyncio.gather(*jobs)
        self.logger.info(
            "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
            .format(diff_stats["update"], diff_stats["add"]))

        data_old = id_feeder(old, batch_size=batch_size)
        jobs = []
        pinfo["step"] = "content: old vs new"
        for id_list_old in data_old:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt

            def diffed(f):
                res = f.result()
                diff_stats["delete"] += res["delete"]
                if res.get("diff_file"):
                    metadata["diff"]["files"].append(res["diff_file"])
                self.logger.info("(Deleted: {})".format(res["delete"]))

            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo, partial(diff_worker_old_vs_new, id_list_old,
                               new_db_col_names, cnt, diff_folder))
            job.add_done_callback(diffed)
            jobs.append(job)
        yield from asyncio.gather(*jobs)
        self.logger.info(
            "Finished calculating diff for the old collection. Total number of docs deleted: {}"
            .format(diff_stats["delete"]))

    self.logger.info(
        "Summary: (Updated: {}, Added: {}, Deleted: {}, Mapping changed: {})"
        .format(diff_stats["update"], diff_stats["add"],
                diff_stats["delete"], diff_stats["mapping_changed"]))
    # dump metadata again with potentially more information (diff_stats)
    json.dump(metadata,
              open(os.path.join(diff_folder, "metadata.json"), "w"),
              indent=True)
    strargs = "[old=%s,new=%s,steps=%s,diff_stats=%s]" % (
        old_db_col_names, new_db_col_names, steps, diff_stats)
    self.logger.info("success %s" % strargs, extra={"notify": True})
    return diff_stats

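
# Hedged sketch: the workers record an md5 checksum for every dumped diff file
# (metadata["diff"]["files"] entries look like {"name": ..., "md5sum": ...}), so
# a downloaded diff folder can be verified roughly as below. hashlib is used
# here for self-containment, assuming the recorded checksum is a hex digest;
# the hub itself relies on its md5sum() utility.
def _verify_diff_folder_sketch(diff_folder):
    import hashlib
    import json
    import os
    meta = json.load(open(os.path.join(diff_folder, "metadata.json")))
    bad = []
    for finfo in meta["diff"]["files"]:
        path = os.path.join(diff_folder, finfo["name"])
        digest = hashlib.md5(open(path, "rb").read()).hexdigest()
        if digest != finfo["md5sum"]:
            bad.append(finfo["name"])
    return bad  # names of files whose checksum doesn't match
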