def unregister_url(self, url=None, name=None):
    dp = get_data_plugin()
    if url:
        url = url.strip()
        doc = dp.find_one({"plugin.url": url})
    elif name:
        doc = dp.find_one({"_id": name})
        url = doc and doc["plugin"]["url"]
    else:
        raise ValueError("Specify 'url' or 'name'")
    if not doc:
        raise AssistantException(
            "Plugin is not registered (url=%s, name=%s)" % (url, name))
    # should be only one but just in case
    dp.remove({"_id": doc["_id"]})
    # delete plugin code so it won't be auto-registered
    # by the 'local' plugin assistant (issue studio #7)
    if doc.get("download", {}).get("data_folder"):
        codefolder = doc["download"]["data_folder"]
        self.logger.info("Delete plugin source code in '%s'" % codefolder)
        rmdashfr(codefolder)
    assistant = self.submit(url)
    try:
        self.data_plugin_manager.register.pop(assistant.plugin_name)
    except KeyError:
        raise AssistantException("Plugin '%s' is not registered" % url)
    self.dumper_manager.register.pop(assistant.plugin_name, None)
    self.uploader_manager.register.pop(assistant.plugin_name, None)
def do():
    do_clone = False
    if force:
        # force is also a way to clean up and start from scratch
        rmdashfr(self.src_root_folder)
    if not os.path.exists(self.src_root_folder):
        # data folder doesn't even exist, no git files yet, we need to clone
        os.makedirs(self.src_root_folder)
        do_clone = True
    self.register_status("downloading", transient=True)
    if do_clone:
        self._clone(self.__class__.GIT_REPO_URL, self.src_root_folder)
    self._pull(self.src_root_folder, release)
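
# A minimal standalone sketch of the same clone-or-pull decision, assuming only
# a git CLI on the PATH. The function name, paths and URL below are illustrative
# and not part of the original dumper, which goes through its own _clone()/_pull()
# helpers; "release" is assumed to be a branch name so that a final pull works.
import os
import shutil
import subprocess

def sync_repo(repo_url, dest, release="master", force=False):
    do_clone = False
    if force:
        # force also serves as a way to clean up and start from scratch
        shutil.rmtree(dest, ignore_errors=True)
    if not os.path.exists(dest):
        # no local copy yet: create the folder, then clone into it
        os.makedirs(dest)
        do_clone = True
    if do_clone:
        subprocess.check_call(["git", "clone", repo_url, dest])
    # switch to the requested release/branch and bring it up to date
    subprocess.check_call(["git", "-C", dest, "checkout", release])
    subprocess.check_call(["git", "-C", dest, "pull"])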
def unregister_url(self, url):
    url = url.strip()
    dp = get_data_plugin()
    doc = dp.find_one({"plugin.url": url})
    # should be only one but just in case
    dp.remove({"plugin.url": url})
    # delete plugin code so it won't be auto-registered
    # by the 'local' plugin assistant (issue studio #7)
    if doc and doc.get("download", {}).get("data_folder"):
        codefolder = doc["download"]["data_folder"]
        self.logger.info("Delete plugin source code in '%s'" % codefolder)
        rmdashfr(codefolder)
    assistant = self.submit(url)
    try:
        self.data_plugin_manager.register.pop(assistant.plugin_name)
    except KeyError:
        raise AssistantException("Plugin '%s' is not registered" % url)
    self.dumper_manager.register.pop(assistant.plugin_name, None)
    self.uploader_manager.register.pop(assistant.plugin_name, None)
def export(self,
           plugin_name,
           folder=None,
           what=["dumper", "uploader", "mapping"],
           purge=False):
    """
    Export generated code for a given plugin name, into the given folder
    (or DEFAULT_EXPORT_FOLDER if None). Exported information can be:
    - dumper: dumper class generated from the manifest
    - uploader: uploader class generated from the manifest
    - mapping: mapping generated from inspection or from the manifest
    If "purge" is true, any existing folder/code will be deleted first;
    otherwise an error is raised if some folder/files already exist.
    """
    res = {}
    # sanity checks
    if type(what) == str:
        what = [what]
    folder = folder or self.default_export_folder
    assert os.path.exists(folder), \
        "Folder used to export code doesn't exist: %s" % os.path.abspath(folder)
    assert plugin_name
    # avoid deleting the whole export folder when purge=True...
    folder = os.path.join(folder, plugin_name)
    if purge:
        rmdashfr(folder)
    if not os.path.exists(folder):
        os.makedirs(folder)
    elif not purge:
        raise FileExistsError(
            "Folder '%s' already exists, use purge=True" % folder)
    dinit = os.path.join(folder, "__init__.py")
    with open(dinit, "w") as fout:
        fout.write("")
    if "dumper" in what:
        res.update(self.export_dumper(plugin_name, folder))
    if "uploader" in what:
        res.update(self.export_uploader(plugin_name, folder))
    if "mapping" in what:
        assert "uploader" in what, "'uploader' needs to be exported too to export mapping"
        res.update(self.export_mapping(plugin_name, folder))
    # there's also at least a parser module, maybe a release module, and some
    # more indirect dependencies not listed in the manifest. We'll just copy
    # everything from the plugin folder to the export folder
    plugin_folder = os.path.join(btconfig.DATA_PLUGIN_FOLDER, plugin_name)
    for f in os.listdir(plugin_folder):
        src = os.path.join(plugin_folder, f)
        dst = os.path.join(folder, f)
        # useless or strictly plugin-machinery-specific, skip
        if f in ["__pycache__", "manifest.json", "__init__.py"] or f.startswith("."):
            self.logger.debug("Skipping '%s', not necessary" % src)
            continue
        self.logger.debug("Copying %s to %s" % (src, dst))
        try:
            with open(src) as fin:
                with open(dst, "w") as fout:
                    fout.write(fin.read())
        except IsADirectoryError:
            self.logger.error(
                "%s is a directory, expecting only files to copy" % src)
            continue
    return res
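
# Hypothetical usage sketch for export() (not taken from the original source):
# the plugin name, folder path and "manager" instance below are illustrative
# assumptions; the manager is expected to be an already-configured object
# exposing this method.
#
#   res = manager.export(
#       "my_plugin",                      # plugin present in DATA_PLUGIN_FOLDER
#       folder="/tmp/exported_plugins",   # must exist; a "my_plugin" subfolder is created
#       what=["dumper", "uploader", "mapping"],
#       purge=True,                       # wipe a previous export instead of raising FileExistsError
#   )
#   # res maps each exported piece (dumper, uploader, mapping) to its details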
def diff_cols(self,
              old_db_col_names,
              new_db_col_names,
              batch_size=100000,
              steps=["count", "content", "mapping"],
              mode=None,
              exclude=[]):
    """
    Compare new with old collections and produce diff files. Root keys can
    be excluded from the comparison with the "exclude" parameter.

    *_db_col_names can be:
    1. a collection name (as a string), assuming it is in the target database.
    2. a tuple with 2 elements, the first one being either "source" or
       "target" to respectively specify the src or target database, and the
       second one being the collection name.
    3. a tuple with 3 elements (URI, db, collection), looking like:
       ("mongodb://*****:*****@host", "dbname", "collection"), allowing to
       specify any connection on any server.

    steps: 'count' will count the root keys for every document in the new
           collection (to check the number of docs from datasources).
           'content' will perform the diff on actual content.
           'mapping' will perform the diff on ES mappings (if a target
           collection is involved).
    mode: 'purge' will remove any existing files for this comparison.
    """
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    # check what to do
    if type(steps) == str:
        steps = [steps]
    diff_folder = generate_diff_folder(old_db_col_names, new_db_col_names)

    if os.path.exists(diff_folder):
        if mode == "purge" and os.path.exists(diff_folder):
            rmdashfr(diff_folder)
        else:
            raise FileExistsError(
                "Found existing files in '%s', use mode='purge'" % diff_folder)
    if not os.path.exists(diff_folder):
        os.makedirs(diff_folder)

    # create metadata file storing info about how we created the diff
    # and some summary data
    diff_stats = {
        "update": 0,
        "add": 0,
        "delete": 0,
        "mapping_changed": False
    }
    metadata = {
        "diff": {
            "type": self.diff_type,
            "func": self.diff_func.__name__,
            "version": "%s.%s" % (old.version, new.version),
            "stats": diff_stats,  # ref to diff_stats
            "files": [],
            # when "new" is a target collection:
            "mapping_file": None,
            "info": {
                "generated_on": str(datetime.now()),
                "exclude": exclude,
                "steps": steps,
                "mode": mode,
                "batch_size": batch_size
            }
        },
        "old": {
            "backend": old_db_col_names,
            "version": old.version
        },
        "new": {
            "backend": new_db_col_names,
            "version": new.version
        },
        # when "new" is a target collection:
        "_meta": {},
        "build_config": {},
    }
    if isinstance(new, DocMongoBackend) and \
            new.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
        build_doc = get_src_build().find_one(
            {"_id": new.target_collection.name})
        if not build_doc:
            raise DifferException("Collection '%s' has no corresponding build document" %
                                  new.target_collection.name)
        metadata["_meta"] = build_doc.get("_meta", {})
        metadata["build_config"] = build_doc.get("build_config")

    # dump it here for minimum information, in case we don't go further
    json.dump(metadata,
              open(os.path.join(diff_folder, "metadata.json"), "w"),
              indent=True)

    got_error = False
    if "mapping" in steps:

        def diff_mapping(old, new, diff_folder):
            summary = {}
            old_build = get_src_build().find_one(
                {"_id": old.target_collection.name})
            new_build = get_src_build().find_one(
                {"_id": new.target_collection.name})
            if old_build and new_build:
                # mapping diff always in jsondiff
                mapping_diff = jsondiff(old_build["mapping"],
                                        new_build["mapping"])
                if mapping_diff:
                    file_name = os.path.join(diff_folder, "mapping.pyobj")
                    dump(mapping_diff, file_name)
                    md5 = md5sum(file_name)
                    summary["mapping_file"] = {
                        "name": os.path.basename(file_name),
                        "md5sum": md5
                    }
            else:
                self.logger.info("Neither '%s' nor '%s' have mappings associated to them, skip" %
                                 (old.target_collection.name, new.target_collection.name))
            return summary

        def mapping_diffed(f):
            res = f.result()
            if res.get("mapping_file"):
                nonlocal got_error
                # check mapping differences: only "add" ops are allowed, as any other
                # actions would be ignored by ES once applied (you can't update/delete
                # elements of an existing mapping)
                mf = os.path.join(diff_folder, res["mapping_file"]["name"])
                ops = loadobj(mf)
                for op in ops:
                    if op["op"] != "add":
                        err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] +
                                              " only 'add' operations are allowed. You can still produce the " +
                                              "diff by removing 'mapping' from 'steps' arguments. " +
                                              "Ex: steps=['count','content']. Diff operation was: %s" % op)
                        got_error = err
                metadata["diff"]["mapping_file"] = mf
                diff_stats["mapping_changed"] = True
                self.logger.info(
                    "Diff file containing mapping differences generated: %s" %
                    res.get("mapping_file"))

        pinfo = {
            "category": "diff",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "step": "mapping: old vs new",
            "description": ""
        }
        job = yield from self.job_manager.defer_to_thread(
            pinfo, partial(diff_mapping, old, new, diff_folder))
        job.add_done_callback(mapping_diffed)
        yield from job
        if got_error:
            raise got_error

    if "count" in steps:
        cnt = 0
        pinfo = {
            "category": "diff",
            "step": "count",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "description": ""
        }
        self.logger.info("Counting root keys in '%s'" % new.target_name)
        diff_stats["root_keys"] = {}
        jobs = []
        data_new = id_feeder(new, batch_size=batch_size)
        for id_list in data_new:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt
            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo,
                partial(diff_worker_count, id_list, new_db_col_names, cnt))
            jobs.append(job)

        def counted(f):
            root_keys = {}
            # merge the counts
            for d in f.result():
                for k in d:
                    root_keys.setdefault(k, 0)
                    root_keys[k] += d[k]
            self.logger.info("root keys count: %s" % root_keys)
            diff_stats["root_keys"] = root_keys

        tasks = asyncio.gather(*jobs)
        tasks.add_done_callback(counted)
        yield from tasks
        self.logger.info(
            "Finished counting keys in the new collection: %s" %
            diff_stats["root_keys"])

    if "content" in steps:
        skip = 0
        cnt = 0
        jobs = []
        pinfo = {
            "category": "diff",
            "source": "%s vs %s" % (new.target_name, old.target_name),
            "step": "content: new vs old",
            "description": ""
        }
        data_new = id_feeder(new, batch_size=batch_size)
        selfcontained = "selfcontained" in self.diff_type
        for id_list_new in data_new:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt

            def diffed(f):
                res = f.result()
                diff_stats["update"] += res["update"]
                diff_stats["add"] += res["add"]
                if res.get("diff_file"):
                    metadata["diff"]["files"].append(res["diff_file"])
                self.logger.info("(Updated: {}, Added: {})".format(
                    res["update"], res["add"]))

            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo,
                partial(diff_worker_new_vs_old, id_list_new,
                        old_db_col_names, new_db_col_names, cnt, diff_folder,
                        self.diff_func, exclude, selfcontained))
            job.add_done_callback(diffed)
            jobs.append(job)
        yield from asyncio.gather(*jobs)
        self.logger.info(
            "Finished calculating diff for the new collection. "
            "Total number of docs updated: {}, added: {}".format(
                diff_stats["update"], diff_stats["add"]))

        data_old = id_feeder(old, batch_size=batch_size)
        jobs = []
        pinfo["step"] = "content: old vs new"
        for id_list_old in data_old:
            cnt += 1
            pinfo["description"] = "batch #%s" % cnt

            def diffed(f):
                res = f.result()
                diff_stats["delete"] += res["delete"]
                if res.get("diff_file"):
                    metadata["diff"]["files"].append(res["diff_file"])
                self.logger.info("(Deleted: {})".format(res["delete"]))

            self.logger.info("Creating diff worker for batch #%s" % cnt)
            job = yield from self.job_manager.defer_to_process(
                pinfo,
                partial(diff_worker_old_vs_new, id_list_old,
                        new_db_col_names, cnt, diff_folder))
            job.add_done_callback(diffed)
            jobs.append(job)
        yield from asyncio.gather(*jobs)
        self.logger.info(
            "Finished calculating diff for the old collection. "
            "Total number of docs deleted: {}".format(diff_stats["delete"]))

    self.logger.info(
        "Summary: (Updated: {}, Added: {}, Deleted: {}, Mapping changed: {})".format(
            diff_stats["update"], diff_stats["add"], diff_stats["delete"],
            diff_stats["mapping_changed"]))

    # dump metadata again with potentially more information (diff_stats)
    json.dump(metadata,
              open(os.path.join(diff_folder, "metadata.json"), "w"),
              indent=True)
    strargs = "[old=%s,new=%s,steps=%s,diff_stats=%s]" % (
        old_db_col_names, new_db_col_names, steps, diff_stats)
    self.logger.info("success %s" % strargs, extra={"notify": True})
    return diff_stats
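
# Hypothetical invocation sketch for diff_cols(), illustrating the three
# accepted forms of *_db_col_names described in its docstring. The "differ"
# instance and collection names are assumptions; diff_cols() is a
# generator-based coroutine, so it is meant to be driven by the hub's job
# manager / event loop rather than called as a plain function.
#
#   # 1. plain collection names, assumed to live in the target database:
#   differ.diff_cols("mynews_20230101", "mynews_20230201")
#
#   # 2. (database, collection) tuples, database being "source" or "target":
#   differ.diff_cols(("target", "mynews_20230101"), ("target", "mynews_20230201"))
#
#   # 3. full (URI, db, collection) tuples for arbitrary servers:
#   differ.diff_cols(("mongodb://*****:*****@host", "dbname", "col_old"),
#                    ("mongodb://*****:*****@host", "dbname", "col_new"),
#                    steps=["count", "content"],   # skip the ES mapping diff
#                    mode="purge")                 # discard any previous diff files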