Example #1
 def unregister_url(self, url=None, name=None):
     dp = get_data_plugin()
     if url:
         url = url.strip()
         doc = dp.find_one({"plugin.url": url})
     elif name:
         doc = dp.find_one({"_id": name})
     else:
         raise ValueError("Specify 'url' or 'name'")
     if not doc:
         raise AssistantException(
             "Plugin is not registered (url=%s, name=%s)" % (url, name))
     # resolve the URL from the registration doc when only 'name' was given
     url = url or doc["plugin"]["url"]
     # should be only one but just in case
     dp.remove({"_id": doc["_id"]})
     # delete the plugin code so it won't be auto-registered
     # by the 'local' plugin assistant (issue studio #7)
     if doc.get("download", {}).get("data_folder"):
         codefolder = doc["download"]["data_folder"]
         self.logger.info("Delete plugin source code in '%s'" % codefolder)
         rmdashfr(codefolder)
     assistant = self.submit(url)
     try:
         self.data_plugin_manager.register.pop(assistant.plugin_name)
     except KeyError:
         raise AssistantException("Plugin '%s' is not registered" % url)
     self.dumper_manager.register.pop(assistant.plugin_name, None)
     self.uploader_manager.register.pop(assistant.plugin_name, None)
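A hedged usage sketch for the method above: "assistant_manager" and the plugin URL/name are illustrative assumptions, not values from the source; the point is that either a URL or a registered plugin name can be passed.

    # hypothetical call sites (assumed AssistantManager-like instance)
    assistant_manager.unregister_url(url="https://github.com/someuser/someplugin.git")
    assistant_manager.unregister_url(name="someplugin")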
Example #2
 def do():
     do_clone = False
     if force:
         # force is also a way to clean and start from scratch
         rmdashfr(self.src_root_folder)
     if not os.path.exists(self.src_root_folder):
         # data folder doesn't even exist, no git files yet, we need to clone
         os.makedirs(self.src_root_folder)
         do_clone = True
     self.register_status("downloading", transient=True)
     if do_clone:
         self._clone(self.__class__.GIT_REPO_URL, self.src_root_folder)
     self._pull(self.src_root_folder, release)
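The clone-or-pull decision above can be shown standalone. A minimal sketch, assuming "git" is available on PATH; repo_url and folder are placeholders, not values from the source:

    import os
    import subprocess

    def clone_or_pull(repo_url, folder):
        # clone when the folder doesn't exist yet, otherwise just update it
        if not os.path.exists(folder):
            os.makedirs(folder)
            subprocess.check_call(["git", "clone", repo_url, folder])
        else:
            subprocess.check_call(["git", "-C", folder, "pull"])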
Example #3
 def unregister_url(self, url):
     url = url.strip()
     dp = get_data_plugin()
     doc = dp.find_one({"plugin.url": url})
     if not doc:
         raise AssistantException("Plugin '%s' is not registered" % url)
     # should be only one but just in case
     dp.remove({"plugin.url": url})
     # delete the plugin code so it won't be auto-registered
     # by the 'local' plugin assistant (issue studio #7)
     if doc.get("download", {}).get("data_folder"):
         codefolder = doc["download"]["data_folder"]
         self.logger.info("Delete plugin source code in '%s'" % codefolder)
         rmdashfr(codefolder)
     assistant = self.submit(url)
     try:
         self.data_plugin_manager.register.pop(assistant.plugin_name)
     except KeyError:
         raise AssistantException("Plugin '%s' is not registered" % url)
     self.dumper_manager.register.pop(assistant.plugin_name, None)
     self.uploader_manager.register.pop(assistant.plugin_name, None)
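rmdashfr() is used in every example but never defined here. The sketch below is only an assumption about its behavior (the name reads as "rm -rf"), not the actual implementation:

    import shutil

    def rmdashfr(path):
        # recursively delete 'path', silently ignoring a missing folder
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass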
Example #4
    def export(self,
               plugin_name,
               folder=None,
               what=["dumper", "uploader", "mapping"],
               purge=False):
        """
        Export generated code for a given plugin name, in given folder
        (or use DEFAULT_EXPORT_FOLDER if None). Exported information can be:
        - dumper: dumper class generated from the manifest
        - uploader: uploader class generated from the manifest
        - mapping: mapping generated from inspection or from the manifest
        If "purge" is true, any existing folder/code will be deleted first, otherwise,
        will raise an error if some folder/files already exist.
        """
        res = {}
        # sanity checks
        if isinstance(what, str):
            what = [what]
        folder = folder or self.default_export_folder
        assert os.path.exists(
            folder
        ), "Folder used to export code doesn't exist: %s" % os.path.abspath(
            folder)
        assert plugin_name  # avoid deleting the whole export folder when purge=True...
        folder = os.path.join(folder, plugin_name)
        if purge:
            rmdashfr(folder)
        if not os.path.exists(folder):
            os.makedirs(folder)
        elif not purge:
            raise FileExistsError(
                "Folder '%s' already exists, use purge=True" % folder)
        dinit = os.path.join(folder, "__init__.py")
        with open(dinit, "w") as fout:
            fout.write("")
        if "dumper" in what:
            res.update(self.export_dumper(plugin_name, folder))
        if "uploader" in what:
            res.update(self.export_uploader(plugin_name, folder))
        if "mapping" in what:
            assert "uploader" in what, "'uploader' needs to be exported too to export mapping"
            res.update(self.export_mapping(plugin_name, folder))
        # there's also at least a parser module, maybe a release module, and possibly
        # more indirect dependencies not listed in the manifest. We'll just copy everything
        # from the plugin folder to the export folder
        plugin_folder = os.path.join(btconfig.DATA_PLUGIN_FOLDER, plugin_name)
        for f in os.listdir(plugin_folder):
            src = os.path.join(plugin_folder, f)
            dst = os.path.join(folder, f)
            # useless or strictly plugin-machinery-specific, skip
            if f in ["__pycache__", "manifest.json", "__init__.py"
                     ] or f.startswith("."):
                self.logger.debug("Skipping '%s', not necessary" % src)
                continue
            self.logger.debug("Copying %s to %s" % (src, dst))
            try:
                with open(src) as fin:
                    with open(dst, "w") as fout:
                        fout.write(fin.read())
            except IsADirectoryError:
                self.logger.error(
                    "%s is a directory, expecting only files to copy" % src)
                continue

        return res
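A hedged usage sketch for export(): "exporter" and the target folder are illustrative assumptions; the call simply mirrors the documented parameters (folder, what, purge). Note the export folder itself must already exist (see the assert above).

    # regenerate dumper + uploader + mapping code for a plugin, overwriting any previous export
    res = exporter.export("myplugin",
                          folder="/tmp/export",
                          what=["dumper", "uploader", "mapping"],
                          purge=True)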
Example #5
    def diff_cols(self,
                  old_db_col_names,
                  new_db_col_names,
                  batch_size=100000,
                  steps=["count", "content", "mapping"],
                  mode=None,
                  exclude=[]):
        """
        Compare new with old collections and produce diff files. Root keys can be excluded from
        comparison with "exclude" parameter.
         *_db_col_names can be:
          1. a collection name (as a string), assuming it lives
             in the target database.
          2. a tuple with 2 elements: the first is either "source" or "target"
             to specify the source or target database, and the second is
             the collection name.
          3. a tuple with 3 elements (URI, db, collection), e.g.
             ("mongodb://*****:*****@host", "dbname", "collection"), allowing any
             connection to any server.
         steps: 'count' will count the root keys for every document in the new collection
                (to check the number of docs coming from each datasource).
                'content' will perform a diff on the actual content.
                'mapping' will perform a diff on the ES mappings (if a target collection is involved).
        mode: 'purge' will remove any existing files for this comparison.
        """
        new = create_backend(new_db_col_names)
        old = create_backend(old_db_col_names)
        # check what to do
        if isinstance(steps, str):
            steps = [steps]

        diff_folder = generate_diff_folder(old_db_col_names, new_db_col_names)

        if os.path.exists(diff_folder):
            if mode == "purge":
                rmdashfr(diff_folder)
            else:
                raise FileExistsError(
                    "Found existing files in '%s', use mode='purge'" %
                    diff_folder)
        if not os.path.exists(diff_folder):
            os.makedirs(diff_folder)

        # create metadata file storing info about how we created the diff
        # and some summary data
        diff_stats = {
            "update": 0,
            "add": 0,
            "delete": 0,
            "mapping_changed": False
        }
        metadata = {
            "diff": {
                "type": self.diff_type,
                "func": self.diff_func.__name__,
                "version": "%s.%s" % (old.version, new.version),
                "stats": diff_stats,  # ref to diff_stats
                "files": [],
                # when "new" is a target collection:
                "mapping_file": None,
                "info": {
                    "generated_on": str(datetime.now()),
                    "exclude": exclude,
                    "steps": steps,
                    "mode": mode,
                    "batch_size": batch_size
                }
            },
            "old": {
                "backend": old_db_col_names,
                "version": old.version
            },
            "new": {
                "backend": new_db_col_names,
                "version": new.version
            },
            # when "new" is a target collection:
            "_meta": {},
            "build_config": {},
        }
        if isinstance(
                new, DocMongoBackend
        ) and new.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
            build_doc = get_src_build().find_one(
                {"_id": new.target_collection.name})
            if not build_doc:
                raise DifferException("Collection '%s' has no corresponding build document" % \
                        new.target_collection.name)
            metadata["_meta"] = build_doc.get("_meta", {})
            metadata["build_config"] = build_doc.get("build_config")

        # dump it here for minimum information, in case we don't go further
        with open(os.path.join(diff_folder, "metadata.json"), "w") as fout:
            json.dump(metadata, fout, indent=True)

        got_error = False
        if "mapping" in steps:

            def diff_mapping(old, new, diff_folder):
                summary = {}
                old_build = get_src_build().find_one(
                    {"_id": old.target_collection.name})
                new_build = get_src_build().find_one(
                    {"_id": new.target_collection.name})
                if old_build and new_build:
                    # mapping diff always in jsondiff
                    mapping_diff = jsondiff(old_build["mapping"],
                                            new_build["mapping"])
                    if mapping_diff:
                        file_name = os.path.join(diff_folder, "mapping.pyobj")
                        dump(mapping_diff, file_name)
                        md5 = md5sum(file_name)
                        summary["mapping_file"] = {
                            "name": os.path.basename(file_name),
                            "md5sum": md5
                        }
                else:
                    self.logger.info("Neither '%s' nor '%s' have mappings associated to them, skip" % \
                            (old.target_collection.name,new.target_collection.name))
                return summary

            def mapping_diffed(f):
                res = f.result()
                if res.get("mapping_file"):
                    nonlocal got_error
                    # check mapping differences: only "add" ops are allowed, as any other
                    # actions would be ignored by ES once applied (you can't update/delete
                    # elements of an existing mapping)
                    mf = os.path.join(diff_folder, res["mapping_file"]["name"])
                    ops = loadobj(mf)
                    for op in ops:
                        if op["op"] != "add":
                            err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] + \
                                " only 'add' operations are allowed. You can still produce the " + \
                                "diff by removing 'mapping' from 'steps' arguments. " + \
                                "Ex: steps=['count','content']. Diff operation was: %s" % op)
                            got_error = err
                    metadata["diff"]["mapping_file"] = mf
                    diff_stats["mapping_changed"] = True
                self.logger.info(
                    "Diff file containing mapping differences generated: %s" %
                    res.get("mapping_file"))

            pinfo = {
                "category": "diff",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "step": "mapping: old vs new",
                "description": ""
            }
            job = yield from self.job_manager.defer_to_thread(
                pinfo, partial(diff_mapping, old, new, diff_folder))
            job.add_done_callback(mapping_diffed)
            yield from job
            if got_error:
                raise got_error

        if "count" in steps:
            cnt = 0
            pinfo = {
                "category": "diff",
                "step": "count",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "description": ""
            }

            self.logger.info("Counting root keys in '%s'" % new.target_name)
            diff_stats["root_keys"] = {}
            jobs = []
            data_new = id_feeder(new, batch_size=batch_size)
            for id_list in data_new:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt
                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_count, id_list, new_db_col_names, cnt))
                jobs.append(job)

            def counted(f):
                root_keys = {}
                # merge the counts
                for d in f.result():
                    for k in d:
                        root_keys.setdefault(k, 0)
                        root_keys[k] += d[k]
                self.logger.info("root keys count: %s" % root_keys)
                diff_stats["root_keys"] = root_keys

            tasks = asyncio.gather(*jobs)
            tasks.add_done_callback(counted)
            yield from tasks
            self.logger.info(
                "Finished counting keys in the new collection: %s" %
                diff_stats["root_keys"])

        if "content" in steps:
            skip = 0
            cnt = 0
            jobs = []
            pinfo = {
                "category": "diff",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "step": "content: new vs old",
                "description": ""
            }
            data_new = id_feeder(new, batch_size=batch_size)
            selfcontained = "selfcontained" in self.diff_type
            for id_list_new in data_new:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt

                def diffed(f):
                    res = f.result()
                    diff_stats["update"] += res["update"]
                    diff_stats["add"] += res["add"]
                    if res.get("diff_file"):
                        metadata["diff"]["files"].append(res["diff_file"])
                    self.logger.info("(Updated: {}, Added: {})".format(
                        res["update"], res["add"]))

                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_new_vs_old, id_list_new,
                            old_db_col_names, new_db_col_names, cnt,
                            diff_folder, self.diff_func, exclude,
                            selfcontained))
                job.add_done_callback(diffed)
                jobs.append(job)
            yield from asyncio.gather(*jobs)
            self.logger.info(
                "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
                .format(diff_stats["update"], diff_stats["add"]))

            data_old = id_feeder(old, batch_size=batch_size)
            jobs = []
            pinfo["step"] = "content: old vs new"
            for id_list_old in data_old:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt

                def diffed(f):
                    res = f.result()
                    diff_stats["delete"] += res["delete"]
                    if res.get("diff_file"):
                        metadata["diff"]["files"].append(res["diff_file"])
                    self.logger.info("(Deleted: {})".format(res["delete"]))

                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_old_vs_new, id_list_old,
                            new_db_col_names, cnt, diff_folder))
                job.add_done_callback(diffed)
                jobs.append(job)
            yield from asyncio.gather(*jobs)
            self.logger.info(
                "Finished calculating diff for the old collection. Total number of docs deleted: {}"
                .format(diff_stats["delete"]))

        self.logger.info(
            "Summary: (Updated: {}, Added: {}, Deleted: {}, Mapping changed: {})"
            .format(diff_stats["update"], diff_stats["add"],
                    diff_stats["delete"], diff_stats["mapping_changed"]))

        # dump the metadata again, now with more information (diff_stats)
        with open(os.path.join(diff_folder, "metadata.json"), "w") as fout:
            json.dump(metadata, fout, indent=True)
        strargs = "[old=%s,new=%s,steps=%s,diff_stats=%s]" % (
            old_db_col_names, new_db_col_names, steps, diff_stats)
        self.logger.info("success %s" % strargs, extra={"notify": True})
        return diff_stats
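A hedged usage sketch for diff_cols(): it is written in the legacy "yield from" coroutine style, so it is normally driven from another coroutine (or through the hub's job manager). "differ" and the collection names below are assumptions:

    import asyncio

    @asyncio.coroutine
    def run_diff(differ):
        stats = yield from differ.diff_cols(
            old_db_col_names=("target", "mydata_build_old"),
            new_db_col_names=("target", "mydata_build_new"),
            steps=["count", "content", "mapping"],
            mode="purge")
        return stats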