Example #1
0
    def update_tasks(self):
        """Regenerate the zettarepl definition and push it to the worker.

        If building the definition fails, record an ERROR state via
        ``zettarepl.set_error`` and return without touching the worker.
        Otherwise clear any previous error, stop the service when the
        definition is empty (start it and enqueue the tasks when it is not),
        and finally notify observers of the new definition and held tasks.
        """
        call = self.middleware.call_sync
        try:
            definition, hold_tasks = call("zettarepl.get_definition")
        except Exception as e:
            self.logger.error("Error generating zettarepl definition", exc_info=True)
            error_state = {
                "state": "ERROR",
                "datetime": datetime.utcnow(),
                "error": make_sentence(str(e)),
            }
            call("zettarepl.set_error", error_state)
            return

        # Definition built fine: clear any previously recorded error.
        call("zettarepl.set_error", None)

        if self._is_empty_definition(definition):
            call("zettarepl.stop")
        else:
            call("zettarepl.start")
            self.queue.put(("tasks", definition))

        call("zettarepl.notify_definition", definition, hold_tasks)
Example #2
0
    def start(self):
        """Start the zettarepl worker process if it is not already running.

        Builds the current definition first; on failure the error is recorded
        via `zettarepl.set_error` and re-raised as a `CallError`, so the
        worker is never started with a broken definition.
        """
        try:
            definition, hold_tasks = self.middleware.call_sync("zettarepl.get_definition")
        except Exception as e:
            self.logger.error("Error generating zettarepl definition", exc_info=True)
            self.middleware.call_sync("zettarepl.set_error", {
                "state": "ERROR",
                "datetime": datetime.utcnow(),
                "error": make_sentence(str(e)),
            })
            raise CallError(f"Internal error: {e!r}")
        else:
            # Definition built successfully: clear any previously recorded error.
            self.middleware.call_sync("zettarepl.set_error", None)

        # Lock guards against two concurrent starts racing on queue/process.
        with self.lock:
            if not self.is_running():
                self.queue = multiprocessing.Queue()
                self.process = multiprocessing.Process(
                    name="zettarepl",
                    target=ZettareplProcess(definition, self.middleware.debug_level, self.middleware.log_handler,
                                            self.queue, self.observer_queue)
                )
                self.process.start()
                # Join the worker from a daemon thread so its exit is observed.
                start_daemon_thread(target=self._join, args=(self.process,))

                # The observer queue reader thread is created once and reused
                # across worker restarts.
                if self.observer_queue_reader is None:
                    self.observer_queue_reader = start_daemon_thread(target=self._observer_queue_reader)

                # NOTE: observers are only notified when a fresh worker was
                # actually started here.
                self.middleware.call_sync("zettarepl.notify_definition", definition, hold_tasks)
Example #3
0
    def run_replication_task(self, id, really_run, job):
        """Stream a replication task's messages into a middleware job.

        When `really_run` is true, asks the zettarepl worker to execute the
        task; failure to enqueue means the worker process is down. Then
        subscribes a private queue to the task's channel list and blocks,
        copying log lines into the job log until the task either succeeds
        (return) or fails (raise CallError).
        """
        if really_run:
            try:
                self.queue.put(("run_task", ("ReplicationTask", f"task_{id}")))
            except Exception:
                raise CallError("Replication service is not running")

        my_channel = queue.Queue()
        subscribers = self.replication_jobs_channels[f"task_{id}"]
        subscribers.append(my_channel)
        try:
            while True:
                msg = my_channel.get()

                if isinstance(msg, ReplicationTaskLog):
                    job.logs_fd.write(msg.log.encode("utf8", "ignore") + b"\n")

                if isinstance(msg, ReplicationTaskSuccess):
                    return

                if isinstance(msg, ReplicationTaskError):
                    raise CallError(make_sentence(msg.error))
        finally:
            # Always unsubscribe so the channel list does not leak queues.
            subscribers.remove(my_channel)
Example #4
0
    def run_replication_task(self, id, really_run, job):
        """Stream a replication task's messages into a middleware job.

        When `really_run` is true, asks the zettarepl worker to execute the
        task; failure to enqueue means the worker process is down. Then
        subscribes a private queue to the task's channel list and blocks,
        writing log lines to the job log and updating job progress until the
        task either succeeds (return) or fails (raise CallError).
        """
        if really_run:
            try:
                self.queue.put(("run_task", ("ReplicationTask", f"task_{id}")))
            except Exception:
                raise CallError("Replication service is not running")

        channels = self.replication_jobs_channels[f"task_{id}"]
        channel = queue.Queue()
        channels.append(channel)
        try:
            while True:
                message = channel.get()

                if isinstance(message, ReplicationTaskLog):
                    job.logs_fd.write(message.log.encode("utf8", "ignore") + b"\n")

                if isinstance(message, ReplicationTaskSnapshotProgress):
                    # Guard against ZeroDivisionError: `total` can be 0, e.g.
                    # before the stream size is known or for an empty snapshot.
                    if message.total:
                        percentage = 100 * message.current / message.total
                    else:
                        percentage = 0
                    job.set_progress(
                        percentage,
                        f"Sending {message.dataset}@{message.snapshot} "
                        f"({humanfriendly.format_size(message.current)} / {humanfriendly.format_size(message.total)})",
                    )

                if isinstance(message, ReplicationTaskSuccess):
                    return

                if isinstance(message, ReplicationTaskError):
                    raise CallError(make_sentence(message.error))
        finally:
            # Always unsubscribe so the channel list does not leak queues.
            channels.remove(channel)
Example #5
0
    async def get_definition(self):
        """Assemble the zettarepl definition from the current configuration.

        Returns a tuple ``(definition, hold_tasks)`` where `definition` is the
        dict fed to zettarepl and `hold_tasks` maps task ids to HOLD state
        dicts for tasks that cannot run right now.
        """
        config = await self.middleware.call("replication.config.config")
        timezone = (await self.middleware.call("system.general.config"))["timezone"]

        pools = {p["name"]: p for p in await self.middleware.call("pool.query")}

        hold_tasks = {}

        # Periodic snapshot tasks: held tasks are recorded, the rest converted.
        periodic_snapshot_tasks = {}
        for pst in await self.middleware.call("pool.snapshottask.query", [["enabled", "=", True]]):
            reason = self._hold_task_reason(pools, pst["dataset"])
            if reason:
                hold_tasks[f"periodic_snapshot_task_{pst['id']}"] = reason
            else:
                periodic_snapshot_tasks[f"task_{pst['id']}"] = self.periodic_snapshot_task_definition(pst)

        # Regular replication tasks.
        replication_tasks = {}
        for rt in await self.middleware.call("replication.query", [["enabled", "=", True]]):
            try:
                replication_tasks[f"task_{rt['id']}"] = await self._replication_task_definition(pools, rt)
            except HoldReplicationTaskException as e:
                hold_tasks[f"replication_task_{rt['id']}"] = e.reason

        # One-shot replication tasks created for specific jobs.
        for job_id, rt in self.onetime_replication_tasks.items():
            try:
                replication_tasks[f"job_{job_id}"] = await self._replication_task_definition(pools, rt)
            except HoldReplicationTaskException as e:
                hold_tasks[f"job_{job_id}"] = e.reason

        definition = {
            "max-parallel-replication-tasks": config["max_parallel_replication_tasks"],
            "timezone": timezone,
            "use-removal-dates": True,
            "periodic-snapshot-tasks": periodic_snapshot_tasks,
            "replication-tasks": replication_tasks,
        }

        # Parse once to surface (but not raise on) structural problems.
        Definition.from_data(definition, raise_on_error=False)

        hold_tasks = {
            task_id: {
                "state": "HOLD",
                "datetime": datetime.utcnow(),
                "reason": make_sentence(reason),
            }
            for task_id, reason in hold_tasks.items()
        }

        return definition, hold_tasks
Example #6
0
    def _run_replication_task_job(self, id, job):
        """Drive a middleware job from a replication task's message channel.

        Subscribes a private queue to the task's channel list and tracks the
        most recent snapshot-start, snapshot-progress and data-progress
        messages so the job progress can be recomputed on each update.
        Returns on success; raises CallError on task failure.
        """
        subscribers = self.replication_jobs_channels[id]
        my_channel = queue.Queue()
        subscribers.append(my_channel)

        last_snapshot_start = None
        last_snapshot_progress = None
        last_data_progress = None
        try:
            while True:
                msg = my_channel.get()

                if isinstance(msg, ReplicationTaskLog):
                    job.logs_fd.write(msg.log.encode("utf8", "ignore") + b"\n")

                if isinstance(msg, ReplicationTaskSnapshotStart):
                    # A new snapshot started: its progress is unknown again.
                    last_snapshot_start = msg
                    last_snapshot_progress = None
                    self._set_replication_task_progress(
                        job, last_snapshot_start, last_snapshot_progress, last_data_progress)

                if isinstance(msg, ReplicationTaskSnapshotProgress):
                    last_snapshot_progress = msg
                    self._set_replication_task_progress(
                        job, last_snapshot_start, last_snapshot_progress, last_data_progress)

                if isinstance(msg, ReplicationTaskDataProgress):
                    last_data_progress = msg
                    self._set_replication_task_progress(
                        job, last_snapshot_start, last_snapshot_progress, last_data_progress)

                if isinstance(msg, ReplicationTaskSuccess):
                    return

                if isinstance(msg, ReplicationTaskError):
                    raise CallError(make_sentence(msg.error))
        finally:
            # Always unsubscribe so the channel list does not leak queues.
            subscribers.remove(my_channel)
Example #7
0
    def _observer_queue_reader(self):
        """Forever consume messages from the zettarepl observer queue.

        Translates worker messages into middleware state updates
        (``zettarepl.set_state`` etc.) and fans replication-task messages out
        to any job channels subscribed in ``self.replication_jobs_channels``.
        Runs as a daemon thread; any per-message exception is logged and the
        loop continues.
        """
        while True:
            message = self.observer_queue.get()

            try:
                self.logger.trace("Observer queue got %r", message)

                # Global events

                if isinstance(message, DefinitionErrors):
                    definition_errors = {}
                    for error in message.errors:
                        if isinstance(error, PeriodicSnapshotTaskDefinitionError):
                            definition_errors[f"periodic_snapshot_{error.task_id}"] = {
                                "state": "ERROR",
                                "datetime": datetime.utcnow(),
                                "error": make_sentence(str(error)),
                            }
                        if isinstance(error, ReplicationTaskDefinitionError):
                            definition_errors[f"replication_{error.task_id}"] = {
                                "state": "ERROR",
                                "datetime": datetime.utcnow(),
                                "error": make_sentence(str(error)),
                            }

                    self.middleware.call_sync("zettarepl.set_definition_errors", definition_errors)

                # Periodic snapshot task

                if isinstance(message, PeriodicSnapshotTaskStart):
                    self.middleware.call_sync("zettarepl.set_state", f"periodic_snapshot_{message.task_id}", {
                        "state": "RUNNING",
                        "datetime": datetime.utcnow(),
                    })

                if isinstance(message, PeriodicSnapshotTaskSuccess):
                    self.middleware.call_sync("zettarepl.set_state", f"periodic_snapshot_{message.task_id}", {
                        "state": "FINISHED",
                        "datetime": datetime.utcnow(),
                    })

                if isinstance(message, PeriodicSnapshotTaskError):
                    self.middleware.call_sync("zettarepl.set_state", f"periodic_snapshot_{message.task_id}", {
                        "state": "ERROR",
                        "datetime": datetime.utcnow(),
                        "error": make_sentence(message.error),
                    })

                # Replication task events

                if isinstance(message, ReplicationTaskScheduled):
                    # Do not demote an already-RUNNING task back to WAITING.
                    if (
                            (self.middleware.call_sync(
                                "zettarepl.get_state_internal", f"replication_{message.task_id}"
                            ) or {}).get("state") != "RUNNING"
                    ):
                        self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                            "state": "WAITING",
                            "datetime": datetime.utcnow(),
                        })

                if isinstance(message, ReplicationTaskStart):
                    self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                        "state": "RUNNING",
                        "datetime": datetime.utcnow(),
                    })

                    # Start fake job if none are already running
                    if not self.replication_jobs_channels[message.task_id]:
                        self.middleware.call_sync("replication.run", int(message.task_id[5:]), False)

                if isinstance(message, ReplicationTaskLog):
                    for channel in self.replication_jobs_channels[message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskSnapshotStart):
                    self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                        "state": "RUNNING",
                        "datetime": datetime.utcnow(),
                        "progress": {
                            "dataset": message.dataset,
                            "snapshot": message.snapshot,
                            "snapshots_sent": message.snapshots_sent,
                            "snapshots_total": message.snapshots_total,
                            # Byte counters reset at the start of each snapshot.
                            "bytes_sent": 0,
                            "bytes_total": 0,
                            # legacy
                            "current": 0,
                            "total": 0,
                        }
                    })

                    for channel in self.replication_jobs_channels[message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskSnapshotProgress):
                    self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                        "state": "RUNNING",
                        "datetime": datetime.utcnow(),
                        "progress": {
                            "dataset": message.dataset,
                            "snapshot": message.snapshot,
                            "snapshots_sent": message.snapshots_sent,
                            "snapshots_total": message.snapshots_total,
                            "bytes_sent": message.bytes_sent,
                            "bytes_total": message.bytes_total,
                            # legacy
                            "current": message.bytes_sent,
                            "total": message.bytes_total,
                        }
                    })

                    for channel in self.replication_jobs_channels[message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskSnapshotSuccess):
                    self.middleware.call_sync("zettarepl.set_last_snapshot", f"replication_{message.task_id}",
                                              f"{message.dataset}@{message.snapshot}")

                    for channel in self.replication_jobs_channels[message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskDataProgress):
                    task_id = f"replication_{message.task_id}"
                    try:
                        state = self.middleware.call_sync("zettarepl.get_internal_task_state", task_id)
                    except KeyError:
                        # Task state not tracked (yet); nothing to update.
                        pass
                    else:
                        # Only augment progress of a task that is still running.
                        if state["state"] == "RUNNING" and "progress" in state:
                            state["progress"].update({
                                "root_dataset": message.dataset,
                                "src_size": message.src_size,
                                "dst_size": message.dst_size,
                            })
                            self.middleware.call_sync("zettarepl.set_state", task_id, state)

                    for channel in self.replication_jobs_channels[message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskSuccess):
                    self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                        "state": "FINISHED",
                        "datetime": datetime.utcnow(),
                        "warnings": message.warnings,
                    })

                    for channel in self.replication_jobs_channels[message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskError):
                    self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                        "state": "ERROR",
                        "datetime": datetime.utcnow(),
                        "error": make_sentence(message.error),
                    })

                    for channel in self.replication_jobs_channels[message.task_id]:
                        channel.put(message)

            except Exception:
                # Keep the reader alive: a bad message must not kill the thread.
                self.logger.warning("Unhandled exception in observer_queue_reader", exc_info=True)
Example #8
0
    async def get_definition(self):
        """Assemble the zettarepl definition from the current configuration.

        Returns a tuple ``(definition, hold_tasks)`` where `definition` is the
        dict fed to zettarepl and `hold_tasks` maps task ids to the reason the
        task cannot run right now (later wrapped into HOLD state dicts).
        """
        timezone = (await self.middleware.call("system.general.config"))["timezone"]

        pools = {p["name"]: p for p in await self.middleware.call("pool.query")}

        hold_tasks = {}

        # Periodic snapshot tasks: held tasks are recorded, the rest converted.
        periodic_snapshot_tasks = {}
        for pst in await self.middleware.call("pool.snapshottask.query", [["enabled", "=", True]]):
            reason = self._hold_task_reason(pools, pst["dataset"])
            if reason:
                hold_tasks[f"periodic_snapshot_task_{pst['id']}"] = reason
                continue

            periodic_snapshot_tasks[f"task_{pst['id']}"] = {
                "dataset": pst["dataset"],
                "recursive": pst["recursive"],
                "exclude": pst["exclude"],
                "lifetime": lifetime_iso8601(pst["lifetime_value"], pst["lifetime_unit"]),
                "naming-schema": pst["naming_schema"],
                "schedule": zettarepl_schedule(pst["schedule"]),
                "allow-empty": pst["allow_empty"],
            }

        replication_tasks = {}
        for rt in await self.middleware.call("replication.query", [["enabled", "=", True]]):
            # PUSH tasks are held if any source dataset's pool is unavailable.
            if rt["direction"] == "PUSH":
                held = False
                for source_dataset in rt["source_datasets"]:
                    reason = self._hold_task_reason(pools, source_dataset)
                    if reason:
                        hold_tasks[f"replication_task_{rt['id']}"] = reason
                        held = True
                        break
                if held:
                    continue

            # PULL tasks are held if the target dataset's pool is unavailable.
            if rt["direction"] == "PULL":
                reason = self._hold_task_reason(pools, rt["target_dataset"])
                if reason:
                    hold_tasks[f"replication_task_{rt['id']}"] = reason
                    continue

            # Non-local transports additionally require network activity.
            if rt["transport"] != "LOCAL":
                if not await self.middleware.call("network.general.can_perform_activity", "replication"):
                    hold_tasks[f"replication_task_{rt['id']}"] = "Replication network activity is disabled"
                    continue

            try:
                transport = await self._define_transport(
                    rt["transport"],
                    (rt["ssh_credentials"] or {}).get("id"),
                    rt["netcat_active_side"],
                    rt["netcat_active_side_listen_address"],
                    rt["netcat_active_side_port_min"],
                    rt["netcat_active_side_port_max"],
                    rt["netcat_passive_side_connect_address"],
                )
            except CallError as e:
                hold_tasks[f"replication_task_{rt['id']}"] = e.errmsg
                continue

            my_periodic_snapshot_tasks = [f"task_{pst['id']}" for pst in rt["periodic_snapshot_tasks"]]
            my_schedule = rt["schedule"]

            definition = {
                "direction": rt["direction"].lower(),
                "transport": transport,
                "source-dataset": rt["source_datasets"],
                "target-dataset": rt["target_dataset"],
                "recursive": rt["recursive"],
                "exclude": replication_task_exclude(rt),
                "properties": rt["properties"],
                "properties-exclude": rt["properties_exclude"],
                "properties-override": rt["properties_override"],
                "replicate": rt["replicate"],
                "periodic-snapshot-tasks": my_periodic_snapshot_tasks,
                "auto": rt["auto"],
                "only-matching-schedule": rt["only_matching_schedule"],
                "allow-from-scratch": rt["allow_from_scratch"],
                "readonly": rt["readonly"].lower(),
                "hold-pending-snapshots": rt["hold_pending_snapshots"],
                "retention-policy": rt["retention_policy"].lower(),
                "large-block": rt["large_block"],
                "embed": rt["embed"],
                "compressed": rt["compressed"],
                "retries": rt["retries"],
                "logging-level": (rt["logging_level"] or "NOTSET").lower(),
            }

            # Optional keys are added only when configured.
            if rt["encryption"]:
                definition["encryption"] = {
                    "key": rt["encryption_key"],
                    "key-format": rt["encryption_key_format"],
                    "key-location": rt["encryption_key_location"],
                }
            if rt["naming_schema"]:
                definition["naming-schema"] = rt["naming_schema"]
            if rt["also_include_naming_schema"]:
                definition["also-include-naming-schema"] = rt["also_include_naming_schema"]
            if my_schedule is not None:
                definition["schedule"] = zettarepl_schedule(my_schedule)
            if rt["restrict_schedule"] is not None:
                definition["restrict-schedule"] = zettarepl_schedule(rt["restrict_schedule"])
            if rt["lifetime_value"] is not None and rt["lifetime_unit"] is not None:
                definition["lifetime"] = lifetime_iso8601(rt["lifetime_value"], rt["lifetime_unit"])
            if rt["compression"] is not None:
                definition["compression"] = rt["compression"].lower()
            if rt["speed_limit"] is not None:
                definition["speed-limit"] = rt["speed_limit"]

            replication_tasks[f"task_{rt['id']}"] = definition

        definition = {
            "timezone": timezone,
            "periodic-snapshot-tasks": periodic_snapshot_tasks,
            "replication-tasks": replication_tasks,
        }

        # Parse once to surface (but not raise on) structural problems.
        Definition.from_data(definition, raise_on_error=False)

        hold_tasks = {
            task_id: {
                "state": "HOLD",
                "datetime": datetime.utcnow(),
                "reason": make_sentence(reason),
            }
            for task_id, reason in hold_tasks.items()
        }

        return definition, hold_tasks
Example #9
0
    def _observer_queue_reader(self):
        """Forever consume messages from the zettarepl observer queue.

        Translates worker messages into in-memory state (``self.state``,
        ``self.last_snapshot``, ``self.serializable_state``,
        ``self.definition_errors``) and fans replication-task messages out to
        any job channels subscribed in ``self.replication_jobs_channels``.
        Runs as a daemon thread; any per-message exception is logged and the
        loop continues.
        """
        while True:
            message = self.observer_queue.get()

            try:
                self.logger.debug("Observer queue got %r", message)

                # Global events

                if isinstance(message, DefinitionErrors):
                    # Rebuild the error map from scratch on every report.
                    self.definition_errors = {}
                    for error in message.errors:
                        if isinstance(error,
                                      PeriodicSnapshotTaskDefinitionError):
                            self.definition_errors[
                                f"periodic_snapshot_{error.task_id}"] = {
                                    "state": "ERROR",
                                    "datetime": datetime.utcnow(),
                                    "error": make_sentence(str(error)),
                                }
                        if isinstance(error, ReplicationTaskDefinitionError):
                            self.definition_errors[
                                f"replication_{error.task_id}"] = {
                                    "state": "ERROR",
                                    "datetime": datetime.utcnow(),
                                    "error": make_sentence(str(error)),
                                }

                # Periodic snapshot task

                if isinstance(message, PeriodicSnapshotTaskStart):
                    self.state[f"periodic_snapshot_{message.task_id}"] = {
                        "state": "RUNNING",
                        "datetime": datetime.utcnow(),
                    }

                if isinstance(message, PeriodicSnapshotTaskSuccess):
                    self.state[f"periodic_snapshot_{message.task_id}"] = {
                        "state": "FINISHED",
                        "datetime": datetime.utcnow(),
                    }

                if isinstance(message, PeriodicSnapshotTaskError):
                    self.state[f"periodic_snapshot_{message.task_id}"] = {
                        "state": "ERROR",
                        "datetime": datetime.utcnow(),
                        "error": make_sentence(message.error),
                    }

                # Replication task events

                if isinstance(message, ReplicationTaskScheduled):
                    self.state[f"replication_{message.task_id}"] = {
                        "state": "WAITING",
                        "datetime": datetime.utcnow(),
                    }

                if isinstance(message, ReplicationTaskStart):
                    self.state[f"replication_{message.task_id}"] = {
                        "state": "RUNNING",
                        "datetime": datetime.utcnow(),
                    }

                    # Start fake job if none are already running
                    if not self.replication_jobs_channels[message.task_id]:
                        self.middleware.call_sync("replication.run",
                                                  int(message.task_id[5:]),
                                                  False)

                if isinstance(message, ReplicationTaskLog):
                    for channel in self.replication_jobs_channels[
                            message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskSnapshotProgress):
                    self.state[f"replication_{message.task_id}"] = {
                        "state": "RUNNING",
                        "datetime": datetime.utcnow(),
                        "progress": {
                            "dataset": message.dataset,
                            "snapshot": message.snapshot,
                            "current": message.current,
                            "total": message.total,
                        }
                    }

                    for channel in self.replication_jobs_channels[
                            message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskSnapshotSuccess):
                    last_snapshot = f"{message.dataset}@{message.snapshot}"
                    self.last_snapshot[
                        f"replication_{message.task_id}"] = last_snapshot
                    # task_id looks like "task_<n>"; the numeric id keys the
                    # serializable (persisted) state.
                    self.serializable_state[int(message.task_id.split(
                        "_")[1])]["last_snapshot"] = last_snapshot

                    for channel in self.replication_jobs_channels[
                            message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskSuccess):
                    state = {
                        "state": "FINISHED",
                        "datetime": datetime.utcnow(),
                    }
                    self.state[f"replication_{message.task_id}"] = state
                    self.serializable_state[int(
                        message.task_id.split("_")[1])]["state"] = state

                    for channel in self.replication_jobs_channels[
                            message.task_id]:
                        channel.put(message)

                if isinstance(message, ReplicationTaskError):
                    state = {
                        "state": "ERROR",
                        "datetime": datetime.utcnow(),
                        "error": make_sentence(message.error),
                    }

                    self.state[f"replication_{message.task_id}"] = state
                    self.serializable_state[int(
                        message.task_id.split("_")[1])]["state"] = state

                    for channel in self.replication_jobs_channels[
                            message.task_id]:
                        channel.put(message)

            except Exception:
                # Keep the reader alive: a bad message must not kill the thread.
                self.logger.warning(
                    "Unhandled exception in observer_queue_reader",
                    exc_info=True)