def update_tasks(self):
    try:
        definition, hold_tasks = self.middleware.call_sync("zettarepl.get_definition")
    except Exception as e:
        self.logger.error("Error generating zettarepl definition", exc_info=True)
        self.middleware.call_sync("zettarepl.set_error", {
            "state": "ERROR",
            "datetime": datetime.utcnow(),
            "error": make_sentence(str(e)),
        })
        return
    else:
        self.middleware.call_sync("zettarepl.set_error", None)

    if self._is_empty_definition(definition):
        self.middleware.call_sync("zettarepl.stop")
    else:
        self.middleware.call_sync("zettarepl.start")
        self.queue.put(("tasks", definition))

    self.middleware.call_sync("zettarepl.notify_definition", definition, hold_tasks)

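# A minimal sketch of _is_empty_definition, which is referenced above but not
# defined in this section. It presumably just checks whether the generated
# definition carries any tasks at all; the real implementation may differ.
def _is_empty_definition(self, definition):
    return not definition["periodic-snapshot-tasks"] and not definition["replication-tasks"]
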
def start(self):
    try:
        definition, hold_tasks = self.middleware.call_sync("zettarepl.get_definition")
    except Exception as e:
        self.logger.error("Error generating zettarepl definition", exc_info=True)
        self.middleware.call_sync("zettarepl.set_error", {
            "state": "ERROR",
            "datetime": datetime.utcnow(),
            "error": make_sentence(str(e)),
        })
        raise CallError(f"Internal error: {e!r}")
    else:
        self.middleware.call_sync("zettarepl.set_error", None)

    with self.lock:
        if not self.is_running():
            self.queue = multiprocessing.Queue()
            self.process = multiprocessing.Process(
                name="zettarepl",
                target=ZettareplProcess(definition, self.middleware.debug_level, self.middleware.log_handler,
                                        self.queue, self.observer_queue),
            )
            self.process.start()
            start_daemon_thread(target=self._join, args=(self.process,))

            if self.observer_queue_reader is None:
                self.observer_queue_reader = start_daemon_thread(target=self._observer_queue_reader)

            self.middleware.call_sync("zettarepl.notify_definition", definition, hold_tasks)

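# Hedged sketches of the two helpers start() relies on but which are not shown
# in this section. Both bodies are assumptions, merely consistent with the
# multiprocessing usage above.
def is_running(self):
    return self.process is not None and self.process.is_alive()

def _join(self, process):
    # Reap the zettarepl process and surface an abnormal exit in the log.
    process.join()
    if process.exitcode != 0:
        self.logger.warning("zettarepl process exited with code %r", process.exitcode)
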
def run_replication_task(self, id, really_run, job):
    if really_run:
        try:
            self.queue.put(("run_task", ("ReplicationTask", f"task_{id}")))
        except Exception:
            raise CallError("Replication service is not running")

    channels = self.replication_jobs_channels[f"task_{id}"]
    channel = queue.Queue()
    channels.append(channel)
    try:
        while True:
            message = channel.get()

            if isinstance(message, ReplicationTaskLog):
                job.logs_fd.write(message.log.encode("utf8", "ignore") + b"\n")

            if isinstance(message, ReplicationTaskSuccess):
                return

            if isinstance(message, ReplicationTaskError):
                raise CallError(make_sentence(message.error))
    finally:
        channels.remove(channel)

def run_replication_task(self, id, really_run, job):
    if really_run:
        try:
            self.queue.put(("run_task", ("ReplicationTask", f"task_{id}")))
        except Exception:
            raise CallError("Replication service is not running")

    channels = self.replication_jobs_channels[f"task_{id}"]
    channel = queue.Queue()
    channels.append(channel)
    try:
        while True:
            message = channel.get()

            if isinstance(message, ReplicationTaskLog):
                job.logs_fd.write(message.log.encode("utf8", "ignore") + b"\n")

            if isinstance(message, ReplicationTaskSnapshotProgress):
                job.set_progress(
                    100 * message.current / message.total,
                    f"Sending {message.dataset}@{message.snapshot} "
                    f"({humanfriendly.format_size(message.current)} / {humanfriendly.format_size(message.total)})",
                )

            if isinstance(message, ReplicationTaskSuccess):
                return

            if isinstance(message, ReplicationTaskError):
                raise CallError(make_sentence(message.error))
    finally:
        channels.remove(channel)

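# The per-task channel registry used above is never defined in this section.
# Below is a minimal standalone sketch of the fan-out pattern it implies: the
# observer thread puts each message onto every queue registered for a task,
# while each job drains its own queue. Using collections.defaultdict (and the
# follow/publish helper names) is an assumption, consistent with how
# replication_jobs_channels is indexed and appended to above.
import queue
from collections import defaultdict

replication_jobs_channels = defaultdict(list)

def follow(task_id):
    # Register a fresh queue so this job sees all subsequent task events.
    channel = queue.Queue()
    replication_jobs_channels[task_id].append(channel)
    return channel

def publish(task_id, message):
    # Fan a single observer message out to every job following the task.
    for channel in replication_jobs_channels[task_id]:
        channel.put(message)
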
async def get_definition(self):
    config = await self.middleware.call("replication.config.config")
    timezone = (await self.middleware.call("system.general.config"))["timezone"]
    pools = {pool["name"]: pool for pool in await self.middleware.call("pool.query")}

    hold_tasks = {}

    periodic_snapshot_tasks = {}
    for periodic_snapshot_task in await self.middleware.call("pool.snapshottask.query", [["enabled", "=", True]]):
        hold_task_reason = self._hold_task_reason(pools, periodic_snapshot_task["dataset"])
        if hold_task_reason:
            hold_tasks[f"periodic_snapshot_task_{periodic_snapshot_task['id']}"] = hold_task_reason
            continue

        periodic_snapshot_tasks[f"task_{periodic_snapshot_task['id']}"] = self.periodic_snapshot_task_definition(
            periodic_snapshot_task,
        )

    replication_tasks = {}
    for replication_task in await self.middleware.call("replication.query", [["enabled", "=", True]]):
        try:
            replication_tasks[f"task_{replication_task['id']}"] = await self._replication_task_definition(
                pools, replication_task,
            )
        except HoldReplicationTaskException as e:
            hold_tasks[f"replication_task_{replication_task['id']}"] = e.reason

    for job_id, replication_task in self.onetime_replication_tasks.items():
        try:
            replication_tasks[f"job_{job_id}"] = await self._replication_task_definition(pools, replication_task)
        except HoldReplicationTaskException as e:
            hold_tasks[f"job_{job_id}"] = e.reason

    definition = {
        "max-parallel-replication-tasks": config["max_parallel_replication_tasks"],
        "timezone": timezone,
        "use-removal-dates": True,
        "periodic-snapshot-tasks": periodic_snapshot_tasks,
        "replication-tasks": replication_tasks,
    }

    # Test that parsing the definition does not raise exceptions
    Definition.from_data(definition, raise_on_error=False)

    hold_tasks = {
        task_id: {
            "state": "HOLD",
            "datetime": datetime.utcnow(),
            "reason": make_sentence(reason),
        }
        for task_id, reason in hold_tasks.items()
    }

    return definition, hold_tasks

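# A minimal sketch of _hold_task_reason, which both get_definition variants
# call but which is not shown here. It presumably maps a dataset to its pool
# via the pools dict built above and returns a human-readable reason to hold
# the task, or None. The concrete checks are assumptions; the real
# implementation may consider other pool states as well.
def _hold_task_reason(self, pools, dataset):
    pool_name = dataset.split("/")[0]
    pool = pools.get(pool_name)
    if pool is None:
        return f"Pool {pool_name} does not exist"
    if pool.get("status") == "OFFLINE":
        return f"Pool {pool_name} is offline"
    return None
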
def _run_replication_task_job(self, id, job):
    channels = self.replication_jobs_channels[id]
    channel = queue.Queue()
    channels.append(channel)
    snapshot_start_message = None
    snapshot_progress_message = None
    data_progress_message = None
    try:
        while True:
            message = channel.get()

            if isinstance(message, ReplicationTaskLog):
                job.logs_fd.write(message.log.encode("utf8", "ignore") + b"\n")

            if isinstance(message, ReplicationTaskSnapshotStart):
                snapshot_start_message = message
                snapshot_progress_message = None
                self._set_replication_task_progress(job, snapshot_start_message, snapshot_progress_message,
                                                    data_progress_message)

            if isinstance(message, ReplicationTaskSnapshotProgress):
                snapshot_progress_message = message
                self._set_replication_task_progress(job, snapshot_start_message, snapshot_progress_message,
                                                    data_progress_message)

            if isinstance(message, ReplicationTaskDataProgress):
                data_progress_message = message
                self._set_replication_task_progress(job, snapshot_start_message, snapshot_progress_message,
                                                    data_progress_message)

            if isinstance(message, ReplicationTaskSuccess):
                return

            if isinstance(message, ReplicationTaskError):
                raise CallError(make_sentence(message.error))
    finally:
        channels.remove(channel)

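# A minimal sketch of _set_replication_task_progress, assuming it folds the
# most recent snapshot start/progress and data-progress messages into a single
# job progress update. The percentage math and wording are assumptions; the
# field names follow the messages handled in the observer reader below, and
# humanfriendly is already used for the same purpose in run_replication_task.
def _set_replication_task_progress(self, job, start, progress, data):
    if start is None:
        return

    sent = progress.bytes_sent if progress is not None else 0
    total = progress.bytes_total if progress is not None else 0
    description = f"Sending {start.dataset}@{start.snapshot}"
    if data is not None:
        description += f" (destination size {humanfriendly.format_size(data.dst_size)})"

    job.set_progress(100 * sent / total if total else 0, description)
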
def _observer_queue_reader(self):
    while True:
        message = self.observer_queue.get()

        try:
            self.logger.trace("Observer queue got %r", message)

            # Global events

            if isinstance(message, DefinitionErrors):
                definition_errors = {}
                for error in message.errors:
                    if isinstance(error, PeriodicSnapshotTaskDefinitionError):
                        definition_errors[f"periodic_snapshot_{error.task_id}"] = {
                            "state": "ERROR",
                            "datetime": datetime.utcnow(),
                            "error": make_sentence(str(error)),
                        }

                    if isinstance(error, ReplicationTaskDefinitionError):
                        definition_errors[f"replication_{error.task_id}"] = {
                            "state": "ERROR",
                            "datetime": datetime.utcnow(),
                            "error": make_sentence(str(error)),
                        }

                self.middleware.call_sync("zettarepl.set_definition_errors", definition_errors)

            # Periodic snapshot task events

            if isinstance(message, PeriodicSnapshotTaskStart):
                self.middleware.call_sync("zettarepl.set_state", f"periodic_snapshot_{message.task_id}", {
                    "state": "RUNNING",
                    "datetime": datetime.utcnow(),
                })

            if isinstance(message, PeriodicSnapshotTaskSuccess):
                self.middleware.call_sync("zettarepl.set_state", f"periodic_snapshot_{message.task_id}", {
                    "state": "FINISHED",
                    "datetime": datetime.utcnow(),
                })

            if isinstance(message, PeriodicSnapshotTaskError):
                self.middleware.call_sync("zettarepl.set_state", f"periodic_snapshot_{message.task_id}", {
                    "state": "ERROR",
                    "datetime": datetime.utcnow(),
                    "error": make_sentence(message.error),
                })

            # Replication task events

            if isinstance(message, ReplicationTaskScheduled):
                if (
                    (self.middleware.call_sync(
                        "zettarepl.get_state_internal", f"replication_{message.task_id}"
                    ) or {}).get("state") != "RUNNING"
                ):
                    self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                        "state": "WAITING",
                        "datetime": datetime.utcnow(),
                    })

            if isinstance(message, ReplicationTaskStart):
                self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                    "state": "RUNNING",
                    "datetime": datetime.utcnow(),
                })
                # Start a fake job if none are already running
                if not self.replication_jobs_channels[message.task_id]:
                    self.middleware.call_sync("replication.run", int(message.task_id[5:]), False)

            if isinstance(message, ReplicationTaskLog):
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskSnapshotStart):
                self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                    "state": "RUNNING",
                    "datetime": datetime.utcnow(),
                    "progress": {
                        "dataset": message.dataset,
                        "snapshot": message.snapshot,
                        "snapshots_sent": message.snapshots_sent,
                        "snapshots_total": message.snapshots_total,
                        "bytes_sent": 0,
                        "bytes_total": 0,
                        # legacy
                        "current": 0,
                        "total": 0,
                    },
                })
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskSnapshotProgress):
                self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                    "state": "RUNNING",
                    "datetime": datetime.utcnow(),
                    "progress": {
                        "dataset": message.dataset,
                        "snapshot": message.snapshot,
                        "snapshots_sent": message.snapshots_sent,
                        "snapshots_total": message.snapshots_total,
                        "bytes_sent": message.bytes_sent,
                        "bytes_total": message.bytes_total,
                        # legacy
                        "current": message.bytes_sent,
                        "total": message.bytes_total,
                    },
                })
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskSnapshotSuccess):
                self.middleware.call_sync("zettarepl.set_last_snapshot", f"replication_{message.task_id}",
                                          f"{message.dataset}@{message.snapshot}")
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskDataProgress):
                task_id = f"replication_{message.task_id}"
                try:
                    state = self.middleware.call_sync("zettarepl.get_internal_task_state", task_id)
                except KeyError:
                    pass
                else:
                    if state["state"] == "RUNNING" and "progress" in state:
                        state["progress"].update({
                            "root_dataset": message.dataset,
                            "src_size": message.src_size,
                            "dst_size": message.dst_size,
                        })
                        self.middleware.call_sync("zettarepl.set_state", task_id, state)

                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskSuccess):
                self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                    "state": "FINISHED",
                    "datetime": datetime.utcnow(),
                    "warnings": message.warnings,
                })
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskError):
                self.middleware.call_sync("zettarepl.set_state", f"replication_{message.task_id}", {
                    "state": "ERROR",
                    "datetime": datetime.utcnow(),
                    "error": make_sentence(message.error),
                })
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)
        except Exception:
            self.logger.warning("Unhandled exception in observer_queue_reader", exc_info=True)

async def get_definition(self):
    timezone = (await self.middleware.call("system.general.config"))["timezone"]
    pools = {pool["name"]: pool for pool in await self.middleware.call("pool.query")}

    hold_tasks = {}

    periodic_snapshot_tasks = {}
    for periodic_snapshot_task in await self.middleware.call("pool.snapshottask.query",
                                                             [["enabled", "=", True]]):
        hold_task_reason = self._hold_task_reason(pools, periodic_snapshot_task["dataset"])
        if hold_task_reason:
            hold_tasks[f"periodic_snapshot_task_{periodic_snapshot_task['id']}"] = hold_task_reason
            continue

        periodic_snapshot_tasks[f"task_{periodic_snapshot_task['id']}"] = {
            "dataset": periodic_snapshot_task["dataset"],
            "recursive": periodic_snapshot_task["recursive"],
            "exclude": periodic_snapshot_task["exclude"],
            "lifetime": lifetime_iso8601(periodic_snapshot_task["lifetime_value"],
                                         periodic_snapshot_task["lifetime_unit"]),
            "naming-schema": periodic_snapshot_task["naming_schema"],
            "schedule": zettarepl_schedule(periodic_snapshot_task["schedule"]),
            "allow-empty": periodic_snapshot_task["allow_empty"],
        }

    replication_tasks = {}
    for replication_task in await self.middleware.call("replication.query", [["enabled", "=", True]]):
        if replication_task["direction"] == "PUSH":
            hold = False
            for source_dataset in replication_task["source_datasets"]:
                hold_task_reason = self._hold_task_reason(pools, source_dataset)
                if hold_task_reason:
                    hold_tasks[f"replication_task_{replication_task['id']}"] = hold_task_reason
                    hold = True
                    break
            if hold:
                continue

        if replication_task["direction"] == "PULL":
            hold_task_reason = self._hold_task_reason(pools, replication_task["target_dataset"])
            if hold_task_reason:
                hold_tasks[f"replication_task_{replication_task['id']}"] = hold_task_reason
                continue

        if replication_task["transport"] != "LOCAL":
            if not await self.middleware.call("network.general.can_perform_activity", "replication"):
                hold_tasks[f"replication_task_{replication_task['id']}"] = (
                    "Replication network activity is disabled"
                )
                continue

        try:
            transport = await self._define_transport(
                replication_task["transport"],
                (replication_task["ssh_credentials"] or {}).get("id"),
                replication_task["netcat_active_side"],
                replication_task["netcat_active_side_listen_address"],
                replication_task["netcat_active_side_port_min"],
                replication_task["netcat_active_side_port_max"],
                replication_task["netcat_passive_side_connect_address"],
            )
        except CallError as e:
            hold_tasks[f"replication_task_{replication_task['id']}"] = e.errmsg
            continue

        my_periodic_snapshot_tasks = [
            f"task_{periodic_snapshot_task['id']}"
            for periodic_snapshot_task in replication_task["periodic_snapshot_tasks"]
        ]
        my_schedule = replication_task["schedule"]

        definition = {
            "direction": replication_task["direction"].lower(),
            "transport": transport,
            "source-dataset": replication_task["source_datasets"],
            "target-dataset": replication_task["target_dataset"],
            "recursive": replication_task["recursive"],
            "exclude": replication_task_exclude(replication_task),
            "properties": replication_task["properties"],
            "properties-exclude": replication_task["properties_exclude"],
            "properties-override": replication_task["properties_override"],
            "replicate": replication_task["replicate"],
            "periodic-snapshot-tasks": my_periodic_snapshot_tasks,
            "auto": replication_task["auto"],
            "only-matching-schedule": replication_task["only_matching_schedule"],
            "allow-from-scratch": replication_task["allow_from_scratch"],
            "readonly": replication_task["readonly"].lower(),
            "hold-pending-snapshots": replication_task["hold_pending_snapshots"],
            "retention-policy": replication_task["retention_policy"].lower(),
            "large-block": replication_task["large_block"],
            "embed": replication_task["embed"],
            "compressed": replication_task["compressed"],
            "retries": replication_task["retries"],
            "logging-level": (replication_task["logging_level"] or "NOTSET").lower(),
        }

        if replication_task["encryption"]:
            definition["encryption"] = {
                "key": replication_task["encryption_key"],
                "key-format": replication_task["encryption_key_format"],
                "key-location": replication_task["encryption_key_location"],
            }
        if replication_task["naming_schema"]:
            definition["naming-schema"] = replication_task["naming_schema"]
        if replication_task["also_include_naming_schema"]:
            definition["also-include-naming-schema"] = replication_task["also_include_naming_schema"]
        if my_schedule is not None:
            definition["schedule"] = zettarepl_schedule(my_schedule)
        if replication_task["restrict_schedule"] is not None:
            definition["restrict-schedule"] = zettarepl_schedule(replication_task["restrict_schedule"])
        if replication_task["lifetime_value"] is not None and replication_task["lifetime_unit"] is not None:
            definition["lifetime"] = lifetime_iso8601(replication_task["lifetime_value"],
                                                      replication_task["lifetime_unit"])
        if replication_task["compression"] is not None:
            definition["compression"] = replication_task["compression"].lower()
        if replication_task["speed_limit"] is not None:
            definition["speed-limit"] = replication_task["speed_limit"]

        replication_tasks[f"task_{replication_task['id']}"] = definition

    definition = {
        "timezone": timezone,
        "periodic-snapshot-tasks": periodic_snapshot_tasks,
        "replication-tasks": replication_tasks,
    }

    # Test that parsing the definition does not raise exceptions
    Definition.from_data(definition, raise_on_error=False)

    hold_tasks = {
        task_id: {
            "state": "HOLD",
            "datetime": datetime.utcnow(),
            "reason": make_sentence(reason),
        }
        for task_id, reason in hold_tasks.items()
    }

    return definition, hold_tasks

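# Hedged sketches of the two converters used by both get_definition variants
# but not defined in this section. zettarepl_schedule presumably renames the
# middleware cron fields ("dom", "dow") to the hyphenated keys zettarepl
# expects, and lifetime_iso8601 presumably renders a (value, unit) pair as an
# ISO 8601 duration; the exact key names and unit set are assumptions.
def zettarepl_schedule(schedule):
    schedule = schedule.copy()
    schedule["day-of-month"] = schedule.pop("dom")
    schedule["day-of-week"] = schedule.pop("dow")
    return schedule

def lifetime_iso8601(value, unit):
    # e.g. (2, "HOUR") -> "PT2H"; the unit names are assumptions
    return {
        "HOUR": f"PT{value}H",
        "DAY": f"P{value}D",
        "WEEK": f"P{value}W",
        "MONTH": f"P{value}M",
        "YEAR": f"P{value}Y",
    }[unit]
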
def _observer_queue_reader(self):
    while True:
        message = self.observer_queue.get()

        try:
            self.logger.debug("Observer queue got %r", message)

            # Global events

            if isinstance(message, DefinitionErrors):
                self.definition_errors = {}
                for error in message.errors:
                    if isinstance(error, PeriodicSnapshotTaskDefinitionError):
                        self.definition_errors[f"periodic_snapshot_{error.task_id}"] = {
                            "state": "ERROR",
                            "datetime": datetime.utcnow(),
                            "error": make_sentence(str(error)),
                        }

                    if isinstance(error, ReplicationTaskDefinitionError):
                        self.definition_errors[f"replication_{error.task_id}"] = {
                            "state": "ERROR",
                            "datetime": datetime.utcnow(),
                            "error": make_sentence(str(error)),
                        }

            # Periodic snapshot task events

            if isinstance(message, PeriodicSnapshotTaskStart):
                self.state[f"periodic_snapshot_{message.task_id}"] = {
                    "state": "RUNNING",
                    "datetime": datetime.utcnow(),
                }

            if isinstance(message, PeriodicSnapshotTaskSuccess):
                self.state[f"periodic_snapshot_{message.task_id}"] = {
                    "state": "FINISHED",
                    "datetime": datetime.utcnow(),
                }

            if isinstance(message, PeriodicSnapshotTaskError):
                self.state[f"periodic_snapshot_{message.task_id}"] = {
                    "state": "ERROR",
                    "datetime": datetime.utcnow(),
                    "error": make_sentence(message.error),
                }

            # Replication task events

            if isinstance(message, ReplicationTaskScheduled):
                self.state[f"replication_{message.task_id}"] = {
                    "state": "WAITING",
                    "datetime": datetime.utcnow(),
                }

            if isinstance(message, ReplicationTaskStart):
                self.state[f"replication_{message.task_id}"] = {
                    "state": "RUNNING",
                    "datetime": datetime.utcnow(),
                }
                # Start a fake job if none are already running
                if not self.replication_jobs_channels[message.task_id]:
                    self.middleware.call_sync("replication.run", int(message.task_id[5:]), False)

            if isinstance(message, ReplicationTaskLog):
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskSnapshotProgress):
                self.state[f"replication_{message.task_id}"] = {
                    "state": "RUNNING",
                    "datetime": datetime.utcnow(),
                    "progress": {
                        "dataset": message.dataset,
                        "snapshot": message.snapshot,
                        "current": message.current,
                        "total": message.total,
                    },
                }
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskSnapshotSuccess):
                last_snapshot = f"{message.dataset}@{message.snapshot}"
                self.last_snapshot[f"replication_{message.task_id}"] = last_snapshot
                self.serializable_state[int(message.task_id.split("_")[1])]["last_snapshot"] = last_snapshot
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskSuccess):
                state = {
                    "state": "FINISHED",
                    "datetime": datetime.utcnow(),
                }
                self.state[f"replication_{message.task_id}"] = state
                self.serializable_state[int(message.task_id.split("_")[1])]["state"] = state
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)

            if isinstance(message, ReplicationTaskError):
                state = {
                    "state": "ERROR",
                    "datetime": datetime.utcnow(),
                    "error": make_sentence(message.error),
                }
                self.state[f"replication_{message.task_id}"] = state
                self.serializable_state[int(message.task_id.split("_")[1])]["state"] = state
                for channel in self.replication_jobs_channels[message.task_id]:
                    channel.put(message)
        except Exception:
            self.logger.warning("Unhandled exception in observer_queue_reader", exc_info=True)