def __start_live_migration(self):
    """Run the migration in live mode.

    The work directory is handed to the FS driver, CPU and CRIU version
    are validated, and a preliminary FS migration is started. While
    pre-dumps are enabled, memory is pre-dumped and the FS is synced in
    rounds until ``__check_live_iter_progress`` says to stop. A final dump
    is then taken, FS and images are synced to the target host, and the
    target is asked to restore from them.

    Any error raised between the final dump and the end of the restore
    request is passed through after ``self.htype.migration_fail`` has been
    called. Errors raised after the restore succeeded are only logged as
    warnings and never propagate.
    """

    self.fs.set_work_dir(self.img.work_dir())
    self.__validate_cpu()
    self.__validate_criu_version()
    pre_dumps_enabled = self.__check_use_pre_dumps()
    root = self.htype.root_task_pid()

    stats = mstats.live_stats()
    stats.handle_start()

    # Preliminary FS migration
    logging.info("Preliminary FS migration")
    fs_stats = self.fs.start_migration()
    stats.handle_preliminary(fs_stats)

    round_no = 0
    last_dump_stats = None

    while pre_dumps_enabled:

        # One pre-dump round, bracketed by start_iter/end_iter on the target
        logging.info("* Iteration %d", round_no)
        self.target_host.start_iter(True)
        self.img.new_image_dir()
        criu_cr.criu_predump(self.htype, root, self.img,
                             self.criu_connection, self.fs)
        self.target_host.end_iter()

        # FS migration round and statistics for this iteration
        fs_stats = self.fs.next_iteration()
        dump_stats = criu_api.criu_get_dstats(self.img)
        stats.handle_iteration(dump_stats, fs_stats)

        # Either go for another round or fall through to the final dump
        keep_iterating = self.__check_live_iter_progress(
            round_no, dump_stats, last_dump_stats)
        if not keep_iterating:
            break

        round_no += 1
        last_dump_stats = dump_stats

    # Final dump on the source; its tasks are left frozen afterwards
    logging.info("Final dump and restore")
    self.target_host.start_iter(self.htype.dump_need_page_server())
    self.img.new_image_dir()
    self.htype.final_dump(root, self.img, self.criu_connection, self.fs)
    self.target_host.end_iter()

    try:
        # Final FS and images sync while the htype is frozen
        logging.info("Final FS and images sync")
        fs_stats = self.fs.stop_migration()
        self.img.sync_imgs_to_target(self.target_host, self.htype,
                                     self.connection.mem_sk)

        # Restore on the target
        logging.info("Asking target host to restore")
        self.target_host.restore_from_images()
    except:
        # Bare except kept as in the original: whatever was raised
        # (interrupts included), migration_fail runs before it is re-raised.
        self.htype.migration_fail(self.fs)
        raise

    # Restored on target: nothing below is allowed to fail the migration
    try:
        # Ack the previous dump request so all frozen tasks get terminated
        ack = self.criu_connection.ack_notify()
        if not ack.success:
            logging.warning("Bad notification from target host")

        dump_stats = criu_api.criu_get_dstats(self.img)
        stats.handle_iteration(dump_stats, fs_stats)

        logging.info("Migration succeeded")
        self.htype.migration_complete(self.fs, self.target_host)
        stats.handle_stop(self)
        self.img.close()
        self.criu_connection.close()
    except Exception as e:
        logging.warning("Exception during final cleanup: %s", e)
def start_migration(self):
    """Migrate the hauled entity to the target host.

    Steps, in order:

    1. Validate the CPU (skipped when ``self.__force`` is set) and start
       the preliminary FS migration.
    2. Resolve ``self.pre_dump``: auto-detect it through
       ``self.pre_dump_check()`` when it equals ``PRE_DUMP_AUTO_DETECT``,
       otherwise keep the command-line choice.
    3. While pre-dump is enabled, run pre-dump iterations until the dump
       is small enough, starts growing, or the iteration limit is hit.
    4. Issue the final dump, serving CRIU notifications until
       "post-dump" arrives.
    5. Sync FS and images, ask the target host to restore, acknowledge
       the pending notification and release resources.

    Raises:
        Exception: if a pre-dump request is not successful
            ("Pre-dump failed"), if the dump answers with a non-NOTIFY
            message ("Dump failed"), or if the final acknowledgement is
            not successful ("Dump screwed up").
    """

    migration_stats = mstats.migration_stats()
    prev_dstats = None
    iter_index = 0

    migration_stats.start()

    if not self.__force:
        self.validate_cpu()

    logging.info("Preliminary FS migration")
    self.fs.set_work_dir(self.img.work_dir())
    self.fs.start_migration()

    logging.info("Checking for Dirty Tracking")
    if self.pre_dump == PRE_DUMP_AUTO_DETECT:
        # pre-dump auto-detection
        try:
            self.pre_dump = self.pre_dump_check()
            if self.pre_dump:
                logging.info("\t`- Auto Enabled")
            else:
                logging.info("\t`- Auto Disabled")
        except Exception:
            # The available criu seems to not support memory tracking
            # auto detection. Only ordinary errors are treated that way;
            # KeyboardInterrupt/SystemExit must propagate rather than
            # silently disable pre-dump.
            self.pre_dump = PRE_DUMP_DISABLE
            logging.info("\t`- Auto detection not possible "
                         "- Disabled")
    elif self.pre_dump == PRE_DUMP_DISABLE:
        logging.info("\t`- Command-line disabled")
    else:
        logging.info("\t`- Command-line enabled")

    if self.pre_dump:
        logging.info("Starting iterations")
    else:
        # No pre-dumps: switch memory tracking off on the CRIU connection
        self.criu_connection.memory_tracking(False)

    while self.pre_dump:
        logging.info("* Iteration %d", iter_index)

        self.target_host.start_iter()
        self.img.new_image_dir()

        logging.info("\tIssuing pre-dump command to service")

        req = criu_req.make_predump_req(
            self.pid, self.htype, self.img,
            self.criu_connection, self.fs)
        resp = self.criu_connection.send_req(req)
        if not resp.success:
            raise Exception("Pre-dump failed")

        logging.info("\tPre-dump succeeded")

        self.target_host.end_iter()

        dstats = criu_api.criu_get_dstats(self.img)
        migration_stats.iteration(dstats)

        #
        # Need to decide whether we do next iteration
        # or stop on the existing and go do full dump
        # and restore
        #

        logging.info("Checking iteration progress:")

        if dstats.pages_written <= phaul_iter_min_size:
            logging.info("\t> Small dump")
            break

        if prev_dstats:
            # Growth of written pages relative to the previous iteration,
            # in percent.
            # NOTE(review): `/` floors for ints on Python 2 but is true
            # division on Python 3 -- confirm the intended interpreter.
            w_add = dstats.pages_written - prev_dstats.pages_written
            w_add = w_add * 100 / prev_dstats.pages_written
            if w_add > phaul_iter_grow_max:
                logging.info("\t> Iteration grows")
                break

        if iter_index >= phaul_iter_max:
            logging.info("\t> Too many iterations")
            break

        iter_index += 1
        prev_dstats = dstats
        logging.info("\t> Proceed to next iteration")

        self.fs.next_iteration()

    #
    # Finish with iterations -- do full dump, send images
    # to target host and restore from them there
    #

    logging.info("Final dump and restore")

    self.target_host.start_iter()
    self.img.new_image_dir()

    logging.info("\tIssuing dump command to service")

    req = criu_req.make_dump_req(
        self.pid, self.htype, self.img,
        self.criu_connection, self.fs)
    resp = self.criu_connection.send_req(req)
    while True:
        if resp.type != pycriu.rpc.NOTIFY:
            raise Exception("Dump failed")

        if resp.notify.script == "post-dump":
            #
            # Dump is effectively over. Now CRIU
            # waits for us to do whatever we want
            # and keeps the tasks frozen.
            #
            break

        elif resp.notify.script == "network-lock":
            self.htype.net_lock()
        elif resp.notify.script == "network-unlock":
            self.htype.net_unlock()

        logging.info("\t\tNotify (%s)", resp.notify.script)
        resp = self.criu_connection.ack_notify()

    logging.info("Dump complete")
    self.target_host.end_iter()

    #
    # Dump is complete -- go to target node,
    # restore them there and kill (if required)
    # tasks on source node
    #

    logging.info("Final FS and images sync")
    self.fs.stop_migration()
    self.img.sync_imgs_to_target(self.target_host, self.htype,
                                 self.connection.mem_sk)

    logging.info("Asking target host to restore")
    self.target_host.restore_from_images()

    #
    # Ack the notify after restore -- CRIU would
    # then terminate all tasks and send us back
    # DUMP/success message
    #

    resp = self.criu_connection.ack_notify()
    if not resp.success:
        raise Exception("Dump screwed up")

    self.htype.umount()

    dstats = criu_api.criu_get_dstats(self.img)
    migration_stats.iteration(dstats)
    migration_stats.stop(self)
    self.img.close()
    self.criu_connection.close()
def start_migration(self):
    """Migrate the hauled entity to the target host.

    Steps, in order:

    1. Validate the CPU and start the preliminary FS migration.
    2. Resolve ``self.pre_dump``: when it equals ``PRE_DUMP_AUTO_DETECT``
       it becomes ``self.pre_dump_check() and self.htype.can_pre_dump()``;
       otherwise the command-line choice is kept.
    3. While pre-dump is enabled, run pre-dump iterations until the dump
       is small enough, starts growing, or the iteration limit is hit.
    4. Let ``self.htype.final_dump`` take the final dump.
    5. Sync FS and images, ask the target host to restore, acknowledge
       the pending notification and release resources.

    Raises:
        Exception: if a pre-dump request is not successful
            ("Pre-dump failed") or if the final acknowledgement is not
            successful ("Dump screwed up").
    """

    migration_stats = mstats.migration_stats()
    prev_dstats = None
    iter_index = 0

    migration_stats.start()

    self.validate_cpu()

    logging.info("Preliminary FS migration")
    self.fs.set_work_dir(self.img.work_dir())
    self.fs.start_migration()

    logging.info("Checking for Dirty Tracking")
    if self.pre_dump == PRE_DUMP_AUTO_DETECT:
        # pre-dump auto-detection
        try:
            self.pre_dump = (self.pre_dump_check() and
                             self.htype.can_pre_dump())
            logging.info("\t`- Auto %s",
                         'enabled' if self.pre_dump else 'disabled')
        except Exception:
            # The available criu seems to not support memory tracking
            # auto detection. Only ordinary errors are treated that way;
            # KeyboardInterrupt/SystemExit must propagate rather than
            # silently disable pre-dump.
            self.pre_dump = PRE_DUMP_DISABLE
            logging.info("\t`- Auto detection not possible "
                         "- Disabled")
    else:
        logging.info("\t`- Command-line %s",
                     'enabled' if self.pre_dump else 'disabled')

    if self.pre_dump:
        logging.info("Starting iterations")
    else:
        # No pre-dumps: switch memory tracking off on the CRIU connection
        self.criu_connection.memory_tracking(False)

    while self.pre_dump:
        logging.info("* Iteration %d", iter_index)

        self.target_host.start_iter(True)
        self.img.new_image_dir()

        logging.info("\tIssuing pre-dump command to service")

        req = criu_req.make_predump_req(
            self.pid, self.img, self.criu_connection, self.fs)
        resp = self.criu_connection.send_req(req)
        if not resp.success:
            raise Exception("Pre-dump failed")

        logging.info("\tPre-dump succeeded")

        self.target_host.end_iter()

        dstats = criu_api.criu_get_dstats(self.img)
        migration_stats.iteration(dstats)

        #
        # Need to decide whether we do next iteration
        # or stop on the existing and go do full dump
        # and restore
        #

        logging.info("Checking iteration progress:")

        if dstats.pages_written <= phaul_iter_min_size:
            logging.info("\t> Small dump")
            break

        if prev_dstats:
            # Growth of written pages relative to the previous iteration,
            # in percent.
            # NOTE(review): `/` floors for ints on Python 2 but is true
            # division on Python 3 -- confirm the intended interpreter.
            w_add = dstats.pages_written - prev_dstats.pages_written
            w_add = w_add * 100 / prev_dstats.pages_written
            if w_add > phaul_iter_grow_max:
                logging.info("\t> Iteration grows")
                break

        if iter_index >= phaul_iter_max:
            logging.info("\t> Too many iterations")
            break

        iter_index += 1
        prev_dstats = dstats
        logging.info("\t> Proceed to next iteration")

        self.fs.next_iteration()

    #
    # Finish with iterations -- do full dump, send images
    # to target host and restore from them there
    #

    logging.info("Final dump and restore")

    # The htype tells whether the target has to be started with the
    # argument set for the final dump (value of dump_need_ps()).
    self.target_host.start_iter(self.htype.dump_need_ps())
    self.img.new_image_dir()

    logging.info("\tIssuing dump command to service")
    self.htype.final_dump(self.pid, self.img,
                          self.criu_connection, self.fs)

    logging.info("Dump complete")
    self.target_host.end_iter()

    #
    # Dump is complete -- go to target node,
    # restore them there and kill (if required)
    # tasks on source node
    #

    logging.info("Final FS and images sync")
    self.fs.stop_migration()
    self.img.sync_imgs_to_target(self.target_host, self.htype,
                                 self.connection.mem_sk)

    logging.info("Asking target host to restore")
    self.target_host.restore_from_images()

    #
    # Ack the notify after restore -- CRIU would
    # then terminate all tasks and send us back
    # DUMP/success message
    #

    resp = self.criu_connection.ack_notify()
    if not resp.success:
        raise Exception("Dump screwed up")

    self.htype.umount()

    dstats = criu_api.criu_get_dstats(self.img)
    migration_stats.iteration(dstats)
    migration_stats.stop(self)
    self.img.close()
    self.criu_connection.close()
def __start_live_migration(self):
    """Start migration in live mode.

    FS and memory are transferred to the target host in iterations for as
    long as pre-dumps are enabled and ``__check_live_iter_progress``
    allows another one. The process tree is then dumped on the source
    host, FS and images are synced, and the target host is asked to
    restore from the images.

    An ``Exception`` raised during the final sync or the restore request
    triggers ``self.htype.migration_fail`` and is re-raised. An
    ``Exception`` raised after the restore is logged as a warning only.
    """

    self.fs.set_work_dir(self.img.work_dir())
    self.__validate_cpu()
    self.__validate_criu_version()
    do_pre_dumps = self.__check_use_pre_dumps()
    pid = self.htype.root_task_pid()

    live_stats = mstats.live_stats()
    live_stats.handle_start()

    # Preliminary FS migration, done once before any memory is dumped
    logging.info("Preliminary FS migration")
    fs_result = self.fs.start_migration()
    live_stats.handle_preliminary(fs_result)

    iteration = 0
    previous = None

    while do_pre_dumps:

        # Pre-dump memory into a fresh image directory
        logging.info("* Iteration %d", iteration)
        self.target_host.start_iter(True)
        self.img.new_image_dir()
        criu_cr.criu_predump(self.htype, pid, self.img,
                             self.criu_connection, self.fs)
        self.target_host.end_iter()

        # Next FS migration step, then record both sets of statistics
        fs_result = self.fs.next_iteration()
        current = criu_api.criu_get_dstats(self.img)
        live_stats.handle_iteration(current, fs_result)

        # Stop iterating once the progress check says so
        if not self.__check_live_iter_progress(iteration, current,
                                               previous):
            break

        previous = current
        iteration += 1

    # Dump htype on source; its tasks stay frozen after this point
    logging.info("Final dump and restore")
    self.target_host.start_iter(self.htype.dump_need_page_server())
    self.img.new_image_dir()
    self.htype.final_dump(pid, self.img, self.criu_connection, self.fs)
    self.target_host.end_iter()

    try:
        # Last FS and image sync, performed on the frozen htype
        logging.info("Final FS and images sync")
        fs_result = self.fs.stop_migration()
        self.img.sync_imgs_to_target(self.target_host, self.htype,
                                     self.connection.mem_sk)

        # Ask the target to restore the htype
        logging.info("Asking target host to restore")
        self.target_host.restore_from_images()
    except Exception:
        # Report the failure to the htype, then let the error propagate
        self.htype.migration_fail(self.fs)
        raise

    # Restored on target, so failures below are logged, never raised
    try:
        # Acknowledge the earlier dump request to terminate frozen tasks
        reply = self.criu_connection.ack_notify()
        if not reply.success:
            logging.warning("Bad notification from target host")

        current = criu_api.criu_get_dstats(self.img)
        live_stats.handle_iteration(current, fs_result)

        logging.info("Migration succeeded")
        self.htype.migration_complete(self.fs, self.target_host)
        live_stats.handle_stop(self)
        self.img.close()
        self.criu_connection.close()
    except Exception as e:
        logging.warning("Exception during final cleanup: %s", e)
def start_migration(self):
    """Migrate the hauled entity to the target host, printing progress.

    The CPU is validated unless ``self.__force`` is set, and the FS
    migration is started. Pre-dump iterations then run (always at least
    one) until the dump is small, grows too much compared to the previous
    one, or the iteration limit is reached. A final dump with notify
    scripts is issued next, and its notifications are served until
    "post-dump". Last, FS and images are synced, the target host is asked
    to restore, the pending notification is acknowledged and resources
    are released.

    Raises:
        Exception: "Pre-dump failed", "Dump failed" or "Dump screwed up"
            when the corresponding CRIU reply is not the expected one.
    """

    self._mstat.start()

    if not self.__force:
        self.validate_cpu()

    # Single-argument print(...) emits the same text as the statement form
    print("Preliminary FS migration")
    self.fs.set_work_dir(self.img.work_dir())
    self.fs.start_migration()

    print("Starting iterations")
    criu = self.criu

    while True:
        print("* Iteration %d" % self.iteration)

        self.th.start_iter()
        self.img.new_image_dir()

        print("\tIssuing pre-dump command to service")

        predump_req = self.make_dump_req(cr_rpc.PRE_DUMP)
        reply = criu.send_req(predump_req)
        if not reply.success:
            raise Exception("Pre-dump failed")

        print("\tPre-dump succeeded")

        self.th.end_iter()

        dstats = criu_api.criu_get_dstats(self.img)
        self._mstat.iteration(dstats)

        # Choose between one more pre-dump round and the final dump

        print("Checking iteration progress:")

        if dstats.pages_written <= phaul_iter_min_size:
            print("\t> Small dump")
            break

        if self.prev_stats:
            # Growth of written pages against the previous round, in percent
            prev_pages = self.prev_stats.pages_written
            growth = (dstats.pages_written - prev_pages) * 100 / prev_pages
            if growth > phaul_iter_grow_max:
                print("\t> Iteration grows")
                break

        if self.iteration >= phaul_iter_max:
            print("\t> Too many iterations")
            break

        self.iteration += 1
        self.prev_stats = dstats
        print("\t> Proceed to next iteration")

        self.fs.next_iteration()

    # Iterations are over: full dump, ship images, restore on the target

    print("Final dump and restore")

    self.th.start_iter()
    self.img.new_image_dir()

    print("\tIssuing dump command to service")

    dump_req = self.make_dump_req(cr_rpc.DUMP)
    dump_req.opts.notify_scripts = True
    dump_req.opts.file_locks = True
    dump_req.opts.evasive_devices = True
    dump_req.opts.link_remap = True
    if self.htype.can_migrate_tcp():
        dump_req.opts.tcp_established = True

    reply = criu.send_req(dump_req)
    while True:
        if reply.type != cr_rpc.NOTIFY:
            raise Exception("Dump failed")

        script = reply.notify.script
        if script == "post-dump":
            # The dump is effectively done; CRIU keeps the tasks frozen
            # and waits for our acknowledgement.
            break

        if script == "network-lock":
            self.htype.net_lock()
        elif script == "network-unlock":
            self.htype.net_unlock()

        print("\t\tNotify (%s)" % script)
        reply = criu.ack_notify()

    print("Dump complete")
    self.th.end_iter()

    # Move over to the target: sync, restore, then finish on the source

    print("Final FS and images sync")
    self.fs.stop_migration()
    self.img.sync_imgs_to_target(self.th, self.htype, self.data_sk)

    print("Asking target host to restore")
    self.th.restore_from_images()

    # Acking the notify after the restore lets CRIU terminate the tasks
    # and answer with the DUMP/success message.

    reply = criu.ack_notify()
    if not reply.success:
        raise Exception("Dump screwed up")

    self.htype.umount()

    dstats = criu_api.criu_get_dstats(self.img)
    self._mstat.iteration(dstats)
    self._mstat.stop(self)
    self.img.close()
    criu.close()
def start_migration(self):
    """Drive the whole migration to the target host, printing progress.

    Sequence: CPU validation (skipped when ``self.__force`` is set),
    preliminary FS migration, one or more pre-dump iterations, the final
    dump with notification handling, final FS/image sync, restore request
    to the target host, and cleanup.

    The pre-dump loop ends when the number of written pages is at most
    ``phaul_iter_min_size``, when it grew by more than
    ``phaul_iter_grow_max`` percent over the previous iteration, or when
    ``self.iteration`` reached ``phaul_iter_max``.

    Raises:
        Exception: "Pre-dump failed", "Dump failed" or "Dump screwed up"
            when the corresponding CRIU reply is not the expected one.
    """

    self._mstat.start()

    if not self.__force:
        self.validate_cpu()

    # print(...) with one argument outputs exactly what the statement did
    print("Preliminary FS migration")
    self.fs.set_work_dir(self.img.work_dir())
    self.fs.start_migration()

    print("Starting iterations")
    service = self.criu

    while True:
        print("* Iteration %d" % self.iteration)

        self.th.start_iter()
        self.img.new_image_dir()

        print("\tIssuing pre-dump command to service")

        request = self.make_dump_req(cr_rpc.PRE_DUMP)
        answer = service.send_req(request)
        if not answer.success:
            raise Exception("Pre-dump failed")

        print("\tPre-dump succeeded")

        self.th.end_iter()

        current = criu_api.criu_get_dstats(self.img)
        self._mstat.iteration(current)

        # Decide: another iteration, or stop here and do the full dump

        print("Checking iteration progress:")

        if current.pages_written <= phaul_iter_min_size:
            print("\t> Small dump")
            break

        if self.prev_stats:
            # Percentage by which written pages grew since the last round
            before = self.prev_stats.pages_written
            delta_pct = (current.pages_written - before) * 100 / before
            if delta_pct > phaul_iter_grow_max:
                print("\t> Iteration grows")
                break

        if self.iteration >= phaul_iter_max:
            print("\t> Too many iterations")
            break

        self.iteration += 1
        self.prev_stats = current
        print("\t> Proceed to next iteration")

        self.fs.next_iteration()

    # Done iterating: take the full dump and restore from it remotely

    print("Final dump and restore")

    self.th.start_iter()
    self.img.new_image_dir()

    print("\tIssuing dump command to service")

    request = self.make_dump_req(cr_rpc.DUMP)
    opts = request.opts
    opts.notify_scripts = True
    opts.file_locks = True
    opts.evasive_devices = True
    opts.link_remap = True
    if self.htype.can_migrate_tcp():
        opts.tcp_established = True

    answer = service.send_req(request)
    while True:
        if answer.type != cr_rpc.NOTIFY:
            raise Exception("Dump failed")

        hook = answer.notify.script
        if hook == "post-dump":
            # Dump is effectively over; CRIU holds the tasks frozen until
            # we acknowledge.
            break

        if hook == "network-lock":
            self.htype.net_lock()
        elif hook == "network-unlock":
            self.htype.net_unlock()

        print("\t\tNotify (%s)" % hook)
        answer = service.ack_notify()

    print("Dump complete")
    self.th.end_iter()

    # Dump complete: sync to the target node and restore there

    print("Final FS and images sync")
    self.fs.stop_migration()
    self.img.sync_imgs_to_target(self.th, self.htype, self.data_sk)

    print("Asking target host to restore")
    self.th.restore_from_images()

    # The ack after restore makes CRIU terminate the tasks and reply with
    # the DUMP/success message.

    answer = service.ack_notify()
    if not answer.success:
        raise Exception("Dump screwed up")

    self.htype.umount()

    current = criu_api.criu_get_dstats(self.img)
    self._mstat.iteration(current)
    self._mstat.stop(self)
    self.img.close()
    service.close()