def _VerifyDrbdStates(self, node_errors, offline_disk_instance_names):
  """Find instances whose DRBD disks are reported as faulty by their nodes.

  Instances with active disks and at least one DRBD8 disk are grouped by
  node (primary and secondaries). Each such node is then queried through
  the C{drbd_needs_activation} RPC and its payload is treated as a
  collection of faulty disk UUIDs.

  @param node_errors: mapping updated in place; the UUID of a node whose
      RPC failed is mapped to the failure message
  @param offline_disk_instance_names: collection updated in place (through
      C{add}) with the name of every instance owning a faulty disk

  """
  insts_by_node = {}
  for instance in self.instances.values():
    inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
    if (not instance.disks_active or
        not utils.AnyDiskOfType(inst_disks, [constants.DT_DRBD8])):
      continue

    secondaries = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
    for node_uuid in [instance.primary_node] + list(secondaries):
      if node_uuid not in insts_by_node:
        insts_by_node[node_uuid] = []
      insts_by_node[node_uuid].append(instance)

  for (node_uuid, node_insts) in insts_by_node.items():
    disks_and_insts = [(self.cfg.GetInstanceDisks(instance.uuid), instance)
                       for instance in node_insts]
    rpc_res = self.rpc.call_drbd_needs_activation(node_uuid, disks_and_insts)
    failure = rpc_res.fail_msg
    if failure:
      # Remember the failure for this node and move on to the next one
      logging.warning("Error getting DRBD status on node %s: %s",
                      self.cfg.GetNodeName(node_uuid), failure)
      node_errors[node_uuid] = failure
      continue

    faulty_uuids = set(rpc_res.payload)
    # All known instances are checked against the reported UUIDs, not only
    # the ones grouped under this node
    for instance in self.instances.values():
      own_uuids = {disk.uuid
                   for disk in self.cfg.GetInstanceDisks(instance.uuid)}
      if own_uuids & faulty_uuids:
        offline_disk_instance_names.add(instance.name)
def GetRequest(self, cfg):
  """Request a relocation of an instance

  The checks for the completeness of the opcode must have already been
  done.

  @param cfg: configuration object used to look up the instance, its
      disks, its secondary nodes and the node names
  @rtype: dict
  @return: request with the keys C{name}, C{disk_space_total},
      C{required_nodes} and C{relocate_from}
  @raise errors.ProgrammerError: if C{self.inst_uuid} is not a known
      instance
  @raise errors.OpPrereqError: if not all disks are of a mirrored type, or
      if an internally mirrored instance does not have exactly one
      secondary node

  """
  instance = cfg.GetInstanceInfo(self.inst_uuid)
  if instance is None:
    raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                 " IAllocator" % self.inst_uuid)

  # Only look up the disks once the instance is known to exist, so that an
  # unknown UUID is always reported through the ProgrammerError above
  # rather than by whatever the disk lookup does for a missing instance.
  disks = cfg.GetInstanceDisks(self.inst_uuid)

  if not utils.AllDiskOfType(disks, constants.DTS_MIRRORED):
    raise errors.OpPrereqError("Can't relocate non-mirrored instances",
                               errors.ECODE_INVAL)

  secondary_nodes = cfg.GetInstanceSecondaryNodes(instance.uuid)
  if (utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR) and
      len(secondary_nodes) != 1):
    raise errors.OpPrereqError("Instance has not exactly one secondary node",
                               errors.ECODE_STATE)

  disk_sizes = [{constants.IDISK_SIZE: disk.size,
                 constants.IDISK_TYPE: disk.dev_type}
                for disk in disks]
  disk_space = gmi.ComputeDiskSize(disk_sizes)

  return {
    "name": instance.name,
    "disk_space_total": disk_space,
    "required_nodes": 1,
    "relocate_from": cfg.GetNodeNames(self.relocate_from_node_uuids),
    }
def _DeclareLocksForMigration(lu, level):
  """Declares locks for L{TLMigrateInstance}.

  At node level the needed locks depend on the instance's disks: with
  externally mirrored disks either all nodes (no target given) or the
  primary plus the target node are locked, otherwise the instance's own
  nodes are locked. At node-resource level the node locks are copied.

  @type lu: L{LogicalUnit}
  @param lu: logical unit whose C{needed_locks} are filled in
  @param level: Lock level

  """
  if level == locking.LEVEL_NODE:
    assert lu.op.instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)

    instance = lu.cfg.GetInstanceInfo(lu.op.instance_uuid)
    inst_disks = lu.cfg.GetInstanceDisks(instance.uuid)

    if not utils.AnyDiskOfType(inst_disks, constants.DTS_EXT_MIRROR):
      lu._LockInstancesNodes() # pylint: disable=W0212
    elif lu.op.target_node is None:
      # No explicit target: any node may be picked, so lock them all
      lu.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
    else:
      lu.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
                                             lu.op.target_node_uuid]

    declared = lu.needed_locks[locking.LEVEL_NODE]
    assert (declared or
            lu.needed_locks[locking.LEVEL_NODE] is locking.ALL_SET)
    return

  if level == locking.LEVEL_NODE_RES:
    # Copy node locks
    node_locks = lu.needed_locks[locking.LEVEL_NODE]
    lu.needed_locks[locking.LEVEL_NODE_RES] = CopyLockList(node_locks)
def BuildHooksEnv(self):
  """Build hooks env.

  This runs on master, primary and secondary nodes of the instance.

  The failover-specific keys are filled in first and the generic instance
  hook environment is merged on top of them last.

  @rtype: dict
  @return: the hooks environment

  """
  migrater = self._migrater
  instance = migrater.instance
  old_primary_uuid = instance.primary_node
  new_primary_uuid = migrater.target_node_uuid

  env = {
    "IGNORE_CONSISTENCY": self.op.ignore_consistency,
    "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
    "OLD_PRIMARY": self.cfg.GetNodeName(old_primary_uuid),
    "NEW_PRIMARY": self.cfg.GetNodeName(new_primary_uuid),
    "FAILOVER_CLEANUP": self.op.cleanup,
    }

  # Secondary node names are only reported for internally mirrored disks;
  # otherwise both keys are present but empty
  old_secondary = ""
  new_secondary = ""
  inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
  if utils.AnyDiskOfType(inst_disks, constants.DTS_INT_MIRROR):
    secondaries = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
    old_secondary = self.cfg.GetNodeName(secondaries[0])
    new_secondary = self.cfg.GetNodeName(old_primary_uuid)
  env["OLD_SECONDARY"] = old_secondary
  env["NEW_SECONDARY"] = new_secondary

  env.update(BuildInstanceHookEnvByObject(self, instance))

  return env
def BuildHooksEnv(self):
  """Build hooks env.

  This runs on master, primary and secondary nodes of the instance.

  The generic instance hook environment is built first and the
  migration-specific keys are then written on top of it.

  @rtype: dict
  @return: the hooks environment

  """
  migrater = self._migrater
  instance = migrater.instance
  old_primary_uuid = instance.primary_node
  new_primary_uuid = migrater.target_node_uuid

  env = BuildInstanceHookEnvByObject(self, instance)
  migration_env = {
    "MIGRATE_LIVE": migrater.live,
    "MIGRATE_CLEANUP": self.op.cleanup,
    "OLD_PRIMARY": self.cfg.GetNodeName(old_primary_uuid),
    "NEW_PRIMARY": self.cfg.GetNodeName(new_primary_uuid),
    "ALLOW_RUNTIME_CHANGES": self.op.allow_runtime_changes,
    }
  env.update(migration_env)

  # Secondary node names are only reported for internally mirrored disks;
  # otherwise both keys are present but empty
  old_secondary = ""
  new_secondary = ""
  inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
  if utils.AnyDiskOfType(inst_disks, constants.DTS_INT_MIRROR):
    secondaries = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
    old_secondary = self.cfg.GetNodeName(secondaries[0])
    new_secondary = self.cfg.GetNodeName(old_primary_uuid)
  env["OLD_SECONDARY"] = old_secondary
  env["NEW_SECONDARY"] = new_secondary

  return env
def CheckAssignmentForSplitInstances(self, changes, node_data, instance_data):
  """Check for split instances after a node assignment.

  The whole series of node assignments is treated as one atomic operation.
  The result tells which instances become split (their nodes end up in
  more than one group) because of it, and which instances were split
  before and stay split afterwards.

  Only instances having at least one disk whose type is listed in
  constants.DTS_INT_MIRROR are considered.

  @type changes: list of (node_uuid, new_group_uuid) pairs.
  @param changes: list of node assignments to consider.
  @param node_data: a dict with data for all nodes
  @param instance_data: a dict with all instances to consider
  @rtype: a two-tuple
  @return: a list of instances that were previously okay and result split
    as a consequence of this change, and a list of instances that were
    previously split and this change does not fix.

  """
  # Assignments that keep a node in its current group are no-ops
  regrouped = {node_uuid: new_group
               for (node_uuid, new_group) in changes
               if node_data[node_uuid].group != new_group}

  split_before = set()
  split_after = set()

  for inst in instance_data.values():
    inst_disks = self.cfg.GetInstanceDisks(inst.uuid)
    if not utils.AnyDiskOfType(inst_disks, constants.DTS_INT_MIRROR):
      continue

    inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)

    groups_before = {node_data[node_uuid].group for node_uuid in inst_nodes}
    if len(groups_before) > 1:
      split_before.add(inst.uuid)

    groups_after = {regrouped.get(node_uuid, node_data[node_uuid].group)
                    for node_uuid in inst_nodes}
    if len(groups_after) > 1:
      split_after.add(inst.uuid)

  newly_split = list(split_after - split_before)
  still_split = list(split_before & split_after)
  return (newly_split, still_split)
def _RevertDiskStatus(self):
  """Try to revert the disk status after a failed migration.

  The disks are closed on the target node and any C{rbd}/C{ext} disks are
  unmapped from it. If all instance disks are externally mirrored they are
  then re-opened on the source node; otherwise the drives are taken
  through standalone, reconnect and resync. A failure of that last
  sequence is reported as a warning instead of being raised.

  """
  disks = self.cfg.GetInstanceDisks(self.instance.uuid)

  self._CloseInstanceDisks(self.target_node_uuid)

  unmap_types = (constants.DT_RBD, constants.DT_EXT)

  if utils.AnyDiskOfType(disks, unmap_types):
    # If the instance's disk template is `rbd' or `ext' and there was an
    # unsuccessful migration, unmap the device from the target node.
    unmap_disks = [d for d in disks if d.dev_type in unmap_types]
    # Keep the expanded subset under its own name: "disks" must keep
    # describing *all* instance disks for the mirror-type check below,
    # otherwise an instance mixing rbd/ext with other disk types would be
    # treated as purely externally mirrored.
    expanded_unmap_disks = ExpandCheckDisks(unmap_disks, unmap_disks)
    self.feedback_fn("* unmapping instance's disks %s from %s" %
                     (utils.CommaJoin(d.name for d in unmap_disks),
                      self.cfg.GetNodeName(self.target_node_uuid)))
    for disk in expanded_unmap_disks:
      result = self.rpc.call_blockdev_shutdown(self.target_node_uuid,
                                               (disk, self.instance))
      msg = result.fail_msg
      if msg:
        # Best effort: report the problem and carry on with the next disk
        logging.error("Migration failed and I couldn't unmap the block device"
                      " %s on target node %s: %s", disk.iv_name,
                      self.cfg.GetNodeName(self.target_node_uuid), msg)
        logging.error("You need to unmap the device %s manually on %s",
                      disk.iv_name,
                      self.cfg.GetNodeName(self.target_node_uuid))

  if utils.AllDiskOfType(disks, constants.DTS_EXT_MIRROR):
    self._OpenInstanceDisks(self.source_node_uuid, True)
    return

  try:
    self._GoStandalone()
    self._GoReconnect(False)
    self._WaitUntilSync()
  except errors.OpExecError as err:
    self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
                       " please try to recover the instance manually;"
                       " error '%s'" % str(err))
def testHeterogeneous(self):
  # A list mixing an Rbd and a Drbd disk is expected to match DT_DRBD8
  mixed_disks = [Rbd(), Drbd()]
  wanted_types = [constants.DT_DRBD8]
  self.assertTrue(utils.AnyDiskOfType(mixed_disks, wanted_types))
def testNotRbdDiskless(self):
  # A single Rbd disk is expected not to match DT_DISKLESS
  rbd_only = [Rbd()]
  wanted_types = [constants.DT_DISKLESS]
  self.assertFalse(utils.AnyDiskOfType(rbd_only, wanted_types))
def testNotDiskless(self):
  # An empty disk list is expected not to match DT_DRBD8
  no_disks = []
  wanted_types = [constants.DT_DRBD8]
  self.assertFalse(utils.AnyDiskOfType(no_disks, wanted_types))
def testNotRbd(self):
  # A single Rbd disk is expected not to match DT_DRBD8
  rbd_only = [Rbd()]
  wanted_types = [constants.DT_DRBD8]
  self.assertFalse(utils.AnyDiskOfType(rbd_only, wanted_types))
def testOrRbd(self):
  # A single Rbd disk is expected to match when DT_RBD is one of several
  # requested types
  rbd_only = [Rbd()]
  wanted_types = [constants.DT_RBD, constants.DT_DRBD8]
  self.assertTrue(utils.AnyDiskOfType(rbd_only, wanted_types))
def testOrDrbd(self):
  # A single Drbd disk is expected to match when DT_DRBD8 is one of several
  # requested types
  drbd_only = [Drbd()]
  wanted_types = [constants.DT_DISKLESS, constants.DT_DRBD8]
  self.assertTrue(utils.AnyDiskOfType(drbd_only, wanted_types))
def _ExecMigration(self):
  """Migrate an instance.

  The migrate is done by:
    - change the disks into dual-master mode
    - wait until disks are fully synchronized again
    - migrate the instance
    - change disks on the new secondary node (the old primary) to secondary
    - wait until disks are fully synchronized
    - change disks into single-master mode

  @raise errors.OpExecError: if the hypervisor versions are considered
      unsafe for migration (and the mismatch is not ignored), a disk is not
      consistent on the target node, the target lacks free memory while
      runtime changes are not allowed, or one of the migration RPCs fails

  """
  # Check for hypervisor version mismatch and warn the user.
  hvspecs = [
    (self.instance.hypervisor,
     self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])
    ]
  nodeinfo = self.rpc.call_node_info(
    [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
  for ninfo in nodeinfo.values():
    ninfo.Raise("Unable to retrieve node information from node '%s'" %
                ninfo.node)
  (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
  (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

  if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
      (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
    src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
    dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
    if src_version != dst_version:
      self.feedback_fn(
        "* warning: hypervisor version mismatch between"
        " source (%s) and target (%s) node" % (src_version, dst_version))
      hv = hypervisor.GetHypervisor(self.instance.hypervisor)
      if hv.VersionsSafeForMigration(src_version, dst_version):
        self.feedback_fn(
          " migrating from hypervisor version %s to %s should"
          " be safe" % (src_version, dst_version))
      else:
        self.feedback_fn(
          " migrating from hypervisor version %s to %s is"
          " likely unsupported" % (src_version, dst_version))
        if self.ignore_hvversions:
          self.feedback_fn(
            " continuing anyway (told to ignore version"
            " mismatch)")
        else:
          raise errors.OpExecError(
            "Unsupported migration between hypervisor"
            " versions (%s to %s)" % (src_version, dst_version))

  self.feedback_fn(
    "* checking disk consistency between source and target")
  for (idx, dev) in enumerate(self.cfg.GetInstanceDisks(self.instance.uuid)):
    if not CheckDiskConsistency(self.lu, self.instance, dev,
                                self.target_node_uuid, False):
      raise errors.OpExecError("Disk %s is degraded or not fully"
                               " synchronized on target node,"
                               " aborting migration" % idx)

  if self.current_mem > self.tgt_free_mem:
    if not self.allow_runtime_changes:
      raise errors.OpExecError(
        "Memory ballooning not allowed and not enough"
        " free memory to fit instance %s on target"
        " node %s (have %dMB, need %dMB)" %
        (self.instance.name,
         self.cfg.GetNodeName(self.target_node_uuid),
         self.tgt_free_mem, self.current_mem))
    self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
    rpcres = self.rpc.call_instance_balloon_memory(
      self.instance.primary_node, self.instance, self.tgt_free_mem)
    rpcres.Raise("Cannot modify instance runtime memory")

  # First get the migration information from the remote node
  result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
  msg = result.fail_msg
  if msg:
    log_err = (
      "Failed fetching source migration information from %s: %s" %
      (self.cfg.GetNodeName(self.source_node_uuid), msg))
    logging.error(log_err)
    raise errors.OpExecError(log_err)

  self.migration_info = migration_info = result.payload

  disks = self.cfg.GetInstanceDisks(self.instance.uuid)

  self._CloseInstanceDisks(self.target_node_uuid)

  if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
    # Then switch the disks to master/master mode
    self._GoStandalone()
    self._GoReconnect(True)
    self._WaitUntilSync()

  self._OpenInstanceDisks(self.source_node_uuid, False)
  self._OpenInstanceDisks(self.target_node_uuid, False)

  self.feedback_fn("* preparing %s to accept the instance" %
                   self.cfg.GetNodeName(self.target_node_uuid))
  result = self.rpc.call_accept_instance(
    self.target_node_uuid, self.instance, migration_info,
    self.nodes_ip[self.target_node_uuid])

  msg = result.fail_msg
  if msg:
    logging.error(
      "Instance pre-migration failed, trying to revert"
      " disk status: %s", msg)
    self.feedback_fn("Pre-migration failed, aborting")
    self._AbortMigration()
    self._RevertDiskStatus()
    raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                             (self.instance.name, msg))

  self.feedback_fn("* migrating instance to %s" %
                   self.cfg.GetNodeName(self.target_node_uuid))
  cluster = self.cfg.GetClusterInfo()
  result = self.rpc.call_instance_migrate(
    self.source_node_uuid, cluster.cluster_name, self.instance,
    self.nodes_ip[self.target_node_uuid], self.live)
  msg = result.fail_msg
  if msg:
    logging.error(
      "Instance migration failed, trying to revert"
      " disk status: %s", msg)
    self.feedback_fn("Migration failed, aborting")
    self._AbortMigration()
    self._RevertDiskStatus()
    raise errors.OpExecError("Could not migrate instance %s: %s" %
                             (self.instance.name, msg))

  self.feedback_fn("* starting memory transfer")
  last_feedback = time.time()
  while True:
    result = self.rpc.call_instance_get_migration_status(
      self.source_node_uuid, self.instance)
    msg = result.fail_msg
    ms = result.payload   # MigrationStatus instance
    if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
      # The RPC may succeed while the hypervisor reports a failed
      # migration; there is no RPC error text then, so substitute one
      # before logging rather than logging an empty message.
      if not msg:
        msg = "hypervisor returned failure"
      logging.error(
        "Instance migration failed, trying to revert"
        " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

    if ms.status != constants.HV_MIGRATION_ACTIVE:
      self.feedback_fn("* memory transfer complete")
      break

    # Progress is only reported when both counters are usable; a missing
    # or zero total would otherwise break the percentage computation.
    if (utils.TimeoutExpired(last_feedback,
                             self._MIGRATION_FEEDBACK_INTERVAL) and
        ms.transferred_ram is not None and ms.total_ram):
      mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
      self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
      last_feedback = time.time()

    time.sleep(self._MIGRATION_POLL_INTERVAL)

  result = self.rpc.call_instance_finalize_migration_src(
    self.source_node_uuid, self.instance, True, self.live)
  msg = result.fail_msg
  if msg:
    logging.error(
      "Instance migration succeeded, but finalization failed"
      " on the source node: %s", msg)
    raise errors.OpExecError(
      "Could not finalize instance migration: %s" % msg)

  self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
  # Re-read the instance and its disks after the primary node change
  self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)
  disks = self.cfg.GetInstanceDisks(self.instance_uuid)

  result = self.rpc.call_instance_finalize_migration_dst(
    self.target_node_uuid, self.instance, migration_info, True)
  msg = result.fail_msg
  if msg:
    logging.error(
      "Instance migration succeeded, but finalization failed"
      " on the target node: %s", msg)
    raise errors.OpExecError(
      "Could not finalize instance migration: %s" % msg)

  self._CloseInstanceDisks(self.source_node_uuid)

  if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
    self._WaitUntilSync()
    self._GoStandalone()
    self._GoReconnect(False)
    self._WaitUntilSync()
  elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
    self._OpenInstanceDisks(self.target_node_uuid, True)

  # If the instance's disk template is `rbd' or `ext' and there was a
  # successful migration, unmap the device from the source node.
  unmap_types = (constants.DT_RBD, constants.DT_EXT)

  if utils.AnyDiskOfType(disks, unmap_types):
    unmap_disks = [d for d in disks if d.dev_type in unmap_types]
    disks = ExpandCheckDisks(unmap_disks, unmap_disks)
    self.feedback_fn("* unmapping instance's disks %s from %s" %
                     (utils.CommaJoin(d.name for d in unmap_disks),
                      self.cfg.GetNodeName(self.source_node_uuid)))
    for disk in disks:
      result = self.rpc.call_blockdev_shutdown(
        self.source_node_uuid, (disk, self.instance))
      msg = result.fail_msg
      if msg:
        # Best effort: the migration itself is done, only report the issue
        logging.error(
          "Migration was successful, but couldn't unmap the"
          " block device %s on source node %s: %s", disk.iv_name,
          self.cfg.GetNodeName(self.source_node_uuid), msg)
        logging.error(
          "You need to unmap the device %s manually on %s",
          disk.iv_name, self.cfg.GetNodeName(self.source_node_uuid))

  self.feedback_fn("* done")
def _ExecMigration(self):
  """Migrate an instance.

  The migrate is done by:
    - change the disks into dual-master mode
    - wait until disks are fully synchronized again
    - migrate the instance
    - change disks on the new secondary node (the old primary) to secondary
    - wait until disks are fully synchronized
    - change disks into single-master mode

  @raise errors.OpExecError: if the hypervisor versions are considered
      unsafe for migration (and the mismatch is not ignored), a disk is not
      consistent on the target node, the target lacks free memory while
      runtime changes are not allowed, or one of the migration RPCs fails

  """
  # Check for hypervisor version mismatch and warn the user.
  hvspecs = [(self.instance.hypervisor,
              self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
  nodeinfo = self.rpc.call_node_info(
    [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
  for ninfo in nodeinfo.values():
    ninfo.Raise("Unable to retrieve node information from node '%s'" %
                ninfo.node)
  (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
  (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

  if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
      (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
    src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
    dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
    if src_version != dst_version:
      self.feedback_fn("* warning: hypervisor version mismatch between"
                       " source (%s) and target (%s) node" %
                       (src_version, dst_version))
      hv = hypervisor.GetHypervisorClass(self.instance.hypervisor)
      if hv.VersionsSafeForMigration(src_version, dst_version):
        self.feedback_fn(" migrating from hypervisor version %s to %s should"
                         " be safe" % (src_version, dst_version))
      else:
        self.feedback_fn(" migrating from hypervisor version %s to %s is"
                         " likely unsupported" % (src_version, dst_version))
        if self.ignore_hvversions:
          self.feedback_fn(" continuing anyway (told to ignore version"
                           " mismatch)")
        else:
          raise errors.OpExecError("Unsupported migration between hypervisor"
                                   " versions (%s to %s)" %
                                   (src_version, dst_version))

  self.feedback_fn("* checking disk consistency between source and target")
  for (idx, dev) in enumerate(self.cfg.GetInstanceDisks(self.instance.uuid)):
    if not CheckDiskConsistency(self.lu, self.instance, dev,
                                self.target_node_uuid, False):
      raise errors.OpExecError("Disk %s is degraded or not fully"
                               " synchronized on target node,"
                               " aborting migration" % idx)

  if self.current_mem > self.tgt_free_mem:
    if not self.allow_runtime_changes:
      raise errors.OpExecError("Memory ballooning not allowed and not enough"
                               " free memory to fit instance %s on target"
                               " node %s (have %dMB, need %dMB)" %
                               (self.instance.name,
                                self.cfg.GetNodeName(self.target_node_uuid),
                                self.tgt_free_mem, self.current_mem))
    self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
    rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                   self.instance,
                                                   self.tgt_free_mem)
    rpcres.Raise("Cannot modify instance runtime memory")

  # First get the migration information from the remote node
  result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
  msg = result.fail_msg
  if msg:
    log_err = ("Failed fetching source migration information from %s: %s" %
               (self.cfg.GetNodeName(self.source_node_uuid), msg))
    logging.error(log_err)
    raise errors.OpExecError(log_err)

  self.migration_info = migration_info = result.payload

  disks = self.cfg.GetInstanceDisks(self.instance.uuid)

  self._CloseInstanceDisks(self.target_node_uuid)

  if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
    # Then switch the disks to master/master mode
    self._GoStandalone()
    self._GoReconnect(True)
    self._WaitUntilSync()

  self._OpenInstanceDisks(self.source_node_uuid, False)
  self._OpenInstanceDisks(self.target_node_uuid, False)

  self.feedback_fn("* preparing %s to accept the instance" %
                   self.cfg.GetNodeName(self.target_node_uuid))
  result = self.rpc.call_accept_instance(self.target_node_uuid,
                                         self.instance,
                                         migration_info,
                                         self.nodes_ip[self.target_node_uuid])

  msg = result.fail_msg
  if msg:
    logging.error("Instance pre-migration failed, trying to revert"
                  " disk status: %s", msg)
    self.feedback_fn("Pre-migration failed, aborting")
    self._AbortMigration()
    self._RevertDiskStatus()
    raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                             (self.instance.name, msg))

  self.feedback_fn("* migrating instance to %s" %
                   self.cfg.GetNodeName(self.target_node_uuid))
  cluster = self.cfg.GetClusterInfo()
  result = self.rpc.call_instance_migrate(
    self.source_node_uuid, cluster.cluster_name, self.instance,
    self.nodes_ip[self.target_node_uuid], self.live)
  msg = result.fail_msg
  if msg:
    logging.error("Instance migration failed, trying to revert"
                  " disk status: %s", msg)
    self.feedback_fn("Migration failed, aborting")
    self._AbortMigration()
    self._RevertDiskStatus()
    raise errors.OpExecError("Could not migrate instance %s: %s" %
                             (self.instance.name, msg))

  self.feedback_fn("* starting memory transfer")
  last_feedback = time.time()

  # Instance-level migration capabilities take precedence over the
  # cluster-level KVM setting, which in turn defaults to an empty string
  cluster_migration_caps = \
    cluster.hvparams.get("kvm", {}).get(constants.HV_KVM_MIGRATION_CAPS, "")
  migration_caps = \
    self.instance.hvparams.get(constants.HV_KVM_MIGRATION_CAPS,
                               cluster_migration_caps)
  # migration_caps is a ':' delimited string, so checking
  # if 'postcopy-ram' is a substring also covers using
  # x-postcopy-ram for QEMU 2.5
  postcopy_enabled = "postcopy-ram" in migration_caps
  while True:
    result = self.rpc.call_instance_get_migration_status(
      self.source_node_uuid, self.instance)
    msg = result.fail_msg
    ms = result.payload   # MigrationStatus instance
    if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
      # The RPC may succeed while the hypervisor reports a failed
      # migration; there is no RPC error text then, so substitute one
      # before logging rather than logging an empty message.
      if not msg:
        msg = "hypervisor returned failure"
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

    if (postcopy_enabled
        and ms.status == constants.HV_MIGRATION_ACTIVE
        and int(ms.dirty_sync_count) >= self._POSTCOPY_SYNC_COUNT_THRESHOLD):
      self.feedback_fn("* finishing memory transfer with postcopy")
      # NOTE(review): the result of this RPC is not checked; confirm that
      # ignoring a failure here is intended
      self.rpc.call_instance_start_postcopy(self.source_node_uuid,
                                            self.instance)

    if self.instance.hypervisor == 'kvm':
      migration_active = \
        ms.status in constants.HV_KVM_MIGRATION_ACTIVE_STATUSES
    else:
      migration_active = \
        ms.status == constants.HV_MIGRATION_ACTIVE
    if not migration_active:
      self.feedback_fn("* memory transfer complete")
      break

    # Progress is only reported when both counters are usable; a missing
    # or zero total would otherwise break the percentage computation.
    if (utils.TimeoutExpired(last_feedback,
                             self._MIGRATION_FEEDBACK_INTERVAL) and
        ms.transferred_ram is not None and ms.total_ram):
      mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
      self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
      last_feedback = time.time()

    time.sleep(self._MIGRATION_POLL_INTERVAL)

  # Always call finalize on both source and target, they should compose
  # a single operation, consisting of (potentially) parallel steps, that
  # should be always attempted/retried together (like in _AbortMigration)
  # without setting any expectations in what order they execute.
  result_src = self.rpc.call_instance_finalize_migration_src(
    self.source_node_uuid, self.instance, True, self.live)

  result_dst = self.rpc.call_instance_finalize_migration_dst(
    self.target_node_uuid, self.instance, migration_info, True)

  err_msg = []
  if result_src.fail_msg:
    logging.error("Instance migration succeeded, but finalization failed"
                  " on the source node: %s", result_src.fail_msg)
    err_msg.append(self.cfg.GetNodeName(self.source_node_uuid) + ': '
                   + result_src.fail_msg)

  if result_dst.fail_msg:
    logging.error("Instance migration succeeded, but finalization failed"
                  " on the target node: %s", result_dst.fail_msg)
    err_msg.append(self.cfg.GetNodeName(self.target_node_uuid) + ': '
                   + result_dst.fail_msg)

  if err_msg:
    raise errors.OpExecError(
      "Could not finalize instance migration: %s" % ' '.join(err_msg))

  # Update instance location only after finalize completed. This way, if
  # either finalize fails, the config still stores the old primary location,
  # so we can know which instance to delete if we need to (manually) clean up.
  self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
  self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)

  self._CloseInstanceDisks(self.source_node_uuid)
  disks = self.cfg.GetInstanceDisks(self.instance_uuid)
  if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
    self._WaitUntilSync()
    self._GoStandalone()
    self._GoReconnect(False)
    self._WaitUntilSync()
  elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
    self._OpenInstanceDisks(self.target_node_uuid, True)

  # If the instance's disk template is `rbd' or `ext' and there was a
  # successful migration, unmap the device from the source node.
  unmap_types = (constants.DT_RBD, constants.DT_EXT)

  if utils.AnyDiskOfType(disks, unmap_types):
    unmap_disks = [d for d in disks if d.dev_type in unmap_types]
    disks = ExpandCheckDisks(unmap_disks, unmap_disks)
    self.feedback_fn("* unmapping instance's disks %s from %s" %
                     (utils.CommaJoin(d.name for d in unmap_disks),
                      self.cfg.GetNodeName(self.source_node_uuid)))
    for disk in disks:
      result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                               (disk, self.instance))
      msg = result.fail_msg
      if msg:
        # Best effort: the migration itself is done, only report the issue
        logging.error("Migration was successful, but couldn't unmap the"
                      " block device %s on source node %s: %s",
                      disk.iv_name,
                      self.cfg.GetNodeName(self.source_node_uuid), msg)
        logging.error("You need to unmap the device %s manually on %s",
                      disk.iv_name,
                      self.cfg.GetNodeName(self.source_node_uuid))

  self.feedback_fn("* done")
def _ExecCleanup(self):
  """Try to cleanup after a failed migration.

  The cleanup is done by:
    - check that the instance is running only on one node
    - try 'aborting' migration if it is running on two nodes
    - update the config if needed
    - change disks on its secondary node to secondary
    - wait until disks are fully synchronized
    - disconnect from the network
    - change disks into single-master mode
    - wait again until disks are fully synchronized

  @raise errors.OpExecError: if the migration information cannot be
      fetched, aborting the migration fails, the instance still appears to
      run on both nodes, or it does not appear to run anywhere

  """
  instance_locations = self._FindInstanceLocations(self.instance.name)
  runningon_source = self.source_node_uuid in instance_locations
  runningon_target = self.target_node_uuid in instance_locations

  if runningon_source and runningon_target:
    # If we have an instance on both the source and the destination, we know
    # that instance migration was interrupted in the middle, we can try to
    # do effectively the same as when aborting an interrupted migration.
    self.feedback_fn("Trying to cleanup after failed migration")
    result = self.rpc.call_migration_info(
      self.source_node_uuid, self.instance)
    if result.fail_msg:
      raise errors.OpExecError(
        "Failed fetching source migration information from %s: %s" %
        (self.cfg.GetNodeName(self.source_node_uuid), result.fail_msg))
    self.migration_info = result.payload
    abort_results = self._AbortMigration()

    # Only the results that actually failed carry a message; the fail_msg
    # of a successful one is falsy (possibly None) and must not be passed
    # to join().
    abort_errors = [res.fail_msg
                    for res in (abort_results[0], abort_results[1])
                    if res.fail_msg]
    if abort_errors:
      raise errors.OpExecError(
        "Instance migration cleanup failed: %s" % ','.join(abort_errors))

    # AbortMigration() should have fixed instance locations, so query again
    instance_locations = self._FindInstanceLocations(self.instance.name)
    runningon_source = self.source_node_uuid in instance_locations
    runningon_target = self.target_node_uuid in instance_locations

  # Abort didn't work, manual intervention required
  if runningon_source and runningon_target:
    raise errors.OpExecError("Instance seems to be running on two nodes,"
                             " or the hypervisor is confused; you will have"
                             " to ensure manually that it runs only on one"
                             " and restart this operation")

  if not (runningon_source or runningon_target):
    if len(instance_locations) == 1:
      # The instance is running on a different node than expected, let's
      # adopt it as if it was running on the secondary
      self.target_node_uuid = instance_locations[0]
      self.feedback_fn("* instance running on unexpected node (%s),"
                       " updating as the new secondary" %
                       self.cfg.GetNodeName(self.target_node_uuid))
      runningon_target = True
    else:
      raise errors.OpExecError("Instance does not seem to be running at all;"
                               " in this case it's safer to repair by"
                               " running 'gnt-instance stop' to ensure disk"
                               " shutdown, and then restarting it")

  if runningon_target:
    # the migration has actually succeeded, we need to update the config
    self.feedback_fn("* instance running on secondary node (%s),"
                     " updating config" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    self.cfg.SetInstancePrimaryNode(self.instance.uuid,
                                    self.target_node_uuid)
    demoted_node_uuid = self.source_node_uuid
  else:
    self.feedback_fn("* instance confirmed to be running on its"
                     " primary node (%s)" %
                     self.cfg.GetNodeName(self.source_node_uuid))
    demoted_node_uuid = self.target_node_uuid

  disks = self.cfg.GetInstanceDisks(self.instance.uuid)

  # TODO: Cleanup code duplication of _RevertDiskStatus()
  self._CloseInstanceDisks(demoted_node_uuid)

  if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
    try:
      self._WaitUntilSync()
    except errors.OpExecError:
      # we ignore here errors, since if the device is standalone, it
      # won't be able to sync
      pass
    self._GoStandalone()
    self._GoReconnect(False)
    self._WaitUntilSync()
  elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
    # NOTE(review): self.instance is not re-read after the
    # SetInstancePrimaryNode call above; confirm that primary_node
    # reflects the update at this point
    self._OpenInstanceDisks(self.instance.primary_node, True)

  self.feedback_fn("* done")
def testHeterogeneousDiskless(self):
  # A list mixing an Rbd and a Drbd disk is expected not to match
  # DT_DISKLESS
  mixed_disks = [Rbd(), Drbd()]
  wanted_types = [constants.DT_DISKLESS]
  self.assertFalse(utils.AnyDiskOfType(mixed_disks, wanted_types))
def _ExecCleanup(self):
  """Try to cleanup after a failed migration.

  The cleanup is done by:
    - check that the instance is running only on one node
      (and update the config if needed)
    - change disks on its secondary node to secondary
    - wait until disks are fully synchronized
    - disconnect from the network
    - change disks into single-master mode
    - wait again until disks are fully synchronized

  @raise errors.OpExecError: if the instance appears to run on both the
      source and the target node, or on neither of them

  """
  # check running on only one node
  self.feedback_fn("* checking where the instance actually runs"
                   " (if this hangs, the hypervisor might be in"
                   " a bad state)")
  hvparams = self.cfg.GetClusterInfo().hvparams
  listings = self.rpc.call_instance_list(self.all_node_uuids,
                                         [self.instance.hypervisor],
                                         hvparams)
  for (node_uuid, listing) in listings.items():
    listing.Raise("Can't contact node %s" % node_uuid)

  inst_name = self.instance.name
  on_source = inst_name in listings[self.source_node_uuid].payload
  on_target = inst_name in listings[self.target_node_uuid].payload

  if on_source and on_target:
    raise errors.OpExecError(
      "Instance seems to be running on two nodes,"
      " or the hypervisor is confused; you will have"
      " to ensure manually that it runs only on one"
      " and restart this operation")

  if not on_source and not on_target:
    raise errors.OpExecError(
      "Instance does not seem to be running at all;"
      " in this case it's safer to repair by"
      " running 'gnt-instance stop' to ensure disk"
      " shutdown, and then restarting it")

  # From here on the instance runs on exactly one of the two nodes
  if on_source:
    self.feedback_fn("* instance confirmed to be running on its"
                     " primary node (%s)" %
                     self.cfg.GetNodeName(self.source_node_uuid))
    demoted_node_uuid = self.target_node_uuid
  else:
    # the migration has actually succeeded, we need to update the config
    self.feedback_fn("* instance running on secondary node (%s),"
                     " updating config" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    self.cfg.SetInstancePrimaryNode(self.instance.uuid,
                                    self.target_node_uuid)
    demoted_node_uuid = self.source_node_uuid

  inst_disks = self.cfg.GetInstanceDisks(self.instance.uuid)

  self._CloseInstanceDisks(demoted_node_uuid)

  if utils.AnyDiskOfType(inst_disks, constants.DTS_INT_MIRROR):
    try:
      self._WaitUntilSync()
    except errors.OpExecError:
      # we ignore here errors, since if the device is standalone, it
      # won't be able to sync
      pass
    self._GoStandalone()
    self._GoReconnect(False)
    self._WaitUntilSync()
  elif utils.AnyDiskOfType(inst_disks, constants.DTS_EXT_MIRROR):
    # NOTE(review): self.instance is not re-read after a possible
    # SetInstancePrimaryNode call above; confirm that primary_node
    # reflects the update at this point
    self._OpenInstanceDisks(self.instance.primary_node, True)

  self.feedback_fn("* done")
def testAnyDiskless(self):
  # An empty disk list is expected to match DT_DISKLESS
  no_disks = []
  wanted_types = [constants.DT_DISKLESS]
  self.assertTrue(utils.AnyDiskOfType(no_disks, wanted_types))