Example #1
    def _VerifyDrbdStates(self, node_errors, offline_disk_instance_names):
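        """Check DRBD disks that need activation and record problems.

        Nodes whose drbd_needs_activation RPC fails are recorded in
        node_errors; instances with any disk reported as needing activation
        are added to offline_disk_instance_names.

        """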
        node_to_inst = {}
        for inst in self.instances.values():
            disks = self.cfg.GetInstanceDisks(inst.uuid)
            if not (inst.disks_active
                    and utils.AnyDiskOfType(disks, [constants.DT_DRBD8])):
                continue

            secondary_nodes = self.cfg.GetInstanceSecondaryNodes(inst.uuid)
            for node_uuid in itertools.chain([inst.primary_node],
                                             secondary_nodes):
                node_to_inst.setdefault(node_uuid, []).append(inst)

        for (node_uuid, insts) in node_to_inst.items():
            node_disks = [(self.cfg.GetInstanceDisks(inst.uuid), inst)
                          for inst in insts]
            node_res = self.rpc.call_drbd_needs_activation(
                node_uuid, node_disks)
            msg = node_res.fail_msg
            if msg:
                logging.warning("Error getting DRBD status on node %s: %s",
                                self.cfg.GetNodeName(node_uuid), msg)
                node_errors[node_uuid] = msg
                continue

            faulty_disk_uuids = set(node_res.payload)
            for inst in self.instances.values():
                disks = self.cfg.GetInstanceDisks(inst.uuid)
                inst_disk_uuids = set([disk.uuid for disk in disks])
                if inst_disk_uuids.intersection(faulty_disk_uuids):
                    offline_disk_instance_names.add(inst.name)
  def GetRequest(self, cfg):
    """Request an relocation of an instance

    The checks for the completeness of the opcode must have already been
    done.

    """
    instance = cfg.GetInstanceInfo(self.inst_uuid)
    disks = cfg.GetInstanceDisks(self.inst_uuid)
    if instance is None:
      raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                   " IAllocator" % self.inst_uuid)

    if not utils.AllDiskOfType(disks, constants.DTS_MIRRORED):
      raise errors.OpPrereqError("Can't relocate non-mirrored instances",
                                 errors.ECODE_INVAL)

    secondary_nodes = cfg.GetInstanceSecondaryNodes(instance.uuid)
    if (utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR) and
        len(secondary_nodes) != 1):
      raise errors.OpPrereqError("Instance has not exactly one secondary node",
                                 errors.ECODE_STATE)

    disk_sizes = [{constants.IDISK_SIZE: disk.size,
                   constants.IDISK_TYPE: disk.dev_type} for disk in disks]
    disk_space = gmi.ComputeDiskSize(disk_sizes)

    return {
      "name": instance.name,
      "disk_space_total": disk_space,
      "required_nodes": 1,
      "relocate_from": cfg.GetNodeNames(self.relocate_from_node_uuids),
      }
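For reference, a hedged sketch of the request dict that GetRequest returns. The keys come from the code above; the instance name, node name, and disk_space_total value below are illustrative assumptions only (the real total depends on gmi.ComputeDiskSize's per-template overhead).

# Hypothetical example of the relocation request built above; names and the
# computed disk space are placeholders, not values from a real cluster.
example_request = {
  "name": "instance1.example.com",
  "disk_space_total": 20608,   # MiB; placeholder, depends on ComputeDiskSize
  "required_nodes": 1,
  "relocate_from": ["node2.example.com"],
}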
Example #3
def _DeclareLocksForMigration(lu, level):
  """Declares locks for L{TLMigrateInstance}.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf the locks are declared
  @param level: Lock level

  """
  if level == locking.LEVEL_NODE:
    assert lu.op.instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)

    instance = lu.cfg.GetInstanceInfo(lu.op.instance_uuid)

    disks = lu.cfg.GetInstanceDisks(instance.uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
      if lu.op.target_node is None:
        lu.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
      else:
        lu.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
                                               lu.op.target_node_uuid]
    else:
      lu._LockInstancesNodes() # pylint: disable=W0212

    assert (lu.needed_locks[locking.LEVEL_NODE] or
            lu.needed_locks[locking.LEVEL_NODE] is locking.ALL_SET)

  elif level == locking.LEVEL_NODE_RES:
    # Copy node locks
    lu.needed_locks[locking.LEVEL_NODE_RES] = \
      CopyLockList(lu.needed_locks[locking.LEVEL_NODE])
Example #4
  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    instance = self._migrater.instance
    source_node_uuid = instance.primary_node
    target_node_uuid = self._migrater.target_node_uuid
    env = {
      "IGNORE_CONSISTENCY": self.op.ignore_consistency,
      "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
      "OLD_PRIMARY": self.cfg.GetNodeName(source_node_uuid),
      "NEW_PRIMARY": self.cfg.GetNodeName(target_node_uuid),
      "FAILOVER_CLEANUP": self.op.cleanup,
      }

    disks = self.cfg.GetInstanceDisks(instance.uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
      env["OLD_SECONDARY"] = self.cfg.GetNodeName(secondary_nodes[0])
      env["NEW_SECONDARY"] = self.cfg.GetNodeName(source_node_uuid)
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""

    env.update(BuildInstanceHookEnvByObject(self, instance))

    return env
Example #5
  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    instance = self._migrater.instance
    source_node_uuid = instance.primary_node
    target_node_uuid = self._migrater.target_node_uuid
    env = BuildInstanceHookEnvByObject(self, instance)
    env.update({
      "MIGRATE_LIVE": self._migrater.live,
      "MIGRATE_CLEANUP": self.op.cleanup,
      "OLD_PRIMARY": self.cfg.GetNodeName(source_node_uuid),
      "NEW_PRIMARY": self.cfg.GetNodeName(target_node_uuid),
      "ALLOW_RUNTIME_CHANGES": self.op.allow_runtime_changes,
      })

    disks = self.cfg.GetInstanceDisks(instance.uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
      env["OLD_SECONDARY"] = self.cfg.GetNodeName(secondary_nodes[0])
      env["NEW_SECONDARY"] = self.cfg.GetNodeName(source_node_uuid)
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""

    return env
Example #6
    def CheckAssignmentForSplitInstances(self, changes, node_data,
                                         instance_data):
        """Check for split instances after a node assignment.

        This method considers a series of node assignments as an atomic
        operation, and returns information about split instances after
        applying the set of changes.

        In particular, it returns information about newly split instances,
        and about instances that were already split and remain so after the
        change.

        Only disks whose template is listed in constants.DTS_INT_MIRROR are
        considered.

        @type changes: list of (node_uuid, new_group_uuid) pairs.
        @param changes: list of node assignments to consider.
        @param node_data: a dict with data for all nodes
        @param instance_data: a dict with all instances to consider
        @rtype: a two-tuple
        @return: a list of instances that were previously okay and become
          split as a consequence of this change, and a list of instances
          that were previously split and that this change does not fix.

        """
        changed_nodes = dict((uuid, group) for uuid, group in changes
                             if node_data[uuid].group != group)

        all_split_instances = set()
        previously_split_instances = set()

        for inst in instance_data.values():
            inst_disks = self.cfg.GetInstanceDisks(inst.uuid)
            if not utils.AnyDiskOfType(inst_disks, constants.DTS_INT_MIRROR):
                continue

            inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
            if len(set(node_data[node_uuid].group
                       for node_uuid in inst_nodes)) > 1:
                previously_split_instances.add(inst.uuid)

            if len(
                    set(
                        changed_nodes.get(node_uuid,
                                          node_data[node_uuid].group)
                        for node_uuid in inst_nodes)) > 1:
                all_split_instances.add(inst.uuid)

        return (list(all_split_instances - previously_split_instances),
                list(previously_split_instances & all_split_instances))
Example #7
  def _RevertDiskStatus(self):
    """Try to revert the disk status after a failed migration.

    """

    disks = self.cfg.GetInstanceDisks(self.instance.uuid)

    self._CloseInstanceDisks(self.target_node_uuid)

    unmap_types = (constants.DT_RBD, constants.DT_EXT)
    if utils.AnyDiskOfType(disks, unmap_types):
      # If the instance's disk template is `rbd' or `ext' and there was an
      # unsuccessful migration, unmap the device from the target node.
      unmap_disks = [d for d in disks if d.dev_type in unmap_types]
      disks = ExpandCheckDisks(unmap_disks, unmap_disks)
      self.feedback_fn("* unmapping instance's disks %s from %s" %
                       (utils.CommaJoin(d.name for d in unmap_disks),
                        self.cfg.GetNodeName(self.target_node_uuid)))
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(self.target_node_uuid,
                                                 (disk, self.instance))
        msg = result.fail_msg
        if msg:
          logging.error("Migration failed and I couldn't unmap the block device"
                        " %s on target node %s: %s", disk.iv_name,
                        self.cfg.GetNodeName(self.target_node_uuid), msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.target_node_uuid))

    if utils.AllDiskOfType(disks, constants.DTS_EXT_MIRROR):
      self._OpenInstanceDisks(self.source_node_uuid, True)
      return

    try:
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()
    except errors.OpExecError as err:
      self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
                         " please try to recover the instance manually;"
                         " error '%s'" % str(err))
Example #8
 def testHeterogeneous(self):
     self.assertTrue(
         utils.AnyDiskOfType([Rbd(), Drbd()], [constants.DT_DRBD8]))
Example #9
 def testNotRbdDiskless(self):
     self.assertFalse(utils.AnyDiskOfType([Rbd()], [constants.DT_DISKLESS]))
Example #10
 def testNotDiskless(self):
     self.assertFalse(utils.AnyDiskOfType([], [constants.DT_DRBD8]))
Example #11
 def testNotRbd(self):
     self.assertFalse(utils.AnyDiskOfType([Rbd()], [constants.DT_DRBD8]))
Example #12
 def testOrRbd(self):
     self.assertTrue(
         utils.AnyDiskOfType([Rbd()],
                             [constants.DT_RBD, constants.DT_DRBD8]))
Example #13
 def testOrDrbd(self):
     self.assertTrue(
         utils.AnyDiskOfType([Drbd()],
                             [constants.DT_DISKLESS, constants.DT_DRBD8]))
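The tests in Examples #8 through #13 (and the diskless cases further below) exercise utils.AnyDiskOfType with lightweight Rbd/Drbd stand-ins that are not shown in this listing. The following is a minimal, self-contained sketch of how those stand-ins and the AnyDiskOfType/AllDiskOfType helpers plausibly fit together; the constant values, class definitions, and helper bodies here are assumptions for illustration, and the real Ganeti utils implementations may differ in detail.

# Hypothetical sketch -- local stand-ins for ganeti.constants values and the
# utils helpers used throughout these examples; defined here only so the
# snippet is self-contained and runnable.

DT_DISKLESS = "diskless"
DT_DRBD8 = "drbd"
DT_RBD = "rbd"


class Drbd(object):
  """Fake DRBD disk whose only relevant attribute is dev_type."""
  dev_type = DT_DRBD8


class Rbd(object):
  """Fake RBD disk stand-in."""
  dev_type = DT_RBD


def AnyDiskOfType(disks, dev_types):
  """Return True if at least one disk's template is in dev_types.

  An empty disk list is treated as "diskless", which is consistent with
  the tests in this listing: the check is False against [DT_DRBD8] but
  True against [DT_DISKLESS].
  """
  if not disks:
    return DT_DISKLESS in dev_types
  return any(d.dev_type in dev_types for d in disks)


def AllDiskOfType(disks, dev_types):
  """Return True only if every disk's template is in dev_types."""
  if not disks:
    return DT_DISKLESS in dev_types
  return all(d.dev_type in dev_types for d in disks)


# Quick usage check with a mixed disk set, mirroring how the cmdlib code
# above distinguishes "any DRBD disk present" from "all disks mirrored".
mixed = [Rbd(), Drbd()]
assert AnyDiskOfType(mixed, [DT_DRBD8])
assert not AllDiskOfType(mixed, [DT_DRBD8])
assert AllDiskOfType(mixed, [DT_RBD, DT_DRBD8])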
Example #14
    def _ExecMigration(self):
        """Migrate an instance.

        The migration is done by:
          - change the disks into dual-master mode
          - wait until disks are fully synchronized again
          - migrate the instance
          - change disks on the new secondary node (the old primary) to
            secondary
          - wait until disks are fully synchronized
          - change disks into single-master mode

        """
        # Check for hypervisor version mismatch and warn the user.
        hvspecs = [
            (self.instance.hypervisor,
             self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])
        ]
        nodeinfo = self.rpc.call_node_info(
            [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
        for ninfo in nodeinfo.values():
            ninfo.Raise("Unable to retrieve node information from node '%s'" %
                        ninfo.node)
        (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
        (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

        if ((constants.HV_NODEINFO_KEY_VERSION in src_info)
                and (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
            src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
            dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
            if src_version != dst_version:
                self.feedback_fn(
                    "* warning: hypervisor version mismatch between"
                    " source (%s) and target (%s) node" %
                    (src_version, dst_version))
                hv = hypervisor.GetHypervisor(self.instance.hypervisor)
                if hv.VersionsSafeForMigration(src_version, dst_version):
                    self.feedback_fn(
                        "  migrating from hypervisor version %s to %s should"
                        " be safe" % (src_version, dst_version))
                else:
                    self.feedback_fn(
                        "  migrating from hypervisor version %s to %s is"
                        " likely unsupported" % (src_version, dst_version))
                    if self.ignore_hvversions:
                        self.feedback_fn(
                            "  continuing anyway (told to ignore version"
                            " mismatch)")
                    else:
                        raise errors.OpExecError(
                            "Unsupported migration between hypervisor"
                            " versions (%s to %s)" %
                            (src_version, dst_version))

        self.feedback_fn(
            "* checking disk consistency between source and target")
        for (idx,
             dev) in enumerate(self.cfg.GetInstanceDisks(self.instance.uuid)):
            if not CheckDiskConsistency(self.lu, self.instance, dev,
                                        self.target_node_uuid, False):
                raise errors.OpExecError("Disk %s is degraded or not fully"
                                         " synchronized on target node,"
                                         " aborting migration" % idx)

        if self.current_mem > self.tgt_free_mem:
            if not self.allow_runtime_changes:
                raise errors.OpExecError(
                    "Memory ballooning not allowed and not enough"
                    " free memory to fit instance %s on target"
                    " node %s (have %dMB, need %dMB)" %
                    (self.instance.name,
                     self.cfg.GetNodeName(self.target_node_uuid),
                     self.tgt_free_mem, self.current_mem))
            self.feedback_fn("* setting instance memory to %s" %
                             self.tgt_free_mem)
            rpcres = self.rpc.call_instance_balloon_memory(
                self.instance.primary_node, self.instance, self.tgt_free_mem)
            rpcres.Raise("Cannot modify instance runtime memory")

        # First get the migration information from the remote node
        result = self.rpc.call_migration_info(self.source_node_uuid,
                                              self.instance)
        msg = result.fail_msg
        if msg:
            log_err = (
                "Failed fetching source migration information from %s: %s" %
                (self.cfg.GetNodeName(self.source_node_uuid), msg))
            logging.error(log_err)
            raise errors.OpExecError(log_err)

        self.migration_info = migration_info = result.payload

        disks = self.cfg.GetInstanceDisks(self.instance.uuid)

        self._CloseInstanceDisks(self.target_node_uuid)

        if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
            # Then switch the disks to master/master mode
            self._GoStandalone()
            self._GoReconnect(True)
            self._WaitUntilSync()

        self._OpenInstanceDisks(self.source_node_uuid, False)
        self._OpenInstanceDisks(self.target_node_uuid, False)

        self.feedback_fn("* preparing %s to accept the instance" %
                         self.cfg.GetNodeName(self.target_node_uuid))
        result = self.rpc.call_accept_instance(
            self.target_node_uuid, self.instance, migration_info,
            self.nodes_ip[self.target_node_uuid])

        msg = result.fail_msg
        if msg:
            logging.error(
                "Instance pre-migration failed, trying to revert"
                " disk status: %s", msg)
            self.feedback_fn("Pre-migration failed, aborting")
            self._AbortMigration()
            self._RevertDiskStatus()
            raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                                     (self.instance.name, msg))

        self.feedback_fn("* migrating instance to %s" %
                         self.cfg.GetNodeName(self.target_node_uuid))
        cluster = self.cfg.GetClusterInfo()
        result = self.rpc.call_instance_migrate(
            self.source_node_uuid, cluster.cluster_name, self.instance,
            self.nodes_ip[self.target_node_uuid], self.live)
        msg = result.fail_msg
        if msg:
            logging.error(
                "Instance migration failed, trying to revert"
                " disk status: %s", msg)
            self.feedback_fn("Migration failed, aborting")
            self._AbortMigration()
            self._RevertDiskStatus()
            raise errors.OpExecError("Could not migrate instance %s: %s" %
                                     (self.instance.name, msg))

        self.feedback_fn("* starting memory transfer")
        last_feedback = time.time()
        while True:
            result = self.rpc.call_instance_get_migration_status(
                self.source_node_uuid, self.instance)
            msg = result.fail_msg
            ms = result.payload  # MigrationStatus instance
            if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
                logging.error(
                    "Instance migration failed, trying to revert"
                    " disk status: %s", msg)
                self.feedback_fn("Migration failed, aborting")
                self._AbortMigration()
                self._RevertDiskStatus()
                if not msg:
                    msg = "hypervisor returned failure"
                raise errors.OpExecError("Could not migrate instance %s: %s" %
                                         (self.instance.name, msg))

            if result.payload.status != constants.HV_MIGRATION_ACTIVE:
                self.feedback_fn("* memory transfer complete")
                break

            if (utils.TimeoutExpired(last_feedback,
                                     self._MIGRATION_FEEDBACK_INTERVAL)
                    and ms.transferred_ram is not None):
                mem_progress = 100 * float(ms.transferred_ram) / float(
                    ms.total_ram)
                self.feedback_fn("* memory transfer progress: %.2f %%" %
                                 mem_progress)
                last_feedback = time.time()

            time.sleep(self._MIGRATION_POLL_INTERVAL)

        result = self.rpc.call_instance_finalize_migration_src(
            self.source_node_uuid, self.instance, True, self.live)
        msg = result.fail_msg
        if msg:
            logging.error(
                "Instance migration succeeded, but finalization failed"
                " on the source node: %s", msg)
            raise errors.OpExecError(
                "Could not finalize instance migration: %s" % msg)

        self.cfg.SetInstancePrimaryNode(self.instance.uuid,
                                        self.target_node_uuid)
        self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)
        disks = self.cfg.GetInstanceDisks(self.instance_uuid)

        result = self.rpc.call_instance_finalize_migration_dst(
            self.target_node_uuid, self.instance, migration_info, True)
        msg = result.fail_msg
        if msg:
            logging.error(
                "Instance migration succeeded, but finalization failed"
                " on the target node: %s", msg)
            raise errors.OpExecError(
                "Could not finalize instance migration: %s" % msg)

        self._CloseInstanceDisks(self.source_node_uuid)

        if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
            self._WaitUntilSync()
            self._GoStandalone()
            self._GoReconnect(False)
            self._WaitUntilSync()
        elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
            self._OpenInstanceDisks(self.target_node_uuid, True)

        # If the instance's disk template is `rbd' or `ext' and there was a
        # successful migration, unmap the device from the source node.
        unmap_types = (constants.DT_RBD, constants.DT_EXT)

        if utils.AnyDiskOfType(disks, unmap_types):
            unmap_disks = [d for d in disks if d.dev_type in unmap_types]
            disks = ExpandCheckDisks(unmap_disks, unmap_disks)
            self.feedback_fn("* unmapping instance's disks %s from %s" %
                             (utils.CommaJoin(d.name for d in unmap_disks),
                              self.cfg.GetNodeName(self.source_node_uuid)))
            for disk in disks:
                result = self.rpc.call_blockdev_shutdown(
                    self.source_node_uuid, (disk, self.instance))
                msg = result.fail_msg
                if msg:
                    logging.error(
                        "Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s", disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid), msg)
                    logging.error(
                        "You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid))

        self.feedback_fn("* done")
Example #15
  def _ExecMigration(self):
    """Migrate an instance.

    The migration is done by:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    """
    # Check for hypervisor version mismatch and warn the user.
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
                 [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
      ninfo.Raise("Unable to retrieve node information from node '%s'" %
                  ninfo.node)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
        (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
      src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
      dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
      if src_version != dst_version:
        self.feedback_fn("* warning: hypervisor version mismatch between"
                         " source (%s) and target (%s) node" %
                         (src_version, dst_version))
        hv = hypervisor.GetHypervisorClass(self.instance.hypervisor)
        if hv.VersionsSafeForMigration(src_version, dst_version):
          self.feedback_fn("  migrating from hypervisor version %s to %s should"
                           " be safe" % (src_version, dst_version))
        else:
          self.feedback_fn("  migrating from hypervisor version %s to %s is"
                           " likely unsupported" % (src_version, dst_version))
          if self.ignore_hvversions:
            self.feedback_fn("  continuing anyway (told to ignore version"
                             " mismatch)")
          else:
            raise errors.OpExecError("Unsupported migration between hypervisor"
                                     " versions (%s to %s)" %
                                     (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.cfg.GetInstanceDisks(self.instance.uuid)):
      if not CheckDiskConsistency(self.lu, self.instance, dev,
                                  self.target_node_uuid,
                                  False):
        raise errors.OpExecError("Disk %s is degraded or not fully"
                                 " synchronized on target node,"
                                 " aborting migration" % idx)

    if self.current_mem > self.tgt_free_mem:
      if not self.allow_runtime_changes:
        raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                 " free memory to fit instance %s on target"
                                 " node %s (have %dMB, need %dMB)" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  self.tgt_free_mem, self.current_mem))
      self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
      rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                     self.instance,
                                                     self.tgt_free_mem)
      rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
      log_err = ("Failed fetching source migration information from %s: %s" %
                 (self.cfg.GetNodeName(self.source_node_uuid), msg))
      logging.error(log_err)
      raise errors.OpExecError(log_err)

    self.migration_info = migration_info = result.payload

    disks = self.cfg.GetInstanceDisks(self.instance.uuid)

    self._CloseInstanceDisks(self.target_node_uuid)

    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      # Then switch the disks to master/master mode
      self._GoStandalone()
      self._GoReconnect(True)
      self._WaitUntilSync()

    self._OpenInstanceDisks(self.source_node_uuid, False)
    self._OpenInstanceDisks(self.target_node_uuid, False)

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Pre-migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
        self.source_node_uuid, cluster.cluster_name, self.instance,
        self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()

    cluster_migration_caps = \
      cluster.hvparams.get("kvm", {}).get(constants.HV_KVM_MIGRATION_CAPS, "")
    migration_caps = \
      self.instance.hvparams.get(constants.HV_KVM_MIGRATION_CAPS,
                                 cluster_migration_caps)
    # migration_caps is a ':' delimited string, so checking
    # if 'postcopy-ram' is a substring also covers using
    # x-postcopy-ram for QEMU 2.5
    postcopy_enabled = "postcopy-ram" in migration_caps
    while True:
      result = self.rpc.call_instance_get_migration_status(
                 self.source_node_uuid, self.instance)
      msg = result.fail_msg
      ms = result.payload   # MigrationStatus instance
      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        if not msg:
          msg = "hypervisor returned failure"
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

      if (postcopy_enabled
          and ms.status == constants.HV_MIGRATION_ACTIVE
          and int(ms.dirty_sync_count) >= self._POSTCOPY_SYNC_COUNT_THRESHOLD):
        self.feedback_fn("* finishing memory transfer with postcopy")
        self.rpc.call_instance_start_postcopy(self.source_node_uuid,
                                              self.instance)

      if self.instance.hypervisor == 'kvm':
        migration_active = \
          ms.status in constants.HV_KVM_MIGRATION_ACTIVE_STATUSES
      else:
        migration_active = \
          ms.status == constants.HV_MIGRATION_ACTIVE
      if not migration_active:
        self.feedback_fn("* memory transfer complete")
        break

      if (utils.TimeoutExpired(last_feedback,
                               self._MIGRATION_FEEDBACK_INTERVAL) and
          ms.transferred_ram is not None):
        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
        last_feedback = time.time()

      time.sleep(self._MIGRATION_POLL_INTERVAL)

    # Always call finalize on both source and target, they should compose
    # a single operation, consisting of (potentially) parallel steps, that
    # should be always attempted/retried together (like in _AbortMigration)
    # without setting any expectations about the order in which they execute.
    result_src = self.rpc.call_instance_finalize_migration_src(
        self.source_node_uuid, self.instance, True, self.live)

    result_dst = self.rpc.call_instance_finalize_migration_dst(
        self.target_node_uuid, self.instance, migration_info, True)

    err_msg = []
    if result_src.fail_msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the source node: %s", result_src.fail_msg)
      err_msg.append(self.cfg.GetNodeName(self.source_node_uuid) + ': '
                     + result_src.fail_msg)

    if result_dst.fail_msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the target node: %s", result_dst.fail_msg)
      err_msg.append(self.cfg.GetNodeName(self.target_node_uuid) + ': '
                     + result_dst.fail_msg)

    if err_msg:
      raise errors.OpExecError(
          "Could not finalize instance migration: %s" % ' '.join(err_msg))

    # Update instance location only after finalize completed. This way, if
    # either finalize fails, the config still stores the old primary location,
    # so we can know which instance to delete if we need to (manually) clean up.
    self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
    self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)

    self._CloseInstanceDisks(self.source_node_uuid)
    disks = self.cfg.GetInstanceDisks(self.instance_uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      self._WaitUntilSync()
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()
    elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
      self._OpenInstanceDisks(self.target_node_uuid, True)

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    unmap_types = (constants.DT_RBD, constants.DT_EXT)

    if utils.AnyDiskOfType(disks, unmap_types):
      unmap_disks = [d for d in disks if d.dev_type in unmap_types]
      disks = ExpandCheckDisks(unmap_disks, unmap_disks)
      self.feedback_fn("* unmapping instance's disks %s from %s" %
                       (utils.CommaJoin(d.name for d in unmap_disks),
                        self.cfg.GetNodeName(self.source_node_uuid)))
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                 (disk, self.instance))
        msg = result.fail_msg
        if msg:
          logging.error("Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid), msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")
Example #16
  def _ExecCleanup(self):
    """Try to cleanup after a failed migration.

    The cleanup is done by:
      - check that the instance is running only on one node
      - try 'aborting' migration if it is running on two nodes
      - update the config if needed
      - change disks on its secondary node to secondary
      - wait until disks are fully synchronized
      - disconnect from the network
      - change disks into single-master mode
      - wait again until disks are fully synchronized

    """
    instance_locations = self._FindInstanceLocations(self.instance.name)
    runningon_source = self.source_node_uuid in instance_locations
    runningon_target = self.target_node_uuid in instance_locations

    if runningon_source and runningon_target:
      # If we have an instance on both the source and the destination, we
      # know that instance migration was interrupted in the middle, so we can
      # try to do effectively the same as when aborting an interrupted
      # migration.
      self.feedback_fn("Trying to cleanup after failed migration")
      result = self.rpc.call_migration_info(
          self.source_node_uuid, self.instance)
      if result.fail_msg:
        raise errors.OpExecError(
            "Failed fetching source migration information from %s: %s" %
            (self.cfg.GetNodeName(self.source_node_uuid), result.fail_msg))
      self.migration_info = result.payload
      abort_results = self._AbortMigration()

      if abort_results[0].fail_msg or abort_results[1].fail_msg:
        raise errors.OpExecError(
            "Instance migration cleanup failed: %s" % ','.join([
                abort_results[0].fail_msg, abort_results[1].fail_msg]))

      # AbortMigration() should have fixed instance locations, so query again
      instance_locations = self._FindInstanceLocations(self.instance.name)
      runningon_source = self.source_node_uuid in instance_locations
      runningon_target = self.target_node_uuid in instance_locations

    # Abort didn't work, manual intervention required
    if runningon_source and runningon_target:
      raise errors.OpExecError("Instance seems to be running on two nodes,"
                               " or the hypervisor is confused; you will have"
                               " to ensure manually that it runs only on one"
                               " and restart this operation")

    if not (runningon_source or runningon_target):
      if len(instance_locations) == 1:
        # The instance is running on a different node than expected, let's
        # adopt it as if it was running on the secondary
        self.target_node_uuid = instance_locations[0]
        self.feedback_fn("* instance running on unexpected node (%s),"
                         " updating as the new secondary" %
                         self.cfg.GetNodeName(self.target_node_uuid))
        runningon_target = True
      else:
        raise errors.OpExecError("Instance does not seem to be running at all;"
                                 " in this case it's safer to repair by"
                                 " running 'gnt-instance stop' to ensure disk"
                                 " shutdown, and then restarting it")

    if runningon_target:
      # the migration has actually succeeded, we need to update the config
      self.feedback_fn("* instance running on secondary node (%s),"
                       " updating config" %
                       self.cfg.GetNodeName(self.target_node_uuid))
      self.cfg.SetInstancePrimaryNode(self.instance.uuid,
                                      self.target_node_uuid)
      demoted_node_uuid = self.source_node_uuid
    else:
      self.feedback_fn("* instance confirmed to be running on its"
                       " primary node (%s)" %
                       self.cfg.GetNodeName(self.source_node_uuid))
      demoted_node_uuid = self.target_node_uuid

    disks = self.cfg.GetInstanceDisks(self.instance.uuid)

    # TODO: Cleanup code duplication of _RevertDiskStatus()
    self._CloseInstanceDisks(demoted_node_uuid)

    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      try:
        self._WaitUntilSync()
      except errors.OpExecError:
        # we ignore errors here, since if the device is standalone, it
        # won't be able to sync
        pass
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()
    elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
      self._OpenInstanceDisks(self.instance.primary_node, True)

    self.feedback_fn("* done")
Example #17
 def testHeterogeneousDiskless(self):
     self.assertFalse(
         utils.AnyDiskOfType([Rbd(), Drbd()], [constants.DT_DISKLESS]))
Example #18
    def _ExecCleanup(self):
        """Try to cleanup after a failed migration.

    The cleanup is done by:
      - check that the instance is running only on one node
        (and update the config if needed)
      - change disks on its secondary node to secondary
      - wait until disks are fully synchronized
      - disconnect from the network
      - change disks into single-master mode
      - wait again until disks are fully synchronized

    """
        # check running on only one node
        self.feedback_fn("* checking where the instance actually runs"
                         " (if this hangs, the hypervisor might be in"
                         " a bad state)")
        cluster_hvparams = self.cfg.GetClusterInfo().hvparams
        ins_l = self.rpc.call_instance_list(self.all_node_uuids,
                                            [self.instance.hypervisor],
                                            cluster_hvparams)
        for node_uuid, result in ins_l.items():
            result.Raise("Can't contact node %s" % node_uuid)

        runningon_source = self.instance.name in \
                             ins_l[self.source_node_uuid].payload
        runningon_target = self.instance.name in \
                             ins_l[self.target_node_uuid].payload

        if runningon_source and runningon_target:
            raise errors.OpExecError(
                "Instance seems to be running on two nodes,"
                " or the hypervisor is confused; you will have"
                " to ensure manually that it runs only on one"
                " and restart this operation")

        if not (runningon_source or runningon_target):
            raise errors.OpExecError(
                "Instance does not seem to be running at all;"
                " in this case it's safer to repair by"
                " running 'gnt-instance stop' to ensure disk"
                " shutdown, and then restarting it")

        if runningon_target:
            # the migration has actually succeeded, we need to update the config
            self.feedback_fn("* instance running on secondary node (%s),"
                             " updating config" %
                             self.cfg.GetNodeName(self.target_node_uuid))
            self.cfg.SetInstancePrimaryNode(self.instance.uuid,
                                            self.target_node_uuid)
            demoted_node_uuid = self.source_node_uuid
        else:
            self.feedback_fn("* instance confirmed to be running on its"
                             " primary node (%s)" %
                             self.cfg.GetNodeName(self.source_node_uuid))
            demoted_node_uuid = self.target_node_uuid

        disks = self.cfg.GetInstanceDisks(self.instance.uuid)

        self._CloseInstanceDisks(demoted_node_uuid)

        if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
            try:
                self._WaitUntilSync()
            except errors.OpExecError:
                # we ignore errors here, since if the device is standalone, it
                # won't be able to sync
                pass
            self._GoStandalone()
            self._GoReconnect(False)
            self._WaitUntilSync()
        elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
            self._OpenInstanceDisks(self.instance.primary_node, True)

        self.feedback_fn("* done")
Example #19
 def testAnyDiskless(self):
     self.assertTrue(utils.AnyDiskOfType([], [constants.DT_DISKLESS]))