Example #1
    def __init__(self, pool, maxHostID, monitorInterval=2):
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = str(pool.spUUID)
        self._spmStorageDir = pool.storage_repository
        tpSize = config.getint('irs', 'thread_pool_size') / 2
        waitTimeout = 3
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool("mailbox-spm", tpSize, waitTimeout, maxTasks)
        #  *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
        #                      versa *** #
        self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                                   "mastersd", sd.DOMAIN_META_DATA, "inbox")
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does not "
                           "exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s does "
                               "not exist" % repr(self._inbox))
        self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                    "mastersd", sd.DOMAIN_META_DATA, "outbox")
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1'
                       ]
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1'
                        ]
        self._outLock = threading.Lock()
        self._inLock = threading.Lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                       "%s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
        if rc:
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")

        t = concurrent.thread(self.run, name="mailbox-spm",
                              logger=self.log.name)
        t.start()
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
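
A note on the sizing above: the outgoing mail buffer and the dd block size are both MAILBOX_SIZE * maxHostID bytes, written as a single block (count=1). A minimal sketch of that arithmetic, assuming MAILBOX_SIZE is 4096 bytes (an assumption; the constant is not shown in this excerpt):

# Hypothetical walk-through of the dd sizing used when clearing the outbox.
MAILBOX_SIZE = 4096                      # assumed per-host mailbox size in bytes
maxHostID = 250                          # illustrative value only
outMailLen = MAILBOX_SIZE * maxHostID    # bytes of NULs fed to dd's stdin
# The clearing command then looks roughly like:
#   dd of=<outbox> oflag=direct iflag=fullblock conv=notrunc count=1 bs=1024000
assert outMailLen == 1024000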
Example #2
    def __init__(self, vm, dst='', dstparams='',
                 mode=MODE_REMOTE, method=METHOD_ONLINE,
                 tunneled=False, dstqemu='', abortOnError=False,
                 consoleAddress=None, compressed=False,
                 autoConverge=False, **kwargs):
        self.log = vm.log
        self._vm = vm
        self._dst = dst
        self._mode = mode
        if method != METHOD_ONLINE:
            self.log.warning(
                'migration method %s is deprecated, forced to "online"',
                method)
        self._dstparams = dstparams
        self._machineParams = {}
        self._tunneled = utils.tobool(tunneled)
        self._abortOnError = utils.tobool(abortOnError)
        self._consoleAddress = consoleAddress
        self._dstqemu = dstqemu
        self._downtime = kwargs.get('downtime') or \
            config.get('vars', 'migration_downtime')
        self._maxBandwidth = int(
            kwargs.get('maxBandwidth') or
            config.getint('vars', 'migration_max_bandwidth')
        )
        self._autoConverge = autoConverge
        self._compressed = compressed
        self.status = {
            'status': {
                'code': 0,
                'message': 'Migration in progress'}}
        self._progress = 0
        threading.Thread.__init__(self)
        self._preparingMigrationEvt = True
        self._migrationCanceledEvt = False
        self._monitorThread = None
        self._destServer = None

        progress_timeout = config.getint('vars', 'migration_progress_timeout')

        self._convergence_schedule = {
            'init': [],
            'stalling': [
                {
                    'limit': progress_timeout,
                    'action': {
                        'name': CONVERGENCE_SCHEDULE_SET_ABORT,
                        'params': []
                    }
                }
            ]
        }

        self._use_convergence_schedule = False
        if 'convergenceSchedule' in kwargs:
            self._convergence_schedule = kwargs.get('convergenceSchedule')
            self._use_convergence_schedule = True

        self.log.debug('convergence schedule set to: %s',
                       str(self._convergence_schedule))
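
Callers can replace the default schedule above via the convergenceSchedule keyword argument. A minimal sketch of the expected shape, reusing only the structure visible in this constructor (the limit value and the constant's string value are assumptions, not vdsm defaults):

# Hypothetical convergenceSchedule argument: an 'init' action list plus a
# 'stalling' list of {limit, action} steps, mirroring the default above.
CONVERGENCE_SCHEDULE_SET_ABORT = 'setAbort'   # assumed value of the constant
convergence_schedule = {
    'init': [],
    'stalling': [
        {
            'limit': 150,   # illustrative limit (same units as progress_timeout)
            'action': {
                'name': CONVERGENCE_SCHEDULE_SET_ABORT,
                'params': []
            }
        }
    ]
}
# Passed to the constructor above as convergenceSchedule=convergence_schedule.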
Example #3
def start(cif):
    global _operations

    _scheduler.start()
    _executor.start()

    def per_vm_operation(func, period):
        disp = VmDispatcher(cif.getVMs, _executor, func, _timeout_from(period))
        return Operation(disp, period)

    _operations = [
        # needs dispatching because updating the volume stats needs to
        # access the storage, thus can block.
        per_vm_operation(UpdateVolumes, config.getint("irs", "vol_size_sample_interval")),
        # needs dispatching because it accesses FS and libvirt data
        per_vm_operation(NumaInfoMonitor, config.getint("vars", "vm_sample_numa_interval")),
        # Job monitoring needs QEMU monitor access.
        per_vm_operation(BlockjobMonitor, config.getint("vars", "vm_sample_jobs_interval")),
        # libvirt sampling using bulk stats can block, but unresponsive
        # domains are handled inside VMBulkSampler for performance reasons;
        # thus, does not need dispatching.
        Operation(
            sampling.VMBulkSampler(libvirtconnection.get(cif), cif.getVMs, sampling.stats_cache),
            config.getint("vars", "vm_sample_interval"),
        ),
        # we do this only until we get high water mark notifications
        # from qemu. Accesses storage and/or the qemu monitor, so it can
        # block, thus we need dispatching.
        per_vm_operation(DriveWatermarkMonitor, config.getint("vars", "vm_watermark_interval")),
    ]

    for op in _operations:
        op.start()
Example #4
    def __init__(self, irs, log, scheduler):
        """
        Initialize the (single) clientIF instance

        :param irs: a Dispatcher object to be used as this object's irs.
        :type irs: :class:`storage.dispatcher.Dispatcher`
        :param log: a log object to be used for this object's logging.
        :type log: :class:`logging.Logger`
        """
        self.vmContainerLock = threading.Lock()
        self._networkSemaphore = threading.Semaphore()
        self._shutdownSemaphore = threading.Semaphore()
        self.irs = irs
        if self.irs:
            self._contEIOVmsCB = partial(clientIF.contEIOVms, proxy(self))
            self.irs.registerDomainStateChangeCallback(self._contEIOVmsCB)
        self.log = log
        self._recovery = True
        self.channelListener = Listener(self.log)
        self._generationID = str(uuid.uuid4())
        self.mom = None
        self.bindings = {}
        self._broker_client = None
        self._subscriptions = defaultdict(list)
        self._scheduler = scheduler
        if _glusterEnabled:
            self.gluster = gapi.GlusterApi(self, log)
        else:
            self.gluster = None
        try:
            self.vmContainer = {}
            self._hostStats = sampling.HostStatsThread(
                sampling.host_samples)
            self._hostStats.start()
            self.lastRemoteAccess = 0
            self._enabled = True
            self._netConfigDirty = False
            self._prepareMOM()
            secret.clear()
            concurrent.thread(self._recoverThread, name='clientIFinit').start()
            self.channelListener.settimeout(
                config.getint('vars', 'guest_agent_timeout'))
            self.channelListener.start()
            self.threadLocal = threading.local()
            self.threadLocal.client = ''

            host = config.get('addresses', 'management_ip')
            port = config.getint('addresses', 'management_port')

            self._createAcceptor(host, port)
            self._prepareXMLRPCBinding()
            self._prepareJSONRPCBinding()
            self._connectToBroker()
        except:
            self.log.error('failed to init clientIF, '
                           'shutting down storage dispatcher')
            if self.irs:
                self.irs.prepareForShutdown()
            raise
Example #5
    def __init__(self, poolID, maxHostID, inbox, outbox, monitorInterval=2):
        """
        Note: the inbox parameter here should point to the HSM's outbox
        mailbox file, and vice versa.
        """
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = poolID
        tpSize = config.getint('irs', 'thread_pool_size') / 2
        waitTimeout = wait_timeout(monitorInterval)
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool("mailbox-spm", tpSize, waitTimeout, maxTasks)
        self._inbox = inbox
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does not "
                           "exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s does "
                               "not exist" % repr(self._inbox))
        self._outbox = outbox
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1'
                       ]
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1'
                        ]
        self._outLock = threading.Lock()
        self._inLock = threading.Lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                       "%s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
        if rc:
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")

        self._thread = concurrent.thread(
            self.run, name="mailbox-spm", log=self.log)
        self._thread.start()
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
Example #6
 def __init__(self,
              tpSize=config.getint('irs', 'thread_pool_size'),
              waitTimeout=3,
              maxTasks=config.getint('irs', 'max_tasks')):
     self.storage_repository = config.get('irs', 'repository')
     self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
     self._tasks = {}
     self._unqueuedTasks = []
Example #7
 def __init__(self,
              tpSize=config.getint('irs', 'thread_pool_size'),
              waitTimeout=3,
              maxTasks=config.getint('irs', 'max_tasks')):
     self.tp = ThreadPool("tasks", tpSize, waitTimeout, maxTasks)
     self._tasks = {}
     self._unqueuedTasks = []
     self._insertTaskLock = threading.Lock()
Example #8
    def _regular_run(self):
        self.log.debug("Starting migration source thread")
        self._recovery = False
        self._update_outgoing_limit()
        try:
            startTime = time.time()
            machineParams = self._setupRemoteMachineParams()
            self._setupVdsConnection()
            self._prepareGuest()

            while not self._started:
                try:
                    self.log.info("Migration semaphore: acquiring")
                    with SourceThread.ongoingMigrations:
                        self.log.info("Migration semaphore: acquired")
                        timeout = config.getint(
                            'vars', 'guest_lifecycle_event_reply_timeout')
                        if self.hibernating:
                            self._vm.guestAgent.events.before_hibernation(
                                wait_timeout=timeout)
                        elif self._enableGuestEvents:
                            self._vm.guestAgent.events.before_migration(
                                wait_timeout=timeout)
                        if self._migrationCanceledEvt.is_set():
                            self._raiseAbortError()
                        self.log.debug("migration semaphore acquired "
                                       "after %d seconds",
                                       time.time() - startTime)
                        migrationParams = {
                            'dst': self._dst,
                            'mode': self._mode,
                            'method': METHOD_ONLINE,
                            'dstparams': self._dstparams,
                            'dstqemu': self._dstqemu,
                        }
                        self._startUnderlyingMigration(
                            time.time(), migrationParams, machineParams
                        )
                        self._finishSuccessfully(machineParams)
                except libvirt.libvirtError as e:
                    if e.get_error_code() == libvirt.VIR_ERR_OPERATION_ABORTED:
                        self.status = response.error(
                            'migCancelErr', message='Migration canceled')
                    raise
                except MigrationLimitExceeded:
                    retry_timeout = config.getint('vars',
                                                  'migration_retry_timeout')
                    self.log.debug("Migration destination busy. Initiating "
                                   "retry in %d seconds.", retry_timeout)
                    self._migrationCanceledEvt.wait(retry_timeout)
        except MigrationDestinationSetupError as e:
            self._recover(str(e))
            # we know what happened, no need to dump hollow stack trace
        except Exception as e:
            self._recover(str(e))
            self.log.exception("Failed to migrate")
Example #9
def get():
    caps = {}

    caps['kvmEnabled'] = \
        str(config.getboolean('vars', 'fake_kvm_support') or
            os.path.exists('/dev/kvm')).lower()

    cpuInfo = CpuInfo()
    cpuTopology = CpuTopology()
    if config.getboolean('vars', 'report_host_threads_as_cores'):
        caps['cpuCores'] = str(cpuTopology.threads())
    else:
        caps['cpuCores'] = str(cpuTopology.cores())

    caps['cpuThreads'] = str(cpuTopology.threads())
    caps['cpuSockets'] = str(cpuTopology.sockets())
    caps['cpuSpeed'] = cpuInfo.mhz()
    if config.getboolean('vars', 'fake_kvm_support'):
        caps['cpuModel'] = 'Intel(Fake) CPU'
        flags = set(cpuInfo.flags() + ['vmx', 'sse2', 'nx'])
        caps['cpuFlags'] = ','.join(flags) + ',model_486,model_pentium,' \
            'model_pentium2,model_pentium3,model_pentiumpro,model_qemu32,' \
            'model_coreduo,model_core2duo,model_n270,model_Conroe,' \
            'model_Penryn,model_Nehalem,model_Opteron_G1'
    else:
        caps['cpuModel'] = cpuInfo.model()
        caps['cpuFlags'] = ','.join(cpuInfo.flags() +
                                    _getCompatibleCpuModels())

    caps.update(dsaversion.version_info)
    caps.update(netinfo.get())

    try:
        caps['hooks'] = hooks.installed()
    except:
        logging.debug('not reporting hooks', exc_info=True)

    caps['operatingSystem'] = osversion()
    caps['uuid'] = utils.getHostUUID()
    caps['packages2'] = _getKeyPackages()
    caps['emulatedMachines'] = _getEmulatedMachines()
    caps['ISCSIInitiatorName'] = _getIscsiIniName()
    caps['HBAInventory'] = storage.hba.HBAInventory()
    caps['vmTypes'] = ['kvm']

    caps['memSize'] = str(utils.readMemInfo()['MemTotal'] / 1024)
    caps['reservedMem'] = str(config.getint('vars', 'host_mem_reserve') +
                              config.getint('vars', 'extra_mem_reserve'))
    caps['guestOverhead'] = config.get('vars', 'guest_ram_overhead')

    return caps
Example #10
    def __init__(self, vm, downtime):
        super(DowntimeThread, self).__init__()
        self.DOWNTIME_STEPS = config.getint('vars', 'migration_downtime_steps')

        self._vm = vm
        self._downtime = downtime
        self._stop = threading.Event()

        delay_per_gib = config.getint('vars', 'migration_downtime_delay')
        memSize = int(vm.conf['memSize'])
        self._wait = (delay_per_gib * max(memSize, 2048) + 1023) / 1024

        self.daemon = True
        self.start()
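
A worked example of the wait computed above (the delay value is illustrative, not necessarily the configured vdsm default):

# (delay_per_gib * max(memSize, 2048) + 1023) / 1024 is a ceiling division:
# roughly delay_per_gib seconds per GiB of guest memory, with a 2 GiB floor.
# // below mirrors the Python 2 integer division used in the code above.
delay_per_gib = 75          # assumed vars/migration_downtime_delay value
memSize = 4096              # guest memory in MiB
wait = (delay_per_gib * max(memSize, 2048) + 1023) // 1024
assert wait == 300          # a 4 GiB guest waits about 300 seconds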
Example #11
def forceScsiScan():
    processes = []
    minTimeout = config.getint('irs', 'scsi_rescan_minimal_timeout')
    maxTimeout = config.getint('irs', 'scsi_rescan_maximal_timeout')
    for hba in glob.glob(SCAN_PATTERN):
        cmd = [constants.EXT_DD, 'of=' + hba]
        p = misc.execCmd(cmd, sudo=False, sync=False)
        try:
            p.stdin.write("- - -")
            p.stdin.flush()
            p.stdin.close()
        except OSError as e:
            if p.wait(0) is False:
                log.error("pid %s still running", p.pid)
            log.warning("Error in rescan of hba:%s with returncode:%s and "
                        "error message: %s", hba, p.returncode,
                        p.stderr.read(1000))
            if e.errno != errno.EPIPE:
                raise
            else:
                log.warning("Ignoring error in rescan of hba %s: ",
                            hba, exc_info=True)
                continue
        processes.append((hba, p))
    if (minTimeout > maxTimeout or minTimeout < 0):
        minTimeout = 2
        maxTimeout = 30
        log.warning("One of the following configuration arguments has an "
                    "illegal value: scsi_rescan_minimal_timeout or "
                    "scsi_rescan_maximal_timeout. Set to %s and %s seconds "
                    "respectively.", minTimeout, maxTimeout)
    log.debug("Performing SCSI scan, this will take up to %s seconds",
              maxTimeout)
    time.sleep(minTimeout)
    for i in xrange(maxTimeout - minTimeout):
        for p in processes[:]:
            (hba, proc) = p
            if proc.wait(0):
                if proc.returncode != 0:
                    log.warning('returncode for: %s is: %s', hba,
                                proc.returncode)
                processes.remove(p)
        if not processes:
            break
        else:
            time.sleep(1)
    else:
        log.warning("Still waiting for scsi scan of hbas: %s",
                    tuple(hba for hba, proc in processes))
Example #12
File: vm.py Project: ekohl/vdsm
 def shutdown(self, timeout, message):
     try:
         now = time.time()
         if self.lastStatus == 'Down':
             return
         if self.guestAgent and self.guestAgent.isResponsive():
             self._guestEventTime = now
             self._guestEvent = 'Powering down'
             self.log.debug('guestAgent shutdown called')
             self.guestAgent.desktopShutdown(timeout, message)
             agent_timeout = int(timeout) + config.getint('vars', 'sys_shutdown_timeout')
             timer = threading.Timer(agent_timeout, self._timedShutdown)
             timer.start()
         elif self.conf['acpiEnable'].lower() == "true":
             self._guestEventTime = now
             self._guestEvent = 'Powering down'
             self._acpiShutdown()
         # No tools, no ACPI
         else:
             return {'status': {'code': errCode['exist']['status']['code'],
                     'message': 'VM without ACPI or active SolidICE tools. Try Forced Shutdown.'}}
     except:
         self.log.error("Shutdown failed", exc_info=True)
     return {'status': {'code': doneCode['code'],
             'message': 'Machine shut down'}}
Example #13
File: vm.py Project: ekohl/vdsm
    def _lvExtend(self, block_dev, newsize=None):
        volID = None
        for d in self._devices[DISK_DEVICES]:
            if not d.blockDev: continue
            if d.name != block_dev: continue
            if newsize is None:
                newsize = (config.getint('irs', 'volume_utilization_chunk_mb')
                           + (d.apparentsize + 2**20 - 1) / 2**20)
            # TODO cap newsize by max volume size
            volDict = {'poolID': d.poolID, 'domainID': d.domainID,
                       'imageID': d.imageID, 'volumeID': d.volumeID}
            d.needExtend = True
            d.reqsize = newsize
            # sendExtendMsg expects size in bytes
            self.cif.irs.sendExtendMsg(d.poolID, volDict, newsize * 2**20,
                                           self._afterLvExtend)
            self.log.debug('%s/%s (%s): apparentsize %s req %s', d.domainID,
                           d.volumeID, d.name, d.apparentsize / constants.MEGAB,
                           newsize) #in MiB

            volID = d.volumeID
            break

        # store most recently requested size in conf, to be re-requested on
        # migration destination
        for dev in self.conf['devices']:
            if dev['type'] == DISK_DEVICES and dev.get('volumeID') == volID:
                dev['reqsize'] = str(newsize)
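
The default newsize above is the configured chunk size plus the drive's apparent size rounded up to whole MiB. A short worked sketch (the chunk size is an illustrative value, not read from a real config):

# Worked example of the default newsize computation in _lvExtend.
chunk_mb = 1024                       # assumed irs/volume_utilization_chunk_mb
apparentsize = 3 * 2**30 + 1          # apparent size in bytes, just over 3 GiB
newsize = chunk_mb + (apparentsize + 2**20 - 1) // 2**20   # ceil bytes -> MiB
assert newsize == 1024 + 3073         # one extra byte rounds 3072 MiB up to 3073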
Example #14
    def _wait_for_shutting_down_vms(self):
        """
        Wait loop checking remaining VMs in vm container

        This helper method increases the probability that the engine
        properly acknowledges that all VMs were terminated by the
        host shutdown.

        The VMs are shut down by an external service: libvirt-guests.
        The service pauses the system shutdown during systemd shutdown
        and gracefully shuts down the running VMs.

        This method applies only when the host is shutting down.
        If the host is running, the method returns immediately.
        """
        # How long to wait before releasing the shutdown; we wait in
        # whole seconds. If the config value is not present, do not wait.
        timeout = config.getint('vars', 'timeout_engine_clear_vms')
        # time to wait in the final phase in seconds
        # it allows host to flush its final state to the engine
        final_wait = 2

        if not host_in_shutdown():
            return

        self.log.info('host in shutdown waiting')

        for _ in range((timeout - final_wait) * 10):
            if not self.vmContainer:
                # once all VMs are cleared exit
                break
            time.sleep(0.1)

        time.sleep(final_wait)
Example #15
    def _setupVdsConnection(self):
        if self.hibernating:
            return

        hostPort = vdscli.cannonizeHostPort(
            self._dst,
            config.getint('addresses', 'management_port'))
        self.remoteHost, port = hostPort.rsplit(':', 1)

        try:
            client = self._createClient(port)
            requestQueues = config.get('addresses', 'request_queues')
            requestQueue = requestQueues.split(",")[0]
            self._destServer = jsonrpcvdscli.connect(requestQueue, client)
            self.log.debug('Initiating connection with destination')
            self._destServer.ping()

        except (JsonRpcBindingsError, JsonRpcNoResponseError):
            if config.getboolean('vars', 'ssl'):
                self._destServer = vdscli.connect(
                    hostPort,
                    useSSL=True,
                    TransportClass=kaxmlrpclib.TcpkeepSafeTransport)
            else:
                self._destServer = kaxmlrpclib.Server('http://' + hostPort)

        self.log.debug('Destination server is: ' + hostPort)
Example #16
    def _recoverExistingVms(self):
        start_time = utils.monotonic_time()
        try:
            self.log.debug('recovery: started')

            # Starting up libvirt might take long when host under high load,
            # we prefer running this code in external thread to avoid blocking
            # API response.
            mog = min(config.getint('vars', 'max_outgoing_migrations'),
                      numa.cpu_topology().cores)
            migration.SourceThread.setMaxOutgoingMigrations(mog)

            recovery.all_vms(self)

            # recover stage 3: waiting for domains to go up
            self._waitForDomainsUp()

            recovery.clean_vm_files(self)

            self._recovery = False

            # Now, if we have VMs to restore, we should wait for the pool
            # connection and then prepare all volumes.
            # Actually, we need it just to get the resources for future
            # volume manipulations.
            self._waitForStoragePool()

            self._preparePathsForRecoveredVMs()

            self.log.info('recovery: completed in %is',
                          utils.monotonic_time() - start_time)

        except:
            self.log.exception("recovery: failed")
            raise
Example #17
    def _perform_migration(self, duri, muri):
        if self._vm.hasSpice and self._vm.conf.get('clientIp'):
            SPICE_MIGRATION_HANDOVER_TIME = 120
            self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

        maxBandwidth = config.getint('vars', 'migration_max_bandwidth')
        # FIXME: there is still a race here with libvirt: if we call
        # stop() and libvirt's migrateToURI3 has not started yet, we may
        # report the migration as stopped, but it will still start on
        # the libvirt side.
        self._preparingMigrationEvt = False
        if not self._migrationCanceledEvt:
            # TODO: use libvirt constants when bz#1222795 is fixed
            params = {VIR_MIGRATE_PARAM_URI: str(muri),
                      VIR_MIGRATE_PARAM_BANDWIDTH: maxBandwidth}

            flags = (libvirt.VIR_MIGRATE_LIVE |
                     libvirt.VIR_MIGRATE_PEER2PEER |
                     (libvirt.VIR_MIGRATE_TUNNELLED if
                         self._tunneled else 0) |
                     (libvirt.VIR_MIGRATE_ABORT_ON_ERROR if
                         self._abortOnError else 0) |
                     (libvirt.VIR_MIGRATE_COMPRESSED if
                         self._compressed else 0) |
                     (libvirt.VIR_MIGRATE_AUTO_CONVERGE if
                         self._autoConverge else 0))

            self._vm._dom.migrateToURI3(duri, params, flags)
        else:
            self._raiseAbortError()
Example #18
    def _perform_migration(self, duri, muri):
        if self._vm.hasSpice and self._vm.conf.get('clientIp'):
            SPICE_MIGRATION_HANDOVER_TIME = 120
            self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

        maxBandwidth = config.getint('vars', 'migration_max_bandwidth')
        # FIXME: there is still a race here with libvirt: if we call
        # stop() and libvirt's migrateToURI2 has not started yet, we may
        # report the migration as stopped, but it will still start on
        # the libvirt side.
        self._preparingMigrationEvt = False
        if not self._migrationCanceledEvt:
            self._vm._dom.migrateToURI2(
                duri, muri, None,
                libvirt.VIR_MIGRATE_LIVE |
                libvirt.VIR_MIGRATE_PEER2PEER |
                (libvirt.VIR_MIGRATE_TUNNELLED if
                    self._tunneled else 0) |
                (libvirt.VIR_MIGRATE_ABORT_ON_ERROR if
                    self._abortOnError else 0) |
                (libvirt.VIR_MIGRATE_COMPRESSED if
                    self._compressed else 0) |
                (libvirt.VIR_MIGRATE_AUTO_CONVERGE if
                    self._autoConverge else 0),
                None, maxBandwidth)
        else:
            self._raiseAbortError()
Example #19
def _mem_committed(mem_size_mb):
    """
    Legacy algorithm found in oVirt <= 4.1
    """
    memory = mem_size_mb
    memory += config.getint('vars', 'guest_ram_overhead')
    return 2 ** 20 * memory
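
A quick worked example of the formula (the overhead value is an assumption, not necessarily the configured default):

# _mem_committed returns bytes: (guest memory + fixed overhead in MiB) * 2**20.
guest_ram_overhead = 65               # assumed vars/guest_ram_overhead, in MiB
mem_size_mb = 2048
committed = 2 ** 20 * (mem_size_mb + guest_ram_overhead)
assert committed == 2215641088        # 2113 MiB expressed in bytes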
Example #20
    def calculate_volume_alloc_size(cls, preallocate, capacity, initial_size):
        """ Calculate the allocation size in mb of the volume
        'preallocate' - Sparse or Preallocated
        'capacity' - the volume size in blocks
        'initial_size' - optional, if provided the initial allocated
                         size in blocks for sparse volumes
         """
        if initial_size and preallocate == sc.PREALLOCATED_VOL:
            log.error("Initial size is not supported for preallocated volumes")
            raise se.InvalidParameterException("initial size",
                                               initial_size)

        if initial_size:
            capacity_bytes = capacity * sc.BLOCK_SIZE
            initial_size_bytes = initial_size * sc.BLOCK_SIZE
            max_size = cls.max_size(capacity_bytes, sc.COW_FORMAT)
            if initial_size_bytes > max_size:
                log.error("The requested initial %s is bigger "
                          "than the max size %s", initial_size_bytes, max_size)
                raise se.InvalidParameterException("initial size",
                                                   initial_size)

        if preallocate == sc.SPARSE_VOL:
            if initial_size:
                initial_size = int(initial_size * QCOW_OVERHEAD_FACTOR)
                alloc_size = (utils.round(initial_size, BLOCKS_TO_MB) //
                              BLOCKS_TO_MB)
            else:
                alloc_size = config.getint("irs",
                                           "volume_utilization_chunk_mb")
        else:
            alloc_size = utils.round(capacity, BLOCKS_TO_MB) // BLOCKS_TO_MB

        return alloc_size
Example #21
    def calculate_volume_alloc_size(cls, preallocate, capacity, initial_size):
        """ Calculate the allocation size in mb of the volume
        'preallocate' - Sparse or Preallocated
        'capacity' - the volume size in sectors
        'initial_size' - optional, if provided the initial allocated
                         size in sectors for sparse volumes
         """
        if initial_size and initial_size > capacity:
            log.error("The volume size %s is smaller "
                      "than the requested initial size %s",
                      capacity, initial_size)
            raise se.InvalidParameterException("initial size",
                                               initial_size)

        if initial_size and preallocate == sc.PREALLOCATED_VOL:
            log.error("Initial size is not supported for preallocated volumes")
            raise se.InvalidParameterException("initial size",
                                               initial_size)

        if preallocate == sc.SPARSE_VOL:
            if initial_size:
                initial_size = int(initial_size * QCOW_OVERHEAD_FACTOR)
                alloc_size = ((initial_size + SECTORS_TO_MB - 1)
                              / SECTORS_TO_MB)
            else:
                alloc_size = config.getint("irs",
                                           "volume_utilization_chunk_mb")
        else:
            alloc_size = (capacity + SECTORS_TO_MB - 1) / SECTORS_TO_MB

        return alloc_size
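
For the sparse branch with an explicit initial size, a short worked sketch; the constant values below are assumptions (512-byte sectors, a 10% QCOW overhead factor), not taken from this excerpt:

# Worked example: sparse volume with an explicit initial size, in sectors.
QCOW_OVERHEAD_FACTOR = 1.1            # assumed overhead factor
SECTORS_TO_MB = 2048                  # 512-byte sectors per MiB (assumption)
initial_size = 4 * 2048               # caller asks for 4 MiB, in sectors
padded = int(initial_size * QCOW_OVERHEAD_FACTOR)            # 9011 sectors
alloc_size = (padded + SECTORS_TO_MB - 1) // SECTORS_TO_MB   # ceil -> 5 MiB
assert alloc_size == 5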
Example #22
 def __init__(self, name=None):
     self._name = name or config.get("containers", "network_name")
     self._gw = config.get("containers", "network_gateway")
     self._nic = config.get("containers", "network_interface")
     self._subnet = config.get("containers", "network_subnet")
     self._mask = config.getint("containers", "network_mask")
     self._existing = False
Example #23
 def _loadBindingJsonRpc(self):
     from BindingJsonRpc import BindingJsonRpc
     from Bridge import DynamicBridge
     ip = config.get('addresses', 'management_ip')
     port = config.getint('addresses', 'json_port')
     conf = [('tcp', {"ip": ip, "port": port})]
     self.bindings['json'] = BindingJsonRpc(DynamicBridge(), conf)
Example #24
    def _setupVdsConnection(self):
        if self.hibernating:
            return

        # FIXME: The port will depend on the binding being used.
        # This assumes xmlrpc
        hostPort = vdscli.cannonizeHostPort(
            self._dst,
            config.getint('addresses', 'management_port'))
        self.remoteHost, _ = hostPort.rsplit(':', 1)

        if config.getboolean('vars', 'ssl'):
            self._destServer = vdscli.connect(
                hostPort,
                useSSL=True,
                TransportClass=kaxmlrpclib.TcpkeepSafeTransport)
        else:
            self._destServer = kaxmlrpclib.Server('http://' + hostPort)
        self.log.debug('Destination server is: ' + hostPort)
        try:
            self.log.debug('Initiating connection with destination')
            status = self._destServer.getVmStats(self._vm.id)
            if not status['status']['code']:
                self.log.error("Machine already exists on the destination")
                self.status = errCode['exist']
        except Exception:
            self.log.exception("Error initiating connection")
            self.status = errCode['noConPeer']
Example #25
 def _autodelete_if_required(self):
     if self.autodelete:
         timeout = config.getint("jobs", "autodelete_delay")
         if timeout >= 0:
             logging.info("Job %r will be deleted in %d seconds",
                          self.id, timeout)
             _scheduler.schedule(timeout, self._delete)
Example #26
 def __init__(self, name=None):
     self._name = name or config.get(
         'containers', 'network_name')
     self._gw = config.get('containers', 'network_gateway')
     self._nic = config.get('containers', 'network_interface')
     self._subnet = config.get('containers', 'network_subnet')
     self._mask = config.getint('containers', 'network_mask')
     self._existing = False
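
These getters read an ini-style [containers] section from vdsm's configuration. A hypothetical snippet of such a section follows; the key names come from the code above, while the values (and whether your deployment carries this section at all) are assumptions:

[containers]
network_name = vdsm-net
network_gateway = 10.1.0.1
network_interface = eth0
network_subnet = 10.1.0.0
network_mask = 24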
Example #27
 def getMaximumSupportedDomains(self):
     msdInfo = self.masterDomain.getInfo()
     msdType = sd.name2type(msdInfo["type"])
     msdVersion = int(msdInfo["version"])
     if msdType in sd.BLOCK_DOMAIN_TYPES and msdVersion in blockSD.VERS_METADATA_LV:
         return MAX_DOMAINS
     else:
         return config.getint("irs", "maximum_domains_in_pool")
Example #28
def _memory_viewer():
    cherrypy.tree.mount(dowser.Root())

    cherrypy.config.update({
        'server.socket_host': '0.0.0.0',
        'server.socket_port': config.getint('devel', 'memory_profile_port')})

    cherrypy.engine.start()
Example #29
    def monitor_migration(self):
        def update_progress(remaining, total):
            if remaining == 0 and total:
                return 100
            progress = 100 - 100 * remaining / total if total else 0
            return progress if (progress < 100) else 99

        self._vm.log.debug('starting migration monitor thread')

        memSize = int(self._vm.conf['memSize'])
        maxTimePerGiB = config.getint('vars',
                                      'migration_max_time_per_gib_mem')
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) / 1024
        lastProgressTime = time.time()
        lowmark = None
        self._execute_init(self._conv_schedule['init'])

        while not self._stop.isSet():
            self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            (jobType, timeElapsed, _,
             dataTotal, dataProcessed, dataRemaining,
             memTotal, memProcessed, memRemaining,
             fileTotal, fileProcessed, _) = self._vm._dom.jobInfo()
            # from libvirt sources: data* = file* + mem*.
            # docs can be misleading due to misaligned lines.
            now = time.time()
            if not self._use_conv_schedule and\
                    (0 < migrationMaxTime < now - self._startTime):
                self._vm.log.warn('The migration took %d seconds which is '
                                  'exceeding the configured maximum time '
                                  'for migrations of %d seconds. The '
                                  'migration will be aborted.',
                                  now - self._startTime,
                                  migrationMaxTime)
                self._vm._dom.abortJob()
                self.stop()
                break
            elif (lowmark is None) or (lowmark > dataRemaining):
                lowmark = dataRemaining
                lastProgressTime = now
            else:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).'
                    ' Refer to RHBZ#919201.',
                    dataRemaining / Mbytes, lowmark / Mbytes)

            self._next_action(now - lastProgressTime)

            if self._stop.isSet():
                break

            if jobType != libvirt.VIR_DOMAIN_JOB_NONE:
                self.progress = update_progress(dataRemaining, dataTotal)

                self._vm.log.info('Migration Progress: %s seconds elapsed,'
                                  ' %s%% of data processed' %
                                  (timeElapsed / 1000, self.progress))
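
The update_progress helper above maps libvirt's remaining/total counters onto a 0-99 scale and only reports 100 once nothing remains. A standalone sketch of the same arithmetic (integer division stands in for the Python 2 division used above):

# Same progress arithmetic as update_progress, shown as a self-contained helper.
def progress(remaining, total):
    if remaining == 0 and total:
        return 100
    value = 100 - 100 * remaining // total if total else 0
    return value if value < 100 else 99

assert progress(0, 0) == 0          # no job data yet
assert progress(512, 1024) == 50    # halfway through
assert progress(1, 1024) == 99      # capped below 100 while data remains
assert progress(0, 1024) == 100     # finished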
Example #30
    def __init__(self, log):
        """
        Initialize the (single) clientIF instance

        :param log: a log object to be used for this object's logging.
        :type log: :class:`logging.Logger`
        """
        self.vmContainerLock = threading.Lock()
        self._networkSemaphore = threading.Semaphore()
        self._shutdownSemaphore = threading.Semaphore()
        self.log = log
        self._recovery = True
        self._libvirt = libvirtconnection.get()
        self._syncLibvirtNetworks()
        self.channelListener = Listener(self.log)
        self._generationID = str(uuid.uuid4())
        self._initIRS()
        try:
            self.vmContainer = {}
            ifids = netinfo.nics() + netinfo.bondings()
            ifrates = map(netinfo.speed, ifids)
            self._hostStats = utils.HostStatsThread(
                cif=self, log=log, ifids=ifids, ifrates=ifrates)
            self._hostStats.start()
            mog = min(config.getint('vars', 'max_outgoing_migrations'),
                      caps.CpuInfo().cores())
            vm.MigrationSourceThread.setMaxOutgoingMigrations(mog)

            self.lastRemoteAccess = 0
            self._memLock = threading.Lock()
            self._enabled = True
            self.ksmMonitor = ksm.KsmMonitorThread(self)
            self._netConfigDirty = False
            threading.Thread(target=self._recoverExistingVms,
                             name='clientIFinit').start()
            self.channelListener.settimeout(config.getint('vars', 'guest_agent_timeout'))
            self.channelListener.start()
            self.threadLocal = threading.local()
            self.threadLocal.client = ''
        except:
            self.log.error('failed to init clientIF, shutting down storage dispatcher')
            if self.irs:
                self.irs.prepareForShutdown()
            raise
        self._prepareBindings()
Example #31
    def __init__(self, irs, log, scheduler):
        """
        Initialize the (single) clientIF instance

        :param irs: a Dispatcher object to be used as this object's irs.
        :type irs: :class:`vdsm.storage.dispatcher.Dispatcher`
        :param log: a log object to be used for this object's logging.
        :type log: :class:`logging.Logger`
        """
        self.vmContainerLock = threading.Lock()
        self._networkSemaphore = threading.Semaphore()
        self._shutdownSemaphore = threading.Semaphore()
        self.irs = irs
        if self.irs:
            self._contEIOVmsCB = partial(clientIF.contEIOVms, proxy(self))
            self.irs.registerDomainStateChangeCallback(self._contEIOVmsCB)
        self.log = log
        self._recovery = True
        # TODO: The guest agent related code spreads around too much. There is
        # QemuGuestAgentPoller and ChannelListener here and then many
        # instances of GuestAgent per VM in vm.py. This should be refactored
        # and operated by a single object. Ideally the distinction between
        # what is served by QEMU-GA and what is served by oVirt GA should not
        # be visible to the rest of the code.
        self.channelListener = Listener(self.log)
        self.qga_poller = QemuGuestAgentPoller(self, log, scheduler)
        self.mom = None
        self.servers = {}
        self._broker_client = None
        self._subscriptions = defaultdict(list)
        self._scheduler = scheduler
        self._unknown_vm_ids = set()
        if _glusterEnabled:
            self.gluster = gapi.GlusterApi()
        else:
            self.gluster = None
        try:
            self.vmContainer = {}
            self.lastRemoteAccess = 0
            self._enabled = True
            self._netConfigDirty = False
            self.mom = MomClient(config.get("mom", "socket_path"))
            self.mom.connect()
            secret.clear()
            concurrent.thread(self._recoverThread, name='vmrecovery').start()
            self.channelListener.settimeout(
                config.getint('vars', 'guest_agent_timeout'))
            self.channelListener.start()
            self.qga_poller.start()
            self.threadLocal = threading.local()
            self.threadLocal.client = ''

            host = config.get('addresses', 'management_ip')
            port = config.getint('addresses', 'management_port')

            # When IPv6 is not enabled, fallback to listen on IPv4 address
            try:
                self._createAcceptor(host, port)
            except socket.error as e:
                if e.errno == errno.EAFNOSUPPORT and host in ('::', '::1'):
                    fallback_host = '0.0.0.0'
                    self._createAcceptor(fallback_host, port)
                else:
                    raise

            self._prepareHttpServer()
            self._prepareJSONRPCServer()
            self._connectToBroker()
        except:
            self.log.error('failed to init clientIF, '
                           'shutting down storage dispatcher')
            if self.irs:
                self.irs.prepareForShutdown()
            raise
Example #32
class MonitorThread(threading.Thread):
    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime):
        super(MonitorThread, self).__init__()
        self._stop = threading.Event()
        self._vm = vm
        self._startTime = startTime
        self.daemon = True
        self.progress = 0

    @property
    def enabled(self):
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    def run(self):
        if self.enabled:
            self.monitor_migration()
        else:
            self._vm.log.debug('migration monitor thread disabled'
                               ' (monitoring interval set to 0)')

    def monitor_migration(self):
        def calculateProgress(remaining, total):
            if remaining == 0 and total:
                return 100
            progress = 100 - 100 * remaining / total if total else 0
            return progress if (progress < 100) else 99

        self._vm.log.debug('starting migration monitor thread')

        memSize = int(self._vm.conf['memSize'])
        maxTimePerGiB = config.getint('vars',
                                      'migration_max_time_per_gib_mem')
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) / 1024
        lastProgressTime = time.time()
        lowmark = None
        progress_timeout = config.getint('vars', 'migration_progress_timeout')

        while not self._stop.isSet():
            self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            (jobType, timeElapsed, _,
             dataTotal, dataProcessed, dataRemaining,
             memTotal, memProcessed, memRemaining,
             fileTotal, fileProcessed, _) = self._vm._dom.jobInfo()
            # from libvirt sources: data* = file* + mem*.
            # docs can be misleading due to misaligned lines.
            abort = False
            now = time.time()
            if 0 < migrationMaxTime < now - self._startTime:
                self._vm.log.warn('The migration took %d seconds which is '
                                  'exceeding the configured maximum time '
                                  'for migrations of %d seconds. The '
                                  'migration will be aborted.',
                                  now - self._startTime,
                                  migrationMaxTime)
                abort = True
            elif (lowmark is None) or (lowmark > dataRemaining):
                lowmark = dataRemaining
                lastProgressTime = now
            elif (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warn(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.' % (now - lastProgressTime))
                abort = True

            if abort:
                self._vm._dom.abortJob()
                self.stop()
                break

            if dataRemaining > lowmark:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).'
                    ' Refer to RHBZ#919201.',
                    dataRemaining / Mbytes, lowmark / Mbytes)

            if jobType == 0:
                continue

            self.progress = calculateProgress(dataRemaining, dataTotal)

            self._vm.log.info('Migration Progress: %s seconds elapsed, %s%% of'
                              ' data processed' %
                              (timeElapsed / 1000, self.progress))

    def stop(self):
        self._vm.log.debug('stopping migration monitor thread')
        self._stop.set()
Example #33
_QEMU_OSINFO_COMMAND = 'guest-get-osinfo'
_QEMU_TIMEZONE_COMMAND = 'guest-get-timezone'
_QEMU_FSINFO_COMMAND = 'guest-get-fsinfo'
_QEMU_DISKS_COMMAND = 'guest-get-disks'

_HOST_NAME_FIELD = 'host-name'
_OS_ID_FIELD = 'id'
_TIMEZONE_OFFSET_FIELD = 'offset'
_TIMEZONE_ZONE_FIELD = 'zone'
_FS_DISK_FIELD = 'disk'
_FS_DISK_DEVICE_FIELD = 'dev'
_FS_DISK_SERIAL_FIELD = 'serial'

_GUEST_OS_WINDOWS = 'mswindows'

_WORKERS = config.getint('guest_agent', 'periodic_workers')
_TASK_PER_WORKER = config.getint('guest_agent', 'periodic_task_per_worker')
_TASKS = _WORKERS * _TASK_PER_WORKER
_MAX_WORKERS = config.getint('guest_agent', 'max_workers')

_COMMAND_TIMEOUT = config.getint('guest_agent', 'qga_command_timeout')
_INITIAL_INTERVAL = config.getint('guest_agent', 'qga_initial_info_interval')
_TASK_TIMEOUT = config.getint('guest_agent', 'qga_task_timeout')
_THROTTLING_INTERVAL = 60

from libvirt import \
    VIR_DOMAIN_GUEST_INFO_USERS,  \
    VIR_DOMAIN_GUEST_INFO_OS, \
    VIR_DOMAIN_GUEST_INFO_TIMEZONE, \
    VIR_DOMAIN_GUEST_INFO_HOSTNAME, \
    VIR_DOMAIN_GUEST_INFO_FILESYSTEM
Example #34
class GuestAgent(object):
    MAX_MESSAGE_SIZE = 2 ** 20  # 1 MiB for now
    SEEN_SHUTDOWN_TIMEOUT = config.getint('vars', 'sys_shutdown_timeout') * 2

    def __init__(self, socketName, channelListener, log, onStatusChange,
                 qgaCaps, qgaGuestInfo, api_version=None, user='******',
                 ips=''):
        self.effectiveApiVersion = min(
            api_version or _IMPLICIT_API_VERSION_ZERO,
            _MAX_SUPPORTED_API_VERSION)
        self._onStatusChange = onStatusChange
        self.log = log
        self._socketName = socketName
        self._sock = _create_socket()
        self._stopped = True
        self._status = None
        self.guestDiskMapping = {}
        self.guestInfo = {
            'username': user,
            'memUsage': 0,
            'guestCPUCount': -1,
            'guestIPs': ips,
            'guestFQDN': '',
            'session': 'Unknown',
            'appsList': (),
            'disksUsage': [],
            'netIfaces': [],
            'memoryStats': {}}
        self._agentTimestamp = 0
        self._channelListener = channelListener
        self._messageState = MessageState.NORMAL
        self.events = GuestAgentEvents(self)
        self._completion_lock = threading.Lock()
        self._completion_events = {}
        self._first_connect = threading.Event()
        self._seen_shutdown = None
        self._qgaCaps = qgaCaps
        self._qgaGuestInfo = qgaGuestInfo

    def has_seen_shutdown(self):
        if self._seen_shutdown is None:
            return True
        diff = time.time() - self._agentTimestamp
        if diff < GuestAgent.SEEN_SHUTDOWN_TIMEOUT:
            return self._seen_shutdown
        return False

    def _on_completion(self, reply_id):
        with self._completion_lock:
            event = self._completion_events.pop(reply_id, None)
        if event is not None:
            event.set()

    @property
    def can_reply(self):
        active = self.isResponsive()
        return active and self.effectiveApiVersion >= _REPLY_CAP_MIN_VERSION

    @contextlib.contextmanager
    def _waitable_message(self, wait_timeout, reply_id):
        if self.can_reply and wait_timeout is not None:
            event = threading.Event()
            with self._completion_lock:
                self._completion_events[reply_id] = event
            yield
            event.wait(wait_timeout)
            with self._completion_lock:
                self._completion_events.pop(reply_id, None)
        else:
            yield

    @property
    def guestStatus(self):
        return self._status

    @guestStatus.setter
    def guestStatus(self, value):
        oldValue = self._status
        self._status = value
        if oldValue != value and self._onStatusChange:
            self._onStatusChange()

    @property
    def guestDiskMapping(self):
        return self._guestDiskMapping

    @guestDiskMapping.setter
    def guestDiskMapping(self, value):
        self._guestDiskMapping = value
        if value:
            self._diskMappingHash = hash(json.dumps(value, sort_keys=True))
        else:
            self._diskMappingHash = None

    @property
    def diskMappingHash(self):
        return self._diskMappingHash

    def start(self):
        self.log.info("Starting connection")
        self._prepare_socket()
        self._channelListener.register(
            self._create,
            self._connect,
            self._onChannelRead,
            self._onChannelTimeout)

    def _handleAPIVersion(self, version):
        """ Handles the API version value from the heartbeat

            If the value `version` is a valid int, the highest possible
            common API version will be determined and assigned to the
            attribute `self.effectiveApiVersion` if the value has changed.
            If the value changed, the `api-version` message will be sent to
            the guest agent to notify it about the changed common API version.

            If the value of `version` is not an int, the API version support
            will be disabled by assigning _IMPLICIT_API_VERSION_ZERO to
            `self.effectiveApiVersion`

        Args:
        version - the api version reported by the guest agent
        """
        try:
            commonVersion = int(version)
        except ValueError:
            self.log.warning("Received invalid version value: %s", version)
            commonVersion = _IMPLICIT_API_VERSION_ZERO
        else:
            commonVersion = max(commonVersion, _IMPLICIT_API_VERSION_ZERO)
            commonVersion = min(commonVersion, _MAX_SUPPORTED_API_VERSION)

        if commonVersion != self.effectiveApiVersion:
            # Only update if the value changed
            self.log.info("Guest API version changed from %d to %d",
                          self.effectiveApiVersion, commonVersion)
            self.effectiveApiVersion = commonVersion
            if commonVersion != _IMPLICIT_API_VERSION_ZERO:
                # Only notify the guest agent if the API was not disabled
                self._forward('api-version', {'apiVersion': commonVersion})

    def _prepare_socket(self):
        supervdsm.getProxy().prepareVmChannel(self._socketName)

    def _create(self):
        self._sock.close()
        self._sock = _create_socket()
        return self._sock.fileno()

    def _connect(self):
        ret = False
        try:
            self._stopped = True
            self.log.debug("Attempting connection to %s", self._socketName)
            result = self._sock.connect_ex(self._socketName)
            self._first_connect.set()
            if result == 0:
                self.log.debug("Connected to %s", self._socketName)
                self._messageState = MessageState.NORMAL
                self._clearReadBuffer()
                # Report the _MAX_SUPPORTED_API_VERSION on refresh to enable
                # the other side to see that we support API versioning
                self._forward('refresh',
                              {'apiVersion': _MAX_SUPPORTED_API_VERSION})
                self._stopped = False
                ret = True
            else:
                self.log.debug("Failed to connect to %s with %d",
                               self._socketName, result)
        except socket.error as err:
            self.log.debug("Connection attempt failed: %s", err)
        return ret

    def _forward(self, cmd, args={}):
        ver = _MESSAGE_API_VERSION_LOOKUP.get(cmd, _IMPLICIT_API_VERSION_ZERO)
        if ver > self.effectiveApiVersion:
            raise GuestAgentUnsupportedMessage(cmd, ver,
                                               self.effectiveApiVersion)
        self._first_connect.wait(self._channelListener.timeout())
        args['__name__'] = cmd
        # TODO: encoding is required only on Python 3. Replace with wrapper
        # hiding this difference.
        message = (json.dumps(args) + '\n').encode('utf8')
        # TODO: socket is non-blocking, handle possible EAGAIN
        self._sock.sendall(message)
        self.log.debug('sent %r', message)

    def _handleMessage(self, message, args):
        self.log.debug("Guest's message %s: %s", message, args)
        if message == 'heartbeat':
            self.guestInfo['memUsage'] = int(args['free-ram'])
            if 'memory-stat' in args:
                for k in ('mem_total', 'mem_unused', 'mem_buffers',
                          'mem_cached', 'swap_in', 'swap_out', 'pageflt',
                          'majflt'):
                    if k not in args['memory-stat']:
                        continue
                    # Convert the value to string since 64-bit integer is not
                    # supported in XMLRPC
                    self.guestInfo['memoryStats'][k] = str(
                        args['memory-stat'][k])
                    if k == 'mem_unused':
                        self.guestInfo['memoryStats']['mem_free'] = str(
                            args['memory-stat']['mem_unused'])

            if 'apiVersion' in args:
                # The guest agent supports API Versioning
                self._handleAPIVersion(args['apiVersion'])
            elif self.effectiveApiVersion != _IMPLICIT_API_VERSION_ZERO:
                # Older versions of the guest agent (before the introduction
                # of API versioning) do not report this field
                # Disable the API if not already disabled (e.g. after
                # downgrade of the guest agent)
                self.log.debug("API versioning no longer reported by guest.")
                self.effectiveApiVersion = _IMPLICIT_API_VERSION_ZERO
            # Only change the state AFTER all data of the heartbeat has been
            # consumed
            self.guestStatus = vmstatus.UP
            if self._seen_shutdown:
                self._seen_shutdown = False
        elif message == 'host-name':
            self.guestInfo['guestName'] = args['name']
        elif message == 'os-version':
            self.guestInfo['guestOs'] = args['version']
        elif message == 'os-info':
            self.guestInfo['guestOsInfo'] = args
        elif message == 'timezone':
            self.guestInfo['guestTimezone'] = args
        elif message == 'network-interfaces':
            interfaces = []
            old_ips = ''
            for iface in args['interfaces']:
                iface['inet'] = iface.get('inet', [])
                iface['inet6'] = iface.get('inet6', [])
                interfaces.append(iface)
                # Provide the old information which includes
                # only the IP addresses.
                old_ips += ' '.join(iface['inet']) + ' '
            self.guestInfo['netIfaces'] = interfaces
            self.guestInfo['guestIPs'] = old_ips.strip()
        elif message == 'applications':
            self.guestInfo['appsList'] = tuple(args['applications'])
            # Fake QEMU-GA if it is not reported
            if not any(bool(_qga_re.match(x))
                       for x in self.guestInfo['appsList']):
                qga_caps = self._qgaCaps()
                if qga_caps is not None and qga_caps['version'] is not None:
                    # NOTE: this is a tuple
                    self.guestInfo['appsList'] = \
                        self.guestInfo['appsList'] + \
                        ('qemu-guest-agent-%s' % qga_caps['version'],)
        elif message == 'active-user':
            currentUser = args['name']
            if ((currentUser != self.guestInfo['username']) and
                not (currentUser == 'Unknown' and
                     self.guestInfo['username'] == 'None')):
                self.guestInfo['username'] = currentUser
                self.guestInfo['lastLogin'] = time.time()
            self.log.debug("username: %s", repr(self.guestInfo['username']))
        elif message == 'session-logon':
            self.guestInfo['session'] = "UserLoggedOn"
        elif message == 'session-lock':
            self.guestInfo['session'] = "Locked"
        elif message == 'session-unlock':
            self.guestInfo['session'] = "Active"
        elif message == 'session-logoff':
            self.guestInfo['session'] = "LoggedOff"
        elif message == 'uninstalled':
            self.log.debug("guest agent was uninstalled.")
            self.guestInfo['appsList'] = ()
        elif message == 'session-startup':
            self._seen_shutdown = False
            self.log.debug("Guest system is started or restarted.")
        elif message == 'fqdn':
            self.guestInfo['guestFQDN'] = args['fqdn']
        elif message == 'session-shutdown':
            self._seen_shutdown = True
            self.log.debug("Guest system shuts down.")
        elif message == 'containers':
            self.guestInfo['guestContainers'] = args['list']
        elif message == 'disks-usage':
            disks = []
            for disk in args['disks']:
                # Converting to string because XML-RPC doesn't support 64-bit
                # integers.
                disk['total'] = str(disk['total'])
                disk['used'] = str(disk['used'])
                disks.append(disk)
            self.guestInfo['disksUsage'] = disks
            self.guestDiskMapping = args.get('mapping', {})
        elif message == 'number-of-cpus':
            self.guestInfo['guestCPUCount'] = int(args['count'])
        elif message == 'completion':
            self._on_completion(args.pop('reply_id', None))
        else:
            self.log.error('Unknown message type %s', message)

    def stop(self):
        self.log.info("Stopping connection")
        self._stopped = True
        try:
            self._channelListener.unregister(self._sock.fileno())
        except socket.error as e:
            if e.args[0] == errno.EBADF:
                # socket was already closed
                pass
            else:
                raise
        else:
            self._sock.close()

    def isResponsive(self):
        return time.time() - self._agentTimestamp < 120

    def getStatus(self):
        return self.guestStatus

    def getGuestInfo(self):
        # This is rather hacky, but for now we want to prefer information from
        # oVirt GA over QEMU-GA
        info = {
            'username': 'Unknown',
            'session': 'Unknown',
            'memUsage': 0,
            'guestCPUCount': -1,
            'appsList': (),
            'guestIPs': '',
            'guestFQDN': ''}
        qga = self._qgaGuestInfo()
        if qga is not None:
            info.update(qga)
        if self.isResponsive():
            info.update(self.guestInfo)
        else:
            if len(self.guestInfo['appsList']) > 0:
                info['appsList'] = self.guestInfo['appsList']
            if len(self.guestInfo['guestIPs']) > 0:
                info['guestIPs'] = self.guestInfo['guestIPs']
            if len(self.guestInfo['guestFQDN']) > 0:
                info['guestFQDN'] = self.guestInfo['guestFQDN']
        return utils.picklecopy(info)

    def onReboot(self):
        self.guestStatus = vmstatus.REBOOT_IN_PROGRESS
        self.guestInfo['lastUser'] = '' + self.guestInfo['username']
        self.guestInfo['username'] = 'Unknown'
        self.guestInfo['lastLogout'] = time.time()

    def desktopLock(self):
        try:
            self.log.debug("desktopLock called")
            self._forward("lock-screen")
        except Exception as e:
            if isinstance(e, socket.error) and e.args[0] == errno.EBADF:
                self.log.debug('desktopLock failed - Socket not connected')
                return  # Expected when not connected/closed socket
            self.log.exception("desktopLock failed with unexpected exception")

    def desktopLogin(self, domain, user, password):
        try:
            self.log.debug("desktopLogin called")
            if domain != '':
                username = user + '@' + domain
            else:
                username = user
            self._forward('login', {'username': username,
                                    "password": password.value})
        except:
            self.log.exception("desktopLogin failed")

    def desktopLogoff(self, force):
        try:
            self.log.debug("desktopLogoff called")
            self._forward('log-off')
        except:
            self.log.exception("desktopLogoff failed")

    def desktopShutdown(self, timeout, msg, reboot):
        try:
            self.log.debug("desktopShutdown called")
            self._forward('shutdown', {'timeout': timeout, 'message': msg,
                                       'reboot': str(reboot)})
        except:
            self.log.exception("desktopShutdown failed")

    def sendHcCmdToDesktop(self, cmd):
        try:
            self.log.debug("sendHcCmdToDesktop('%s')" % (cmd))
            self._forward(str(cmd))
        except:
            self.log.exception("sendHcCmdToDesktop failed")

    def setNumberOfCPUs(self, count):
        self.log.debug("setNumberOfCPUs('%d') called", count)
        self._forward('set-number-of-cpus', {'count': count})

    def send_lifecycle_event(self, event, **kwargs):
        self.log.debug('send_lifecycle_event %s called', event)
        try:
            message = {'type': event}
            message.update(kwargs)
            self._forward('lifecycle-event', message)
        except GuestAgentUnsupportedMessage:
            # This is ok, that guest agent doesn't know yet how to handle
            # the message
            pass
        except socket.error as e:
            self.log.debug("Failed to forward lifecycle-event: %s", e)

    def _onChannelTimeout(self):
        self.guestInfo['memUsage'] = 0
        if self.guestStatus not in (vmstatus.POWERING_DOWN,
                                    vmstatus.REBOOT_IN_PROGRESS):
            self.log.debug("Guest connection timed out")
            self.guestStatus = None

    def _clearReadBuffer(self):
        self._buffer = []
        self._bufferSize = 0

    def _processMessage(self, line):
        try:
            (message, args) = self._parseLine(line)
            self._agentTimestamp = time.time()
            self._handleMessage(message, args)
        except ValueError as err:
            self.log.error("%s: %s" % (err, repr(line)))

    def _handleData(self, data):
        while (not self._stopped) and b'\n' in data:
            line, data = data.split(b'\n', 1)
            line = b''.join(self._buffer) + line
            self._clearReadBuffer()
            if self._messageState is MessageState.TOO_BIG:
                self._messageState = MessageState.NORMAL
                self.log.warning("Not processing current message because it "
                                 "was too big")
            else:
                self._processMessage(line)

        self._buffer.append(data)
        self._bufferSize += len(data)

        if self._bufferSize >= self.MAX_MESSAGE_SIZE:
            self.log.warning("Discarding buffer with size: %d because the "
                             "message reached maximum size of %d bytes before "
                             "message end was reached.", self._bufferSize,
                             self.MAX_MESSAGE_SIZE)
            self._messageState = MessageState.TOO_BIG
            self._clearReadBuffer()

    def _onChannelRead(self):
        result = True
        try:
            while not self._stopped:
                data = self._sock.recv(2 ** 16)
                # The connection is broken when recv returns no data,
                # so we move ourselves to the stopped state.
                if not data:
                    self._stopped = True
                    self.log.debug("Disconnected from %s", self._socketName)
                    result = False
                else:
                    self._handleData(data)
        except socket.error as err:
            if err.errno not in (errno.EWOULDBLOCK, errno.EAGAIN):
                raise

        return result

    def _parseLine(self, line):
        # Deal with any bad UTF8 encoding from the (untrusted) guest,
        # by replacing them with the Unicode replacement character
        uniline = line.decode('utf8', 'replace')
        args = json.loads(uniline)
        # Filter out any characters in the untrusted guest response
        # that aren't permitted in XML.  This must be done _after_ the
        # JSON decoding, since otherwise JSON's \u escape decoding
        # could be used to generate the bad characters
        args = _filterObject(args)
        name = args['__name__']
        del args['__name__']
        return (name, args)
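
# A minimal standalone sketch of the newline-delimited JSON framing used by
# _forward() and _parseLine() above: each message is a JSON object carrying
# its command in the '__name__' key, UTF-8 encoded and terminated by '\n'.
# The helper names (build_message, parse_line) are illustrative only and are
# not part of vdsm.
import json


def build_message(cmd, args=None):
    # Mirror _forward(): copy the args so the caller's dict is not mutated.
    payload = dict(args or {})
    payload['__name__'] = cmd
    return (json.dumps(payload) + '\n').encode('utf8')


def parse_line(line):
    # Mirror _parseLine(): tolerate bad UTF-8 from the (untrusted) guest.
    args = json.loads(line.decode('utf8', 'replace'))
    name = args.pop('__name__')
    return name, args


if __name__ == '__main__':
    wire = build_message('heartbeat', {'free-ram': 2048})
    assert parse_line(wire.rstrip(b'\n')) == ('heartbeat', {'free-ram': 2048})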
Exemple #35
0
class MonitorThread(object):
    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime, conv_schedule):
        super(MonitorThread, self).__init__()
        self._stop = threading.Event()
        self._vm = vm
        self._dom = DomainAdapter(self._vm)
        self._startTime = startTime
        self.daemon = True
        self.progress = None
        self._conv_schedule = conv_schedule
        self._thread = concurrent.thread(self.run,
                                         name='migmon/' + self._vm.id[:8])

    def start(self):
        self._thread.start()

    def join(self):
        self._thread.join()

    @property
    def enabled(self):
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    @logutils.traceback()
    def run(self):
        if self.enabled:
            self._vm.log.debug('starting migration monitor thread')
            try:
                self.monitor_migration()
            except virdomain.NotConnectedError as e:
                # In case the VM is stopped during migration, there is a race
                # between domain disconnection and stopping the monitoring
                # thread. Then the domain may no longer be connected when
                # monitor_migration loop tries to access it. That's harmless
                # and shouldn't bubble up, let's just finish the thread.
                self._vm.log.debug('domain disconnected in monitor thread: %s',
                                   e)
            self._vm.log.debug('stopped migration monitor thread')
        else:
            self._vm.log.info('migration monitor thread disabled'
                              ' (monitoring interval set to 0)')

    def monitor_migration(self):
        lowmark = None
        initial_iteration = last_iteration = None

        self._execute_init(self._conv_schedule['init'])

        while not self._stop.isSet():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm.job_stats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)
            if initial_iteration is None:
                # The initial iteration number from libvirt is not
                # fixed, since it may include iterations from
                # previously cancelled migrations.
                initial_iteration = last_iteration = progress.mem_iteration

            self._vm.send_migration_status_event()

            if self._vm.post_copy != PostCopyPhase.NONE:
                # Post-copy mode is a final state of a migration -- it either
                # completes or fails and stops the VM, there is no way to
                # continue with the migration in either case.  So we won't
                # handle any further schedule actions once post-copy is
                # successfully started.  It's still recommended to put the
                # abort action after the post-copy action in the schedule, for
                # the case when it's not possible to switch to the post-copy
                # mode for some reason.
                if self._vm.post_copy == PostCopyPhase.RUNNING:
                    # If post-copy is not RUNNING then we are in the interim
                    # phase (which should be short) between initiating the
                    # post-copy migration and the actual start of the post-copy
                    # migration.  Nothing needs to be done in that case.
                    self._vm.log.debug(
                        'Post-copy migration still in progress: %d',
                        progress.data_remaining)
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                lowmark = progress.data_remaining
            else:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).', progress.data_remaining // MiB,
                    lowmark // MiB)

            if not self._vm.post_copy and\
               progress.mem_iteration > last_iteration:
                last_iteration = progress.mem_iteration
                current_iteration = last_iteration - initial_iteration
                self._vm.log.debug('new iteration: %i', current_iteration)
                self._next_action(current_iteration)

            if self._stop.isSet():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)

    def stop(self):
        self._vm.log.debug('stopping migration monitor thread')
        self._stop.set()

    def _next_action(self, stalling):
        head = self._conv_schedule['stalling'][0]

        self._vm.log.debug(
            'Stalling for %d iterations, '
            'checking to make next action: '
            '%s', stalling, head)
        if head['limit'] < stalling:
            self._execute_action_with_params(head['action'])
            self._conv_schedule['stalling'].pop(0)
            self._vm.log.debug('setting conv schedule to: %s',
                               self._conv_schedule)

    def _execute_init(self, init_actions):
        for action_with_params in init_actions:
            self._execute_action_with_params(action_with_params)

    def _execute_action_with_params(self, action_with_params):
        action = str(action_with_params['name'])
        vm = self._vm
        if action == CONVERGENCE_SCHEDULE_SET_DOWNTIME:
            downtime = int(action_with_params['params'][0])
            vm.log.debug('Setting downtime to %d', downtime)
            # pylint: disable=no-member
            self._dom.migrateSetMaxDowntime(downtime, 0)
        elif action == CONVERGENCE_SCHEDULE_POST_COPY:
            if not self._vm.switch_migration_to_post_copy():
                # Do nothing for now; the next action will be invoked after a
                # while
                vm.log.warning('Failed to switch to post-copy migration')
        elif action == CONVERGENCE_SCHEDULE_SET_ABORT:
            vm.log.warning('Aborting migration')
            vm.abort_domjob()
            self.stop()
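
# A minimal sketch of the convergence-schedule structure consumed by the
# MonitorThread above: the 'init' actions are executed once via
# _execute_init(), while 'stalling' entries are popped by _next_action()
# as soon as the number of stalling iterations exceeds their 'limit'.
# The concrete limits and downtime values below are illustrative only.
example_schedule = {
    'init': [
        {'name': 'setDowntime', 'params': ['100']},
    ],
    'stalling': [
        {'limit': 1, 'action': {'name': 'setDowntime', 'params': ['150']}},
        {'limit': 2, 'action': {'name': 'setDowntime', 'params': ['200']}},
        {'limit': 3, 'action': {'name': 'abort', 'params': []}},
    ],
}


def next_action(schedule, stalling_iterations):
    # Mirror _next_action(): act on the head entry only when its limit has
    # been exceeded, then drop it from the schedule.
    head = schedule['stalling'][0]
    if head['limit'] < stalling_iterations:
        schedule['stalling'].pop(0)
        return head['action']
    return None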
Exemple #36
0
import logging
import os
import threading
import types
import weakref

from functools import partial
import six

import ioprocess

from vdsm import constants
from vdsm import utils
from vdsm.common.osutils import get_umask
from vdsm.config import config
from vdsm.storage import constants as sc
from vdsm.storage import exception as se

DEFAULT_TIMEOUT = config.getint("irs", "process_pool_timeout")
IOPROC_IDLE_TIME = config.getint("irs", "max_ioprocess_idle_time")
HELPERS_PER_DOMAIN = config.getint("irs", "process_pool_max_slots_per_domain")
MAX_QUEUED = config.getint("irs", "process_pool_max_queued_slots_per_domain")

_procPoolLock = threading.Lock()
_procPool = {}
_refProcPool = {}

elapsed_time = lambda: os.times()[4]

log = logging.getLogger('storage.oop')


def stop():
    """
Exemple #37
0
 def __init__(self,
              vm,
              dst='',
              dstparams='',
              mode=MODE_REMOTE,
              method=METHOD_ONLINE,
              tunneled=False,
              dstqemu='',
              abortOnError=False,
              consoleAddress=None,
              compressed=False,
              autoConverge=False,
              recovery=False,
              **kwargs):
     self.log = vm.log
     self._vm = vm
     self._dst = dst
     self._mode = mode
     if method != METHOD_ONLINE:
         self.log.warning(
             'migration method %s is deprecated, forced to "online"',
             method)
     self._dstparams = dstparams
     self._enableGuestEvents = kwargs.get('enableGuestEvents', False)
     # TODO: conv.tobool shouldn't be used in this constructor, the
     # conversions should be handled properly in the API layer
     self._consoleAddress = consoleAddress
     self._dstqemu = dstqemu
     self._downtime = kwargs.get('downtime') or \
         config.get('vars', 'migration_downtime')
     self._maxBandwidth = int(
         kwargs.get('maxBandwidth')
         or config.getint('vars', 'migration_max_bandwidth'))
     self._incomingLimit = kwargs.get('incomingLimit')
     self._outgoingLimit = kwargs.get('outgoingLimit')
     self.status = {
         'status': {
             'code': 0,
             'message': 'Migration in progress'
         }
     }
     # we need to guard against concurrent updates only
     self._lock = threading.Lock()
     self._progress = 0
     self._thread = concurrent.thread(self.run,
                                      name='migsrc/' + self._vm.id[:8])
     self._preparingMigrationEvt = True
     self._migrationCanceledEvt = threading.Event()
     self._monitorThread = None
     self._destServer = None
     self._convergence_schedule = {'init': [], 'stalling': []}
     self._use_convergence_schedule = False
     if 'convergenceSchedule' in kwargs:
         self._convergence_schedule = kwargs.get('convergenceSchedule')
         self._use_convergence_schedule = True
         self.log.debug('convergence schedule set to: %s',
                        str(self._convergence_schedule))
     self._started = False
     self._failed = False
     self._recovery = recovery
     tunneled = conv.tobool(tunneled)
     abortOnError = conv.tobool(abortOnError)
     compressed = conv.tobool(compressed)
     autoConverge = conv.tobool(autoConverge)
     self._migration_flags = self._calculate_migration_flags(
         tunneled, abortOnError, compressed, autoConverge)
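
# The constructor above picks the downtime with
#     kwargs.get('downtime') or config.get('vars', 'migration_downtime')
# which falls back to the configured default not only when 'downtime' is
# absent but also when it is 0 or an empty string.  A small sketch of the
# difference; 'DEFAULT' stands in for the configured value.
def downtime_with_or(kwargs, default='DEFAULT'):
    return kwargs.get('downtime') or default


def downtime_explicit(kwargs, default='DEFAULT'):
    value = kwargs.get('downtime')
    return default if value is None else value


assert downtime_with_or({'downtime': 0}) == 'DEFAULT'
assert downtime_explicit({'downtime': 0}) == 0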
Exemple #38
0
    def _regular_run(self):
        self.log.debug("Starting migration source thread")
        self._recovery = False
        self._update_outgoing_limit()
        try:
            startTime = time.time()
            # Guest agent API version must be updated before _srcDomXML
            # is created to have the version in _srcDomXML metadata.
            self._vm.update_guest_agent_api_version()
            machineParams = self._setupRemoteMachineParams()
            self._setupVdsConnection()
            self._prepareGuest()

            while not self.started:
                try:
                    self.log.info("Migration semaphore: acquiring")
                    with SourceThread.ongoingMigrations:
                        self.log.info("Migration semaphore: acquired")
                        timeout = config.getint(
                            'vars', 'guest_lifecycle_event_reply_timeout')
                        if self.hibernating:
                            self._vm.guestAgent.events.before_hibernation(
                                wait_timeout=timeout)
                        elif self._enableGuestEvents:
                            self._vm.guestAgent.events.before_migration(
                                wait_timeout=timeout)
                        if self._migrationCanceledEvt.is_set():
                            self._raiseAbortError()
                        self.log.debug("migration semaphore acquired "
                                       "after %d seconds",
                                       time.time() - startTime)
                        self._startUnderlyingMigration(
                            time.time(), machineParams
                        )
                        self._finishSuccessfully(machineParams)
                except libvirt.libvirtError as e:
                    if e.get_error_code() == libvirt.VIR_ERR_OPERATION_ABORTED:
                        self.status = response.error(
                            'migCancelErr', message='Migration canceled')
                    # This error occurs when hypervisor cannot start
                    # the migration. For example, when a domain with the same
                    # name already exists on the destination.
                    elif e.get_error_code() == \
                            libvirt.VIR_ERR_OPERATION_FAILED:
                        self.status = response.error(
                            'migOperationErr', message=e.get_str2())
                    raise
                except MigrationLimitExceeded:
                    retry_timeout = config.getint('vars',
                                                  'migration_retry_timeout')
                    self.log.debug("Migration destination busy. Initiating "
                                   "retry in %d seconds.", retry_timeout)
                    self._migrationCanceledEvt.wait(retry_timeout)
        except MigrationDestinationSetupError as e:
            self._recover(str(e))
            # we know what happened, no need to dump hollow stack trace
        except Exception as e:
            self._recover(str(e))
            self.log.exception("Failed to migrate")
        finally:
            # Enable the volume monitor as it can be disabled during migration.
            self._vm.volume_monitor.enable()
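
# A minimal sketch of the retry pattern used in _regular_run() above: the
# number of concurrent outgoing migrations is bounded by a semaphore, and a
# "destination busy" failure waits on the cancel event (so a cancel wakes the
# waiter immediately) before retrying.  DestinationBusy, attempt_migration and
# the constants are illustrative stand-ins, not vdsm APIs.
import threading

MAX_OUTGOING = 2
RETRY_TIMEOUT = 30  # seconds

ongoing_migrations = threading.BoundedSemaphore(MAX_OUTGOING)
canceled = threading.Event()


class DestinationBusy(Exception):
    pass


def migrate(attempt_migration):
    while not canceled.is_set():
        with ongoing_migrations:     # blocks while too many migrations run
            try:
                attempt_migration()
                return True
            except DestinationBusy:
                pass
        # wait() returns early if the migration gets canceled meanwhile
        canceled.wait(RETRY_TIMEOUT)
    return False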
Exemple #39
0
import threading
import logging

from yajsonrpc import JsonRpcServer
from yajsonrpc.stompreactor import StompReactor

from vdsm import executor
from vdsm.config import config


# TODO test what should be the default values
_THREADS = config.getint('rpc', 'worker_threads')
_TASK_PER_WORKER = config.getint('rpc', 'tasks_per_worker')
_TASKS = _THREADS * _TASK_PER_WORKER


class BindingJsonRpc(object):
    log = logging.getLogger('BindingJsonRpc')

    def __init__(self, bridge, subs, timeout, scheduler):
        self._executor = executor.Executor(name="jsonrpc.Executor",
                                           workers_count=_THREADS,
                                           max_tasks=_TASKS,
                                           scheduler=scheduler)

        self._server = JsonRpcServer(bridge, timeout, self._executor.dispatch)
        self._reactor = StompReactor(subs)
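
# The executor above caps both its worker count and its queued-task count
# (worker_threads * tasks_per_worker).  A small stdlib sketch of the same
# idea, with illustrative limits; vdsm's executor.Executor is not used here.
import concurrent.futures
import threading

WORKERS = 8            # stand-in for config.getint('rpc', 'worker_threads')
TASKS_PER_WORKER = 10  # stand-in for config.getint('rpc', 'tasks_per_worker')
MAX_TASKS = WORKERS * TASKS_PER_WORKER


class BoundedExecutor(object):
    def __init__(self):
        self._pool = concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS)
        self._slots = threading.BoundedSemaphore(MAX_TASKS)

    def dispatch(self, func, *args):
        # Reject work instead of queuing without bound.
        if not self._slots.acquire(blocking=False):
            raise RuntimeError('too many queued tasks')
        future = self._pool.submit(func, *args)
        future.add_done_callback(lambda _: self._slots.release())
        return future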
Exemple #40
0
def rescan():
    timeout = config.getint('irs', 'scsi_rescan_maximal_timeout')
    log.debug("Performing SCSI scan, this will take up to %s seconds", timeout)
    rescanOp = iscsiadm.session_rescan_async()
    rescanOp.wait(timeout=timeout)
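
# rescan() above starts an asynchronous SCSI rescan and then bounds how long
# it is willing to wait for it.  A thread-based sketch of the same
# "start async, wait with timeout" shape; run_rescan is a placeholder for the
# real iscsiadm call and is not a vdsm API.
import threading


def start_rescan_async(run_rescan):
    thread = threading.Thread(target=run_rescan, name='scsi-rescan')
    thread.daemon = True
    thread.start()
    return thread


def rescan_with_timeout(run_rescan, timeout):
    op = start_rescan_async(run_rescan)
    op.join(timeout=timeout)
    # If the operation is still running, the caller simply stops waiting;
    # it does not kill the underlying rescan.
    return not op.is_alive()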
Exemple #41
0
    def __init__(self, irs, log, scheduler):
        """
        Initialize the (single) clientIF instance

        :param irs: a Dispatcher object to be used as this object's irs.
        :type irs: :class:`vdsm.storage.dispatcher.Dispatcher`
        :param log: a log object to be used for this object's logging.
        :type log: :class:`logging.Logger`
        """
        self.vmContainerLock = threading.Lock()
        self._networkSemaphore = threading.Semaphore()
        self._shutdownSemaphore = threading.Semaphore()
        self.irs = irs
        if self.irs:
            self._contEIOVmsCB = partial(clientIF.contEIOVms, proxy(self))
            self.irs.registerDomainStateChangeCallback(self._contEIOVmsCB)
        self.log = log
        self._recovery = True
        self.channelListener = Listener(self.log)
        self.mom = None
        self.servers = {}
        self._broker_client = None
        self._subscriptions = defaultdict(list)
        self._scheduler = scheduler
        self._unknown_vm_ids = set()
        if _glusterEnabled:
            self.gluster = gapi.GlusterApi()
        else:
            self.gluster = None
        try:
            self.vmContainer = {}
            self.lastRemoteAccess = 0
            self._enabled = True
            self._netConfigDirty = False
            self._prepareMOM()
            secret.clear()
            concurrent.thread(self._recoverThread, name='vmrecovery').start()
            self.channelListener.settimeout(
                config.getint('vars', 'guest_agent_timeout'))
            self.channelListener.start()
            self.threadLocal = threading.local()
            self.threadLocal.client = ''

            host = config.get('addresses', 'management_ip')
            port = config.getint('addresses', 'management_port')

            # When IPv6 is not enabled, fallback to listen on IPv4 address
            try:
                self._createAcceptor(host, port)
            except socket.error as e:
                if e.errno == errno.EAFNOSUPPORT and host in ('::', '::1'):
                    fallback_host = '0.0.0.0'
                    self._createAcceptor(fallback_host, port)
                else:
                    raise

            self._prepareHttpServer()
            self._prepareJSONRPCServer()
            self._connectToBroker()
        except:
            self.log.error('failed to init clientIF, '
                           'shutting down storage dispatcher')
            if self.irs:
                self.irs.prepareForShutdown()
            raise
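
# The constructor above falls back from an IPv6 wildcard address to
# '0.0.0.0' when the kernel has no IPv6 support (errno.EAFNOSUPPORT).  A
# standalone sketch of that fallback with plain sockets; port 54321 is just
# an example value.
import errno
import socket


def create_listener(host='::', port=54321):
    family = socket.AF_INET6 if ':' in host else socket.AF_INET
    try:
        sock = socket.socket(family, socket.SOCK_STREAM)
        sock.bind((host, port))
    except socket.error as e:
        if e.errno == errno.EAFNOSUPPORT and host in ('::', '::1'):
            # No IPv6 on this host; listen on the IPv4 wildcard instead.
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.bind(('0.0.0.0', port))
        else:
            raise
    sock.listen(5)
    return sock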
Exemple #42
0
    def _recoverExistingVms(self):
        start_time = utils.monotonic_time()
        try:
            self.log.debug('recovery: started')

            # Starting up libvirt might take long when the host is under high
            # load, so we prefer running this code in an external thread to
            # avoid blocking the API response.
            mog = min(config.getint('vars', 'max_outgoing_migrations'),
                      caps.CpuTopology().cores())
            migration.SourceThread.setMaxOutgoingMigrations(mog)

            # Recover stage 1: domains from libvirt
            doms = getVDSMDomains()
            num_doms = len(doms)
            for idx, v in enumerate(doms):
                vmId = v.UUIDString()
                if self._recoverVm(vmId):
                    self.log.info(
                        'recovery [1:%d/%d]: recovered domain %s from libvirt',
                        idx + 1, num_doms, vmId)
                else:
                    self.log.info(
                        'recovery [1:%d/%d]: loose domain %s found,'
                        ' killing it.', idx + 1, num_doms, vmId)
                    try:
                        v.destroy()
                    except libvirt.libvirtError:
                        self.log.exception(
                            'recovery [1:%d/%d]: failed to kill loose'
                            ' domain %s', idx + 1, num_doms, vmId)

            # Recover stage 2: domains from recovery files
            # we do this to safely handle VMs which disappeared
            # from the host while VDSM was down/restarting
            rec_vms = self._getVDSMVmsFromRecovery()
            num_rec_vms = len(rec_vms)
            if rec_vms:
                self.log.warning(
                    'recovery: found %i VMs from recovery files not'
                    ' reported by libvirt. This should not happen!'
                    ' Will try to recover them.', num_rec_vms)

            for idx, vmId in enumerate(rec_vms):
                if self._recoverVm(vmId):
                    self.log.info(
                        'recovery [2:%d/%d]: recovered domain %s'
                        ' from data file', idx + 1, num_rec_vms, vmId)
                else:
                    self.log.warning(
                        'recovery [2:%d/%d]: VM %s failed to recover from data'
                        ' file, reported as Down', idx + 1, num_rec_vms, vmId)

            # recover stage 3: waiting for domains to go up
            while self._enabled:
                launching = sum(
                    int(v.lastStatus == vmstatus.WAIT_FOR_LAUNCH)
                    for v in self.vmContainer.values())
                if not launching:
                    break
                else:
                    self.log.info('recovery: waiting for %d domains to go up',
                                  launching)
                time.sleep(1)
            self._cleanOldFiles()
            self._recovery = False

            # Now, if we have VMs to restore, we should wait for the pool
            # connection and then prepare all volumes.
            # Actually, we need it just to get the resources for future
            # volume manipulations.
            while self._enabled and self.vmContainer and \
                    not self.irs.getConnectedStoragePoolsList()['poollist']:
                self.log.info('recovery: waiting for storage pool to go up')
                time.sleep(5)

            vm_objects = self.vmContainer.values()
            num_vm_objects = len(vm_objects)
            for idx, vm_obj in enumerate(vm_objects):
                # Let's recover as many VMs as possible
                try:
                    # Do not prepare volumes when system goes down
                    if self._enabled:
                        self.log.info(
                            'recovery [%d/%d]: preparing paths for'
                            ' domain %s', idx + 1, num_vm_objects, vm_obj.id)
                        vm_obj.preparePaths(
                            vm_obj.devSpecMapFromConf()[hwclass.DISK])
                except:
                    self.log.exception("recovery [%d/%d]: failed for vm %s",
                                       idx + 1, num_vm_objects, vm_obj.id)

            self.log.info('recovery: completed in %is',
                          utils.monotonic_time() - start_time)

        except:
            self.log.exception("recovery: failed")
            raise
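
# The recovery flow above repeatedly waits for some condition (all domains
# up, storage pool connected) by polling with a short sleep while the
# service is still enabled.  A generic sketch of that loop; condition and
# still_enabled are caller-supplied callables and the interval is
# illustrative, e.g. wait_until(pool_connected, lambda: service.enabled).
import time


def wait_until(condition, still_enabled, interval=5):
    while still_enabled() and not condition():
        time.sleep(interval)
    return condition()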
Exemple #43
0
class MonitorThread(object):
    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime, conv_schedule, use_conv_schedule):
        super(MonitorThread, self).__init__()
        self._stop = threading.Event()
        self._vm = vm
        self._startTime = startTime
        self.daemon = True
        self.progress = None
        self._conv_schedule = conv_schedule
        self._use_conv_schedule = use_conv_schedule
        self.downtime_thread = _FakeThreadInterface()
        self._thread = concurrent.thread(self.run,
                                         name='migmon/' + self._vm.id[:8])

    def start(self):
        self._thread.start()

    def join(self):
        self._thread.join()

    @property
    def enabled(self):
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    @logutils.traceback()
    def run(self):
        if self.enabled:
            self._vm.log.debug('starting migration monitor thread')
            try:
                self.monitor_migration()
            except virdomain.NotConnectedError as e:
                # In case the VM is stopped during migration, there is a race
                # between domain disconnection and stopping the monitoring
                # thread. Then the domain may no longer be connected when
                # monitor_migration loop tries to access it. That's harmless
                # and shouldn't bubble up, let's just finish the thread.
                self._vm.log.debug('domain disconnected in monitor thread: %s',
                                   e)
            finally:
                self.downtime_thread.stop()
            if self.downtime_thread.is_alive():
                # on very short migrations, the downtime thread
                # may not be started at all.
                self.downtime_thread.join()
            self._vm.log.debug('stopped migration monitor thread')
        else:
            self._vm.log.info('migration monitor thread disabled'
                              ' (monitoring interval set to 0)')

    def monitor_migration(self):
        memSize = self._vm.mem_size_mb()
        maxTimePerGiB = config.getint('vars', 'migration_max_time_per_gib_mem')
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
        progress_timeout = config.getint('vars', 'migration_progress_timeout')
        lastProgressTime = time.time()
        lowmark = None
        lastDataRemaining = None
        iterationCount = 0

        self._execute_init(self._conv_schedule['init'])
        if not self._use_conv_schedule:
            self._vm.log.debug('setting initial migration downtime')
            self.downtime_thread.set_initial_downtime()

        while not self._stop.isSet():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm._dom.jobStats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)
            self._vm.send_migration_status_event()

            now = time.time()
            if self._vm.post_copy != PostCopyPhase.NONE:
                # Post-copy mode is a final state of a migration -- it either
                # completes or fails and stops the VM, there is no way to
                # continue with the migration in either case.  So we won't
                # handle any further schedule actions once post-copy is
                # successfully started.  It's still recommended to put the
                # abort action after the post-copy action in the schedule, for
                # the case when it's not possible to switch to the post-copy
                # mode for some reason.
                if self._vm.post_copy == PostCopyPhase.RUNNING:
                    # If post-copy is not RUNNING then we are in the interim
                    # phase (which should be short) between initiating the
                    # post-copy migration and the actual start of the post-copy
                    # migration.  Nothing needs to be done in that case.
                    self._vm.log.debug(
                        'Post-copy migration still in progress: %d',
                        progress.data_remaining)
            elif not self._use_conv_schedule and\
                    (0 < migrationMaxTime < now - self._startTime):
                self._vm.log.warn(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.', now - self._startTime,
                    migrationMaxTime)
                self._vm._dom.abortJob()
                self.stop()
                break
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                lowmark = progress.data_remaining
                lastProgressTime = now
            else:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).', progress.data_remaining // Mbytes,
                    lowmark // Mbytes)

            if not self._vm.post_copy and\
                    lastDataRemaining is not None and\
                    lastDataRemaining < progress.data_remaining:
                iterationCount += 1
                self._vm.log.debug('new iteration detected: %i',
                                   iterationCount)
                if self._use_conv_schedule:
                    self._next_action(iterationCount)
                elif iterationCount == 1:
                    # it does not make sense to do any adjustments before
                    # first iteration.
                    self.downtime_thread.start()

            lastDataRemaining = progress.data_remaining

            if not self._use_conv_schedule and\
                    (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warn(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.' % (now - lastProgressTime))
                self._vm._dom.abortJob()
                self.stop()

            if self._stop.isSet():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)

    def stop(self):
        self._vm.log.debug('stopping migration monitor thread')
        self._stop.set()

    def _next_action(self, stalling):
        head = self._conv_schedule['stalling'][0]

        self._vm.log.debug(
            'Stalling for %d iterations, '
            'checking to make next action: '
            '%s', stalling, head)
        if head['limit'] < stalling:
            self._execute_action_with_params(head['action'])
            self._conv_schedule['stalling'].pop(0)
            self._vm.log.debug('setting conv schedule to: %s',
                               self._conv_schedule)

    def _execute_init(self, init_actions):
        for action_with_params in init_actions:
            self._execute_action_with_params(action_with_params)

    def _execute_action_with_params(self, action_with_params):
        action = str(action_with_params['name'])
        vm = self._vm
        if action == CONVERGENCE_SCHEDULE_SET_DOWNTIME:
            downtime = int(action_with_params['params'][0])
            vm.log.debug('Setting downtime to %d', downtime)
            vm._dom.migrateSetMaxDowntime(downtime, 0)
        elif action == CONVERGENCE_SCHEDULE_POST_COPY:
            if not self._vm.switch_migration_to_post_copy():
                # Do nothing for now; the next action will be invoked after a
                # while
                vm.log.warn('Failed to switch to post-copy migration')
        elif action == CONVERGENCE_SCHEDULE_SET_ABORT:
            vm.log.warn('Aborting migration')
            vm._dom.abortJob()
            self.stop()
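
# Two small helpers mirroring the arithmetic in monitor_migration() above.
# migration_max_time() reproduces (maxTimePerGiB * memSize + 1023) // 1024,
# i.e. a ceiling division converting a per-GiB time budget into a total
# budget for memSize MiB of RAM, and is_stalling() reproduces the
# low-water-mark test on the remaining data.  The sample values are
# illustrative.
def migration_max_time(max_time_per_gib, mem_size_mib):
    return (max_time_per_gib * mem_size_mib + 1023) // 1024


def is_stalling(lowmark, data_remaining):
    # Progress is only counted when the remaining data drops below the best
    # value seen so far.
    return lowmark is not None and data_remaining >= lowmark


assert migration_max_time(64, 4096) == 256   # 64 s/GiB * 4 GiB
assert migration_max_time(64, 4097) == 257   # rounded up, not truncated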
Exemple #44
0
class ImageResourceFactory(rm.SimpleResourceFactory):
    """
    This factory produces resources for images.
    """
    storage_repository = config.get('irs', 'repository')
    # Resource timeouts are in seconds. They are written in ms in the config
    # for backward compatibility reasons.
    resource_default_timeout = config.getint('irs',
                                             'prepare_image_timeout') / 1000.0

    def __init__(self, sdUUID):
        rm.SimpleResourceFactory.__init__(self)
        self.sdUUID = sdUUID
        self.volumeResourcesNamespace = rm.getNamespace(
            sc.VOLUME_NAMESPACE, self.sdUUID)

    def __getResourceCandidatesList(self, resourceName, lockType):
        """
        Return list of lock candidates (template and volumes)
        """
        volResourcesList = []
        template = None
        dom = sdCache.produce(sdUUID=self.sdUUID)
        # Get the list of the volumes
        repoPath = os.path.join(self.storage_repository, dom.getPools()[0])
        try:
            chain = image.Image(repoPath).getChain(sdUUID=self.sdUUID,
                                                   imgUUID=resourceName)
        except se.ImageDoesNotExistInSD:
            log.debug("Image %s does not exist in domain %s", resourceName,
                      self.sdUUID)
            return []

        # Check whether the chain is built on top of a template or is
        # standalone.
        pvol = chain[0].getParentVolume()
        if pvol:
            template = pvol.volUUID
        elif chain[0].isShared():
            # Image of template itself,
            # with no other volumes in chain
            template = chain[0].volUUID
            del chain[:]

        volUUIDChain = [vol.volUUID for vol in chain]
        volUUIDChain.sort()

        # Activate all volumes in chain at once.
        # The volumes may be activated again further down the flow; that is
        # harmless, since they are already active.
        # TODO Fix resource framework to hold images, instead of specific vols.
        # This assumes that chains can not spread into more than one SD.
        if dom.__class__.__name__ == "BlockStorageDomain":
            lvm.activateLVs(self.sdUUID, volUUIDChain)

        failed = False
        # Acquire template locks:
        # - 'lockType' for template's image itself
        # - Always 'shared' lock for image based on template
        try:
            if template:
                if len(volUUIDChain) > 0:
                    volRes = rm.acquireResource(
                        self.volumeResourcesNamespace,
                        template,
                        rm.SHARED,
                        timeout=self.resource_default_timeout)
                else:
                    volRes = rm.acquireResource(
                        self.volumeResourcesNamespace,
                        template,
                        lockType,
                        timeout=self.resource_default_timeout)
                volResourcesList.append(volRes)

            # Acquire 'lockType' volume locks
            for volUUID in volUUIDChain:
                volRes = rm.acquireResource(
                    self.volumeResourcesNamespace,
                    volUUID,
                    lockType,
                    timeout=self.resource_default_timeout)

                volResourcesList.append(volRes)
        except (rm.RequestTimedOutError, se.ResourceAcqusitionFailed) as e:
            log.debug("Cannot acquire volume resource (%s)", str(e))
            failed = True
            raise
        except Exception:
            log.debug("Cannot acquire volume resource", exc_info=True)
            failed = True
            raise
        finally:
            if failed:
                # Release already acquired template/volumes locks
                for volRes in volResourcesList:
                    volRes.release()

        return volResourcesList

    def createResource(self, resourceName, lockType):
        volResourcesList = self.__getResourceCandidatesList(
            resourceName, lockType)
        return ImageResource(volResourcesList)
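
# __getResourceCandidatesList() above acquires a whole chain of volume locks
# and, if any single acquisition fails, releases everything it already
# holds.  A generic sketch of that all-or-nothing pattern; acquire() is a
# caller-supplied callable returning objects with a release() method.
def acquire_all(acquire, names):
    acquired = []
    try:
        for name in names:
            acquired.append(acquire(name))
    except Exception:
        # Roll back in reverse order before re-raising.
        for resource in reversed(acquired):
            resource.release()
        raise
    return acquired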
Exemple #45
0
    def monitor_migration(self):
        memSize = self._vm.mem_size_mb()
        maxTimePerGiB = config.getint('vars', 'migration_max_time_per_gib_mem')
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
        progress_timeout = config.getint('vars', 'migration_progress_timeout')
        lastProgressTime = time.time()
        lowmark = None
        lastDataRemaining = None
        iterationCount = 0

        self._execute_init(self._conv_schedule['init'])
        if not self._use_conv_schedule:
            self._vm.log.debug('setting initial migration downtime')
            self.downtime_thread.set_initial_downtime()

        while not self._stop.isSet():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm._dom.jobStats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)
            self._vm.send_migration_status_event()

            now = time.time()
            if self._vm.post_copy != PostCopyPhase.NONE:
                # Post-copy mode is a final state of a migration -- it either
                # completes or fails and stops the VM, there is no way to
                # continue with the migration in either case.  So we won't
                # handle any further schedule actions once post-copy is
                # successfully started.  It's still recommended to put the
                # abort action after the post-copy action in the schedule, for
                # the case when it's not possible to switch to the post-copy
                # mode for some reason.
                if self._vm.post_copy == PostCopyPhase.RUNNING:
                    # If post-copy is not RUNNING then we are in the interim
                    # phase (which should be short) between initiating the
                    # post-copy migration and the actual start of the post-copy
                    # migration.  Nothing needs to be done in that case.
                    self._vm.log.debug(
                        'Post-copy migration still in progress: %d',
                        progress.data_remaining)
            elif not self._use_conv_schedule and\
                    (0 < migrationMaxTime < now - self._startTime):
                self._vm.log.warn(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.', now - self._startTime,
                    migrationMaxTime)
                self._vm._dom.abortJob()
                self.stop()
                break
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                lowmark = progress.data_remaining
                lastProgressTime = now
            else:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).', progress.data_remaining // Mbytes,
                    lowmark // Mbytes)

            if not self._vm.post_copy and\
                    lastDataRemaining is not None and\
                    lastDataRemaining < progress.data_remaining:
                iterationCount += 1
                self._vm.log.debug('new iteration detected: %i',
                                   iterationCount)
                if self._use_conv_schedule:
                    self._next_action(iterationCount)
                elif iterationCount == 1:
                    # it does not make sense to do any adjustments before
                    # first iteration.
                    self.downtime_thread.start()

            lastDataRemaining = progress.data_remaining

            if not self._use_conv_schedule and\
                    (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warn(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.' % (now - lastProgressTime))
                self._vm._dom.abortJob()
                self.stop()

            if self._stop.isSet():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)
Exemple #46
0
from vdsm.common.define import NORMAL
from vdsm.common.network.address import normalize_literal_addr
from vdsm.common.units import MiB
from vdsm.config import config
from vdsm.virt.utils import DynamicBoundedSemaphore

from vdsm.virt import virdomain
from vdsm.virt import vmexitreason
from vdsm.virt import vmstatus

MODE_REMOTE = 'remote'
MODE_FILE = 'file'

METHOD_ONLINE = 'online'

incomingMigrations = DynamicBoundedSemaphore(
    max(1, config.getint('vars', 'max_incoming_migrations')))

CONVERGENCE_SCHEDULE_SET_DOWNTIME = "setDowntime"
CONVERGENCE_SCHEDULE_POST_COPY = "postcopy"
CONVERGENCE_SCHEDULE_SET_ABORT = "abort"

ADDRESS = '0'
PORT = 54321


class MigrationDestinationSetupError(RuntimeError):
    """
    Failed to create migration destination VM.
    """

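# incomingMigrations above caps how many migrations this host accepts at
# once.  A stdlib stand-in for that guard using threading.BoundedSemaphore;
# whatever runtime resizing vdsm's DynamicBoundedSemaphore provides is not
# reproduced here, and max_incoming is an assumed value.
import threading

max_incoming = 2   # stand-in for config.getint('vars', 'max_incoming_migrations')
incoming_migrations = threading.BoundedSemaphore(max(1, max_incoming))


def handle_incoming(run_migration):
    if not incoming_migrations.acquire(blocking=False):
        raise RuntimeError('migration destination busy')
    try:
        run_migration()
    finally:
        incoming_migrations.release()
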
Exemple #47
0
MASTERLV_SIZE = "1024"  # Size in MiB; 1024 MiB => 1 GiB
BlockSDVol = namedtuple("BlockSDVol", "name, image, parent")

log = logging.getLogger("Storage.BlockSD")

# FIXME: Make this calculated from something logical
RESERVED_METADATA_SIZE = 40 * (2 ** 20)
RESERVED_MAILBOX_SIZE = MAILBOX_SIZE * clusterlock.MAX_HOST_ID
METADATA_BASE_SIZE = 378
# VG's min metadata threshold is 20%
VG_MDA_MIN_THRESHOLD = 0.2
# VG's metadata size in MiB
VG_METADATASIZE = 128

MAX_PVS_LIMIT = 10  # BZ#648051
MAX_PVS = config.getint('irs', 'maximum_allowed_pvs')
if MAX_PVS > MAX_PVS_LIMIT:
    log.warning("maximum_allowed_pvs = %d ignored. MAX_PVS = %d", MAX_PVS,
                MAX_PVS_LIMIT)
    MAX_PVS = MAX_PVS_LIMIT

PVS_METADATA_SIZE = MAX_PVS * 142

SD_METADATA_SIZE = 2048
DEFAULT_BLOCKSIZE = 512

DMDK_VGUUID = "VGUUID"
DMDK_PV_REGEX = re.compile(r"^PV\d+$")
DMDK_LOGBLKSIZE = "LOGBLKSIZE"
DMDK_PHYBLKSIZE = "PHYBLKSIZE"
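
# A tiny worked computation of the sizing constants above: the configured PV
# limit is clamped to MAX_PVS_LIMIT, and the per-PV metadata budget of 142
# then gives PVS_METADATA_SIZE.  The configured value 20 is illustrative.
configured_max_pvs = 20            # e.g. config.getint('irs', 'maximum_allowed_pvs')
MAX_PVS_LIMIT = 10                 # BZ#648051
max_pvs = min(configured_max_pvs, MAX_PVS_LIMIT)
pvs_metadata_size = max_pvs * 142  # 1420 with the clamped limit

assert max_pvs == 10
assert pvs_metadata_size == 1420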
Exemple #48
0
 def getMaximumSupportedDomains(self):
     return config.getint("irs", "maximum_domains_in_pool")
Exemple #49
0
from vdsm.storage import sd
from vdsm.storage.sdm.api import create_volume
from vdsm.storage.volumemetadata import VolumeMetadata


class ExpectedFailure(Exception):
    pass


def failure(*args, **kwargs):
    raise ExpectedFailure()


MB = 1024**2
VOL_SIZE = 1073741824
BLOCK_INITIAL_CHUNK_SIZE = MB * config.getint("irs",
                                              "volume_utilization_chunk_mb")
BASE_PARAMS = {
    sc.RAW_FORMAT:
    (VOL_SIZE, sc.RAW_FORMAT, image.SYSTEM_DISK_TYPE, 'raw_volume'),
    sc.COW_FORMAT:
    (VOL_SIZE, sc.COW_FORMAT, image.SYSTEM_DISK_TYPE, 'cow_volume')
}


@expandPermutations
class VolumeArtifactsTestsMixin(object):
    def setUp(self):
        self.img_id = make_uuid()
        self.vol_id = make_uuid()

    def test_state_missing(self):
Exemple #50
0
    def monitor_migration(self):
        memSize = int(self._vm.conf['memSize'])
        maxTimePerGiB = config.getint('vars', 'migration_max_time_per_gib_mem')
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) / 1024
        progress_timeout = config.getint('vars', 'migration_progress_timeout')
        lastProgressTime = time.time()
        lowmark = None
        lastDataRemaining = None
        iterationCount = 0

        self._execute_init(self._conv_schedule['init'])
        if not self._use_conv_schedule:
            self._vm.log.debug('setting initial migration downtime')
            self.downtime_thread.set_initial_downtime()

        while not self._stop.isSet():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm._dom.jobStats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)

            now = time.time()
            if not self._use_conv_schedule and\
                    (0 < migrationMaxTime < now - self._startTime):
                self._vm.log.warn(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.', now - self._startTime,
                    migrationMaxTime)
                self._vm._dom.abortJob()
                self.stop()
                break
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                lowmark = progress.data_remaining
                lastProgressTime = now
            else:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).'
                    ' Refer to RHBZ#919201.', progress.data_remaining / Mbytes,
                    lowmark / Mbytes)

            if lastDataRemaining is not None and\
                    lastDataRemaining < progress.data_remaining:
                iterationCount += 1
                self._vm.log.debug('new iteration detected: %i',
                                   iterationCount)
                if self._use_conv_schedule:
                    self._next_action(iterationCount)
                elif iterationCount == 1:
                    # it does not make sense to do any adjustments before
                    # first iteration.
                    self.downtime_thread.start()

            lastDataRemaining = progress.data_remaining

            if not self._use_conv_schedule and\
                    (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warn(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.' % (now - lastProgressTime))
                self._vm._dom.abortJob()
                self.stop()

            if self._stop.isSet():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)
Exemple #51
0
    def __init__(self,
                 vm,
                 dst='',
                 dstparams='',
                 mode=MODE_REMOTE,
                 method=METHOD_ONLINE,
                 tunneled=False,
                 dstqemu='',
                 abortOnError=False,
                 consoleAddress=None,
                 compressed=False,
                 autoConverge=False,
                 recovery=False,
                 encrypted=False,
                 **kwargs):
        self.log = vm.log
        self._vm = vm
        self._dom = DomainAdapter(self._vm)
        self._dst = dst
        self._mode = mode
        self._dstparams = dstparams
        self._enableGuestEvents = kwargs.get('enableGuestEvents', False)
        # TODO: conv.tobool shouldn't be used in this constructor, the
        # conversions should be handled properly in the API layer
        self._consoleAddress = consoleAddress
        self._dstqemu = dstqemu
        self._encrypted = encrypted
        self._maxBandwidth = int(
            kwargs.get('maxBandwidth')
            or config.getint('vars', 'migration_max_bandwidth'))
        self._incomingLimit = kwargs.get('incomingLimit')
        self._outgoingLimit = kwargs.get('outgoingLimit')
        self.status = {
            'status': {
                'code': 0,
                'message': 'Migration in progress'
            }
        }
        # we need to guard against concurrent updates only
        self._lock = threading.Lock()
        self._progress = 0
        self._thread = concurrent.thread(self.run,
                                         name='migsrc/' + self._vm.id[:8])
        self._preparingMigrationEvt = True
        self._migrationCanceledEvt = threading.Event()
        self._monitorThread = None
        self._destServer = None
        self._legacy_payload_path = None
        if 'convergenceSchedule' in kwargs:
            self._convergence_schedule = kwargs['convergenceSchedule']
        else:
            # Needed for Engine < 4.3 or when legacy migration is used
            # as a supposedly rare fallback in Engine >= 4.3.
            self._convergence_schedule = \
                self._legacy_convergence_schedule(kwargs.get('downtime'))
            self.log.info(
                'using a computed convergence schedule for '
                'a legacy migration: %s', self._convergence_schedule)
        self.log.debug('convergence schedule set to: %s',
                       str(self._convergence_schedule))
        self._started = False
        self._failed = False
        self._recovery = recovery
        tunneled = conv.tobool(tunneled)
        abortOnError = conv.tobool(abortOnError)
        compressed = conv.tobool(compressed)
        autoConverge = conv.tobool(autoConverge)
        self._migration_flags = self._calculate_migration_flags(
            tunneled, abortOnError, compressed, autoConverge, encrypted)
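
For orientation, a hypothetical call to this constructor: the class name migration.SourceThread is inferred from its use in Exemple #53, every keyword below is read somewhere in the body above, and all values are illustrative.

# Hypothetical invocation -- values are illustrative only.
op = migration.SourceThread(
    vm,
    dst='dst-host.example.com',
    mode=MODE_REMOTE,
    tunneled=False,
    abortOnError=True,
    compressed=False,
    autoConverge=True,
    encrypted=True,
    maxBandwidth=0,          # falsy, so the config default is used instead
    enableGuestEvents=True,
    convergenceSchedule={'init': [], 'stalling': []},
)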
Exemple #52
0
class MonitorThread(object):
    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime, conv_schedule, use_conv_schedule):
        super(MonitorThread, self).__init__()
        self._stop = threading.Event()
        self._vm = vm
        self._startTime = startTime
        self.daemon = True
        self.progress = None
        self._conv_schedule = conv_schedule
        self._use_conv_schedule = use_conv_schedule
        self.downtime_thread = _FakeThreadInterface()
        self._thread = concurrent.thread(self.run)

    def start(self):
        self._thread.start()

    def join(self):
        self._thread.join()

    @property
    def enabled(self):
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    @utils.traceback()
    def run(self):
        if self.enabled:
            self._vm.log.debug('starting migration monitor thread')
            try:
                self.monitor_migration()
            finally:
                self.downtime_thread.stop()
            if self.downtime_thread.is_alive():
                # on very short migrations, the downtime thread
                # may not be started at all.
                self.downtime_thread.join()
            self._vm.log.debug('stopped migration monitor thread')
        else:
            self._vm.log.info('migration monitor thread disabled'
                              ' (monitoring interval set to 0)')

    def monitor_migration(self):
        memSize = int(self._vm.conf['memSize'])
        maxTimePerGiB = config.getint('vars', 'migration_max_time_per_gib_mem')
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) / 1024
        progress_timeout = config.getint('vars', 'migration_progress_timeout')
        lastProgressTime = time.time()
        lowmark = None
        lastDataRemaining = None
        iterationCount = 0

        self._execute_init(self._conv_schedule['init'])
        if not self._use_conv_schedule:
            self._vm.log.debug('setting initial migration downtime')
            self.downtime_thread.set_initial_downtime()

        while not self._stop.isSet():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm._dom.jobStats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)

            now = time.time()
            if not self._use_conv_schedule and\
                    (0 < migrationMaxTime < now - self._startTime):
                self._vm.log.warn(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.', now - self._startTime,
                    migrationMaxTime)
                self._vm._dom.abortJob()
                self.stop()
                break
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                lowmark = progress.data_remaining
                lastProgressTime = now
            else:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).'
                    ' Refer to RHBZ#919201.', progress.data_remaining / Mbytes,
                    lowmark / Mbytes)

            if lastDataRemaining is not None and\
                    lastDataRemaining < progress.data_remaining:
                iterationCount += 1
                self._vm.log.debug('new iteration detected: %i',
                                   iterationCount)
                if self._use_conv_schedule:
                    self._next_action(iterationCount)
                elif iterationCount == 1:
                    # it does not make sense to do any adjustments before
                    # first iteration.
                    self.downtime_thread.start()

            lastDataRemaining = progress.data_remaining

            if not self._use_conv_schedule and\
                    (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warn(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.' % (now - lastProgressTime))
                self._vm._dom.abortJob()
                self.stop()

            if self._stop.isSet():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)

    def stop(self):
        self._vm.log.debug('stopping migration monitor thread')
        self._stop.set()

    def _next_action(self, stalling):
        head = self._conv_schedule['stalling'][0]

        self._vm.log.debug(
            'Stalling for %d iterations, '
            'checking to make next action: '
            '%s', stalling, head)
        if head['limit'] < stalling:
            self._execute_action_with_params(head['action'])
            self._conv_schedule['stalling'].pop(0)
            self._vm.log.debug('setting conv schedule to: %s',
                               self._conv_schedule)

    def _execute_init(self, init_actions):
        for action_with_params in init_actions:
            self._execute_action_with_params(action_with_params)

    def _execute_action_with_params(self, action_with_params):
        action = str(action_with_params['name'])
        if action == CONVERGENCE_SCHEDULE_SET_DOWNTIME:
            downtime = int(action_with_params['params'][0])
            self._vm.log.debug('Setting downtime to %d', downtime)
            self._vm._dom.migrateSetMaxDowntime(downtime, 0)
        elif action == CONVERGENCE_SCHEDULE_SET_ABORT:
            self._vm.log.warn('Aborting migration')
            self._vm._dom.abortJob()
            self.stop()
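
The schedule walked by _execute_init() and _next_action() above is a plain dict: an 'init' list of actions plus a 'stalling' list of limit/action pairs, each action carrying a 'name' and a 'params' list. A minimal sketch of that shape, with invented limits and downtime values (the real schedule is supplied by the caller, e.g. through the convergenceSchedule argument seen in Exemple #51):

# Illustrative convergence schedule -- keys mirror the accesses in the code
# above; the numbers are made up for the example.
conv_schedule = {
    'init': [
        {'name': CONVERGENCE_SCHEDULE_SET_DOWNTIME, 'params': ['100']},
    ],
    'stalling': [
        {'limit': 1,
         'action': {'name': CONVERGENCE_SCHEDULE_SET_DOWNTIME,
                    'params': ['200']}},
        {'limit': 2,
         'action': {'name': CONVERGENCE_SCHEDULE_SET_ABORT, 'params': []}},
    ],
}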
Exemple #53
0
    def _recoverExistingVms(self):
        try:
            # Starting up libvirt might take long when host under high load,
            # we prefer running this code in external thread to avoid blocking
            # API response.
            mog = min(config.getint('vars', 'max_outgoing_migrations'),
                      caps.CpuTopology().cores())
            migration.SourceThread.setMaxOutgoingMigrations(mog)

            # Recover
            for v in getVDSMDomains():
                vmId = v.UUIDString()
                if not self._recoverVm(vmId):
                    # RH qemu proc without recovery
                    self.log.info(
                        'loose qemu process with id: '
                        '%s found, killing it.', vmId)
                    try:
                        v.destroy()
                    except libvirt.libvirtError:
                        self.log.error(
                            'failed to kill loose qemu '
                            'process with id: %s',
                            vmId,
                            exc_info=True)

            # we do this to safely handle VMs which disappeared
            # from the host while VDSM was down/restarting
            recVms = self._getVDSMVmsFromRecovery()
            if recVms:
                self.log.warning(
                    'Found %i VMs from recovery files not'
                    ' reported by libvirt.'
                    ' This should not happen!'
                    ' Will try to recover them.', len(recVms))
            for vmId in recVms:
                if not self._recoverVm(vmId):
                    self.log.warning(
                        'VM %s failed to recover from recovery'
                        ' file, reported as Down', vmId)

            while (self._enabled and vmstatus.WAIT_FOR_LAUNCH
                   in [v.lastStatus for v in self.vmContainer.values()]):
                time.sleep(1)
            self._cleanOldFiles()
            self._recovery = False

            # Now if we have VMs to restore we should wait pool connection
            # and then prepare all volumes.
            # Actually, we need it just to get the resources for future
            # volumes manipulations
            while self._enabled and self.vmContainer and \
                    not self.irs.getConnectedStoragePoolsList()['poollist']:
                time.sleep(5)

            for vmId, vmObj in self.vmContainer.items():
                # Let's recover as much VMs as possible
                try:
                    # Do not prepare volumes when system goes down
                    if self._enabled:
                        vmObj.preparePaths(
                            vmObj.buildConfDevices()[vm.DISK_DEVICES])
                except:
                    self.log.error("Vm %s recovery failed",
                                   vmId,
                                   exc_info=True)
        except:
            self.log.error("Vm's recovery failed", exc_info=True)
            raise
Exemple #54
0
def get():
    numa.update()
    caps = {}
    cpu_topology = numa.cpu_topology()

    caps['kvmEnabled'] = str(os.path.exists('/dev/kvm')).lower()

    if config.getboolean('vars', 'report_host_threads_as_cores'):
        caps['cpuCores'] = str(cpu_topology.threads)
    else:
        caps['cpuCores'] = str(cpu_topology.cores)

    caps['cpuThreads'] = str(cpu_topology.threads)
    caps['cpuSockets'] = str(cpu_topology.sockets)
    caps['onlineCpus'] = ','.join(
        [str(cpu_id) for cpu_id in cpu_topology.online_cpus])

    caps['cpuTopology'] = [{
        'cpu_id': cpu.cpu_id,
        'numa_cell_id': cpu.numa_cell_id,
        'socket_id': cpu.socket_id,
        'die_id': cpu.die_id,
        'core_id': cpu.core_id,
    } for cpu in numa.cpu_info()]

    caps['cpuSpeed'] = cpuinfo.frequency()
    caps['cpuModel'] = cpuinfo.model()
    caps['cpuFlags'] = ','.join(_getFlagsAndFeatures())
    caps['vdsmToCpusAffinity'] = list(taskset.get(os.getpid()))

    caps.update(dsaversion.version_info())

    proxy = supervdsm.getProxy()
    net_caps = proxy.network_caps()
    caps.update(net_caps)
    caps['ovnConfigured'] = proxy.is_ovn_configured()

    try:
        caps['hooks'] = hooks.installed()
    except:
        logging.debug('not reporting hooks', exc_info=True)

    caps['operatingSystem'] = osinfo.version()
    caps['uuid'] = host.uuid()
    caps['packages2'] = osinfo.package_versions()
    caps['realtimeKernel'] = osinfo.runtime_kernel_flags().realtime
    caps['kernelArgs'] = osinfo.kernel_args()
    caps['nestedVirtualization'] = osinfo.nested_virtualization().enabled
    caps['emulatedMachines'] = machinetype.emulated_machines(
        cpuarch.effective())
    caps['ISCSIInitiatorName'] = _getIscsiIniName()
    caps['HBAInventory'] = hba.HBAInventory()
    caps['vmTypes'] = ['kvm']

    caps['memSize'] = str(utils.readMemInfo()['MemTotal'] // 1024)
    caps['reservedMem'] = str(
        config.getint('vars', 'host_mem_reserve') +
        config.getint('vars', 'extra_mem_reserve'))
    caps['guestOverhead'] = config.get('vars', 'guest_ram_overhead')

    caps['rngSources'] = rngsources.list_available()

    caps['numaNodes'] = dict(numa.topology())
    caps['numaNodeDistance'] = dict(numa.distances())
    caps['autoNumaBalancing'] = numa.autonuma_status()

    caps['selinux'] = osinfo.selinux_status()

    caps['liveSnapshot'] = 'true'
    caps['liveMerge'] = 'true'
    caps['kdumpStatus'] = osinfo.kdump_status()
    caps["deferred_preallocation"] = True

    caps['hostdevPassthrough'] = str(hostdev.is_supported()).lower()
    # TODO This needs to be removed after adding engine side support
    # and adding gdeploy support to enable libgfapi on RHHI by default
    caps['additionalFeatures'] = ['libgfapi_supported']
    if osinfo.glusterEnabled:
        from vdsm.gluster.api import glusterAdditionalFeatures
        caps['additionalFeatures'].extend(glusterAdditionalFeatures())
    caps['hostedEngineDeployed'] = _isHostedEngineDeployed()
    caps['hugepages'] = hugepages.supported()
    caps['kernelFeatures'] = osinfo.kernel_features()
    caps['vncEncrypted'] = _isVncEncrypted()
    caps['backupEnabled'] = True
    caps['coldBackupEnabled'] = True
    caps['clearBitmapsEnabled'] = True
    caps['fipsEnabled'] = _getFipsEnabled()
    try:
        caps['boot_uuid'] = osinfo.boot_uuid()
    except Exception:
        logging.exception("Can not find boot uuid")
    caps['tscFrequency'] = _getTscFrequency()
    caps['tscScaling'] = _getTscScaling()

    try:
        caps["connector_info"] = managedvolume.connector_info()
    except se.ManagedVolumeNotSupported as e:
        logging.info("managedvolume not supported: %s", e)
    except se.ManagedVolumeHelperFailed as e:
        logging.exception("Error getting managedvolume connector info: %s", e)

    # Which domain versions are supported by this host.
    caps["domain_versions"] = sc.DOMAIN_VERSIONS

    caps["supported_block_size"] = backends.supported_block_size()
    caps["cd_change_pdiv"] = True
    caps["refresh_disk_supported"] = True

    return caps
Exemple #55
0
    def connect(self):
        iscsi.addIscsiNode(self._iface, self._target, self._cred)
        timeout = config.getint("irs", "udev_settle_timeout")
        udevadm.settle(timeout)
Exemple #56
0
from itertools import product

from vdsm.common import exception
from vdsm.common import response
from vdsm.config import config
from vdsm.virt import migration
from vdsm.virt import vmstatus

from monkeypatch import MonkeyPatchScope
from testlib import VdsmTestCase as TestCaseBase
from testlib import permutations, expandPermutations
from testlib import make_config

from . import vmfakelib as fake
import pytest

# defaults
_DOWNTIME = config.getint('vars', 'migration_downtime')

_STEPS = config.getint('vars', 'migration_downtime_steps')

_STEPS_MIN = 2
_STEPS_HUGE = 1000

_DOWNTIME_MIN = 100
_DOWNTIME_HUGE = 10000

_PARAMS = tuple(
    product((_DOWNTIME_MIN, _DOWNTIME, _DOWNTIME_HUGE),
            (_STEPS_MIN, _STEPS, _STEPS_HUGE)))
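# _PARAMS is the full cross product of the values above: three downtime values
# times three step counts, i.e. nine (downtime, steps) pairs for the
# permutation-based tests.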


@expandPermutations
Exemple #57
0
    def _startUnderlyingMigration(self, startTime):
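        # 'file' mode hibernates the VM by saving its state to the prepared
        # volume path; any other mode performs a live migration to the
        # destination host through libvirt's migrateToURI2().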
        if self._mode == 'file':
            hooks.before_vm_hibernate(self._vm._dom.XMLDesc(0), self._vm.conf)
            try:
                self._vm._vmStats.pause()
                fname = self._vm.cif.prepareVolumePath(self._dst)
                try:
                    self._vm._dom.save(fname)
                finally:
                    self._vm.cif.teardownVolumePath(self._dst)
            except Exception:
                self._vm._vmStats.cont()
                raise
        else:
            for dev in self._vm._customDevices():
                hooks.before_device_migrate_source(
                    dev._deviceXML, self._vm.conf, dev.custom)
            hooks.before_vm_migrate_source(self._vm._dom.XMLDesc(0),
                                           self._vm.conf)
            response = self.destServer.migrationCreate(self._machineParams)
            if response['status']['code']:
                self.status = response
                raise RuntimeError('migration destination error: ' +
                                   response['status']['message'])
            if config.getboolean('vars', 'ssl'):
                transport = 'tls'
            else:
                transport = 'tcp'
            duri = 'qemu+%s://%s/system' % (transport, self.remoteHost)
            if self._vm.conf['_migrationParams']['dstqemu']:
                muri = 'tcp://%s' % \
                       self._vm.conf['_migrationParams']['dstqemu']
            else:
                muri = 'tcp://%s' % self.remoteHost

            self._vm.log.debug('starting migration to %s '
                               'with miguri %s', duri, muri)

            t = DowntimeThread(self._vm, int(self._downtime))

            self._monitorThread = MonitorThread(self._vm, startTime)
            self._monitorThread.start()

            try:
                if self._vm.hasSpice and self._vm.conf.get('clientIp'):
                    SPICE_MIGRATION_HANDOVER_TIME = 120
                    self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

                maxBandwidth = config.getint('vars', 'migration_max_bandwidth')
                # FIXME: there still a race here with libvirt,
                # if we call stop() and libvirt migrateToURI2 didn't start
                # we may return migration stop but it will start at libvirt
                # side
                self._preparingMigrationEvt = False
                if not self._migrationCanceledEvt:
                    self._vm._dom.migrateToURI2(
                        duri, muri, None,
                        libvirt.VIR_MIGRATE_LIVE |
                        libvirt.VIR_MIGRATE_PEER2PEER |
                        (libvirt.VIR_MIGRATE_TUNNELLED if
                            self._tunneled else 0) |
                        (libvirt.VIR_MIGRATE_ABORT_ON_ERROR if
                            self._abortOnError else 0),
                        None, maxBandwidth)
                else:
                    self._raiseAbortError()

            finally:
                t.cancel()
                self._monitorThread.stop()
Exemple #58
0
import libvirt

from vdsm import containersconnection
from vdsm import executor
from vdsm import host
from vdsm import libvirtconnection
from vdsm.config import config
from vdsm.virt import migration
from vdsm.virt import sampling
from vdsm.virt import virdomain
from vdsm.virt import vmstatus

# Just a made up number. Maybe should be equal to number of cores?
# TODO: make them tunable through private, unsupported configuration items
_WORKERS = config.getint('sampling', 'periodic_workers')
_TASK_PER_WORKER = config.getint('sampling', 'periodic_task_per_worker')
_TASKS = _WORKERS * _TASK_PER_WORKER
_MAX_WORKERS = config.getint('sampling', 'max_workers')

_operations = []
_executor = None


def _timeout_from(interval):
    """
    Estimate a sensible timeout given a periodic interval.
    """
    return interval / 2.

Exemple #59
0
def get():
    targetArch = getTargetArch()

    caps = {}

    caps['kvmEnabled'] = \
        str(config.getboolean('vars', 'fake_kvm_support') or
            os.path.exists('/dev/kvm')).lower()

    cpuInfo = CpuInfo()
    cpuTopology = CpuTopology()
    if config.getboolean('vars', 'report_host_threads_as_cores'):
        caps['cpuCores'] = str(cpuTopology.threads())
    else:
        caps['cpuCores'] = str(cpuTopology.cores())

    caps['cpuThreads'] = str(cpuTopology.threads())
    caps['cpuSockets'] = str(cpuTopology.sockets())
    caps['onlineCpus'] = ','.join(cpuTopology.onlineCpus())
    caps['cpuSpeed'] = cpuInfo.mhz()
    if config.getboolean('vars', 'fake_kvm_support'):
        if targetArch == Architecture.X86_64:
            caps['cpuModel'] = 'Intel(Fake) CPU'

            flagList = ['vmx', 'sse2', 'nx']

            if targetArch == platform.machine():
                flagList += cpuInfo.flags()

            flags = set(flagList)

            caps['cpuFlags'] = ','.join(flags) + ',model_486,model_pentium,' \
                'model_pentium2,model_pentium3,model_pentiumpro,' \
                'model_qemu32,model_coreduo,model_core2duo,model_n270,' \
                'model_Conroe,model_Penryn,model_Nehalem,model_Opteron_G1'
        elif targetArch in Architecture.POWER:
            caps['cpuModel'] = 'POWER 8 (fake)'
            caps['cpuFlags'] = 'powernv,model_power8'
        else:
            raise RuntimeError('Unsupported architecture: %s' % targetArch)
    else:
        caps['cpuModel'] = cpuInfo.model()
        caps['cpuFlags'] = ','.join(cpuInfo.flags() +
                                    _getCompatibleCpuModels())

    caps.update(_getVersionInfo())
    caps.update(netinfo.get())
    _report_legacy_bondings(caps)
    _report_network_qos(caps)

    try:
        caps['hooks'] = hooks.installed()
    except:
        logging.debug('not reporting hooks', exc_info=True)

    caps['operatingSystem'] = osversion()
    caps['uuid'] = utils.getHostUUID()
    caps['packages2'] = _getKeyPackages()
    caps['emulatedMachines'] = _getEmulatedMachines(targetArch)
    caps['ISCSIInitiatorName'] = _getIscsiIniName()
    caps['HBAInventory'] = storage.hba.HBAInventory()
    caps['vmTypes'] = ['kvm']

    caps['memSize'] = str(utils.readMemInfo()['MemTotal'] / 1024)
    caps['reservedMem'] = str(
        config.getint('vars', 'host_mem_reserve') +
        config.getint('vars', 'extra_mem_reserve'))
    caps['guestOverhead'] = config.get('vars', 'guest_ram_overhead')

    # Verify that our libvirt supports virtio RNG (since 0.10.2-31)
    libvirtVer = LooseVersion('-'.join(
        (caps['packages2']['libvirt']['version'],
         caps['packages2']['libvirt']['release'])))
    requiredVer = LooseVersion('0.10.2-31')

    if libvirtVer >= requiredVer:
        caps['rngSources'] = _getRngSources()
    else:
        logging.debug('VirtioRNG DISABLED: libvirt version %s required >= %s',
                      libvirtVer, requiredVer)

    caps['numaNodes'] = getNumaTopology()
    caps['numaNodeDistance'] = getNumaNodeDistance()
    caps['autoNumaBalancing'] = getAutoNumaBalancingInfo()

    caps['selinux'] = _getSELinux()

    liveSnapSupported = _getLiveSnapshotSupport(targetArch)
    if liveSnapSupported is not None:
        caps['liveSnapshot'] = str(liveSnapSupported).lower()
    caps['liveMerge'] = str(getLiveMergeSupport()).lower()
    caps['kdumpStatus'] = _getKdumpStatus()

    caps['hostdevPassthrough'] = str(_getHostdevPassthorughSupport()).lower()

    return caps
Exemple #60
0
    def monitor_migration(self):
        def update_progress(remaining, total):
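            # Report 100 only when nothing remains; otherwise cap the value at
            # 99 so the migration is never shown as complete while data is
            # still pending.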
            if remaining == 0 and total:
                return 100
            progress = 100 - 100 * remaining / total if total else 0
            return progress if (progress < 100) else 99

        self._vm.log.debug('starting migration monitor thread')

        memSize = int(self._vm.conf['memSize'])
        maxTimePerGiB = config.getint('vars', 'migration_max_time_per_gib_mem')
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) / 1024
        lastProgressTime = time.time()
        lowmark = None
        progress_timeout = config.getint('vars', 'migration_progress_timeout')

        while not self._stop.isSet():
            self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            (jobType, timeElapsed, _, dataTotal, dataProcessed, dataRemaining,
             memTotal, memProcessed, memRemaining, fileTotal, fileProcessed,
             _) = self._vm._dom.jobInfo()
            # from libvirt sources: data* = file* + mem*.
            # docs can be misleading due to misaligned lines.
            abort = False
            now = time.time()
            if 0 < migrationMaxTime < now - self._startTime:
                self._vm.log.warn(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.', now - self._startTime,
                    migrationMaxTime)
                abort = True
            elif (lowmark is None) or (lowmark > dataRemaining):
                lowmark = dataRemaining
                lastProgressTime = now
            elif (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warn(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.' % (now - lastProgressTime))
                abort = True

            if abort:
                self._vm._dom.abortJob()
                self.stop()
                break

            if dataRemaining > lowmark:
                self._vm.log.warn(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).'
                    ' Refer to RHBZ#919201.', dataRemaining / Mbytes,
                    lowmark / Mbytes)

            if jobType != libvirt.VIR_DOMAIN_JOB_NONE:
                self.progress = update_progress(dataRemaining, dataTotal)

                self._vm.log.info('Migration Progress: %s seconds elapsed,'
                                  ' %s%% of data processed' %
                                  (timeElapsed / 1000, self.progress))