def __init__(self, pool, maxHostID, monitorInterval=2):
    """
    Build the SPM mail monitor for ``pool`` and start its monitor thread.

    :param pool: object exposing ``spUUID`` and ``storage_repository``;
        both are used to locate the mailbox files of the master domain.
    :param maxHostID: number of host slots; converted with ``int()`` and
        used to size the outgoing mail buffer.
    :param monitorInterval: stored as ``self._monitorInterval``.
    :raises RuntimeError: if the inbox or the outbox path does not exist.
    """
    self._messageTypes = {}
    # Save arguments
    self._stop = False
    self._stopped = False
    self._poolID = str(pool.spUUID)
    self._spmStorageDir = pool.storage_repository

    poolSize = config.getint('irs', 'thread_pool_size') / 2
    idleTimeout = 3
    taskLimit = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool("mailbox-spm", poolSize, idleTimeout, taskLimit)

    # *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
    # versa ***
    self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                               "mastersd", sd.DOMAIN_META_DATA, "inbox")
    if not os.path.exists(self._inbox):
        # The same text is logged and carried by the exception.
        failure = ("SPM_MailMonitor create failed - inbox %s does not "
                   "exist" % repr(self._inbox))
        self.log.error(failure)
        raise RuntimeError(failure)

    self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                "mastersd", sd.DOMAIN_META_DATA, "outbox")
    if not os.path.exists(self._outbox):
        failure = ("SPM_MailMonitor create failed - outbox %s does "
                   "not exist" % repr(self._outbox))
        self.log.error(failure)
        raise RuntimeError(failure)

    self._numHosts = int(maxHostID)
    self._outMailLen = MAILBOX_SIZE * self._numHosts
    self._monitorInterval = monitorInterval
    # TODO: add support for multiple paths (multiple mailboxes)
    # Both buffers start as the same all-zero string.
    self._outgoingMail = self._outMailLen * "\0"
    self._incomingMail = self._outgoingMail

    self._inCmd = [
        'dd',
        'if=' + str(self._inbox),
        'iflag=direct,fullblock',
        'count=1',
    ]
    self._outCmd = [
        'dd',
        'of=' + str(self._outbox),
        'oflag=direct',
        'iflag=fullblock',
        'conv=notrunc',
        'count=1',
    ]
    self._outLock = threading.Lock()
    self._inLock = threading.Lock()

    # Clear outgoing mail
    self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                   "%s", self._outCmd)
    clearCmd = self._outCmd + ['bs=' + str(self._outMailLen)]
    (rc, out, err) = _mboxExecCmd(clearCmd, data=self._outgoingMail)
    if rc:
        # A failed clear is only reported; construction continues.
        self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                         "dd failed")

    monitor = concurrent.thread(self.run, name="mailbox-spm",
                                logger=self.log.name)
    monitor.start()
    self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
def __init__(self, vm, dst='', dstparams='', mode=MODE_REMOTE,
             method=METHOD_ONLINE, tunneled=False, dstqemu='',
             abortOnError=False, consoleAddress=None, compressed=False,
             autoConverge=False, **kwargs):
    """
    Initialize the migration source thread state for ``vm``.

    Optional keyword arguments read here: ``downtime``, ``maxBandwidth``
    and ``convergenceSchedule``. A ``method`` other than ``METHOD_ONLINE``
    only produces a deprecation warning.
    """
    self.log = vm.log
    self._vm = vm
    self._dst = dst
    self._mode = mode
    if method != METHOD_ONLINE:
        self.log.warning(
            'migration method %s is deprecated, forced to "online"',
            method)
    self._dstparams = dstparams
    self._machineParams = {}
    self._tunneled = utils.tobool(tunneled)
    self._abortOnError = utils.tobool(abortOnError)
    self._consoleAddress = consoleAddress
    self._dstqemu = dstqemu

    # Caller-supplied values win; a falsy/missing value falls back to
    # the configuration.
    requested_downtime = kwargs.get('downtime')
    if requested_downtime:
        self._downtime = requested_downtime
    else:
        self._downtime = config.get('vars', 'migration_downtime')

    requested_bandwidth = kwargs.get('maxBandwidth')
    if not requested_bandwidth:
        requested_bandwidth = config.getint('vars',
                                            'migration_max_bandwidth')
    self._maxBandwidth = int(requested_bandwidth)

    self._autoConverge = autoConverge
    self._compressed = compressed
    self.status = {
        'status': {
            'code': 0,
            'message': 'Migration in progress'}}
    self._progress = 0
    threading.Thread.__init__(self)
    self._preparingMigrationEvt = True
    self._migrationCanceledEvt = False
    self._monitorThread = None
    self._destServer = None

    progress_timeout = config.getint('vars', 'migration_progress_timeout')

    # Default schedule: a single stalling step that aborts once the
    # progress timeout is reached.
    abort_step = {
        'limit': progress_timeout,
        'action': {
            'name': CONVERGENCE_SCHEDULE_SET_ABORT,
            'params': []
        }
    }
    self._convergence_schedule = {
        'init': [],
        'stalling': [abort_step]
    }

    self._use_convergence_schedule = False
    if 'convergenceSchedule' in kwargs:
        self._convergence_schedule = kwargs.get('convergenceSchedule')
        self._use_convergence_schedule = True

    self.log.debug('convergence schedule set to: %s',
                   str(self._convergence_schedule))
def start(cif):
    """
    Start the scheduler and executor, then build and start all the
    periodic operations, storing them in the module-level
    ``_operations`` list.

    :param cif: object providing ``getVMs``; it is also passed to
        ``libvirtconnection.get``.
    """
    global _operations

    _scheduler.start()
    _executor.start()

    def per_vm_operation(func, period):
        # Wrap ``func`` in a dispatcher that runs it on the executor.
        dispatcher = VmDispatcher(
            cif.getVMs, _executor, func, _timeout_from(period))
        return Operation(dispatcher, period)

    _operations = [
        # needs dispatching because updating the volume stats needs to
        # access the storage, thus can block.
        per_vm_operation(
            UpdateVolumes,
            config.getint("irs", "vol_size_sample_interval")),

        # needs dispatching because it accesses FS and libvirt data
        per_vm_operation(
            NumaInfoMonitor,
            config.getint("vars", "vm_sample_numa_interval")),

        # Job monitoring needs QEMU monitor access.
        per_vm_operation(
            BlockjobMonitor,
            config.getint("vars", "vm_sample_jobs_interval")),

        # libvirt sampling using bulk stats can block, but unresponsive
        # domains are handled inside VMBulkSampler for performance reasons;
        # thus, does not need dispatching.
        Operation(
            sampling.VMBulkSampler(
                libvirtconnection.get(cif),
                cif.getVMs,
                sampling.stats_cache),
            config.getint("vars", "vm_sample_interval"),
        ),

        # we do this only until we get high water mark notifications
        # from qemu. Access storage and/or qemu monitor, so can block,
        # thus we need dispatching.
        per_vm_operation(
            DriveWatermarkMonitor,
            config.getint("vars", "vm_watermark_interval")),
    ]

    for operation in _operations:
        operation.start()
def __init__(self, irs, log, scheduler):
    """
    Initialize the (single) clientIF instance

    :param irs: a Dispatcher object to be used as this object's irs.
    :type irs: :class:`storage.dispatcher.Dispatcher`
    :param log: a log object to be used for this object's logging.
    :type log: :class:`logging.Logger`
    :param scheduler: stored as ``self._scheduler``.
    """
    self.vmContainerLock = threading.Lock()
    self._networkSemaphore = threading.Semaphore()
    self._shutdownSemaphore = threading.Semaphore()
    self.irs = irs
    if self.irs:
        # The callback receives a proxy of this object rather than the
        # object itself.
        self._contEIOVmsCB = partial(clientIF.contEIOVms, proxy(self))
        self.irs.registerDomainStateChangeCallback(self._contEIOVmsCB)
    self.log = log
    self._recovery = True
    self.channelListener = Listener(self.log)
    self._generationID = str(uuid.uuid4())
    self.mom = None
    self.bindings = {}
    self._broker_client = None
    self._subscriptions = defaultdict(list)
    self._scheduler = scheduler
    self.gluster = gapi.GlusterApi(self, log) if _glusterEnabled else None

    try:
        self.vmContainer = {}
        self._hostStats = sampling.HostStatsThread(sampling.host_samples)
        self._hostStats.start()
        self.lastRemoteAccess = 0
        self._enabled = True
        self._netConfigDirty = False
        self._prepareMOM()
        secret.clear()
        recovery_thread = concurrent.thread(self._recoverThread,
                                            name='clientIFinit')
        recovery_thread.start()
        agent_timeout = config.getint('vars', 'guest_agent_timeout')
        self.channelListener.settimeout(agent_timeout)
        self.channelListener.start()
        self.threadLocal = threading.local()
        self.threadLocal.client = ''

        host = config.get('addresses', 'management_ip')
        port = config.getint('addresses', 'management_port')

        self._createAcceptor(host, port)
        self._prepareXMLRPCBinding()
        self._prepareJSONRPCBinding()
        self._connectToBroker()
    except:
        # Any failure: log, let storage prepare for shutdown, and
        # re-raise the original error.
        self.log.error('failed to init clientIF, '
                       'shutting down storage dispatcher')
        if self.irs:
            self.irs.prepareForShutdown()
        raise
def __init__(self, poolID, maxHostID, inbox, outbox, monitorInterval=2):
    """
    Build the SPM mail monitor and start its monitor thread.

    Note: the inbox parameter here should point to the HSM's outbox
    mailbox file, and vice versa.

    :param poolID: stored as ``self._poolID`` and used in log messages.
    :param maxHostID: number of host slots; converted with ``int()`` and
        used to size the outgoing mail buffer.
    :param inbox: path of the file to read mail from.
    :param outbox: path of the file to write mail to.
    :param monitorInterval: stored as ``self._monitorInterval`` and used
        to derive the thread pool wait timeout.
    :raises RuntimeError: if ``inbox`` or ``outbox`` does not exist.
    """
    self._messageTypes = {}
    # Save arguments
    self._stop = False
    self._stopped = False
    self._poolID = poolID

    poolSize = config.getint('irs', 'thread_pool_size') / 2
    idleTimeout = wait_timeout(monitorInterval)
    taskLimit = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool("mailbox-spm", poolSize, idleTimeout, taskLimit)

    self._inbox = inbox
    if not os.path.exists(self._inbox):
        # The same text is logged and carried by the exception.
        failure = ("SPM_MailMonitor create failed - inbox %s does not "
                   "exist" % repr(self._inbox))
        self.log.error(failure)
        raise RuntimeError(failure)

    self._outbox = outbox
    if not os.path.exists(self._outbox):
        failure = ("SPM_MailMonitor create failed - outbox %s does "
                   "not exist" % repr(self._outbox))
        self.log.error(failure)
        raise RuntimeError(failure)

    self._numHosts = int(maxHostID)
    self._outMailLen = MAILBOX_SIZE * self._numHosts
    self._monitorInterval = monitorInterval
    # TODO: add support for multiple paths (multiple mailboxes)
    # Both buffers start as the same all-zero string.
    self._outgoingMail = self._outMailLen * "\0"
    self._incomingMail = self._outgoingMail

    self._inCmd = [
        'dd',
        'if=' + str(self._inbox),
        'iflag=direct,fullblock',
        'count=1',
    ]
    self._outCmd = [
        'dd',
        'of=' + str(self._outbox),
        'oflag=direct',
        'iflag=fullblock',
        'conv=notrunc',
        'count=1',
    ]
    self._outLock = threading.Lock()
    self._inLock = threading.Lock()

    # Clear outgoing mail
    self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                   "%s", self._outCmd)
    clearCmd = self._outCmd + ['bs=' + str(self._outMailLen)]
    (rc, out, err) = _mboxExecCmd(clearCmd, data=self._outgoingMail)
    if rc:
        # A failed clear is only reported; construction continues.
        self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                         "dd failed")

    self._thread = concurrent.thread(
        self.run, name="mailbox-spm", log=self.log)
    self._thread.start()
    self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
def __init__(self,
             tpSize=config.getint('irs', 'thread_pool_size'),
             waitTimeout=3,
             maxTasks=config.getint('irs', 'max_tasks')):
    """
    Create the task thread pool and the empty task bookkeeping.

    The ``tpSize`` and ``maxTasks`` defaults are read from the
    configuration once, when the function is defined.

    :param tpSize: passed to ``ThreadPool`` as its first argument.
    :param waitTimeout: passed to ``ThreadPool`` as its second argument.
    :param maxTasks: passed to ``ThreadPool`` as its third argument.
    """
    self.storage_repository = config.get('irs', 'repository')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    # No tasks are known or waiting at creation time.
    self._tasks = {}
    self._unqueuedTasks = []
def __init__(self,
             tpSize=config.getint('irs', 'thread_pool_size'),
             waitTimeout=3,
             maxTasks=config.getint('irs', 'max_tasks')):
    """
    Create the "tasks" thread pool and the empty task bookkeeping.

    The ``tpSize`` and ``maxTasks`` defaults are read from the
    configuration once, when the function is defined.

    :param tpSize: passed to ``ThreadPool`` after the pool name.
    :param waitTimeout: passed to ``ThreadPool`` after ``tpSize``.
    :param maxTasks: passed to ``ThreadPool`` as the last argument.
    """
    self.tp = ThreadPool("tasks", tpSize, waitTimeout, maxTasks)
    # No tasks are known or waiting at creation time.
    self._tasks = {}
    self._unqueuedTasks = []
    self._insertTaskLock = threading.Lock()
def _regular_run(self):
    """
    Run the migration on the source side.

    Prepares the destination and the guest, then loops until
    ``self._started`` is true: each attempt holds the
    ``SourceThread.ongoingMigrations`` semaphore, notifies the guest
    agent, and starts the underlying migration. A
    ``MigrationLimitExceeded`` error triggers a retry after the
    configured timeout; other errors go through ``self._recover``.
    """
    self.log.debug("Starting migration source thread")
    self._recovery = False
    self._update_outgoing_limit()
    try:
        startTime = time.time()
        machineParams = self._setupRemoteMachineParams()
        self._setupVdsConnection()
        self._prepareGuest()

        while not self._started:
            try:
                self.log.info("Migration semaphore: acquiring")
                with SourceThread.ongoingMigrations:
                    self.log.info("Migration semaphore: acquired")
                    reply_timeout = config.getint(
                        'vars', 'guest_lifecycle_event_reply_timeout')
                    guest_events = self._vm.guestAgent.events \
                        if (self.hibernating or self._enableGuestEvents) \
                        else None
                    if self.hibernating:
                        guest_events.before_hibernation(
                            wait_timeout=reply_timeout)
                    elif self._enableGuestEvents:
                        guest_events.before_migration(
                            wait_timeout=reply_timeout)
                    if self._migrationCanceledEvt.is_set():
                        self._raiseAbortError()
                    waited = time.time() - startTime
                    self.log.debug("migration semaphore acquired "
                                   "after %d seconds", waited)
                    migrationParams = dict(
                        dst=self._dst,
                        mode=self._mode,
                        method=METHOD_ONLINE,
                        dstparams=self._dstparams,
                        dstqemu=self._dstqemu,
                    )
                    self._startUnderlyingMigration(
                        time.time(), migrationParams, machineParams)
                    self._finishSuccessfully(machineParams)
            except libvirt.libvirtError as e:
                # An aborted operation is reported as a cancellation;
                # the error is re-raised in every case.
                if e.get_error_code() == libvirt.VIR_ERR_OPERATION_ABORTED:
                    self.status = response.error(
                        'migCancelErr', message='Migration canceled')
                raise
            except MigrationLimitExceeded:
                retry_timeout = config.getint('vars',
                                              'migration_retry_timeout')
                self.log.debug("Migration destination busy. Initiating "
                               "retry in %d seconds.", retry_timeout)
                # Waiting on the cancel event lets a cancellation cut
                # the delay short.
                self._migrationCanceledEvt.wait(retry_timeout)
    except MigrationDestinationSetupError as e:
        self._recover(str(e))
        # we know what happened, no need to dump hollow stack trace
    except Exception as e:
        self._recover(str(e))
        self.log.exception("Failed to migrate")
def get():
    """
    Collect the host capabilities into a dictionary and return it.

    With ``fake_kvm_support`` enabled, a fake CPU model is reported and
    the real CPU flags are extended with ``vmx``, ``sse2``, ``nx`` and a
    fixed list of ``model_*`` entries. The ``hooks`` key is best effort:
    a failure to list the installed hooks is logged at debug level and
    the key is left out.

    :returns: dict of capability name to value.
    """
    caps = {}

    fake_kvm = config.getboolean('vars', 'fake_kvm_support')
    caps['kvmEnabled'] = str(
        fake_kvm or os.path.exists('/dev/kvm')).lower()

    cpuInfo = CpuInfo()
    cpuTopology = CpuTopology()
    if config.getboolean('vars', 'report_host_threads_as_cores'):
        caps['cpuCores'] = str(cpuTopology.threads())
    else:
        caps['cpuCores'] = str(cpuTopology.cores())

    caps['cpuThreads'] = str(cpuTopology.threads())
    caps['cpuSockets'] = str(cpuTopology.sockets())
    caps['cpuSpeed'] = cpuInfo.mhz()
    if fake_kvm:
        caps['cpuModel'] = 'Intel(Fake) CPU'
        flags = set(cpuInfo.flags() + ['vmx', 'sse2', 'nx'])
        # The separating comma is required: without it the last flag
        # and "model_486" are fused into one bogus entry.
        caps['cpuFlags'] = ','.join(flags) + ',' + (
            'model_486,model_pentium,'
            'model_pentium2,model_pentium3,model_pentiumpro,model_qemu32,'
            'model_coreduo,model_core2duo,model_n270,model_Conroe,'
            'model_Penryn,model_Nehalem,model_Opteron_G1')
    else:
        caps['cpuModel'] = cpuInfo.model()
        caps['cpuFlags'] = ','.join(cpuInfo.flags() +
                                    _getCompatibleCpuModels())

    caps.update(dsaversion.version_info)
    caps.update(netinfo.get())

    try:
        caps['hooks'] = hooks.installed()
    except Exception:
        # Best effort; do not swallow KeyboardInterrupt/SystemExit.
        logging.debug('not reporting hooks', exc_info=True)

    caps['operatingSystem'] = osversion()
    caps['uuid'] = utils.getHostUUID()
    caps['packages2'] = _getKeyPackages()
    caps['emulatedMachines'] = _getEmulatedMachines()
    caps['ISCSIInitiatorName'] = _getIscsiIniName()
    caps['HBAInventory'] = storage.hba.HBAInventory()
    caps['vmTypes'] = ['kvm']

    caps['memSize'] = str(utils.readMemInfo()['MemTotal'] / 1024)
    caps['reservedMem'] = str(
        config.getint('vars', 'host_mem_reserve') +
        config.getint('vars', 'extra_mem_reserve'))
    caps['guestOverhead'] = config.get('vars', 'guest_ram_overhead')

    return caps
def __init__(self, vm, downtime):
    """
    Set up the downtime thread for ``vm`` and start it immediately.

    :param vm: object whose ``conf['memSize']`` is read to compute the
        wait value.
    :param downtime: stored as ``self._downtime``.
    """
    super(DowntimeThread, self).__init__()

    self.DOWNTIME_STEPS = config.getint('vars', 'migration_downtime_steps')

    self._vm = vm
    self._downtime = downtime
    self._stop = threading.Event()

    delay_per_gib = config.getint('vars', 'migration_downtime_delay')
    # Memory sizes below 2048 are treated as 2048 for the delay.
    effective_mem = max(int(vm.conf['memSize']), 2048)
    self._wait = (delay_per_gib * effective_mem + 1023) / 1024

    self.daemon = True
    self.start()
def forceScsiScan():
    """
    Trigger a SCSI rescan on every path matching ``SCAN_PATTERN`` and
    wait for the rescans to finish.

    One asynchronous ``dd`` process per HBA is started and fed the
    "- - -" wildcard. After sleeping for the configured minimal timeout
    the processes are polled once per second until they all finish or
    the maximal timeout is reached. HBAs still scanning at that point
    are reported in a warning.

    :raises OSError: if writing to a scan process fails with an error
        other than EPIPE.
    """
    processes = []
    minTimeout = config.getint('irs', 'scsi_rescan_minimal_timeout')
    maxTimeout = config.getint('irs', 'scsi_rescan_maximal_timeout')

    for hba in glob.glob(SCAN_PATTERN):
        cmd = [constants.EXT_DD, 'of=' + hba]
        p = misc.execCmd(cmd, sudo=False, sync=False)
        try:
            p.stdin.write("- - -")
            p.stdin.flush()
            p.stdin.close()
        except OSError as e:
            if p.wait(0) is False:
                log.error("pid %s still running", p.pid)
            log.warning("Error in rescan of hba:%s with returncode:%s and "
                        "error message: %s", hba, p.returncode,
                        p.stderr.read(1000))
            if e.errno != errno.EPIPE:
                raise
            else:
                # A broken pipe is tolerated; this HBA is not tracked.
                log.warning("Ignoring error in rescan of hba %s: ",
                            hba, exc_info=True)
                continue
        processes.append((hba, p))

    if (minTimeout > maxTimeout or minTimeout < 0):
        # Fall back to fixed values when the configuration is invalid.
        minTimeout = 2
        maxTimeout = 30
        log.warning("One of the following configuration arguments has an "
                    "illegal value: scsi_rescan_minimal_timeout or "
                    "scsi_rescan_maximal_timeout. Set to %s and %s seconds "
                    "respectively.", minTimeout, maxTimeout)

    log.debug("Performing SCSI scan, this will take up to %s seconds",
              maxTimeout)
    time.sleep(minTimeout)

    for _ in xrange(maxTimeout - minTimeout):
        # Iterate over a copy: finished entries are removed in the loop.
        for entry in processes[:]:
            (scanned_hba, proc) = entry
            if proc.wait(0):
                if proc.returncode != 0:
                    log.warning('returncode for: %s is: %s',
                                scanned_hba, proc.returncode)
                processes.remove(entry)
        if not processes:
            break
        else:
            time.sleep(1)
    else:
        # Reached only when the polling loop ran out without a break.
        # Report the HBA of each pending process; the previous code
        # repeated one stale loop variable for every entry.
        log.warning("Still waiting for scsi scan of hbas: %s",
                    tuple(pending_hba for (pending_hba, _proc) in processes))
def shutdown(self, timeout, message):
    """
    Ask the guest to power down.

    A responsive guest agent is preferred; otherwise ACPI is used when
    ``conf['acpiEnable']`` is "true". With neither available an error
    status is returned.

    :param timeout: passed to the guest agent and, converted with
        ``int()``, added to the ``sys_shutdown_timeout`` configuration
        value to arm the fallback timer.
    :param message: passed to the guest agent.
    :returns: ``None`` if the VM is already 'Down', otherwise a status
        dict.

    NOTE(review): an exception inside the ``try`` block is logged and the
    method still returns the 'Machine shut down' status - confirm that
    this is intended.
    """
    try:
        now = time.time()
        if self.lastStatus == 'Down':
            return

        agent_usable = self.guestAgent and self.guestAgent.isResponsive()
        if agent_usable:
            self._guestEventTime = now
            self._guestEvent = 'Powering down'
            self.log.debug('guestAgent shutdown called')
            self.guestAgent.desktopShutdown(timeout, message)
            sys_timeout = config.getint('vars', 'sys_shutdown_timeout')
            agent_timeout = int(timeout) + sys_timeout
            # Fallback in case the guest does not go down by itself.
            fallback = threading.Timer(agent_timeout, self._timedShutdown)
            fallback.start()
        elif self.conf['acpiEnable'].lower() == "true":
            self._guestEventTime = now
            self._guestEvent = 'Powering down'
            self._acpiShutdown()
        else:
            # No tools, no ACPI
            failure = {
                'code': errCode['exist']['status']['code'],
                'message': 'VM without ACPI or active SolidICE tools. '
                           'Try Forced Shutdown.'}
            return {'status': failure}
    except:
        self.log.error("Shutdown failed", exc_info=True)

    return {'status': {'code': doneCode['code'],
                       'message': 'Machine shut down'}}
def _lvExtend(self, block_dev, newsize=None):
    """
    Request an extension of the volume backing drive ``block_dev``.

    Only drives with ``blockDev`` set and a matching name are considered.
    The first match is marked as needing extension, an extend message is
    sent to storage, and the requested size is stored in the device
    configuration so it can be re-requested on a migration destination.

    If no drive matches, nothing is modified. The previous code went on
    with ``volID = None`` and overwrote ``reqsize`` on every disk device
    that has no ``volumeID``.

    :param block_dev: name of the drive to extend.
    :param newsize: requested size in MiB; when ``None`` it is the drive's
        apparent size rounded up to MiB plus the
        ``volume_utilization_chunk_mb`` configuration value.
    """
    for d in self._devices[DISK_DEVICES]:
        if not d.blockDev:
            continue
        if d.name != block_dev:
            continue

        if newsize is None:
            newsize = config.getint('irs', 'volume_utilization_chunk_mb') \
                + (d.apparentsize + 2 ** 20 - 1) / 2 ** 20
        # TODO cap newsize by max volume size

        volDict = {'poolID': d.poolID, 'domainID': d.domainID,
                   'imageID': d.imageID, 'volumeID': d.volumeID}
        d.needExtend = True
        d.reqsize = newsize
        # sendExtendMsg expects size in bytes
        self.cif.irs.sendExtendMsg(d.poolID, volDict, newsize * 2 ** 20,
                                   self._afterLvExtend)
        self.log.debug('%s/%s (%s): apparentsize %s req %s', d.domainID,
                       d.volumeID, d.name,
                       d.apparentsize / constants.MEGAB,
                       newsize)  # in MiB
        volID = d.volumeID
        break
    else:
        # No drive matched: there is no request to record.
        return

    # store most recently requested size in conf, to be re-requested on
    # migration destination
    for dev in self.conf['devices']:
        if dev['type'] == DISK_DEVICES and dev.get('volumeID') == volID:
            dev['reqsize'] = str(newsize)
def _wait_for_shutting_down_vms(self):
    """
    Wait loop checking remaining VMs in the vm container

    This helper raises the probability that the engine properly
    acknowledges that all VMs are terminated by host shutdown.

    The VMs are shut down by an external service: libvirt-guests.
    The service pauses system shutdown on systemd shutdown and
    gracefully shuts down the running VMs.

    This method applies only when the host is in shutdown.
    If the host is running, the method ends immediately.
    """
    # how long to wait before release shutdown
    # we are waiting in whole seconds
    # if config is not present, do not wait
    timeout = config.getint('vars', 'timeout_engine_clear_vms')
    # time to wait in the final phase in seconds
    # it allows host to flush its final state to the engine
    final_wait = 2

    if not host_in_shutdown():
        return

    self.log.info('host in shutdown waiting')

    # Poll ten times per second for the time left before the final wait.
    polls_left = (timeout - final_wait) * 10
    while polls_left > 0:
        if not self.vmContainer:
            # once all VMs are cleared exit
            break
        time.sleep(0.1)
        polls_left -= 1

    time.sleep(final_wait)
def _setupVdsConnection(self):
    """
    Connect to the migration destination and keep the connection in
    ``self._destServer``.

    Nothing is done when hibernating. A JSON-RPC connection is tried
    first and verified with a ping; on a JSON-RPC binding or no-response
    error an XML-RPC connection is created instead, using SSL when the
    ``ssl`` configuration option is set.
    """
    if self.hibernating:
        return

    default_port = config.getint('addresses', 'management_port')
    hostPort = vdscli.cannonizeHostPort(self._dst, default_port)
    self.remoteHost, port = hostPort.rsplit(':', 1)

    try:
        client = self._createClient(port)
        # Only the first configured request queue is used.
        queues = config.get('addresses', 'request_queues').split(",")
        self._destServer = jsonrpcvdscli.connect(queues[0], client)
        self.log.debug('Initiating connection with destination')
        self._destServer.ping()
    except (JsonRpcBindingsError, JsonRpcNoResponseError):
        use_ssl = config.getboolean('vars', 'ssl')
        if not use_ssl:
            self._destServer = kaxmlrpclib.Server('http://' + hostPort)
        else:
            self._destServer = vdscli.connect(
                hostPort,
                useSSL=True,
                TransportClass=kaxmlrpclib.TcpkeepSafeTransport)

    self.log.debug('Destination server is: ' + hostPort)
def _recoverExistingVms(self):
    """
    Recover the VMs found on the host and prepare their storage paths.

    Any failure is logged with its traceback and re-raised.
    """
    began = utils.monotonic_time()
    try:
        self.log.debug('recovery: started')

        # Starting up libvirt might take long when host under high load,
        # we prefer running this code in external thread to avoid blocking
        # API response.
        configured_limit = config.getint('vars', 'max_outgoing_migrations')
        core_count = numa.cpu_topology().cores
        migration.SourceThread.setMaxOutgoingMigrations(
            min(configured_limit, core_count))

        recovery.all_vms(self)

        # recover stage 3: waiting for domains to go up
        self._waitForDomainsUp()

        recovery.clean_vm_files(self)

        self._recovery = False

        # Now if we have VMs to restore we should wait pool connection
        # and then prepare all volumes.
        # Actually, we need it just to get the resources for future
        # volumes manipulations
        self._waitForStoragePool()

        self._preparePathsForRecoveredVMs()

        elapsed = utils.monotonic_time() - began
        self.log.info('recovery: completed in %is', elapsed)
    except:
        self.log.exception("recovery: failed")
        raise
def _perform_migration(self, duri, muri):
    """
    Start the libvirt migration with ``migrateToURI3`` unless the
    migration was canceled in the meantime.

    :param duri: destination URI passed to libvirt.
    :param muri: migration URI, passed as a string parameter.
    """
    if self._vm.hasSpice and self._vm.conf.get('clientIp'):
        SPICE_MIGRATION_HANDOVER_TIME = 120
        self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

    maxBandwidth = config.getint('vars', 'migration_max_bandwidth')
    # FIXME: there still a race here with libvirt,
    # if we call stop() and libvirt migrateToURI3 didn't start
    # we may return migration stop but it will start at libvirt
    # side
    self._preparingMigrationEvt = False
    if self._migrationCanceledEvt:
        self._raiseAbortError()
        return

    # TODO: use libvirt constants when bz#1222795 is fixed
    params = {
        VIR_MIGRATE_PARAM_URI: str(muri),
        VIR_MIGRATE_PARAM_BANDWIDTH: maxBandwidth,
    }

    # Live peer-to-peer is always requested; the rest is optional.
    flags = libvirt.VIR_MIGRATE_LIVE | libvirt.VIR_MIGRATE_PEER2PEER
    if self._tunneled:
        flags |= libvirt.VIR_MIGRATE_TUNNELLED
    if self._abortOnError:
        flags |= libvirt.VIR_MIGRATE_ABORT_ON_ERROR
    if self._compressed:
        flags |= libvirt.VIR_MIGRATE_COMPRESSED
    if self._autoConverge:
        flags |= libvirt.VIR_MIGRATE_AUTO_CONVERGE

    self._vm._dom.migrateToURI3(duri, params, flags)
def _perform_migration(self, duri, muri):
    """
    Start the libvirt migration with ``migrateToURI2`` unless the
    migration was canceled in the meantime.

    :param duri: destination URI passed to libvirt.
    :param muri: migration URI passed to libvirt.
    """
    if self._vm.hasSpice and self._vm.conf.get('clientIp'):
        SPICE_MIGRATION_HANDOVER_TIME = 120
        self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

    maxBandwidth = config.getint('vars', 'migration_max_bandwidth')
    # FIXME: there still a race here with libvirt,
    # if we call stop() and libvirt migrateToURI2 didn't start
    # we may return migration stop but it will start at libvirt
    # side
    self._preparingMigrationEvt = False
    if self._migrationCanceledEvt:
        self._raiseAbortError()
        return

    # Live peer-to-peer is always requested; the rest is optional.
    flags = libvirt.VIR_MIGRATE_LIVE | libvirt.VIR_MIGRATE_PEER2PEER
    if self._tunneled:
        flags |= libvirt.VIR_MIGRATE_TUNNELLED
    if self._abortOnError:
        flags |= libvirt.VIR_MIGRATE_ABORT_ON_ERROR
    if self._compressed:
        flags |= libvirt.VIR_MIGRATE_COMPRESSED
    if self._autoConverge:
        flags |= libvirt.VIR_MIGRATE_AUTO_CONVERGE

    self._vm._dom.migrateToURI2(
        duri, muri, None, flags, None, maxBandwidth)
def _mem_committed(mem_size_mb):
    """
    Legacy algorithm found in oVirt <= 4.1

    Add the ``guest_ram_overhead`` configuration value to
    ``mem_size_mb`` and return the sum multiplied by 2 ** 20.
    """
    overhead_mb = config.getint('vars', 'guest_ram_overhead')
    total_mb = mem_size_mb + overhead_mb
    return 2 ** 20 * total_mb
def calculate_volume_alloc_size(cls, preallocate, capacity, initial_size):
    """
    Calculate the allocation size in mb of the volume

    'preallocate' - Sparse or Preallocated
    'capacity' - the volume size in blocks
    'initial_size' - optional, if provided the initial allocated
                     size in blocks for sparse volumes

    Raises se.InvalidParameterException if an initial size is given for
    a preallocated volume, or if it exceeds the maximum size computed
    for the capacity.
    """
    if initial_size:
        if preallocate == sc.PREALLOCATED_VOL:
            log.error("Initial size is not supported for preallocated "
                      "volumes")
            raise se.InvalidParameterException("initial size",
                                               initial_size)

        # The limit is checked in bytes against the COW format maximum.
        capacity_bytes = capacity * sc.BLOCK_SIZE
        initial_size_bytes = initial_size * sc.BLOCK_SIZE
        max_size = cls.max_size(capacity_bytes, sc.COW_FORMAT)
        if initial_size_bytes > max_size:
            log.error("The requested initial %s is bigger "
                      "than the max size %s", initial_size_bytes, max_size)
            raise se.InvalidParameterException("initial size",
                                               initial_size)

    if preallocate != sc.SPARSE_VOL:
        # Not sparse: allocate the whole capacity, rounded up.
        return utils.round(capacity, BLOCKS_TO_MB) // BLOCKS_TO_MB

    if not initial_size:
        # Sparse without an initial size: one utilization chunk.
        return config.getint("irs", "volume_utilization_chunk_mb")

    # Sparse with an initial size: apply the overhead factor first.
    padded_blocks = int(initial_size * QCOW_OVERHEAD_FACTOR)
    return utils.round(padded_blocks, BLOCKS_TO_MB) // BLOCKS_TO_MB
def calculate_volume_alloc_size(cls, preallocate, capacity, initial_size):
    """
    Calculate the allocation size in mb of the volume

    'preallocate' - Sparse or Preallocated
    'capacity' - the volume size in sectors
    'initial_size' - optional, if provided the initial allocated
                     size in sectors for sparse volumes

    Raises se.InvalidParameterException if the initial size is bigger
    than the capacity or is given for a preallocated volume.

    The round-up to whole megabytes uses floor division, so the result
    is an integer whether or not true division is in effect.
    """
    if initial_size and initial_size > capacity:
        log.error("The volume size %s is smaller "
                  "than the requested initial size %s",
                  capacity, initial_size)
        raise se.InvalidParameterException("initial size", initial_size)

    if initial_size and preallocate == sc.PREALLOCATED_VOL:
        log.error("Initial size is not supported for preallocated volumes")
        raise se.InvalidParameterException("initial size", initial_size)

    if preallocate == sc.SPARSE_VOL:
        if initial_size:
            # Apply the overhead factor before rounding up.
            initial_size = int(initial_size * QCOW_OVERHEAD_FACTOR)
            alloc_size = ((initial_size + SECTORS_TO_MB - 1) //
                          SECTORS_TO_MB)
        else:
            alloc_size = config.getint("irs",
                                       "volume_utilization_chunk_mb")
    else:
        alloc_size = (capacity + SECTORS_TO_MB - 1) // SECTORS_TO_MB

    return alloc_size
def __init__(self, name=None):
    """
    Read the container network settings from the "containers"
    configuration section.

    :param name: network name; when falsy, the ``network_name``
        configuration value is used instead.
    """
    section = "containers"
    if name:
        self._name = name
    else:
        self._name = config.get(section, "network_name")
    self._gw = config.get(section, "network_gateway")
    self._nic = config.get(section, "network_interface")
    self._subnet = config.get(section, "network_subnet")
    self._mask = config.getint(section, "network_mask")
    self._existing = False
def _loadBindingJsonRpc(self):
    """
    Create the JSON-RPC binding and register it as
    ``self.bindings['json']``.

    The binding is configured with one TCP endpoint built from the
    ``management_ip`` and ``json_port`` configuration values.
    """
    from BindingJsonRpc import BindingJsonRpc
    from Bridge import DynamicBridge

    endpoint = {
        "ip": config.get('addresses', 'management_ip'),
        "port": config.getint('addresses', 'json_port'),
    }
    self.bindings['json'] = BindingJsonRpc(DynamicBridge(),
                                           [('tcp', endpoint)])
def _setupVdsConnection(self):
    """
    Connect to the migration destination over XML-RPC and check that
    the VM does not already exist there.

    Nothing is done when hibernating. Problems are reported through
    ``self.status``: ``errCode['exist']`` if the destination already
    knows the VM, ``errCode['noConPeer']`` if the check itself fails.
    """
    if self.hibernating:
        return

    # FIXME: The port will depend on the binding being used.
    # This assumes xmlrpc
    default_port = config.getint('addresses', 'management_port')
    hostPort = vdscli.cannonizeHostPort(self._dst, default_port)
    self.remoteHost, _ = hostPort.rsplit(':', 1)

    if not config.getboolean('vars', 'ssl'):
        self._destServer = kaxmlrpclib.Server('http://' + hostPort)
    else:
        self._destServer = vdscli.connect(
            hostPort,
            useSSL=True,
            TransportClass=kaxmlrpclib.TcpkeepSafeTransport)
    self.log.debug('Destination server is: ' + hostPort)

    try:
        self.log.debug('Initiating connection with destination')
        stats_reply = self._destServer.getVmStats(self._vm.id)
        # A zero status code means the destination knows this VM.
        vm_known = not stats_reply['status']['code']
        if vm_known:
            self.log.error("Machine already exists on the destination")
            self.status = errCode['exist']
    except Exception:
        self.log.exception("Error initiating connection")
        self.status = errCode['noConPeer']
def _autodelete_if_required(self):
    """
    Schedule deletion of this job when ``self.autodelete`` is set.

    The delay comes from the ``autodelete_delay`` configuration value;
    a negative delay disables the deletion.
    """
    if not self.autodelete:
        return

    delay = config.getint("jobs", "autodelete_delay")
    if delay < 0:
        return

    logging.info("Job %r will be deleted in %d seconds",
                 self.id, delay)
    _scheduler.schedule(delay, self._delete)
def __init__(self, name=None):
    """
    Read the container network settings from the 'containers'
    configuration section.

    :param name: network name; when falsy, the ``network_name``
        configuration value is used instead.
    """
    section = 'containers'
    if name:
        self._name = name
    else:
        self._name = config.get(section, 'network_name')
    self._gw = config.get(section, 'network_gateway')
    self._nic = config.get(section, 'network_interface')
    self._subnet = config.get(section, 'network_subnet')
    self._mask = config.getint(section, 'network_mask')
    self._existing = False
def getMaximumSupportedDomains(self):
    """
    Return the maximum number of domains supported by this pool.

    ``MAX_DOMAINS`` is returned when the master domain is a block domain
    whose version is listed in ``blockSD.VERS_METADATA_LV``; otherwise
    the ``maximum_domains_in_pool`` configuration value is returned.
    """
    info = self.masterDomain.getInfo()
    domainType = sd.name2type(info["type"])
    domainVersion = int(info["version"])

    isBlock = domainType in sd.BLOCK_DOMAIN_TYPES
    if isBlock and domainVersion in blockSD.VERS_METADATA_LV:
        return MAX_DOMAINS

    return config.getint("irs", "maximum_domains_in_pool")
def _memory_viewer():
    """
    Mount the dowser root application and start the cherrypy engine.

    The server listens on 0.0.0.0, on the port given by the
    ``memory_profile_port`` option of the 'devel' section.
    """
    cherrypy.tree.mount(dowser.Root())

    server_settings = {
        'server.socket_host': '0.0.0.0',
        'server.socket_port': config.getint('devel',
                                            'memory_profile_port'),
    }
    cherrypy.config.update(server_settings)

    cherrypy.engine.start()
def monitor_migration(self):
    """
    Poll the libvirt job info until the monitor is stopped.

    Each iteration waits for the monitoring interval, reads the job
    info, and then:

    - aborts the job when no convergence schedule is in use and the
      maximum migration time is exceeded;
    - records a new low mark when the remaining data decreased;
    - otherwise warns that the migration is stalling.

    The time since the last progress is passed to ``_next_action``, and
    ``self.progress`` is updated while a job is active.
    """
    def update_progress(remaining, total):
        # 100 only when nothing remains; capped at 99 otherwise.
        if remaining == 0 and total:
            return 100
        if total:
            percent = 100 - 100 * remaining / total
        else:
            percent = 0
        if percent < 100:
            return percent
        return 99

    self._vm.log.debug('starting migration monitor thread')

    memSize = int(self._vm.conf['memSize'])
    maxTimePerGiB = config.getint('vars',
                                  'migration_max_time_per_gib_mem')
    migrationMaxTime = (maxTimePerGiB * memSize + 1023) / 1024
    lastProgressTime = time.time()
    lowmark = None

    self._execute_init(self._conv_schedule['init'])

    while not self._stop.isSet():
        self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
        (jobType, timeElapsed, _,
         dataTotal, _dataProcessed, dataRemaining,
         _memTotal, _memProcessed, _memRemaining,
         _fileTotal, _fileProcessed, _) = self._vm._dom.jobInfo()
        # from libvirt sources: data* = file* + mem*.
        # docs can be misleading due to misaligned lines.
        now = time.time()
        running_for = now - self._startTime
        if not self._use_conv_schedule and \
                (0 < migrationMaxTime < running_for):
            self._vm.log.warn('The migration took %d seconds which is '
                              'exceeding the configured maximum time '
                              'for migrations of %d seconds. The '
                              'migration will be aborted.',
                              running_for,
                              migrationMaxTime)
            self._vm._dom.abortJob()
            self.stop()
            break
        elif (lowmark is None) or (lowmark > dataRemaining):
            lowmark = dataRemaining
            lastProgressTime = now
        else:
            self._vm.log.warn(
                'Migration stalling: remaining (%sMiB)'
                ' > lowmark (%sMiB).'
                ' Refer to RHBZ#919201.',
                dataRemaining / Mbytes, lowmark / Mbytes)

        self._next_action(now - lastProgressTime)

        # The action above may have stopped the monitor.
        if self._stop.isSet():
            break

        if jobType != libvirt.VIR_DOMAIN_JOB_NONE:
            self.progress = update_progress(dataRemaining, dataTotal)

            self._vm.log.info('Migration Progress: %s seconds elapsed,'
                              ' %s%% of data processed' %
                              (timeElapsed / 1000, self.progress))
def __init__(self, log):
    """
    Initialize the (single) clientIF instance

    :param log: a log object to be used for this object's logging.
    :type log: :class:`logging.Logger`
    """
    self.vmContainerLock = threading.Lock()
    self._networkSemaphore = threading.Semaphore()
    self._shutdownSemaphore = threading.Semaphore()
    self.log = log
    self._recovery = True
    self._libvirt = libvirtconnection.get()
    self._syncLibvirtNetworks()
    self.channelListener = Listener(self.log)
    self._generationID = str(uuid.uuid4())
    self._initIRS()
    try:
        self.vmContainer = {}
        # Host statistics cover all NICs and bondings.
        ifids = netinfo.nics() + netinfo.bondings()
        ifrates = map(netinfo.speed, ifids)
        self._hostStats = utils.HostStatsThread(
            cif=self, log=log, ifids=ifids, ifrates=ifrates)
        self._hostStats.start()

        # Outgoing migrations are limited by the configuration and by
        # the number of cores, whichever is lower.
        configured_limit = config.getint('vars', 'max_outgoing_migrations')
        mog = min(configured_limit, caps.CpuInfo().cores())
        vm.MigrationSourceThread.setMaxOutgoingMigrations(mog)

        self.lastRemoteAccess = 0
        self._memLock = threading.Lock()
        self._enabled = True
        self.ksmMonitor = ksm.KsmMonitorThread(self)
        self._netConfigDirty = False

        recovery_thread = threading.Thread(
            target=self._recoverExistingVms, name='clientIFinit')
        recovery_thread.start()

        agent_timeout = config.getint('vars', 'guest_agent_timeout')
        self.channelListener.settimeout(agent_timeout)
        self.channelListener.start()
        self.threadLocal = threading.local()
        self.threadLocal.client = ''
    except:
        # Any failure: log, let storage prepare for shutdown, and
        # re-raise the original error.
        self.log.error('failed to init clientIF, '
                       'shutting down storage dispatcher')
        if self.irs:
            self.irs.prepareForShutdown()
        raise
    self._prepareBindings()
def __init__(self, irs, log, scheduler):
    """
    Initialize the (single) clientIF instance

    :param irs: a Dispatcher object to be used as this object's irs.
    :type irs: :class:`vdsm.storage.dispatcher.Dispatcher`
    :param log: a log object to be used for this object's logging.
    :type log: :class:`logging.Logger`
    :param scheduler: stored as ``self._scheduler`` and passed to the
        QEMU guest agent poller.
    """
    self.vmContainerLock = threading.Lock()
    self._networkSemaphore = threading.Semaphore()
    self._shutdownSemaphore = threading.Semaphore()
    self.irs = irs
    if self.irs:
        # The callback receives a proxy of this object rather than the
        # object itself.
        self._contEIOVmsCB = partial(clientIF.contEIOVms, proxy(self))
        self.irs.registerDomainStateChangeCallback(self._contEIOVmsCB)
    self.log = log
    self._recovery = True
    # TODO: The guest agent related code spreads around too much. There is
    # QemuGuestAgentPoller and ChannelListener here and then many instances
    # of GuestAgent per VM in vm.py. This should be refactored and
    # operated by a single object. Ideally the distinction between what is
    # served by QEMU-GA and what is served by oVirt GA should not be
    # visible to the rest of the code.
    self.channelListener = Listener(self.log)
    self.qga_poller = QemuGuestAgentPoller(self, log, scheduler)
    self.mom = None
    self.servers = {}
    self._broker_client = None
    self._subscriptions = defaultdict(list)
    self._scheduler = scheduler
    self._unknown_vm_ids = set()
    self.gluster = gapi.GlusterApi() if _glusterEnabled else None

    try:
        self.vmContainer = {}
        self.lastRemoteAccess = 0
        self._enabled = True
        self._netConfigDirty = False
        self.mom = MomClient(config.get("mom", "socket_path"))
        self.mom.connect()
        secret.clear()
        recovery_thread = concurrent.thread(self._recoverThread,
                                            name='vmrecovery')
        recovery_thread.start()
        agent_timeout = config.getint('vars', 'guest_agent_timeout')
        self.channelListener.settimeout(agent_timeout)
        self.channelListener.start()
        self.qga_poller.start()
        self.threadLocal = threading.local()
        self.threadLocal.client = ''

        host = config.get('addresses', 'management_ip')
        port = config.getint('addresses', 'management_port')

        # When IPv6 is not enabled, fallback to listen on IPv4 address
        try:
            self._createAcceptor(host, port)
        except socket.error as e:
            ipv6_unsupported = (e.errno == errno.EAFNOSUPPORT and
                                host in ('::', '::1'))
            if not ipv6_unsupported:
                raise
            fallback_host = '0.0.0.0'
            self._createAcceptor(fallback_host, port)

        self._prepareHttpServer()
        self._prepareJSONRPCServer()
        self._connectToBroker()
    except:
        # Any failure: log, let storage prepare for shutdown, and
        # re-raise the original error.
        self.log.error('failed to init clientIF, '
                       'shutting down storage dispatcher')
        if self.irs:
            self.irs.prepareForShutdown()
        raise
class MonitorThread(threading.Thread):
    """
    Daemon thread that periodically polls the libvirt job info of a
    migrating VM, publishes the progress and aborts the migration when it
    either exceeds the configured maximum time or stops progressing.

    Monitoring is disabled when the configured
    'migration_monitor_interval' is 0 (or negative).
    """
    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime):
        """
        :param vm: the VM object being migrated; its `log`, `conf` and
                   `_dom` attributes are used.
        :param startTime: migration start time, in time.time() units.
        """
        super(MonitorThread, self).__init__()
        # Not named `_stop`: threading.Thread has a private _stop() method
        # on Python 3 which join() calls; shadowing it with an Event makes
        # join() fail.
        self._stop_event = threading.Event()
        self._vm = vm
        self._startTime = startTime
        self.daemon = True
        self.progress = 0

    @property
    def enabled(self):
        """True when the configured monitoring interval is positive."""
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    def run(self):
        """Thread entry point: monitor the migration if enabled."""
        if self.enabled:
            self.monitor_migration()
        else:
            self._vm.log.debug('migration monitor thread disabled'
                               ' (monitoring interval set to 0)')

    def monitor_migration(self):
        """
        Poll the migration job until stop() is called or the migration
        is aborted by this monitor.
        """
        def calculateProgress(remaining, total):
            # 100 is reported only when nothing remains; otherwise the
            # value is capped at 99. Integer arithmetic on purpose.
            if remaining == 0 and total:
                return 100
            progress = 100 - 100 * remaining // total if total else 0
            return progress if (progress < 100) else 99

        self._vm.log.debug('starting migration monitor thread')

        memSize = int(self._vm.conf['memSize'])
        maxTimePerGiB = config.getint('vars',
                                      'migration_max_time_per_gib_mem')
        # Round up to whole seconds; explicit floor division keeps this an
        # integer regardless of the division semantics in effect.
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
        lastProgressTime = time.time()
        lowmark = None
        progress_timeout = config.getint('vars', 'migration_progress_timeout')

        while not self._stop_event.is_set():
            self._stop_event.wait(self._MIGRATION_MONITOR_INTERVAL)
            if self._stop_event.is_set():
                # stop() was called while we were waiting; do not touch
                # the domain any more.
                break
            (jobType, timeElapsed, _,
             dataTotal, dataProcessed, dataRemaining,
             memTotal, memProcessed, memRemaining,
             fileTotal, fileProcessed, _) = self._vm._dom.jobInfo()
            # from libvirt sources: data* = file* + mem*.
            # docs can be misleading due to misaligned lines.
            abort = False
            now = time.time()
            if 0 < migrationMaxTime < now - self._startTime:
                self._vm.log.warning(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.',
                    now - self._startTime,
                    migrationMaxTime)
                abort = True
            elif (lowmark is None) or (lowmark > dataRemaining):
                # New minimum of remaining data: the migration progressed.
                lowmark = dataRemaining
                lastProgressTime = now
            elif (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warning(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.', now - lastProgressTime)
                abort = True

            if abort:
                self._vm._dom.abortJob()
                self.stop()
                break

            if dataRemaining > lowmark:
                self._vm.log.warning(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).'
                    ' Refer to RHBZ#919201.',
                    dataRemaining // Mbytes, lowmark // Mbytes)

            if jobType == 0:
                # No job reported; keep the previously published progress.
                continue

            self.progress = calculateProgress(dataRemaining, dataTotal)

            self._vm.log.info('Migration Progress: %s seconds elapsed, %s%% of'
                              ' data processed',
                              timeElapsed // 1000, self.progress)

    def stop(self):
        """Ask the monitoring loop to finish."""
        self._vm.log.debug('stopping migration monitor thread')
        self._stop_event.set()
# Command names sent to the QEMU guest agent.
_QEMU_OSINFO_COMMAND = 'guest-get-osinfo'
_QEMU_TIMEZONE_COMMAND = 'guest-get-timezone'
_QEMU_FSINFO_COMMAND = 'guest-get-fsinfo'
_QEMU_DISKS_COMMAND = 'guest-get-disks'

# Field (key) names looked up in guest agent data.
# NOTE(review): presumably keys of the replies to the commands above -
# confirm against the code that uses them.
_HOST_NAME_FIELD = 'host-name'
_OS_ID_FIELD = 'id'
_TIMEZONE_OFFSET_FIELD = 'offset'
_TIMEZONE_ZONE_FIELD = 'zone'
_FS_DISK_FIELD = 'disk'
_FS_DISK_DEVICE_FIELD = 'dev'
_FS_DISK_SERIAL_FIELD = 'serial'

_GUEST_OS_WINDOWS = 'mswindows'

# Sizing and timing knobs, read once from the 'guest_agent' config section
# when this module is imported.
_WORKERS = config.getint('guest_agent', 'periodic_workers')
_TASK_PER_WORKER = config.getint('guest_agent', 'periodic_task_per_worker')
# Total number of tasks: worker count times tasks per worker.
_TASKS = _WORKERS * _TASK_PER_WORKER
_MAX_WORKERS = config.getint('guest_agent', 'max_workers')
_COMMAND_TIMEOUT = config.getint('guest_agent', 'qga_command_timeout')
_INITIAL_INTERVAL = config.getint('guest_agent', 'qga_initial_info_interval')
_TASK_TIMEOUT = config.getint('guest_agent', 'qga_task_timeout')
# NOTE(review): unit is not visible here, presumably seconds - confirm.
_THROTTLING_INTERVAL = 60

from libvirt import \
    VIR_DOMAIN_GUEST_INFO_USERS, \
    VIR_DOMAIN_GUEST_INFO_OS, \
    VIR_DOMAIN_GUEST_INFO_TIMEZONE, \
    VIR_DOMAIN_GUEST_INFO_HOSTNAME, \
    VIR_DOMAIN_GUEST_INFO_FILESYSTEM
class GuestAgent(object):
    """
    Client side of the guest agent channel of a single VM.

    Talks to the agent over a socket using newline-terminated JSON
    messages, keeps the latest reported guest information in
    `self.guestInfo` and forwards commands (lock, login, shutdown, ...)
    to the guest. All data received from the guest is untrusted.
    """
    MAX_MESSAGE_SIZE = 2 ** 20  # 1 MiB for now
    SEEN_SHUTDOWN_TIMEOUT = config.getint('vars', 'sys_shutdown_timeout') * 2

    def __init__(self, socketName, channelListener, log, onStatusChange,
                 qgaCaps, qgaGuestInfo, api_version=None, user='******',
                 ips=''):
        """
        :param socketName: address of the channel socket to connect to.
        :param channelListener: listener object the channel is registered
                                with; also provides the connect timeout.
        :param log: logger used by this object.
        :param onStatusChange: callable invoked (without arguments) when
                               the guest status changes; may be falsy.
        :param qgaCaps: callable returning QEMU-GA capabilities or None.
        :param qgaGuestInfo: callable returning QEMU-GA guest info or None.
        :param api_version: initial API version, capped at the maximum
                            supported version.
        :param user: initial value of the reported user name.
        :param ips: initial value of the reported guest IPs.
        """
        self.effectiveApiVersion = min(
            api_version or _IMPLICIT_API_VERSION_ZERO,
            _MAX_SUPPORTED_API_VERSION)
        self._onStatusChange = onStatusChange
        self.log = log
        self._socketName = socketName
        self._sock = _create_socket()
        self._stopped = True
        self._status = None
        # Goes through the property setter, which also initializes the
        # disk mapping hash.
        self.guestDiskMapping = {}
        self.guestInfo = {
            'username': user,
            'memUsage': 0,
            'guestCPUCount': -1,
            'guestIPs': ips,
            'guestFQDN': '',
            'session': 'Unknown',
            'appsList': (),
            'disksUsage': [],
            'netIfaces': [],
            'memoryStats': {}}
        self._agentTimestamp = 0
        self._channelListener = channelListener
        self._messageState = MessageState.NORMAL
        self.events = GuestAgentEvents(self)
        self._completion_lock = threading.Lock()
        self._completion_events = {}
        self._first_connect = threading.Event()
        self._seen_shutdown = None
        self._qgaCaps = qgaCaps
        self._qgaGuestInfo = qgaGuestInfo

    def has_seen_shutdown(self):
        """
        Return True when no shutdown/startup information was received at
        all, the last seen shutdown flag when the agent was heard from
        within SEEN_SHUTDOWN_TIMEOUT, and False otherwise.
        """
        if self._seen_shutdown is None:
            return True
        diff = time.time() - self._agentTimestamp
        if diff < GuestAgent.SEEN_SHUTDOWN_TIMEOUT:
            return self._seen_shutdown
        return False

    def _on_completion(self, reply_id):
        """Wake up the waiter (if any) registered for `reply_id`."""
        with self._completion_lock:
            event = self._completion_events.pop(reply_id, None)
        if event is not None:
            event.set()

    @property
    def can_reply(self):
        """True if the agent is responsive and supports replies."""
        active = self.isResponsive()
        return active and self.effectiveApiVersion >= _REPLY_CAP_MIN_VERSION

    @contextlib.contextmanager
    def _waitable_message(self, wait_timeout, reply_id):
        """
        Context manager that, after the managed block has run, waits up to
        `wait_timeout` for the completion message with `reply_id`.

        Does not wait when the agent cannot reply or when `wait_timeout`
        is None.
        """
        if self.can_reply and wait_timeout is not None:
            event = threading.Event()
            with self._completion_lock:
                self._completion_events[reply_id] = event
            try:
                yield
                event.wait(wait_timeout)
            finally:
                # Always unregister, also when the managed block raised,
                # so that events do not accumulate.
                with self._completion_lock:
                    self._completion_events.pop(reply_id, None)
        else:
            yield

    @property
    def guestStatus(self):
        return self._status

    @guestStatus.setter
    def guestStatus(self, value):
        oldValue = self._status
        self._status = value
        # Notify only on actual changes.
        if oldValue != value and self._onStatusChange:
            self._onStatusChange()

    @property
    def guestDiskMapping(self):
        return self._guestDiskMapping

    @guestDiskMapping.setter
    def guestDiskMapping(self, value):
        self._guestDiskMapping = value
        if value:
            # sort_keys makes the hash independent of key ordering.
            self._diskMappingHash = hash(json.dumps(value, sort_keys=True))
        else:
            self._diskMappingHash = None

    @property
    def diskMappingHash(self):
        return self._diskMappingHash

    def start(self):
        """Prepare the channel and register it with the listener."""
        self.log.info("Starting connection")
        self._prepare_socket()
        self._channelListener.register(
            self._create,
            self._connect,
            self._onChannelRead,
            self._onChannelTimeout)

    def _handleAPIVersion(self, version):
        """ Handles the API version value from the heartbeat

            If the value `version` is an valid int the highest possible
            API version in common will be determined and set to the
            attribute `self.effectiveApiVersion` if the value has changed.
            If the value changed the `api-version` message will be sent to
            the guest agent to notify it about the changed common API
            version.

            If the value of `version` is not an int, the API version
            support will be disabled by assigning
            _IMPLICIT_API_VERSION_ZERO to `self.effectiveApiVersion`

            Args:
            version - the api version reported by the guest agent
        """
        try:
            commonVersion = int(version)
        except (ValueError, TypeError):
            # TypeError covers JSON values int() cannot convert at all
            # (null, lists, objects); the guest is untrusted.
            self.log.warning("Received invalid version value: %s", version)
            commonVersion = _IMPLICIT_API_VERSION_ZERO
        else:
            commonVersion = max(commonVersion, _IMPLICIT_API_VERSION_ZERO)
            commonVersion = min(commonVersion, _MAX_SUPPORTED_API_VERSION)

        if commonVersion != self.effectiveApiVersion:
            # Only update if the value changed
            self.log.info("Guest API version changed from %d to %d",
                          self.effectiveApiVersion, commonVersion)
            self.effectiveApiVersion = commonVersion
            if commonVersion != _IMPLICIT_API_VERSION_ZERO:
                # Only notify the guest agent if the API was not disabled
                self._forward('api-version', {'apiVersion': commonVersion})

    def _prepare_socket(self):
        supervdsm.getProxy().prepareVmChannel(self._socketName)

    def _create(self):
        """Replace the socket with a fresh one and return its fileno."""
        self._sock.close()
        self._sock = _create_socket()
        return self._sock.fileno()

    def _connect(self):
        """
        Try to connect to the channel socket.

        :returns: True when connected, False otherwise.
        """
        ret = False
        try:
            self._stopped = True
            self.log.debug("Attempting connection to %s", self._socketName)
            result = self._sock.connect_ex(self._socketName)
            # Unblock senders waiting for the first connection attempt,
            # whatever its outcome.
            self._first_connect.set()
            if result == 0:
                self.log.debug("Connected to %s", self._socketName)
                self._messageState = MessageState.NORMAL
                self._clearReadBuffer()
                # Report the _MAX_SUPPORTED_API_VERSION on refresh to enable
                # the other side to see that we support API versioning
                self._forward('refresh',
                              {'apiVersion': _MAX_SUPPORTED_API_VERSION})
                self._stopped = False
                ret = True
            else:
                self.log.debug("Failed to connect to %s with %d",
                               self._socketName, result)
        except socket.error as err:
            self.log.debug("Connection attempt failed: %s", err)
        return ret

    def _forward(self, cmd, args=None):
        """
        Send command `cmd` with arguments `args` to the guest agent.

        :param cmd: name of the command.
        :param args: optional dict of command arguments; it is not
                     modified.
        :raises GuestAgentUnsupportedMessage: when the command requires a
            higher API version than the effective one.
        """
        ver = _MESSAGE_API_VERSION_LOOKUP.get(cmd, _IMPLICIT_API_VERSION_ZERO)
        if ver > self.effectiveApiVersion:
            raise GuestAgentUnsupportedMessage(cmd, ver,
                                               self.effectiveApiVersion)
        self._first_connect.wait(self._channelListener.timeout())
        # Work on a private copy: neither the caller's dict nor a shared
        # default may be mutated, callers run in different threads.
        payload = dict(args) if args else {}
        payload['__name__'] = cmd
        # TODO: encoding is required only on Python 3. Replace with wrapper
        #       hiding this difference.
        message = (json.dumps(payload) + '\n').encode('utf8')
        # TODO: socket is non-blocking, handle possible EAGAIN
        self._sock.sendall(message)
        self.log.debug('sent %r', message)

    def _handleMessage(self, message, args):
        """Update the guest state according to one decoded message."""
        self.log.debug("Guest's message %s: %s", message, args)
        if message == 'heartbeat':
            self.guestInfo['memUsage'] = int(args['free-ram'])
            if 'memory-stat' in args:
                for k in ('mem_total', 'mem_unused', 'mem_buffers',
                          'mem_cached', 'swap_in', 'swap_out', 'pageflt',
                          'majflt'):
                    if k not in args['memory-stat']:
                        continue
                    # Convert the value to string since 64-bit integer is not
                    # supported in XMLRPC
                    self.guestInfo['memoryStats'][k] = str(
                        args['memory-stat'][k])
                    if k == 'mem_unused':
                        self.guestInfo['memoryStats']['mem_free'] = str(
                            args['memory-stat']['mem_unused'])

            if 'apiVersion' in args:
                # The guest agent supports API Versioning
                self._handleAPIVersion(args['apiVersion'])
            elif self.effectiveApiVersion != _IMPLICIT_API_VERSION_ZERO:
                # Older versions of the guest agent (before the introduction
                # of API versioning) do not report this field
                # Disable the API if not already disabled (e.g. after
                # downgrade of the guest agent)
                self.log.debug("API versioning no longer reported by guest.")
                self.effectiveApiVersion = _IMPLICIT_API_VERSION_ZERO
            # Only change the state AFTER all data of the heartbeat has been
            # consumed
            self.guestStatus = vmstatus.UP
            if self._seen_shutdown:
                self._seen_shutdown = False
        elif message == 'host-name':
            self.guestInfo['guestName'] = args['name']
        elif message == 'os-version':
            self.guestInfo['guestOs'] = args['version']
        elif message == 'os-info':
            self.guestInfo['guestOsInfo'] = args
        elif message == 'timezone':
            self.guestInfo['guestTimezone'] = args
        elif message == 'network-interfaces':
            interfaces = []
            old_ips = ''
            for iface in args['interfaces']:
                iface['inet'] = iface.get('inet', [])
                iface['inet6'] = iface.get('inet6', [])
                interfaces.append(iface)
                # Provide the old information which includes
                # only the IP addresses.
                old_ips += ' '.join(iface['inet']) + ' '
            self.guestInfo['netIfaces'] = interfaces
            self.guestInfo['guestIPs'] = old_ips.strip()
        elif message == 'applications':
            self.guestInfo['appsList'] = tuple(args['applications'])
            # Fake QEMU-GA if it is not reported
            if not any(bool(_qga_re.match(x))
                       for x in self.guestInfo['appsList']):
                qga_caps = self._qgaCaps()
                if qga_caps is not None and qga_caps['version'] is not None:
                    # NOTE: this is a tuple
                    self.guestInfo['appsList'] = \
                        self.guestInfo['appsList'] + \
                        ('qemu-guest-agent-%s' % qga_caps['version'],)
        elif message == 'active-user':
            currentUser = args['name']
            if ((currentUser != self.guestInfo['username']) and
                not (currentUser == 'Unknown' and
                     self.guestInfo['username'] == 'None')):
                self.guestInfo['username'] = currentUser
                self.guestInfo['lastLogin'] = time.time()
            self.log.debug("username: %s", repr(self.guestInfo['username']))
        elif message == 'session-logon':
            self.guestInfo['session'] = "UserLoggedOn"
        elif message == 'session-lock':
            self.guestInfo['session'] = "Locked"
        elif message == 'session-unlock':
            self.guestInfo['session'] = "Active"
        elif message == 'session-logoff':
            self.guestInfo['session'] = "LoggedOff"
        elif message == 'uninstalled':
            self.log.debug("guest agent was uninstalled.")
            self.guestInfo['appsList'] = ()
        elif message == 'session-startup':
            self._seen_shutdown = False
            self.log.debug("Guest system is started or restarted.")
        elif message == 'fqdn':
            self.guestInfo['guestFQDN'] = args['fqdn']
        elif message == 'session-shutdown':
            self._seen_shutdown = True
            self.log.debug("Guest system shuts down.")
        elif message == 'containers':
            self.guestInfo['guestContainers'] = args['list']
        elif message == 'disks-usage':
            disks = []
            for disk in args['disks']:
                # Converting to string because XML-RPC doesn't support 64-bit
                # integers.
                disk['total'] = str(disk['total'])
                disk['used'] = str(disk['used'])
                disks.append(disk)
            self.guestInfo['disksUsage'] = disks
            self.guestDiskMapping = args.get('mapping', {})
        elif message == 'number-of-cpus':
            self.guestInfo['guestCPUCount'] = int(args['count'])
        elif message == 'completion':
            self._on_completion(args.pop('reply_id', None))
        else:
            self.log.error('Unknown message type %s', message)

    def stop(self):
        """Unregister the channel from the listener and close the socket."""
        self.log.info("Stopping connection")
        self._stopped = True
        try:
            self._channelListener.unregister(self._sock.fileno())
        except socket.error as e:
            if e.args[0] == errno.EBADF:
                # socket was already closed
                pass
            else:
                raise
        else:
            self._sock.close()

    def isResponsive(self):
        """True if a message was processed within the last 120 seconds."""
        return time.time() - self._agentTimestamp < 120

    def getStatus(self):
        return self.guestStatus

    def getGuestInfo(self):
        """
        Return a copy of the guest information, merged from defaults,
        QEMU-GA data and the data reported over this channel.
        """
        # This is rather hacky, but for now we want to prefer information from
        # oVirt GA over QEMU-GA
        info = {
            'username': '******',
            'session': 'Unknown',
            'memUsage': 0,
            'guestCPUCount': -1,
            'appsList': (),
            'guestIPs': '',
            'guestFQDN': ''}
        qga = self._qgaGuestInfo()
        if qga is not None:
            info.update(qga)
        if self.isResponsive():
            info.update(self.guestInfo)
        else:
            # Agent is silent: keep only the slowly changing values that
            # were reported earlier.
            if len(self.guestInfo['appsList']) > 0:
                info['appsList'] = self.guestInfo['appsList']
            if len(self.guestInfo['guestIPs']) > 0:
                info['guestIPs'] = self.guestInfo['guestIPs']
            if len(self.guestInfo['guestFQDN']) > 0:
                info['guestFQDN'] = self.guestInfo['guestFQDN']
        return utils.picklecopy(info)

    def onReboot(self):
        """Record the reboot: reset the user and remember the last one."""
        self.guestStatus = vmstatus.REBOOT_IN_PROGRESS
        self.guestInfo['lastUser'] = '' + self.guestInfo['username']
        self.guestInfo['username'] = '******'
        self.guestInfo['lastLogout'] = time.time()

    def desktopLock(self):
        """Ask the guest to lock the screen; failures are only logged."""
        try:
            self.log.debug("desktopLock called")
            self._forward("lock-screen")
        except Exception as e:
            if isinstance(e, socket.error) and e.args[0] == errno.EBADF:
                self.log.debug('desktopLock failed - Socket not connected')
                return  # Expected when not connected/closed socket
            self.log.exception("desktopLock failed with unexpected exception")

    def desktopLogin(self, domain, user, password):
        """Ask the guest to log a user in; failures are only logged."""
        try:
            self.log.debug("desktopLogin called")
            if domain != '':
                username = user + '@' + domain
            else:
                username = user
            self._forward('login', {'username': username,
                                    "password": password.value})
        except Exception:
            self.log.exception("desktopLogin failed")

    def desktopLogoff(self, force):
        """Ask the guest to log off; `force` is currently not used."""
        try:
            self.log.debug("desktopLogoff called")
            self._forward('log-off')
        except Exception:
            self.log.exception("desktopLogoff failed")

    def desktopShutdown(self, timeout, msg, reboot):
        """Ask the guest to shut down or reboot; failures are logged."""
        try:
            self.log.debug("desktopShutdown called")
            self._forward('shutdown', {'timeout': timeout, 'message': msg,
                                       'reboot': str(reboot)})
        except Exception:
            self.log.exception("desktopShutdown failed")

    def sendHcCmdToDesktop(self, cmd):
        """Forward an arbitrary command name; failures are only logged."""
        try:
            self.log.debug("sendHcCmdToDesktop('%s')", cmd)
            self._forward(str(cmd))
        except Exception:
            self.log.exception("sendHcCmdToDesktop failed")

    def setNumberOfCPUs(self, count):
        self.log.debug("setNumberOfCPUs('%d') called", count)
        self._forward('set-number-of-cpus', {'count': count})

    def send_lifecycle_event(self, event, **kwargs):
        """
        Send a 'lifecycle-event' message of type `event`; extra keyword
        arguments are added to the message.
        """
        self.log.debug('send_lifecycle_event %s called', event)
        try:
            message = {'type': event}
            message.update(kwargs)
            self._forward('lifecycle-event', message)
        except GuestAgentUnsupportedMessage:
            # This is ok, that guest agent doesn't know yet how to handle
            # the message
            pass
        except socket.error as e:
            self.log.debug("Failed to forward lifecycle-event: %s", e)

    def _onChannelTimeout(self):
        self.guestInfo['memUsage'] = 0
        # Silence is expected while the guest powers down or reboots.
        if self.guestStatus not in (vmstatus.POWERING_DOWN,
                                    vmstatus.REBOOT_IN_PROGRESS):
            self.log.debug("Guest connection timed out")
            self.guestStatus = None

    def _clearReadBuffer(self):
        self._buffer = []
        self._bufferSize = 0

    def _processMessage(self, line):
        """Parse and handle one raw line; parse errors are logged."""
        try:
            (message, args) = self._parseLine(line)
            self._agentTimestamp = time.time()
            self._handleMessage(message, args)
        except ValueError as err:
            self.log.error("%s: %s", err, repr(line))

    def _handleData(self, data):
        """
        Split received bytes into lines, process complete ones and buffer
        the trailing incomplete part, enforcing MAX_MESSAGE_SIZE.
        """
        while (not self._stopped) and b'\n' in data:
            line, data = data.split(b'\n', 1)
            line = b''.join(self._buffer) + line
            self._clearReadBuffer()
            if self._messageState is MessageState.TOO_BIG:
                # This line is the tail of an oversized message whose head
                # was already discarded.
                self._messageState = MessageState.NORMAL
                self.log.warning("Not processing current message because it "
                                 "was too big")
            else:
                self._processMessage(line)

        self._buffer.append(data)
        self._bufferSize += len(data)

        if self._bufferSize >= self.MAX_MESSAGE_SIZE:
            self.log.warning("Discarding buffer with size: %d because the "
                             "message reached maximum size of %d bytes before "
                             "message end was reached.",
                             self._bufferSize, self.MAX_MESSAGE_SIZE)
            self._messageState = MessageState.TOO_BIG
            self._clearReadBuffer()

    def _onChannelRead(self):
        """
        Read everything currently available from the socket.

        :returns: False when the peer disconnected, True otherwise.
        """
        result = True
        try:
            while not self._stopped:
                data = self._sock.recv(2 ** 16)
                # The connection is broken when recv returns no data
                # therefore we're going to set ourself to stopped state
                if not data:
                    self._stopped = True
                    self.log.debug("Disconnected from %s", self._socketName)
                    result = False
                else:
                    self._handleData(data)
        except socket.error as err:
            # "Would block" just means there is nothing more to read now.
            if err.errno not in (errno.EWOULDBLOCK, errno.EAGAIN):
                raise

        return result

    def _parseLine(self, line):
        """
        Decode one raw line into a (message name, arguments) pair.
        """
        # Deal with any bad UTF8 encoding from the (untrusted) guest,
        # by replacing them with the Unicode replacement character
        uniline = line.decode('utf8', 'replace')
        args = json.loads(uniline)
        # Filter out any characters in the untrusted guest response
        # that aren't permitted in XML.  This must be done _after_ the
        # JSON decoding, since otherwise JSON's \u escape decoding
        # could be used to generate the bad characters
        args = _filterObject(args)
        name = args['__name__']
        del args['__name__']
        return (name, args)
class MonitorThread(object):
    """
    Monitors an ongoing migration in a separate thread: publishes the
    progress, sends migration status events and executes the actions of
    the convergence schedule as the migration keeps iterating.

    Monitoring is disabled when the configured
    'migration_monitor_interval' is 0 (or negative).
    """
    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime, conv_schedule):
        """
        :param vm: the VM object being migrated.
        :param startTime: migration start time.
        :param conv_schedule: dict with an 'init' list of actions and a
            'stalling' list of {'limit': ..., 'action': ...} entries; the
            'stalling' list is consumed in place.
        """
        super(MonitorThread, self).__init__()
        self._stop = threading.Event()
        self._vm = vm
        self._dom = DomainAdapter(self._vm)
        self._startTime = startTime
        # Plain attribute of this object; it is not handed over to
        # concurrent.thread() here.
        self.daemon = True
        self.progress = None
        self._conv_schedule = conv_schedule
        self._thread = concurrent.thread(self.run,
                                         name='migmon/' + self._vm.id[:8])

    def start(self):
        self._thread.start()

    def join(self):
        self._thread.join()

    @property
    def enabled(self):
        """True when the configured monitoring interval is positive."""
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    @logutils.traceback()
    def run(self):
        """Thread entry point: monitor the migration if enabled."""
        if self.enabled:
            self._vm.log.debug('starting migration monitor thread')
            try:
                self.monitor_migration()
            except virdomain.NotConnectedError as e:
                # In case the VM is stopped during migration, there is a race
                # between domain disconnection and stopping the monitoring
                # thread.  Then the domain may no longer be connected when
                # monitor_migration loop tries to access it.  That's harmless
                # and shouldn't bubble up, let's just finish the thread.
                self._vm.log.debug('domain disconnected in monitor thread: %s',
                                   e)
            self._vm.log.debug('stopped migration monitor thread')
        else:
            self._vm.log.info('migration monitor thread disabled'
                              ' (monitoring interval set to 0)')

    def monitor_migration(self):
        """
        Run the init actions, then poll the job statistics every
        monitoring interval until stop() is called.
        """
        lowmark = None
        initial_iteration = last_iteration = None

        self._execute_init(self._conv_schedule['init'])

        while not self._stop.isSet():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm.job_stats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)
            if initial_iteration is None:
                # The initial iteration number from libvirt is not
                # fixed, since it may include iterations from
                # previously cancelled migrations.
                initial_iteration = last_iteration = progress.mem_iteration

            self._vm.send_migration_status_event()

            if self._vm.post_copy != PostCopyPhase.NONE:
                # Post-copy mode is a final state of a migration -- it either
                # completes or fails and stops the VM, there is no way to
                # continue with the migration in either case.  So we won't
                # handle any further schedule actions once post-copy is
                # successfully started.  It's still recommended to put the
                # abort action after the post-copy action in the schedule, for
                # the case when it's not possible to switch to the post-copy
                # mode for some reason.
                if self._vm.post_copy == PostCopyPhase.RUNNING:
                    # If post-copy is not RUNNING then we are in the interim
                    # phase (which should be short) between initiating the
                    # post-copy migration and the actual start of the post-copy
                    # migration.  Nothing needs to be done in that case.
                    self._vm.log.debug(
                        'Post-copy migration still in progress: %d',
                        progress.data_remaining)
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                lowmark = progress.data_remaining
            else:
                self._vm.log.warning(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).',
                    progress.data_remaining // MiB,
                    lowmark // MiB)

            if not self._vm.post_copy and\
               progress.mem_iteration > last_iteration:
                last_iteration = progress.mem_iteration
                current_iteration = last_iteration - initial_iteration
                self._vm.log.debug('new iteration: %i', current_iteration)
                self._next_action(current_iteration)

            # An executed action (abort) may have stopped the monitor.
            if self._stop.isSet():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)

    def stop(self):
        """Ask the monitoring loop to finish."""
        self._vm.log.debug('stopping migration monitor thread')
        self._stop.set()

    def _next_action(self, stalling):
        """
        Execute the next 'stalling' action of the schedule if its limit
        was exceeded by `stalling` iterations, and drop it from the
        schedule.
        """
        pending = self._conv_schedule['stalling']
        if not pending:
            # Nothing (left) to execute: either the schedule had no
            # stalling actions or all of them were already consumed.
            self._vm.log.debug(
                'Stalling for %d iterations, '
                'no convergence schedule action left', stalling)
            return

        head = pending[0]

        self._vm.log.debug('Stalling for %d iterations, '
                           'checking to make next action: '
                           '%s', stalling, head)
        if head['limit'] < stalling:
            self._execute_action_with_params(head['action'])
            pending.pop(0)
            self._vm.log.debug('setting conv schedule to: %s',
                               self._conv_schedule)

    def _execute_init(self, init_actions):
        for action_with_params in init_actions:
            self._execute_action_with_params(action_with_params)

    def _execute_action_with_params(self, action_with_params):
        """
        Execute a single schedule action, given as a dict with 'name' and
        (for some actions) 'params'.
        """
        action = str(action_with_params['name'])
        vm = self._vm
        if action == CONVERGENCE_SCHEDULE_SET_DOWNTIME:
            downtime = int(action_with_params['params'][0])
            vm.log.debug('Setting downtime to %d', downtime)
            # pylint: disable=no-member
            self._dom.migrateSetMaxDowntime(downtime, 0)
        elif action == CONVERGENCE_SCHEDULE_POST_COPY:
            if not self._vm.switch_migration_to_post_copy():
                # Do nothing for now; the next action will be invoked after a
                # while
                vm.log.warning('Failed to switch to post-copy migration')
        elif action == CONVERGENCE_SCHEDULE_SET_ABORT:
            vm.log.warning('Aborting migration')
            vm.abort_domjob()
            self.stop()
import types import weakref from functools import partial import six import ioprocess from vdsm import constants from vdsm import utils from vdsm.common.osutils import get_umask from vdsm.config import config from vdsm.storage import constants as sc from vdsm.storage import exception as se DEFAULT_TIMEOUT = config.getint("irs", "process_pool_timeout") IOPROC_IDLE_TIME = config.getint("irs", "max_ioprocess_idle_time") HELPERS_PER_DOMAIN = config.getint("irs", "process_pool_max_slots_per_domain") MAX_QUEUED = config.getint("irs", "process_pool_max_queued_slots_per_domain") _procPoolLock = threading.Lock() _procPool = {} _refProcPool = {} elapsed_time = lambda: os.times()[4] log = logging.getLogger('storage.oop') def stop(): """
def __init__(self, vm, dst='', dstparams='', mode=MODE_REMOTE,
             method=METHOD_ONLINE, tunneled=False, dstqemu='',
             abortOnError=False, consoleAddress=None, compressed=False,
             autoConverge=False, recovery=False, **kwargs):
    """
    Set up the state of a migration source for `vm`.

    Recognized keyword arguments: 'enableGuestEvents', 'downtime',
    'maxBandwidth', 'incomingLimit', 'outgoingLimit' and
    'convergenceSchedule'. 'downtime' and 'maxBandwidth' fall back to the
    configuration when missing or falsy. The worker thread is created
    here but not started.
    """
    self.log = vm.log
    self._vm = vm
    self._dst = dst
    self._mode = mode
    if method != METHOD_ONLINE:
        self.log.warning(
            'migration method %s is deprecated, forced to "online"',
            method)
    self._dstparams = dstparams
    self._enableGuestEvents = kwargs.get('enableGuestEvents', False)
    # TODO: conv.tobool shouldn't be used in this constructor, the
    # conversions should be handled properly in the API layer
    self._consoleAddress = consoleAddress
    self._dstqemu = dstqemu

    # Explicit values win; the configuration is consulted only when the
    # caller passed nothing (or something falsy).
    downtime = kwargs.get('downtime')
    if not downtime:
        downtime = config.get('vars', 'migration_downtime')
    self._downtime = downtime

    bandwidth = kwargs.get('maxBandwidth')
    if not bandwidth:
        bandwidth = config.getint('vars', 'migration_max_bandwidth')
    self._maxBandwidth = int(bandwidth)

    self._incomingLimit = kwargs.get('incomingLimit')
    self._outgoingLimit = kwargs.get('outgoingLimit')
    self.status = {
        'status': {
            'code': 0,
            'message': 'Migration in progress'
        }
    }
    # we need to guard against concurrent updates only
    self._lock = threading.Lock()
    self._progress = 0
    thread_name = 'migsrc/' + self._vm.id[:8]
    self._thread = concurrent.thread(self.run, name=thread_name)
    self._preparingMigrationEvt = True
    self._migrationCanceledEvt = threading.Event()
    self._monitorThread = None
    self._destServer = None

    # An empty schedule is used unless the caller supplied one.
    self._convergence_schedule = {'init': [], 'stalling': []}
    self._use_convergence_schedule = 'convergenceSchedule' in kwargs
    if self._use_convergence_schedule:
        self._convergence_schedule = kwargs['convergenceSchedule']
        self.log.debug('convergence schedule set to: %s',
                       str(self._convergence_schedule))

    self._started = False
    self._failed = False
    self._recovery = recovery

    # Normalize the boolean-like arguments before computing the flags.
    tunneled = conv.tobool(tunneled)
    abortOnError = conv.tobool(abortOnError)
    compressed = conv.tobool(compressed)
    autoConverge = conv.tobool(autoConverge)
    flags = self._calculate_migration_flags(
        tunneled, abortOnError, compressed, autoConverge)
    self._migration_flags = flags
def _regular_run(self):
    """
    Run a (non-recovery) migration from the source side.

    Prepares the destination and the guest, then repeatedly tries to
    start the underlying migration while holding the ongoing-migrations
    semaphore until the migration is marked as started. A busy
    destination (MigrationLimitExceeded) leads to a retry after the
    configured timeout. Any other failure is handed to self._recover().
    The volume monitor is re-enabled in all cases.
    """
    self.log.debug("Starting migration source thread")
    self._recovery = False
    self._update_outgoing_limit()
    try:
        startTime = time.time()
        # Guest agent API version must be updated before _srcDomXML
        # is created to have the version in _srcDomXML metadata.
        self._vm.update_guest_agent_api_version()
        machineParams = self._setupRemoteMachineParams()
        self._setupVdsConnection()
        self._prepareGuest()

        while not self.started:
            try:
                self.log.info("Migration semaphore: acquiring")
                with SourceThread.ongoingMigrations:
                    self.log.info("Migration semaphore: acquired")
                    timeout = config.getint(
                        'vars', 'guest_lifecycle_event_reply_timeout')
                    # Give the guest a chance to react before the
                    # migration / hibernation actually starts.
                    if self.hibernating:
                        self._vm.guestAgent.events.before_hibernation(
                            wait_timeout=timeout)
                    elif self._enableGuestEvents:
                        self._vm.guestAgent.events.before_migration(
                            wait_timeout=timeout)
                    # The migration may have been cancelled while we were
                    # waiting for the semaphore or for the guest.
                    if self._migrationCanceledEvt.is_set():
                        self._raiseAbortError()
                    self.log.debug("migration semaphore acquired "
                                   "after %d seconds",
                                   time.time() - startTime)
                    self._startUnderlyingMigration(
                        time.time(), machineParams
                    )
                    self._finishSuccessfully(machineParams)
            except libvirt.libvirtError as e:
                # Record a specific status for the known error codes; the
                # error is re-raised in any case and handled by the outer
                # handlers.
                if e.get_error_code() == libvirt.VIR_ERR_OPERATION_ABORTED:
                    self.status = response.error(
                        'migCancelErr', message='Migration canceled')
                # This error occurs when hypervisor cannot start
                # the migration. For example, when a domain with the same
                # name already exists on the destination.
                elif e.get_error_code() == \
                        libvirt.VIR_ERR_OPERATION_FAILED:
                    self.status = response.error(
                        'migOperationErr', message=e.get_str2())
                raise
            except MigrationLimitExceeded:
                retry_timeout = config.getint('vars',
                                              'migration_retry_timeout')
                self.log.debug("Migration destination busy. Initiating "
                               "retry in %d seconds.", retry_timeout)
                # Waiting on the cancel event (instead of sleeping) ends
                # the wait early when the migration is cancelled.
                self._migrationCanceledEvt.wait(retry_timeout)
    except MigrationDestinationSetupError as e:
        self._recover(str(e))
        # we know what happened, no need to dump hollow stack trace
    except Exception as e:
        self._recover(str(e))
        self.log.exception("Failed to migrate")
    finally:
        # Enable the volume monitor as it can be disabled during migration.
        self._vm.volume_monitor.enable()
# # You should have received a copy of the GNU General Public # License along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA import threading import logging from yajsonrpc import JsonRpcServer from yajsonrpc.stompreactor import StompReactor from vdsm import executor from vdsm.config import config # TODO test what should be the default values _THREADS = config.getint('rpc', 'worker_threads') _TASK_PER_WORKER = config.getint('rpc', 'tasks_per_worker') _TASKS = _THREADS * _TASK_PER_WORKER class BindingJsonRpc(object): log = logging.getLogger('BindingJsonRpc') def __init__(self, bridge, subs, timeout, scheduler): self._executor = executor.Executor(name="jsonrpc.Executor", workers_count=_THREADS, max_tasks=_TASKS, scheduler=scheduler) self._server = JsonRpcServer(bridge, timeout, self._executor.dispatch) self._reactor = StompReactor(subs)
def rescan():
    """
    Start an asynchronous rescan of the iSCSI sessions and wait for it,
    at most 'scsi_rescan_maximal_timeout' (irs config section) seconds.
    """
    max_wait = config.getint('irs', 'scsi_rescan_maximal_timeout')
    log.debug("Performing SCSI scan, this will take up to %s seconds",
              max_wait)
    # Kick off the scan and block on its handle in one go.
    iscsiadm.session_rescan_async().wait(timeout=max_wait)
def __init__(self, irs, log, scheduler):
    """
    Initialize the (single) clientIF instance

    Sets up the internal state, prepares MOM, starts the VM recovery
    thread and the guest agent channel listener, and finally brings up
    the acceptor and the HTTP / JSON-RPC servers. If any step inside the
    start-up sequence fails, the storage dispatcher (when present) is
    asked to prepare for shutdown and the original exception is re-raised.

    :param irs: a Dispatcher object to be used as this object's irs.
    :type irs: :class:`vdsm.storage.dispatcher.Dispatcher`
    :param log: a log object to be used for this object's logging.
    :type log: :class:`logging.Logger`
    :param scheduler: scheduler object; stored on the instance.
    """
    self.vmContainerLock = threading.Lock()
    self._networkSemaphore = threading.Semaphore()
    self._shutdownSemaphore = threading.Semaphore()
    self.irs = irs
    if self.irs:
        # The callback receives a proxy of self instead of self.
        # NOTE(review): presumably a weakref proxy, so that the registered
        # callback does not keep this object alive - confirm against the
        # import of `proxy`.
        self._contEIOVmsCB = partial(clientIF.contEIOVms, proxy(self))
        self.irs.registerDomainStateChangeCallback(self._contEIOVmsCB)
    self.log = log
    # Stays True until the recovery flow clears it.
    self._recovery = True
    self.channelListener = Listener(self.log)
    self.mom = None
    self.servers = {}
    self._broker_client = None
    self._subscriptions = defaultdict(list)
    self._scheduler = scheduler
    self._unknown_vm_ids = set()
    if _glusterEnabled:
        self.gluster = gapi.GlusterApi()
    else:
        self.gluster = None
    try:
        self.vmContainer = {}
        self.lastRemoteAccess = 0
        self._enabled = True
        self._netConfigDirty = False
        self._prepareMOM()
        secret.clear()
        # VM recovery runs in its own thread so this constructor does not
        # wait for it.
        concurrent.thread(self._recoverThread, name='vmrecovery').start()
        self.channelListener.settimeout(
            config.getint('vars', 'guest_agent_timeout'))
        self.channelListener.start()
        self.threadLocal = threading.local()
        self.threadLocal.client = ''

        host = config.get('addresses', 'management_ip')
        port = config.getint('addresses', 'management_port')

        # When IPv6 is not enabled, fallback to listen on IPv4 address
        try:
            self._createAcceptor(host, port)
        except socket.error as e:
            # Only the IPv6 wildcard / loopback addresses are retried on
            # IPv4; any other failure is propagated.
            if e.errno == errno.EAFNOSUPPORT and host in ('::', '::1'):
                fallback_host = '0.0.0.0'
                self._createAcceptor(fallback_host, port)
            else:
                raise

        self._prepareHttpServer()
        self._prepareJSONRPCServer()
        self._connectToBroker()
    except:
        # Deliberately catches everything: this is cleanup only, the
        # exception is always re-raised below.
        self.log.error('failed to init clientIF, '
                       'shutting down storage dispatcher')
        if self.irs:
            self.irs.prepareForShutdown()
        raise
def _recoverExistingVms(self):
    """
    Recover the VMs known to this host and prepare their disk paths.

    The flow is: recover domains reported by getVDSMDomains() (destroying
    the ones that cannot be recovered), recover VMs found only in recovery
    files, wait until no VM is in WAIT_FOR_LAUNCH, clean old files, wait
    for a connected storage pool, and finally prepare the disk paths of
    every VM in self.vmContainer.

    Any exception escaping the flow is logged and re-raised.
    """
    start_time = utils.monotonic_time()
    try:
        self.log.debug('recovery: started')

        # Starting up libvirt might take long when host under high load,
        # we prefer running this code in external thread to avoid blocking
        # API response.
        # The outgoing migration limit is the smaller of the configured
        # value and the number of CPU cores.
        mog = min(config.getint('vars', 'max_outgoing_migrations'),
                  caps.CpuTopology().cores())
        migration.SourceThread.setMaxOutgoingMigrations(mog)

        # Recover stage 1: domains from libvirt
        doms = getVDSMDomains()
        num_doms = len(doms)
        for idx, v in enumerate(doms):
            vmId = v.UUIDString()
            if self._recoverVm(vmId):
                self.log.info(
                    'recovery [1:%d/%d]: recovered domain %s from libvirt',
                    idx + 1, num_doms, vmId)
            else:
                self.log.info(
                    'recovery [1:%d/%d]: loose domain %s found,'
                    ' killing it.', idx + 1, num_doms, vmId)
                try:
                    v.destroy()
                except libvirt.libvirtError:
                    # Best effort: a failure to destroy is only logged
                    self.log.exception(
                        'recovery [1:%d/%d]: failed to kill loose'
                        ' domain %s', idx + 1, num_doms, vmId)

        # Recover stage 2: domains from recovery files
        # we do this to safely handle VMs which disappeared
        # from the host while VDSM was down/restarting
        rec_vms = self._getVDSMVmsFromRecovery()
        num_rec_vms = len(rec_vms)
        if rec_vms:
            self.log.warning(
                'recovery: found %i VMs from recovery files not'
                ' reported by libvirt. This should not happen!'
                ' Will try to recover them.', num_rec_vms)

        for idx, vmId in enumerate(rec_vms):
            if self._recoverVm(vmId):
                self.log.info(
                    'recovery [2:%d/%d]: recovered domain %s'
                    ' from data file', idx + 1, num_rec_vms, vmId)
            else:
                self.log.warning(
                    'recovery [2:%d/%d]: VM %s failed to recover from data'
                    ' file, reported as Down', idx + 1, num_rec_vms, vmId)

        # recover stage 3: waiting for domains to go up
        # Polls once per second; stops early when self._enabled is cleared.
        while self._enabled:
            launching = sum(
                int(v.lastStatus == vmstatus.WAIT_FOR_LAUNCH)
                for v in self.vmContainer.values())
            if not launching:
                break
            else:
                self.log.info('recovery: waiting for %d domains to go up',
                              launching)
            time.sleep(1)
        self._cleanOldFiles()
        self._recovery = False

        # Now if we have VMs to restore we should wait pool connection
        # and then prepare all volumes.
        # Actually, we need it just to get the resources for future
        # volumes manipulations
        while self._enabled and self.vmContainer and \
                not self.irs.getConnectedStoragePoolsList()['poollist']:
            self.log.info('recovery: waiting for storage pool to go up')
            time.sleep(5)

        vm_objects = self.vmContainer.values()
        num_vm_objects = len(vm_objects)
        for idx, vm_obj in enumerate(vm_objects):
            # Let's recover as much VMs as possible
            try:
                # Do not prepare volumes when system goes down
                if self._enabled:
                    self.log.info(
                        'recovery [%d/%d]: preparing paths for'
                        ' domain %s', idx + 1, num_vm_objects, vm_obj.id)
                    vm_obj.preparePaths(
                        vm_obj.devSpecMapFromConf()[hwclass.DISK])
            except:
                # Catch-all on purpose: one failing VM must not prevent the
                # remaining VMs from being handled.
                self.log.exception("recovery [%d/%d]: failed for vm %s",
                                   idx + 1, num_vm_objects, vm_obj.id)

        self.log.info('recovery: completed in %is',
                      utils.monotonic_time() - start_time)

    except:
        self.log.exception("recovery: failed")
        raise
class MonitorThread(object):
    """
    Monitor a running migration from a helper thread.

    Every _MIGRATION_MONITOR_INTERVAL seconds the domain job stats are
    polled and, depending on ``use_conv_schedule``, either the actions of
    the convergence schedule are applied or the legacy timeout/stall based
    abort logic is used.  A monitoring interval of 0 (or less) disables the
    monitor.
    """

    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime, conv_schedule, use_conv_schedule):
        """
        :param vm: the VM being migrated; its log, id and domain are used.
        :param startTime: time the migration started, compared against
            time.time() to enforce the maximum migration time.
        :param conv_schedule: dict with an 'init' list of actions and a
            'stalling' list of {'limit': ..., 'action': ...} entries.
        :param use_conv_schedule: when true the 'stalling' actions drive the
            migration, otherwise the legacy downtime/abort logic is used.
        """
        super(MonitorThread, self).__init__()
        self._stop = threading.Event()
        self._vm = vm
        self._startTime = startTime
        self.daemon = True
        self.progress = None
        self._conv_schedule = conv_schedule
        self._use_conv_schedule = use_conv_schedule
        self.downtime_thread = _FakeThreadInterface()
        self._thread = concurrent.thread(self.run,
                                         name='migmon/' + self._vm.id[:8])

    def start(self):
        """Start the underlying monitor thread."""
        self._thread.start()

    def join(self):
        """Wait for the underlying monitor thread to finish."""
        self._thread.join()

    @property
    def enabled(self):
        """True when the configured monitoring interval is positive."""
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    @logutils.traceback()
    def run(self):
        """Thread entry point: monitor the migration unless disabled."""
        if self.enabled:
            self._vm.log.debug('starting migration monitor thread')
            try:
                self.monitor_migration()
            except virdomain.NotConnectedError as e:
                # In case the VM is stopped during migration, there is a race
                # between domain disconnection and stopping the monitoring
                # thread.  Then the domain may no longer be connected when
                # monitor_migration loop tries to access it.  That's harmless
                # and shouldn't bubble up, let's just finish the thread.
                self._vm.log.debug('domain disconnected in monitor thread: %s',
                                   e)
            finally:
                self.downtime_thread.stop()
                if self.downtime_thread.is_alive():
                    # on very short migrations, the downtime thread
                    # may not be started at all.
                    self.downtime_thread.join()
            self._vm.log.debug('stopped migration monitor thread')
        else:
            self._vm.log.info('migration monitor thread disabled'
                              ' (monitoring interval set to 0)')

    def monitor_migration(self):
        """
        Poll the migration job until stop() is called or it is aborted.

        Without a convergence schedule the migration is aborted when it
        exceeds the maximum time derived from the VM memory size, or when
        the remaining data did not reach a new low for longer than
        'migration_progress_timeout' seconds.
        """
        memSize = self._vm.mem_size_mb()
        maxTimePerGiB = config.getint('vars',
                                      'migration_max_time_per_gib_mem')
        # Round up to whole seconds; memSize is in MiB
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
        progress_timeout = config.getint('vars', 'migration_progress_timeout')
        lastProgressTime = time.time()
        lowmark = None
        lastDataRemaining = None
        iterationCount = 0

        self._execute_init(self._conv_schedule['init'])
        if not self._use_conv_schedule:
            self._vm.log.debug('setting initial migration downtime')
            self.downtime_thread.set_initial_downtime()

        while not self._stop.is_set():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm._dom.jobStats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)
            self._vm.send_migration_status_event()

            now = time.time()
            if self._vm.post_copy != PostCopyPhase.NONE:
                # Post-copy mode is a final state of a migration -- it either
                # completes or fails and stops the VM, there is no way to
                # continue with the migration in either case.  So we won't
                # handle any further schedule actions once post-copy is
                # successfully started.  It's still recommended to put the
                # abort action after the post-copy action in the schedule, for
                # the case when it's not possible to switch to the post-copy
                # mode for some reason.
                if self._vm.post_copy == PostCopyPhase.RUNNING:
                    # If post-copy is not RUNNING then we are in the interim
                    # phase (which should be short) between initiating the
                    # post-copy migration and the actual start of the post-copy
                    # migration.  Nothing needs to be done in that case.
                    self._vm.log.debug(
                        'Post-copy migration still in progress: %d',
                        progress.data_remaining)
            elif not self._use_conv_schedule and\
                    (0 < migrationMaxTime < now - self._startTime):
                self._vm.log.warning(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.',
                    now - self._startTime,
                    migrationMaxTime)
                self._vm._dom.abortJob()
                self.stop()
                break
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                # New low of remaining data: the migration is progressing
                lowmark = progress.data_remaining
                lastProgressTime = now
            else:
                self._vm.log.warning(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).',
                    progress.data_remaining // Mbytes,
                    lowmark // Mbytes)

            # Remaining data growing between two polls marks a new iteration
            if not self._vm.post_copy and\
                    lastDataRemaining is not None and\
                    lastDataRemaining < progress.data_remaining:
                iterationCount += 1
                self._vm.log.debug('new iteration detected: %i',
                                   iterationCount)
                if self._use_conv_schedule:
                    self._next_action(iterationCount)
                elif iterationCount == 1:
                    # it does not make sense to do any adjustments before
                    # first iteration.
                    self.downtime_thread.start()

            lastDataRemaining = progress.data_remaining

            if not self._use_conv_schedule and\
                    (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warning(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.', now - lastProgressTime)
                self._vm._dom.abortJob()
                self.stop()

            if self._stop.is_set():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)

    def stop(self):
        """Ask the monitoring loop to finish."""
        self._vm.log.debug('stopping migration monitor thread')
        self._stop.set()

    def _next_action(self, stalling):
        """
        Execute the next 'stalling' action if its limit was exceeded.

        :param stalling: number of iterations detected so far.
        """
        pending = self._conv_schedule['stalling']
        if not pending:
            # Every scheduled action was already executed and popped; there
            # is nothing to look at, so keep monitoring instead of failing
            # on the empty list.
            self._vm.log.debug(
                'Stalling for %d iterations, '
                'no convergence schedule actions left', stalling)
            return

        head = pending[0]

        self._vm.log.debug(
            'Stalling for %d iterations, '
            'checking to make next action: '
            '%s', stalling, head)
        if head['limit'] < stalling:
            self._execute_action_with_params(head['action'])
            pending.pop(0)
            self._vm.log.debug('setting conv schedule to: %s',
                               self._conv_schedule)

    def _execute_init(self, init_actions):
        """Execute every action of the schedule's 'init' list, in order."""
        for action_with_params in init_actions:
            self._execute_action_with_params(action_with_params)

    def _execute_action_with_params(self, action_with_params):
        """
        Execute a single schedule action.

        :param action_with_params: dict with the action 'name' and, for the
            downtime action, a 'params' list whose first item is the
            downtime.  Unknown action names are ignored.
        """
        action = str(action_with_params['name'])
        vm = self._vm
        if action == CONVERGENCE_SCHEDULE_SET_DOWNTIME:
            downtime = int(action_with_params['params'][0])
            vm.log.debug('Setting downtime to %d', downtime)
            vm._dom.migrateSetMaxDowntime(downtime, 0)
        elif action == CONVERGENCE_SCHEDULE_POST_COPY:
            if not self._vm.switch_migration_to_post_copy():
                # Do nothing for now; the next action will be invoked after a
                # while
                vm.log.warning('Failed to switch to post-copy migration')
        elif action == CONVERGENCE_SCHEDULE_SET_ABORT:
            vm.log.warning('Aborting migration')
            vm._dom.abortJob()
            self.stop()
class ImageResourceFactory(rm.SimpleResourceFactory):
    """
    Resource factory producing image resources of one storage domain.
    """
    storage_repository = config.get('irs', 'repository')
    # The config value is kept in milliseconds for backward compatibility
    # reasons, while resource timeouts are expressed in seconds.
    resource_default_timeout = config.getint('irs',
                                             'prepare_image_timeout') / 1000.0

    def __init__(self, sdUUID):
        rm.SimpleResourceFactory.__init__(self)
        self.sdUUID = sdUUID
        self.volumeResourcesNamespace = rm.getNamespace(
            sc.VOLUME_NAMESPACE, self.sdUUID)

    def __getResourceCandidatesList(self, resourceName, lockType):
        """
        Acquire and return the volume locks needed for an image.

        The returned list holds the template lock first (when the image has
        a template) followed by the locks of the chain volumes sorted by
        UUID.  An empty list is returned when the image does not exist in
        the domain.  If an acquisition fails, the locks taken so far are
        released and the error is re-raised.
        """
        domain = sdCache.produce(sdUUID=self.sdUUID)
        repo_path = os.path.join(self.storage_repository,
                                 domain.getPools()[0])

        # Fetch the volume chain of the image
        try:
            chain = image.Image(repo_path).getChain(sdUUID=self.sdUUID,
                                                    imgUUID=resourceName)
        except se.ImageDoesNotExistInSD:
            log.debug("Image %s does not exist in domain %s",
                      resourceName, self.sdUUID)
            return []

        # Find out whether the chain sits on top of a template, is the
        # template itself, or is a standalone image.
        template = None
        base_vol = chain[0]
        parent_vol = base_vol.getParentVolume()
        if parent_vol:
            template = parent_vol.volUUID
        elif base_vol.isShared():
            # The image is the template itself, there are no other volumes
            # in the chain.
            template = base_vol.volUUID
            del chain[:]

        volUUIDChain = sorted(vol.volUUID for vol in chain)

        # Activate all volumes in chain at once.
        # Later steps of the flow will try to activate them again, which is
        # harmless since they are already active.
        # TODO Fix resource framework to hold images, instead of specific vols.
        # This assumes that chains can not spread into more than one SD.
        if domain.__class__.__name__ == "BlockStorageDomain":
            lvm.activateLVs(self.sdUUID, volUUIDChain)

        # Lock requests, in acquisition order:
        # - the template: 'lockType' for the template's own image, always
        #   shared for an image based on the template
        # - every volume of the chain with 'lockType'
        lock_requests = []
        if template:
            template_lock = rm.SHARED if len(volUUIDChain) > 0 else lockType
            lock_requests.append((template, template_lock))
        lock_requests.extend((volUUID, lockType) for volUUID in volUUIDChain)

        acquired = []

        def release_acquired():
            # Give back the template/volume locks taken so far
            for held in acquired:
                held.release()

        try:
            for res_name, res_lock in lock_requests:
                acquired.append(rm.acquireResource(
                    self.volumeResourcesNamespace, res_name, res_lock,
                    timeout=self.resource_default_timeout))
        except (rm.RequestTimedOutError, se.ResourceAcqusitionFailed) as e:
            log.debug("Cannot acquire volume resource (%s)", str(e))
            release_acquired()
            raise
        except Exception:
            log.debug("Cannot acquire volume resource", exc_info=True)
            release_acquired()
            raise

        return acquired

    def createResource(self, resourceName, lockType):
        """Build an ImageResource holding the locks of the image volumes."""
        locks = self.__getResourceCandidatesList(resourceName, lockType)
        return ImageResource(locks)
def monitor_migration(self):
    """
    Poll the migration job until the stop event is set or it is aborted.

    Each round waits up to _MIGRATION_MONITOR_INTERVAL seconds on the stop
    event, reads the domain job stats and updates self.progress.  Without a
    convergence schedule the migration is aborted when it exceeds the
    maximum time derived from the VM memory size, or when the remaining
    data did not reach a new low for longer than
    'migration_progress_timeout' seconds.  With a convergence schedule,
    every detected iteration is handed to self._next_action().
    """
    memSize = self._vm.mem_size_mb()
    maxTimePerGiB = config.getint('vars',
                                  'migration_max_time_per_gib_mem')
    # Round up to whole seconds; memSize is in MiB
    migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
    progress_timeout = config.getint('vars', 'migration_progress_timeout')
    lastProgressTime = time.time()
    lowmark = None
    lastDataRemaining = None
    iterationCount = 0

    self._execute_init(self._conv_schedule['init'])
    if not self._use_conv_schedule:
        self._vm.log.debug('setting initial migration downtime')
        self.downtime_thread.set_initial_downtime()

    while not self._stop.isSet():
        stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
        if stopped:
            break

        job_stats = self._vm._dom.jobStats()
        # It may happen that the migration did not start yet
        # so we'll keep waiting
        if not ongoing(job_stats):
            continue

        progress = Progress.from_job_stats(job_stats)
        self._vm.send_migration_status_event()

        now = time.time()
        if self._vm.post_copy != PostCopyPhase.NONE:
            # Post-copy mode is a final state of a migration -- it either
            # completes or fails and stops the VM, there is no way to
            # continue with the migration in either case.  So we won't
            # handle any further schedule actions once post-copy is
            # successfully started.  It's still recommended to put the
            # abort action after the post-copy action in the schedule, for
            # the case when it's not possible to switch to the post-copy
            # mode for some reason.
            if self._vm.post_copy == PostCopyPhase.RUNNING:
                # If post-copy is not RUNNING then we are in the interim
                # phase (which should be short) between initiating the
                # post-copy migration and the actual start of the post-copy
                # migration.  Nothing needs to be done in that case.
                self._vm.log.debug(
                    'Post-copy migration still in progress: %d',
                    progress.data_remaining)
        elif not self._use_conv_schedule and\
                (0 < migrationMaxTime < now - self._startTime):
            self._vm.log.warning(
                'The migration took %d seconds which is '
                'exceeding the configured maximum time '
                'for migrations of %d seconds. The '
                'migration will be aborted.',
                now - self._startTime,
                migrationMaxTime)
            self._vm._dom.abortJob()
            self.stop()
            break
        elif (lowmark is None) or (lowmark > progress.data_remaining):
            # New low of remaining data: the migration is progressing
            lowmark = progress.data_remaining
            lastProgressTime = now
        else:
            self._vm.log.warning(
                'Migration stalling: remaining (%sMiB)'
                ' > lowmark (%sMiB).',
                progress.data_remaining // Mbytes,
                lowmark // Mbytes)

        # Remaining data growing between two polls marks a new iteration
        if not self._vm.post_copy and\
                lastDataRemaining is not None and\
                lastDataRemaining < progress.data_remaining:
            iterationCount += 1
            self._vm.log.debug('new iteration detected: %i',
                               iterationCount)
            if self._use_conv_schedule:
                self._next_action(iterationCount)
            elif iterationCount == 1:
                # it does not make sense to do any adjustments before
                # first iteration.
                self.downtime_thread.start()

        lastDataRemaining = progress.data_remaining

        if not self._use_conv_schedule and\
                (now - lastProgressTime) > progress_timeout:
            # Migration is stuck, abort
            self._vm.log.warning(
                'Migration is stuck: Hasn\'t progressed in %s seconds. '
                'Aborting.', now - lastProgressTime)
            self._vm._dom.abortJob()
            self.stop()

        if self._stop.isSet():
            break

        self.progress = progress
        self._vm.log.info('%s', progress)
from vdsm.common.define import NORMAL
from vdsm.common.network.address import normalize_literal_addr
from vdsm.common.units import MiB
from vdsm.virt.utils import DynamicBoundedSemaphore
from vdsm.virt import virdomain
from vdsm.virt import vmexitreason
from vdsm.virt import vmstatus


# Migration modes
MODE_REMOTE = 'remote'
MODE_FILE = 'file'


# Migration methods
METHOD_ONLINE = 'online'

# Semaphore sized from the 'max_incoming_migrations' option; max() keeps the
# bound at 1 or more even when the option is configured as 0 or negative.
incomingMigrations = DynamicBoundedSemaphore(
    max(1, config.getint('vars', 'max_incoming_migrations')))


# Names of the actions that may appear in a convergence schedule
CONVERGENCE_SCHEDULE_SET_DOWNTIME = "setDowntime"
CONVERGENCE_SCHEDULE_POST_COPY = "postcopy"
CONVERGENCE_SCHEDULE_SET_ABORT = "abort"


# NOTE(review): the users of ADDRESS and PORT are outside this view;
# presumably defaults for reaching the destination host - confirm.
ADDRESS = '0'
PORT = 54321


class MigrationDestinationSetupError(RuntimeError):
    """
    Failed to create migration destination VM.
    """
MASTERLV_SIZE = "1024" # In MiB = 2 ** 20 = 1024 ** 2 => 1GiB BlockSDVol = namedtuple("BlockSDVol", "name, image, parent") log = logging.getLogger("Storage.BlockSD") # FIXME: Make this calculated from something logical RESERVED_METADATA_SIZE = 40 * (2 ** 20) RESERVED_MAILBOX_SIZE = MAILBOX_SIZE * clusterlock.MAX_HOST_ID METADATA_BASE_SIZE = 378 # VG's min metadata threshold is 20% VG_MDA_MIN_THRESHOLD = 0.2 # VG's metadata size in MiB VG_METADATASIZE = 128 MAX_PVS_LIMIT = 10 # BZ#648051 MAX_PVS = config.getint('irs', 'maximum_allowed_pvs') if MAX_PVS > MAX_PVS_LIMIT: log.warning("maximum_allowed_pvs = %d ignored. MAX_PVS = %d", MAX_PVS, MAX_PVS_LIMIT) MAX_PVS = MAX_PVS_LIMIT PVS_METADATA_SIZE = MAX_PVS * 142 SD_METADATA_SIZE = 2048 DEFAULT_BLOCKSIZE = 512 DMDK_VGUUID = "VGUUID" DMDK_PV_REGEX = re.compile(r"^PV\d+$") DMDK_LOGBLKSIZE = "LOGBLKSIZE" DMDK_PHYBLKSIZE = "PHYBLKSIZE"
def getMaximumSupportedDomains(self):
    """
    Return the 'maximum_domains_in_pool' option of the 'irs' configuration
    section as an integer.
    """
    max_domains = config.getint("irs", "maximum_domains_in_pool")
    return max_domains
from vdsm.storage import sd
from vdsm.storage.sdm.api import create_volume
from vdsm.storage.volumemetadata import VolumeMetadata


class ExpectedFailure(Exception):
    """Raised on purpose by failure() so tests can tell it from real errors."""
    pass


def failure(*args, **kwargs):
    """Accept any arguments and always raise ExpectedFailure."""
    raise ExpectedFailure()


MB = 1024**2  # bytes in one MiB

VOL_SIZE = 1073741824  # 1024**3 bytes
# The 'volume_utilization_chunk_mb' option converted to bytes
BLOCK_INITIAL_CHUNK_SIZE = MB * config.getint("irs",
                                              "volume_utilization_chunk_mb")

# Per-format parameter tuples, keyed by volume format
BASE_PARAMS = {
    sc.RAW_FORMAT: (VOL_SIZE, sc.RAW_FORMAT,
                    image.SYSTEM_DISK_TYPE, 'raw_volume'),
    sc.COW_FORMAT: (VOL_SIZE, sc.COW_FORMAT,
                    image.SYSTEM_DISK_TYPE, 'cow_volume')
}


@expandPermutations
class VolumeArtifactsTestsMixin(object):

    def setUp(self):
        self.img_id = make_uuid()
        self.vol_id = make_uuid()

    def test_state_missing(self):
def monitor_migration(self):
    """
    Poll the migration job until the stop event is set or it is aborted.

    Each round waits up to _MIGRATION_MONITOR_INTERVAL seconds on the stop
    event, reads the domain job stats and updates self.progress.  Without a
    convergence schedule the migration is aborted when it exceeds the
    maximum time derived from the VM memory size, or when the remaining
    data did not reach a new low for longer than
    'migration_progress_timeout' seconds.  With a convergence schedule,
    every detected iteration is handed to self._next_action().
    """
    memSize = int(self._vm.conf['memSize'])
    maxTimePerGiB = config.getint('vars',
                                  'migration_max_time_per_gib_mem')
    # Floor division keeps the limit an integer number of seconds on every
    # Python version (rounded up thanks to the + 1023).
    migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
    progress_timeout = config.getint('vars', 'migration_progress_timeout')
    lastProgressTime = time.time()
    lowmark = None
    lastDataRemaining = None
    iterationCount = 0

    self._execute_init(self._conv_schedule['init'])
    if not self._use_conv_schedule:
        self._vm.log.debug('setting initial migration downtime')
        self.downtime_thread.set_initial_downtime()

    while not self._stop.isSet():
        stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
        if stopped:
            break

        job_stats = self._vm._dom.jobStats()
        # It may happen that the migration did not start yet
        # so we'll keep waiting
        if not ongoing(job_stats):
            continue

        progress = Progress.from_job_stats(job_stats)

        now = time.time()
        if not self._use_conv_schedule and\
                (0 < migrationMaxTime < now - self._startTime):
            self._vm.log.warning(
                'The migration took %d seconds which is '
                'exceeding the configured maximum time '
                'for migrations of %d seconds. The '
                'migration will be aborted.',
                now - self._startTime,
                migrationMaxTime)
            self._vm._dom.abortJob()
            self.stop()
            break
        elif (lowmark is None) or (lowmark > progress.data_remaining):
            # New low of remaining data: the migration is progressing
            lowmark = progress.data_remaining
            lastProgressTime = now
        else:
            # Whole MiB values, so the message does not show float noise
            self._vm.log.warning(
                'Migration stalling: remaining (%sMiB)'
                ' > lowmark (%sMiB).'
                ' Refer to RHBZ#919201.',
                progress.data_remaining // Mbytes,
                lowmark // Mbytes)

        # Remaining data growing between two polls marks a new iteration
        if lastDataRemaining is not None and\
                lastDataRemaining < progress.data_remaining:
            iterationCount += 1
            self._vm.log.debug('new iteration detected: %i',
                               iterationCount)
            if self._use_conv_schedule:
                self._next_action(iterationCount)
            elif iterationCount == 1:
                # it does not make sense to do any adjustments before
                # first iteration.
                self.downtime_thread.start()

        lastDataRemaining = progress.data_remaining

        if not self._use_conv_schedule and\
                (now - lastProgressTime) > progress_timeout:
            # Migration is stuck, abort
            self._vm.log.warning(
                'Migration is stuck: Hasn\'t progressed in %s seconds. '
                'Aborting.', now - lastProgressTime)
            self._vm._dom.abortJob()
            self.stop()

        if self._stop.isSet():
            break

        self.progress = progress
        self._vm.log.info('%s', progress)
def __init__(self, vm, dst='', dstparams='',
             mode=MODE_REMOTE, method=METHOD_ONLINE,
             tunneled=False, dstqemu='', abortOnError=False,
             consoleAddress=None, compressed=False,
             autoConverge=False, recovery=False, encrypted=False,
             **kwargs):
    """
    Set up the state of an outgoing migration of ``vm``.

    The worker thread is created here but not started.

    Besides the named parameters, these keyword arguments are read:
    'enableGuestEvents' (default False), 'maxBandwidth' (falls back to the
    'migration_max_bandwidth' option when missing or falsy),
    'incomingLimit', 'outgoingLimit', 'convergenceSchedule' and, when no
    convergence schedule is given, 'downtime'.

    ``method`` is accepted but not used by this constructor.  ``tunneled``,
    ``abortOnError``, ``compressed`` and ``autoConverge`` are converted
    with conv.tobool() before the migration flags are calculated.
    """
    self.log = vm.log
    self._vm = vm
    self._dom = DomainAdapter(self._vm)
    self._dst = dst
    self._mode = mode
    self._dstparams = dstparams
    self._enableGuestEvents = kwargs.get('enableGuestEvents', False)
    # TODO: conv.tobool shouldn't be used in this constructor, the
    # conversions should be handled properly in the API layer
    self._consoleAddress = consoleAddress
    self._dstqemu = dstqemu
    self._encrypted = encrypted
    # 'or' makes a missing, None or 0 'maxBandwidth' use the configured value
    self._maxBandwidth = int(
        kwargs.get('maxBandwidth') or
        config.getint('vars', 'migration_max_bandwidth')
    )
    self._incomingLimit = kwargs.get('incomingLimit')
    self._outgoingLimit = kwargs.get('outgoingLimit')
    self.status = {
        'status': {
            'code': 0,
            'message': 'Migration in progress'
        }
    }
    # we need to guard against concurrent updates only
    self._lock = threading.Lock()
    self._progress = 0
    self._thread = concurrent.thread(
        self.run, name='migsrc/' + self._vm.id[:8])
    # Despite the 'Evt' suffix this is a plain boolean, not an Event
    self._preparingMigrationEvt = True
    self._migrationCanceledEvt = threading.Event()
    self._monitorThread = None
    self._destServer = None
    self._legacy_payload_path = None
    if 'convergenceSchedule' in kwargs:
        self._convergence_schedule = kwargs['convergenceSchedule']
    else:
        # Needed for Engine < 4.3 or when legacy migration is used
        # as a supposedly rare fallback in Engine >= 4.3.
        self._convergence_schedule = \
            self._legacy_convergence_schedule(kwargs.get('downtime'))
        self.log.info(
            'using a computed convergence schedule for '
            'a legacy migration: %s', self._convergence_schedule)
    self.log.debug('convergence schedule set to: %s',
                   str(self._convergence_schedule))
    self._started = False
    self._failed = False
    self._recovery = recovery
    tunneled = conv.tobool(tunneled)
    abortOnError = conv.tobool(abortOnError)
    compressed = conv.tobool(compressed)
    autoConverge = conv.tobool(autoConverge)
    # 'encrypted' is passed through as received, without conv.tobool()
    self._migration_flags = self._calculate_migration_flags(
        tunneled, abortOnError, compressed, autoConverge, encrypted
    )
class MonitorThread(object):
    """
    Monitor a running migration from a helper thread.

    Every _MIGRATION_MONITOR_INTERVAL seconds the domain job stats are
    polled and, depending on ``use_conv_schedule``, either the actions of
    the convergence schedule are applied or the legacy timeout/stall based
    abort logic is used.  A monitoring interval of 0 (or less) disables the
    monitor.
    """

    _MIGRATION_MONITOR_INTERVAL = config.getint(
        'vars', 'migration_monitor_interval')  # seconds

    def __init__(self, vm, startTime, conv_schedule, use_conv_schedule):
        """
        :param vm: the VM being migrated; its log, conf and domain are used.
        :param startTime: time the migration started, compared against
            time.time() to enforce the maximum migration time.
        :param conv_schedule: dict with an 'init' list of actions and a
            'stalling' list of {'limit': ..., 'action': ...} entries.
        :param use_conv_schedule: when true the 'stalling' actions drive the
            migration, otherwise the legacy downtime/abort logic is used.
        """
        super(MonitorThread, self).__init__()
        self._stop = threading.Event()
        self._vm = vm
        self._startTime = startTime
        self.daemon = True
        self.progress = None
        self._conv_schedule = conv_schedule
        self._use_conv_schedule = use_conv_schedule
        self.downtime_thread = _FakeThreadInterface()
        self._thread = concurrent.thread(self.run)

    def start(self):
        """Start the underlying monitor thread."""
        self._thread.start()

    def join(self):
        """Wait for the underlying monitor thread to finish."""
        self._thread.join()

    @property
    def enabled(self):
        """True when the configured monitoring interval is positive."""
        return MonitorThread._MIGRATION_MONITOR_INTERVAL > 0

    @utils.traceback()
    def run(self):
        """Thread entry point: monitor the migration unless disabled."""
        if self.enabled:
            self._vm.log.debug('starting migration monitor thread')
            try:
                self.monitor_migration()
            finally:
                self.downtime_thread.stop()
                if self.downtime_thread.is_alive():
                    # on very short migrations, the downtime thread
                    # may not be started at all.
                    self.downtime_thread.join()
            self._vm.log.debug('stopped migration monitor thread')
        else:
            self._vm.log.info('migration monitor thread disabled'
                              ' (monitoring interval set to 0)')

    def monitor_migration(self):
        """
        Poll the migration job until stop() is called or it is aborted.

        Without a convergence schedule the migration is aborted when it
        exceeds the maximum time derived from the VM memory size, or when
        the remaining data did not reach a new low for longer than
        'migration_progress_timeout' seconds.
        """
        memSize = int(self._vm.conf['memSize'])
        maxTimePerGiB = config.getint('vars',
                                      'migration_max_time_per_gib_mem')
        # Floor division keeps the limit an integer number of seconds on
        # every Python version (rounded up thanks to the + 1023).
        migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
        progress_timeout = config.getint('vars', 'migration_progress_timeout')
        lastProgressTime = time.time()
        lowmark = None
        lastDataRemaining = None
        iterationCount = 0

        self._execute_init(self._conv_schedule['init'])
        if not self._use_conv_schedule:
            self._vm.log.debug('setting initial migration downtime')
            self.downtime_thread.set_initial_downtime()

        while not self._stop.is_set():
            stopped = self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
            if stopped:
                break

            job_stats = self._vm._dom.jobStats()
            # It may happen that the migration did not start yet
            # so we'll keep waiting
            if not ongoing(job_stats):
                continue

            progress = Progress.from_job_stats(job_stats)

            now = time.time()
            if not self._use_conv_schedule and\
                    (0 < migrationMaxTime < now - self._startTime):
                self._vm.log.warning(
                    'The migration took %d seconds which is '
                    'exceeding the configured maximum time '
                    'for migrations of %d seconds. The '
                    'migration will be aborted.',
                    now - self._startTime,
                    migrationMaxTime)
                self._vm._dom.abortJob()
                self.stop()
                break
            elif (lowmark is None) or (lowmark > progress.data_remaining):
                # New low of remaining data: the migration is progressing
                lowmark = progress.data_remaining
                lastProgressTime = now
            else:
                # Whole MiB values, so the message does not show float noise
                self._vm.log.warning(
                    'Migration stalling: remaining (%sMiB)'
                    ' > lowmark (%sMiB).'
                    ' Refer to RHBZ#919201.',
                    progress.data_remaining // Mbytes,
                    lowmark // Mbytes)

            # Remaining data growing between two polls marks a new iteration
            if lastDataRemaining is not None and\
                    lastDataRemaining < progress.data_remaining:
                iterationCount += 1
                self._vm.log.debug('new iteration detected: %i',
                                   iterationCount)
                if self._use_conv_schedule:
                    self._next_action(iterationCount)
                elif iterationCount == 1:
                    # it does not make sense to do any adjustments before
                    # first iteration.
                    self.downtime_thread.start()

            lastDataRemaining = progress.data_remaining

            if not self._use_conv_schedule and\
                    (now - lastProgressTime) > progress_timeout:
                # Migration is stuck, abort
                self._vm.log.warning(
                    'Migration is stuck: Hasn\'t progressed in %s seconds. '
                    'Aborting.', now - lastProgressTime)
                self._vm._dom.abortJob()
                self.stop()

            if self._stop.is_set():
                break

            self.progress = progress
            self._vm.log.info('%s', progress)

    def stop(self):
        """Ask the monitoring loop to finish."""
        self._vm.log.debug('stopping migration monitor thread')
        self._stop.set()

    def _next_action(self, stalling):
        """
        Execute the next 'stalling' action if its limit was exceeded.

        :param stalling: number of iterations detected so far.
        """
        pending = self._conv_schedule['stalling']
        if not pending:
            # Every scheduled action was already executed and popped; there
            # is nothing to look at, so keep monitoring instead of failing
            # on the empty list.
            self._vm.log.debug(
                'Stalling for %d iterations, '
                'no convergence schedule actions left', stalling)
            return

        head = pending[0]

        self._vm.log.debug(
            'Stalling for %d iterations, '
            'checking to make next action: '
            '%s', stalling, head)
        if head['limit'] < stalling:
            self._execute_action_with_params(head['action'])
            pending.pop(0)
            self._vm.log.debug('setting conv schedule to: %s',
                               self._conv_schedule)

    def _execute_init(self, init_actions):
        """Execute every action of the schedule's 'init' list, in order."""
        for action_with_params in init_actions:
            self._execute_action_with_params(action_with_params)

    def _execute_action_with_params(self, action_with_params):
        """
        Execute a single schedule action.

        :param action_with_params: dict with the action 'name' and, for the
            downtime action, a 'params' list whose first item is the
            downtime.  Unknown action names are ignored.
        """
        action = str(action_with_params['name'])
        if action == CONVERGENCE_SCHEDULE_SET_DOWNTIME:
            downtime = int(action_with_params['params'][0])
            self._vm.log.debug('Setting downtime to %d', downtime)
            self._vm._dom.migrateSetMaxDowntime(downtime, 0)
        elif action == CONVERGENCE_SCHEDULE_SET_ABORT:
            self._vm.log.warning('Aborting migration')
            self._vm._dom.abortJob()
            self.stop()
def _recoverExistingVms(self):
    """
    Recover the VMs known to this host and prepare their disk paths.

    The flow is: recover domains reported by getVDSMDomains() (destroying
    the ones that cannot be recovered), recover VMs found only in recovery
    files, wait until no VM is in WAIT_FOR_LAUNCH, clean old files, wait
    for a connected storage pool, and finally prepare the disk paths of
    every VM in self.vmContainer.

    Any exception escaping the flow is logged and re-raised.
    """
    try:
        # Starting up libvirt might take long when host under high load,
        # we prefer running this code in external thread to avoid blocking
        # API response.
        # The outgoing migration limit is the smaller of the configured
        # value and the number of CPU cores.
        mog = min(config.getint('vars', 'max_outgoing_migrations'),
                  caps.CpuTopology().cores())
        migration.SourceThread.setMaxOutgoingMigrations(mog)

        # Recover
        for v in getVDSMDomains():
            vmId = v.UUIDString()
            if not self._recoverVm(vmId):
                # RH qemu proc without recovery
                self.log.info(
                    'loose qemu process with id: '
                    '%s found, killing it.', vmId)
                try:
                    v.destroy()
                except libvirt.libvirtError:
                    # Best effort: a failure to destroy is only logged
                    self.log.error(
                        'failed to kill loose qemu '
                        'process with id: %s',
                        vmId, exc_info=True)

        # we do this to safely handle VMs which disappeared
        # from the host while VDSM was down/restarting
        recVms = self._getVDSMVmsFromRecovery()
        if recVms:
            self.log.warning(
                'Found %i VMs from recovery files not'
                ' reported by libvirt.'
                ' This should not happen!'
                ' Will try to recover them.', len(recVms))
        for vmId in recVms:
            if not self._recoverVm(vmId):
                self.log.warning(
                    'VM %s failed to recover from recovery'
                    ' file, reported as Down', vmId)

        # Poll once per second until no VM is waiting for launch; stops
        # early when self._enabled is cleared.
        while (self._enabled and
               vmstatus.WAIT_FOR_LAUNCH in
               [v.lastStatus for v in self.vmContainer.values()]):
            time.sleep(1)
        self._cleanOldFiles()
        self._recovery = False

        # Now if we have VMs to restore we should wait pool connection
        # and then prepare all volumes.
        # Actually, we need it just to get the resources for future
        # volumes manipulations
        while self._enabled and self.vmContainer and \
                not self.irs.getConnectedStoragePoolsList()['poollist']:
            time.sleep(5)

        for vmId, vmObj in self.vmContainer.items():
            # Let's recover as much VMs as possible
            try:
                # Do not prepare volumes when system goes down
                if self._enabled:
                    vmObj.preparePaths(
                        vmObj.buildConfDevices()[vm.DISK_DEVICES])
            except:
                # Catch-all on purpose: one failing VM must not prevent the
                # remaining VMs from being handled.
                self.log.error("Vm %s recovery failed",
                               vmId, exc_info=True)
    except:
        self.log.error("Vm's recovery failed", exc_info=True)
        raise
def get():
    """
    Collect the host capabilities and return them as a dict.

    NUMA information is refreshed first, then CPU, network, operating
    system, storage and feature-flag entries are gathered.  The 'hooks',
    'boot_uuid' and 'connector_info' entries are best effort: when they
    cannot be obtained the failure is logged and the key is left out.
    """
    numa.update()
    caps = {}
    cpu_topology = numa.cpu_topology()

    caps['kvmEnabled'] = str(os.path.exists('/dev/kvm')).lower()

    # Optionally report hardware threads as cores
    if config.getboolean('vars', 'report_host_threads_as_cores'):
        caps['cpuCores'] = str(cpu_topology.threads)
    else:
        caps['cpuCores'] = str(cpu_topology.cores)

    caps['cpuThreads'] = str(cpu_topology.threads)
    caps['cpuSockets'] = str(cpu_topology.sockets)
    caps['onlineCpus'] = ','.join(
        str(cpu_id) for cpu_id in cpu_topology.online_cpus)

    caps['cpuTopology'] = [{
        'cpu_id': cpu.cpu_id,
        'numa_cell_id': cpu.numa_cell_id,
        'socket_id': cpu.socket_id,
        'die_id': cpu.die_id,
        'core_id': cpu.core_id,
    } for cpu in numa.cpu_info()]

    caps['cpuSpeed'] = cpuinfo.frequency()
    caps['cpuModel'] = cpuinfo.model()
    caps['cpuFlags'] = ','.join(_getFlagsAndFeatures())
    caps['vdsmToCpusAffinity'] = list(taskset.get(os.getpid()))

    caps.update(dsaversion.version_info())

    proxy = supervdsm.getProxy()
    net_caps = proxy.network_caps()
    caps.update(net_caps)
    caps['ovnConfigured'] = proxy.is_ovn_configured()

    try:
        caps['hooks'] = hooks.installed()
    except Exception:
        # Best effort; Exception (not a bare except) so that interpreter
        # exit and keyboard interrupts are not swallowed here.
        logging.debug('not reporting hooks', exc_info=True)

    caps['operatingSystem'] = osinfo.version()
    caps['uuid'] = host.uuid()
    caps['packages2'] = osinfo.package_versions()
    caps['realtimeKernel'] = osinfo.runtime_kernel_flags().realtime
    caps['kernelArgs'] = osinfo.kernel_args()
    caps['nestedVirtualization'] = osinfo.nested_virtualization().enabled
    caps['emulatedMachines'] = machinetype.emulated_machines(
        cpuarch.effective())
    caps['ISCSIInitiatorName'] = _getIscsiIniName()
    caps['HBAInventory'] = hba.HBAInventory()
    caps['vmTypes'] = ['kvm']

    caps['memSize'] = str(utils.readMemInfo()['MemTotal'] // 1024)
    caps['reservedMem'] = str(config.getint('vars', 'host_mem_reserve') +
                              config.getint('vars', 'extra_mem_reserve'))
    caps['guestOverhead'] = config.get('vars', 'guest_ram_overhead')

    caps['rngSources'] = rngsources.list_available()

    caps['numaNodes'] = dict(numa.topology())
    caps['numaNodeDistance'] = dict(numa.distances())
    caps['autoNumaBalancing'] = numa.autonuma_status()

    caps['selinux'] = osinfo.selinux_status()

    caps['liveSnapshot'] = 'true'
    caps['liveMerge'] = 'true'
    caps['kdumpStatus'] = osinfo.kdump_status()
    caps["deferred_preallocation"] = True

    caps['hostdevPassthrough'] = str(hostdev.is_supported()).lower()
    # TODO This needs to be removed after adding engine side support
    # and adding gdeploy support to enable libgfapi on RHHI by default
    caps['additionalFeatures'] = ['libgfapi_supported']
    if osinfo.glusterEnabled:
        # Imported here so gluster code is only loaded when it is enabled
        from vdsm.gluster.api import glusterAdditionalFeatures
        caps['additionalFeatures'].extend(glusterAdditionalFeatures())
    caps['hostedEngineDeployed'] = _isHostedEngineDeployed()
    caps['hugepages'] = hugepages.supported()
    caps['kernelFeatures'] = osinfo.kernel_features()
    caps['vncEncrypted'] = _isVncEncrypted()
    caps['backupEnabled'] = True
    caps['coldBackupEnabled'] = True
    caps['clearBitmapsEnabled'] = True
    caps['fipsEnabled'] = _getFipsEnabled()
    try:
        caps['boot_uuid'] = osinfo.boot_uuid()
    except Exception:
        logging.exception("Can not find boot uuid")
    caps['tscFrequency'] = _getTscFrequency()
    caps['tscScaling'] = _getTscScaling()

    try:
        caps["connector_info"] = managedvolume.connector_info()
    except se.ManagedVolumeNotSupported as e:
        logging.info("managedvolume not supported: %s", e)
    except se.ManagedVolumeHelperFailed as e:
        logging.exception("Error getting managedvolume connector info: %s", e)

    # Which domain versions are supported by this host.
    caps["domain_versions"] = sc.DOMAIN_VERSIONS

    caps["supported_block_size"] = backends.supported_block_size()
    caps["cd_change_pdiv"] = True
    caps["refresh_disk_supported"] = True

    return caps
def connect(self):
    """
    Add the iSCSI node of this connection, then wait for udev to settle.

    The settle wait uses the 'udev_settle_timeout' option of the 'irs'
    configuration section.
    """
    iscsi.addIscsiNode(self._iface, self._target, self._cred)
    settle_timeout = config.getint("irs", "udev_settle_timeout")
    udevadm.settle(settle_timeout)
from vdsm.common import exception
from vdsm.common import response
from vdsm.config import config
from vdsm.virt import migration
from vdsm.virt import vmstatus

from monkeypatch import MonkeyPatchScope
from testlib import VdsmTestCase as TestCaseBase
from testlib import permutations, expandPermutations
from testlib import make_config
from . import vmfakelib as fake

import pytest


# defaults
# Both values are read from the configuration when the module is imported
_DOWNTIME = config.getint('vars', 'migration_downtime')

_STEPS = config.getint('vars', 'migration_downtime_steps')

# Boundary values for the number of steps
_STEPS_MIN = 2
_STEPS_HUGE = 1000

# Boundary values for the downtime
_DOWNTIME_MIN = 100
_DOWNTIME_HUGE = 10000

# Every (downtime, steps) combination of the values above
# NOTE(review): assumes `product` is itertools.product; its import is not
# visible in this part of the file - confirm.
_PARAMS = tuple(
    product((_DOWNTIME_MIN, _DOWNTIME, _DOWNTIME_HUGE),
            (_STEPS_MIN, _STEPS, _STEPS_HUGE)))


@expandPermutations
def _startUnderlyingMigration(self, startTime):
    """
    Perform the migration itself, according to self._mode.

    In 'file' mode the domain is saved to the volume path prepared from
    self._dst. In any other mode the VM is created on the destination
    server and then migrated with the domain's migrateToURI2 call.

    startTime is handed to the MonitorThread created for the migration.

    Raises RuntimeError if the destination's migrationCreate response
    carries a non-zero status code; in that case the whole response is
    also stored in self.status.
    """
    if self._mode == 'file':
        hooks.before_vm_hibernate(self._vm._dom.XMLDesc(0), self._vm.conf)
        try:
            self._vm._vmStats.pause()
            fname = self._vm.cif.prepareVolumePath(self._dst)
            try:
                self._vm._dom.save(fname)
            finally:
                # The volume path is torn down whether or not save() worked.
                self._vm.cif.teardownVolumePath(self._dst)
        except Exception:
            # Presumably resumes what _vmStats.pause() suspended; the
            # original exception is then re-raised unchanged.
            self._vm._vmStats.cont()
            raise
    else:
        # Run the per-device hooks first, then the VM-level hook.
        for dev in self._vm._customDevices():
            hooks.before_device_migrate_source(
                dev._deviceXML, self._vm.conf, dev.custom)

        hooks.before_vm_migrate_source(self._vm._dom.XMLDesc(0),
                                       self._vm.conf)
        response = self.destServer.migrationCreate(self._machineParams)
        if response['status']['code']:
            self.status = response
            raise RuntimeError('migration destination error: ' +
                               response['status']['message'])
        # The libvirt connection URI uses TLS only when 'ssl' is enabled
        # in the 'vars' configuration section.
        if config.getboolean('vars', 'ssl'):
            transport = 'tls'
        else:
            transport = 'tcp'
        duri = 'qemu+%s://%s/system' % (transport, self.remoteHost)
        # The migration URI targets the 'dstqemu' address when one was
        # given in the migration parameters, else the remote host itself.
        if self._vm.conf['_migrationParams']['dstqemu']:
            muri = 'tcp://%s' % \
                self._vm.conf['_migrationParams']['dstqemu']
        else:
            muri = 'tcp://%s' % self.remoteHost

        self._vm.log.debug('starting migration to %s '
                           'with miguri %s', duri, muri)

        # NOTE(review): start() is never called on this object here;
        # presumably DowntimeThread starts itself on construction - confirm.
        t = DowntimeThread(self._vm, int(self._downtime))

        self._monitorThread = MonitorThread(self._vm, startTime)
        self._monitorThread.start()

        try:
            if self._vm.hasSpice and self._vm.conf.get('clientIp'):
                # Presumably seconds - confirm against _reviveTicket.
                SPICE_MIGRATION_HANDOVER_TIME = 120
                self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

            maxBandwidth = config.getint('vars', 'migration_max_bandwidth')
            # FIXME: there still a race here with libvirt,
            # if we call stop() and libvirt migrateToURI2 didn't start
            # we may return migration stop but it will start at libvirt
            # side
            self._preparingMigrationEvt = False
            if not self._migrationCanceledEvt:
                # Flags are always LIVE | PEER2PEER; TUNNELLED and
                # ABORT_ON_ERROR are OR-ed in only when the matching
                # instance attribute is truthy.
                self._vm._dom.migrateToURI2(
                    duri, muri, None,
                    libvirt.VIR_MIGRATE_LIVE |
                    libvirt.VIR_MIGRATE_PEER2PEER |
                    (libvirt.VIR_MIGRATE_TUNNELLED if
                        self._tunneled else 0) |
                    (libvirt.VIR_MIGRATE_ABORT_ON_ERROR if
                        self._abortOnError else 0),
                    None, maxBandwidth)
            else:
                self._raiseAbortError()

        finally:
            # Runs on success, failure and cancellation alike.
            t.cancel()
            self._monitorThread.stop()
import libvirt

from vdsm import containersconnection
from vdsm import executor
from vdsm import host
from vdsm import libvirtconnection
from vdsm.config import config
from vdsm.virt import migration
from vdsm.virt import sampling
from vdsm.virt import virdomain
from vdsm.virt import vmstatus


# Just a made up number. Maybe should be equal to number of cores?
# TODO: make them tunable through private, unsupported configuration items
# All three values below are read from the 'sampling' configuration
# section once, when this module is imported.
_WORKERS = config.getint('sampling', 'periodic_workers')
_TASK_PER_WORKER = config.getint('sampling', 'periodic_task_per_worker')
# Worker count multiplied by the per-worker task count.
_TASKS = _WORKERS * _TASK_PER_WORKER
_MAX_WORKERS = config.getint('sampling', 'max_workers')

# Module-level state: starts as an empty list and as None respectively.
# NOTE(review): presumably filled in by start-up code outside this chunk.
_operations = []
_executor = None


def _timeout_from(interval):
    """
    Estimate a sensible timeout given a periodic interval.

    The estimate is half of the interval. The divisor is the float
    literal ``2.``, so a numeric interval always yields a float.
    """
    return interval / 2.
def get():
    """
    Collect this host's capabilities and return them as a dict.

    The dict is keyed by capability name ('kvmEnabled', 'cpuCores',
    'cpuFlags', 'memSize', 'packages2', ...). Most values are strings or
    whatever the helper that produced them returned.

    Some keys are optional:
    - 'hooks' is omitted when hooks.installed() fails.
    - 'rngSources' is omitted when the installed libvirt is older than
      0.10.2-31.
    - 'liveSnapshot' is omitted when live snapshot support is unknown
      (None).

    Raises RuntimeError when 'fake_kvm_support' is enabled and the target
    architecture is neither x86_64 nor one of the POWER architectures.
    """
    targetArch = getTargetArch()

    caps = {}

    caps['kvmEnabled'] = \
        str(config.getboolean('vars', 'fake_kvm_support') or
            os.path.exists('/dev/kvm')).lower()

    cpuInfo = CpuInfo()
    cpuTopology = CpuTopology()
    if config.getboolean('vars', 'report_host_threads_as_cores'):
        caps['cpuCores'] = str(cpuTopology.threads())
    else:
        caps['cpuCores'] = str(cpuTopology.cores())

    caps['cpuThreads'] = str(cpuTopology.threads())
    caps['cpuSockets'] = str(cpuTopology.sockets())
    caps['onlineCpus'] = ','.join(cpuTopology.onlineCpus())
    caps['cpuSpeed'] = cpuInfo.mhz()
    if config.getboolean('vars', 'fake_kvm_support'):
        if targetArch == Architecture.X86_64:
            caps['cpuModel'] = 'Intel(Fake) CPU'

            flagList = ['vmx', 'sse2', 'nx']

            # Real host flags are only meaningful when the faked target
            # architecture is the one we are actually running on.
            if targetArch == platform.machine():
                flagList += cpuInfo.flags()

            # De-duplicate; the resulting order is unspecified.
            flags = set(flagList)

            caps['cpuFlags'] = ','.join(flags) + ',model_486,model_pentium,' \
                'model_pentium2,model_pentium3,model_pentiumpro,' \
                'model_qemu32,model_coreduo,model_core2duo,model_n270,' \
                'model_Conroe,model_Penryn,model_Nehalem,model_Opteron_G1'
        elif targetArch in Architecture.POWER:
            caps['cpuModel'] = 'POWER 8 (fake)'
            caps['cpuFlags'] = 'powernv,model_power8'
        else:
            raise RuntimeError('Unsupported architecture: %s' % targetArch)
    else:
        caps['cpuModel'] = cpuInfo.model()
        caps['cpuFlags'] = ','.join(cpuInfo.flags() +
                                    _getCompatibleCpuModels())

    caps.update(_getVersionInfo())
    caps.update(netinfo.get())
    _report_legacy_bondings(caps)
    _report_network_qos(caps)

    # Hook reporting is best effort. Catch Exception rather than using a
    # bare except, so SystemExit and KeyboardInterrupt still propagate.
    try:
        caps['hooks'] = hooks.installed()
    except Exception:
        logging.debug('not reporting hooks', exc_info=True)

    caps['operatingSystem'] = osversion()
    caps['uuid'] = utils.getHostUUID()
    caps['packages2'] = _getKeyPackages()
    caps['emulatedMachines'] = _getEmulatedMachines(targetArch)
    caps['ISCSIInitiatorName'] = _getIscsiIniName()
    caps['HBAInventory'] = storage.hba.HBAInventory()
    caps['vmTypes'] = ['kvm']

    # Floor division keeps memSize a whole-number string on Python 3 too;
    # for integer operands it matches Python 2's "/" exactly.
    caps['memSize'] = str(utils.readMemInfo()['MemTotal'] // 1024)
    caps['reservedMem'] = str(config.getint('vars', 'host_mem_reserve') +
                              config.getint('vars', 'extra_mem_reserve'))
    caps['guestOverhead'] = config.get('vars', 'guest_ram_overhead')

    # Verify that our libvirt supports virtio RNG (since 0.10.2-31)
    libvirtVer = LooseVersion(
        '-'.join((caps['packages2']['libvirt']['version'],
                  caps['packages2']['libvirt']['release'])))
    requiredVer = LooseVersion('0.10.2-31')

    if libvirtVer >= requiredVer:
        caps['rngSources'] = _getRngSources()
    else:
        logging.debug('VirtioRNG DISABLED: libvirt version %s required >= %s',
                      libvirtVer, requiredVer)

    caps['numaNodes'] = getNumaTopology()
    caps['numaNodeDistance'] = getNumaNodeDistance()
    caps['autoNumaBalancing'] = getAutoNumaBalancingInfo()

    caps['selinux'] = _getSELinux()

    # None means "unknown"; the key is then left out entirely.
    liveSnapSupported = _getLiveSnapshotSupport(targetArch)
    if liveSnapSupported is not None:
        caps['liveSnapshot'] = str(liveSnapSupported).lower()
    caps['liveMerge'] = str(getLiveMergeSupport()).lower()
    caps['kdumpStatus'] = _getKdumpStatus()

    caps['hostdevPassthrough'] = str(_getHostdevPassthorughSupport()).lower()
    return caps
def monitor_migration(self):
    """
    Poll the domain's migration job until stopped, aborting it if needed.

    Every self._MIGRATION_MONITOR_INTERVAL the job info is read from the
    domain. The job is aborted, and this monitor stopped, when either:

    - the time since self._startTime exceeds the allowed maximum, which
      is derived from the VM's 'memSize' and the
      'migration_max_time_per_gib_mem' option (a maximum of 0 disables
      this check), or
    - the remaining data has not dropped below its lowest observed value
      for more than 'migration_progress_timeout' seconds.

    While a job is active, self.progress is updated with an integer
    percentage. It is capped at 99 until nothing remains to transfer.
    """
    def update_progress(remaining, total):
        # Integer percentage of data already transferred. 100 is reported
        # only when nothing remains; an unknown total (0) reports 0.
        if remaining == 0 and total:
            return 100
        progress = 100 - 100 * remaining // total if total else 0
        return progress if (progress < 100) else 99

    self._vm.log.debug('starting migration monitor thread')

    memSize = int(self._vm.conf['memSize'])
    maxTimePerGiB = config.getint('vars', 'migration_max_time_per_gib_mem')
    # Integer ceiling of maxTimePerGiB * memSize / 1024. Floor division
    # keeps the result an int on Python 3 as well.
    migrationMaxTime = (maxTimePerGiB * memSize + 1023) // 1024
    lastProgressTime = time.time()
    lowmark = None
    progress_timeout = config.getint('vars', 'migration_progress_timeout')

    while not self._stop.isSet():
        self._stop.wait(self._MIGRATION_MONITOR_INTERVAL)
        if self._stop.isSet():
            # Stop was requested while we were waiting, so do not poll the
            # job again. Presumably the migration is over by then and the
            # domain may no longer answer job queries.
            break

        (jobType, timeElapsed, _,
         dataTotal, dataProcessed, dataRemaining,
         memTotal, memProcessed, memRemaining,
         fileTotal, fileProcessed, _) = self._vm._dom.jobInfo()
        # from libvirt sources: data* = file* + mem*.
        # docs can be misleading due to misaligned lines.
        abort = False
        now = time.time()
        if 0 < migrationMaxTime < now - self._startTime:
            self._vm.log.warning(
                'The migration took %d seconds which is '
                'exceeding the configured maximum time '
                'for migrations of %d seconds. The '
                'migration will be aborted.',
                now - self._startTime,
                migrationMaxTime)
            abort = True
        elif (lowmark is None) or (lowmark > dataRemaining):
            # New lowest amount of remaining data: progress was made.
            lowmark = dataRemaining
            lastProgressTime = now
        elif (now - lastProgressTime) > progress_timeout:
            # Migration is stuck, abort
            self._vm.log.warning(
                "Migration is stuck: Hasn't progressed in %s seconds. "
                "Aborting.", now - lastProgressTime)
            abort = True

        if abort:
            self._vm._dom.abortJob()
            self.stop()
            break

        # lowmark is never None here: the first pass either set it above
        # or aborted.
        if dataRemaining > lowmark:
            self._vm.log.warning(
                'Migration stalling: remaining (%sMiB)'
                ' > lowmark (%sMiB).'
                ' Refer to RHBZ#919201.',
                dataRemaining // Mbytes, lowmark // Mbytes)

        if jobType != libvirt.VIR_DOMAIN_JOB_NONE:
            self.progress = update_progress(dataRemaining, dataTotal)

            # timeElapsed is divided by 1000 to report seconds.
            self._vm.log.info('Migration Progress: %s seconds elapsed,'
                              ' %s%% of data processed',
                              timeElapsed // 1000, self.progress)