def reboot(self, hard=False):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    if not hard:
        instance.reboot(flags=libvirt.VIR_DOMAIN_REBOOT_ACPI_POWER_BTN)
    else:
        instance.reset()

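# The functions here all fetch the libvirt module via util.get_libvirt()
# rather than importing it at module scope. util.get_libvirt() is not shown
# in this section; a minimal sketch of what it might do (an assumption, not
# the project's actual implementation) is below. Deferring the import keeps
# these modules loadable, and unit-testable, on machines without the libvirt
# python bindings installed.
def get_libvirt():
    # Import lazily and hand the module back to the caller.
    import libvirt
    return libvirt
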
def power_on(self):
    if not os.path.exists(self.xml_file):
        db.enqueue_instance_error(self.db_entry['uuid'],
                                  'missing domain file in power on')

    libvirt = util.get_libvirt()
    with open(self.xml_file) as f:
        xml = f.read()

    instance = self._get_domain()
    if not instance:
        conn = libvirt.open(None)
        instance = conn.defineXML(xml)
    if not instance:
        db.enqueue_instance_error(self.db_entry['uuid'],
                                  'power on failed to create domain')
        raise exceptions.NoDomainException()

    try:
        instance.create()
    except libvirt.libvirtError as e:
        if not str(e).startswith(
                'Requested operation is not valid: domain is already running'):
            LOG.withObj(self).warning('Instance start error: %s' % e)
            return False

    instance.setAutostart(1)
    db.update_instance_power_state(
        self.db_entry['uuid'],
        util.extract_power_state(libvirt, instance))
    db.add_event('instance', self.db_entry['uuid'],
                 'poweron', 'complete', None, None)
    return True

def is_powered_on(self):
    instance = self._get_domain()
    if not instance:
        return 'off'

    libvirt = util.get_libvirt()
    return util.extract_power_state(libvirt, instance)

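# Note that is_powered_on() returns a power state string, not a boolean.
# util.extract_power_state() is not shown in this section; a minimal sketch
# of what it might do, assuming it maps libvirt domain states onto the
# strings used elsewhere here ('on', 'off', 'crashed', 'paused'):
def extract_power_state(libvirt, domain):
    # domain.state() returns a (state, reason) pair; we only need the state.
    state, _reason = domain.state()
    if state == libvirt.VIR_DOMAIN_SHUTOFF:
        return 'off'
    if state == libvirt.VIR_DOMAIN_CRASHED:
        return 'crashed'
    if state in (libvirt.VIR_DOMAIN_PAUSED, libvirt.VIR_DOMAIN_PMSUSPENDED):
        return 'paused'
    # Assume everything else (running, blocked, shutting down) counts as on.
    return 'on'
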
def run(self):
    workers = []
    LOG.info('Starting Queues')

    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    present_cpus, _, _ = conn.getCPUMap()

    while True:
        try:
            # Reap workers which have finished their job
            for w in copy.copy(workers):
                if not w.is_alive():
                    w.join(1)
                    workers.remove(w)

            # Only dequeue more work if we have spare capacity
            if len(workers) < present_cpus / 2:
                jobname, workitem = db.dequeue(config.NODE_NAME)
            else:
                workitem = None

            if not workitem:
                time.sleep(0.2)
                continue

            p = multiprocessing.Process(
                target=handle, args=(jobname, workitem,),
                name='%s-worker' % daemon.process_name('queues'))
            p.start()
            workers.append(p)
        except Exception as e:
            util.ignore_exception(daemon.process_name('queues'), e)

def _get_domain(self):
    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    try:
        return conn.lookupByName('sf:' + self.db_entry['uuid'])
    except libvirt.libvirtError:
        return None

def instance_start(instance_uuid, network):
    log = LOG.withField('instance', instance_uuid)
    with db.get_lock('instance', None, instance_uuid, ttl=900, timeout=120,
                     op='Instance start') as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks
        nets = {}
        for netdesc in network:
            if netdesc['network_uuid'] not in nets:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    db.enqueue_instance_error(instance_uuid,
                                              'missing network')
                    return
                nets[netdesc['network_uuid']] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for network_uuid in nets:
                n = nets[network_uuid]
                try:
                    n.create()
                    n.ensure_mesh()
                    n.update_dhcp()
                except exceptions.DeadNetwork as e:
                    log.withField('network', n).warning(
                        'Instance tried to use dead network')
                    db.enqueue_instance_error(
                        instance_uuid, 'tried to use dead network: %s' % e)
                    return

        # Allocate console and VDI ports
        instance.allocate_instance_ports()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)
        except libvirt.libvirtError as e:
            code = e.get_error_code()
            if code in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                        libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_error(
                    instance_uuid, 'instance failed to start: %s' % e)
                return

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')

def unpause(self):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    instance.resume()
    db.update_instance_power_state(
        self.db_entry['uuid'],
        util.extract_power_state(libvirt, instance))
    db.add_event('instance', self.db_entry['uuid'],
                 'unpause', 'complete', None, None)

def reboot(self, hard=False):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    if not hard:
        instance.reboot(flags=libvirt.VIR_DOMAIN_REBOOT_ACPI_POWER_BTN)
    else:
        instance.reset()
    db.add_event('instance', self.db_entry['uuid'],
                 'reboot', 'complete', None, None)

def power_off(self):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    if not instance:
        return

    try:
        instance.destroy()
    except libvirt.libvirtError as e:
        LOG.withObj(self).error('Failed to delete domain: %s' % e)

    self.update_power_state('off')
    self.add_event('poweroff', 'complete')

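# This power_off() variant records state through instance methods, where the
# variants below call the db module directly. A minimal sketch, assuming
# (this is not confirmed by the source) that those methods are thin wrappers
# over the same db calls:
def update_power_state(self, state):
    # Hypothetical wrapper: delegate to the db layer with our own uuid.
    db.update_instance_power_state(self.db_entry['uuid'], state)

def add_event(self, operation, phase):
    # Hypothetical wrapper: record an instance-scoped event.
    db.add_event('instance', self.db_entry['uuid'], operation, phase,
                 None, None)
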
def power_off(self):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    if not instance:
        return

    try:
        instance.destroy()
    except libvirt.libvirtError as e:
        logutil.error([self], 'Failed to delete domain: %s' % e)

    db.add_event('instance', self.db_entry['uuid'],
                 'poweroff', 'complete', None, None)

def power_off(self):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    if not instance:
        return

    try:
        instance.destroy()
    except libvirt.libvirtError as e:
        LOG.withObj(self).error('Failed to delete domain: %s' % e)

    db.update_instance_power_state(self.db_entry['uuid'], 'off')
    db.add_event('instance', self.db_entry['uuid'],
                 'poweroff', 'complete', None, None)

def power_off(self):
    libvirt = util.get_libvirt()
    with open(self.xml_file) as f:
        xml = f.read()

    instance = self._get_domain()
    if not instance:
        conn = libvirt.open(None)
        instance = conn.defineXML(xml)
    if not instance:
        LOG.error('%s: Failed to create libvirt domain' % self)
        return

    try:
        instance.destroy()
    except libvirt.libvirtError as e:
        LOG.error('%s: Failed to delete domain: %s' % (self, e))

def power_on(self):
    libvirt = util.get_libvirt()
    with open(self.xml_file) as f:
        xml = f.read()

    instance = self._get_domain()
    if not instance:
        conn = libvirt.open(None)
        instance = conn.defineXML(xml)
    if not instance:
        LOG.error('%s: Failed to create libvirt domain' % self)
        return

    try:
        instance.create()
    except libvirt.libvirtError:
        pass

    instance.setAutostart(1)

def instance_start(instance_uuid, network):
    with db.get_lock('instance', None, instance_uuid, ttl=900) as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks
        nets = {}
        for netdesc in network:
            if netdesc['network_uuid'] not in nets:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                               instance_uuid, 'error',
                                               'missing network')
                    return
                nets[netdesc['network_uuid']] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for network_uuid in nets:
                n = nets[network_uuid]
                n.create()
                n.ensure_mesh()
                n.update_dhcp()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)
        except libvirt.libvirtError as e:
            code = e.get_error_code()
            if code in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                        libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                           instance_uuid, 'error',
                                           'instance failed to start')
                return

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')

def _update_power_states(self):
    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    try:
        seen = []

        # Active VMs have an ID. Active means running in libvirt land.
        for domain_id in conn.listDomainsID():
            domain = conn.lookupByID(domain_id)
            if not domain.name().startswith('sf:'):
                continue

            instance_uuid = domain.name().split(':')[1]
            instance = db.get_instance(instance_uuid)
            if not instance:
                # Instance is SF but not in database. Kill to reduce load.
                logutil.warning([virt.ThinInstance(instance_uuid)],
                                'Destroying unknown instance')
                util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                continue

            db.place_instance(instance_uuid, config.parsed.get('NODE_NAME'))
            seen.append(domain.name())

            if instance.get('state') == 'deleted':
                # NOTE(mikal): a delete might be in-flight in the queue.
                # We only worry about instances which should have gone
                # away five minutes ago.
                if time.time() - instance['state_updated'] < 300:
                    continue

                db.instance_enforced_deletes_increment(instance_uuid)
                attempts = instance.get('enforced_deletes', 0)

                if attempts > 5:
                    # Sometimes we just can't delete the VM. Try the big
                    # hammer instead.
                    logutil.warning(
                        [virt.ThinInstance(instance_uuid)],
                        'Attempting alternate delete method for instance')
                    util.execute(None,
                                 'virsh destroy "sf:%s"' % instance_uuid)
                    db.add_event('instance', instance_uuid,
                                 'enforced delete', 'complete', None, None)
                else:
                    i = virt.from_db(instance_uuid)
                    i.delete()
                    i.update_instance_state('deleted')
                    logutil.warning([virt.ThinInstance(instance_uuid)],
                                    'Deleting stray instance (attempt %d)'
                                    % attempts)

                continue

            state = util.extract_power_state(libvirt, domain)
            db.update_instance_power_state(instance_uuid, state)
            if state == 'crashed':
                db.update_instance_state(instance_uuid, 'error')

        # Inactive VMs just have a name, and are powered off in our
        # state system.
        for domain_name in conn.listDefinedDomains():
            if not domain_name.startswith('sf:'):
                continue

            if domain_name not in seen:
                instance_uuid = domain_name.split(':')[1]
                instance = db.get_instance(instance_uuid)

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    domain = conn.lookupByName(domain_name)
                    domain.undefine()
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected stray instance')
                    db.add_event('instance', instance_uuid,
                                 'deleted stray', 'complete', None, None)
                    continue

                db.place_instance(instance_uuid,
                                  config.parsed.get('NODE_NAME'))
                instance_path = os.path.join(
                    config.parsed.get('STORAGE_PATH'), 'instances',
                    instance_uuid)

                if not os.path.exists(instance_path):
                    # If we're inactive and our files aren't on disk,
                    # we have a problem.
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected error state for instance')
                    db.update_instance_state(instance_uuid, 'error')
                elif instance.get('power_state') != 'off':
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected power off for instance')
                    db.update_instance_power_state(instance_uuid, 'off')
                    db.add_event('instance', instance_uuid,
                                 'detected poweroff', 'complete', None, None)

    except libvirt.libvirtError as e:
        logutil.error(None, 'Failed to lookup all domains: %s' % e)

def _get_stats():
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })
    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled as it is data we don't currently use
    # for i in range(present_cpus):
    #     per_cpu_stats = conn.getCPUStats(i)
    #     for key in per_cpu_stats:
    #         retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception:
        pass

    # Memory info - libvirt returns memory in KiB
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max': memory_status['total'] // 1024,
        'memory_available': memory_status['free'] // 1024,
    })

    # Disk info. Note that disk_free uses f_bavail (blocks available to
    # unprivileged users), while disk_used is derived from f_bfree.
    s = os.statvfs(config.parsed.get('STORAGE_PATH'))
    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_total': s.f_frsize * s.f_blocks,
        'disk_free': s.f_frsize * s.f_bavail,
        'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        active = guest.isActive() == 1
        _, maxmem, mem, cpus, cpu_time = guest.info()

        total_instances += 1
        if active:
            total_active_instances += 1
        total_instance_max_memory += maxmem
        total_instance_actual_memory += mem
        total_instance_vcpus += cpus
        total_instance_cpu_time += cpu_time

    retval.update({
        'cpu_total_instance_vcpus': total_instance_vcpus,
        'cpu_total_instance_cpu_time': total_instance_cpu_time,
        'memory_total_instance_max_memory':
            total_instance_max_memory // 1024,
        'memory_total_instance_actual_memory':
            total_instance_actual_memory // 1024,
        'instances_total': total_instances,
        'instances_active': total_active_instances,
    })

    return retval

def unpause(self):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    instance.resume()
    self.update_power_state(util.extract_power_state(libvirt, instance))
    self.add_event('unpause', 'complete')

def _update_power_states(self):
    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    try:
        seen = []

        # Active VMs have an ID. Active means running in libvirt land.
        for domain_id in conn.listDomainsID():
            domain = conn.lookupByID(domain_id)
            if not domain.name().startswith('sf:'):
                continue

            instance_uuid = domain.name().split(':')[1]
            log_ctx = LOG.withInstance(instance_uuid)

            instance = virt.Instance.from_db(instance_uuid)
            if not instance:
                # Instance is SF but not in database. Kill to reduce load.
                log_ctx.warning('Destroying unknown instance')
                util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                continue

            instance.place_instance(config.NODE_NAME)
            seen.append(domain.name())

            if instance.state == 'deleted':
                # NOTE(mikal): a delete might be in-flight in the queue.
                # We only worry about instances which should have gone
                # away five minutes ago.
                if time.time() - instance.state_updated < 300:
                    continue

                instance.enforced_deletes_increment()
                attempts = instance.enforced_deletes

                if attempts > 5:
                    # Sometimes we just can't delete the VM. Try the big
                    # hammer instead.
                    log_ctx.warning(
                        'Attempting alternate delete method for instance')
                    util.execute(None,
                                 'virsh destroy "sf:%s"' % instance_uuid)
                    instance.add_event('enforced delete', 'complete')
                else:
                    instance.delete()
                    log_ctx.withField('attempt', attempts).warning(
                        'Deleting stray instance')

                continue

            state = util.extract_power_state(libvirt, domain)
            instance.update_power_state(state)
            if state == 'crashed':
                instance.update_instance_state('error')

        # Inactive VMs just have a name, and are powered off in our
        # state system.
        for domain_name in conn.listDefinedDomains():
            if not domain_name.startswith('sf:'):
                continue

            if domain_name not in seen:
                instance_uuid = domain_name.split(':')[1]
                log_ctx = LOG.withInstance(instance_uuid)

                instance = virt.Instance.from_db(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill because
                    # unknown.
                    log_ctx.warning('Removing unknown inactive instance')
                    domain = conn.lookupByName(domain_name)
                    domain.undefine()
                    continue

                if instance.state == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance.state_updated < 300:
                        continue

                    domain = conn.lookupByName(domain_name)
                    domain.undefine()
                    log_ctx.info('Detected stray instance')
                    instance.add_event('deleted stray', 'complete')
                    continue

                instance.place_instance(config.NODE_NAME)

                if not os.path.exists(instance.instance_path()):
                    # If we're inactive and our files aren't on disk,
                    # we have a problem.
                    log_ctx.info('Detected error state for instance')
                    instance.update_instance_state('error')
                elif instance.power_state != 'off':
                    log_ctx.info('Detected power off for instance')
                    instance.update_power_state('off')
                    instance.add_event('detected poweroff', 'complete')

    except libvirt.libvirtError as e:
        LOG.error('Failed to lookup all domains: %s' % e)

def _get_stats():
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })
    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled as data we don't currently use
    # for i in range(present_cpus):
    #     per_cpu_stats = conn.getCPUStats(i)
    #     for key in per_cpu_stats:
    #         retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception as e:
        util.ignore_exception('load average', e)

    # System memory info, converting bytes to MiB
    stats = psutil.virtual_memory()
    retval.update({
        'memory_max': stats.total // 1024 // 1024,
        'memory_available': stats.available // 1024 // 1024,
    })

    # libvirt memory info, converting KiB to MiB
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max_libvirt': memory_status['total'] // 1024,
        'memory_available_libvirt': memory_status['free'] // 1024,
    })

    # Kernel Shared Memory (KSM) information
    ksm_details = {}
    for ent in os.listdir('/sys/kernel/mm/ksm'):
        with open('/sys/kernel/mm/ksm/%s' % ent) as f:
            ksm_details['memory_ksm_%s' % ent] = int(f.read().rstrip())
    retval.update(ksm_details)

    # Disk info
    s = os.statvfs(config.get('STORAGE_PATH'))
    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_total': s.f_frsize * s.f_blocks,
        'disk_free': s.f_frsize * s.f_bavail,
        'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        try:
            active = guest.isActive() == 1
            if active:
                _, maxmem, mem, cpus, cpu_time = guest.info()
        except libvirt.libvirtError as e:
            LOG.debug('During resource calc ignored libvirt error: %s' % e)
            active = False

        if active:
            total_instances += 1
            total_active_instances += 1
            total_instance_max_memory += maxmem
            total_instance_actual_memory += mem
            total_instance_vcpus += cpus
            total_instance_cpu_time += cpu_time

    # Queue health statistics
    node_queue_processing, node_queue_waiting = db.get_queue_length(
        config.NODE_NAME)

    retval.update({
        'cpu_total_instance_vcpus': total_instance_vcpus,
        'cpu_total_instance_cpu_time': total_instance_cpu_time,
        'memory_total_instance_max': total_instance_max_memory // 1024,
        'memory_total_instance_actual':
            total_instance_actual_memory // 1024,
        'instances_total': total_instances,
        'instances_active': total_active_instances,
        'node_queue_processing': node_queue_processing,
        'node_queue_waiting': node_queue_waiting,
    })

    if util.is_network_node():
        network_queue_processing, network_queue_waiting = db.get_queue_length(
            'networknode')
        retval.update({
            'network_queue_processing': network_queue_processing,
            'network_queue_waiting': network_queue_waiting,
        })

    return retval