def ignore_exception(processname, e):
    msg = '[Exception] Ignored error in %s: %s' % (processname, e)
    _, _, tb = sys.exc_info()
    if tb:
        msg += '\n%s' % traceback.format_exc()
    logutil.error(None, msg)
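# A minimal usage sketch for ignore_exception(): wrap best-effort work whose
# failure should be logged (with a traceback, since the call happens inside
# the except block) but never propagated. _cleanup_stray_files is a
# hypothetical helper named only for illustration.
def _example_ignored_cleanup(instance_uuid):
    try:
        _cleanup_stray_files(instance_uuid)  # hypothetical helper
    except Exception as e:
        ignore_exception('example-worker', e)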
def _populate_block_devices(self):
    disk_spec = self.db_entry['disk_spec']
    if not disk_spec:
        # This should not occur since the API will filter for zero disks.
        logutil.error([self], 'Found disk spec empty: %s' % self.db_entry)

        # Stop continuous crashing by falsely claiming disks are configured.
        self.db_entry['block_devices'] = {'finalized': True}
        return

    bus = _get_defaulted_disk_bus(disk_spec[0])
    root_device = _get_disk_device_base(bus) + 'a'
    config_device = _get_disk_device_base(bus) + 'b'

    disk_type = 'qcow2'
    if config.parsed.get('DISK_FORMAT') == 'flat':
        disk_type = 'raw'

    self.db_entry['block_devices'] = {
        'devices': [
            {
                'type': disk_type,
                'size': disk_spec[0].get('size'),
                'device': root_device,
                'bus': bus,
                'path': os.path.join(self.instance_path, root_device),
                'base': disk_spec[0].get('base'),
                'present_as': _get_defaulted_disk_type(disk_spec[0]),
                'snapshot_ignores': False
            },
            {
                'type': 'raw',
                'device': config_device,
                'bus': bus,
                'path': os.path.join(self.instance_path, config_device),
                'present_as': 'disk',
                'snapshot_ignores': True
            }
        ]
    }

    for i, d in enumerate(disk_spec[1:]):
        bus = _get_defaulted_disk_bus(d)
        device = _get_disk_device_base(bus) + chr(ord('c') + i)
        self.db_entry['block_devices']['devices'].append({
            'type': disk_type,
            'size': d.get('size'),
            'device': device,
            'bus': bus,
            'path': os.path.join(self.instance_path, device),
            'base': d.get('base'),
            'present_as': _get_defaulted_disk_type(d),
            'snapshot_ignores': False
        })

    self.db_entry['block_devices']['finalized'] = False
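# For reference, a sketch of the block_devices structure this produces for an
# instance with one extra disk, assuming the default bus resolves to a 'vd'
# device base; sizes, paths and the base image URL are illustrative only:
#
#   {
#       'devices': [
#           {'type': 'qcow2', 'size': 8, 'device': 'vda', 'bus': 'virtio',
#            'path': '<instance_path>/vda', 'base': '<image URL>',
#            'present_as': 'disk', 'snapshot_ignores': False},
#           {'type': 'raw', 'device': 'vdb', 'bus': 'virtio',
#            'path': '<instance_path>/vdb', 'present_as': 'disk',
#            'snapshot_ignores': True},    # the config drive
#           {'type': 'qcow2', 'size': 20, 'device': 'vdc', 'bus': 'virtio',
#            'path': '<instance_path>/vdc', 'base': None,
#            'present_as': 'disk', 'snapshot_ignores': False}
#       ],
#       'finalized': False
#   }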
def power_off(self):
    libvirt = util.get_libvirt()
    instance = self._get_domain()
    if not instance:
        return

    try:
        instance.destroy()
    except libvirt.libvirtError as e:
        logutil.error([self], 'Failed to power off domain: %s' % e)

    db.add_event('instance', self.db_entry['uuid'],
                 'poweroff', 'complete', None, None)
def error(status_code, message):
    global TESTING

    body = {
        'error': message,
        'status': status_code
    }

    if TESTING or config.parsed.get('INCLUDE_TRACEBACKS') == '1':
        _, _, tb = sys.exc_info()
        if tb:
            body['traceback'] = traceback.format_exc()

    resp = flask.Response(json.dumps(body), mimetype='application/json')
    resp.status_code = status_code
    logutil.error(None,
                  'Returning API error: %d, %s\n    %s'
                  % (status_code, message,
                     '\n    '.join(body.get('traceback', '').split('\n'))))
    return resp
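# A minimal sketch of how error() composes with a Flask handler; the route
# shape and 404 lookup are assumptions for illustration, not the real API
# handlers, which live elsewhere in this module.
def _example_get_instance(instance_uuid):
    instance = db.get_instance(instance_uuid)
    if not instance:
        return error(404, 'instance %s not found' % instance_uuid)
    return flask.jsonify(instance)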
def handle(jobname, workitem):
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            task_type = task.get('type', 'unknown')
            if task_type.startswith('instance_') and not instance_uuid:
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task_type.replace('_', ' '), 'dequeued',
                             None, 'Work item %s' % jobname)

            logutil.info(ro, 'Executing task %s: %s' % (task_type, task))

            if task_type == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task_type == 'instance_preflight':
                redirect_to = instance_preflight(
                    instance_uuid, task.get('network'))
                if redirect_to:
                    logutil.info(
                        ro, 'Redirecting instance start to %s' % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task_type == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})

            if task_type == 'instance_delete':
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(
                        instance_uuid, task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        # Log the failure even when there is no associated instance to
        # clean up.
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_delete(
                config.parsed.get('NODE_NAME'), instance_uuid, 'error',
                'failed queue task: %s' % e)

    finally:
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')
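# For reference, a sketch of the workitem shape handle() consumes, inferred
# from the task fields read above; the values are illustrative, and real
# workitems are produced elsewhere by the db.enqueue callers:
#
#   {
#       'tasks': [
#           {'type': 'image_fetch',
#            'url': 'https://example.com/disk.qcow2',
#            'instance_uuid': '<uuid>'},
#           {'type': 'instance_preflight',
#            'instance_uuid': '<uuid>', 'network': [...]},
#           {'type': 'instance_start',
#            'instance_uuid': '<uuid>', 'network': [...]}
#       ]
#   }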
def _update_power_states(self):
    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    try:
        seen = []

        # Active VMs have an ID. Active means running in libvirt land.
        for domain_id in conn.listDomainsID():
            domain = conn.lookupByID(domain_id)
            if not domain.name().startswith('sf:'):
                continue

            instance_uuid = domain.name().split(':')[1]
            instance = db.get_instance(instance_uuid)
            if not instance:
                # Instance is SF but not in database. Kill to reduce load.
                logutil.warning([virt.ThinInstance(instance_uuid)],
                                'Destroying unknown instance')
                util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                continue

            db.place_instance(instance_uuid, config.parsed.get('NODE_NAME'))
            seen.append(domain.name())

            if instance.get('state') == 'deleted':
                # NOTE(mikal): a delete might be in-flight in the queue.
                # We only worry about instances which should have gone
                # away five minutes ago.
                if time.time() - instance['state_updated'] < 300:
                    continue

                db.instance_enforced_deletes_increment(instance_uuid)
                attempts = instance.get('enforced_deletes', 0)

                if attempts > 5:
                    # Sometimes we just can't delete the VM. Try the big
                    # hammer instead.
                    logutil.warning(
                        [virt.ThinInstance(instance_uuid)],
                        'Attempting alternate delete method for instance')
                    util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                    db.add_event('instance', instance_uuid,
                                 'enforced delete', 'complete', None, None)
                else:
                    i = virt.from_db(instance_uuid)
                    i.delete()
                    i.update_instance_state('deleted')

                logutil.warning(
                    [virt.ThinInstance(instance_uuid)],
                    'Deleting stray instance (attempt %d)' % attempts)
                continue

            state = util.extract_power_state(libvirt, domain)
            db.update_instance_power_state(instance_uuid, state)
            if state == 'crashed':
                db.update_instance_state(instance_uuid, 'error')

        # Inactive VMs just have a name, and are powered off in our state
        # system.
        for domain_name in conn.listDefinedDomains():
            if not domain_name.startswith('sf:'):
                continue

            if domain_name not in seen:
                instance_uuid = domain_name.split(':')[1]
                instance = db.get_instance(instance_uuid)
                if not instance:
                    # Guard against a defined domain we know nothing
                    # about, which would otherwise crash the loop below.
                    continue

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    domain = conn.lookupByName(domain_name)
                    domain.undefine()
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected stray instance')
                    db.add_event('instance', instance_uuid,
                                 'deleted stray', 'complete', None, None)
                    continue

                db.place_instance(instance_uuid, config.parsed.get('NODE_NAME'))
                instance_path = os.path.join(
                    config.parsed.get('STORAGE_PATH'), 'instances',
                    instance_uuid)

                if not os.path.exists(instance_path):
                    # If we're inactive and our files aren't on disk, we
                    # have a problem.
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected error state for instance')
                    db.update_instance_state(instance_uuid, 'error')
                elif instance.get('power_state') != 'off':
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected power off for instance')
                    db.update_instance_power_state(instance_uuid, 'off')
                    db.add_event('instance', instance_uuid,
                                 'detected poweroff', 'complete', None, None)

    except libvirt.libvirtError as e:
        logutil.error(None, 'Failed to lookup all domains: %s' % e)
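# Both loops above rely on the 'sf:<uuid>' domain naming convention to pick
# SF guests out of everything libvirt knows about. A small sketch of that
# parse, shown only for clarity:
def _example_uuid_from_domain_name(domain_name):
    # Return the instance uuid for an SF domain name, else None.
    if not domain_name.startswith('sf:'):
        return None
    return domain_name.split(':')[1]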