def power_on(self):
    if not os.path.exists(self.xml_file):
        db.enqueue_instance_error(self.db_entry['uuid'],
                                  'missing domain file in power on')

    libvirt = util.get_libvirt()
    with open(self.xml_file) as f:
        xml = f.read()

    # Define the domain from its on-disk XML if libvirt does not already
    # know about it.
    instance = self._get_domain()
    if not instance:
        conn = libvirt.open(None)
        instance = conn.defineXML(xml)
    if not instance:
        db.enqueue_instance_error(self.db_entry['uuid'],
                                  'power on failed to create domain')
        raise exceptions.NoDomainException()

    try:
        instance.create()
    except libvirt.libvirtError as e:
        # "Already running" is harmless; anything else is a real start failure.
        if not str(e).startswith(
                'Requested operation is not valid: domain is already running'):
            LOG.withObj(self).warning('Instance start error: %s' % e)
            return False

    instance.setAutostart(1)
    db.update_instance_power_state(
        self.db_entry['uuid'],
        util.extract_power_state(libvirt, instance))
    db.add_event('instance', self.db_entry['uuid'],
                 'poweron', 'complete', None, None)
    return True
def image_fetch(url, instance_uuid):
    instance = None
    if instance_uuid:
        instance = virt.from_db(instance_uuid)

    try:
        # TODO(andy): Wait up to 15 mins for another queue process to download
        # the required image. This will be changed to queue on a
        # "waiting_image_fetch" queue but this works now.
        with db.get_lock('image', config.NODE_NAME, Image.calc_unique_ref(url),
                         timeout=15 * 60, op='Image fetch') as lock:
            img = Image.from_url(url)
            img.get([lock], instance)
            db.add_event('image', url, 'fetch', None, None, 'success')

    except (exceptions.HTTPError, requests.exceptions.RequestException) as e:
        LOG.withField('image', url).info('Failed to fetch image')
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'Image fetch failed: %s' % e)

        # Clean common problems to store in events
        msg = str(e)
        re_conn_err = re.compile(r'.*NewConnectionError\(\'\<.*\>: (.*)\'')
        m = re_conn_err.match(msg)
        if m:
            msg = m.group(1)
        db.add_event('image', url, 'fetch', None, None, 'Error: ' + msg)

        raise exceptions.ImageFetchTaskFailedException(
            'Failed to fetch image %s' % url)
def restore_instances():
    # Ensure all instances for this node are defined
    networks = []
    instances = []
    for inst in list(
            db.get_instances(only_node=config.parsed.get('NODE_NAME'))):
        for iface in db.get_instance_interfaces(inst['uuid']):
            if iface['network_uuid'] not in networks:
                networks.append(iface['network_uuid'])
        instances.append(inst['uuid'])

    with util.RecordedOperation('restore networks', None):
        for network in networks:
            try:
                n = net.from_db(network)
                LOG.withObj(n).info('Restoring network')
                n.create()
                n.ensure_mesh()
                n.update_dhcp()
            except Exception as e:
                util.ignore_exception('restore network %s' % network, e)

    with util.RecordedOperation('restore instances', None):
        for instance in instances:
            try:
                with db.get_lock('instance', None, instance, ttl=120,
                                 timeout=120, op='Instance restore'):
                    i = virt.from_db(instance)
                    if not i:
                        continue

                    started = ['on', 'transition-to-on', 'initial', 'unknown']
                    if i.db_entry.get('power_state', 'unknown') not in started:
                        continue

                    LOG.withObj(i).info('Restoring instance')
                    i.create()
            except Exception as e:
                util.ignore_exception('restore instance %s' % instance, e)
                db.enqueue_instance_error(
                    instance,
                    'exception while restoring instance on daemon restart')
def instance_start(instance_uuid, network):
    with db.get_lock('instance', None, instance_uuid, ttl=900, timeout=120,
                     op='Instance start') as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks
        nets = {}
        for netdesc in network:
            if netdesc['network_uuid'] not in nets:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    db.enqueue_instance_error(instance_uuid, 'missing network')
                    return
                nets[netdesc['network_uuid']] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for network_uuid in nets:
                n = nets[network_uuid]
                n.create()
                n.ensure_mesh()
                n.update_dhcp()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)

        except libvirt.libvirtError as e:
            code = e.get_error_code()
            if code in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                        libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_error(instance_uuid,
                                          'instance failed to start: %s' % e)
                return

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')
def handle(jobname, workitem):
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if (InstanceTask.__subclasscheck__(type(task)) or
                    isinstance(task, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.
            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid, task.pretty_task_name(),
                             'dequeued', None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s' % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')
                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')