def _process_network_node_workitems(self):
    jobname, workitem = db.dequeue('networknode')
    try:
        if not workitem:
            time.sleep(0.2)
            return

        log_ctx = LOG.withField('workitem', workitem)
        if not NetworkTask.__subclasscheck__(type(workitem)):
            raise exceptions.UnknownTaskException(
                'Network workitem was not decoded: %s' % workitem)

        n = net.from_db(workitem.network_uuid())
        if not n:
            log_ctx.withNetwork(workitem.network_uuid()).warning(
                'Received work item for non-existent network')
            return

        # NOTE(mikal): there's really nothing stopping us from processing
        # a bunch of these jobs in parallel with a pool of workers, but I
        # am not sure it's worth the complexity right now. Are we really
        # going to be changing networks that much?
        if isinstance(workitem, DeployNetworkTask):
            try:
                n.create()
                n.ensure_mesh()
                db.add_event('network', workitem.network_uuid(),
                             'network node', 'deploy', None, None)
            except exceptions.DeadNetwork as e:
                log_ctx.withField('exception', e).warning(
                    'DeployNetworkTask on dead network')

        elif isinstance(workitem, UpdateDHCPNetworkTask):
            try:
                n.create()
                n.ensure_mesh()
                n.update_dhcp()
                db.add_event('network', workitem.network_uuid(),
                             'network node', 'update dhcp', None, None)
            except exceptions.DeadNetwork as e:
                log_ctx.withField('exception', e).warning(
                    'UpdateDHCPNetworkTask on dead network')

        elif isinstance(workitem, RemoveDHCPNetworkTask):
            n.remove_dhcp()
            db.add_event('network', workitem.network_uuid(),
                         'network node', 'remove dhcp', None, None)

    finally:
        if jobname:
            db.resolve('networknode', jobname)
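# For context, a minimal sketch of the task classes the loop above assumes.
# The class names come from the code, but these definitions are illustrative
# guesses rather than the project's actual implementation: the only contract
# used above is that every network task subclasses NetworkTask and exposes
# network_uuid().
class NetworkTask:
    def __init__(self, network_uuid):
        self._network_uuid = network_uuid

    def network_uuid(self):
        return self._network_uuid


class DeployNetworkTask(NetworkTask):
    pass


class UpdateDHCPNetworkTask(NetworkTask):
    pass


class RemoveDHCPNetworkTask(NetworkTask):
    pass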
def run(self):
    logutil.info(None, 'Starting')

    gauges = {
        'updated_at': Gauge('updated_at',
                            'The last time metrics were updated')
    }

    last_metrics = 0

    def update_metrics():
        # This closure only reads last_metrics' enclosing scope and mutates
        # the shared gauges dict; last_metrics itself is maintained by the
        # caller, so no global or nonlocal declaration is needed here.
        stats = _get_stats()
        for metric in stats:
            if metric not in gauges:
                gauges[metric] = Gauge(metric, '')
            gauges[metric].set(stats[metric])
        db.update_metrics_bulk(stats)
        logutil.info(None, 'Updated metrics')
        gauges['updated_at'].set_to_current_time()

    while True:
        try:
            jobname, _ = db.dequeue(
                '%s-metrics' % config.parsed.get('NODE_NAME'))
            if jobname:
                if time.time() - last_metrics > 2:
                    update_metrics()
                    last_metrics = time.time()
                db.resolve('%s-metrics' % config.parsed.get('NODE_NAME'),
                           jobname)
            else:
                time.sleep(0.2)
                if time.time() - last_metrics > config.parsed.get(
                        'SCHEDULER_CACHE_TIMEOUT'):
                    update_metrics()
                    last_metrics = time.time()

        except Exception as e:
            util.ignore_exception('resource statistics', e)
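# The lazy gauge cache above matters because prometheus_client raises a
# ValueError ('Duplicated timeseries in CollectorRegistry') if the same
# metric name is registered twice. A self-contained sketch of the pattern;
# set_metric() is a hypothetical helper name, not part of the code above:
from prometheus_client import Gauge

_gauges = {}


def set_metric(name, value):
    # Create each gauge once, on the first sighting of the metric name,
    # then reuse the cached collector for every later update.
    if name not in _gauges:
        _gauges[name] = Gauge(name, '')
    _gauges[name].set(value)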
def handle(jobname, workitem):
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if (InstanceTask.__subclasscheck__(type(task)) or
                    isinstance(task, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here
            # eventually? Then this can be generalised to record events
            # on networks/instances.

            # TODO(andy) This event should be recorded when it is recorded
            # as dequeued in the DB. Currently it's reporting action on the
            # item and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(), 'dequeued',
                             None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid,
                                                 task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s'
                               % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')
                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error(
                    'Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by an external issue, not an application error.
        log.info('Fetch Image Error: %s' % e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')
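# A hedged sketch of the queue task interface handle() relies on. The method
# names (name, pretty_task_name, instance_uuid, network, url, error_msg) are
# taken from the calls above; the field layout here is an assumption, not the
# project's real class definitions.
class QueueTask:
    _name = 'unknown'

    def name(self):
        return self._name

    def pretty_task_name(self):
        return self._name.replace('_', ' ')


class InstanceTask(QueueTask):
    def __init__(self, instance_uuid, network=None):
        self._instance_uuid = instance_uuid
        self._network = network

    def instance_uuid(self):
        return self._instance_uuid

    def network(self):
        return self._network


class FetchImageTask(QueueTask):
    _name = 'image_fetch'

    def __init__(self, url, instance_uuid=None):
        self._url = url
        self._instance_uuid = instance_uuid

    def url(self):
        return self._url

    def instance_uuid(self):
        return self._instance_uuid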
def handle(jobname, workitem):
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            if task.get('type').startswith('instance_') and not instance_uuid:
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task.get('type').replace('_', ' '),
                             'dequeued', None, 'Work item %s' % jobname)

            logutil.info(
                ro, 'Executing task %s: %s'
                % (task.get('type', 'unknown'), task))

            if task.get('type') == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task.get('type') == 'instance_preflight':
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    logutil.info(ro, 'Redirecting instance start to %s'
                                 % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task.get('type') == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})

            if task.get('type') == 'instance_delete':
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(
                        instance_uuid, task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'),
                               {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        # Log the failure regardless of whether we know which instance it
        # belongs to; only the delete-enqueue needs an instance uuid.
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'failed queue task: %s' % e)

    finally:
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')
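# Unlike the class-based handler above, this older handler consumes plain
# dicts. An illustrative workitem shape inferred from the .get() calls; the
# field values and the structure of 'network' are made up for the example:
example_workitem = {
    'tasks': [
        {'type': 'image_fetch',
         'url': 'http://example.com/image.qcow2',
         'instance_uuid': 'aaaa-bbbb'},
        {'type': 'instance_start',
         'instance_uuid': 'aaaa-bbbb',
         'network': [{'network_uuid': 'cccc-dddd'}]},
    ],
}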