def parse(self):
    """Build self.config from CONFIG_DEFAULTS overlaid with environment.

    Resolves this node's name and IP (falling back to 127.0.0.1 when
    resolution fails, which we only expect on localhost development
    machines), stores them into CONFIG_DEFAULTS, then copies the defaults
    and applies any SHAKENFIST_* environment variable overrides.

    Raises:
        exceptions.FlagException: if an environment override names a flag
            with no default, or the default's type is not int/float/str.
    """
    global CONFIG_DEFAULTS

    node_name = socket.getfqdn()
    try:
        node_ip = socket.gethostbyname(node_name)
    except Exception as e:
        # Only for localhost development environments
        node_ip = '127.0.0.1'
        util.ignore_exception('config parser', e)

    CONFIG_DEFAULTS['NODE_NAME'] = node_name
    CONFIG_DEFAULTS['NODE_IP'] = node_ip
    CONFIG_DEFAULTS['NETWORK_NODE_IP'] = node_ip

    self.config = copy.copy(CONFIG_DEFAULTS)
    for var, value in os.environ.items():
        if not var.startswith('SHAKENFIST_'):
            continue

        # Strip only the leading prefix. The previous str.replace() call
        # removed every occurrence of 'SHAKENFIST_', mangling any flag
        # name that happened to contain the prefix again.
        flag = var[len('SHAKENFIST_'):]

        # An unknown flag used to surface as a bare KeyError on the
        # CONFIG_DEFAULTS lookup below; raise the module's own exception
        # with a clear message instead.
        if flag not in CONFIG_DEFAULTS:
            raise exceptions.FlagException(
                'Flag %s is not a known configuration option.' % flag)

        # We use the type of the default value to determine what type we
        # should force the value provided by an environment variable into.
        # Note bool is a subclass of int, so boolean defaults are coerced
        # via int() — same behavior as before.
        default = CONFIG_DEFAULTS[flag]
        if isinstance(default, int):
            value = int(value)
        elif isinstance(default, float):
            value = float(value)
        elif not isinstance(default, str):
            raise exceptions.FlagException(
                'Flag %s has unknown type.' % flag)

        self.config[flag] = value
def delete(self):
    """Tear down this instance: domain, on-disk state, network addresses
    and console ports. Each phase is best-effort; failures are logged via
    util.ignore_exception and do not stop the remaining cleanup."""

    # Phase 1: power the domain off and remove its libvirt definition.
    with util.RecordedOperation('delete domain', self):
        try:
            self.power_off()
            domain = self._get_domain()
            if domain:
                domain.undefine()
        except Exception as e:
            util.ignore_exception('instance delete', e)

    # Phase 2: remove the instance's on-disk directory, if present.
    with util.RecordedOperation('delete disks', self):
        try:
            if os.path.exists(self.instance_path):
                shutil.rmtree(self.instance_path)
        except Exception as e:
            util.ignore_exception('instance delete', e)

    # Phase 3: mark interfaces deleted and hand their addresses back to
    # the per-network IP manager, under that network's lock.
    with util.RecordedOperation('release network addresses', self):
        for iface in db.get_instance_interfaces(self.db_entry['uuid']):
            db.update_network_interface_state(iface['uuid'], 'deleted')
            with db.get_lock('ipmanager', None, iface['network_uuid'],
                             ttl=120, op='Instance delete'):
                ipm = db.get_ipmanager(iface['network_uuid'])
                ipm.release(iface['ipv4'])
                db.persist_ipmanager(iface['network_uuid'], ipm.save())

    # Finally free both console ports for reuse.
    for port_key in ('console_port', 'vdi_port'):
        db.free_console_port(self.db_entry[port_key])
def run(self):
    """Main loop for the queues daemon: dequeue work items and fan each
    one out to a worker process, capping concurrency at half the node's
    present CPUs. Runs forever; per-iteration errors are logged and
    ignored so the daemon keeps servicing the queue."""
    workers = []
    LOG.info('Starting Queues')

    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    present_cpus, _, _ = conn.getCPUMap()

    while True:
        try:
            # Reap finished workers. Iterate a snapshot because we
            # mutate the list as we go.
            for worker in list(workers):
                if not worker.is_alive():
                    worker.join(1)
                    workers.remove(worker)

            # Only dequeue when we have spare worker capacity.
            workitem = None
            if len(workers) < present_cpus / 2:
                jobname, workitem = db.dequeue(config.NODE_NAME)

            if not workitem:
                time.sleep(0.2)
                continue

            proc = multiprocessing.Process(
                target=handle, args=(jobname, workitem,),
                name='%s-worker' % daemon.process_name('queues'))
            proc.start()
            workers.append(proc)
        except Exception as e:
            util.ignore_exception(daemon.process_name('queues'), e)
def _compact_etcd(self):
    """Compact and defragment etcd history, best-effort.

    We need a revision number to compact to, so we write a marker key
    recording when we last compacted and use that key's latest mod
    revision as the compaction target. Note that this uses a different
    etcd library to the rest of the code, because our primary library
    does not support compaction.
    """
    try:
        client = etcd3.client()
        client.put('/sf/compact',
                   json.dumps({'compacted_at': time.time()}))
        _, kv = client.get('/sf/compact')
        client.compact(kv.mod_revision, physical=True)
        client.defragment()
        LOG.info('Compacted etcd')
    except Exception as e:
        # Compaction is housekeeping; never let it kill the caller.
        util.ignore_exception('etcd compaction', e)
def restore_instances():
    """On daemon restart, re-create the networks and previously-running
    instances assigned to this node."""

    # Collect the instance uuids on this node, and the set of networks
    # any of their interfaces touch.
    networks = []
    instances = []
    for inst in list(
            db.get_instances(only_node=config.parsed.get('NODE_NAME'))):
        for iface in db.get_instance_interfaces(inst['uuid']):
            if iface['network_uuid'] not in networks:
                networks.append(iface['network_uuid'])
        instances.append(inst['uuid'])

    # Networks must exist before instances can attach to them.
    with util.RecordedOperation('restore networks', None):
        for network in networks:
            try:
                network_obj = net.from_db(network)
                LOG.withObj(network_obj).info('Restoring network')
                network_obj.create()
                network_obj.ensure_mesh()
                network_obj.update_dhcp()
            except Exception as e:
                util.ignore_exception('restore network %s' % network, e)

    with util.RecordedOperation('restore instances', None):
        for instance in instances:
            try:
                with db.get_lock('instance', None, instance, ttl=120,
                                 timeout=120, op='Instance restore'):
                    inst_obj = virt.from_db(instance)
                    if not inst_obj:
                        continue

                    # Only restore instances that were (or might have
                    # been) running before the restart.
                    started = ['on', 'transition-to-on',
                               'initial', 'unknown']
                    db_state = inst_obj.db_entry.get(
                        'power_state', 'unknown')
                    if db_state not in started:
                        continue

                    LOG.withObj(inst_obj).info('Restoring instance')
                    inst_obj.create()
            except Exception as e:
                util.ignore_exception('restore instance %s' % instance, e)
                db.enqueue_instance_error(
                    instance,
                    'exception while restoring instance on daemon restart')
def run(self):
    """Main loop for the metrics daemon.

    Publishes node statistics to prometheus Gauges and to the database,
    refreshing at most once every two seconds when poked via the
    per-node metrics queue, and at least once per
    SCHEDULER_CACHE_TIMEOUT regardless.
    """
    logutil.info(None, 'Starting')
    gauges = {
        'updated_at': Gauge('updated_at',
                            'The last time metrics were updated')
    }

    # Timestamp of the last metrics refresh; used for throttling below.
    last_metrics = 0

    def update_metrics():
        # NOTE: the previous "global last_metrics" declaration here was
        # dead and misleading -- last_metrics is a local of run(), the
        # global statement pointed at module scope, and this closure
        # never assigns (or even reads) the name. The caller updates
        # last_metrics itself after each refresh.
        stats = _get_stats()
        for metric in stats:
            if metric not in gauges:
                gauges[metric] = Gauge(metric, '')
            gauges[metric].set(stats[metric])

        db.update_metrics_bulk(stats)
        logutil.info(None, 'Updated metrics')
        gauges['updated_at'].set_to_current_time()

    while True:
        try:
            jobname, _ = db.dequeue(
                '%s-metrics' % config.parsed.get('NODE_NAME'))
            if jobname:
                # Throttle explicit refresh requests to one per two
                # seconds, but always acknowledge the queue item.
                if time.time() - last_metrics > 2:
                    update_metrics()
                    last_metrics = time.time()
                db.resolve('%s-metrics' % config.parsed.get('NODE_NAME'),
                           jobname)
            else:
                time.sleep(0.2)

            # Periodic refresh even without queue activity, so the
            # scheduler's cached view never goes stale.
            if (time.time() - last_metrics >
                    config.parsed.get('SCHEDULER_CACHE_TIMEOUT')):
                update_metrics()
                last_metrics = time.time()
        except Exception as e:
            util.ignore_exception('resource statistics', e)
def _get_stats():
    """Collect this node's resource statistics as a flat dict.

    Gathers CPU, memory (system, libvirt and KSM), disk, network and
    per-guest consumption figures, plus queue depth health statistics.
    Returns a dict of metric name -> numeric value.
    """
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })

    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled as data we don't currently use
    # for i in range(present_cpus):
    #     per_cpu_stats = conn.getCPUStats(i)
    #     for key in per_cpu_stats:
    #         retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    # Load averages are not available on all platforms, so best-effort.
    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception as e:
        util.ignore_exception('load average', e)

    # System memory info, converting bytes to mb
    stats = psutil.virtual_memory()
    retval.update({
        'memory_max': stats.total // 1024 // 1024,
        'memory_available': stats.available // 1024 // 1024
    })

    # libvirt memory info, converting kb to mb
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max_libvirt': memory_status['total'] // 1024,
        'memory_available_libvirt': memory_status['free'] // 1024,
    })

    # Kernel Shared Memory (KSM) information
    ksm_details = {}
    for ent in os.listdir('/sys/kernel/mm/ksm'):
        with open('/sys/kernel/mm/ksm/%s' % ent) as f:
            ksm_details['memory_ksm_%s' % ent] = int(f.read().rstrip())
    retval.update(ksm_details)

    # Disk info
    s = os.statvfs(config.get('STORAGE_PATH'))
    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_total': s.f_frsize * s.f_blocks,
        'disk_free': s.f_frsize * s.f_bavail,
        'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        try:
            active = guest.isActive() == 1
            if active:
                _, maxmem, mem, cpus, cpu_time = guest.info()
        except libvirt.libvirtError as e:
            LOG.debug('During resource calc ignored libvirt error: %s' % e)
            active = False

        # Count every defined domain; previously this increment sat
        # under the active check, which made instances_total always
        # equal instances_active and therefore meaningless.
        total_instances += 1
        if active:
            total_active_instances += 1
            total_instance_max_memory += maxmem
            total_instance_actual_memory += mem
            total_instance_vcpus += cpus
            total_instance_cpu_time += cpu_time

    # Queue health statistics
    node_queue_processing, node_queue_waiting = db.get_queue_length(
        config.NODE_NAME)

    retval.update({
        'cpu_total_instance_vcpus': total_instance_vcpus,
        'cpu_total_instance_cpu_time': total_instance_cpu_time,
        'memory_total_instance_max': total_instance_max_memory // 1024,
        'memory_total_instance_actual':
            total_instance_actual_memory // 1024,
        'instances_total': total_instances,
        'instances_active': total_active_instances,
        'node_queue_processing': node_queue_processing,
        'node_queue_waiting': node_queue_waiting,
    })

    # The network node additionally reports its own queue depths.
    if util.is_network_node():
        network_queue_processing, network_queue_waiting = \
            db.get_queue_length('networknode')
        retval.update({
            'network_queue_processing': network_queue_processing,
            'network_queue_waiting': network_queue_waiting,
        })

    return retval
def handle(jobname, workitem):
    """Execute all tasks in a dequeued work item.

    Dispatches each decoded QueueTask subclass in workitem['tasks'] to
    its handler (image fetch, preflight, start, delete, error). Always
    resolves the work item against this node in the finally block, and
    records a 'tasks complete' event for the affected instance.
    """
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')
    # Make the worker identifiable in process listings.
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            # Anything that isn't a QueueTask failed to decode upstream.
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            # Instance tasks (and image fetches) carry the uuid used for
            # logging and event recording below.
            if (InstanceTask.__subclasscheck__(type(task)) or
                    isinstance(task, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(),
                             'dequeued', None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                # Preflight may decide another node should run this
                # instance; if so the whole work item is re-queued there
                # and we stop processing locally.
                redirect_to = instance_preflight(instance_uuid,
                                                 task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s'
                               % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                # Poke the metrics daemon so scheduler data refreshes.
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                # Deletion is best-effort: failures are logged and the
                # remaining tasks still run.
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(
                        daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                # Like delete, but the instance ends in the 'error'
                # state with an optional operator-visible message.
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')
                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(
                        daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error(
                    'Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        # Always acknowledge the work item, even on failure, so it is
        # not processed again.
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')
def handle(jobname, workitem):
    """Execute all tasks in a dequeued work item (dict-based task format).

    Each task is a dict with a 'type' key dispatched via sequential if
    checks: image_fetch, instance_preflight, instance_start,
    instance_delete. Always resolves the work item against this node in
    the finally block.
    """
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    # Make the worker identifiable in process listings.
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            # ro is the list of related objects used for log context.
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            # Every instance_* task must name its instance; bail out of
            # the whole work item if one doesn't.
            if task.get('type').startswith('instance_') \
                    and not instance_uuid:
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task.get('type').replace('_', ' '),
                             'dequeued', None, 'Work item %s' % jobname)

            logutil.info(ro,
                         'Executing task %s: %s'
                         % (task.get('type', 'unknown'), task))
            if task.get('type') == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task.get('type') == 'instance_preflight':
                # Preflight may redirect the instance to another node;
                # the whole work item is re-queued there and local
                # processing stops.
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    util.log('info', ro,
                             'Redirecting instance start to %s'
                             % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task.get('type') == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                # Poke the metrics daemon so scheduler data refreshes.
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'),
                           {})

            if task.get('type') == 'instance_delete':
                # Best-effort deletion; the instance lands in
                # next_state (default 'unknown') with an optional
                # message.
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(
                        instance_uuid, task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid,
                            task.get('next_state_message'))
                    db.enqueue(
                        '%s-metrics' % config.parsed.get('NODE_NAME'),
                        {})
                except Exception as e:
                    util.ignore_exception(
                        daemon.process_name('queues'), e)

    except Exception as e:
        # NOTE(review): when instance_uuid is unset this swallows the
        # exception without even logging it -- presumably intentional
        # best-effort behavior, but worth confirming.
        if instance_uuid:
            util.ignore_exception(daemon.process_name('queues'), e)
            db.enqueue_instance_delete(
                config.parsed.get('NODE_NAME'), instance_uuid, 'error',
                'failed queue task: %s' % e)

    finally:
        # Always acknowledge the work item, even on failure, so it is
        # not processed again.
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')