def observe(path, instance_uuid):
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('triggers'), instance_uuid))

    regexps = {
        'login prompt': ['^.* login: .*', re.compile('.* login: .*')]
    }

    while not os.path.exists(path):
        time.sleep(1)

    fd = os.open(path, os.O_RDONLY | os.O_NONBLOCK)

    logutil.info([virt.ThinInstance(instance_uuid)],
                 'Monitoring %s for triggers' % path)
    db.add_event('instance', instance_uuid, 'trigger monitor',
                 'detected console log', None, None)
    os.lseek(fd, 0, os.SEEK_END)

    buffer = ''
    while True:
        d = os.read(fd, 1024).decode('utf-8')
        if d:
            buffer += d
            lines = buffer.split('\n')
            buffer = lines[-1]

            for line in lines:
                if line:
                    for trigger in regexps:
                        m = regexps[trigger][1].match(line)
                        if m:
                            logutil.info(
                                [virt.ThinInstance(instance_uuid)],
                                'Trigger %s matched' % trigger)
                            db.add_event('instance', instance_uuid, 'trigger',
                                         None, None, trigger)

        time.sleep(1)
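# A hedged sketch: adding another trigger only requires extending the
# regexps dict in observe() with the same [pattern-string, compiled-regex]
# pair it already uses. The 'reboot' trigger below is an illustrative
# assumption, not something the daemon currently watches for.
regexps = {
    'login prompt': ['^.* login: .*', re.compile('.* login: .*')],
    'reboot': ['^.*reboot: Restarting system.*',
               re.compile('.*reboot: Restarting system.*')],
}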
def run(self):
    logutil.info(None, 'Starting')
    observers = {}

    while True:
        # Cleanup terminated observers
        all_observers = list(observers.keys())
        for instance_uuid in all_observers:
            if not observers[instance_uuid].is_alive():
                # Reap process
                observers[instance_uuid].join(1)
                logutil.info([virt.ThinInstance(instance_uuid)],
                             'Trigger observer has terminated')
                db.add_event('instance', instance_uuid,
                             'trigger monitor', 'crashed', None, None)
                del observers[instance_uuid]

        # Start missing observers
        extra_instances = list(observers.keys())

        for inst in db.get_instances(
                only_node=config.parsed.get('NODE_NAME')):
            if inst['uuid'] in extra_instances:
                extra_instances.remove(inst['uuid'])

            if inst['state'] != 'created':
                continue

            if inst['uuid'] not in observers:
                console_path = os.path.join(
                    config.parsed.get('STORAGE_PATH'), 'instances',
                    inst['uuid'], 'console.log')
                p = multiprocessing.Process(
                    target=observe, args=(console_path, inst['uuid']),
                    name='%s-%s' % (daemon.process_name('triggers'),
                                    inst['uuid']))
                p.start()

                observers[inst['uuid']] = p
                logutil.info([virt.ThinInstance(inst['uuid'])],
                             'Started trigger observer')
                db.add_event('instance', inst['uuid'],
                             'trigger monitor', 'started', None, None)

        # Cleanup extra observers
        for instance_uuid in extra_instances:
            p = observers[instance_uuid]
            try:
                os.kill(p.pid, signal.SIGKILL)
            except Exception:
                pass

            del observers[instance_uuid]
            logutil.info([virt.ThinInstance(instance_uuid)],
                         'Finished trigger observer')
            db.add_event('instance', instance_uuid,
                         'trigger monitor', 'finished', None, None)

        time.sleep(1)
def wrapper(*args, **kwargs):
    if not kwargs.get('instance_from_db'):
        logutil.info([virt.ThinInstance(kwargs['instance_uuid'])],
                     'Instance not found, kwarg missing')
        return error(404, 'instance not found')

    if get_jwt_identity() not in [
            kwargs['instance_from_db']['namespace'], 'system']:
        logutil.info([virt.ThinInstance(kwargs['instance_uuid'])],
                     'Instance not found, ownership test in decorator')
        return error(404, 'instance not found')

    return func(*args, **kwargs)
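# A hedged sketch of the enclosing decorator this wrapper most likely
# belongs to: the closure references `func`, so it must be returned from a
# decorator. The decorator name and the use of functools.wraps are
# illustrative assumptions, not taken from the excerpt above.
import functools


def requires_instance_ownership(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # ... ownership checks as in the wrapper shown above ...
        return func(*args, **kwargs)
    return wrapper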
def run(self):
    logutil.info(None, 'Starting')
    last_compaction = 0

    while True:
        # Update power state of all instances on this hypervisor
        logutil.info(None, 'Updating power states')
        self._update_power_states()

        # Cleanup soft deleted instances and networks
        delay = config.parsed.get('CLEANER_DELAY')

        for i in db.get_stale_instances(delay):
            logutil.info([virt.ThinInstance(i['uuid'])],
                         'Hard deleting instance')
            db.hard_delete_instance(i['uuid'])

        for n in db.get_stale_networks(delay):
            logutil.info([net.ThinNetwork(n['uuid'])],
                         'Hard deleting network')
            db.hard_delete_network(n['uuid'])

        for ni in db.get_stale_network_interfaces(delay):
            logutil.info([net.ThinNetworkInterface(ni['uuid'])],
                         'Hard deleting network interface')
            db.hard_delete_network_interface(ni['uuid'])

        # Perform etcd maintenance
        if time.time() - last_compaction > 1800:
            logutil.info(None, 'Compacting etcd')
            self._compact_etcd()
            last_compaction = time.time()

        time.sleep(60)
def wrapper(*args, **kwargs):
    if 'instance_uuid' in kwargs:
        kwargs['instance_from_db_virt'] = virt.from_db(
            kwargs['instance_uuid'])

    if not kwargs.get('instance_from_db_virt'):
        logutil.info([virt.ThinInstance(kwargs['instance_uuid'])],
                     'Instance not found, genuinely missing')
        return error(404, 'instance not found')

    return func(*args, **kwargs)
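# A hedged usage sketch of how these wrappers would be stacked once wrapped
# in their decorators. Decorators apply bottom-up, so the outermost (top)
# decorator's wrapper runs first: the uuid-lookup decorator injects the
# instance kwarg, then the ownership check runs, then the handler. All
# names below are illustrative assumptions, not taken from the excerpts.
@arg_is_instance_uuid          # assumed: injects instance_from_db into kwargs
@requires_instance_ownership   # assumed: the ownership wrapper shown earlier
def get_instance(instance_uuid=None, instance_from_db=None):
    return instance_from_db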
def post(self, name=None, cpus=None, memory=None, network=None,
         disk=None, ssh_key=None, user_data=None, placed_on=None,
         namespace=None, instance_uuid=None, video=None):
    global SCHEDULER

    # Check that the instance name is safe for use as a DNS host name
    if name != re.sub(r'([^a-zA-Z0-9_\-])', '', name) or len(name) > 63:
        return error(400, 'instance name must be useable as a DNS host name')

    # Sanity check
    if not disk:
        return error(400, 'instance must specify at least one disk')
    for d in disk:
        if not isinstance(d, dict):
            return error(400,
                         'disk specification should contain JSON objects')

    if network:
        for n in network:
            if not isinstance(n, dict):
                return error(
                    400, 'network specification should contain JSON objects')
            if 'network_uuid' not in n:
                return error(
                    400, 'network specification is missing network_uuid')

    if not video:
        video = {'model': 'cirrus', 'memory': 16384}

    if not namespace:
        namespace = get_jwt_identity()

    # Only system can specify a uuid
    if instance_uuid and get_jwt_identity() != 'system':
        return error(401, 'only system can specify an instance uuid')

    # If accessing a foreign namespace, we need to be an admin
    if get_jwt_identity() not in [namespace, 'system']:
        return error(
            401, 'only admins can create resources in a different namespace')

    # The instance needs to exist in the DB before network interfaces are
    # created
    if not instance_uuid:
        instance_uuid = str(uuid.uuid4())
        db.add_event('instance', instance_uuid, 'uuid allocated',
                     None, None, None)

    # Create instance object
    instance = virt.from_db(instance_uuid)
    if instance:
        if get_jwt_identity() not in [
                instance.db_entry['namespace'], 'system']:
            logutil.info([virt.ThinInstance(instance_uuid)],
                         'Instance not found, ownership test')
            return error(404, 'instance not found')

    if not instance:
        instance = virt.from_definition(
            uuid=instance_uuid,
            name=name,
            disks=disk,
            memory_mb=memory,
            vcpus=cpus,
            ssh_key=ssh_key,
            user_data=user_data,
            owner=namespace,
            video=video,
            requested_placement=placed_on)

    # Initialise metadata
    db.persist_metadata('instance', instance_uuid, {})

    # Allocate IP addresses
    order = 0
    if network:
        for netdesc in network:
            n = net.from_db(netdesc['network_uuid'])
            if not n:
                db.enqueue_instance_delete(
                    config.parsed.get('NODE_NAME'), instance_uuid, 'error',
                    'missing network %s during IP allocation phase'
                    % netdesc['network_uuid'])
                return error(
                    404, 'network %s not found' % netdesc['network_uuid'])

            with db.get_lock('ipmanager', None, netdesc['network_uuid'],
                             ttl=120):
                db.add_event('network', netdesc['network_uuid'],
                             'allocate address', None, None, instance_uuid)
                ipm = db.get_ipmanager(netdesc['network_uuid'])
                if 'address' not in netdesc or not netdesc['address']:
                    netdesc['address'] = ipm.get_random_free_address()
                else:
                    if not ipm.reserve(netdesc['address']):
                        db.enqueue_instance_delete(
                            config.parsed.get('NODE_NAME'), instance_uuid,
                            'error',
                            'failed to reserve an IP on network %s'
                            % netdesc['network_uuid'])
                        return error(
                            409, 'address %s in use' % netdesc['address'])
                db.persist_ipmanager(netdesc['network_uuid'], ipm.save())

            if 'model' not in netdesc or not netdesc['model']:
                netdesc['model'] = 'virtio'

            db.create_network_interface(
                str(uuid.uuid4()), netdesc, instance_uuid, order)
            order += 1

    if not SCHEDULER:
        SCHEDULER = scheduler.Scheduler()

    try:
        # Have we been placed?
        if not placed_on:
            candidates = SCHEDULER.place_instance(instance, network)
            placement = candidates[0]
        else:
            SCHEDULER.place_instance(
                instance, network, candidates=[placed_on])
            placement = placed_on

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                     'insufficient resources: ' + str(e))
        db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                   instance_uuid, 'error',
                                   'scheduling failed')
        return error(507, str(e))

    except exceptions.CandidateNodeNotFoundException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                     'candidate node not found: ' + str(e))
        db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                   instance_uuid, 'error',
                                   'scheduling failed')
        return error(404, 'node not found: %s' % e)

    # Record placement
    db.place_instance(instance_uuid, placement)
    db.add_event('instance', instance_uuid, 'placement', None, None,
                 placement)

    # Create a queue entry for the instance start
    tasks = [{
        'type': 'instance_preflight',
        'instance_uuid': instance_uuid,
        'network': network
    }]
    for disk in instance.db_entry['block_devices']['devices']:
        if 'base' in disk and disk['base']:
            tasks.append({
                'type': 'image_fetch',
                'instance_uuid': instance_uuid,
                'url': disk['base']
            })
    tasks.append({
        'type': 'instance_start',
        'instance_uuid': instance_uuid,
        'network': network
    })

    # Enqueue creation tasks on desired node task queue
    db.enqueue(placement, {'tasks': tasks})
    db.add_event('instance', instance_uuid, 'create', 'enqueued',
                 None, None)

    # Watch for a while and return results if things are fast, give up
    # after a while and just return the current state
    start_time = time.time()
    while time.time() - start_time < config.parsed.get('API_ASYNC_WAIT'):
        i = db.get_instance(instance_uuid)
        if i['state'] in ['created', 'deleted', 'error']:
            return i
        time.sleep(0.5)

    return i
def _update_power_states(self):
    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    try:
        seen = []

        # Active VMs have an ID. Active means running in libvirt land.
        for domain_id in conn.listDomainsID():
            domain = conn.lookupByID(domain_id)
            if not domain.name().startswith('sf:'):
                continue

            instance_uuid = domain.name().split(':')[1]
            instance = db.get_instance(instance_uuid)
            if not instance:
                # Instance is SF but not in database. Kill to reduce load.
                logutil.warning([virt.ThinInstance(instance_uuid)],
                                'Destroying unknown instance')
                util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                continue

            db.place_instance(instance_uuid, config.parsed.get('NODE_NAME'))
            seen.append(domain.name())

            if instance.get('state') == 'deleted':
                # NOTE(mikal): a delete might be in-flight in the queue.
                # We only worry about instances which should have gone
                # away five minutes ago.
                if time.time() - instance['state_updated'] < 300:
                    continue

                db.instance_enforced_deletes_increment(instance_uuid)
                attempts = instance.get('enforced_deletes', 0)

                if attempts > 5:
                    # Sometimes we just can't delete the VM. Try the big
                    # hammer instead.
                    logutil.warning(
                        [virt.ThinInstance(instance_uuid)],
                        'Attempting alternate delete method for instance')
                    util.execute(
                        None, 'virsh destroy "sf:%s"' % instance_uuid)
                    db.add_event('instance', instance_uuid,
                                 'enforced delete', 'complete', None, None)
                else:
                    i = virt.from_db(instance_uuid)
                    i.delete()
                    i.update_instance_state('deleted')

                logutil.warning([virt.ThinInstance(instance_uuid)],
                                'Deleting stray instance (attempt %d)'
                                % attempts)
                continue

            state = util.extract_power_state(libvirt, domain)
            db.update_instance_power_state(instance_uuid, state)
            if state == 'crashed':
                db.update_instance_state(instance_uuid, 'error')

        # Inactive VMs just have a name, and are powered off in our state
        # system.
        for domain_name in conn.listDefinedDomains():
            if not domain_name.startswith('sf:'):
                continue

            if domain_name not in seen:
                instance_uuid = domain_name.split(':')[1]
                instance = db.get_instance(instance_uuid)

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    domain = conn.lookupByName(domain_name)
                    domain.undefine()
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected stray instance')
                    db.add_event('instance', instance_uuid,
                                 'deleted stray', 'complete', None, None)
                    continue

                db.place_instance(
                    instance_uuid, config.parsed.get('NODE_NAME'))
                instance_path = os.path.join(
                    config.parsed.get('STORAGE_PATH'), 'instances',
                    instance_uuid)

                if not os.path.exists(instance_path):
                    # If we're inactive and our files aren't on disk, we
                    # have a problem.
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected error state for instance')
                    db.update_instance_state(instance_uuid, 'error')
                elif instance.get('power_state') != 'off':
                    logutil.info([virt.ThinInstance(instance_uuid)],
                                 'Detected power off for instance')
                    db.update_instance_power_state(instance_uuid, 'off')
                    db.add_event('instance', instance_uuid,
                                 'detected poweroff', 'complete', None, None)

    except libvirt.libvirtError as e:
        logutil.error(None, 'Failed to lookup all domains: %s' % e)
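# _update_power_states() relies on util.extract_power_state() to turn
# libvirt's numeric domain state into the strings used above ('on', 'off',
# 'paused', 'crashed'). A minimal sketch of what such a helper might look
# like, using real libvirt constants but otherwise an assumption about the
# helper's internals rather than the project's actual implementation:
def extract_power_state_sketch(libvirt, domain):
    # domain.state() returns a (state, reason) pair.
    state, _reason = domain.state()
    if state == libvirt.VIR_DOMAIN_SHUTOFF:
        return 'off'
    if state == libvirt.VIR_DOMAIN_CRASHED:
        return 'crashed'
    if state in (libvirt.VIR_DOMAIN_PAUSED, libvirt.VIR_DOMAIN_PMSUSPENDED):
        return 'paused'
    # Treat everything else (running, blocked, shutting down, etc.) as on.
    return 'on'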