def instance_preflight(instance_uuid, network):
    db.update_instance_state(instance_uuid, 'preflight')

    s = scheduler.Scheduler()
    instance = virt.from_db(instance_uuid)

    try:
        s.place_instance(instance, network,
                         candidates=[config.parsed.get('NODE_NAME')])
        return None

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'retry', None,
                     'insufficient resources: ' + str(e))

    if instance.db_entry.get('placement_attempts') > 3:
        raise exceptions.AbortInstanceStartException(
            'Too many start attempts')

    try:
        if instance.db_entry.get('requested_placement'):
            candidates = [instance.db_entry.get('requested_placement')]
        else:
            candidates = []
            for node in s.metrics.keys():
                if node != config.parsed.get('NODE_NAME'):
                    candidates.append(node)

        candidates = s.place_instance(instance, network,
                                      candidates=candidates)
        return candidates[0]

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                     'insufficient resources: ' + str(e))
        # This raise implies delete above
        raise exceptions.AbortInstanceStartException(
            'Unable to find suitable node')
def instance_delete(instance_uuid):
    with db.get_lock('instance', None, instance_uuid):
        db.add_event('instance', instance_uuid, 'queued', 'delete',
                     None, None)

        # Create list of networks used by instance
        instance_networks = []
        for iface in list(db.get_instance_interfaces(instance_uuid)):
            if iface['network_uuid'] not in instance_networks:
                instance_networks.append(iface['network_uuid'])

        # Create list of networks used by all other instances
        host_networks = []
        for inst in list(
                db.get_instances(only_node=config.parsed.get('NODE_NAME'))):
            if not inst['uuid'] == instance_uuid:
                for iface in db.get_instance_interfaces(inst['uuid']):
                    if iface['network_uuid'] not in host_networks:
                        host_networks.append(iface['network_uuid'])

        instance_from_db_virt = virt.from_db(instance_uuid)
        if instance_from_db_virt:
            instance_from_db_virt.delete()

        # Check each network used by the deleted instance
        for network in instance_networks:
            n = net.from_db(network)
            if n:
                # If network used by another instance, only update
                if network in host_networks:
                    with util.RecordedOperation('deallocate ip address',
                                                instance_from_db_virt):
                        n.update_dhcp()
                else:
                    # Network not used by any other instance therefore delete
                    with util.RecordedOperation('remove network', n):
                        n.delete()
def _get(self, locks, related_object):
    """Fetch image if not downloaded and return image path."""
    actual_image = self.version_image_path()

    with util.RecordedOperation('fetch image', related_object):
        resp = self._open_connection()

        diff_field = self._new_image_available(resp)
        if diff_field:
            self.log.withField('diff_field', diff_field).info(
                'Fetch required due to HTTP field change')
            if related_object:
                t, u = related_object.unique_label()
                msg = '%s: %s -> %s' % diff_field
                db.add_event(t, u, 'image requires fetch', None, None, msg)

            actual_image = self._fetch(resp, locks)

            # Ensure checksum is correct
            if not self.correct_checksum(actual_image):
                if isinstance(related_object, virt.Instance):
                    related_object.add_event('fetch image', 'bad checksum')
                raise exceptions.BadCheckSum('url=%s' % self.url)

            # Only persist values after the file has been verified.
            # Otherwise diff_field will not trigger a new download in the
            # case of a checksum verification failure.
            self.fetched = email.utils.formatdate()
            self.modified = resp.headers.get('Last-Modified')
            self.size = resp.headers.get('Content-Length')
            self.persist()

    _transcode(locks, actual_image, related_object)
    return actual_image
def _process_network_node_workitems(self):
    jobname, workitem = db.dequeue('networknode')
    try:
        if not workitem:
            time.sleep(0.2)
            return

        log_ctx = LOG.withField('workitem', workitem)
        if not NetworkTask.__subclasscheck__(type(workitem)):
            raise exceptions.UnknownTaskException(
                'Network workitem was not decoded: %s' % workitem)

        n = net.from_db(workitem.network_uuid())
        if not n:
            log_ctx.withNetwork(workitem.network_uuid()).warning(
                'Received work item for non-existent network')
            return

        # NOTE(mikal): there's really nothing stopping us from processing
        # a bunch of these jobs in parallel with a pool of workers, but
        # I am not sure it's worth the complexity right now. Are we really
        # going to be changing networks that much?
        if isinstance(workitem, DeployNetworkTask):
            try:
                n.create()
                n.ensure_mesh()
                db.add_event('network', workitem.network_uuid(),
                             'network node', 'deploy', None, None)
            except exceptions.DeadNetwork as e:
                log_ctx.withField('exception', e).warning(
                    'DeployNetworkTask on dead network')

        elif isinstance(workitem, UpdateDHCPNetworkTask):
            try:
                n.create()
                n.ensure_mesh()
                n.update_dhcp()
                db.add_event('network', workitem.network_uuid(),
                             'network node', 'update dhcp', None, None)
            except exceptions.DeadNetwork as e:
                log_ctx.withField('exception', e).warning(
                    'UpdateDHCPNetworkTask on dead network')

        elif isinstance(workitem, RemoveDHCPNetworkTask):
            n.remove_dhcp()
            db.add_event('network', workitem.network_uuid(),
                         'network node', 'remove dhcp', None, None)

    finally:
        if jobname:
            db.resolve('networknode', jobname)
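# A minimal sketch of the decode check used above: for a plain class
# hierarchy, NetworkTask.__subclasscheck__(type(workitem)) is equivalent
# to issubclass(type(workitem), NetworkTask), verifying the decoded
# object's actual class. The classes below are simplified stand-ins, not
# the project's real task types.
class ExampleNetworkTask:
    pass


class ExampleUpdateDHCPNetworkTask(ExampleNetworkTask):
    def __init__(self, network_uuid):
        self._network_uuid = network_uuid

    def network_uuid(self):
        return self._network_uuid


example_workitem = ExampleUpdateDHCPNetworkTask('a-network-uuid')
assert ExampleNetworkTask.__subclasscheck__(type(example_workitem))
assert issubclass(type(example_workitem), ExampleNetworkTask)  # same check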
def __enter__(self):
    start_time = time.time()
    slow_warned = False
    threshold = int(config.parsed.get('SLOW_LOCK_THRESHOLD'))

    try:
        while time.time() - start_time < self.timeout:
            res = self.acquire()
            if res:
                return self

            duration = time.time() - start_time
            if duration > threshold and not slow_warned:
                db.add_event(self.objecttype, self.objectname,
                             'lock', 'acquire', None,
                             'Waiting for lock more than threshold')
                node, pid = self.get_holder()
                logutil.info(
                    self.relatedobjects,
                    'Waiting for lock on %s: %.02f seconds, threshold '
                    '%d seconds. Holder is pid %s on %s.'
                    % (self.path, duration, threshold, pid, node))
                slow_warned = True

            time.sleep(1)

        duration = time.time() - start_time
        db.add_event(
            self.objecttype, self.objectname, 'lock', 'failed', None,
            'Failed to acquire lock after %.02f seconds' % duration)
        node, pid = self.get_holder()
        logutil.info(
            self.relatedobjects,
            'Failed to acquire lock %s after %.02f seconds. '
            'Holder is pid %s on %s.' % (self.path, duration, pid, node))
        raise exceptions.LockException(
            'Cannot acquire lock %s, timed out after %.02f seconds'
            % (self.name, duration))

    finally:
        duration = time.time() - start_time
        if duration > threshold:
            db.add_event(self.objecttype, self.objectname,
                         'lock', 'acquired', None,
                         'Waited %d seconds for lock' % duration)
            logutil.info(
                self.relatedobjects,
                'Acquiring a lock on %s was slow: %.02f seconds'
                % (self.path, duration))
def __enter__(self):
    start_time = time.time()
    slow_warned = False
    threshold = int(config.SLOW_LOCK_THRESHOLD)

    while time.time() - start_time < self.timeout:
        res = self.acquire()
        if res:
            duration = time.time() - start_time
            if duration > threshold:
                db.add_event(self.objecttype, self.objectname,
                             'lock', 'acquired', None,
                             'Waited %d seconds for lock' % duration)
                self.log_ctx.with_field(
                    'duration', duration).info('Acquiring a lock was slow')
            return self

        duration = time.time() - start_time
        if duration > threshold and not slow_warned:
            db.add_event(self.objecttype, self.objectname,
                         'lock', 'acquire', None,
                         'Waiting for lock more than threshold')
            node, pid = self.get_holder()
            self.log_ctx.with_fields({
                'duration': duration,
                'threshold': threshold,
                'holder-pid': pid,
                'holder-node': node,
                'requesting-op': self.operation,
            }).info('Waiting for lock')
            slow_warned = True

        time.sleep(1)

    duration = time.time() - start_time
    db.add_event(self.objecttype, self.objectname, 'lock', 'failed', None,
                 'Failed to acquire lock after %.02f seconds' % duration)
    node, pid = self.get_holder()
    self.log_ctx.with_fields({
        'duration': duration,
        'holder-pid': pid,
        'holder-node': node,
        'requesting-op': self.operation,
    }).info('Failed to acquire lock')
    raise exceptions.LockException(
        'Cannot acquire lock %s, timed out after %.02f seconds'
        % (self.name, self.timeout))
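# A usage sketch for the lock context manager above, assuming
# db.get_lock() constructs one of these objects as at the call sites
# elsewhere in this section (the exact signature varies between the two
# versions shown). The timeout path raises LockException, so callers
# should be prepared to handle it.
example_instance_uuid = 'an-instance-uuid'  # hypothetical
try:
    with db.get_lock('instance', None, example_instance_uuid, ttl=120,
                     op='Example operation'):
        pass  # critical section: mutate instance state here
except exceptions.LockException:
    pass  # never acquired within the timeout; retry or surface the error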
def post(self, netblock=None, provide_dhcp=None, provide_nat=None,
         name=None, namespace=None):
    try:
        ipaddress.ip_network(netblock)
    except ValueError as e:
        return error(400, 'cannot parse netblock: %s' % e)

    if not namespace:
        namespace = get_jwt_identity()

    # If accessing a foreign namespace, we need to be an admin
    if get_jwt_identity() not in [namespace, 'system']:
        return error(
            401,
            'only admins can create resources in a different namespace')

    network = db.allocate_network(netblock, provide_dhcp, provide_nat,
                                  name, namespace)
    db.add_event('network', network['uuid'], 'api', 'create', None, None)

    # Networks should immediately appear on the network node
    db.enqueue('networknode', {
        'type': 'deploy',
        'network_uuid': network['uuid']
    })
    db.add_event('network', network['uuid'], 'deploy', 'enqueued',
                 None, None)
    db.add_event('network', network['uuid'], 'api', 'created', None, None)
    db.update_network_state(network['uuid'], 'created')

    # Initialise metadata
    db.persist_metadata('network', network['uuid'], {})

    return network
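# An illustrative client call for the endpoint above. The /networks path,
# port and token handling are assumptions based on the proxying code later
# in this section, not a confirmed API contract.
import json
import requests

token = 'Bearer <JWT>'  # placeholder; obtained from the auth endpoint
body = {
    'netblock': '192.168.20.0/24',
    'provide_dhcp': True,
    'provide_nat': True,
    'name': 'example',        # hypothetical network name
    'namespace': 'system',
}
r = requests.post('http://sf-node:13000/networks',  # assumed URL and port
                  data=json.dumps(body),
                  headers={'Authorization': token})
print(r.json()['uuid'])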
def place_instance(self, instance, network, candidates=None):
    with util.RecordedOperation('schedule', instance):
        log_ctx = LOG.withObj(instance)

        diff = time.time() - self.metrics_updated
        if diff > config.parsed.get('SCHEDULER_CACHE_TIMEOUT'):
            self.refresh_metrics()

        if candidates:
            log_ctx.info('Scheduling %s forced as candidates' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Forced candidates', None, str(candidates))
            for node in candidates:
                if node not in self.metrics:
                    raise exceptions.CandidateNodeNotFoundException(node)
        else:
            candidates = []
            for node in self.metrics.keys():
                candidates.append(node)
            log_ctx.info('Scheduling %s start as candidates' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Initial candidates', None, str(candidates))

        if not candidates:
            raise exceptions.LowResourceException('No nodes with metrics')

        # Can we host that many vCPUs?
        for node in copy.copy(candidates):
            max_cpu = self.metrics[node].get('cpu_max_per_instance', 0)
            if instance.db_entry['cpus'] > max_cpu:
                candidates.remove(node)
        log_ctx.info('Scheduling %s have enough actual CPU' % candidates)
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough actual CPU', None, str(candidates))
        if not candidates:
            raise exceptions.LowResourceException(
                'Requested vCPUs exceeds vCPU limit')

        # Do we have enough idle CPU?
        for node in copy.copy(candidates):
            if not self._has_sufficient_cpu(instance.db_entry['cpus'],
                                            node):
                candidates.remove(node)
        log_ctx.info('Scheduling %s have enough idle CPU' % candidates)
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough idle CPU', None, str(candidates))
        if not candidates:
            raise exceptions.LowResourceException(
                'No nodes with enough idle CPU')

        # Do we have enough idle RAM?
        for node in copy.copy(candidates):
            if not self._has_sufficient_ram(instance.db_entry['memory'],
                                            node):
                candidates.remove(node)
        log_ctx.info('Scheduling %s have enough idle RAM' % candidates)
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough idle RAM', None, str(candidates))
        if not candidates:
            raise exceptions.LowResourceException(
                'No nodes with enough idle RAM')

        # Do we have enough idle disk?
        for node in copy.copy(candidates):
            if not self._has_sufficient_disk(instance, node):
                candidates.remove(node)
        log_ctx.info('Scheduling %s have enough idle disk' % candidates)
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough idle disk', None, str(candidates))
        if not candidates:
            raise exceptions.LowResourceException(
                'No nodes with enough disk space')

        # What nodes have the highest number of networks already present?
        if network:
            requested_networks = []
            for net in network:
                network_uuid = net['network_uuid']
                if network_uuid not in requested_networks:
                    requested_networks.append(network_uuid)

            candidates = self._find_most_matching_networks(
                requested_networks, candidates)
            log_ctx.info('Scheduling %s have most matching networks'
                         % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have most matching networks', None,
                         str(candidates))

        # What nodes have the base image already?
        requested_images = []
        for disk in instance.db_entry['block_devices']['devices']:
            if disk.get('base'):
                requested_images.append(disk.get('base'))

        candidates = self._find_most_matching_images(
            requested_images, candidates)
        log_ctx.info('Scheduling %s have most matching images' % candidates)
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have most matching images', None, str(candidates))

        # Avoid allocating to the network node if possible
        net_node = db.get_network_node()
        if len(candidates) > 1 and net_node['fqdn'] in candidates:
            candidates.remove(net_node['fqdn'])
            log_ctx.info('Scheduling %s are non-network nodes' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Are non-network nodes', None, str(candidates))

        # Return a shuffled list of options
        random.shuffle(candidates)
        return candidates
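# The winnowing steps above all use the same idiom: iterate over a copy
# of the candidate list while removing from the original, so removal
# does not disturb iteration. A self-contained illustration (the metric
# name is invented):
import copy

candidates = ['node1', 'node2', 'node3']
metrics = {'node1': {'cpu_available': 1},
           'node2': {'cpu_available': 8},
           'node3': {'cpu_available': 4}}

for node in copy.copy(candidates):
    if metrics[node]['cpu_available'] < 2:
        candidates.remove(node)  # safe: we iterate the copy

print(candidates)  # ['node2', 'node3']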
def _update_power_states(self):
    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    try:
        seen = []

        # Active VMs have an ID. Active means running in libvirt land.
        for domain_id in conn.listDomainsID():
            domain = conn.lookupByID(domain_id)
            if not domain.name().startswith('sf:'):
                continue

            instance_uuid = domain.name().split(':')[1]
            log_ctx = LOG.withInstance(instance_uuid)

            instance = db.get_instance(instance_uuid)
            if not instance:
                # Instance is SF but not in database. Kill to reduce load.
                log_ctx.warning('Destroying unknown instance')
                util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                continue

            db.place_instance(instance_uuid, config.NODE_NAME)
            seen.append(domain.name())

            if instance.get('state') == 'deleted':
                # NOTE(mikal): a delete might be in-flight in the queue.
                # We only worry about instances which should have gone
                # away five minutes ago.
                if time.time() - instance['state_updated'] < 300:
                    continue

                db.instance_enforced_deletes_increment(instance_uuid)
                attempts = instance.get('enforced_deletes', 0)

                if attempts > 5:
                    # Sometimes we just can't delete the VM. Try the big
                    # hammer instead.
                    log_ctx.warning(
                        'Attempting alternate delete method for instance')
                    util.execute(None,
                                 'virsh destroy "sf:%s"' % instance_uuid)
                    db.add_event('instance', instance_uuid,
                                 'enforced delete', 'complete', None, None)
                else:
                    i = virt.from_db(instance_uuid)
                    i.delete()
                    i.update_instance_state('deleted')

                log_ctx.withField(
                    'attempt', attempts).warning('Deleting stray instance')
                continue

            state = util.extract_power_state(libvirt, domain)
            db.update_instance_power_state(instance_uuid, state)
            if state == 'crashed':
                db.update_instance_state(instance_uuid, 'error')

        # Inactive VMs just have a name, and are powered off in our
        # state system.
        for domain_name in conn.listDefinedDomains():
            if not domain_name.startswith('sf:'):
                continue

            if domain_name not in seen:
                instance_uuid = domain_name.split(':')[1]
                log_ctx = LOG.withInstance(instance_uuid)

                instance = db.get_instance(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill because
                    # unknown.
                    log_ctx.warning('Removing unknown inactive instance')
                    domain = conn.lookupByName(domain_name)
                    domain.undefine()
                    continue

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the
                    # queue. We only worry about instances which should
                    # have gone away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    domain = conn.lookupByName(domain_name)
                    domain.undefine()
                    log_ctx.info('Detected stray instance')
                    db.add_event('instance', instance_uuid,
                                 'deleted stray', 'complete', None, None)
                    continue

                db.place_instance(instance_uuid, config.NODE_NAME)
                instance_path = os.path.join(config.get('STORAGE_PATH'),
                                             'instances', instance_uuid)

                if not os.path.exists(instance_path):
                    # If we're inactive and our files aren't on disk,
                    # we have a problem.
                    log_ctx.info('Detected error state for instance')
                    db.update_instance_state(instance_uuid, 'error')
                elif instance.get('power_state') != 'off':
                    log_ctx.info('Detected power off for instance')
                    db.update_instance_power_state(instance_uuid, 'off')
                    db.add_event('instance', instance_uuid,
                                 'detected poweroff', 'complete',
                                 None, None)

    except libvirt.libvirtError as e:
        LOG.error('Failed to lookup all domains: %s' % e)
def post(self, instance_uuid=None, instance_from_db=None,
         instance_from_db_virt=None):
    db.add_event('instance', instance_uuid, 'api', 'unpause', None, None)
    return instance_from_db_virt.unpause()
def create(self):
    subst = self.subst_dict()

    with db.get_lock('network', None, self.uuid, ttl=120):
        if not util.check_for_interface(subst['vx_interface']):
            with util.RecordedOperation('create vxlan interface', self):
                util.execute(
                    None,
                    'ip link add %(vx_interface)s type vxlan id %(vx_id)s '
                    'dev %(physical_interface)s dstport 0' % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.%(vx_interface)s.arp_notify=1'
                    % subst)

        if not util.check_for_interface(subst['vx_bridge']):
            with util.RecordedOperation('create vxlan bridge', self):
                util.execute(
                    None, 'ip link add %(vx_bridge)s type bridge' % subst)
                util.execute(
                    None,
                    'ip link set %(vx_interface)s master %(vx_bridge)s'
                    % subst)
                util.execute(None, 'ip link set %(vx_interface)s up' % subst)
                util.execute(None, 'ip link set %(vx_bridge)s up' % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.%(vx_bridge)s.arp_notify=1'
                    % subst)
                util.execute(None, 'brctl setfd %(vx_bridge)s 0' % subst)
                util.execute(None, 'brctl stp %(vx_bridge)s off' % subst)
                util.execute(None, 'brctl setageing %(vx_bridge)s 0' % subst)

        if util.is_network_node():
            if not os.path.exists('/var/run/netns/%(netns)s' % subst):
                with util.RecordedOperation('create netns', self):
                    util.execute(None, 'ip netns add %(netns)s' % subst)

            if not util.check_for_interface(subst['vx_veth_outer']):
                with util.RecordedOperation('create router veth', self):
                    util.execute(
                        None,
                        'ip link add %(vx_veth_outer)s type veth peer name '
                        '%(vx_veth_inner)s' % subst)
                    util.execute(
                        None,
                        'ip link set %(vx_veth_inner)s netns %(netns)s'
                        % subst)
                    util.execute(
                        None,
                        'brctl addif %(vx_bridge)s %(vx_veth_outer)s'
                        % subst)
                    util.execute(None,
                                 'ip link set %(vx_veth_outer)s up' % subst)
                    util.execute(
                        None,
                        '%(in_netns)s ip link set %(vx_veth_inner)s up'
                        % subst)
                    util.execute(
                        None,
                        '%(in_netns)s ip addr add %(router)s/%(netmask)s '
                        'dev %(vx_veth_inner)s' % subst)

            if not util.check_for_interface(subst['physical_veth_outer']):
                with util.RecordedOperation('create physical veth', self):
                    util.execute(
                        None,
                        'ip link add %(physical_veth_outer)s type veth '
                        'peer name %(physical_veth_inner)s' % subst)
                    util.execute(
                        None,
                        'brctl addif %(physical_bridge)s '
                        '%(physical_veth_outer)s' % subst)
                    util.execute(
                        None,
                        'ip link set %(physical_veth_outer)s up' % subst)
                    util.execute(
                        None,
                        'ip link set %(physical_veth_inner)s netns %(netns)s'
                        % subst)

            self.deploy_nat()
            self.update_dhcp()
        else:
            db.enqueue('networknode', {
                'type': 'deploy',
                'network_uuid': self.uuid
            })
            db.add_event('network', self.uuid, 'deploy', 'enqueued',
                         None, None)
def create(self):
    subst = self.subst_dict()

    with db.get_object_lock(self, ttl=120, op='Network create'):
        # Ensure network was not deleted whilst waiting for the lock.
        if self.is_dead():
            raise DeadNetwork('network=%s' % self)

        if not util.check_for_interface(subst['vx_interface']):
            with util.RecordedOperation('create vxlan interface', self):
                util.create_interface(
                    subst['vx_interface'], 'vxlan',
                    'id %(vx_id)s dev %(physical_interface)s dstport 0'
                    % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.'
                    '%(vx_interface)s.arp_notify=1' % subst)

        if not util.check_for_interface(subst['vx_bridge']):
            with util.RecordedOperation('create vxlan bridge', self):
                util.create_interface(subst['vx_bridge'], 'bridge', '')
                util.execute(
                    None,
                    'ip link set %(vx_interface)s '
                    'master %(vx_bridge)s' % subst)
                util.execute(None, 'ip link set %(vx_interface)s up' % subst)
                util.execute(None, 'ip link set %(vx_bridge)s up' % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.'
                    '%(vx_bridge)s.arp_notify=1' % subst)
                util.execute(None, 'brctl setfd %(vx_bridge)s 0' % subst)
                util.execute(None, 'brctl stp %(vx_bridge)s off' % subst)
                util.execute(None, 'brctl setageing %(vx_bridge)s 0' % subst)

        if util.is_network_node():
            if not os.path.exists('/var/run/netns/%(netns)s' % subst):
                with util.RecordedOperation('create netns', self):
                    util.execute(None, 'ip netns add %(netns)s' % subst)

            if not util.check_for_interface(subst['vx_veth_outer']):
                with util.RecordedOperation('create router veth', self):
                    util.create_interface(
                        subst['vx_veth_outer'], 'veth',
                        'peer name %(vx_veth_inner)s' % subst)
                    util.execute(
                        None,
                        'ip link set %(vx_veth_inner)s netns %(netns)s'
                        % subst)
                    util.execute(
                        None,
                        'brctl addif %(vx_bridge)s %(vx_veth_outer)s'
                        % subst)
                    util.execute(None,
                                 'ip link set %(vx_veth_outer)s up' % subst)
                    util.execute(
                        None,
                        '%(in_netns)s ip link set %(vx_veth_inner)s up'
                        % subst)
                    util.execute(
                        None,
                        '%(in_netns)s ip addr add %(router)s/%(netmask)s '
                        'dev %(vx_veth_inner)s' % subst)

            if not util.check_for_interface(subst['physical_veth_outer']):
                with util.RecordedOperation('create physical veth', self):
                    util.create_interface(
                        subst['physical_veth_outer'], 'veth',
                        'peer name %(physical_veth_inner)s' % subst)
                    util.execute(
                        None,
                        'brctl addif %(physical_bridge)s '
                        '%(physical_veth_outer)s' % subst)
                    util.execute(
                        None,
                        'ip link set %(physical_veth_outer)s up' % subst)
                    util.execute(
                        None,
                        'ip link set %(physical_veth_inner)s '
                        'netns %(netns)s' % subst)

            self.deploy_nat()
            self.update_dhcp()
        else:
            db.enqueue('networknode',
                       DeployNetworkTask(self.db_entry['uuid']))
            db.add_event('network', self.db_entry['uuid'],
                         'deploy', 'enqueued', None, None)
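# The shell commands above are %-style templates rendered from
# subst_dict(). A standalone sketch with invented values shows the
# mechanism:
subst = {
    'vx_id': 4097,
    'vx_interface': 'vxlan-4097',
    'physical_interface': 'eth0',
}
cmd = ('ip link add %(vx_interface)s type vxlan id %(vx_id)s '
       'dev %(physical_interface)s dstport 0' % subst)
print(cmd)
# ip link add vxlan-4097 type vxlan id 4097 dev eth0 dstport 0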
def run(self):
    LOG.info('Starting Monitor Daemon')

    observers = {}

    while True:
        # Cleanup terminated observers
        all_observers = list(observers.keys())
        for instance_uuid in all_observers:
            if not observers[instance_uuid].is_alive():
                # Reap process
                observers[instance_uuid].join(1)
                LOG.withInstance(instance_uuid).info(
                    'Trigger observer has terminated')
                db.add_event('instance', instance_uuid,
                             'trigger monitor', 'crashed', None, None)
                del observers[instance_uuid]

        # Start missing observers
        extra_instances = list(observers.keys())

        for inst in db.get_instances(
                only_node=config.parsed.get('NODE_NAME')):
            if inst['uuid'] in extra_instances:
                extra_instances.remove(inst['uuid'])

            if inst['state'] != 'created':
                continue

            if inst['uuid'] not in observers:
                console_path = os.path.join(
                    config.parsed.get('STORAGE_PATH'), 'instances',
                    inst['uuid'], 'console.log')
                p = multiprocessing.Process(
                    target=observe, args=(console_path, inst['uuid']),
                    name='%s-%s' % (daemon.process_name('triggers'),
                                    inst['uuid']))
                p.start()

                observers[inst['uuid']] = p
                LOG.withInstance(
                    inst['uuid']).info('Started trigger observer')
                db.add_event('instance', inst['uuid'],
                             'trigger monitor', 'started', None, None)

        # Cleanup extra observers
        for instance_uuid in extra_instances:
            p = observers[instance_uuid]
            try:
                os.kill(p.pid, signal.SIGKILL)
                observers[instance_uuid].join(1)
            except Exception:
                pass

            del observers[instance_uuid]
            LOG.withInstance(instance_uuid).info(
                'Finished trigger observer')
            db.add_event('instance', instance_uuid,
                         'trigger monitor', 'finished', None, None)

        time.sleep(1)
def instance_delete(inst):
    with inst.get_lock(op='Instance delete'):
        # There are two delete state flows:
        #   - error transition states (preflight-error etc) to error
        #   - created to deleted
        #
        # We don't need delete_wait for the error states as they're
        # already in a transition state.
        if not inst.state.value.endswith('-error'):
            inst.state = dbo.STATE_DELETE_WAIT
        db.add_event('instance', inst.uuid, 'queued', 'delete', None, None)

        # Create list of networks used by instance. We cannot use the
        # interfaces cached in the instance here, because the instance
        # may have failed to get to the point where it populates that
        # field (an image fetch failure for example).
        instance_networks = []
        interfaces = []
        for ni in networkinterface.interfaces_for_instance(inst):
            if ni:
                interfaces.append(ni)
                if ni.network_uuid not in instance_networks:
                    instance_networks.append(ni.network_uuid)

        # Stop the instance
        inst.power_off()

        # Delete the instance's interfaces
        with util_general.RecordedOperation('release network addresses',
                                            inst):
            for ni in interfaces:
                ni.delete()

        # Create list of networks used by all other instances
        host_networks = []
        for i in instance.Instances(
                [instance.this_node_filter,
                 instance.active_states_filter]):
            if not i.uuid == inst.uuid:
                for iface_uuid in i.interfaces:
                    ni = networkinterface.NetworkInterface.from_db(
                        iface_uuid)
                    if ni and ni.network_uuid not in host_networks:
                        host_networks.append(ni.network_uuid)

        inst.delete()

        # Check each network used by the deleted instance
        for network in instance_networks:
            n = net.Network.from_db(network)
            if n:
                # If network used by another instance, only update
                if network in host_networks:
                    if n.state.value == dbo.STATE_DELETE_WAIT:
                        # Do not update a network about to be deleted
                        continue
                    with util_general.RecordedOperation(
                            'deallocate ip address', inst):
                        n.update_dhcp()
                else:
                    # Network not used by any other instance therefore
                    # delete
                    with util_general.RecordedOperation(
                            'remove network from node', n):
                        n.delete_on_hypervisor()
def get(self, instance_uuid=None, instance_from_db=None):
    db.add_event('instance', instance_uuid, 'api', 'get interfaces',
                 None, None)
    return list(db.get_instance_interfaces(instance_uuid))
def post(self, name=None, cpus=None, memory=None, network=None, disk=None,
         ssh_key=None, user_data=None, placed_on=None, namespace=None,
         instance_uuid=None):
    global SCHEDULER

    # We need to sanitise the name so it's safe for DNS
    name = re.sub(r'([^a-zA-Z0-9_\-])', '', name)

    if not namespace:
        namespace = get_jwt_identity()

    # If accessing a foreign namespace, we need to be an admin
    if get_jwt_identity() not in [namespace, 'system']:
        return error(
            401,
            'only admins can create resources in a different namespace')

    # The instance needs to exist in the DB before network interfaces
    # are created
    if not instance_uuid:
        instance_uuid = str(uuid.uuid4())
        db.add_event('instance', instance_uuid,
                     'uuid allocated', None, None, None)

    # Create instance object
    instance = virt.from_db(instance_uuid)
    if instance:
        if get_jwt_identity() not in [instance.db_entry['namespace'],
                                      'system']:
            LOG.info('instance(%s): instance not found, ownership test'
                     % instance_uuid)
            return error(404, 'instance not found')

    if not instance:
        instance = virt.from_definition(
            uuid=instance_uuid,
            name=name,
            disks=disk,
            memory_mb=memory,
            vcpus=cpus,
            ssh_key=ssh_key,
            user_data=user_data,
            owner=namespace
        )

    if not SCHEDULER:
        SCHEDULER = scheduler.Scheduler()

    # Have we been placed?
    if not placed_on:
        candidates = SCHEDULER.place_instance(instance, network)
        if len(candidates) == 0:
            db.add_event('instance', instance_uuid, 'schedule', 'failed',
                         None, 'insufficient resources')
            db.update_instance_state(instance_uuid, 'error')
            return error(507, 'insufficient capacity')

        placed_on = candidates[0]
        db.place_instance(instance_uuid, placed_on)
        db.add_event('instance', instance_uuid,
                     'placement', None, None, placed_on)

    else:
        try:
            candidates = SCHEDULER.place_instance(
                instance, network, candidates=[placed_on])
            if len(candidates) == 0:
                db.add_event('instance', instance_uuid, 'schedule',
                             'failed', None, 'insufficient resources')
                db.update_instance_state(instance_uuid, 'error')
                return error(507, 'insufficient capacity')
        except scheduler.CandidateNodeNotFoundException as e:
            return error(404, 'node not found: %s' % e)

    # Have we been placed on a different node?
    if not placed_on == config.parsed.get('NODE_NAME'):
        body = flask_get_post_body()
        body['placed_on'] = placed_on
        body['instance_uuid'] = instance_uuid
        body['namespace'] = namespace

        token = util.get_api_token(
            'http://%s:%d' % (placed_on, config.parsed.get('API_PORT')),
            namespace=namespace)
        r = requests.request(
            'POST',
            'http://%s:%d/instances'
            % (placed_on, config.parsed.get('API_PORT')),
            data=json.dumps(body),
            headers={'Authorization': token,
                     'User-Agent': util.get_user_agent()})

        LOG.info('Returning proxied request: %d, %s'
                 % (r.status_code, r.text))
        resp = flask.Response(r.text, mimetype='application/json')
        resp.status_code = r.status_code
        return resp

    # Check we can get the required IPs
    nets = {}
    allocations = {}

    def error_with_cleanup(status_code, message):
        for network_uuid in allocations:
            n = net.from_db(network_uuid)
            for addr, _ in allocations[network_uuid]:
                with db.get_lock('sf/ipmanager/%s' % n.uuid, ttl=120) as _:
                    ipm = db.get_ipmanager(n.uuid)
                    ipm.release(addr)
                    db.persist_ipmanager(n.uuid, ipm.save())
        return error(status_code, message)

    order = 0
    if network:
        for netdesc in network:
            if 'network_uuid' not in netdesc or not netdesc['network_uuid']:
                return error_with_cleanup(404, 'network not specified')

            if netdesc['network_uuid'] not in nets:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    return error_with_cleanup(
                        404,
                        'network %s not found' % netdesc['network_uuid'])
                nets[netdesc['network_uuid']] = n
                n.create()

            with db.get_lock('sf/ipmanager/%s' % netdesc['network_uuid'],
                             ttl=120) as _:
                db.add_event('network', netdesc['network_uuid'],
                             'allocate address', None, None, instance_uuid)
                allocations.setdefault(netdesc['network_uuid'], [])
                ipm = db.get_ipmanager(netdesc['network_uuid'])
                if 'address' not in netdesc or not netdesc['address']:
                    netdesc['address'] = ipm.get_random_free_address()
                else:
                    if not ipm.reserve(netdesc['address']):
                        return error_with_cleanup(
                            409, 'address %s in use' % netdesc['address'])
                db.persist_ipmanager(netdesc['network_uuid'], ipm.save())
                allocations[netdesc['network_uuid']].append(
                    (netdesc['address'], order))

            if 'model' not in netdesc or not netdesc['model']:
                netdesc['model'] = 'virtio'

            db.create_network_interface(
                str(uuid.uuid4()), netdesc, instance_uuid, order)
            order += 1

    # Initialise metadata
    db.persist_metadata('instance', instance_uuid, {})

    # Now we can start the instance
    with db.get_lock('sf/instance/%s' % instance.db_entry['uuid'],
                     ttl=900) as lock:
        with util.RecordedOperation('ensure networks exist', instance) as _:
            for network_uuid in nets:
                n = nets[network_uuid]
                n.ensure_mesh()
                n.update_dhcp()

        with util.RecordedOperation('instance creation', instance) as _:
            instance.create(lock=lock)

        for iface in db.get_instance_interfaces(instance.db_entry['uuid']):
            db.update_network_interface_state(iface['uuid'], 'created')

    return db.get_instance(instance_uuid)
def get(self, instance_uuid=None, instance_from_db=None):
    db.add_event('instance', instance_uuid, 'api', 'get', None, None)
    return instance_from_db
def get(self, network_uuid=None, network_from_db=None):
    db.add_event('network', network_uuid, 'api', 'get events', None, None)
    return list(db.get_events('network', network_uuid))
def get(self, network_uuid=None, network_from_db=None):
    db.add_event('network', network_uuid, 'api', 'get', None, None)
    if network_from_db is not None and 'ipmanager' in network_from_db:
        del network_from_db['ipmanager']
    return network_from_db
def run(self):
    LOG.info('Starting')
    observers = {}

    while not self.exit.is_set():
        # Cleanup terminated observers
        all_observers = list(observers.keys())
        for instance_uuid in all_observers:
            if not observers[instance_uuid].is_alive():
                # Reap process
                observers[instance_uuid].join(1)
                LOG.with_instance(instance_uuid).info(
                    'Trigger observer has terminated')
                db.add_event('instance', instance_uuid,
                             'trigger monitor', 'crashed', None, None)
                del observers[instance_uuid]

        # Audit desired observers
        extra_instances = list(observers.keys())
        missing_instances = []

        with etcd.ThreadLocalReadOnlyCache():
            for inst in instance.Instances([
                    instance.this_node_filter,
                    partial(baseobject.state_filter,
                            [instance.Instance.STATE_CREATED])]):
                if inst.uuid in extra_instances:
                    extra_instances.remove(inst.uuid)

                if inst.uuid not in observers:
                    missing_instances.append(inst.uuid)

        # Start missing observers
        for instance_uuid in missing_instances:
            console_path = os.path.join(
                config.STORAGE_PATH, 'instances', instance_uuid,
                'console.log')
            p = multiprocessing.Process(
                target=observe, args=(console_path, instance_uuid),
                name='%s-%s' % (daemon.process_name('triggers'),
                                instance_uuid))
            p.start()

            observers[instance_uuid] = p
            LOG.with_instance(instance_uuid).info(
                'Started trigger observer')
            db.add_event('instance', instance_uuid,
                         'trigger monitor', 'started', None, None)

        # Cleanup extra observers
        for instance_uuid in extra_instances:
            p = observers[instance_uuid]
            try:
                os.kill(p.pid, signal.SIGKILL)
                observers[instance_uuid].join(1)
            except Exception:
                pass

            del observers[instance_uuid]
            LOG.with_instance(instance_uuid).info(
                'Finished trigger observer')
            db.add_event('instance', instance_uuid,
                         'trigger monitor', 'finished', None, None)

        self.exit.wait(1)

    # No longer running, clean up all trigger daemons
    for instance_uuid in observers:
        os.kill(observers[instance_uuid].pid, signal.SIGKILL)
def observe(path, instance_uuid):
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('triggers'), instance_uuid))

    regexps = {
        'login prompt': re.compile('.* login: .*'),
        'user-data script start': re.compile(
            '.*Starting.*Execute cloud user/final scripts.*'),
        'user-data script end': re.compile(
            '.*Finished.*Execute cloud user/final scripts.*'),
        'cloud-init complete': re.compile(
            '.*Reached target.*Cloud-init target.*')
    }

    while not os.path.exists(path):
        time.sleep(1)

    fd = os.open(path, os.O_RDONLY | os.O_NONBLOCK)

    log_ctx = LOG.with_instance(instance_uuid)
    log_ctx.with_field('path', path).info('Monitoring path for triggers')
    db.add_event('instance', instance_uuid, 'trigger monitor',
                 'detected console log', None, None)

    # Sometimes the trigger process is slow to start, so rewind 4KB to
    # ensure that the last few log lines are not missed. (4KB since
    # Cloud-Init can be noisy after the login prompt.)
    os.lseek(fd, max(0, os.fstat(fd).st_size - 4096), os.SEEK_SET)

    # Record how long the file is, because we need to detect truncations
    # and re-open.
    previous_size = os.stat(path).st_size

    buffer = ''
    while True:
        # Detect file truncations, and die if we see one. We will be
        # restarted by the monitor process.
        if not os.path.exists(path):
            return
        size = os.stat(path).st_size
        if size < previous_size:
            return
        previous_size = size

        # Read data; note that os.read() is non-blocking here.
        d = os.read(fd, 1024).decode('utf-8', errors='ignore')
        if d:
            buffer += d
            lines = buffer.split('\n')
            buffer = lines[-1]

            for line in lines:
                if line:
                    for trigger in regexps:
                        m = regexps[trigger].match(line)
                        if m:
                            log_ctx.with_field(
                                'trigger', trigger,
                            ).info('Trigger matched')
                            db.add_event('instance', instance_uuid,
                                         'trigger', None, None, trigger)
        else:
            # Only pause if there was no data to read
            time.sleep(0.2)
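# A quick standalone check of the trigger regexps above against sample
# console lines (the lines are illustrative of what getty and cloud-init
# typically emit):
import re

example_regexps = {
    'login prompt': re.compile('.* login: .*'),
    'cloud-init complete': re.compile(
        '.*Reached target.*Cloud-init target.*'),
}

for line in ['ubuntu login: ',
             '[  OK  ] Reached target Cloud-init target.']:
    for trigger, r in example_regexps.items():
        if r.match(line):
            print(trigger)  # fires once per sample line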
def add_event(self, operation, phase, duration=None, msg=None):
    db.add_event('instance', self.uuid, operation, phase, duration, msg)
def handle(jobname, workitem):
    libvirt = util_libvirt.get_libvirt()

    log = LOG.with_field('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    inst = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if InstanceTask.__subclasscheck__(type(task)):
                inst = instance.Instance.from_db(task.instance_uuid())
                if not inst:
                    raise exceptions.InstanceNotInDBException(
                        task.instance_uuid())

            if isinstance(task, FetchImageTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if isinstance(task, SnapshotTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if inst:
                log_i = log.with_instance(inst)
            else:
                log_i = log

            log_i.with_field('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here
            # eventually? Then this can be generalised to record events
            # on networks/instances

            # TODO(andy) This event should be recorded when it is recorded
            # as dequeued in the DB. Currently it's reporting action on the
            # item and calling it 'dequeue'.

            if inst:
                # TODO(andy) move to QueueTask
                db.add_event('instance', inst.uuid, task.pretty_task_name(),
                             'dequeued', None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), inst)

            elif isinstance(task, PreflightInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED
                        or inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot preflight an instance in state %s, '
                        'skipping task' % inst.state.value)
                    continue

                redirect_to = instance_preflight(inst, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s'
                               % redirect_to)
                    etcd.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED
                        or inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot start an instance in state %s, '
                        'skipping task' % inst.state.value)
                    continue

                instance_start(inst, task.network())
                etcd.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(inst)
                    etcd.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util_general.ignore_exception(
                        'instance %s delete task' % inst, e)

            elif isinstance(task, FloatNetworkInterfaceTask):
                # Just punt it to the network node now that the interface
                # is ready
                etcd.enqueue('networknode', task)

            elif isinstance(task, SnapshotTask):
                snapshot(inst, task.disk(),
                         task.artifact_uuid(), task.blob_uuid())

            elif isinstance(task, DeleteNetworkWhenClean):
                # Check if any interfaces remain on network
                task_network = net.Network.from_db(task.network_uuid())
                ifaces = networkinterface.interfaces_for_network(
                    task_network)
                cur_interfaces = {i.uuid: i for i in ifaces}

                if cur_interfaces:
                    LOG.with_network(task_network).error(
                        'During DeleteNetworkWhenClean new interfaces have '
                        'connected to network: %s', cur_interfaces)

                # Only check those present at delete task initiation time.
                remain_interfaces = list(set(task.wait_interfaces())
                                         & set(cur_interfaces))
                if remain_interfaces:
                    # Queue task on a node with a remaining instance
                    first_iface = cur_interfaces[remain_interfaces[0]]
                    inst = instance.Instance.from_db(
                        first_iface.instance_uuid)
                    etcd.enqueue(inst.placement['node'], {
                        'tasks': [
                            DeleteNetworkWhenClean(task.network_uuid(),
                                                   remain_interfaces)
                        ]
                    }, delay=60)
                else:
                    # All original instances deleted, safe to delete network
                    etcd.enqueue('networknode',
                                 DestroyNetworkTask(task.network_uuid()))

            elif isinstance(task, HypervisorDestroyNetworkTask):
                n = net.Network.from_db(task.network_uuid())
                n.delete_on_hypervisor()

            elif isinstance(task, FetchBlobTask):
                metrics = etcd.get('metrics', config.NODE_NAME, None)
                if metrics:
                    metrics = metrics.get('metrics', {})
                else:
                    metrics = {}

                b = blob.Blob.from_db(task.blob_uuid())
                if not b:
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, not found')

                elif (int(metrics.get('disk_free_blobs', 0)) - int(b.size)
                        < config.MINIMUM_FREE_DISK):
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, insufficient space')

                else:
                    log.with_object(b).info('Replicating blob')
                    size = b.ensure_local([])
                    log.with_object(b).with_fields({
                        'transferred': size,
                        'expected': b.size
                    }).info('Replicating blob complete')

            else:
                log_i.with_field('task', task).error(
                    'Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by an external issue, not an application error
        log.info('Fetch Image Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image fetch failed: %s' % e)

    except exceptions.ImagesCannotShrinkException as e:
        log.info('Fetch Resize Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image resize failed: %s' % e)

    except libvirt.libvirtError as e:
        log.info('Libvirt Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except exceptions.InstanceException as e:
        log.info('Instance Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except Exception as e:
        # Logging ignored exception - this should be investigated
        util_general.ignore_exception('queue worker', e)
        if inst:
            inst.enqueue_delete_due_error('Failed queue task: %s' % e)

    finally:
        etcd.resolve(config.NODE_NAME, jobname)
        if inst:
            inst.add_event('tasks complete', 'dequeued',
                           msg='Work item %s' % jobname)
        log.info('Completed workitem')
def add_event(self, operation, phase, duration=None, msg=None):
    if not self.__in_memory_only:
        db.add_event(self.object_type, self.__uuid, operation, phase,
                     duration, msg)
def handle(jobname, workitem):
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if (InstanceTask.__subclasscheck__(type(task))
                    or isinstance(task, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here
            # eventually? Then this can be generalised to record events
            # on networks/instances

            # TODO(andy) This event should be recorded when it is recorded
            # as dequeued in the DB. Currently it's reporting action on the
            # item and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(), 'dequeued',
                             None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid,
                                                 task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s'
                               % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')
                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error(
                    'Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by an external issue, not an application error
        log.info('Fetch Image Error: %s', e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')
def post(self, name=None, cpus=None, memory=None, network=None, disk=None,
         ssh_key=None, user_data=None, placed_on=None, namespace=None,
         instance_uuid=None, video=None):
    global SCHEDULER

    # Check that the instance name is safe for use as a DNS host name
    if name != re.sub(r'([^a-zA-Z0-9_\-])', '', name) or len(name) > 63:
        return error(400,
                     'instance name must be useable as a DNS host name')

    # Sanity check
    if not disk:
        return error(400, 'instance must specify at least one disk')
    for d in disk:
        if not isinstance(d, dict):
            return error(400,
                         'disk specification should contain JSON objects')

    if network:
        for n in network:
            if not isinstance(n, dict):
                return error(
                    400,
                    'network specification should contain JSON objects')
            if 'network_uuid' not in n:
                return error(
                    400, 'network specification is missing network_uuid')

    if not video:
        video = {'model': 'cirrus', 'memory': 16384}

    if not namespace:
        namespace = get_jwt_identity()

    # Only system can specify a uuid
    if instance_uuid and get_jwt_identity() != 'system':
        return error(401, 'only system can specify an instance uuid')

    # If accessing a foreign namespace, we need to be an admin
    if get_jwt_identity() not in [namespace, 'system']:
        return error(
            401,
            'only admins can create resources in a different namespace')

    # The instance needs to exist in the DB before network interfaces
    # are created
    if not instance_uuid:
        instance_uuid = str(uuid.uuid4())
        db.add_event('instance', instance_uuid,
                     'uuid allocated', None, None, None)

    # Create instance object
    instance = virt.from_db(instance_uuid)
    if instance:
        if get_jwt_identity() not in [instance.db_entry['namespace'],
                                      'system']:
            logutil.info([virt.ThinInstance(instance_uuid)],
                         'Instance not found, ownership test')
            return error(404, 'instance not found')

    if not instance:
        instance = virt.from_definition(
            uuid=instance_uuid,
            name=name,
            disks=disk,
            memory_mb=memory,
            vcpus=cpus,
            ssh_key=ssh_key,
            user_data=user_data,
            owner=namespace,
            video=video,
            requested_placement=placed_on
        )

    # Initialise metadata
    db.persist_metadata('instance', instance_uuid, {})

    # Allocate IP addresses
    order = 0
    if network:
        for netdesc in network:
            n = net.from_db(netdesc['network_uuid'])
            if not n:
                db.enqueue_instance_delete(
                    config.parsed.get('NODE_NAME'), instance_uuid, 'error',
                    'missing network %s during IP allocation phase'
                    % netdesc['network_uuid'])
                return error(
                    404, 'network %s not found' % netdesc['network_uuid'])

            with db.get_lock('ipmanager', None, netdesc['network_uuid'],
                             ttl=120):
                db.add_event('network', netdesc['network_uuid'],
                             'allocate address', None, None, instance_uuid)
                ipm = db.get_ipmanager(netdesc['network_uuid'])
                if 'address' not in netdesc or not netdesc['address']:
                    netdesc['address'] = ipm.get_random_free_address()
                else:
                    if not ipm.reserve(netdesc['address']):
                        db.enqueue_instance_delete(
                            config.parsed.get('NODE_NAME'), instance_uuid,
                            'error',
                            'failed to reserve an IP on network %s'
                            % netdesc['network_uuid'])
                        return error(
                            409, 'address %s in use' % netdesc['address'])
                db.persist_ipmanager(netdesc['network_uuid'], ipm.save())

            if 'model' not in netdesc or not netdesc['model']:
                netdesc['model'] = 'virtio'

            db.create_network_interface(
                str(uuid.uuid4()), netdesc, instance_uuid, order)
            order += 1

    if not SCHEDULER:
        SCHEDULER = scheduler.Scheduler()

    try:
        # Have we been placed?
        if not placed_on:
            candidates = SCHEDULER.place_instance(instance, network)
            placement = candidates[0]
        else:
            SCHEDULER.place_instance(instance, network,
                                     candidates=[placed_on])
            placement = placed_on

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                     'insufficient resources: ' + str(e))
        db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                   instance_uuid, 'error',
                                   'scheduling failed')
        return error(507, str(e))

    except exceptions.CandidateNodeNotFoundException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                     'candidate node not found: ' + str(e))
        db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                   instance_uuid, 'error',
                                   'scheduling failed')
        return error(404, 'node not found: %s' % e)

    # Record placement
    db.place_instance(instance_uuid, placement)
    db.add_event('instance', instance_uuid, 'placement', None, None,
                 placement)

    # Create a queue entry for the instance start
    tasks = [{
        'type': 'instance_preflight',
        'instance_uuid': instance_uuid,
        'network': network
    }]
    for disk in instance.db_entry['block_devices']['devices']:
        if 'base' in disk and disk['base']:
            tasks.append({
                'type': 'image_fetch',
                'instance_uuid': instance_uuid,
                'url': disk['base']
            })
    tasks.append({
        'type': 'instance_start',
        'instance_uuid': instance_uuid,
        'network': network
    })

    # Enqueue creation tasks on desired node task queue
    db.enqueue(placement, {'tasks': tasks})
    db.add_event('instance', instance_uuid, 'create', 'enqueued',
                 None, None)

    # Watch for a while and return results if things are fast, give up
    # after a while and just return the current state
    start_time = time.time()
    while time.time() - start_time < config.parsed.get('API_ASYNC_WAIT'):
        i = db.get_instance(instance_uuid)
        if i['state'] in ['created', 'deleted', 'error']:
            return i
        time.sleep(0.5)

    return i
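# For reference, an illustrative request body accepted by the create
# endpoint above (field values are invented; validation requires at
# least one disk object, and each network entry must carry a
# network_uuid; video gets the cirrus default if omitted):
example_body = {
    'name': 'example-vm',
    'cpus': 2,
    'memory': 2048,
    'disk': [{'size': 8, 'base': 'a-base-image-url'}],  # hypothetical
    'network': [{'network_uuid': 'a-network-uuid'}],    # hypothetical
    'video': {'model': 'cirrus', 'memory': 16384},      # the default
}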
def place_instance(self, instance, network, candidates=None):
    with util.RecordedOperation('schedule', instance) as _:
        if (time.time() - self.metrics_updated >
                config.parsed.get('SCHEDULER_CACHE_TIMEOUT')):
            self.refresh_metrics()

        if candidates:
            LOG.info('Scheduling %s, %s forced as candidates'
                     % (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Forced candidates', None, str(candidates))
            for node in candidates:
                if node not in self.metrics:
                    raise CandidateNodeNotFoundException(node)
        else:
            candidates = []
            for node in self.metrics.keys():
                candidates.append(node)
            LOG.info('Scheduling %s, %s start as candidates'
                     % (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Initial candidates', None, str(candidates))

        # Can we host that many vCPUs?
        for node in copy.copy(candidates):
            if instance.db_entry['cpus'] > self.metrics[node].get(
                    'cpu_max_per_instance', 0):
                candidates.remove(node)
        LOG.info('Scheduling %s, %s have enough actual CPU'
                 % (instance, candidates))
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough actual CPU', None, str(candidates))

        # Do we have enough idle CPU?
        for node in copy.copy(candidates):
            if not self._has_sufficient_cpu(instance.db_entry['cpus'],
                                            node):
                candidates.remove(node)
        LOG.info('Scheduling %s, %s have enough idle CPU'
                 % (instance, candidates))
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough idle CPU', None, str(candidates))

        # Do we have enough idle RAM?
        for node in copy.copy(candidates):
            if not self._has_sufficient_ram(instance.db_entry['memory'],
                                            node):
                candidates.remove(node)
        LOG.info('Scheduling %s, %s have enough idle RAM'
                 % (instance, candidates))
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough idle RAM', None, str(candidates))

        # Do we have enough idle disk?
        for node in copy.copy(candidates):
            if not self._has_sufficient_disk(instance, node):
                candidates.remove(node)
        LOG.info('Scheduling %s, %s have enough idle disk'
                 % (instance, candidates))
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have enough idle disk', None, str(candidates))

        # What nodes have the highest number of networks already present?
        if network:
            requested_networks = []
            for net in network:
                network_uuid = net['network_uuid']
                if network_uuid not in requested_networks:
                    requested_networks.append(network_uuid)

            candidates = self._find_most_matching_networks(
                requested_networks, candidates)
            LOG.info('Scheduling %s, %s have most matching networks'
                     % (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have most matching networks', None,
                         str(candidates))

        # What nodes have the base image already?
        requested_images = []
        for disk in instance.db_entry['block_devices']['devices']:
            if disk.get('base'):
                requested_images.append(disk.get('base'))

        candidates = self._find_most_matching_images(
            requested_images, candidates)
        LOG.info('Scheduling %s, %s have most matching images'
                 % (instance, candidates))
        db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                     'Have most matching images', None, str(candidates))

        # Return a shuffled list of options
        random.shuffle(candidates)
        return candidates
def post(self, instance_uuid=None, instance_from_db=None,
         instance_from_db_virt=None):
    db.add_event('instance', instance_uuid, 'api', 'poweron', None, None)
    return instance_from_db_virt.power_on()
def get(self, instance_uuid=None, instance_from_db=None):
    db.add_event('instance', instance_uuid, 'api', 'get events',
                 None, None)
    return list(db.get_events('instance', instance_uuid))
def handle(jobname, workitem):
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            if (task.get('type').startswith('instance_')
                    and not instance_uuid):
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task.get('type').replace('_', ' '),
                             'dequeued', None, 'Work item %s' % jobname)

            logutil.info(ro, 'Executing task %s: %s'
                         % (task.get('type', 'unknown'), task))
            if task.get('type') == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task.get('type') == 'instance_preflight':
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    logutil.info(ro, 'Redirecting instance start to %s'
                                 % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task.get('type') == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'),
                           {})

            if task.get('type') == 'instance_delete':
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(
                        instance_uuid, task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue('%s-metrics'
                               % config.parsed.get('NODE_NAME'), {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        if instance_uuid:
            util.ignore_exception(daemon.process_name('queues'), e)
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'failed queue task: %s' % e)

    finally:
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')
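# For reference, a work item as consumed by this older dict-based
# handler; the structure mirrors what the instance create API earlier
# in this section enqueues (UUIDs are placeholders):
example_workitem = {
    'tasks': [
        {'type': 'instance_preflight',
         'instance_uuid': 'an-instance-uuid',
         'network': [{'network_uuid': 'a-network-uuid'}]},
        {'type': 'instance_start',
         'instance_uuid': 'an-instance-uuid',
         'network': [{'network_uuid': 'a-network-uuid'}]},
    ]
}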