def test_namespace_filter(self, mock_get_all):
    """Only the instance in the 'gerkin' namespace should be returned."""
    filters = [partial(baseobject.namespace_filter, 'gerkin')]
    uuids = [inst.uuid for inst in instance.Instances(filters)]
    self.assertEqual(['373a165e-9720-4e14-bd0e-9612de79ff15'], uuids)
def external_view(self):
    """Return the user-facing view of this artifact.

    Starts from the index-free external view, then mixes in the most
    recent index and per-blob details (size, reference count, and which
    healthy instances are using each blob as a block device).
    """
    view = self.external_view_without_index()
    view.update(self.most_recent_index)

    # Map each blob UUID to the instances using it as a block device.
    usage_by_blob = defaultdict(list)
    for inst in instance.Instances([instance.healthy_states_filter]):
        # inst.block_devices isn't populated until the instance is created,
        # so it may not be ready yet. This means we will miss instances
        # which have been requested but not yet started.
        for dev in inst.block_devices.get('devices', []):
            usage_by_blob[dev.get('blob_uuid')].append(inst.uuid)

    # Insert blob information
    blob_details = {}
    for blob_index in self.get_all_indexes():
        blob_uuid = blob_index['blob_uuid']
        b = blob.Blob.from_db(blob_uuid)
        if not b:
            # Blobs might have a UUID listed but not yet be instantiated.
            # TODO(andy): Artifacts should not reference non-existent blobs
            continue
        blob_details[blob_index['index']] = {
            'uuid': blob_uuid,
            'instances': usage_by_blob.get(blob_uuid, []),
            'size': b.size,
            'reference_count': b.ref_count,
        }

    view['blobs'] = blob_details
    return view
def test_state_filter_none(self, mock_get_all, mock_attr):
    """No instance matches STATE_CREATED, so the result is empty."""
    filters = [partial(baseobject.state_filter, dbo.STATE_CREATED)]
    uuids = [inst.uuid for inst in instance.Instances(filters)]
    self.assertEqual([], uuids)
def test_placement_filter_none(self, mock_get_all, mock_attr):
    """No instance is placed on 'node1', so the result is empty."""
    filters = [partial(instance.placement_filter, 'node1')]
    uuids = [inst.uuid for inst in instance.Instances(filters)]
    self.assertEqual([], uuids)
def test_base_iteration(self, mock_get_all):
    """With no filters, iteration yields every instance in order."""
    uuids = [inst.uuid for inst in instance.Instances([])]
    self.assertEqual(
        [
            '373a165e-9720-4e14-bd0e-9612de79ff15',
            'b078cb4e-857c-4f04-b011-751742ef5817',
            'a7c5ecec-c3a9-4774-ad1b-249d9e90e806'
        ],
        uuids)
def test_state_filter_all(self, mock_get_all, mock_attr):
    """Every instance matches STATE_CREATED, so all are returned."""
    filters = [partial(baseobject.state_filter, dbo.STATE_CREATED)]
    uuids = [inst.uuid for inst in instance.Instances(filters)]
    self.assertEqual(
        [
            '373a165e-9720-4e14-bd0e-9612de79ff15',
            'b078cb4e-857c-4f04-b011-751742ef5817',
            'a7c5ecec-c3a9-4774-ad1b-249d9e90e806'
        ],
        uuids)
def test_placement_filter_all(self, mock_get_all, mock_attr):
    """Every instance is placed on 'node1', so all are returned."""
    filters = [partial(instance.placement_filter, 'node1')]
    uuids = [inst.uuid for inst in instance.Instances(filters)]
    self.assertEqual(
        [
            '373a165e-9720-4e14-bd0e-9612de79ff15',
            'b078cb4e-857c-4f04-b011-751742ef5817',
            'a7c5ecec-c3a9-4774-ad1b-249d9e90e806'
        ],
        uuids)
def instances(self):
    """Build a list of instances that are using the blob as a block
    device.

    Returns a list of instance UUIDs (one entry per matching device, so
    an instance attaching this blob more than once appears more than
    once).
    """
    users = []
    for inst in instance.Instances([instance.healthy_states_filter]):
        # inst.block_devices isn't populated until the instance is created,
        # so it may not be ready yet. This means we will miss instances
        # which have been requested but not yet started.
        devices = inst.block_devices.get('devices', [])
        users.extend(inst.uuid for dev in devices
                     if dev.get('blob_uuid') == self.uuid)
    return users
def instance_delete(inst):
    """Delete an instance from this hypervisor.

    Powers the instance off, releases its network interfaces, deletes the
    instance itself, and then cleans up any of its networks which are no
    longer used by other instances on this node.

    Args:
        inst: the instance.Instance to delete.
    """
    with inst.get_lock(op='Instance delete'):
        # There are two delete state flows:
        #   - error transition states (preflight-error etc) to error
        #   - created to deleted
        #
        # We don't need delete_wait for the error states as they're already
        # in a transition state.
        if not inst.state.value.endswith('-error'):
            inst.state = dbo.STATE_DELETE_WAIT
        db.add_event('instance', inst.uuid, 'queued', 'delete', None, None)

        # Create list of networks used by instance. We cannot use the
        # interfaces cached in the instance here, because the instance
        # may have failed to get to the point where it populates that
        # field (an image fetch failure for example).
        instance_networks = []
        interfaces = []
        for ni in networkinterface.interfaces_for_instance(inst):
            if ni:
                interfaces.append(ni)
                if ni.network_uuid not in instance_networks:
                    instance_networks.append(ni.network_uuid)

        # Stop the instance
        inst.power_off()

        # Delete the instance's interfaces
        with util_general.RecordedOperation('release network addresses', inst):
            for ni in interfaces:
                ni.delete()

        # Create list of networks used by all other instances on this node.
        # NOTE(review): this previously iterated inst.interfaces (the
        # instance being deleted) instead of i.interfaces, which meant
        # host_networks was computed from the wrong instance and in-use
        # networks could be torn down below.
        host_networks = []
        for i in instance.Instances(
                [instance.this_node_filter, instance.active_states_filter]):
            if i.uuid == inst.uuid:
                continue
            for iface_uuid in i.interfaces:
                ni = networkinterface.NetworkInterface.from_db(iface_uuid)
                if ni and ni.network_uuid not in host_networks:
                    host_networks.append(ni.network_uuid)

        inst.delete()

        # Check each network used by the deleted instance
        for network in instance_networks:
            n = net.Network.from_db(network)
            if not n:
                continue

            if network in host_networks:
                # Network is still used by another instance; only update DHCP.
                if n.state.value == dbo.STATE_DELETE_WAIT:
                    # Do not update a network about to be deleted
                    continue
                with util_general.RecordedOperation(
                        'deallocate ip address', inst):
                    n.update_dhcp()
            else:
                # Network not used by any other instance, therefore delete
                with util_general.RecordedOperation(
                        'remove network from node', n):
                    n.delete_on_hypervisor()
def test_state_filter_active(self, mock_get_all, mock_attr):
    """Only one instance is in an active state."""
    uuids = [inst.uuid
             for inst in instance.Instances([instance.active_states_filter])]
    self.assertEqual(['a7c5ecec-c3a9-4774-ad1b-249d9e90e806'], uuids)
def run(self):
    """Main loop for the trigger monitor daemon.

    Keeps exactly one console-log observer process running per instance
    created on this node: reaps observers that have died, starts observers
    for newly created instances, and kills observers for instances that no
    longer exist. On shutdown, all remaining observers are killed.
    """
    LOG.info('Starting')
    observers = {}

    while not self.exit.is_set():
        # Cleanup terminated observers. Take a snapshot of the keys so we
        # can delete from the dict while iterating.
        all_observers = list(observers.keys())
        for instance_uuid in all_observers:
            if not observers[instance_uuid].is_alive():
                # Reap process (join with a short timeout so a stuck child
                # doesn't block the loop)
                observers[instance_uuid].join(1)
                LOG.with_instance(instance_uuid).info(
                    'Trigger observer has terminated')
                db.add_event('instance', instance_uuid,
                             'trigger monitor', 'crashed', None, None)
                del observers[instance_uuid]

        # Audit desired observers: anything left in extra_instances after
        # the scan has no backing instance; anything in missing_instances
        # is a created instance on this node without an observer.
        extra_instances = list(observers.keys())
        missing_instances = []
        with etcd.ThreadLocalReadOnlyCache():
            for inst in instance.Instances([
                    instance.this_node_filter,
                    partial(baseobject.state_filter,
                            [instance.Instance.STATE_CREATED])]):
                if inst.uuid in extra_instances:
                    extra_instances.remove(inst.uuid)
                if inst.uuid not in observers:
                    missing_instances.append(inst.uuid)

        # Start missing observers
        for instance_uuid in missing_instances:
            console_path = os.path.join(
                config.STORAGE_PATH, 'instances', instance_uuid,
                'console.log')
            p = multiprocessing.Process(
                target=observe, args=(console_path, instance_uuid),
                name='%s-%s' % (daemon.process_name('triggers'),
                                instance_uuid))
            p.start()

            observers[instance_uuid] = p
            LOG.with_instance(instance_uuid).info(
                'Started trigger observer')
            db.add_event('instance', instance_uuid,
                         'trigger monitor', 'started', None, None)

        # Cleanup extra observers for instances which no longer exist
        for instance_uuid in extra_instances:
            p = observers[instance_uuid]
            try:
                # Best effort: the process may already be gone.
                os.kill(p.pid, signal.SIGKILL)
                observers[instance_uuid].join(1)
            except Exception:
                pass

            del observers[instance_uuid]
            LOG.with_instance(instance_uuid).info(
                'Finished trigger observer')
            db.add_event('instance', instance_uuid,
                         'trigger monitor', 'finished', None, None)

        self.exit.wait(1)

    # No longer running, clean up all trigger daemons
    for instance_uuid in observers:
        os.kill(observers[instance_uuid].pid, signal.SIGKILL)
def _maintain_networks(self):
    """Reconcile this node's virtual networks with the desired state.

    Discovers the vxlan interfaces present on the node, works out which
    networks this node should participate in (those with local active
    instances, or all active networks if this is the network node),
    repairs networks that are not okay, and warns about stray vxlans
    that have been present for more than five minutes.
    """
    LOG.info('Maintaining networks')

    # Discover what networks are present on this node's interfaces
    _, _, vxid_to_mac = util_network.discover_interfaces()

    # Determine what networks we should be on
    host_networks = []
    seen_vxids = []

    if not config.NODE_IS_NETWORK_NODE:
        # For normal nodes, just the ones we have instances for. We need
        # to use the more expensive interfaces_for_instance() method of
        # looking up instance interfaces here if the instance cache hasn't
        # been populated yet (i.e. the instance is still being created)
        for inst in instance.Instances([instance.this_node_filter,
                                        instance.active_states_filter]):
            ifaces = inst.interfaces
            if not ifaces:
                ifaces = list(
                    networkinterface.interfaces_for_instance(inst))

            for iface_uuid in ifaces:
                ni = networkinterface.NetworkInterface.from_db(iface_uuid)
                if not ni:
                    LOG.with_instance(
                        inst).with_networkinterface(
                        iface_uuid).error('Network interface does not exist')
                elif ni.network_uuid not in host_networks:
                    host_networks.append(ni.network_uuid)
    else:
        # For network nodes, it's all active networks
        for n in net.Networks([baseobject.active_states_filter]):
            host_networks.append(n.uuid)

    # Ensure we are on every network we have a host for
    for network in host_networks:
        try:
            n = net.Network.from_db(network)
            if not n:
                continue

            # If this network is in state delete_wait, then we should remove
            # it if it has no interfaces left.
            if n.state.value == dbo.STATE_DELETE_WAIT:
                if not networkinterface.interfaces_for_network(n):
                    LOG.with_network(n).info(
                        'Removing stray delete_wait network')
                    etcd.enqueue('networknode', DestroyNetworkTask(n.uuid))

                # We skip maintenance on all delete_wait networks
                continue

            # Track what vxlan ids we've seen
            seen_vxids.append(n.vxid)

            if time.time() - n.state.update_time < 60:
                # Network state changed in the last minute, punt for now
                continue

            if not n.is_okay():
                if config.NODE_IS_NETWORK_NODE:
                    LOG.with_network(n).info(
                        'Recreating not okay network on network node')
                    n.create_on_network_node()

                    # If the network node was missing a network, then that
                    # implies that we also need to re-create all of the
                    # floating IPs for that network.
                    for ni in networkinterface.interfaces_for_network(n):
                        if ni.floating.get('floating_address'):
                            LOG.with_fields(
                                {
                                    'instance': ni.instance_uuid,
                                    'networkinterface': ni.uuid,
                                    'floating': ni.floating.get(
                                        'floating_address')
                                }).info('Refloating interface')
                            n.add_floating_ip(ni.floating.get(
                                'floating_address'), ni.ipv4)
                else:
                    LOG.with_network(n).info(
                        'Recreating not okay network on hypervisor')
                    n.create_on_hypervisor()

            n.ensure_mesh()

        except exceptions.LockException as e:
            # Another actor holds the network lock; retry next pass.
            LOG.warning(
                'Failed to acquire lock while maintaining networks: %s' % e)
        except exceptions.DeadNetwork as e:
            LOG.with_field('exception', e).info(
                'maintain_network attempted on dead network')
        except processutils.ProcessExecutionError as e:
            LOG.error('Network maintenance failure: %s', e)

    # Determine if there are any extra vxids present on the node but not
    # expected by any network we should be on
    extra_vxids = set(vxid_to_mac.keys()) - set(seen_vxids)

    # We keep a global cache of extra vxlans we've seen before, so that
    # we only warn about them when they've been stray for five minutes.
    global EXTRA_VLANS_HISTORY
    for vxid in EXTRA_VLANS_HISTORY.copy():
        if vxid not in extra_vxids:
            del EXTRA_VLANS_HISTORY[vxid]
    for vxid in extra_vxids:
        if vxid not in EXTRA_VLANS_HISTORY:
            EXTRA_VLANS_HISTORY[vxid] = time.time()

    # Warn of extra vxlans which have been present for more than five minutes
    for vxid in EXTRA_VLANS_HISTORY:
        if time.time() - EXTRA_VLANS_HISTORY[vxid] > 5 * 60:
            LOG.with_field('vxid', vxid).warning(
                'Extra vxlan present!')
def restore_instances():
    """Restore this node's instances after a daemon restart.

    Ensures all healthy instances for this node are defined, have up to
    date interface data, and are recreated on the hypervisor (along with
    the networks they use) if they were previously powered on.
    """
    networks = []
    instances = []
    for inst in instance.Instances([instance.this_node_filter,
                                    instance.healthy_states_filter]):
        instance_problems = []
        inst_interfaces = inst.interfaces
        if not inst_interfaces:
            inst_interfaces = []

        updated_interfaces = False
        for ni in interfaces_for_instance(inst):
            if ni.network_uuid not in networks:
                networks.append(ni.network_uuid)
            if ni.uuid not in inst_interfaces:
                inst_interfaces.append(ni.uuid)
                updated_interfaces = True

        # We do not need a lock here because this loop only runs on the node
        # with the instance, and interfaces don't change post instance
        # creation.
        if updated_interfaces:
            inst.interfaces = inst_interfaces

        # TODO(mikal): do better here.
        # for disk in inst.disk_spec:
        #     if disk.get('base'):
        #         img = images.Image.new(disk['base'])
        #         # NOTE(mikal): this check isn't great -- it checks for the
        #         # original downloaded image, not the post transcode version
        #         if (img.state in [dbo.STATE_DELETED, dbo.STATE_ERROR] or
        #                 not os.path.exists(img.version_image_path())):
        #             instance_problems.append(
        #                 '%s missing from image cache' % disk['base'])
        #             img.delete()

        if instance_problems:
            inst.enqueue_delete_due_error(
                'instance bad on startup: %s' % '; '.join(instance_problems))
        else:
            instances.append(inst)

    with util_general.RecordedOperation('restore networks', None):
        for network in networks:
            try:
                n = net.Network.from_db(network)
                if not n.is_dead():
                    LOG.with_object(n).info('Restoring network')
                    n.create_on_hypervisor()
                    n.ensure_mesh()
            except Exception as e:
                util_general.ignore_exception(
                    'restore network %s' % network, e)

    with util_general.RecordedOperation('restore instances', None):
        for inst in instances:
            try:
                with inst.get_lock(ttl=120, timeout=120,
                                   op='Instance restore'):
                    # Only restart instances which were running (or in a
                    # state where we can't be sure they weren't).
                    started = ['on', 'transition-to-on',
                               instance.Instance.STATE_INITIAL, 'unknown']
                    if inst.power_state not in started:
                        continue

                    LOG.with_object(inst).info('Restoring instance')
                    inst.create_on_hypervisor()
            except Exception as e:
                util_general.ignore_exception(
                    'restore instance %s' % inst, e)
                # NOTE(review): this previously called
                # inst.etcd.enqueue_delete_due_error(), but the method lives
                # on the instance itself (as used above for startup
                # problems) -- the .etcd attribute access would raise
                # AttributeError inside the error handler.
                inst.enqueue_delete_due_error(
                    'exception while restoring instance on daemon restart')