def request_replication(self, allow_excess=0):
    with self.get_lock_attr('locations', 'Request replication'):
        locations = self.locations

        # Filter out absent locations
        for node_name in self.locations:
            n = Node.from_db(node_name)
            if n.state.value != Node.STATE_CREATED:
                locations.remove(node_name)

        replica_count = len(locations)
        targets = config.BLOB_REPLICATION_FACTOR + allow_excess - replica_count
        self.log.info(
            'Desired replica count is %d, we have %d, excess of %d requested'
            % (config.BLOB_REPLICATION_FACTOR, replica_count, allow_excess))

        if targets > 0:
            blob_size_gb = int(int(self.size) / GiB)
            nodes = nodes_by_free_disk_descending(
                minimum=blob_size_gb + config.MINIMUM_FREE_DISK,
                intention='blobs')

            # Don't copy to locations which already have the blob
            for n in self.locations:
                if n in nodes:
                    nodes.remove(n)

            self.log.with_field(
                'nodes', nodes).debug('Considered for blob replication')

            for n in nodes[:targets]:
                etcd.enqueue(n, {'tasks': [FetchBlobTask(self.uuid)]})
                self.log.with_field('node', n).info(
                    'Instructed to replicate blob')

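As a worked example (values assumed, not taken from the source): with config.BLOB_REPLICATION_FACTOR set to 2, one healthy location, and allow_excess left at 0, targets evaluates to 2 + 0 - 1 = 1, so a single FetchBlobTask is enqueued on the candidate node with the most free disk.
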
def remove_dhcp(self):
    if config.NODE_IS_NETWORK_NODE:
        subst = self.subst_dict()
        with util_general.RecordedOperation('remove dhcp', self):
            with self.get_lock(op='Network remove DHCP'):
                d = dhcp.DHCP(self, subst['vx_veth_inner'])
                d.remove_dhcpd()
    else:
        etcd.enqueue('networknode', RemoveDHCPNetworkTask(self.uuid))

def post(self, interface_uuid=None):
    ni, n, err = api_util.safe_get_network_interface(interface_uuid)
    if err:
        return err

    err = api_util.assign_floating_ip(ni)
    if err:
        return err

    etcd.enqueue('networknode',
                 FloatNetworkInterfaceTask(n.uuid, interface_uuid))

def post(self, url=None):
    # The only artifact type you can force the cluster to fetch is an
    # image, so TYPE_IMAGE is assumed here. We ensure that the image exists
    # in the database in an initial state here so that it will show up in
    # image list requests. The image is fetched by the queued job later.
    a = Artifact.from_url(Artifact.TYPE_IMAGE, url)
    etcd.enqueue(config.NODE_NAME, {
        'tasks': [FetchImageTask(url)],
    })
    return a.external_view()

def remove_nat(self):
    if config.NODE_IS_NETWORK_NODE:
        if self.floating_gateway:
            with db.get_lock('ipmanager', None, 'floating', ttl=120,
                             op='Remove NAT'):
                ipm = IPManager.from_db('floating')
                ipm.release(self.floating_gateway)
                ipm.persist()
                self.update_floating_gateway(None)
    else:
        etcd.enqueue('networknode', RemoveNATNetworkTask(self.uuid))

def update_dhcp(self):
    if not self.provide_dhcp:
        return

    if config.NODE_IS_NETWORK_NODE:
        subst = self.subst_dict()
        with util_general.RecordedOperation('update dhcp', self):
            with self.get_lock(op='Network update DHCP'):
                d = dhcp.DHCP(self, subst['vx_veth_inner'])
                d.restart_dhcpd()
    else:
        etcd.enqueue('networknode', UpdateDHCPNetworkTask(self.uuid))

def delete(self):
    if self.floating['floating_address']:
        etcd.enqueue(
            'networknode',
            DefloatNetworkInterfaceTask(self.network_uuid, self.uuid))

    with db.get_lock('ipmanager', None, self.network_uuid, ttl=120,
                     op='Release fixed IP'):
        ipm = IPManager.from_db(self.network_uuid)
        ipm.release(self.ipv4)
        ipm.persist()

    self.state = dbo.STATE_DELETED

def post(self, interface_uuid=None):
    ni, n, err = api_util.safe_get_network_interface(interface_uuid)
    if err:
        return err

    float_net = net.Network.from_db('floating')
    if not float_net:
        return api_base.error(404, 'floating network not found')

    # The address is released as part of the queued job, so this handler is
    # "unbalanced" compared with the float handler above, which assigns the
    # address before enqueueing.
    etcd.enqueue('networknode',
                 DefloatNetworkInterfaceTask(n.uuid, interface_uuid))

def delete_on_network_node(self):
    with self.get_lock(op='Network delete'):
        subst = self.subst_dict()

        if util_network.check_for_interface(subst['vx_veth_outer']):
            with util_general.RecordedOperation('delete router veth', self):
                util_process.execute(
                    None, 'ip link delete %(vx_veth_outer)s' % subst)

        if util_network.check_for_interface(subst['egress_veth_outer']):
            with util_general.RecordedOperation('delete egress veth', self):
                util_process.execute(
                    None, 'ip link delete %(egress_veth_outer)s' % subst)

        if os.path.exists('/var/run/netns/%s' % self.uuid):
            with util_general.RecordedOperation('delete netns', self):
                util_process.execute(
                    None, 'ip netns del %s' % self.uuid)

        if self.floating_gateway:
            with db.get_lock('ipmanager', None, 'floating', ttl=120,
                             op='Network delete'):
                ipm = IPManager.from_db('floating')
                ipm.release(self.floating_gateway)
                ipm.persist()
                self.update_floating_gateway(None)

        self.state = self.STATE_DELETED

    # Ensure that all hypervisors remove this network. This is really
    # just catching strays, apart from on the network node where we
    # absolutely need to do this thing.
    for hyp in Nodes([active_nodes]):
        etcd.enqueue(hyp.uuid, {'tasks': [
            HypervisorDestroyNetworkTask(self.uuid)
        ]})

    self.remove_dhcp()
    self.remove_nat()

    ipm = IPManager.from_db(self.uuid)
    ipm.delete()

def new(cls, name, namespace, netblock, provide_dhcp=False,
        provide_nat=False, uuid=None, vxid=None):
    if not uuid:
        # uuid should only be specified in testing
        uuid = str(uuid4())

    if not vxid:
        vxid = Network.allocate_vxid(uuid)

    # Pre-create the IPManager
    IPManager.new(uuid, netblock)

    Network._db_create(
        uuid,
        {
            'vxid': vxid,
            'name': name,
            'namespace': namespace,
            'netblock': netblock,
            'provide_dhcp': provide_dhcp,
            'provide_nat': provide_nat,
            'version': cls.current_version
        }
    )

    n = Network.from_db(uuid)
    n.state = Network.STATE_INITIAL

    # Networks should immediately appear on the network node
    etcd.enqueue('networknode', DeployNetworkTask(uuid))

    # TODO(andy): Integrate metadata into each object type
    # Initialise metadata
    db.persist_metadata('network', uuid, {})

    return n

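A minimal usage sketch, assuming new() is invoked as a classmethod on Network; the name, namespace, and netblock values are illustrative only.

n = Network.new('example-net', 'example-namespace', '10.0.0.0/24',
                provide_dhcp=True, provide_nat=True)
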
def enqueue_delete_remote(self, node):
    etcd.enqueue(node, {'tasks': [DeleteInstanceTask(self.uuid)]})

def enqueue(queuename, workitem):
    etcd.enqueue(queuename, workitem)

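A usage sketch of this wrapper (assumed, not from the source), showing the two patterns visible at the call sites elsewhere in this section: a {'tasks': [...]} dict sent to a named node queue, or a bare task object sent to the shared 'networknode' queue.

enqueue(config.NODE_NAME,
        {'tasks': [FetchImageTask('https://example.com/disk.qcow2')]})
enqueue('networknode', DeployNetworkTask(network_uuid))  # network_uuid is illustrative
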
def post(self, instance_ref=None, instance_from_db=None, all=None,
         device=None, max_versions=0):
    disks = instance_from_db.block_devices['devices']
    if instance_from_db.uefi:
        disks.append({
            'type': 'nvram',
            'device': 'nvram',
            'path': os.path.join(instance_from_db.instance_path, 'nvram'),
            'snapshot_ignores': False
        })

    # Filter if requested
    if device:
        new_disks = []
        for d in disks:
            if d['device'] == device:
                new_disks.append(d)
        disks = new_disks
    elif not all:
        disks = [disks[0]]

    LOG.with_fields({
        'instance': instance_from_db.uuid,
        'devices': disks
    }).info('Devices for snapshot')

    out = {}
    for disk in disks:
        if disk['snapshot_ignores']:
            continue

        if disk['type'] not in ['qcow2', 'nvram']:
            continue

        if not os.path.exists(disk['path']):
            continue

        a = Artifact.from_url(
            Artifact.TYPE_SNAPSHOT,
            '%s%s/%s' % (artifact.INSTANCE_URL, instance_from_db.uuid,
                         disk['device']),
            max_versions)

        blob_uuid = str(uuid.uuid4())
        entry = a.add_index(blob_uuid)

        out[disk['device']] = {
            'source_url': a.source_url,
            'artifact_uuid': a.uuid,
            'artifact_index': entry['index'],
            'blob_uuid': blob_uuid
        }

        if disk['type'] == 'nvram':
            # These are small and don't use qemu-img to capture, so just
            # do them now.
            blob.ensure_blob_path()
            dest_path = blob.Blob.filepath(blob_uuid)
            shutil.copyfile(disk['path'], dest_path)
            st = os.stat(dest_path)
            b = blob.Blob.new(blob_uuid, st.st_size, time.time(), time.time())
            b.observe()
            a.state = Artifact.STATE_CREATED
        else:
            etcd.enqueue(
                config.NODE_NAME,
                {
                    'tasks': [
                        SnapshotTask(instance_from_db.uuid, disk, a.uuid,
                                     blob_uuid)
                    ],
                })

        instance_from_db.add_event(
            'api', 'snapshot of %s requested' % disk['path'].split('/')[-1],
            None, a.uuid)

    return out

def handle(jobname, workitem):
    libvirt = util_libvirt.get_libvirt()

    log = LOG.with_field('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    inst = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if InstanceTask.__subclasscheck__(type(task)):
                inst = instance.Instance.from_db(task.instance_uuid())
                if not inst:
                    raise exceptions.InstanceNotInDBException(
                        task.instance_uuid())

            if isinstance(task, FetchImageTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if isinstance(task, SnapshotTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if inst:
                log_i = log.with_instance(inst)
            else:
                log_i = log

            log_i.with_field('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.
            if inst:
                # TODO(andy) move to QueueTask
                db.add_event('instance', inst.uuid, task.pretty_task_name(),
                             'dequeued', None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), inst)

            elif isinstance(task, PreflightInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED or
                        inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot preflight an instance in state %s, '
                        'skipping task' % inst.state.value)
                    continue

                redirect_to = instance_preflight(inst, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s'
                               % redirect_to)
                    etcd.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED or
                        inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot start an instance in state %s, '
                        'skipping task' % inst.state.value)
                    continue

                instance_start(inst, task.network())
                etcd.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(inst)
                    etcd.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util_general.ignore_exception(
                        'instance %s delete task' % inst, e)

            elif isinstance(task, FloatNetworkInterfaceTask):
                # Just punt it to the network node now that the interface is ready
                etcd.enqueue('networknode', task)

            elif isinstance(task, SnapshotTask):
                snapshot(inst, task.disk(), task.artifact_uuid(),
                         task.blob_uuid())

            elif isinstance(task, DeleteNetworkWhenClean):
                # Check if any interfaces remain on network
                task_network = net.Network.from_db(task.network_uuid())
                ifaces = networkinterface.interfaces_for_network(task_network)
                cur_interfaces = {i.uuid: i for i in ifaces}

                if cur_interfaces:
                    LOG.with_network(task_network).error(
                        'During DeleteNetworkWhenClean new interfaces have '
                        'connected to network: %s', cur_interfaces)

                # Only check those present at delete task initiation time.
                remain_interfaces = list(
                    set(task.wait_interfaces()) & set(cur_interfaces))
                if remain_interfaces:
                    # Queue task on a node with a remaining instance
                    first_iface = cur_interfaces[remain_interfaces[0]]
                    inst = instance.Instance.from_db(first_iface.instance_uuid)
                    etcd.enqueue(inst.placement['node'], {
                        'tasks': [
                            DeleteNetworkWhenClean(task.network_uuid(),
                                                   remain_interfaces)
                        ]
                    }, delay=60)
                else:
                    # All original instances deleted, safe to delete network
                    etcd.enqueue('networknode',
                                 DestroyNetworkTask(task.network_uuid()))

            elif isinstance(task, HypervisorDestroyNetworkTask):
                n = net.Network.from_db(task.network_uuid())
                n.delete_on_hypervisor()

            elif isinstance(task, FetchBlobTask):
                metrics = etcd.get('metrics', config.NODE_NAME, None)
                if metrics:
                    metrics = metrics.get('metrics', {})
                else:
                    metrics = {}

                b = blob.Blob.from_db(task.blob_uuid())
                if not b:
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, not found')

                elif (int(metrics.get('disk_free_blobs', 0)) - int(b.size) <
                        config.MINIMUM_FREE_DISK):
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, insufficient space')

                else:
                    log.with_object(b).info('Replicating blob')
                    size = b.ensure_local([])
                    log.with_object(b).with_fields({
                        'transferred': size,
                        'expected': b.size
                    }).info('Replicating blob complete')

            else:
                log_i.with_field('task', task).error(
                    'Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image fetch failed: %s' % e)

    except exceptions.ImagesCannotShrinkException as e:
        log.info('Fetch Resize Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image resize failed: %s' % e)

    except libvirt.libvirtError as e:
        log.info('Libvirt Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except exceptions.InstanceException as e:
        log.info('Instance Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except Exception as e:
        # Logging ignored exception - this should be investigated
        util_general.ignore_exception('queue worker', e)
        if inst:
            inst.enqueue_delete_due_error('Failed queue task: %s' % e)

    finally:
        etcd.resolve(config.NODE_NAME, jobname)
        if inst:
            inst.add_event('tasks complete', 'dequeued',
                           msg='Work item %s' % jobname)
        log.info('Completed workitem')

def _maintain_networks(self):
    LOG.info('Maintaining networks')

    # Discover what networks are present
    _, _, vxid_to_mac = util_network.discover_interfaces()

    # Determine what networks we should be on
    host_networks = []
    seen_vxids = []

    if not config.NODE_IS_NETWORK_NODE:
        # For normal nodes, just the ones we have instances for. We need
        # to use the more expensive interfaces_for_instance() method of
        # looking up instance interfaces here if the instance cache hasn't
        # been populated yet (i.e. the instance is still being created)
        for inst in instance.Instances([instance.this_node_filter,
                                        instance.active_states_filter]):
            ifaces = inst.interfaces
            if not ifaces:
                ifaces = list(
                    networkinterface.interfaces_for_instance(inst))

            for iface_uuid in ifaces:
                ni = networkinterface.NetworkInterface.from_db(iface_uuid)
                if not ni:
                    LOG.with_instance(inst).with_networkinterface(
                        iface_uuid).error('Network interface does not exist')
                elif ni.network_uuid not in host_networks:
                    host_networks.append(ni.network_uuid)
    else:
        # For network nodes, it's all networks
        for n in net.Networks([baseobject.active_states_filter]):
            host_networks.append(n.uuid)

    # Ensure we are on every network we have a host for
    for network in host_networks:
        try:
            n = net.Network.from_db(network)
            if not n:
                continue

            # If this network is in state delete_wait, then we should remove
            # it if it has no interfaces left.
            if n.state.value == dbo.STATE_DELETE_WAIT:
                if not networkinterface.interfaces_for_network(n):
                    LOG.with_network(n).info(
                        'Removing stray delete_wait network')
                    etcd.enqueue('networknode', DestroyNetworkTask(n.uuid))

                # We skip maintenance on all delete_wait networks
                continue

            # Track what vxlan ids we've seen
            seen_vxids.append(n.vxid)

            if time.time() - n.state.update_time < 60:
                # Network state changed in the last minute, punt for now
                continue

            if not n.is_okay():
                if config.NODE_IS_NETWORK_NODE:
                    LOG.with_network(n).info(
                        'Recreating not okay network on network node')
                    n.create_on_network_node()

                    # If the network node was missing a network, then that
                    # implies that we also need to re-create all of the
                    # floating IPs for that network.
                    for ni in networkinterface.interfaces_for_network(n):
                        if ni.floating.get('floating_address'):
                            LOG.with_fields(
                                {
                                    'instance': ni.instance_uuid,
                                    'networkinterface': ni.uuid,
                                    'floating': ni.floating.get(
                                        'floating_address')
                                }).info('Refloating interface')
                            n.add_floating_ip(
                                ni.floating.get('floating_address'), ni.ipv4)
                else:
                    LOG.with_network(n).info(
                        'Recreating not okay network on hypervisor')
                    n.create_on_hypervisor()

            n.ensure_mesh()

        except exceptions.LockException as e:
            LOG.warning(
                'Failed to acquire lock while maintaining networks: %s' % e)
        except exceptions.DeadNetwork as e:
            LOG.with_field('exception', e).info(
                'maintain_network attempted on dead network')
        except processutils.ProcessExecutionError as e:
            LOG.error('Network maintenance failure: %s', e)

    # Determine if there are any extra vxids
    extra_vxids = set(vxid_to_mac.keys()) - set(seen_vxids)

    # We keep a global cache of extra vxlans we've seen before, so that
    # we only warn about them when they've been stray for five minutes.
    global EXTRA_VLANS_HISTORY
    for vxid in EXTRA_VLANS_HISTORY.copy():
        if vxid not in extra_vxids:
            del EXTRA_VLANS_HISTORY[vxid]
    for vxid in extra_vxids:
        if vxid not in EXTRA_VLANS_HISTORY:
            EXTRA_VLANS_HISTORY[vxid] = time.time()

    # Warn of extra vxlans which have been present for more than five minutes
    for vxid in EXTRA_VLANS_HISTORY:
        if time.time() - EXTRA_VLANS_HISTORY[vxid] > 5 * 60:
            LOG.with_field('vxid', vxid).warning('Extra vxlan present!')