def delete_instances_and_wait(self, instance_infos):
    """Delete the given nova instances and wait for deletion to complete.

    :param instance_infos: iterable of instance-info objects (``id_`` attr)
    :raises NovaInstanceDeleteTimeout: if any instance still exists after
        ``cfg.CONF.boot_timeout`` seconds.
    """
    to_poll = list(instance_infos)

    for inst in instance_infos:
        try:
            self.destroy_instance(inst)
        except novaclient_exceptions.NotFound:
            # Already gone; polling below will confirm immediately.
            pass
        except Exception:
            # FIX: pass the id as a lazy logging arg instead of
            # %-formatting inside _LE(), so translation/deferred
            # formatting see the raw format string.
            LOG.exception(_LE('Error deleting instance %s'), inst.id_)
            # The delete call itself failed; polling would only time out.
            to_poll.remove(inst)

    # XXX parallelize this
    timed_out = []
    for inst in to_poll:
        start = time.time()
        while time.time() - start < cfg.CONF.boot_timeout:
            if not self.get_instance_by_id(inst.id_):
                LOG.debug('Instance %s has been deleted', inst.id_)
                break
            LOG.debug('Instance %s has not finished stopping', inst.id_)
            time.sleep(cfg.CONF.retry_delay)
        else:
            timed_out.append(inst)
            LOG.error(_LE('Instance %s failed to stop within %d secs'),
                      inst.id_, cfg.CONF.boot_timeout)

    if timed_out:
        raise NovaInstanceDeleteTimeout()
def stop(self, worker_context):
    """Attempts to destroy the instance with configured timeout.

    :param worker_context: worker context providing a nova client
    :returns: the resulting state, or None if the stop timed out
    """
    self.log.info(_LI('Destroying instance'))

    self.driver.delete_ports(worker_context)

    if not self.instance_info:
        self.log.info(_LI('Instance already destroyed.'))
        if self.state != states.GONE:
            self.state = states.DOWN
        return self.state

    try:
        worker_context.nova_client.destroy_instance(self.instance_info)
    except Exception:
        self.log.exception(_LE('Error deleting router instance'))

    start = time.time()
    # FIX: removed the unused ``i`` loop counter.
    while time.time() - start < cfg.CONF.boot_timeout:
        if not worker_context.nova_client.\
                get_instance_by_id(self.instance_info.id_):
            if self.state != states.GONE:
                self.state = states.DOWN
            return self.state
        self.log.debug('Router has not finished stopping')
        time.sleep(cfg.CONF.retry_delay)
    self.log.error(_LE(
        'Router failed to stop within %d secs'),
        cfg.CONF.boot_timeout)
def delete_instances_and_wait(self, instance_infos):
    """Deletes the nova instances and waits for their deletion to complete.

    :param instance_infos: iterable of instance-info objects (``id_`` attr)
    :raises NovaInstanceDeleteTimeout: when any instance outlives
        ``cfg.CONF.boot_timeout`` seconds.
    """
    to_poll = list(instance_infos)

    for inst in instance_infos:
        try:
            self.destroy_instance(inst)
        except novaclient_exceptions.NotFound:
            pass
        except Exception:
            # FIX: lazy logging arg rather than %-formatting inside
            # _LE(); keeps the raw format string for translation.
            LOG.exception(
                _LE('Error deleting instance %s'), inst.id_)
            # Delete failed outright, so skip polling this one.
            to_poll.remove(inst)

    # XXX parallelize this
    timed_out = []
    for inst in to_poll:
        start = time.time()
        while time.time() - start < cfg.CONF.boot_timeout:
            if not self.get_instance_by_id(inst.id_):
                LOG.debug('Instance %s has been deleted', inst.id_)
                break
            LOG.debug(
                'Instance %s has not finished stopping', inst.id_)
            time.sleep(cfg.CONF.retry_delay)
        else:
            timed_out.append(inst)
            LOG.error(_LE(
                'Instance %s failed to stop within %d secs'),
                inst.id_, cfg.CONF.boot_timeout)

    if timed_out:
        raise NovaInstanceDeleteTimeout()
def get_bridge_for_iface(root_helper, iface):
    """Return the OVS bridge that *iface* belongs to, or None on error."""
    cmd = ["ovs-vsctl", "--timeout=2", "iface-to-br", iface]
    try:
        output = utils.execute(cmd, root_helper=root_helper)
    except Exception:
        LOG.exception(_LE("Interface %s not found."), iface)
        return None
    return output.strip()
def _check_del_instances(self, pools):
    """Scans the pool for deleted instances and checks deletion timers.

    :param pools: mapping of resource -> list of instance objects
    :returns: instances stuck in DELETING longer than ``delete_timeout``
    """
    # XXX: What do we do with instances stuck in deleting?
    # For now, just return stuck instances to caller and we can figure
    # out what to do with them later.
    stuck_instances = []
    del_instances = []
    for resource, pool in pools.items():
        del_instances += [i for i in pool if i.status == DELETING]

    # clean out counters for old instances that have been deleted entirely
    if self._delete_counters:
        del_instance_ids = [i.id for i in del_instances]
        for inst_id in copy.copy(self._delete_counters):
            if inst_id not in del_instance_ids:
                self._delete_counters.pop(inst_id)

    for del_inst in del_instances:
        if del_inst.id not in self._delete_counters:
            self._delete_counters[del_inst.id] = timeutils.utcnow()
        else:
            if timeutils.is_older_than(self._delete_counters[del_inst.id],
                                       self.delete_timeout):
                # FIX: log del_inst.id — the previous code used the
                # leaked comprehension variable ``i``, which is wrong
                # (and a NameError on Python 3).
                LOG.error(_LE(
                    'Instance %s is stuck in %s for more than %s '
                    'seconds.'), del_inst.id, DELETING,
                    self.delete_timeout)
                stuck_instances.append(del_inst)

    return stuck_instances
def update(self, worker_context):
    "Called when the router config should be changed"
    while self._queue:
        while True:
            if self.deleted:
                self.driver.log.debug(
                    "skipping update because the router is being deleted")
                return

            try:
                self.driver.log.debug(
                    "%s.execute(%s) instance.state=%s",
                    self.state, self.action, self.instance.state)
                self.action = self.state.execute(self.action,
                                                 worker_context)
                self.driver.log.debug(
                    "%s.execute -> %s instance.state=%s",
                    self.state, self.action, self.instance.state)
            except Exception:
                # FIX: catch Exception instead of a bare except so
                # SystemExit/KeyboardInterrupt are not swallowed.
                self.driver.log.exception(
                    _LE("%s.execute() failed for action: %s"),
                    self.state, self.action)

            old_state = self.state
            self.state = self.state.transition(self.action,
                                               worker_context)
            self.driver.log.debug(
                "%s.transition(%s) -> %s instance.state=%s",
                old_state, self.action, self.state, self.instance.state)

            # Yield control each time we stop to figure out what
            # to do next.
            if isinstance(self.state, CalcAction):
                return  # yield

            # We have reached the exit state, so the router has
            # been deleted somehow.
            if isinstance(self.state, Exit):
                self._do_delete()
                return
def __call__(self, req):
    """Run a non-interactive ctl command delivered via HTTP PUT.

    :param req: the incoming webob.Request
    :returns: command output as a string, or a webob HTTP error
    """
    try:
        if req.method != 'PUT':
            return webob.exc.HTTPMethodNotAllowed()
        args = filter(None, req.path.split('/'))
        if not args:
            return webob.exc.HTTPNotFound()
        # FIX: do not unpack into ``_`` — that shadowed the i18n
        # translation function ``_`` used in the except handler below,
        # which would then raise TypeError instead of returning a 500.
        command, _cmd_name, _search_args = \
            self.ctl.command_manager.find_command(args)
        if command.interactive:
            return webob.exc.HTTPNotImplemented()
        return str(self.ctl.run(['--debug'] + args))
    except SystemExit:
        # cliff invokes -h (help) on argparse failure
        # (which in turn results in sys.exit call)
        return webob.exc.HTTPBadRequest()
    except ValueError:
        return webob.exc.HTTPNotFound()
    except Exception:
        LOG.exception(_LE("Unexpected error."))
        msg = _('An unknown error has occurred. '
                'Please try your request again.')
        return webob.exc.HTTPInternalServerError(
            explanation=six.text_type(msg))
def get_bridges(root_helper):
    """Return the list of OVS bridge names, or [] when the query fails."""
    cmd = ["ovs-vsctl", "--timeout=2", "list-br"]
    try:
        output = utils.execute(cmd, root_helper=root_helper)
    except Exception:
        LOG.exception(_LE("Unable to retrieve bridges."))
        return []
    return output.strip().split("\n")
def boot(self, worker_context):
    """Boots the instances with driver pre/post boot hooks.

    :returns: None
    """
    self.log.info('Booting %s' % self.resource.RESOURCE_NAME)
    if self.state != states.DEGRADED:
        self.state = states.DOWN
        self._boot_counter.start()

    # driver preboot hook
    self.resource.pre_boot(worker_context)

    try:
        self.instances.create(worker_context)
        if not self.instances:
            self.log.info(_LI('Previous instances are still deleting'))
            # Reset the boot counter, causing the state machine to start
            # again with a new Instance.
            self.reset_boot_counter()
            return
    except Exception:
        # FIX: catch Exception instead of a bare except so
        # SystemExit/KeyboardInterrupt are not swallowed.
        self.log.exception(_LE('Instances failed to start boot'))
    else:
        self.state = states.BOOTING

    # driver post boot hook
    self.resource.post_boot(worker_context)
def _update_config(self, instance, config):
    """Push *config* to *instance*, retrying up to CONF.max_retries.

    :returns: True when the update succeeded, False when all attempts
        failed.
    """
    self.log.debug(
        'Updating config for instance %s on resource %s',
        instance.id_, self.resource.id)
    self.log.debug('New config: %r', config)
    max_attempts = cfg.CONF.max_retries
    for attempt in six.moves.range(max_attempts):
        try:
            self.resource.update_config(
                instance.management_address, config)
        except Exception:
            if attempt == max_attempts - 1:
                # Only log the traceback if we encounter it many times.
                self.log.exception(_LE('failed to update config'))
            else:
                self.log.debug(
                    'failed to update config, attempt %d', attempt)
            time.sleep(cfg.CONF.retry_delay)
        else:
            self.log.info('Instance config updated')
            return True
    return False
def _check_del_instances(self, pools):
    """Scans the pool for deleted instances and checks deletion timers.

    :param pools: mapping of resource -> list of instance objects
    :returns: instances stuck in DELETING longer than ``delete_timeout``
    """
    # XXX: What do we do with instances stuck in deleting?
    # For now, just return stuck instances to caller and we can figure
    # out what to do with them later.
    stuck_instances = []
    del_instances = []
    for resource, pool in pools.items():
        del_instances += [i for i in pool if i.status == DELETING]

    # clean out counters for old instances that have been deleted entirely
    if self._delete_counters:
        del_instance_ids = [i.id for i in del_instances]
        for inst_id in copy.copy(self._delete_counters):
            if inst_id not in del_instance_ids:
                self._delete_counters.pop(inst_id)

    for del_inst in del_instances:
        if del_inst.id not in self._delete_counters:
            self._delete_counters[del_inst.id] = timeutils.utcnow()
        else:
            if timeutils.is_older_than(self._delete_counters[del_inst.id],
                                       self.delete_timeout):
                # FIX: use del_inst.id, not the leaked comprehension
                # variable ``i`` (wrong value on py2, NameError on py3).
                LOG.error(
                    _LE('Instance %s is stuck in %s for more than %s '
                        'seconds.'),
                    del_inst.id, DELETING, self.delete_timeout)
                stuck_instances.append(del_inst)

    return stuck_instances
def get_instance_provider(provider):
    """Return the registered instance provider class for *provider*.

    Falls back to the 'default' provider when the name is unknown.
    """
    try:
        return INSTANCE_PROVIDERS[provider]
    except KeyError:
        fallback = INSTANCE_PROVIDERS['default']
        LOG.error(_LE('Could not find %s instance provider, using default %s'),
                  provider, fallback)
        return fallback
def run_vsctl(self, args):
    """Run an ovs-vsctl command, logging (not raising) failures."""
    full_args = ["ovs-vsctl", "--timeout=2"] + args
    try:
        return utils.execute(full_args, root_helper=self.root_helper)
    except Exception as e:
        # FIX: ``except Exception, e`` is Python-2-only syntax;
        # ``as e`` works on both Python 2.6+ and 3.
        LOG.error(_LE(
            "Unable to execute %(cmd)s. Exception: %(exception)s"),
            {'cmd': full_args, 'exception': e})
def run_ofctl(self, cmd, args):
    """Run an ovs-ofctl command on this bridge, logging failures."""
    full_args = ["ovs-ofctl", cmd, self.br_name] + args
    try:
        return utils.execute(full_args, root_helper=self.root_helper)
    except Exception as e:
        # FIX: ``except Exception, e`` is Python-2-only syntax;
        # ``as e`` works on both Python 2.6+ and 3.
        LOG.error(_LE(
            "Unable to execute %(cmd)s. Exception: %(exception)s"),
            {'cmd': full_args, 'exception': e})
def shutdown(self):
    """Shut down every managed state machine, logging any failure."""
    LOG.info('shutting down')
    for resource_id, machine in self.state_machines.items():
        try:
            machine.service_shutdown()
        except Exception:
            LOG.exception(_LE('Failed to shutdown state machine for %s'),
                          resource_id)
def unplug(self, device_name, bridge=None, namespace=None, prefix=None):
    """Unplug the interface."""
    device = ip_lib.IPDevice(device_name, self.root_helper, namespace)
    try:
        device.link.delete()
    except RuntimeError:
        LOG.exception(_LE(
            "Failed unplugging interface '%s'"), device_name)
    else:
        LOG.debug("Unplugged interface '%s'", device_name)
def shutdown(self):
    """Request a service shutdown from each state machine we manage."""
    LOG.info('shutting down')
    for rid, sm in self.state_machines.items():
        try:
            sm.service_shutdown()
        except Exception:
            LOG.exception(
                _LE('Failed to shutdown state machine for %s'), rid
            )
def get_xapi_iface_id(self, xs_vif_uuid):
    """Return the nicira-iface-id from a XenServer VIF's other-config."""
    args = ["xe", "vif-param-get", "param-name=other-config",
            "param-key=nicira-iface-id", "uuid=%s" % xs_vif_uuid]
    try:
        return utils.execute(args, root_helper=self.root_helper).strip()
    except Exception as e:
        # FIX: ``except Exception, e`` is Python-2-only syntax;
        # ``as e`` works on both Python 2.6+ and 3.
        LOG.error(_LE(
            "Unable to execute %(cmd)s. Exception: %(exception)s"),
            {'cmd': args, 'exception': e})
def run_vsctl(self, args):
    """Run an ovs-vsctl command, logging (not raising) failures."""
    full_args = ["ovs-vsctl", "--timeout=2"] + args
    try:
        return utils.execute(full_args, root_helper=self.root_helper)
    except Exception as e:
        # FIX: modern ``as`` binding; the comma form is Python-2-only.
        LOG.error(
            _LE("Unable to execute %(cmd)s. Exception: %(exception)s"), {
                'cmd': full_args,
                'exception': e
            })
def run_ofctl(self, cmd, args):
    """Run an ovs-ofctl command on this bridge, logging failures."""
    full_args = ["ovs-ofctl", cmd, self.br_name] + args
    try:
        return utils.execute(full_args, root_helper=self.root_helper)
    except Exception as e:
        # FIX: modern ``as`` binding; the comma form is Python-2-only.
        LOG.error(
            _LE("Unable to execute %(cmd)s. Exception: %(exception)s"), {
                'cmd': full_args,
                'exception': e
            })
def _check_err_instances(self, pools):
    """Scans the pool and deletes any instances in error state"""
    for resource, pool in copy.copy(pools).items():
        err_instances = [i for i in pool if i.status == ERROR]
        for err_inst in err_instances:
            # FIX: log err_inst.id — the previous code referenced ``i``
            # before it was assigned below (leaked comprehension var on
            # py2, NameError on py3).
            LOG.error(_LE('Instance %s is in %s state, deleting.'),
                      err_inst.id, ERROR)
            del_instance = self.delete_instance(err_inst.id)
            i = pool.index(err_inst)
            pools[resource][i] = del_instance
def update(self, worker_context):
    "Called when the router config should be changed"
    while self._queue:
        while True:
            if self.deleted:
                self.resource.log.debug(
                    'skipping update because the router is being deleted'
                )
                return

            try:
                self.resource.log.debug(
                    '%s.execute(%s) instance.state=%s',
                    self.state, self.action, self.instance.state)
                self.action = self.state.execute(
                    self.action,
                    worker_context,
                )
                self.resource.log.debug(
                    '%s.execute -> %s instance.state=%s',
                    self.state, self.action, self.instance.state)
            except Exception:
                # FIX: catch Exception instead of a bare except so
                # SystemExit/KeyboardInterrupt are not swallowed.
                self.resource.log.exception(
                    _LE('%s.execute() failed for action: %s'),
                    self.state, self.action
                )

            old_state = self.state
            self.state = self.state.transition(
                self.action,
                worker_context,
            )
            self.resource.log.debug(
                '%s.transition(%s) -> %s instance.state=%s',
                old_state, self.action, self.state, self.instance.state)

            # Yield control each time we stop to figure out what
            # to do next.
            if isinstance(self.state, CalcAction):
                return  # yield

            # We have reached the exit state, so the router has
            # been deleted somehow.
            if isinstance(self.state, Exit):
                self._do_delete()
                return
def _check_err_instances(self, pools):
    """Scans the pool and deletes any instances in error state"""
    for resource, pool in copy.copy(pools).items():
        err_instances = [i for i in pool if i.status == ERROR]
        for err_inst in err_instances:
            # FIX: use err_inst.id — ``i`` is not assigned yet at this
            # point (NameError on py3, stale value on py2).
            LOG.error(_LE(
                'Instance %s is in %s state, deleting.'),
                err_inst.id, ERROR)
            del_instance = self.delete_instance(err_inst.id)
            i = pool.index(err_inst)
            pools[resource][i] = del_instance
def get_xapi_iface_id(self, xs_vif_uuid):
    """Return the nicira-iface-id from a XenServer VIF's other-config."""
    args = [
        "xe", "vif-param-get", "param-name=other-config",
        "param-key=nicira-iface-id", "uuid=%s" % xs_vif_uuid
    ]
    try:
        return utils.execute(args, root_helper=self.root_helper).strip()
    except Exception as e:
        # FIX: modern ``as`` binding; the comma form is Python-2-only.
        LOG.error(
            _LE("Unable to execute %(cmd)s. Exception: %(exception)s"), {
                'cmd': args,
                'exception': e
            })
def __init__(self, conf):
    """Build the nova client and resolve the instance provider class."""
    self.conf = conf
    ks_session = keystone.KeystoneSession()
    self.client = client.Client(version='2',
                                session=ks_session.session,
                                region_name=conf.auth_region)
    try:
        provider_cls = get_instance_provider(conf.instance_provider)
        self.instance_provider = provider_cls(self.client)
    except AttributeError:
        # conf has no instance_provider option configured.
        fallback = INSTANCE_PROVIDERS['default']
        LOG.error(_LE('Could not find provider config, using default %s'),
                  fallback)
        self.instance_provider = fallback(self.client)
def __init__(self, conf):
    """Build the nova client and resolve the instance provider class."""
    self.conf = conf
    ks_session = keystone.KeystoneSession()
    self.client = client.Client(
        version='2',
        session=ks_session.session,
        region_name=conf.auth_region)

    try:
        self.instance_provider = get_instance_provider(
            conf.instance_provider)(self.client)
    except AttributeError:
        # No instance_provider option in the config; use the default.
        default_cls = INSTANCE_PROVIDERS['default']
        LOG.error(_LE('Could not find provider config, using default %s'),
                  default_cls)
        self.instance_provider = default_cls(self.client)
def _load_resource_from_message(self, worker_context, message):
    """Load the driver for the resource named in *message*.

    Prefers the tenant's BYONF function when enabled, falling back to
    the configured image when the BYONF driver cannot be loaded.
    """
    if cfg.CONF.enable_byonf:
        byonf_res = worker_context.neutron.tenant_has_byo_for_function(
            tenant_id=self.tenant_id.replace('-', ''),
            function_type=message.resource.driver)

        if byonf_res:
            try:
                return drivers.load_from_byonf(worker_context,
                                               byonf_res,
                                               message.resource.id)
            except drivers.InvalidDriverException:
                # Fall through to the configured image below.
                # FIX: removed the redundant ``pass`` after the log call.
                LOG.exception(
                    _LE('Could not load BYONF driver, falling back to '
                        'configured image'))

    return drivers.get(message.resource.driver)(worker_context,
                                                message.resource.id)
def _send(self, ready):
    """Deliver notification messages from the in-process queue to
    the appropriate topic via the AMQP service.
    """
    # setup notifier driver ahead a time
    self.get_notifier()
    # Tell the start() method that we have set up the AMQP
    # communication stuff and are ready to do some work.
    ready.set()
    # A queued None is the sentinel telling us to stop.
    for msg in iter(self._q.get, None):
        LOG.debug('sending notification %r', msg)
        try:
            self.send(event_type=msg['event_type'],
                      message=msg['payload'])
        except Exception:
            LOG.exception(_LE('could not publish notification'))
def unplug(self, device_name, bridge=None, namespace=None, prefix=None):
    """Unplug the interface."""
    bridge = bridge or self.conf.ovs_integration_bridge
    tap_name = self._get_tap_name(device_name, prefix)
    self.check_bridge_exists(bridge)
    ovs = ovs_lib.OVSBridge(bridge, self.root_helper)

    try:
        ovs.delete_port(tap_name)
        if self.conf.ovs_use_veth:
            ip_lib.IPDevice(device_name,
                            self.root_helper,
                            namespace).link.delete()
        LOG.debug(_("Unplugged interface '%s'"), device_name)
    except RuntimeError:
        LOG.exception(_LE("Failed unplugging interface '%s'"),
                      device_name)
def shuffle_notifications(notification_queue, sched):
    """Copy messages from the notification queue into the scheduler.

    Runs until a (None, ...) sentinel or Ctrl-C arrives.
    """
    while True:
        try:
            target, message = notification_queue.get()
            if target is None:
                break
            sched.handle_message(target, message)
        except IOError:
            # FIXME(rods): if a signal arrive during an IO operation
            # an IOError is raised. We catch the exceptions in
            # meantime waiting for a better solution.
            pass
        except KeyboardInterrupt:
            LOG.info(_LI('got Ctrl-C'))
            break
        except Exception:
            # FIX: catch Exception rather than a bare except so
            # SystemExit can still terminate the loop.
            LOG.exception(_LE('unhandled exception processing message'))
def boot(self, worker_context):
    """Boots the instance with driver pre/post boot hooks.

    :returns: None
    """
    self._ensure_cache(worker_context)

    self.log.info('Booting %s' % self.driver.RESOURCE_NAME)
    self.state = states.DOWN
    self._boot_counter.start()

    # driver preboot hook
    self.driver.pre_boot(worker_context)

    # try to boot the instance
    try:
        instance_info = worker_context.nova_client.boot_instance(
            resource_type=self.driver.RESOURCE_NAME,
            prev_instance_info=self.instance_info,
            name=self.driver.name,
            image_uuid=self.driver.image_uuid,
            flavor=self.driver.flavor,
            make_ports_callback=self.driver.make_ports(worker_context)
        )
        if not instance_info:
            self.log.info(_LI('Previous instance is still deleting'))
            # Reset the boot counter, causing the state machine to start
            # again with a new Instance.
            self.reset_boot_counter()
            self.instance_info = None
            return
    except Exception:
        # FIX: catch Exception instead of a bare except so
        # SystemExit/KeyboardInterrupt are not swallowed.
        self.log.exception(_LE('Instance failed to start boot'))
        self.driver.delete_ports(worker_context)
    else:
        # We have successfully started a (re)boot attempt so
        # record the timestamp so we can report how long it takes.
        self.state = states.BOOTING
        self.instance_info = instance_info

    # driver post boot hook
    self.driver.post_boot(worker_context)
def _load_resource_from_message(self, worker_context, message):
    """Load the driver for the resource named in *message*.

    Prefers the tenant's BYONF function when enabled, falling back to
    the configured image when the BYONF driver cannot be loaded.
    """
    if cfg.CONF.enable_byonf:
        byonf_res = worker_context.neutron.tenant_has_byo_for_function(
            tenant_id=self.tenant_id.replace('-', ''),
            function_type=message.resource.driver)

        if byonf_res:
            try:
                return drivers.load_from_byonf(
                    worker_context,
                    byonf_res,
                    message.resource.id)
            except drivers.InvalidDriverException:
                # Fall through to the configured image below.
                # FIX: removed the redundant ``pass`` after the log call.
                LOG.exception(_LE(
                    'Could not load BYONF driver, falling back to '
                    'configured image'))

    return drivers.get(message.resource.driver)(
        worker_context, message.resource.id)
def __call__(self, req):
    """Inital handler for an incoming `webob.Request`.

    :param req: The webob.Request to handle
    :returns: returns a valid HTTP Response or Error
    """
    try:
        LOG.debug("Request: %s", req)

        instance_id = self._get_instance_id(req)
        if instance_id:
            return self._proxy_request(instance_id, req)
        else:
            return webob.exc.HTTPNotFound()

    except Exception:
        LOG.exception(_LE("Unexpected error."))
        msg = ('An unknown error has occurred. '
               'Please try your request again.')
        # FIX: six.text_type instead of the Python-2-only ``unicode``
        # builtin (matches the other handler in this file).
        return webob.exc.HTTPInternalServerError(
            explanation=six.text_type(msg))
def __call__(self, req):
    """Inital handler for an incoming `webob.Request`.

    :param req: The webob.Request to handle
    :returns: returns a valid HTTP Response or Error
    """
    try:
        LOG.debug("Request: %s", req)
        instance_id = self._get_instance_id(req)
        if not instance_id:
            return webob.exc.HTTPNotFound()
        return self._proxy_request(instance_id, req)
    except Exception:
        LOG.exception(_LE("Unexpected error."))
        msg = ('An unknown error has occurred. '
               'Please try your request again.')
        return webob.exc.HTTPInternalServerError(
            explanation=six.text_type(msg))
def _update_config(self, instance, config):
    """Push *config* to *instance*, retrying up to CONF.max_retries times.

    :returns: True when the config was accepted, False when every
        attempt failed.
    """
    self.log.debug('Updating config for instance %s on resource %s',
                   instance.id_, self.resource.id)
    self.log.debug('New config: %r', config)
    total = cfg.CONF.max_retries
    for attempt in six.moves.range(total):
        try:
            self.resource.update_config(instance.management_address,
                                        config)
        except Exception:
            if attempt == total - 1:
                # Only log the traceback if we encounter it many times.
                self.log.exception(_LE('failed to update config'))
            else:
                self.log.debug('failed to update config, attempt %d',
                               attempt)
            time.sleep(cfg.CONF.retry_delay)
        else:
            self.log.info('Instance config updated')
            return True
    return False
def stop(self, worker_context):
    """Attempts to destroy the instance cluster

    :param worker_context:
    :returns:
    """
    self.log.info(_LI('Destroying instance'))
    self.resource.delete_ports(worker_context)

    if not self.instances:
        self.log.info(_LI('Instance(s) already destroyed.'))
        if self.state != states.GONE:
            self.state = states.DOWN
        return self.state

    try:
        self.instances.destroy(worker_context)
    except Exception:
        self.log.exception(_LE('Failed to stop instance(s)'))
    else:
        if self.state != states.GONE:
            self.state = states.DOWN
def _worker(inq, worker_factory, scheduler, proc_name):
    """Scheduler's worker process main function.

    Pulls (target, message) pairs from *inq* until a None sentinel
    arrives and dispatches each pair to the constructed worker.
    """
    daemon.ignore_signals()
    LOG.debug('starting worker process')
    worker = worker_factory(scheduler=scheduler, proc_name=proc_name)
    while True:
        try:
            data = inq.get()
        except IOError:
            # NOTE(dhellmann): Likely caused by a signal arriving
            # during processing, especially SIGCHLD.
            data = None
        if data is None:
            target, message = None, None
        else:
            target, message = data
        try:
            worker.handle_message(target, message)
        except Exception:
            # FIX: six.text_type instead of the Python-2-only
            # ``unicode`` builtin (matches the other copy in this file).
            LOG.exception(_LE('Error processing data %s'),
                          six.text_type(data))
        if data is None:
            break
    LOG.debug('exiting')
def _worker(inq, worker_factory, scheduler, proc_name):
    """Scheduler's worker process main function.

    Consumes (target, message) pairs from *inq*; a None payload is the
    stop sentinel (still dispatched once before exiting).
    """
    daemon.ignore_signals()
    LOG.debug('starting worker process')
    worker = worker_factory(scheduler=scheduler, proc_name=proc_name)
    while True:
        try:
            payload = inq.get()
        except IOError:
            # NOTE(dhellmann): Likely caused by a signal arriving
            # during processing, especially SIGCHLD.
            payload = None
        if payload is None:
            target = message = None
        else:
            target, message = payload
        try:
            worker.handle_message(target, message)
        except Exception:
            LOG.exception(_LE('Error processing data %s'),
                          six.text_type(payload))
        if payload is None:
            break
    LOG.debug('exiting')
def get_state_machines(self, message, worker_context):
    """Return the state machines and the queue for sending it messages for
    the logical resource being addressed by the message.
    """
    if (not message.resource or
            (message.resource and not message.resource.id)):
        LOG.error(_LE(
            'Cannot get state machine for message with '
            'no message.resource'))
        raise InvalidIncomingMessage()

    state_machines = []

    # Send to all of our resources.
    if message.resource.id == '*':
        LOG.debug('routing to all state machines')
        state_machines = self.state_machines.values()

    # Ignore messages to deleted resources.
    elif self.state_machines.has_been_deleted(message.resource.id):
        LOG.debug('dropping message for deleted resource')
        return []

    # Send to resources that have an ERROR status
    elif message.resource.id == 'error':
        state_machines = [
            sm for sm in self.state_machines.values()
            if sm.has_error()
        ]
        LOG.debug('routing to %d errored state machines',
                  len(state_machines))

    # Create a new state machine for this router.
    elif message.resource.id not in self.state_machines:
        LOG.debug('creating state machine for %s', message.resource.id)

        # load the driver
        if not message.resource.driver:
            # FIX: the adjacent string literals previously concatenated
            # to "...without specifyinga driver." — space added.
            LOG.error(_LE('cannot create state machine without '
                          'specifying a driver.'))
            return []

        driver_obj = drivers.get(message.resource.driver)(
            worker_context, message.resource.id)

        if not driver_obj:
            # this means the driver didn't load for some reason..
            # this might not be needed at all.
            LOG.debug('for some reason loading the driver failed')
            return []

        def deleter():
            self._delete_resource(message.resource)

        new_state_machine = state.Automaton(
            driver=driver_obj,
            resource_id=message.resource.id,
            tenant_id=self.tenant_id,
            delete_callback=deleter,
            bandwidth_callback=self._report_bandwidth,
            worker_context=worker_context,
            queue_warning_threshold=self._queue_warning_threshold,
            reboot_error_threshold=self._reboot_error_threshold,
        )
        self.state_machines[message.resource.id] = new_state_machine
        state_machines = [new_state_machine]

    # Send directly to an existing router.
    elif message.resource.id:
        state_machines = [self.state_machines[message.resource.id]]

    # Filter out any deleted state machines.
    return [
        machine for machine in state_machines
        if (not machine.deleted and
            not self.state_machines.has_been_deleted(machine.resource_id))
    ]
def _dispatch_command(self, target, message):
    # Dispatch an operator command message to the matching handler.
    # Each branch keys off instructions['command']; order matters only
    # for the deprecated router-* aliases, which are checked alongside
    # their resource-* replacements.
    if not self._should_process_command(message):
        return

    instructions = message.body
    if instructions['command'] == commands.WORKERS_DEBUG:
        self.report_status()

    # NOTE(adam_g): Drop 'router-debug' compat in M.
    elif (instructions['command'] == commands.RESOURCE_DEBUG or
          instructions['command'] == commands.ROUTER_DEBUG):

        # 'router_id' is the deprecated spelling of 'resource_id'.
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to debug resource with no id'))
            return
        reason = instructions.get('reason')
        if resource_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all resources with %r'),
                resource_id)
        else:
            LOG.info(_LI('Placing resource %s in debug mode (reason: %s)'),
                     resource_id, reason)
            self.db_api.enable_resource_debug(resource_id, reason)

    elif (instructions['command'] == commands.RESOURCE_MANAGE or
          instructions['command'] == commands.ROUTER_MANAGE):
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to manage resource with no id'))
            return
        try:
            self.db_api.disable_resource_debug(resource_id)
            LOG.info(_LI('Resuming management of resource %s'),
                     resource_id)
        except KeyError:
            # Resource was not in debug mode; nothing to disable.
            pass
        try:
            self._resource_locks[resource_id].release()
            LOG.info(_LI('Unlocked resource %s'), resource_id)
        except KeyError:
            # No lock exists for this resource.
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass

    elif instructions['command'] in EVENT_COMMANDS:
        # Forced CRUD event for a resource we already manage: wrap the
        # instruction in a new Event and route it through
        # handle_message() so normal locking applies.
        resource_id = instructions.get('resource_id')
        sm = self._find_state_machine_by_resource_id(resource_id)
        if not sm:
            LOG.debug(
                'Will not process command, no managed state machine '
                'found for resource %s', resource_id)
            return
        new_res = event.Resource(
            id=resource_id,
            driver=sm.driver.RESOURCE_NAME,
            tenant_id=sm.tenant_id)
        new_msg = event.Event(
            resource=new_res,
            crud=EVENT_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_res)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_res)

    # NOTE(adam_g): This is here to support the deprecated old format of
    #               sending commands to specific routers and can be
    #               removed once the CLI component is dropped in M.
    elif instructions['command'] in DEPRECATED_ROUTER_COMMANDS:
        new_rsc = event.Resource(
            driver=drivers.router.Router.RESOURCE_NAME,
            id=message.body.get('router_id'),
            tenant_id=message.body.get('tenant_id'),
        )
        new_msg = event.Event(
            resource=new_rsc,
            crud=DEPRECATED_ROUTER_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_rsc)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_rsc)

    elif instructions['command'] == commands.TENANT_DEBUG:
        tenant_id = instructions['tenant_id']
        reason = instructions.get('reason')
        if tenant_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all tenants with %r'),
                tenant_id)
        else:
            LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                     tenant_id, reason)
            self.db_api.enable_tenant_debug(tenant_id, reason)

    elif instructions['command'] == commands.TENANT_MANAGE:
        tenant_id = instructions['tenant_id']
        try:
            self.db_api.disable_tenant_debug(tenant_id)
            LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
        except KeyError:
            # Tenant was not in debug mode; nothing to disable.
            pass

    elif instructions['command'] == commands.GLOBAL_DEBUG:
        # 'enabled' is expected to be exactly 1 (on) or 0 (off).
        enable = instructions.get('enabled')
        reason = instructions.get('reason')
        if enable == 1:
            LOG.info('Enabling global debug mode (reason: %s)', reason)
            self.db_api.enable_global_debug(reason)
        elif enable == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        instructions)
    elif instructions['command'] == commands.CONFIG_RELOAD:
        try:
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
        else:
            cfg.CONF.log_opt_values(LOG, INFO)
    else:
        LOG.warning(_LW('Unrecognized command: %s'), instructions)
def _thread_target(self):
    """This method runs in each worker thread.

    Pulls state machines from the work queue, runs their update()
    under this thread's own WorkerContext, and requeues them when
    they report more work.  Returns the context for tests.
    """
    my_id = threading.current_thread().name
    LOG.debug('starting thread')
    # Use a separate context from the one we use when receiving
    # messages and talking to the tenant router manager because we
    # are in a different thread and the clients are not
    # thread-safe.
    context = WorkerContext(self.management_address)
    while self._keep_going:
        try:
            # Try to get a state machine from the work queue. If
            # there's nothing to do, we will block for a while.
            self._thread_status[my_id] = 'waiting for task'
            sm = self.work_queue.get(timeout=10)
        except Queue.Empty:
            continue
        if sm is None:
            LOG.info(_LI('received stop message'))
            break

        # Make sure we didn't already have some updates under way
        # for a router we've been told to ignore for debug mode.
        should_ignore, reason = \
            self.db_api.resource_in_debug(sm.resource_id)
        if should_ignore:
            LOG.debug('Skipping update of resource %s in debug mode. '
                      '(reason: %s)', sm.resource_id, reason)
            continue

        # In the event that a rebalance took place while processing an
        # event, it may have been put back into the work queue. Check
        # the hash table once more to find out if we still manage it
        # and do some cleanup if not.
        if cfg.CONF.coordination.enabled:
            target_hosts = self.hash_ring_mgr.ring.get_hosts(
                sm.resource_id)
            if self.host not in target_hosts:
                LOG.debug('Skipping update of router %s, it no longer '
                          'maps here.', sm.resource_id)
                trm = self.tenant_managers[sm.tenant_id]
                trm.unmanage_resource(sm.resource_id)
                self.work_queue.task_done()
                with self.lock:
                    self._release_resource_lock(sm)
                continue

        # FIXME(dhellmann): Need to look at the router to see if
        # it belongs to a tenant which is in debug mode, but we
        # don't have that data in the sm, yet.
        LOG.debug('performing work on %s for tenant %s',
                  sm.resource_id, sm.tenant_id)
        try:
            self._thread_status[my_id] = 'updating %s' % sm.resource_id
            sm.update(context)
        except Exception:
            # FIX: catch Exception instead of a bare except so
            # SystemExit/KeyboardInterrupt are not swallowed.
            LOG.exception(_LE('could not complete update for %s'),
                          sm.resource_id)
        finally:
            self._thread_status[my_id] = (
                'finalizing task for %s' % sm.resource_id
            )
            self.work_queue.task_done()
            with self.lock:
                # Release the lock that prevents us from adding
                # the state machine back into the queue. If we
                # find more work, we will re-acquire it. If we do
                # not find more work, we hold the primary work
                # queue lock so the main thread cannot put the
                # state machine back into the queue until we
                # release that lock.
                self._release_resource_lock(sm)

                # The state machine has indicated that it is done
                # by returning. If there is more work for it to
                # do, reschedule it by placing it at the end of
                # the queue.
                if sm.has_more_work():
                    LOG.debug('%s has more work, returning to work queue',
                              sm.resource_id)
                    self._add_resource_to_work_queue(sm)
                else:
                    LOG.debug('%s has no more work', sm.resource_id)
    # Return the context object so tests can look at it
    self._thread_status[my_id] = 'exiting'
    return context
def configure(self, worker_context):
    """Pushes config to instance

    :param worker_context:
    :param failure_state:
    :param attempts:
    :returns: the resulting state (GONE, REPLUG, CONFIGURED, or RESTART)
    """
    self.log.debug('Begin instance config')
    self.state = states.UP
    attempts = cfg.CONF.max_retries

    if self.driver.get_state(worker_context) == states.GONE:
        return states.GONE

    interfaces = self.driver.get_interfaces(
        self.instance_info.management_address)

    # If the expected ports are not visible inside the instance yet,
    # ask the state machine to replug before configuring.
    if not self._verify_interfaces(self.driver.ports, interfaces):
        self.log.debug("Interfaces aren't plugged as expected.")
        self.state = states.REPLUG
        return self.state

    # TODO(mark): We're in the first phase of VRRP, so we need
    # map the interface to the network ID.
    # Eventually we'll send VRRP data and real interface data
    port_mac_to_net = {
        p.mac_address: p.network_id
        for p in self.instance_info.ports
    }
    # Add in the management port
    mgt_port = self.instance_info.management_port
    port_mac_to_net[mgt_port.mac_address] = mgt_port.network_id
    # this is a network to logical interface id
    iface_map = {
        port_mac_to_net[i['lladdr']]: i['ifname']
        for i in interfaces if i['lladdr'] in port_mac_to_net
    }

    # sending all the standard config over to the driver for final updates
    config = self.driver.build_config(
        worker_context,
        mgt_port,
        iface_map
    )
    self.log.debug('preparing to update config to %r', config)

    # Retry the config push; sleep between attempts, and only log the
    # full traceback on the final failure.
    for i in six.moves.range(attempts):
        try:
            self.driver.update_config(
                self.instance_info.management_address, config)
        except Exception:
            if i == attempts - 1:
                # Only log the traceback if we encounter it many times.
                self.log.exception(_LE('failed to update config'))
            else:
                self.log.debug(
                    'failed to update config, attempt %d',
                    i
                )
            time.sleep(cfg.CONF.retry_delay)
        else:
            self.state = states.CONFIGURED
            self.log.info('Instance config updated')
            return self.state
    else:
        # Every attempt failed; ask the state machine to restart.
        self.state = states.RESTART
        return self.state
def _thread_target(self):
    """This method runs in each worker thread.

    Pulls state machines off the shared work queue and runs their
    update cycle, honoring per-resource debug mode and (when
    coordination is enabled) the hash-ring ownership of the resource.
    Exits when a ``None`` sentinel is dequeued or ``self._keep_going``
    becomes false.

    :returns: the per-thread ``WorkerContext`` (so tests can inspect it)
    """
    my_id = threading.current_thread().name
    LOG.debug('starting thread')
    # Use a separate context from the one we use when receiving
    # messages and talking to the tenant router manager because we
    # are in a different thread and the clients are not
    # thread-safe.
    context = WorkerContext(self.management_address)
    while self._keep_going:
        try:
            # Try to get a state machine from the work queue. If
            # there's nothing to do, we will block for a while.
            self._thread_status[my_id] = 'waiting for task'
            sm = self.work_queue.get(timeout=10)
        except Queue.Empty:
            continue
        if sm is None:
            LOG.info(_LI('received stop message'))
            break

        # Make sure we didn't already have some updates under way
        # for a router we've been told to ignore for debug mode.
        should_ignore, reason = \
            self.db_api.resource_in_debug(sm.resource_id)
        if should_ignore:
            LOG.debug(
                'Skipping update of resource %s in debug mode. '
                '(reason: %s)', sm.resource_id, reason)
            continue

        # In the event that a rebalance took place while processing an
        # event, it may have been put back into the work queue. Check
        # the hash table once more to find out if we still manage it
        # and do some cleanup if not.
        if cfg.CONF.coordination.enabled:
            target_hosts = self.hash_ring_mgr.ring.get_hosts(
                sm.resource_id)
            if self.host not in target_hosts:
                LOG.debug(
                    'Skipping update of router %s, it no longer '
                    'maps here.', sm.resource_id)
                trm = self.tenant_managers[sm.tenant_id]
                trm.unmanage_resource(sm.resource_id)
                self.work_queue.task_done()
                with self.lock:
                    self._release_resource_lock(sm)
                continue

        # FIXME(dhellmann): Need to look at the router to see if
        # it belongs to a tenant which is in debug mode, but we
        # don't have that data in the sm, yet.
        LOG.debug('performing work on %s for tenant %s',
                  sm.resource_id, sm.tenant_id)
        try:
            self._thread_status[my_id] = 'updating %s' % sm.resource_id
            sm.update(context)
        # FIX: a bare "except:" also swallowed SystemExit and
        # KeyboardInterrupt, which could keep the worker alive during
        # shutdown. Exception preserves the log-and-continue intent.
        except Exception:
            LOG.exception(_LE('could not complete update for %s'),
                          sm.resource_id)
        finally:
            self._thread_status[my_id] = (
                'finalizing task for %s' % sm.resource_id
            )
            self.work_queue.task_done()
            with self.lock:
                # Release the lock that prevents us from adding
                # the state machine back into the queue. If we
                # find more work, we will re-acquire it. If we do
                # not find more work, we hold the primary work
                # queue lock so the main thread cannot put the
                # state machine back into the queue until we
                # release that lock.
                self._release_resource_lock(sm)
                # The state machine has indicated that it is done
                # by returning. If there is more work for it to
                # do, reschedule it by placing it at the end of
                # the queue.
                if sm.has_more_work():
                    LOG.debug('%s has more work, returning to work queue',
                              sm.resource_id)
                    self._add_resource_to_work_queue(sm)
                else:
                    LOG.debug('%s has no more work', sm.resource_id)
    # Return the context object so tests can look at it
    self._thread_status[my_id] = 'exiting'
    return context
def _dispatch_command(self, target, message):
    """Handle an out-of-band admin/debug command message.

    Dispatches on ``message.body['command']``: worker status reports,
    per-resource and per-tenant debug toggles, forced resource events,
    global debug mode, and configuration reload. Several branches keep
    backward compatibility with the deprecated router-specific command
    names (slated for removal in the 'M' cycle).

    :param target: messaging target the command arrived on (unused here)
    :param message: incoming event whose ``body`` holds the instructions
    """
    # Guard: ignore commands this worker should not act on.
    if not self._should_process_command(message):
        return

    instructions = message.body
    if instructions['command'] == commands.WORKERS_DEBUG:
        self.report_status()

    # NOTE(adam_g): Drop 'router-debug' compat in M.
    elif (instructions['command'] == commands.RESOURCE_DEBUG or
          instructions['command'] == commands.ROUTER_DEBUG):

        # Accept either the new 'resource_id' key or the deprecated
        # 'router_id' key.
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(
                _LW('Ignoring instruction to debug resource with no id'))
            return
        reason = instructions.get('reason')
        if resource_id in commands.WILDCARDS:
            # Refuse wildcard targets for debug mode.
            LOG.warning(
                _LW('Ignoring instruction to debug all resources with %r'),
                resource_id)
        else:
            LOG.info(_LI('Placing resource %s in debug mode (reason: %s)'),
                     resource_id, reason)
            self.db_api.enable_resource_debug(resource_id, reason)

    elif (instructions['command'] == commands.RESOURCE_MANAGE or
          instructions['command'] == commands.ROUTER_MANAGE):
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(
                _LW('Ignoring instruction to manage resource with no id'))
            return
        # Best-effort: KeyError means the resource was not in debug
        # mode / has no lock entry, which is fine.
        try:
            self.db_api.disable_resource_debug(resource_id)
            LOG.info(_LI('Resuming management of resource %s'),
                     resource_id)
        except KeyError:
            pass
        try:
            self._resource_locks[resource_id].release()
            LOG.info(_LI('Unlocked resource %s'), resource_id)
        except KeyError:
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass

    elif instructions['command'] in EVENT_COMMANDS:
        # Force a CRUD event onto an already-managed resource.
        resource_id = instructions.get('resource_id')
        sm = self._find_state_machine_by_resource_id(resource_id)
        if not sm:
            LOG.debug(
                'Will not process command, no managed state machine '
                'found for resource %s', resource_id)
            return
        new_res = event.Resource(id=resource_id,
                                 driver=sm.resource.RESOURCE_NAME,
                                 tenant_id=sm.tenant_id)
        new_msg = event.Event(
            resource=new_res,
            crud=EVENT_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_res)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_res)

    # NOTE(adam_g): This is here to support the deprecated old format of
    #               sending commands to specific routers and can be
    #               removed once the CLI component is dropped in M.
    elif instructions['command'] in DEPRECATED_ROUTER_COMMANDS:
        new_rsc = event.Resource(
            driver=drivers.router.Router.RESOURCE_NAME,
            id=message.body.get('router_id'),
            tenant_id=message.body.get('tenant_id'),
        )
        new_msg = event.Event(
            resource=new_rsc,
            crud=DEPRECATED_ROUTER_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_rsc)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_rsc)

    elif instructions['command'] == commands.TENANT_DEBUG:
        tenant_id = instructions['tenant_id']
        reason = instructions.get('reason')
        if tenant_id in commands.WILDCARDS:
            # Refuse wildcard targets for tenant debug mode.
            LOG.warning(
                _LW('Ignoring instruction to debug all tenants with %r'),
                tenant_id)
        else:
            LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                     tenant_id, reason)
            self.db_api.enable_tenant_debug(tenant_id, reason)

    elif instructions['command'] == commands.TENANT_MANAGE:
        tenant_id = instructions['tenant_id']
        try:
            self.db_api.disable_tenant_debug(tenant_id)
            LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
        except KeyError:
            # Tenant was not in debug mode; nothing to do.
            pass

    elif instructions['command'] == commands.GLOBAL_DEBUG:
        # 'enabled' is an int flag: 1 enables, 0 disables; anything
        # else is rejected.
        enable = instructions.get('enabled')
        reason = instructions.get('reason')
        if enable == 1:
            LOG.info('Enabling global debug mode (reason: %s)', reason)
            self.db_api.enable_global_debug(reason)
        elif enable == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        instructions)

    elif instructions['command'] == commands.CONFIG_RELOAD:
        # Re-read configuration files; on success dump the effective
        # option values to the log at INFO.
        try:
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
        else:
            cfg.CONF.log_opt_values(LOG, INFO)

    else:
        LOG.warning(_LW('Unrecognized command: %s'), instructions)
def get_state_machines(self, message, worker_context):
    """Return the state machines and the queue for sending it messages
    for the logical resource being addressed by the message.

    Routing rules, in order: ``'*'`` targets every managed state
    machine; deleted resources are dropped; ``'error'`` targets only
    errored machines; an unknown id creates a new state machine
    (loading the driver from the message); a known id targets that
    machine directly. Deleted machines are always filtered from the
    result.

    :param message: incoming event; ``message.resource.id`` selects
                    the target(s)
    :param worker_context: context used to load the resource driver
    :returns: list of state machines (possibly empty)
    :raises InvalidIncomingMessage: if the message has no resource id
    """
    if (not message.resource or
       (message.resource and not message.resource.id)):
        LOG.error(
            _LE('Cannot get state machine for message with '
                'no message.resource'))
        raise InvalidIncomingMessage()

    state_machines = []

    # Send to all of our resources.
    if message.resource.id == '*':
        LOG.debug('routing to all state machines')
        state_machines = self.state_machines.values()

    # Ignore messages to deleted resources.
    elif self.state_machines.has_been_deleted(message.resource.id):
        LOG.debug('dropping message for deleted resource')
        return []

    # Send to resources that have an ERROR status
    elif message.resource.id == 'error':
        state_machines = [
            sm for sm in self.state_machines.values()
            if sm.has_error()
        ]
        LOG.debug('routing to %d errored state machines',
                  len(state_machines))

    # Create a new state machine for this router.
    elif message.resource.id not in self.state_machines:
        LOG.debug('creating state machine for %s', message.resource.id)

        # load the driver
        if not message.resource.driver:
            # FIX: the adjacent string literals previously concatenated
            # to "...without specifyinga driver." -- missing space added.
            LOG.error(
                _LE('cannot create state machine without specifying '
                    'a driver.'))
            return []

        resource_obj = self._load_resource_from_message(
            worker_context, message)

        if not resource_obj:
            # this means the driver didn't load for some reason..
            # this might not be needed at all.
            LOG.debug('for some reason loading the driver failed')
            return []

        def deleter():
            # Callback the state machine invokes when the resource is
            # fully deleted.
            self._delete_resource(message.resource)

        new_state_machine = state.Automaton(
            resource=resource_obj,
            tenant_id=self.tenant_id,
            delete_callback=deleter,
            bandwidth_callback=self._report_bandwidth,
            worker_context=worker_context,
            queue_warning_threshold=self._queue_warning_threshold,
            reboot_error_threshold=self._reboot_error_threshold,
        )
        self.state_machines[message.resource.id] = new_state_machine
        state_machines = [new_state_machine]

    # Send directly to an existing router.
    elif message.resource.id:
        state_machines = [self.state_machines[message.resource.id]]

    # Filter out any deleted state machines.
    return [
        machine for machine in state_machines
        if (not machine.deleted and
            not self.state_machines.has_been_deleted(
                machine.resource.id))
    ]