def stop(self, worker_context):
    """Attempts to destroy the instance with configured timeout.

    Deletes the driver's ports, asks nova to destroy the instance, then
    polls until nova no longer reports it or ``boot_timeout`` elapses.

    :param worker_context: worker context providing the nova client
    :returns: the resulting state when the instance is gone; falls
              through with no return value on timeout
    """
    self.log.info(_LI('Destroying instance'))

    self.driver.delete_ports(worker_context)

    if not self.instance_info:
        # Nothing booted (or already cleaned up); just normalize state.
        self.log.info(_LI('Instance already destroyed.'))
        if self.state != states.GONE:
            self.state = states.DOWN
        return self.state

    try:
        worker_context.nova_client.destroy_instance(self.instance_info)
    except Exception:
        # Best-effort delete: log and keep polling below anyway.
        self.log.exception(_LE('Error deleting router instance'))

    # Poll nova until the instance actually disappears or we exceed
    # the configured boot_timeout.
    start = time.time()
    i = 0
    while time.time() - start < cfg.CONF.boot_timeout:
        i += 1
        if not worker_context.nova_client.\
                get_instance_by_id(self.instance_info.id_):
            if self.state != states.GONE:
                self.state = states.DOWN
            return self.state
        self.log.debug('Router has not finished stopping')
        time.sleep(cfg.CONF.retry_delay)
    # NOTE(review): on timeout the state is left unchanged and nothing
    # is returned -- presumably the caller retries; confirm intended.
    self.log.error(_LE(
        'Router failed to stop within %d secs'),
        cfg.CONF.boot_timeout)
def send_message(self, message):
    """Accept a worker message into the state machine queue.

    Returns True when the message was queued, False when it was
    dropped (machine deleted, or POLL while the resource is in ERROR).
    """
    if self.deleted:
        # A deleted machine ignores everything that arrives afterwards.
        self.driver.log.debug(
            "deleted state machine, ignoring incoming message %s",
            message)
        return False

    # NOTE(dhellmann): This check is largely redundant with the
    # one in CalcAction.transition() but it may allow us to avoid
    # adding poll events to the queue at all, and therefore cut
    # down on the number of times a worker thread wakes up to
    # process something on a router that isn't going to actually
    # do any work.
    if message.crud == POLL and self.instance.state == states.ERROR:
        self.driver.log.info(
            _LI("Resource status is ERROR, ignoring POLL message: %s"),
            message)
        return False

    if message.crud == REBUILD:
        custom_image = message.body.get("image_uuid")
        if custom_image:
            self.driver.log.info(
                _LI("Resource is being REBUILT with custom image %s"),
                custom_image)
            self.image_uuid = custom_image
        else:
            self.image_uuid = self.driver.image_uuid

    self._queue.append(message.crud)
    depth = len(self._queue)
    # Escalate to a warning once the backlog passes the threshold.
    log_fn = (self.driver.log.warning
              if depth > self._queue_warning_threshold
              else self.driver.log.debug)
    log_fn(_LW("incoming message brings queue length to %s"), depth)
    return True
def __init__(self, id_, name, tenant_id, network_id, ip_version,
             cidr, gateway_ip, enable_dhcp, dns_nameservers,
             host_routes, ipv6_ra_mode):
    """Wrap raw neutron subnet fields, validating cidr and gateway_ip.

    Raises ValueError for an unparseable CIDR; a bad gateway_ip is
    tolerated, logged, and stored as None.
    """
    self.id = id_
    self.name = name
    self.tenant_id = tenant_id
    self.network_id = network_id
    self.ip_version = ip_version
    self.enable_dhcp = enable_dhcp
    self.dns_nameservers = dns_nameservers
    self.host_routes = host_routes
    self.ipv6_ra_mode = ipv6_ra_mode

    # An invalid CIDR is fatal for the subnet...
    try:
        self.cidr = netaddr.IPNetwork(cidr)
    except (TypeError, netaddr.AddrFormatError) as err:
        raise ValueError(
            _('Invalid CIDR %r for subnet %s of network %s: %s') % (
                cidr, id_, network_id, err,
            ))

    # ...but a bad gateway address is merely logged.
    try:
        self.gateway_ip = netaddr.IPAddress(gateway_ip)
    except (TypeError, netaddr.AddrFormatError) as err:
        self.gateway_ip = None
        LOG.info(_LI('Bad gateway_ip on subnet %s: %r (%s)'),
                 id_, gateway_ip, err)
def launch_instances(self, count, driver):
    """Launch ``count`` new pooled instances for a resource driver.

    :param count: number of instances to boot
    :param driver: the resource driver whose image/flavor to use
    :returns: None
    """
    # BUG FIX: arguments were passed as (RESOURCE_NAME, count), but the
    # format string reads "Launching <count> <name> instances."
    LOG.info(_LI(
        'Launching %s %s instances.'), count, driver.RESOURCE_NAME)
    for i in range(0, count):
        # NOTE: Use a fake UUID so astara-neutron's name matching still
        # catches this port as an astara port. This can be avoided if
        # we use a mgt security group in the future.
        mgt_port = self.ctxt.neutron_client.create_management_port(
            '00000000-0000-0000-0000-000000000000')
        nics = [{
            'net-id': mgt_port.network_id,
            'v4-fixed-ip': '',
            'port-id': mgt_port.id}]
        instance_name = INSTANCE_FREE % {
            'resource_name': driver.RESOURCE_NAME
        }
        image = self.images[driver.RESOURCE_NAME]
        flavor = self.flavors[driver.RESOURCE_NAME]
        self.ctxt.nova_client.client.servers.create(
            name=instance_name,
            image=image,
            flavor=flavor,
            nics=nics,
            config_drive=True,
            userdata=nova.format_userdata(mgt_port),
        )
def start(self):
    """The pool manager main loop.

    The bulk of the algorithm exists in the 'unused_instances'
    property. This main loop simply checks for a deficit in the pool
    and dispatches a 'launch_instances' call when a deficit needs to
    be filled.
    """
    # Runs forever; sleeps poll_interval seconds between passes.
    while True:
        cur_pools = self.unused_instances

        # Build a one-line "name:current/target" report per driver.
        report = []
        for driver in self.drivers:
            report.append(
                '%s:%s/%s' % (driver.RESOURCE_NAME,
                              len(cur_pools[driver.RESOURCE_NAME]),
                              self.pool_size))
        LOG.debug('Current pools: %s' % ' '.join(report))

        for driver in self.drivers:
            cur_pool = cur_pools[driver.RESOURCE_NAME]
            deficit = self.pool_size - len(cur_pool)
            if deficit:
                LOG.info(_LI(
                    'Need to launch %s more %s instance(s).'),
                    deficit, driver.RESOURCE_NAME)
                self.launch_instances(
                    driver=driver, count=deficit)
        time.sleep(self.poll_interval)
def __init__(self, id_, name, tenant_id, network_id, ip_version,
             cidr, gateway_ip, enable_dhcp, dns_nameservers,
             host_routes, ipv6_ra_mode):
    """Wrap raw neutron subnet fields.

    :raises ValueError: when ``cidr`` cannot be parsed; a bad
        ``gateway_ip`` is tolerated (logged and stored as None).
    """
    self.id = id_
    self.name = name
    self.tenant_id = tenant_id
    self.network_id = network_id
    self.ip_version = ip_version

    # An invalid CIDR makes the whole subnet unusable -> raise.
    try:
        self.cidr = netaddr.IPNetwork(cidr)
    except (TypeError, netaddr.AddrFormatError) as e:
        raise ValueError(
            _('Invalid CIDR %r for subnet %s of network %s: %s') % (
                cidr, id_, network_id, e,
            )
        )

    # A bad gateway is non-fatal; record None and log.
    try:
        self.gateway_ip = netaddr.IPAddress(gateway_ip)
    except (TypeError, netaddr.AddrFormatError) as e:
        self.gateway_ip = None
        LOG.info(_LI(
            'Bad gateway_ip on subnet %s: %r (%s)'),
            id_, gateway_ip, e)

    self.enable_dhcp = enable_dhcp
    self.dns_nameservers = dns_nameservers
    self.host_routes = host_routes
    self.ipv6_ra_mode = ipv6_ra_mode
def run(self, ip_address, port=cfg.CONF.astara_metadata_port):
    """Run the MetadataProxy.

    :param ip_address: the ip address to bind to for incoming requests
    :param port: the port to bind to for incoming requests
    :returns: returns nothing
    """
    # NOTE(review): the default for `port` is evaluated once at import
    # time, so later config changes are not reflected -- confirm intended.
    app = MetadataProxyHandler()
    # Retry binding up to 5 times with a linearly increasing backoff.
    for i in six.moves.range(5):
        LOG.info(_LI('Starting the metadata proxy on %s:%s'),
                 ip_address, port)
        try:
            sock = eventlet.listen((ip_address, port),
                                   family=socket.AF_INET6,
                                   backlog=128)
        except socket.error as err:
            # errno 99 == EADDRNOTAVAIL; anything else is fatal.
            if err.errno != 99:
                raise
            LOG.warning(_LW('Could not create metadata proxy socket: %s'),
                        err)
            LOG.warning(_LW('Sleeping %s before trying again'), i + 1)
            eventlet.sleep(i + 1)
        else:
            break
    else:
        # All five attempts failed.
        raise RuntimeError(
            _('Could not establish metadata proxy socket on %s:%s') %
            (ip_address, port))
    eventlet.wsgi.server(sock, app, custom_pool=self.pool, log=LOG)
def start(self):
    """The pool manager main loop.

    The bulk of the algorithm exists in the 'unused_instances'
    property. This main loop simply checks for a deficit in the pool
    and dispatches a 'launch_instances' call when a deficit needs to
    be filled.
    """
    # Runs forever; sleeps poll_interval seconds between passes.
    while True:
        cur_pools = self.unused_instances

        # One "name:current/target" fragment per driver for the log.
        report = []
        for driver in self.drivers:
            report.append(
                '%s:%s/%s' % (driver.RESOURCE_NAME, len(
                    cur_pools[driver.RESOURCE_NAME]), self.pool_size))
        LOG.debug('Current pools: %s' % ' '.join(report))

        for driver in self.drivers:
            cur_pool = cur_pools[driver.RESOURCE_NAME]
            deficit = self.pool_size - len(cur_pool)
            if deficit:
                LOG.info(_LI('Need to launch %s more %s instance(s).'),
                         deficit, driver.RESOURCE_NAME)
                self.launch_instances(driver=driver, count=deficit)
        time.sleep(self.poll_interval)
def launch_instances(self, count, driver):
    """Launch ``count`` new pooled instances for a resource driver.

    :param count: number of instances to boot
    :param driver: the resource driver whose image/flavor to use
    :returns: None
    """
    # BUG FIX: arguments were passed as (RESOURCE_NAME, count) while the
    # format string reads "Launching <count> <name> instances."
    LOG.info(_LI('Launching %s %s instances.'),
             count, driver.RESOURCE_NAME)
    for i in range(0, count):
        # NOTE: Use a fake UUID so astara-neutron's name matching still
        # catches this port as an astara port. This can be avoided if
        # we use a mgt security group in the future.
        mgt_port = self.ctxt.neutron_client.create_management_port(
            '00000000-0000-0000-0000-000000000000')
        nics = [{
            'net-id': mgt_port.network_id,
            'v4-fixed-ip': '',
            'port-id': mgt_port.id
        }]
        instance_name = INSTANCE_FREE % {
            'resource_name': driver.RESOURCE_NAME
        }
        image = self.images[driver.RESOURCE_NAME]
        flavor = self.flavors[driver.RESOURCE_NAME]
        self.ctxt.nova_client.client.servers.create(
            name=instance_name,
            image=image,
            flavor=flavor,
            nics=nics,
            config_drive=True,
            userdata=nova.format_userdata(mgt_port),
        )
def boot(self, worker_context):
    """Boots the instances with driver pre/post boot hooks.

    :returns: None
    """
    self.log.info('Booting %s' % self.resource.RESOURCE_NAME)
    # Preserve DEGRADED so a partially-up cluster isn't reset to DOWN.
    if self.state != states.DEGRADED:
        self.state = states.DOWN
    self._boot_counter.start()

    # driver preboot hook
    self.resource.pre_boot(worker_context)

    try:
        self.instances.create(worker_context)
        if not self.instances:
            self.log.info(_LI('Previous instances are still deleting'))
            # Reset the boot counter, causing the state machine to start
            # again with a new Instance.
            self.reset_boot_counter()
            return
    except:
        # NOTE(review): bare except also catches SystemExit and
        # KeyboardInterrupt -- consider `except Exception`.
        self.log.exception(_LE('Instances failed to start boot'))
    else:
        self.state = states.BOOTING

    # driver post boot hook
    # NOTE(review): runs even when create() raised above -- confirm this
    # is the intended recovery path.
    self.resource.post_boot(worker_context)
def get_instance(self, resource_type, name, management_port=None,
                 instance_ports=None):
    """Get an instance from the pool.

    This involves popping it out of the pool, updating its name and
    attaching any ports.

    :param resource_type: The str driver name of the resource
    :param name: The requested name of the instance
    :param management_port: The management port dict that was created
        for the instance by the RUG.  NOTE(review): not referenced in
        this body -- the management port is re-read from neutron below.
    :param instance_ports: A list of dicts of ports to be attached to
        instance upon reservation.
    :returns: A tuple containing (novaclient server object for the
        reserved server, a port object for the management port, a list
        of port objects that were attached the server)
    :raises PezPoolExhausted: when no unused instance is available
    """
    instance_ports = instance_ports or []

    # Reserve the first unused instance for this resource type.
    try:
        server = self.unused_instances[resource_type][0]
    except IndexError:
        raise PezPoolExhausted()

    LOG.info(_LI('Renaming instance %s to %s'), server.name, name)
    server = self.ctxt.nova_client.client.servers.update(server,
                                                         name=name)

    for port in instance_ports:
        LOG.info(_LI('Attaching instance port %s to %s (%s)'),
                 port['id'], server.name, server.id)
        self.ctxt.nova_client.client.servers.interface_attach(
            server=server, port_id=port['id'], net_id=None,
            fixed_ip=None)

    # Re-query neutron for the authoritative port objects.
    mgt_port, instance_ports = (
        self.ctxt.neutron_client.get_ports_for_instance(server.id))

    return (
        self.ctxt.nova_client.client.servers.get(server.id),
        mgt_port,
        instance_ports,
    )
def get_instance(self, resource_type, name, management_port=None,
                 instance_ports=None):
    """Get an instance from the pool.

    This involves popping it out of the pool, updating its name and
    attaching any ports.

    :param resource_type: The str driver name of the resource
    :param name: The requested name of the instance
    :param management_port: The management port dict that was created
        for the instance by the RUG.  NOTE(review): not referenced in
        this body -- the management port is re-read from neutron below.
    :param instance_ports: A list of dicts of ports to be attached to
        instance upon reservation.
    :returns: A tuple containing (novaclient server object for the
        reserved server, a port object for the management port, a list
        of port objects that were attached the server)
    :raises PezPoolExhausted: when no unused instance is available
    """
    instance_ports = instance_ports or []

    # Reserve the first unused instance for this resource type.
    try:
        server = self.unused_instances[resource_type][0]
    except IndexError:
        raise PezPoolExhausted()

    LOG.info(_LI('Renaming instance %s to %s'), server.name, name)
    server = self.ctxt.nova_client.client.servers.update(
        server, name=name)

    for port in instance_ports:
        LOG.info(_LI('Attaching instance port %s to %s (%s)'),
                 port['id'], server.name, server.id)
        self.ctxt.nova_client.client.servers.interface_attach(
            server=server, port_id=port['id'], net_id=None,
            fixed_ip=None)

    # Re-query neutron for the authoritative port objects.
    mgt_port, instance_ports = (
        self.ctxt.neutron_client.get_ports_for_instance(server.id)
    )

    return (
        self.ctxt.nova_client.client.servers.get(server.id),
        mgt_port,
        instance_ports,
    )
def run(self):
    """Run the heartbeat loop until the coordinator signals shutdown.

    Exits cleanly when the backend raises CoordinatorDone.
    """
    coordinator = self._coordinator
    try:
        while True:
            coordinator.heartbeat()
            coordinator.run_watchers()
            time.sleep(self.heartbeat_interval)
    except CoordinatorDone:
        LOG.info(_LI('Stopping RUG coordinator.'))
def _check_outdated_instances(self, pools):
    """Delete pooled servers whose image or flavor no longer matches.

    :param pools: mapping of resource name -> list of nova servers
    :returns: None
    """
    outdated_instances = []
    for resource, pool in pools.items():
        for server in pool:
            expected_image = str(self.images[resource])
            expected_flavor = str(self.flavors[resource])
            if server.image['id'] != expected_image:
                # BUG FIX: previously logged self.image_uuid, which is
                # not the per-resource value compared above.
                LOG.info(
                    _LI('Deleting instance %s with outdated image, '
                        '%s != %s'),
                    server.id, server.image['id'], expected_image)
                outdated_instances.append(server)
            elif server.flavor['id'] != expected_flavor:
                # BUG FIX: previously logged self.flavor (see above).
                LOG.info(
                    _LI('Deleting instance %s with outdated flavor, '
                        '%s != %s'),
                    server.id, server.flavor['id'], expected_flavor)
                outdated_instances.append(server)
    # Plain loop: a list comprehension used only for side effects
    # builds a throwaway list.
    for instance in outdated_instances:
        self.delete_instance(instance.id)
def main(argv=sys.argv[1:]):
    """Entry point for the astara-pez service launcher."""
    ak_cfg.parse_config(argv)
    log.setup(CONF, 'astara-pez')
    CONF.log_opt_values(LOG, logging.INFO)

    LOG.info(_LI("Starting Astara Pez service."))

    pez_service = PezService()
    service.launch(CONF, pez_service).wait()
def _check_outdated_instances(self, pools):
    """Delete pooled servers whose image or flavor no longer matches.

    :param pools: mapping of resource name -> list of nova servers
    :returns: None
    """
    outdated_instances = []
    for resource, pool in pools.items():
        for server in pool:
            expected_image = str(self.images[resource])
            expected_flavor = str(self.flavors[resource])
            if server.image['id'] != expected_image:
                # BUG FIX: previously logged self.image_uuid instead of
                # the per-resource value being compared.
                LOG.info(_LI(
                    'Deleting instance %s with outdated image, '
                    '%s != %s'),
                    server.id, server.image['id'], expected_image)
                outdated_instances.append(server)
            elif server.flavor['id'] != expected_flavor:
                # BUG FIX: previously logged self.flavor (see above).
                LOG.info(_LI(
                    'Deleting instance %s with outdated flavor, '
                    '%s != %s'),
                    server.id, server.flavor['id'], expected_flavor)
                outdated_instances.append(server)
    # Plain loop instead of a side-effect list comprehension.
    for instance in outdated_instances:
        self.delete_instance(instance.id)
def update_loadbalancer_status(self, loadbalancer_id, status):
    """Best-effort update of a load balancer's status in neutron.

    :param loadbalancer_id: id of the load balancer to update
    :param status: the new status value
    :returns: None
    """
    try:
        self.api_client.update_loadbalancer_status(loadbalancer_id,
                                                   status)
    except Exception as e:
        # We don't want to die just because we can't tell neutron
        # what the status of the router should be. Log the error
        # but otherwise ignore it.
        # BUG FIX: previously logged the builtin `id` function instead
        # of the loadbalancer_id argument.
        LOG.info(_LI(
            'ignoring failure to update status for %s to %s: %s'),
            loadbalancer_id, status, e,
        )
def get_network_subnets(self, network_id):
    """Return Subnet wrappers for every parseable subnet on a network.

    Subnets that fail to parse are logged and skipped.
    """
    subnets = self.api_client.list_subnets(
        network_id=network_id)['subnets']
    valid = []
    for subnet in subnets:
        try:
            valid.append(Subnet.from_dict(subnet))
        except Exception as e:
            # Skip malformed entries rather than failing the whole list.
            LOG.info(_LI('ignoring subnet %s (%s) on network %s: %s'),
                     subnet.get('id'), subnet.get('cidr'),
                     network_id, e)
    return valid
def send_message(self, message):
    """Called when the worker put a message in the state machine queue.

    Returns True when the message was queued, False when it was
    dropped (machine deleted, or POLL while the resource is in ERROR).
    """
    if self.deleted:
        # Ignore any more incoming messages
        self.resource.log.debug(
            'deleted state machine, ignoring incoming message %s',
            message)
        return False

    # NOTE(dhellmann): This check is largely redundant with the
    # one in CalcAction.transition() but it may allow us to avoid
    # adding poll events to the queue at all, and therefore cut
    # down on the number of times a worker thread wakes up to
    # process something on a router that isn't going to actually
    # do any work.
    if message.crud == POLL and \
            self.instance.state == states.ERROR:
        self.resource.log.info(_LI(
            'Resource status is ERROR, ignoring POLL message: %s'),
            message,
        )
        return False

    if message.crud == REBUILD:
        if message.body.get('image_uuid'):
            self.resource.log.info(_LI(
                'Resource is being REBUILT with custom image %s'),
                message.body['image_uuid']
            )
            self.image_uuid = message.body['image_uuid']
        else:
            # No custom image requested; fall back to driver default.
            self.image_uuid = self.resource.image_uuid

    self._queue.append(message.crud)
    queue_len = len(self._queue)
    # Escalate to warning once the backlog passes the threshold.
    if queue_len > self._queue_warning_threshold:
        logger = self.resource.log.warning
    else:
        logger = self.resource.log.debug
    logger(_LW('incoming message brings queue length to %s'),
           queue_len)
    return True
def execute(self, action, worker_context):
    """Boot the instance unless it is stuck in a boot/config loop.

    :param action: the action being processed, returned unchanged
    :param worker_context: worker context passed through to the instance
    :returns: the same action
    """
    # Check for a loop where the resource keeps failing to boot or
    # accept the configuration.
    if self.instance.attempts >= self.params.reboot_error_threshold:
        # BUG FIX: the two adjacent string literals previously rendered
        # a double space ("after  %s trials"); merged into one literal,
        # matching the wording used by the other execute() variant.
        self.params.driver.log.info(
            _LI("Dropping out of boot loop after %s trials"),
            self.instance.attempts)
        self.instance.set_error(worker_context)
        return action
    self.instance.boot(worker_context)
    self.params.driver.log.debug(
        "CreateInstance attempt %s/%s",
        self.instance.attempts,
        self.params.reboot_error_threshold
    )
    return action
def _should_process_message(self, target, message):
    """Determines whether a message should be processed or not.

    :returns: the (possibly resource-id-populated) message when it
        should be processed, False otherwise
    """
    # Global debug mode drops everything.
    global_debug, reason = self.db_api.global_debug()
    if global_debug:
        LOG.info(
            'Skipping incoming event, cluster in global debug '
            'mode. (reason: %s)', reason)
        return False

    if message.resource.id not in commands.WILDCARDS:
        # Resolve the concrete resource id before the debug checks.
        message = self._populate_resource_id(message)
        if not message.resource.id:
            LOG.info(_LI('Ignoring message with no resource found.'))
            return False

        should_ignore, reason = \
            self.db_api.tenant_in_debug(message.resource.tenant_id)
        if should_ignore:
            LOG.info(
                'Ignoring message intended for tenant %s in debug mode '
                '(reason: %s): %s',
                message.resource.tenant_id, reason, message,
            )
            return False

        should_ignore, reason = self.db_api.resource_in_debug(
            message.resource.id)
        if should_ignore:
            LOG.info(
                'Ignoring message intended for resource %s in '
                'debug mode (reason: %s): %s',
                message.resource.id, reason, message,
            )
            return False

    # Wildcard-addressed messages bypass the hash-ring ownership check.
    if target in commands.WILDCARDS:
        return message

    if cfg.CONF.coordination.enabled:
        # Only process messages for resources this host owns on the ring.
        target_hosts = self.hash_ring_mgr.ring.get_hosts(
            message.resource.id)
        if self.host not in target_hosts:
            LOG.debug(
                'Ignoring message intended for resource %s as it '
                'does not map to this Rug process.',
                message.resource.id)
            return False

    return message
def update_loadbalancer_status(self, loadbalancer_id, status):
    """Best-effort update of a load balancer's status in neutron.

    :param loadbalancer_id: id of the load balancer to update
    :param status: the new status value
    :returns: None
    """
    try:
        self.api_client.update_loadbalancer_status(loadbalancer_id,
                                                   status)
    except Exception as e:
        # We don't want to die just because we can't tell neutron
        # what the status of the router should be. Log the error
        # but otherwise ignore it.
        # BUG FIX: previously logged the builtin `id` function instead
        # of the loadbalancer_id argument.
        LOG.info(
            _LI('ignoring failure to update status for %s to %s: %s'),
            loadbalancer_id, status, e,
        )
def stop(self, worker_context):
    """Attempt to destroy the instance cluster and its ports.

    :param worker_context: worker context providing API clients
    :returns: the resulting state when no instances remained, else None
    """
    self.log.info(_LI('Destroying instance'))
    self.resource.delete_ports(worker_context)

    # Nothing to tear down -- just normalize the state and report it.
    if not self.instances:
        self.log.info(_LI('Instance(s) already destroyed.'))
        if self.state != states.GONE:
            self.state = states.DOWN
        return self.state

    try:
        self.instances.destroy(worker_context)
        if self.state != states.GONE:
            self.state = states.DOWN
    except Exception:
        # Best-effort: log and leave the state unchanged.
        self.log.exception(_LE('Failed to stop instance(s)'))
def stop(self):
    """Shut down all workers cleanly.

    Sends a poison pill to every worker first so they can drain in
    parallel, then joins each one.
    """
    LOG.info('shutting down scheduler')

    # Phase 1: poison-pill every worker.
    for entry in self.workers:
        LOG.debug('sending stop message to %s', entry['worker'].name)
        entry['queue'].put(None)

    # Phase 2: wait for queues to flush and workers to exit.
    for entry in self.workers:
        LOG.debug('waiting for queue for %s', entry['worker'].name)
        entry['queue'].close()
        LOG.debug('waiting for worker %s', entry['worker'].name)
        entry['worker'].join()

    LOG.info(_LI('scheduler shutdown'))
def execute(self, action, worker_context):
    """Boot the instance unless it is stuck in a boot/config loop.

    :param action: the action being processed, returned unchanged
    :param worker_context: worker context passed through to the instance
    :returns: the same action
    """
    # Check for a loop where the resource keeps failing to boot or
    # accept the configuration.  A DEGRADED instance is exempt from the
    # threshold check.
    if (not self.instance.state == states.DEGRADED and
            self.instance.attempts >= self.params.reboot_error_threshold):
        self.params.resource.log.info(_LI(
            'Dropping out of boot loop after %s trials'),
            self.instance.attempts)
        self.instance.set_error(worker_context)
        return action
    self.instance.boot(worker_context)
    self.params.resource.log.debug('CreateInstance attempt %s/%s',
                                   self.instance.attempts,
                                   self.params.reboot_error_threshold)
    return action
def report_status(self, show_config=True):
    """Log a status report: queue depth, threads, and debug modes.

    :param show_config: also dump all config option values when True
    :returns: None
    """
    if show_config:
        cfg.CONF.log_opt_values(LOG, INFO)
    LOG.info(_LI(
        'Number of state machines in work queue: %d'),
        self.work_queue.qsize()
    )
    LOG.info(_LI(
        'Number of tenant resource managers managed: %d'),
        len(self.tenant_managers)
    )
    # Per-thread liveness and last-seen status.
    for thread in self.threads:
        LOG.info(_LI(
            'Thread %s is %s. Last seen: %s'),
            thread.name,
            'alive' if thread.isAlive() else 'DEAD',
            self._thread_status.get(thread.name, 'UNKNOWN'),
        )
    debug_tenants = self.db_api.tenants_in_debug()
    if debug_tenants:
        for t_uuid, reason in debug_tenants:
            LOG.info(_LI('Debugging tenant: %s (reason: %s)'),
                     t_uuid, reason)
    else:
        LOG.info(_LI('No tenants in debug mode'))
    debug_resources = self.db_api.resources_in_debug()
    if debug_resources:
        for resource_id, reason in debug_resources:
            LOG.info(_LI('Debugging resource: %s (reason: %s)'),
                     resource_id, reason)
    else:
        LOG.info(_LI('No resources in debug mode'))
    if cfg.CONF.coordination.enabled:
        # NOTE(adam_g): This list could be big with a large cluster.
        LOG.info(_LI('Peer astara-orchestrator hosts: %s'),
                 self.hash_ring_mgr.hosts)
    else:
        LOG.info(_LI(
            'No peer astara-orchestrator hosts, coordination disabled.'))
def delete_vrrp_port(self, object_id, label='VRRP'):
    """Delete the neutron port(s) named for an object's VRRP interface.

    :param object_id: id of the owning object
    :param label: port label, defaults to 'VRRP'
    """
    name = 'ASTARA:%s:%s' % (label, object_id)
    ports = self.api_client.list_ports(name=name).get('ports')

    if not ports and self.conf.legacy_fallback_mode:
        # Retry with the pre-rename AKANDA naming scheme.
        name = name.replace('ASTARA', 'AKANDA')
        LOG.info(_LI('Attempting legacy query for %s.'), name)
        ports = self.api_client.list_ports(name=name).get('ports')

    if not ports:
        LOG.warning(_LW(
            'Unable to find VRRP port to delete with name %s.'), name)
    for port in ports:
        self.api_client.delete_port(port['id'])
def delete_vrrp_port(self, object_id, label='VRRP'):
    """Delete the neutron port(s) named for an object's VRRP interface.

    :param object_id: id of the owning object
    :param label: port label, defaults to 'VRRP'
    """
    name = 'ASTARA:%s:%s' % (label, object_id)
    response = self.api_client.list_ports(name=name)
    port_data = response.get('ports')

    if not port_data and self.conf.legacy_fallback_mode:
        # Retry with the pre-rename AKANDA naming scheme.
        name = name.replace('ASTARA', 'AKANDA')
        LOG.info(_LI('Attempting legacy query for %s.'), name)
        response = self.api_client.list_ports(name=name)
        port_data = response.get('ports')

    if not port_data:
        LOG.warning(
            _LW('Unable to find VRRP port to delete with name %s.'),
            name)
    for port in port_data:
        self.api_client.delete_port(port['id'])
def _should_process_message(self, target, message):
    """Determines whether a message should be processed or not.

    :returns: the (possibly resource-id-populated) message when it
        should be processed, False otherwise
    """
    # Global debug mode drops everything.
    global_debug, reason = self.db_api.global_debug()
    if global_debug:
        LOG.info('Skipping incoming event, cluster in global debug '
                 'mode. (reason: %s)', reason)
        return False

    if message.resource.id not in commands.WILDCARDS:
        # Resolve the concrete resource id before the debug checks.
        message = self._populate_resource_id(message)
        if not message.resource.id:
            LOG.info(_LI('Ignoring message with no resource found.'))
            return False

        should_ignore, reason = \
            self.db_api.tenant_in_debug(message.resource.tenant_id)
        if should_ignore:
            LOG.info(
                'Ignoring message intended for tenant %s in debug mode '
                '(reason: %s): %s',
                message.resource.tenant_id, reason, message,
            )
            return False

        should_ignore, reason = self.db_api.resource_in_debug(
            message.resource.id)
        if should_ignore:
            LOG.info(
                'Ignoring message intended for resource %s in '
                'debug mode (reason: %s): %s',
                message.resource.id, reason, message,
            )
            return False

    # Wildcard-addressed messages bypass the hash-ring ownership check.
    if target in commands.WILDCARDS:
        return message

    if cfg.CONF.coordination.enabled:
        # Only process messages for resources this host owns on the ring.
        target_hosts = self.hash_ring_mgr.ring.get_hosts(
            message.resource.id)
        if self.host not in target_hosts:
            LOG.debug('Ignoring message intended for resource %s as it '
                      'does not map to this Rug process.',
                      message.resource.id)
            return False

    return message
def ignore_signals():
    """Ignore signals that might interrupt processing.

    Since the RUG doesn't want to be asynchronously interrupted,
    SIGHUP, SIGALRM, SIGUSR1 and SIGUSR2 are all bound to SIG_IGN.

    :returns: None
    """
    ignored = (signal.SIGHUP, signal.SIGUSR1, signal.SIGUSR2,
               signal.SIGALRM)
    log = logging.getLogger(__name__)
    for sig in ignored:
        log.info(_LI('ignoring signal %s'), sig)
        signal.signal(sig, signal.SIG_IGN)
def report_status(self, show_config=True):
    """Log a status report: queue depth, threads, and debug modes.

    :param show_config: also dump all config option values when True
    :returns: None
    """
    if show_config:
        cfg.CONF.log_opt_values(LOG, INFO)
    LOG.info(_LI('Number of state machines in work queue: %d'),
             self.work_queue.qsize())
    LOG.info(_LI('Number of tenant resource managers managed: %d'),
             len(self.tenant_managers))
    # Per-thread liveness and last-seen status.
    for thread in self.threads:
        LOG.info(
            _LI('Thread %s is %s. Last seen: %s'),
            thread.name,
            'alive' if thread.isAlive() else 'DEAD',
            self._thread_status.get(thread.name, 'UNKNOWN'),
        )
    debug_tenants = self.db_api.tenants_in_debug()
    if debug_tenants:
        for t_uuid, reason in debug_tenants:
            LOG.info(_LI('Debugging tenant: %s (reason: %s)'),
                     t_uuid, reason)
    else:
        LOG.info(_LI('No tenants in debug mode'))
    debug_resources = self.db_api.resources_in_debug()
    if debug_resources:
        for resource_id, reason in debug_resources:
            LOG.info(_LI('Debugging resource: %s (reason: %s)'),
                     resource_id, reason)
    else:
        LOG.info(_LI('No resources in debug mode'))
    if cfg.CONF.coordination.enabled:
        # NOTE(adam_g): This list could be big with a large cluster.
        LOG.info(_LI('Peer astara-orchestrator hosts: %s'),
                 self.hash_ring_mgr.hosts)
    else:
        LOG.info(
            _LI('No peer astara-orchestrator hosts, coordination disabled.'
                ))
def shuffle_notifications(notification_queue, sched):
    """Copy messages from the notification queue into the scheduler.

    Runs until a (None, ...) sentinel target or Ctrl-C is received.
    """
    while True:
        try:
            target, message = notification_queue.get()
            # A None target is the sentinel telling us to stop.
            if target is None:
                break
            sched.handle_message(target, message)
        except IOError:
            # FIXME(rods): if a signal arrive during an IO operation
            # an IOError is raised. We catch the exceptions in
            # meantime waiting for a better solution.
            pass
        except KeyboardInterrupt:
            LOG.info(_LI('got Ctrl-C'))
            break
        except:
            # Deliberate catch-all: keep the pump running no matter
            # what a single message does.
            LOG.exception(_LE('unhandled exception processing message'))
def boot(self, worker_context):
    """Boots the instance with driver pre/post boot hooks.

    :returns: None
    """
    self._ensure_cache(worker_context)

    self.log.info('Booting %s' % self.driver.RESOURCE_NAME)
    self.state = states.DOWN
    self._boot_counter.start()

    # driver preboot hook
    self.driver.pre_boot(worker_context)

    # try to boot the instance
    try:
        instance_info = worker_context.nova_client.boot_instance(
            resource_type=self.driver.RESOURCE_NAME,
            prev_instance_info=self.instance_info,
            name=self.driver.name,
            image_uuid=self.driver.image_uuid,
            flavor=self.driver.flavor,
            make_ports_callback=self.driver.make_ports(worker_context)
        )
        if not instance_info:
            self.log.info(_LI('Previous instance is still deleting'))
            # Reset the boot counter, causing the state machine to start
            # again with a new Instance.
            self.reset_boot_counter()
            self.instance_info = None
            return
    except:
        # NOTE(review): bare except also catches SystemExit and
        # KeyboardInterrupt -- consider `except Exception`.
        self.log.exception(_LE('Instance failed to start boot'))
        self.driver.delete_ports(worker_context)
    else:
        # We have successfully started a (re)boot attempt so
        # record the timestamp so we can report how long it takes.
        self.state = states.BOOTING
        self.instance_info = instance_info

    # driver post boot hook
    # NOTE(review): runs even when boot_instance raised above --
    # confirm this is the intended recovery path.
    self.driver.post_boot(worker_context)
def get_default_v4_gateway(client, router, networks):
    """Find the IPv4 default gateway for the router.

    Scans the external network's subnets for the v4 subnet containing
    one of the external interface's addresses and returns that subnet's
    gateway_ip, or '' when none can be determined.
    """
    LOG.debug('networks = %r', networks)
    if router.external_port:
        LOG.debug('external interface = %s',
                  router.external_port.mac_address)

    # Now find the subnet that our external IP is on, and return its
    # gateway.
    for n in networks:
        if n['network_type'] == EXTERNAL_NET:
            # Collect the interface's IPv4 addresses (CIDR suffix
            # stripped via partition).
            v4_addresses = [
                addr
                for addr in (netaddr.IPAddress(ip.partition('/')[0])
                             for ip in n['interface']['addresses'])
                if addr.version == 4
            ]
            for s in n['subnets']:
                subnet = netaddr.IPNetwork(s['cidr'])
                if subnet.version != 4:
                    continue
                LOG.debug(
                    '%s: checking if subnet %s should have the '
                    'default route',
                    router.id, s['cidr'])
                for addr in v4_addresses:
                    if addr in subnet:
                        LOG.debug(
                            '%s: found gateway %s for subnet %s '
                            'on network %s',
                            router.id,
                            s['gateway_ip'],
                            s['cidr'],
                            n['network_id'],
                        )
                        return s['gateway_ip']

    # Sometimes we are asked to build a configuration for the server
    # when the external interface is still marked as "down". We can
    # report that case, but we don't treat it as an error here because
    # we'll be asked to do it again when the interface comes up.
    LOG.info(_LI('%s: no default gateway was found'), router.id)
    return ''
def start(self):
    """Brings up coordination service online

    This connects the coordination service to its tooz backend. This
    involves:

        - connecting to the cluster
        - creating the coordination group (if required)
        - joining the coordination group
        - registering callbacks to respond to join/leave membership
          events

    After the local node has joined the cluster and knows its remote
    peers, it fires off an initial rebalance event to the workers
    so they can seed their hash ring with the current membership.
    """
    LOG.info(_LI('Starting RUG coordinator process for host %s on %s'),
             self.host, self.url)
    self._coordinator = tz_coordination.get_coordinator(
        self.url, self.host)
    self._coordinator.start()

    # The group may already exist; that's fine.
    try:
        self._coordinator.create_group(self.group).get()
    except tooz.coordination.GroupAlreadyExist:
        pass

    # We may already be a member (e.g. after a restart); that's fine.
    try:
        self._coordinator.join_group(self.group).get()
        self._coordinator.heartbeat()
    except tooz.coordination.MemberAlreadyExist:
        pass

    self._coordinator.watch_join_group(self.group, self.cluster_changed)
    self._coordinator.watch_leave_group(self.group, self.cluster_changed)
    self._coordinator.heartbeat()
    LOG.debug("Sending initial event changed for members: %s" %
              self.members)
    # Seed the workers' hash ring with the current membership.
    self.cluster_changed(event=None, node_bootstrap=True)
def run(self, ip_address, port):
    """Run the rug-api WSGI server.

    :param ip_address: address to bind to (v4/v6 auto-detected)
    :param port: TCP port to bind to
    :returns: returns nothing
    """
    app = RugAPI()

    # Detect whether the address is IPv6; otherwise assume IPv4.
    try:
        socket.inet_pton(socket.AF_INET6, ip_address)
        family = socket.AF_INET6
    except Exception:
        family = socket.AF_INET

    # Retry binding up to 5 times with a linearly increasing backoff.
    for i in six.moves.range(5):
        LOG.info(_LI(
            'Starting the rug-api on %s:%s'),
            ip_address, port,
        )
        try:
            sock = eventlet.listen(
                (ip_address, port),
                family=family,
                backlog=128
            )
        except socket.error as err:
            if err.errno != 99:  # EADDRNOTAVAIL
                raise
            LOG.warning(_LW('Could not create rug-api socket: %s'),
                        err)
            LOG.warning(_LW('Sleeping %s before trying again'), i + 1)
            eventlet.sleep(i + 1)
        else:
            break
    else:
        # All five attempts failed.
        raise RuntimeError(_(
            'Could not establish rug-api socket on %s:%s') %
            (ip_address, port)
        )
    eventlet.wsgi.server(
        sock,
        app,
        custom_pool=self.pool,
        log=LOG)
def run(self, ip_address, port=cfg.CONF.astara_metadata_port):
    """Run the MetadataProxy.

    :param ip_address: the ip address to bind to for incoming requests
    :param port: the port to bind to for incoming requests
    :returns: returns nothing
    """
    # NOTE(review): the default for `port` is evaluated once at import
    # time, so later config changes are not reflected -- confirm intended.
    app = MetadataProxyHandler()
    # Retry binding up to 5 times with a linearly increasing backoff.
    for i in six.moves.range(5):
        LOG.info(_LI(
            'Starting the metadata proxy on %s:%s'),
            ip_address, port
        )
        try:
            sock = eventlet.listen(
                (ip_address, port),
                family=socket.AF_INET6,
                backlog=128
            )
        except socket.error as err:
            # errno 99 == EADDRNOTAVAIL; anything else is fatal.
            if err.errno != 99:
                raise
            LOG.warning(
                _LW('Could not create metadata proxy socket: %s'), err)
            LOG.warning(_LW('Sleeping %s before trying again'), i + 1)
            eventlet.sleep(i + 1)
        else:
            break
    else:
        # All five attempts failed.
        raise RuntimeError(
            _('Could not establish metadata proxy socket on %s:%s') %
            (ip_address, port)
        )
    eventlet.wsgi.server(
        sock,
        app,
        custom_pool=self.pool,
        log=loggers.WritableLogger(LOG))
def _dispatch_command(self, target, message):
    """Dispatch an operator command message to its handler.

    :param target: routing target the message was addressed to
    :param message: the command event; ``message.body`` carries the
        command name and its arguments
    :returns: None
    """
    if not self._should_process_command(message):
        return

    instructions = message.body
    if instructions['command'] == commands.WORKERS_DEBUG:
        self.report_status()

    # NOTE(adam_g): Drop 'router-debug' compat in M.
    elif (instructions['command'] == commands.RESOURCE_DEBUG or
          instructions['command'] == commands.ROUTER_DEBUG):

        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to debug resource with no id'))
            return
        reason = instructions.get('reason')
        if resource_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all resources with %r'),
                resource_id)
        else:
            LOG.info(_LI('Placing resource %s in debug mode '
                         '(reason: %s)'),
                     resource_id, reason)
            self.db_api.enable_resource_debug(resource_id, reason)

    elif (instructions['command'] == commands.RESOURCE_MANAGE or
          instructions['command'] == commands.ROUTER_MANAGE):
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to manage resource with no id'))
            return
        # Debug flag may not be set; that's OK.
        try:
            self.db_api.disable_resource_debug(resource_id)
            LOG.info(_LI('Resuming management of resource %s'),
                     resource_id)
        except KeyError:
            pass
        # Release any held lock so workers can process the resource.
        try:
            self._resource_locks[resource_id].release()
            LOG.info(_LI('Unlocked resource %s'), resource_id)
        except KeyError:
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass

    elif instructions['command'] in EVENT_COMMANDS:
        resource_id = instructions.get('resource_id')
        sm = self._find_state_machine_by_resource_id(resource_id)
        if not sm:
            LOG.debug(
                'Will not process command, no managed state machine '
                'found for resource %s', resource_id)
            return
        # Re-wrap the command as a normal CRUD event for the machine.
        new_res = event.Resource(
            id=resource_id,
            driver=sm.driver.RESOURCE_NAME,
            tenant_id=sm.tenant_id)
        new_msg = event.Event(
            resource=new_res,
            crud=EVENT_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_res)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_res)

    # NOTE(adam_g): This is here to support the deprecated old format
    # of sending commands to specific routers and can be removed once
    # the CLI component is dropped in M.
    elif instructions['command'] in DEPRECATED_ROUTER_COMMANDS:
        new_rsc = event.Resource(
            driver=drivers.router.Router.RESOURCE_NAME,
            id=message.body.get('router_id'),
            tenant_id=message.body.get('tenant_id'),
        )
        new_msg = event.Event(
            resource=new_rsc,
            crud=DEPRECATED_ROUTER_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_rsc)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_rsc)

    elif instructions['command'] == commands.TENANT_DEBUG:
        tenant_id = instructions['tenant_id']
        reason = instructions.get('reason')
        if tenant_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all tenants with %r'),
                tenant_id)
        else:
            LOG.info(_LI('Placing tenant %s in debug mode '
                         '(reason: %s)'),
                     tenant_id, reason)
            self.db_api.enable_tenant_debug(tenant_id, reason)

    elif instructions['command'] == commands.TENANT_MANAGE:
        tenant_id = instructions['tenant_id']
        # Debug flag may not be set; that's OK.
        try:
            self.db_api.disable_tenant_debug(tenant_id)
            LOG.info(_LI('Resuming management of tenant %s'),
                     tenant_id)
        except KeyError:
            pass

    elif instructions['command'] == commands.GLOBAL_DEBUG:
        enable = instructions.get('enabled')
        reason = instructions.get('reason')
        if enable == 1:
            LOG.info('Enabling global debug mode (reason: %s)', reason)
            self.db_api.enable_global_debug(reason)
        elif enable == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        instructions)

    elif instructions['command'] == commands.CONFIG_RELOAD:
        try:
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
        else:
            cfg.CONF.log_opt_values(LOG, INFO)

    else:
        LOG.warning(_LW('Unrecognized command: %s'), instructions)
def _ensure_local_port(self, network_id, subnet_id, prefix, network_type):
    """Ensure an orchestrator-owned Neutron port exists and is usable.

    Looks up an existing RUG port on the given network (optionally
    falling back to the legacy AKANDA-named port and migrating it to
    the ASTARA name), or creates a new one.  The local tap interface
    is plugged if missing and the port's address is configured on it.

    :param network_id: Neutron network the port must live on
    :param subnet_id: subnet the port's fixed IP must belong to
    :param prefix: CIDR string; only its prefix length is used for the
        resulting address
    :param network_type: label used in the port name and log messages
    :returns: the configured 'ip/prefix-length' string
    :raises: MissingIPAllocation if the port has no fixed IP on
        ``subnet_id``
    """
    driver = importutils.import_object(self.conf.interface_driver,
                                       self.conf)

    # Stable per-host device id, derived from the hostname so it
    # survives restarts.
    host_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, socket.gethostname()))

    name = 'ASTARA:RUG:%s' % network_type.upper()

    query_dict = dict(device_owner=DEVICE_OWNER_RUG,
                      device_id=host_id,
                      name=name,
                      network_id=network_id)
    ports = self.api_client.list_ports(**query_dict)['ports']

    if not ports and self.conf.legacy_fallback_mode:
        # Nothing under the ASTARA name; retry with the pre-rename
        # AKANDA name/owner so upgraded deployments keep their port.
        LOG.info(_LI('Attempting legacy query for %s.'), name)
        query_dict.update({
            'name': name.replace('ASTARA', 'AKANDA'),
            'device_owner': DEVICE_OWNER_RUG.replace('astara', 'akanda')
        })
        ports = self.api_client.list_ports(**query_dict)['ports']

    if ports and 'AKANDA' in ports[0]['name']:
        # Found a legacy port: rename it in Neutron so future lookups
        # hit the primary query.
        port = Port.from_dict(ports[0])
        LOG.info(
            _LI('migrating port to ASTARA for port %r and using local %s'),
            port, network_type)
        self.api_client.update_port(
            port.id,
            {'port': {
                'name': port.name.replace('AKANDA', 'ASTARA'),
                'device_owner': DEVICE_OWNER_RUG
            }})
    elif ports:
        port = Port.from_dict(ports[0])
        LOG.info(_LI('already have local %s port, using %r'),
                 network_type, port)
    else:
        LOG.info(_LI('creating a new local %s port'), network_type)
        port_dict = {
            'admin_state_up': True,
            'network_id': network_id,
            'device_owner': DEVICE_OWNER_ROUTER_INT,  # lying here for IP
            'name': name,
            'device_id': host_id,
            'fixed_ips': [{
                'subnet_id': subnet_id
            }],
            'binding:host_id': socket.gethostname()
        }
        port = Port.from_dict(
            self.api_client.create_port(dict(port=port_dict))['port'])
        # remove lie that enabled us pick IP on slaac subnet
        self.api_client.update_port(
            port.id,
            {'port': {
                'device_owner': DEVICE_OWNER_RUG
            }})
        port.device_owner = DEVICE_OWNER_RUG
        LOG.info(_LI('new local %s port: %r'), network_type, port)

    # create the tap interface if it doesn't already exist
    if not ip_lib.device_exists(driver.get_device_name(port)):
        driver.plug(port.network_id,
                    port.id,
                    driver.get_device_name(port),
                    port.mac_address)
        # add sleep to ensure that port is setup before use
        time.sleep(1)

    try:
        # Pick the fixed IP allocated from the requested subnet.
        fixed_ip = [
            fip for fip in port.fixed_ips if fip.subnet_id == subnet_id
        ][0]
    except IndexError:
        raise MissingIPAllocation(port.id)

    ip_cidr = '%s/%s' % (fixed_ip.ip_address, prefix.split('/')[1])
    driver.init_l3(driver.get_device_name(port), [ip_cidr])
    return ip_cidr
def _thread_target(self):
    """Run the worker-thread loop.

    Pulls state machines off the shared work queue and drives their
    update() until a stop sentinel (None) is received or
    ``self._keep_going`` is cleared.

    :returns: the WorkerContext used by this thread, so tests can
        inspect it
    """
    my_id = threading.current_thread().name
    LOG.debug('starting thread')
    # Use a separate context from the one we use when receiving
    # messages and talking to the tenant router manager because we
    # are in a different thread and the clients are not
    # thread-safe.
    context = WorkerContext(self.management_address)
    while self._keep_going:
        try:
            # Try to get a state machine from the work queue. If
            # there's nothing to do, we will block for a while.
            self._thread_status[my_id] = 'waiting for task'
            sm = self.work_queue.get(timeout=10)
        except Queue.Empty:
            continue
        if sm is None:
            # None is the sentinel the scheduler uses to stop workers.
            LOG.info(_LI('received stop message'))
            break

        # Make sure we didn't already have some updates under way
        # for a router we've been told to ignore for debug mode.
        should_ignore, reason = \
            self.db_api.resource_in_debug(sm.resource_id)
        if should_ignore:
            LOG.debug('Skipping update of resource %s in debug mode. '
                      '(reason: %s)', sm.resource_id, reason)
            continue

        # In the event that a rebalance took place while processing an
        # event, it may have been put back into the work queue. Check
        # the hash table once more to find out if we still manage it
        # and do some cleanup if not.
        if cfg.CONF.coordination.enabled:
            target_hosts = self.hash_ring_mgr.ring.get_hosts(
                sm.resource_id)
            if self.host not in target_hosts:
                LOG.debug('Skipping update of router %s, it no longer '
                          'maps here.', sm.resource_id)
                trm = self.tenant_managers[sm.tenant_id]
                trm.unmanage_resource(sm.resource_id)
                self.work_queue.task_done()
                with self.lock:
                    self._release_resource_lock(sm)
                continue

        # FIXME(dhellmann): Need to look at the router to see if
        # it belongs to a tenant which is in debug mode, but we
        # don't have that data in the sm, yet.
        LOG.debug('performing work on %s for tenant %s',
                  sm.resource_id, sm.tenant_id)
        try:
            self._thread_status[my_id] = 'updating %s' % sm.resource_id
            sm.update(context)
        except Exception:
            # BUG FIX: this was a bare "except:", which also swallows
            # SystemExit/KeyboardInterrupt; catching Exception lets
            # those unwind the thread while still logging real errors.
            LOG.exception(_LE('could not complete update for %s'),
                          sm.resource_id)
        finally:
            self._thread_status[my_id] = (
                'finalizing task for %s' % sm.resource_id
            )
            self.work_queue.task_done()
            with self.lock:
                # Release the lock that prevents us from adding
                # the state machine back into the queue. If we
                # find more work, we will re-acquire it. If we do
                # not find more work, we hold the primary work
                # queue lock so the main thread cannot put the
                # state machine back into the queue until we
                # release that lock.
                self._release_resource_lock(sm)
                # The state machine has indicated that it is done
                # by returning. If there is more work for it to
                # do, reschedule it by placing it at the end of
                # the queue.
                if sm.has_more_work():
                    LOG.debug('%s has more work, returning to work queue',
                              sm.resource_id)
                    self._add_resource_to_work_queue(sm)
                else:
                    LOG.debug('%s has no more work', sm.resource_id)
    # Return the context object so tests can look at it
    self._thread_status[my_id] = 'exiting'
    return context
def main(argv=sys.argv[1:]):
    """Main Entry point into the astara-orchestrator

    This is the main entry point into the astara-orchestrator. On invocation
    of this method, logging, local network connectivity setup is performed.
    This information is obtained through the 'ak-config' file, passed as
    argument to this method. Worker threads are spawned for handling various
    tasks that are associated with processing as well as responding to
    different Neutron events prior to starting a notification dispatch loop.

    :param argv: list of Command line arguments
    :returns: None
    :raises: None
    """
    # TODO(rama) Error Handling to be added as part of the docstring
    # description

    # NOTE(review): the default argument is evaluated once at import
    # time; sys.argv changes after import are not picked up.

    # Change the process and thread name so the logs are cleaner.
    p = multiprocessing.current_process()
    p.name = 'pmain'
    t = threading.current_thread()
    t.name = 'tmain'

    ak_cfg.parse_config(argv)
    log.setup(cfg.CONF, 'astara-orchestrator')
    cfg.CONF.log_opt_values(LOG, logging.INFO)

    neutron = neutron_api.Neutron(cfg.CONF)

    # TODO(mark): develop better way restore after machine reboot
    # neutron.purge_management_interface()

    # bring the mgt tap interface up
    mgt_ip_address = neutron.ensure_local_service_port().split('/')[0]

    # Set up the queue to move messages between the eventlet-based
    # listening process and the scheduler.
    notification_queue = multiprocessing.Queue()

    # Ignore signals that might interrupt processing.
    daemon.ignore_signals()

    # If we see a SIGINT, stop processing.
    def _stop_processing(*args):
        # (None, None) is the stop sentinel consumed by
        # shuffle_notifications below.
        notification_queue.put((None, None))
    signal.signal(signal.SIGINT, _stop_processing)

    # Listen for notifications.
    notification_proc = multiprocessing.Process(
        target=notifications.listen,
        kwargs={
            'notification_queue': notification_queue
        },
        name='notification-listener',
    )
    notification_proc.start()

    if CONF.coordination.enabled:
        coordinator_proc = multiprocessing.Process(
            target=coordination.start,
            kwargs={
                'notification_queue': notification_queue
            },
            name='coordinator',
        )
        coordinator_proc.start()
    else:
        # kept as None so the shutdown loop below can skip it
        coordinator_proc = None

    metadata_proc = multiprocessing.Process(
        target=metadata.serve,
        args=(mgt_ip_address,),
        name='metadata-proxy'
    )
    metadata_proc.start()

    from astara.api import rug as rug_api
    rug_api_proc = multiprocessing.Process(
        target=rug_api.serve,
        name='rug-api'
    )
    rug_api_proc.start()

    # Set up the notifications publisher
    Publisher = (notifications.Publisher if cfg.CONF.ceilometer.enabled
                 else notifications.NoopPublisher)
    publisher = Publisher(
        topic=cfg.CONF.ceilometer.topic,
    )

    # Set up a factory to make Workers that know how many threads to
    # run.
    worker_factory = functools.partial(
        worker.Worker,
        notifier=publisher,
        management_address=mgt_ip_address,
    )

    # Set up the scheduler that knows how to manage the routers and
    # dispatch messages.
    sched = scheduler.Scheduler(
        worker_factory=worker_factory,
    )

    # Prepopulate the workers with existing routers on startup
    populate.pre_populate_workers(sched)

    # Set up the periodic health check
    health.start_inspector(cfg.CONF.health_check_period, sched)

    # Block the main process, copying messages from the notification
    # listener to the scheduler
    try:
        shuffle_notifications(notification_queue, sched)
    finally:
        # Shut down in reverse dependency order: scheduler first,
        # then the publisher, then the helper subprocesses.
        LOG.info(_LI('Stopping scheduler.'))
        sched.stop()
        LOG.info(_LI('Stopping notification publisher.'))
        publisher.stop()

        # Terminate the subprocesses
        for subproc in [notification_proc, coordinator_proc, metadata_proc,
                        rug_api_proc]:
            if not subproc:
                continue
            LOG.info(_LI('Stopping %s.'), subproc.name)
            subproc.terminate()
def update_state(self, worker_context, silent=False):
    """Updates state of the instance and, by extension, its logical resource

    :param worker_context: worker context providing API clients
    :param silent: when True, suppress per-attempt alive-check logging
    :returns: state
    """
    if self.driver.get_state(worker_context) == states.GONE:
        self.log.debug('%s driver reported its state is %s',
                       self.driver.RESOURCE_NAME, states.GONE)
        self.state = states.GONE
        return self.state

    if self.instance_info is None:
        self.log.info(_LI('no backing instance, marking as %s'),
                      states.DOWN)
        self.state = states.DOWN
        return self.state

    addr = self.instance_info.management_address
    if not addr:
        self.log.debug('waiting for instance ports to be attached')
        self.state = states.BOOTING
        return self.state

    for i in six.moves.range(cfg.CONF.max_retries):
        if self.driver.is_alive(self.instance_info.management_address):
            if self.state != states.CONFIGURED:
                self.state = states.UP
            break
        if not silent:
            # BUG FIX: report attempts 1-based; the loop index made the
            # first failure log as "Attempt 0 of N".
            self.log.debug('Alive check failed. Attempt %d of %d',
                           i + 1, cfg.CONF.max_retries)
        time.sleep(cfg.CONF.retry_delay)
    else:
        # for/else: every alive check failed without a break.
        old_state = self.state
        self._check_boot_timeout()

        # If the instance isn't responding, make sure Nova knows about it
        instance = worker_context.nova_client.get_instance_for_obj(self.id)
        if instance is None and self.state != states.ERROR:
            self.log.info('No instance was found; rebooting')
            self.state = states.DOWN
            self.instance_info = None

        # update_state() is called from Alive() to check the
        # status of the router. If we can't talk to the API at
        # that point, the router should be considered missing and
        # we should reboot it, so mark it states.DOWN if we think it was
        # configured before.
        if old_state == states.CONFIGURED and self.state != states.ERROR:
            self.log.debug('Instance not alive, marking it as %s',
                           states.DOWN)
            self.state = states.DOWN

    # After the instance is all the way up, record how long it took
    # to boot and accept a configuration.
    self.instance_info = (
        worker_context.nova_client.update_instance_info(
            self.instance_info))
    if not self.instance_info.booting and self.state == states.CONFIGURED:
        # If we didn't boot the server (because we were restarted
        # while it remained running, for example), we won't have a
        # duration to log.
        if not self._boot_logged:
            boot_time = self.instance_info.time_since_boot.total_seconds()
            self.log.info('%s booted in %s seconds after %s attempts',
                          self.driver.RESOURCE_NAME,
                          boot_time,
                          self._boot_counter.count)
            self._boot_logged = True
        # Always reset the boot counter, even if we didn't boot
        # the server ourself, so we don't accidentally think we
        # have an erroring router.
        self._boot_counter.reset()
    return self.state
def main(argv=sys.argv[1:]):
    """Main Entry point into the astara-orchestrator

    This is the main entry point into the astara-orchestrator. On invocation
    of this method, logging, local network connectivity setup is performed.
    This information is obtained through the 'ak-config' file, passed as
    argument to this method. Worker threads are spawned for handling various
    tasks that are associated with processing as well as responding to
    different Neutron events prior to starting a notification dispatch loop.

    :param argv: list of Command line arguments
    :returns: None
    :raises: None
    """
    # TODO(rama) Error Handling to be added as part of the docstring
    # description

    # NOTE(review): the default argument is evaluated once at import
    # time; sys.argv changes after import are not picked up.

    # Change the process and thread name so the logs are cleaner.
    p = multiprocessing.current_process()
    p.name = 'pmain'
    t = threading.current_thread()
    t.name = 'tmain'

    ak_cfg.parse_config(argv)
    log.setup(cfg.CONF, 'astara-orchestrator')
    cfg.CONF.log_opt_values(LOG, logging.INFO)

    neutron = neutron_api.Neutron(cfg.CONF)

    # TODO(mark): develop better way restore after machine reboot
    # neutron.purge_management_interface()

    # bring the mgt tap interface up
    mgt_ip_address = neutron.ensure_local_service_port().split('/')[0]

    # Set up the queue to move messages between the eventlet-based
    # listening process and the scheduler.
    notification_queue = multiprocessing.Queue()

    # Ignore signals that might interrupt processing.
    daemon.ignore_signals()

    # If we see a SIGINT, stop processing.
    def _stop_processing(*args):
        # (None, None) is the stop sentinel consumed by
        # shuffle_notifications below.
        notification_queue.put((None, None))
    signal.signal(signal.SIGINT, _stop_processing)

    # Listen for notifications.
    notification_proc = multiprocessing.Process(
        target=notifications.listen,
        kwargs={'notification_queue': notification_queue},
        name='notification-listener',
    )
    notification_proc.start()

    if CONF.coordination.enabled:
        coordinator_proc = multiprocessing.Process(
            target=coordination.start,
            kwargs={'notification_queue': notification_queue},
            name='coordinator',
        )
        coordinator_proc.start()
    else:
        # kept as None so the shutdown loop below can skip it
        coordinator_proc = None

    metadata_proc = multiprocessing.Process(target=metadata.serve,
                                            args=(mgt_ip_address, ),
                                            name='metadata-proxy')
    metadata_proc.start()

    from astara.api import rug as rug_api
    rug_api_proc = multiprocessing.Process(target=rug_api.serve,
                                           name='rug-api')
    rug_api_proc.start()

    # Set up the notifications publisher
    Publisher = (notifications.Publisher if cfg.CONF.ceilometer.enabled
                 else notifications.NoopPublisher)
    publisher = Publisher(topic=cfg.CONF.ceilometer.topic, )

    # Set up a factory to make Workers that know how many threads to
    # run.
    worker_factory = functools.partial(
        worker.Worker,
        notifier=publisher,
        management_address=mgt_ip_address,
    )

    # Set up the scheduler that knows how to manage the routers and
    # dispatch messages.
    sched = scheduler.Scheduler(worker_factory=worker_factory, )

    # Prepopulate the workers with existing routers on startup
    populate.pre_populate_workers(sched)

    # Set up the periodic health check
    health.start_inspector(cfg.CONF.health_check_period, sched)

    # Block the main process, copying messages from the notification
    # listener to the scheduler
    try:
        shuffle_notifications(notification_queue, sched)
    finally:
        # Shut down in reverse dependency order: scheduler first,
        # then the publisher, then the helper subprocesses.
        LOG.info(_LI('Stopping scheduler.'))
        sched.stop()
        LOG.info(_LI('Stopping notification publisher.'))
        publisher.stop()

        # Terminate the subprocesses
        for subproc in [
            notification_proc, coordinator_proc, metadata_proc,
            rug_api_proc
        ]:
            if not subproc:
                continue
            LOG.info(_LI('Stopping %s.'), subproc.name)
            subproc.terminate()
def _thread_target(self):
    """Run the worker-thread loop.

    Pulls state machines off the shared work queue and drives their
    update() until a stop sentinel (None) is received or
    ``self._keep_going`` is cleared.

    :returns: the WorkerContext used by this thread, so tests can
        inspect it
    """
    my_id = threading.current_thread().name
    LOG.debug('starting thread')
    # Use a separate context from the one we use when receiving
    # messages and talking to the tenant router manager because we
    # are in a different thread and the clients are not
    # thread-safe.
    context = WorkerContext(self.management_address)
    while self._keep_going:
        try:
            # Try to get a state machine from the work queue. If
            # there's nothing to do, we will block for a while.
            self._thread_status[my_id] = 'waiting for task'
            sm = self.work_queue.get(timeout=10)
        except Queue.Empty:
            continue
        if sm is None:
            # None is the sentinel the scheduler uses to stop workers.
            LOG.info(_LI('received stop message'))
            break

        # Make sure we didn't already have some updates under way
        # for a router we've been told to ignore for debug mode.
        should_ignore, reason = \
            self.db_api.resource_in_debug(sm.resource_id)
        if should_ignore:
            LOG.debug(
                'Skipping update of resource %s in debug mode. '
                '(reason: %s)', sm.resource_id, reason)
            continue

        # In the event that a rebalance took place while processing an
        # event, it may have been put back into the work queue. Check
        # the hash table once more to find out if we still manage it
        # and do some cleanup if not.
        if cfg.CONF.coordination.enabled:
            target_hosts = self.hash_ring_mgr.ring.get_hosts(
                sm.resource_id)
            if self.host not in target_hosts:
                LOG.debug(
                    'Skipping update of router %s, it no longer '
                    'maps here.', sm.resource_id)
                trm = self.tenant_managers[sm.tenant_id]
                trm.unmanage_resource(sm.resource_id)
                self.work_queue.task_done()
                with self.lock:
                    self._release_resource_lock(sm)
                continue

        # FIXME(dhellmann): Need to look at the router to see if
        # it belongs to a tenant which is in debug mode, but we
        # don't have that data in the sm, yet.
        LOG.debug('performing work on %s for tenant %s',
                  sm.resource_id, sm.tenant_id)
        try:
            self._thread_status[my_id] = 'updating %s' % sm.resource_id
            sm.update(context)
        except Exception:
            # BUG FIX: this was a bare "except:", which also swallows
            # SystemExit/KeyboardInterrupt; catching Exception lets
            # those unwind the thread while still logging real errors.
            LOG.exception(_LE('could not complete update for %s'),
                          sm.resource_id)
        finally:
            self._thread_status[my_id] = ('finalizing task for %s' %
                                          sm.resource_id)
            self.work_queue.task_done()
            with self.lock:
                # Release the lock that prevents us from adding
                # the state machine back into the queue. If we
                # find more work, we will re-acquire it. If we do
                # not find more work, we hold the primary work
                # queue lock so the main thread cannot put the
                # state machine back into the queue until we
                # release that lock.
                self._release_resource_lock(sm)
                # The state machine has indicated that it is done
                # by returning. If there is more work for it to
                # do, reschedule it by placing it at the end of
                # the queue.
                if sm.has_more_work():
                    LOG.debug('%s has more work, returning to work queue',
                              sm.resource_id)
                    self._add_resource_to_work_queue(sm)
                else:
                    LOG.debug('%s has no more work', sm.resource_id)
    # Return the context object so tests can look at it
    self._thread_status[my_id] = 'exiting'
    return context
def update_state(self, worker_context, silent=False):
    """Updates state of the instance(s) and, by extension, the resource

    :param worker_context: worker context providing API clients
    :param silent: accepted for interface compatibility; not consulted
        in this implementation
    :returns: state
    """
    if self.resource.get_state(worker_context) == states.GONE:
        self.log.debug('%s driver reported its state is %s',
                       self.resource.RESOURCE_NAME, states.GONE)
        self.state = states.GONE
        return self.state

    if not self.instances:
        self.log.info(_LI('no backing instance(s), marking as %s'),
                      states.DOWN)
        self.state = states.DOWN
        return self.state
    elif self.instances.cluster_degraded is True:
        self.log.info(_LI(
            'instance cluster for resource %s reports degraded'),
            self.resource.id)
        self.state = states.DEGRADED
        return self.state

    has_ports, no_ports = self.instances.validate_ports()
    # ports_state=None means no instances have ports
    if not has_ports:
        self.log.debug('waiting for instance ports to be attached')
        self.state = states.BOOTING
        return self.state

    # XXX TODO need to account for when only a subset of the cluster have
    # correct ports, kick back to Replug

    alive, dead = self.instances.are_alive()
    if not alive:
        # alive checked failed on all instances for an already configured
        # resource, mark it down.
        # XXX need to track timeouts per instance
        # self._check_boot_timeout()
        if self.state == states.CONFIGURED:
            self.log.debug('No instance(s) alive, marking it as %s',
                           states.DOWN)
            self.state = states.DOWN
            return self.state
    elif dead:
        # some subset of instances reported not alive, mark it degraded.
        if self.state == states.CONFIGURED:
            for i in dead:
                instance = worker_context.nova_client.get_instance_by_id(
                    i.id_)
                if instance is None and self.state != states.ERROR:
                    # BUG FIX: this branch only runs when Nova returned
                    # no instance, but the message said "was found".
                    self.log.info(
                        'Instance %s was not found; rebooting', i.id_)
                    self.instances.delete(i)
            self.state = states.DEGRADED
            return self.state

    self.instances.refresh(worker_context)

    if self.state == states.CONFIGURED:
        # Record boot duration once per instance the first time it
        # reports fully booted.
        for i in alive:
            if not i.booting and i not in self._boot_logged:
                self.log.info(
                    '%s booted in %s seconds after %s attempts',
                    self.resource.RESOURCE_NAME,
                    i.time_since_boot.total_seconds(),
                    self._boot_counter.count)
                self._boot_logged.append(i)
        self.reset_boot_counter()
    else:
        if alive:
            self.state = states.UP
    return self.state
def __init__(self, client):
    """Initialize the provider with an RPC client for the pez service.

    :param client: API client handed through to the base provider
    """
    super(PezInstanceProvider, self).__init__(client)
    self.rpc_client = pez_api.AstaraPezAPI(rpc_topic='astara-pez')
    # BUG FIX: the original wrapped this in _LI(), the info-level
    # translation marker; debug-level messages are not translated per
    # oslo.i18n guidelines, so log the plain string.
    LOG.debug('Initialized %s with rpc client %s',
              self.__class__.__name__, self.rpc_client)
def update_state(self, worker_context, silent=False):
    """Updates state of the instance(s) and, by extension, the resource

    :param worker_context: worker context providing API clients
    :param silent: accepted for interface compatibility; not consulted
        in this implementation
    :returns: state
    """
    if self.resource.get_state(worker_context) == states.GONE:
        self.log.debug('%s driver reported its state is %s',
                       self.resource.RESOURCE_NAME, states.GONE)
        self.state = states.GONE
        return self.state

    if not self.instances:
        self.log.info(_LI('no backing instance(s), marking as %s'),
                      states.DOWN)
        self.state = states.DOWN
        return self.state
    elif self.instances.cluster_degraded is True:
        self.log.info(
            _LI('instance cluster for resource %s reports degraded'),
            self.resource.id)
        self.state = states.DEGRADED
        return self.state

    has_ports, no_ports = self.instances.validate_ports()
    # ports_state=None means no instances have ports
    if not has_ports:
        self.log.debug('waiting for instance ports to be attached')
        self.state = states.BOOTING
        return self.state

    # XXX TODO need to account for when only a subset of the cluster have
    # correct ports, kick back to Replug

    alive, dead = self.instances.are_alive()
    if not alive:
        # alive checked failed on all instances for an already configured
        # resource, mark it down.
        # XXX need to track timeouts per instance
        # self._check_boot_timeout()
        if self.state == states.CONFIGURED:
            self.log.debug('No instance(s) alive, marking it as %s',
                           states.DOWN)
            self.state = states.DOWN
            return self.state
    elif dead:
        # some subset of instances reported not alive, mark it degraded.
        if self.state == states.CONFIGURED:
            for i in dead:
                instance = worker_context.nova_client.get_instance_by_id(
                    i.id_)
                if instance is None and self.state != states.ERROR:
                    # BUG FIX: this branch only runs when Nova returned
                    # no instance, but the message said "was found".
                    self.log.info('Instance %s was not found; rebooting',
                                  i.id_)
                    self.instances.delete(i)
            self.state = states.DEGRADED
            return self.state

    self.instances.refresh(worker_context)

    if self.state == states.CONFIGURED:
        # Record boot duration once per instance the first time it
        # reports fully booted.
        for i in alive:
            if not i.booting and i not in self._boot_logged:
                self.log.info('%s booted in %s seconds after %s attempts',
                              self.resource.RESOURCE_NAME,
                              i.time_since_boot.total_seconds(),
                              self._boot_counter.count)
                self._boot_logged.append(i)
        self.reset_boot_counter()
    else:
        if alive:
            self.state = states.UP
    return self.state
def delete_instance(self, instance_uuid):
    """Delete the Nova server identified by ``instance_uuid``.

    Logs the deletion, issues the delete through the Nova client, and
    records the request time in the per-instance delete counters.
    """
    LOG.info(_LI('Deleting instance %s.'), instance_uuid)
    servers_api = self.ctxt.nova_client.client.servers
    servers_api.delete(instance_uuid)
    self._delete_counters[instance_uuid] = timeutils.utcnow()
def _dispatch_command(self, target, message):
    """Dispatch an operator command message to its handler branch.

    :param target: routing target (not referenced in this body; kept
        for the dispatch-callback signature)
    :param message: event whose body dict carries the 'command' key and
        its arguments
    :returns: None
    """
    if not self._should_process_command(message):
        return

    instructions = message.body
    if instructions['command'] == commands.WORKERS_DEBUG:
        self.report_status()

    # NOTE(adam_g): Drop 'router-debug' compat in M.
    elif (instructions['command'] == commands.RESOURCE_DEBUG or
          instructions['command'] == commands.ROUTER_DEBUG):

        # Accept both the new 'resource_id' key and the deprecated
        # 'router_id' key.
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(
                _LW('Ignoring instruction to debug resource with no id'))
            return
        reason = instructions.get('reason')
        if resource_id in commands.WILDCARDS:
            LOG.warning(
                _LW('Ignoring instruction to debug all resources with %r'),
                resource_id)
        else:
            LOG.info(_LI('Placing resource %s in debug mode (reason: %s)'),
                     resource_id, reason)
            self.db_api.enable_resource_debug(resource_id, reason)

    elif (instructions['command'] == commands.RESOURCE_MANAGE or
          instructions['command'] == commands.ROUTER_MANAGE):
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(
                _LW('Ignoring instruction to manage resource with no id'))
            return
        try:
            self.db_api.disable_resource_debug(resource_id)
            LOG.info(_LI('Resuming management of resource %s'),
                     resource_id)
        except KeyError:
            # Resource was not in debug mode; nothing to disable.
            pass
        try:
            self._resource_locks[resource_id].release()
            LOG.info(_LI('Unlocked resource %s'), resource_id)
        except KeyError:
            # No lock tracked for this resource.
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass

    elif instructions['command'] in EVENT_COMMANDS:
        resource_id = instructions.get('resource_id')
        sm = self._find_state_machine_by_resource_id(resource_id)
        if not sm:
            LOG.debug(
                'Will not process command, no managed state machine '
                'found for resource %s', resource_id)
            return
        new_res = event.Resource(id=resource_id,
                                 driver=sm.resource.RESOURCE_NAME,
                                 tenant_id=sm.tenant_id)
        new_msg = event.Event(
            resource=new_res,
            crud=EVENT_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_res)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_res)

    # NOTE(adam_g): This is here to support the deprecated old format of
    #               sending commands to specific routers and can be
    #               removed once the CLI component is dropped in M.
    elif instructions['command'] in DEPRECATED_ROUTER_COMMANDS:
        new_rsc = event.Resource(
            driver=drivers.router.Router.RESOURCE_NAME,
            id=message.body.get('router_id'),
            tenant_id=message.body.get('tenant_id'),
        )
        new_msg = event.Event(
            resource=new_rsc,
            crud=DEPRECATED_ROUTER_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_rsc)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_rsc)

    elif instructions['command'] == commands.TENANT_DEBUG:
        tenant_id = instructions['tenant_id']
        reason = instructions.get('reason')
        if tenant_id in commands.WILDCARDS:
            LOG.warning(
                _LW('Ignoring instruction to debug all tenants with %r'),
                tenant_id)
        else:
            LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                     tenant_id, reason)
            self.db_api.enable_tenant_debug(tenant_id, reason)

    elif instructions['command'] == commands.TENANT_MANAGE:
        tenant_id = instructions['tenant_id']
        try:
            self.db_api.disable_tenant_debug(tenant_id)
            LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
        except KeyError:
            # Tenant was not in debug mode; nothing to disable.
            pass

    elif instructions['command'] == commands.GLOBAL_DEBUG:
        enable = instructions.get('enabled')
        reason = instructions.get('reason')
        if enable == 1:
            LOG.info('Enabling global debug mode (reason: %s)', reason)
            self.db_api.enable_global_debug(reason)
        elif enable == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        instructions)
    elif instructions['command'] == commands.CONFIG_RELOAD:
        try:
            # Re-evaluating CONF() re-reads the configuration files.
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
        else:
            cfg.CONF.log_opt_values(LOG, INFO)
    else:
        LOG.warning(_LW('Unrecognized command: %s'), instructions)