Example #1
    def stop(self, worker_context):
        """Attempts to destroy the instance with configured timeout.

        :param worker_context:
        :returns:
        """
        self._ensure_cache(worker_context)
        self.log.info(_LI('Destroying instance'))

        if not self.instance_info:
            self.log.info(_LI('Instance already destroyed.'))
            return

        try:
            worker_context.nova_client.destroy_instance(self.instance_info)
        except Exception:
            self.log.exception(_LE('Error deleting router instance'))

        start = time.time()
        while time.time() - start < cfg.CONF.boot_timeout:
            if not worker_context.nova_client.\
                    get_instance_by_id(self.instance_info.id_):
                if self.state != states.GONE:
                    self.state = states.DOWN
                return self.state
            self.log.debug('Router has not finished stopping')
            time.sleep(cfg.CONF.retry_delay)
        self.log.error(_LE(
            'Router failed to stop within %d secs'),
            cfg.CONF.boot_timeout)
Example #2
    def _wait_child(self):
        try:
            # Don't block if no child processes have exited
            pid, status = os.waitpid(0, os.WNOHANG)
            if not pid:
                return None
        except OSError as exc:
            if exc.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return None

        if os.WIFSIGNALED(status):
            sig = os.WTERMSIG(status)
            LOG.info(_LI('Child %(pid)d killed by signal %(sig)d'),
                     dict(pid=pid, sig=sig))
        else:
            code = os.WEXITSTATUS(status)
            LOG.info(_LI('Child %(pid)s exited with status %(code)d'),
                     dict(pid=pid, code=code))

        if pid not in self.children:
            LOG.warning(_LW('pid %d not in child list'), pid)
            return None

        wrap = self.children.pop(pid)
        wrap.children.remove(pid)
        return wrap
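
For context, os.waitpid(0, os.WNOHANG) returns (0, 0) while children exist but none has exited yet, and raises OSError with errno.ECHILD once there are no children left, which is why both early branches above return None. A minimal, self-contained sketch of that behavior (not part of the example itself):

    import errno
    import os
    import time

    pid = os.fork()
    if pid == 0:
        time.sleep(0.2)
        os._exit(3)

    # Child still running: WNOHANG returns (0, 0) instead of blocking.
    assert os.waitpid(0, os.WNOHANG) == (0, 0)

    time.sleep(0.5)
    reaped, status = os.waitpid(0, os.WNOHANG)
    assert reaped == pid and os.WEXITSTATUS(status) == 3

    # No children left: waitpid() raises OSError with errno.ECHILD.
    try:
        os.waitpid(0, os.WNOHANG)
    except OSError as exc:
        assert exc.errno == errno.ECHILD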
Example #3
    def send_message(self, message):
        "Called when the worker put a message in the state machine queue"
        if self.deleted:
            # Ignore any more incoming messages
            self.log.debug("deleted state machine, ignoring incoming message %s", message)
            return False

        # NOTE(dhellmann): This check is largely redundant with the
        # one in CalcAction.transition() but it may allow us to avoid
        # adding poll events to the queue at all, and therefore cut
        # down on the number of times a worker thread wakes up to
        # process something on a router that isn't going to actually
        # do any work.
        if message.crud == POLL and self.instance.state == instance_manager.ERROR:
            self.log.info(_LI("Router status is ERROR, ignoring POLL message: %s"), message)
            return False

        if message.crud == REBUILD:
            if message.body.get("router_image_uuid"):
                self.log.info(_LI("Router is being REBUILT with custom image %s"), message.body["router_image_uuid"])
                self.router_image_uuid = message.body["router_image_uuid"]
            else:
                self.router_image_uuid = cfg.CONF.router_image_uuid

        self._queue.append(message.crud)
        queue_len = len(self._queue)
        if queue_len > self._queue_warning_threshold:
            logger = self.log.warning
        else:
            logger = self.log.debug
        logger(_LW("incoming message brings queue length to %s"), queue_len)
        return True
Example #4
    def _wait_child(self):
        try:
            # Don't block if no child processes have exited
            pid, status = os.waitpid(0, os.WNOHANG)
            if not pid:
                return None
        except OSError as exc:
            if exc.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return None

        if os.WIFSIGNALED(status):
            sig = os.WTERMSIG(status)
            LOG.info(_LI('Child %(pid)d killed by signal %(sig)d'),
                     dict(pid=pid, sig=sig))
        else:
            code = os.WEXITSTATUS(status)
            LOG.info(_LI('Child %(pid)s exited with status %(code)d'),
                     dict(pid=pid, code=code))

        if pid not in self.children:
            LOG.warning(_LW('pid %d not in child list'), pid)
            return None

        wrap = self.children.pop(pid)
        wrap.children.remove(pid)
        return wrap
Example #5
    def _ensure_local_port(self, network_id, subnet_id,
                           network_type, ip_address):
        driver = importutils.import_object(self.conf.interface_driver,
                                           self.conf)

        host_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, socket.gethostname()))

        name = 'AKANDA:RUG:%s' % network_type.upper()

        query_dict = dict(device_owner=DEVICE_OWNER_RUG,
                          device_id=host_id,
                          name=name,
                          network_id=network_id)

        ports = self.api_client.list_ports(**query_dict)['ports']

        if ports:
            port = Port.from_dict(ports[0])
            LOG.info(_LI('already have local %s port, using %r'),
                     network_type, port)
        else:
            LOG.info(_LI('creating a new local %s port'), network_type)
            port_dict = {
                'admin_state_up': True,
                'network_id': network_id,
                'device_owner': DEVICE_OWNER_ROUTER_INT,  # lying here for IP
                'name': name,
                'device_id': host_id,
                'fixed_ips': [{
                    'ip_address': ip_address.split('/')[0],
                    'subnet_id': subnet_id
                }],
                'binding:host_id': socket.gethostname()
            }
            port = Port.from_dict(
                self.api_client.create_port(dict(port=port_dict))['port'])

            # remove the lie that let us pick an IP on the SLAAC subnet
            self.api_client.update_port(
                port.id,
                {'port': {'device_owner': DEVICE_OWNER_RUG}}
            )
            port.device_owner = DEVICE_OWNER_RUG

            LOG.info(_LI('new local %s port: %r'), network_type, port)

        # create the tap interface if it doesn't already exist
        if not ip_lib.device_exists(driver.get_device_name(port)):
            driver.plug(
                port.network_id,
                port.id,
                driver.get_device_name(port),
                port.mac_address)

            # add a sleep to ensure the port is set up before use
            time.sleep(1)

        driver.init_l3(driver.get_device_name(port), [ip_address])
        return port
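
A note on the host_id above: uuid.uuid5() is deterministic for a given namespace and name, so every run on the same host computes the same device_id, and the list_ports() query can find the port created by an earlier run. A quick illustration:

    import socket
    import uuid

    host_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, socket.gethostname()))
    # The same namespace and name always yield the same UUID.
    assert host_id == str(uuid.uuid5(uuid.NAMESPACE_DNS, socket.gethostname()))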
Example #6
    def update_state(self, worker_context, silent=False):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.debug("not updating state of deleted router")
            return self.state

        if self.instance_info is None:
            self.log.debug("no backing instance, marking router as down")
            self.state = DOWN
            return self.state

        addr = self.instance_info.management_address
        for i in xrange(cfg.CONF.max_retries):
            if router_api.is_alive(addr, cfg.CONF.akanda_mgt_service_port):
                if self.state != CONFIGURED:
                    self.state = UP
                break
            if not silent:
                self.log.debug("Alive check failed. Attempt %d of %d", i, cfg.CONF.max_retries)
            time.sleep(cfg.CONF.retry_delay)
        else:
            old_state = self.state
            self._check_boot_timeout()

            # If the router isn't responding, make sure Nova knows about it
            instance = worker_context.nova_client.get_instance_for_obj(self.router_id)
            if instance is None and self.state != ERROR:
                self.log.info(_LI("No instance was found; rebooting"))
                self.state = DOWN
                self.instance_info = None

            # update_state() is called from Alive() to check the
            # status of the router. If we can't talk to the API at
            # that point, the router should be considered missing and
            # we should reboot it, so mark it down if we think it was
            # configured before.
            if old_state == CONFIGURED and self.state != ERROR:
                self.log.debug("Did not find router alive, marking it as down")
                self.state = DOWN

        # After the router is all the way up, record how long it took
        # to boot and accept a configuration.
        if self.instance_info.booting and self.state == CONFIGURED:
            # If we didn't boot the server (because we were restarted
            # while it remained running, for example), we won't have a
            # duration to log.
            self.instance_info.confirm_up()
            if self.instance_info.boot_duration:
                self.log.info(
                    _LI("Router booted in %s seconds after %s attempts"),
                    self.instance_info.boot_duration.total_seconds(),
                    self._boot_counter.count,
                )
            # Always reset the boot counter, even if we didn't boot
            # the server ourself, so we don't accidentally think we
            # have an erroring router.
            self._boot_counter.reset()
        return self.state
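
The retry loop in update_state() above relies on Python's for/else: the else branch runs only when the loop finishes without hitting break, i.e. when every alive check failed. A stripped-down sketch of the idiom:

    def any_attempt_succeeds(check, attempts):
        for _ in range(attempts):
            if check():
                break          # success skips the else branch
        else:
            return False       # loop exhausted without a break
        return True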
Example #7
 def __init__(self, id_, name, tenant_id, network_id, ip_version, cidr,
              gateway_ip, enable_dhcp, dns_nameservers, host_routes,
              ipv6_ra_mode):
     self.id = id_
     self.name = name
     self.tenant_id = tenant_id
     self.network_id = network_id
     self.ip_version = ip_version
     try:
         self.cidr = netaddr.IPNetwork(cidr)
     except (TypeError, netaddr.AddrFormatError) as e:
         raise ValueError(
             _('Invalid CIDR %r for subnet %s of network %s: %s') % (
                 cidr, id_, network_id, e,
             )
         )
     try:
         self.gateway_ip = netaddr.IPAddress(gateway_ip)
     except (TypeError, netaddr.AddrFormatError) as e:
         self.gateway_ip = None
         LOG.info(_LI('Bad gateway_ip on subnet %s: %r (%s)'),
                  id_, gateway_ip, e)
     self.enable_dhcp = enable_dhcp
     self.dns_nameservers = dns_nameservers
     self.host_routes = host_routes
     self.ipv6_ra_mode = ipv6_ra_mode
Example #8
 def __init__(self, id_, name, tenant_id, network_id, ip_version, cidr,
              gateway_ip, enable_dhcp, dns_nameservers, host_routes,
              ipv6_ra_mode):
     self.id = id_
     self.name = name
     self.tenant_id = tenant_id
     self.network_id = network_id
     self.ip_version = ip_version
     try:
         self.cidr = netaddr.IPNetwork(cidr)
     except (TypeError, netaddr.AddrFormatError) as e:
         raise ValueError(
             _('Invalid CIDR %r for subnet %s of network %s: %s') % (
                 cidr,
                 id_,
                 network_id,
                 e,
             ))
     try:
         self.gateway_ip = netaddr.IPAddress(gateway_ip)
     except (TypeError, netaddr.AddrFormatError) as e:
         self.gateway_ip = None
         LOG.info(_LI('Bad gateway_ip on subnet %s: %r (%s)'), id_,
                  gateway_ip, e)
     self.enable_dhcp = enable_dhcp
     self.dns_nameservers = dns_nameservers
     self.host_routes = host_routes
     self.ipv6_ra_mode = ipv6_ra_mode
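
The two Subnet constructors above lean on netaddr for validation: a bad CIDR is fatal, while a bad gateway is merely logged. A small sketch of the underlying netaddr behavior (standalone, not the class itself):

    import netaddr

    netaddr.IPNetwork('10.0.0.0/24')   # parses cleanly
    try:
        netaddr.IPNetwork('not-a-cidr')
    except netaddr.AddrFormatError:
        pass  # __init__ re-raises this as ValueError
    try:
        netaddr.IPAddress(None)
    except (TypeError, netaddr.AddrFormatError):
        pass  # __init__ logs this and leaves gateway_ip as None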
Example #9
    def _should_process(self, message):
        """Determines whether a message should be processed or not."""
        global_debug, reason = self.db_api.global_debug()
        if global_debug:
            LOG.info('Skipping incoming event, cluster in global debug '
                     'mode. (reason: %s)', reason)
            return False

        if message.resource.id not in commands.WILDCARDS:
            message = self._populate_resource_id(message)
            if not message.resource.id:
                LOG.info(_LI('Ignoring message with no resource found.'))
                return False

            should_ignore, reason = \
                self.db_api.tenant_in_debug(message.resource.tenant_id)
            if should_ignore:
                LOG.info(
                    'Ignoring message intended for tenant %s in debug mode '
                    '(reason: %s): %s',
                    message.resource.tenant_id, reason, message,
                )
                return False

            should_ignore, reason = self.db_api.resource_in_debug(
                message.resource.id)
            if should_ignore:
                LOG.info(
                    'Ignoring message intended for resource %s in '
                    'debug mode (reason: %s): %s',
                    message.resource.id, reason, message,
                )
                return False

        return message
Example #10
 def run(self, ip_address, port=cfg.CONF.rug_api_port):
     app = RugAPI()
     for i in xrange(5):
         LOG.info(
             _LI('Starting the rug-api on %s/%s'),
             ip_address,
             port,
         )
         try:
             sock = eventlet.listen((ip_address, port),
                                    family=socket.AF_INET6,
                                    backlog=128)
         except socket.error as err:
             if err.errno != 99:  # EADDRNOTAVAIL
                 raise
             LOG.warning(_LW('Could not create rug-api socket: %s'), err)
             LOG.warning(_LW('Sleeping %s before trying again'), i + 1)
             eventlet.sleep(i + 1)
         else:
             break
     else:
         raise RuntimeError(
             _('Could not establish rug-api socket on %s/%s') %
             (ip_address, port))
     eventlet.wsgi.server(sock,
                          app,
                          custom_pool=self.pool,
                          log=loggers.WritableLogger(LOG))
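
The magic number 99 checked above is errno.EADDRNOTAVAIL on Linux. A more portable sketch of the same bind-and-retry guard, written against the stdlib socket module and the errno constant instead of eventlet and the literal (an assumption: plain blocking sockets are acceptable here):

    import errno
    import socket
    import time

    def listen_with_retry(ip_address, port, attempts=5, backlog=128):
        for i in range(attempts):
            try:
                sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
                sock.bind((ip_address, port))
                sock.listen(backlog)
                return sock
            except socket.error as err:
                if err.errno != errno.EADDRNOTAVAIL:
                    raise
                time.sleep(i + 1)  # linear backoff, as in the example
        raise RuntimeError('could not bind %s/%s' % (ip_address, port))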
Example #11
 def run(self, ip_address, port=RUG_META_PORT):
     app = MetadataProxyHandler()
     for i in xrange(5):
         LOG.info(_LI('Starting the metadata proxy on %s/%s'), ip_address,
                  port)
         try:
             sock = eventlet.listen((ip_address, port),
                                    family=socket.AF_INET6,
                                    backlog=128)
         except socket.error as err:
             if err.errno != 99:
                 raise
             LOG.warning(_LW('Could not create metadata proxy socket: %s'),
                         err)
             LOG.warning(_LW('Sleeping %s before trying again'), i + 1)
             eventlet.sleep(i + 1)
         else:
             break
     else:
         raise RuntimeError(
             _('Could not establish metadata proxy socket on %s/%s') %
             (ip_address, port))
     eventlet.wsgi.server(sock,
                          app,
                          custom_pool=self.pool,
                          log=loggers.WritableLogger(LOG))
Example #12
    def _pipe_watcher(self):
        # This will block until the write end is closed when the parent
        # dies unexpectedly
        self.readpipe.read()

        LOG.info(_LI('Parent process has died unexpectedly, exiting'))

        sys.exit(1)
Example #13
    def _pipe_watcher(self):
        # This will block until the write end is closed when the parent
        # dies unexpectedly
        self.readpipe.read()

        LOG.info(_LI('Parent process has died unexpectedly, exiting'))

        sys.exit(1)
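
The snippet above only shows the child side; for it to work, the parent must hold the write end of a pipe open for its entire life, so the child's blocking read() returns when the parent dies. A sketch of the assumed setup:

    import os

    read_fd, write_fd = os.pipe()
    pid = os.fork()
    if pid == 0:
        # Child keeps only the read end. When the parent dies, the
        # kernel closes the write end and read() returns ''.
        os.close(write_fd)
        readpipe = os.fdopen(read_fd, 'r')
        readpipe.read()   # blocks until the parent exits
        os._exit(1)
    else:
        # Parent keeps only the write end open.
        os.close(read_fd)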
Example #14
    def boot(self, worker_context, router_image_uuid):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.info(_LI("Not booting deleted router"))
            return

        self.log.info(_LI("Booting router"))
        self.state = DOWN
        self._boot_counter.start()

        def make_vrrp_ports():
            mgt_port = worker_context.neutron.create_management_port(self.router_obj.id)

            # FIXME(mark): ideally this should be ordered and de-duped
            instance_ports = [
                worker_context.neutron.create_vrrp_port(self.router_obj.id, n)
                for n in (p.network_id for p in self.router_obj.ports)
            ]

            return mgt_port, instance_ports

        try:
            # TODO(mark): make this pluggable
            self._ensure_provider_ports(self.router_obj, worker_context)

            # TODO(mark): make this handle errors more gracefully on cb fail
            # TODO(mark): checkout from a pool - boot on demand for now
            instance_info = worker_context.nova_client.boot_instance(
                self.instance_info, self.router_obj.id, router_image_uuid, make_vrrp_ports
            )
            if not instance_info:
                self.log.info(_LI("Previous router is deleting"))
                # Reset the VM manager, causing the state machine to start
                # again with a new VM.
                self.reset_boot_counter()
                self.instance_info = None
                return
        except:
            self.log.exception(_LE("Router failed to start boot"))
            # TODO(mark): attempt clean-up of failed ports
            return
        else:
            # We have successfully started a (re)boot attempt so
            # record the timestamp so we can report how long it takes.
            self.state = BOOTING
            self.instance_info = instance_info
Example #15
    def report_status(self, show_config=True):
        if show_config:
            cfg.CONF.log_opt_values(LOG, INFO)
        LOG.info(_LI(
            'Number of state machines in work queue: %d'),
            self.work_queue.qsize()
        )
        LOG.info(_LI(
            'Number of tenant resource managers managed: %d'),
            len(self.tenant_managers)
        )
        for thread in self.threads:
            LOG.info(_LI(
                'Thread %s is %s. Last seen: %s'),
                thread.name,
                'alive' if thread.isAlive() else 'DEAD',
                self._thread_status.get(thread.name, 'UNKNOWN'),
            )
        debug_tenants = self.db_api.tenants_in_debug()
        if debug_tenants:
            for t_uuid, reason in debug_tenants:
                LOG.info(_LI('Debugging tenant: %s (reason: %s)'),
                         t_uuid, reason)
        else:
            LOG.info(_LI('No tenants in debug mode'))

        debug_resources = self.db_api.resources_in_debug()
        if debug_resources:
            for resource_id, reason in debug_resources:
                LOG.info(_LI('Debugging resource: %s (reason: %s)'),
                         resource_id, reason)
        else:
            LOG.info(_LI('No resources in debug mode'))
Example #16
    def _start_child(self, wrap):
        if len(wrap.forktimes) > wrap.workers:
            # Limit ourselves to one process per second (over a period of
            # number of workers * 1 second). This allows workers to
            # start up quickly, but ensures we don't keep forking
            # children that die instantly.
            if time.time() - wrap.forktimes[0] < wrap.workers:
                LOG.info(_LI('Forking too fast, sleeping'))
                time.sleep(1)

            wrap.forktimes.pop(0)

        wrap.forktimes.append(time.time())

        pid = os.fork()
        if pid == 0:
            # NOTE(johannes): All exceptions are caught to ensure this
            # doesn't fall back into the loop spawning children. It would
            # be bad for a child to spawn more children.
            status = 0
            try:
                self._child_process(wrap.service)
            except SignalExit as exc:
                signame = {
                    signal.SIGTERM: 'SIGTERM',
                    signal.SIGINT: 'SIGINT'
                }[exc.signo]
                LOG.info(_LI('Caught %s, exiting'), signame)
                status = exc.code
            except SystemExit as exc:
                status = exc.code
            except BaseException:
                LOG.exception(_LE('Unhandled exception'))
                status = 2
            finally:
                wrap.service.stop()

            os._exit(status)

        LOG.info(_LI('Started child %d'), pid)

        wrap.children.add(pid)
        self.children[pid] = wrap

        return pid
Example #17
    def _start_child(self, wrap):
        if len(wrap.forktimes) > wrap.workers:
            # Limit ourselves to one process per second (over a period of
            # number of workers * 1 second). This allows workers to
            # start up quickly, but ensures we don't keep forking
            # children that die instantly.
            if time.time() - wrap.forktimes[0] < wrap.workers:
                LOG.info(_LI('Forking too fast, sleeping'))
                time.sleep(1)

            wrap.forktimes.pop(0)

        wrap.forktimes.append(time.time())

        pid = os.fork()
        if pid == 0:
            # NOTE(johannes): All exceptions are caught to ensure this
            # doesn't fall back into the loop spawning children. It would
            # be bad for a child to spawn more children.
            status = 0
            try:
                self._child_process(wrap.service)
            except SignalExit as exc:
                signame = {signal.SIGTERM: 'SIGTERM',
                           signal.SIGINT: 'SIGINT'}[exc.signo]
                LOG.info(_LI('Caught %s, exiting'), signame)
                status = exc.code
            except SystemExit as exc:
                status = exc.code
            except BaseException:
                LOG.exception(_LE('Unhandled exception'))
                status = 2
            finally:
                wrap.service.stop()

            os._exit(status)

        LOG.info(_LI('Started child %d'), pid)

        wrap.children.add(pid)
        self.children[pid] = wrap

        return pid
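
The throttle at the top of _start_child() keeps a sliding window of the last `workers` fork timestamps; if the oldest one is less than `workers` seconds old, children are being forked faster than one per second on average. Reduced to a predicate (a sketch, not project code):

    import time

    def forking_too_fast(forktimes, workers):
        # True when more than `workers` forks happened within the last
        # `workers` seconds, i.e. above one fork per second on average.
        return (len(forktimes) > workers and
                time.time() - forktimes[0] < workers)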
Example #18
 def execute(self, action, worker_context):
     # Check for a loop where the router keeps failing to boot or
     # accept the configuration.
     if self.instance.attempts >= self.params.reboot_error_threshold:
         self.log.info(_LI("Dropping out of boot loop after %s trials"), self.instance.attempts)
         self.instance.set_error(worker_context)
         return action
     self.instance.boot(worker_context, self.router_image_uuid)
     self.log.debug("CreateInstance attempt %s/%s", self.instance.attempts, self.params.reboot_error_threshold)
     return action
Example #19
    def check_boot(self, worker_context):
        ready_states = (UP, CONFIGURED)
        if self.update_state(worker_context, silent=True) in ready_states:
            self.log.info(_LI("Router has booted, attempting initial config"))
            self.configure(worker_context, BOOTING, attempts=1)
            if self.state != CONFIGURED:
                self._check_boot_timeout()
            return self.state == CONFIGURED

        self.log.debug("Router is %s", self.state.upper())
        return False
Example #20
 def update_router_status(self, router_id, status):
     try:
         self.api_client.update_router_status(router_id, status)
     except Exception as e:
         # We don't want to die just because we can't tell neutron
         # what the status of the router should be. Log the error
         # but otherwise ignore it.
         LOG.info(_LI(
             'ignoring failure to update status for router %s to %s: %s'),
             router_id, status, e,
         )
Example #21
 def get_network_subnets(self, network_id):
     response = []
     subnet_response = self.api_client.list_subnets(network_id=network_id)
     subnets = subnet_response['subnets']
     for s in subnets:
         try:
             response.append(Subnet.from_dict(s))
         except Exception as e:
             LOG.info(_LI('ignoring subnet %s (%s) on network %s: %s'),
                      s.get('id'), s.get('cidr'), network_id, e)
     return response
Example #22
 def get_network_subnets(self, network_id):
     response = []
     subnet_response = self.api_client.list_subnets(network_id=network_id)
     subnets = subnet_response['subnets']
     for s in subnets:
         try:
             response.append(Subnet.from_dict(s))
         except Exception as e:
             LOG.info(_LI('ignoring subnet %s (%s) on network %s: %s'),
                      s.get('id'), s.get('cidr'),
                      network_id, e)
     return response
Example #23
    def stop(self, worker_context):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.info(_LI("Destroying router neutron has deleted"))
        else:
            self.log.info(_LI("Destroying router"))

        try:
            nova_client = worker_context.nova_client
            nova_client.destroy_instance(self.instance_info)
        except Exception:
            self.log.exception(_LE("Error deleting router instance"))

        start = time.time()
        while time.time() - start < cfg.CONF.boot_timeout:
            if not nova_client.get_instance_by_id(self.instance_info.id_):
                if self.state != GONE:
                    self.state = DOWN
                return
            self.log.debug("Router has not finished stopping")
            time.sleep(cfg.CONF.retry_delay)
        self.log.error(_LE("Router failed to stop within %d secs"), cfg.CONF.boot_timeout)
Example #24
    def send_message(self, message):
        "Called when the worker put a message in the state machine queue"
        if self.deleted:
            # Ignore any more incoming messages
            self.driver.log.debug(
                'deleted state machine, ignoring incoming message %s', message)
            return False

        # NOTE(dhellmann): This check is largely redundant with the
        # one in CalcAction.transition() but it may allow us to avoid
        # adding poll events to the queue at all, and therefore cut
        # down on the number of times a worker thread wakes up to
        # process something on a router that isn't going to actually
        # do any work.
        if message.crud == POLL and \
                self.instance.state == states.ERROR:
            self.driver.log.info(
                _LI('Resource status is ERROR, ignoring POLL message: %s'),
                message,
            )
            return False

        if message.crud == REBUILD:
            if message.body.get('image_uuid'):
                self.driver.log.info(
                    _LI('Resource is being REBUILT with custom image %s'),
                    message.body['image_uuid'])
                self.image_uuid = message.body['image_uuid']
            else:
                self.image_uuid = self.driver.image_uuid

        self._queue.append(message.crud)
        queue_len = len(self._queue)
        if queue_len > self._queue_warning_threshold:
            logger = self.driver.log.warning
        else:
            logger = self.driver.log.debug
        logger(_LW('incoming message brings queue length to %s'), queue_len)
        return True
Example #25
 def update_router_status(self, router_id, status):
     try:
         self.api_client.update_router_status(router_id, status)
     except Exception as e:
         # We don't want to die just because we can't tell neutron
         # what the status of the router should be. Log the error
         # but otherwise ignore it.
         LOG.info(
             _LI('ignoring failure to update status for %s to %s: %s'),
              router_id,
             status,
             e,
         )
Example #26
 def execute(self, action, worker_context):
     # Check for a loop where the resource keeps failing to boot or
     # accept the configuration.
     if self.instance.attempts >= self.params.reboot_error_threshold:
         self.params.driver.log.info(
             _LI('Dropping out of boot loop after '
                  '%s trials'), self.instance.attempts)
         self.instance.set_error(worker_context)
         return action
     self.instance.boot(worker_context)
     self.params.driver.log.debug('CreateInstance attempt %s/%s',
                                  self.instance.attempts,
                                  self.params.reboot_error_threshold)
     return action
Example #27
 def execute(self, action, worker_context):
     # Check for a loop where the resource keeps failing to boot or
     # accept the configuration.
     if self.instance.attempts >= self.params.reboot_error_threshold:
         self.params.driver.log.info(_LI('Dropping out of boot loop after '
                                      '%s trials'),
                                     self.instance.attempts)
         self.instance.set_error(worker_context)
         return action
     self.instance.boot(worker_context)
     self.params.driver.log.debug('CreateInstance attempt %s/%s',
                                  self.instance.attempts,
                                  self.params.reboot_error_threshold)
     return action
Example #28
    def wait(self):
        """Loop waiting on children to die and respawning as necessary"""

        LOG.debug('Full set of CONF:')
        CONF.log_opt_values(LOG, std_logging.DEBUG)

        while self.running:
            wrap = self._wait_child()
            if not wrap:
                # Yield to other threads if no children have exited
                # Sleep for a short time to avoid excessive CPU usage
                # (see bug #1095346)
                eventlet.greenthread.sleep(.01)
                continue

            while self.running and len(wrap.children) < wrap.workers:
                self._start_child(wrap)

        if self.sigcaught:
            signame = {
                signal.SIGTERM: 'SIGTERM',
                signal.SIGINT: 'SIGINT'
            }[self.sigcaught]
            LOG.info(_LI('Caught %s, stopping children'), signame)

        for pid in self.children:
            try:
                os.kill(pid, signal.SIGTERM)
            except OSError as exc:
                if exc.errno != errno.ESRCH:
                    raise

        # Wait for children to die
        if self.children:
            LOG.info(_LI('Waiting on %d children to exit'), len(self.children))
            while self.children:
                self._wait_child()
Example #29
    def wait(self):
        """Loop waiting on children to die and respawning as necessary"""

        LOG.debug('Full set of CONF:')
        CONF.log_opt_values(LOG, std_logging.DEBUG)

        while self.running:
            wrap = self._wait_child()
            if not wrap:
                # Yield to other threads if no children have exited
                # Sleep for a short time to avoid excessive CPU usage
                # (see bug #1095346)
                eventlet.greenthread.sleep(.01)
                continue

            while self.running and len(wrap.children) < wrap.workers:
                self._start_child(wrap)

        if self.sigcaught:
            signame = {signal.SIGTERM: 'SIGTERM',
                       signal.SIGINT: 'SIGINT'}[self.sigcaught]
            LOG.info(_LI('Caught %s, stopping children'), signame)

        for pid in self.children:
            try:
                os.kill(pid, signal.SIGTERM)
            except OSError as exc:
                if exc.errno != errno.ESRCH:
                    raise

        # Wait for children to die
        if self.children:
            LOG.info(
                _LI('Waiting on %d children to exit'), len(self.children))
            while self.children:
                self._wait_child()
Example #30
 def stop(self):
     """Shutdown all workers cleanly.
     """
     LOG.info('shutting down scheduler')
     # Send a poison pill to all of the workers
     for w in self.workers:
         LOG.debug('sending stop message to %s', w['worker'].name)
         w['queue'].put(None)
     # Wait for the workers to finish and be ready to exit.
     for w in self.workers:
         LOG.debug('waiting for queue for %s', w['worker'].name)
         w['queue'].close()
         LOG.debug('waiting for worker %s', w['worker'].name)
         w['worker'].join()
     LOG.info(_LI('scheduler shutdown'))
Example #31
 def stop(self):
     """Shutdown all workers cleanly.
     """
     LOG.info('shutting down scheduler')
     # Send a poison pill to all of the workers
     for w in self.workers:
         LOG.debug('sending stop message to %s', w['worker'].name)
         w['queue'].put(None)
     # Wait for the workers to finish and be ready to exit.
     for w in self.workers:
         LOG.debug('waiting for queue for %s', w['worker'].name)
         w['queue'].close()
         LOG.debug('waiting for worker %s', w['worker'].name)
         w['worker'].join()
     LOG.info(_LI('scheduler shutdown'))
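
stop() above implements the producer half of a poison-pill shutdown: a None on each queue tells the worker to exit, and join() waits for it. The consuming side, sketched here with a hypothetical handler argument, looks like:

    def worker_loop(queue, handle):
        """Consume messages until the poison pill (None) arrives."""
        while True:
            message = queue.get()
            if message is None:
                break        # poison pill: exit cleanly so join() returns
            handle(message)  # hypothetical per-message handler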
Example #32
 def run(self, ip_address, port=cfg.CONF.rug_api_port):
     app = RugAPI()
     for i in xrange(5):
         LOG.info(_LI("Starting the rug-api on %s/%s"), ip_address, port)
         try:
             sock = eventlet.listen((ip_address, port), family=socket.AF_INET6, backlog=128)
         except socket.error as err:
             if err.errno != 99:  # EADDRNOTAVAIL
                 raise
             LOG.warning(_LW("Could not create rug-api socket: %s"), err)
             LOG.warning(_LW("Sleeping %s before trying again"), i + 1)
             eventlet.sleep(i + 1)
         else:
             break
     else:
         raise RuntimeError(_("Could not establish rug-api socket on %s/%s") % (ip_address, port))
     eventlet.wsgi.server(sock, app, custom_pool=self.pool, log=loggers.WritableLogger(LOG))
Example #33
def ignore_signals():
    """Ignore signals that might interrupt processing

    Since the RUG doesn't want to be asynchronously interrupted,
    several signals need to be ignored. SIGHUP, SIGALRM, SIGUSR1,
    and SIGUSR2 are all registered with the SIG_IGN action so that
    the process discards them.

    :param: None

    :returns: None

    """
    for s in [signal.SIGHUP, signal.SIGUSR1, signal.SIGUSR2, signal.SIGALRM]:
        logging.getLogger(__name__).info(_LI('ignoring signal %s'), s)
        signal.signal(s, signal.SIG_IGN)
Example #34
 def run(self, ip_address, port=RUG_META_PORT):
     app = MetadataProxyHandler()
     for i in xrange(5):
         LOG.info(_LI("Starting the metadata proxy on %s/%s"), ip_address, port)
         try:
             sock = eventlet.listen((ip_address, port), family=socket.AF_INET6, backlog=128)
         except socket.error as err:
             if err.errno != 99:
                 raise
             LOG.warning(_LW("Could not create metadata proxy socket: %s"), err)
             LOG.warning(_LW("Sleeping %s before trying again"), i + 1)
             eventlet.sleep(i + 1)
         else:
             break
     else:
         raise RuntimeError(_("Could not establish metadata proxy socket on %s/%s") % (ip_address, port))
     eventlet.wsgi.server(sock, app, custom_pool=self.pool, log=loggers.WritableLogger(LOG))
Example #35
def shuffle_notifications(notification_queue, sched):
    """Copy messages from the notification queue into the scheduler.
    """
    while True:
        try:
            target, message = notification_queue.get()
            if target is None:
                break
            sched.handle_message(target, message)
        except IOError:
            # FIXME(rods): if a signal arrives during an I/O operation
            # an IOError is raised. We catch the exception here while
            # waiting for a better solution.
            pass
        except KeyboardInterrupt:
            LOG.info(_LI('got Ctrl-C'))
            break
        except:
            LOG.exception(_LE('unhandled exception processing message'))
Example #36
def shuffle_notifications(notification_queue, sched):
    """Copy messages from the notification queue into the scheduler.
    """
    while True:
        try:
            target, message = notification_queue.get()
            if target is None:
                break
            sched.handle_message(target, message)
        except IOError:
            # FIXME(rods): if a signal arrives during an I/O operation
            # an IOError is raised. We catch the exception here while
            # waiting for a better solution.
            pass
        except KeyboardInterrupt:
            LOG.info(_LI('got Ctrl-C'))
            break
        except:
            LOG.exception(_LE('unhandled exception processing message'))
Example #37
    def boot(self, worker_context):
        """Boots the instance with driver pre/post boot hooks.

        :returns: None
        """
        self._ensure_cache(worker_context)

        self.log.info('Booting %s' % self.driver.RESOURCE_NAME)
        self.state = states.DOWN
        self._boot_counter.start()

        # driver preboot hook
        self.driver.pre_boot(worker_context)

        # try to boot the instance
        try:
            instance_info = worker_context.nova_client.boot_instance(
                self.instance_info,
                self.driver.name,
                self.driver.image_uuid,
                self.driver.flavor,
                self.driver.make_ports(worker_context)
            )
            if not instance_info:
                self.log.info(_LI('Previous instance is still deleting'))
                # Reset the boot counter, causing the state machine to start
                # again with a new Instance.
                self.reset_boot_counter()
                self.instance_info = None
                return
        except:
            self.log.exception(_LE('Instance failed to start boot'))
            return
        else:
            # We have successfully started a (re)boot attempt so
            # record the timestamp so we can report how long it takes.
            self.state = states.BOOTING
            self.instance_info = instance_info

        # driver post boot hook
        self.driver.post_boot(worker_context)
Example #38
    def _check_boot_timeout(self):
        time_since_boot = self.instance_info.time_since_boot

        if time_since_boot:
            if time_since_boot.seconds < cfg.CONF.boot_timeout:
                # Do not reset the state if we have an error
                # condition already. The state will be reset when
                # the router starts responding again, or when the
                # error is cleared from a forced rebuild.
                if self.state != ERROR:
                    self.state = BOOTING
            else:
                # If the instance was created more than `boot_timeout` seconds
                # ago, log an error and set the state set to DOWN
                self.log.info(_LI("Router is DOWN.  Created over %d secs ago."), cfg.CONF.boot_timeout)
                # Do not reset the state if we have an error condition
                # already. The state will be reset when the router starts
                # responding again, or when the error is cleared from a
                # forced rebuild.
                if self.state != ERROR:
                    self.state = DOWN
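
One subtlety in _check_boot_timeout() above: timedelta.seconds is only the remainder within the current day, not the total elapsed time, so the comparison behaves differently from total_seconds() once an instance has been up for more than a day. For illustration:

    from datetime import timedelta

    uptime = timedelta(days=1, seconds=5)
    print(uptime.seconds)          # 5 -- wraps at each full day
    print(uptime.total_seconds())  # 86405.0 -- total elapsed seconds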
Example #39
def get_default_v4_gateway(client, router, networks):
    """Find the IPv4 default gateway for the router.
    """
    LOG.debug('networks = %r', networks)
    LOG.debug('external interface = %s', router.external_port.mac_address)

    # Now find the subnet that our external IP is on, and return its
    # gateway.
    for n in networks:
        if n['network_type'] == EXTERNAL_NET:
            v4_addresses = [
                addr
                for addr in (netaddr.IPAddress(ip.partition('/')[0])
                             for ip in n['interface']['addresses'])
                if addr.version == 4
            ]
            for s in n['subnets']:
                subnet = netaddr.IPNetwork(s['cidr'])
                if subnet.version != 4:
                    continue
                LOG.debug(
                    '%s: checking if subnet %s should have the default route',
                    router.id, s['cidr'])
                for addr in v4_addresses:
                    if addr in subnet:
                        LOG.debug(
                            '%s: found gateway %s for subnet %s on network %s',
                            router.id,
                            s['gateway_ip'],
                            s['cidr'],
                            n['network_id'],
                        )
                        return s['gateway_ip']

    # Sometimes we are asked to build a configuration for the server
    # when the external interface is still marked as "down". We can
    # report that case, but we don't treat it as an error here because
    # we'll be asked to do it again when the interface comes up.
    LOG.info(_LI('%s: no default gateway was found'), router.id)
    return ''
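
The gateway search above hinges on two netaddr operations: the version attribute to filter for IPv4, and the in operator to test subnet membership. A minimal demonstration:

    import netaddr

    subnet = netaddr.IPNetwork('192.0.2.0/24')
    addr = netaddr.IPAddress('192.0.2.10')
    assert addr.version == 4   # IPv4 filter used for v4_addresses
    assert addr in subnet      # membership test used to pick the gateway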
Example #40
def get_default_v4_gateway(client, router, networks):
    """Find the IPv4 default gateway for the router.
    """
    LOG.debug('networks = %r', networks)
    LOG.debug('external interface = %s', router.external_port.mac_address)

    # Now find the subnet that our external IP is on, and return its
    # gateway.
    for n in networks:
        if n['network_type'] == EXTERNAL_NET:
            v4_addresses = [
                addr for addr in (netaddr.IPAddress(ip.partition('/')[0])
                                  for ip in n['interface']['addresses'])
                if addr.version == 4
            ]
            for s in n['subnets']:
                subnet = netaddr.IPNetwork(s['cidr'])
                if subnet.version != 4:
                    continue
                LOG.debug(
                    '%s: checking if subnet %s should have the default route',
                    router.id, s['cidr'])
                for addr in v4_addresses:
                    if addr in subnet:
                        LOG.debug(
                            '%s: found gateway %s for subnet %s on network %s',
                            router.id,
                            s['gateway_ip'],
                            s['cidr'],
                            n['network_id'],
                        )
                        return s['gateway_ip']

    # Sometimes we are asked to build a configuration for the server
    # when the external interface is still marked as "down". We can
    # report that case, but we don't treat it as an error here because
    # we'll be asked to do it again when the interface comes up.
    LOG.info(_LI('%s: no default gateway was found'), router.id)
    return ''
Example #41
    def wait(self):
        signal.signal(signal.SIGTERM, self._handle_signal)
        signal.signal(signal.SIGINT, self._handle_signal)

        LOG.debug('Full set of CONF:')
        CONF.log_opt_values(LOG, std_logging.DEBUG)

        status = None
        try:
            super(ServiceLauncher, self).wait()
        except SignalExit as exc:
            signame = {signal.SIGTERM: 'SIGTERM',
                       signal.SIGINT: 'SIGINT'}[exc.signo]
            LOG.info(_LI('Caught %s, exiting'), signame)
            status = exc.code
        except SystemExit as exc:
            status = exc.code
        finally:
            if rpc:
                rpc.cleanup()
            self.stop()
        return status
Example #42
    def wait(self):
        signal.signal(signal.SIGTERM, self._handle_signal)
        signal.signal(signal.SIGINT, self._handle_signal)

        LOG.debug('Full set of CONF:')
        CONF.log_opt_values(LOG, std_logging.DEBUG)

        status = None
        try:
            super(ServiceLauncher, self).wait()
        except SignalExit as exc:
            signame = {
                signal.SIGTERM: 'SIGTERM',
                signal.SIGINT: 'SIGINT'
            }[exc.signo]
            LOG.info(_LI('Caught %s, exiting'), signame)
            status = exc.code
        except SystemExit as exc:
            status = exc.code
        finally:
            if rpc:
                rpc.cleanup()
            self.stop()
        return status
Example #43
def main(argv=sys.argv[1:]):
    """Main Entry point into the akanda-rug

    This is the main entry point into the akanda-rug. On invocation of
    this method, logging, local network connectivity setup is performed.
    This information is obtained through the 'ak-config' file, passed as
    arguement to this method. Worker threads are spawned for handling
    various tasks that are associated with processing as well as
    responding to different Neutron events prior to starting a notification
    dispatch loop.

    :param argv: list of command line arguments

    :returns: None

    :raises: None

    """
    # TODO(rama) Error Handling to be added as part of the docstring
    # description

    # Change the process and thread name so the logs are cleaner.
    p = multiprocessing.current_process()
    p.name = 'pmain'
    t = threading.current_thread()
    t.name = 'tmain'
    ak_cfg.parse_config(argv)
    log.setup(cfg.CONF, 'akanda-rug')
    cfg.CONF.log_opt_values(LOG, logging.INFO)

    neutron = neutron_api.Neutron(cfg.CONF)

    # TODO(mark): develop a better way to restore after machine reboot
    # neutron.purge_management_interface()

    # bring the mgt tap interface up
    neutron.ensure_local_service_port()

    # bring the external port up
    if cfg.CONF.plug_external_port:
        neutron.ensure_local_external_port()

    # Set up the queue to move messages between the eventlet-based
    # listening process and the scheduler.
    notification_queue = multiprocessing.Queue()

    # Ignore signals that might interrupt processing.
    daemon.ignore_signals()

    # If we see a SIGINT, stop processing.
    def _stop_processing(*args):
        notification_queue.put((None, None))

    signal.signal(signal.SIGINT, _stop_processing)

    # Listen for notifications.
    notification_proc = multiprocessing.Process(
        target=notifications.listen,
        kwargs={'notification_queue': notification_queue},
        name='notification-listener',
    )
    notification_proc.start()

    mgt_ip_address = neutron_api.get_local_service_ip(cfg.CONF).split('/')[0]
    metadata_proc = multiprocessing.Process(target=metadata.serve,
                                            args=(mgt_ip_address, ),
                                            name='metadata-proxy')
    metadata_proc.start()

    from akanda.rug.api import rug as rug_api
    rug_api_proc = multiprocessing.Process(target=rug_api.serve,
                                           args=(mgt_ip_address, ),
                                           name='rug-api')
    rug_api_proc.start()

    # Set up the notifications publisher
    Publisher = (notifications.Publisher if cfg.CONF.ceilometer.enabled else
                 notifications.NoopPublisher)
    publisher = Publisher(topic=cfg.CONF.ceilometer.topic, )

    # Set up a factory to make Workers that know how many threads to
    # run.
    worker_factory = functools.partial(worker.Worker, notifier=publisher)

    # Set up the scheduler that knows how to manage the routers and
    # dispatch messages.
    sched = scheduler.Scheduler(worker_factory=worker_factory, )

    # Prepopulate the workers with existing routers on startup
    populate.pre_populate_workers(sched)

    # Set up the periodic health check
    health.start_inspector(cfg.CONF.health_check_period, sched)

    # Block the main process, copying messages from the notification
    # listener to the scheduler
    try:
        shuffle_notifications(notification_queue, sched)
    finally:
        LOG.info(_LI('Stopping scheduler.'))
        sched.stop()
        LOG.info(_LI('Stopping notification publisher.'))
        publisher.stop()

        # Terminate the subprocesses
        for subproc in [notification_proc, metadata_proc, rug_api_proc]:
            LOG.info(_LI('Stopping %s.'), subproc.name)
            subproc.terminate()
Example #44
    def update_state(self, worker_context, silent=False):
        """Updates state of the instance and, by extension, its logical resource

        :param worker_context: worker context providing API clients
        :param silent: if True, suppress the per-attempt debug logging
        :returns: the updated state
        """
        self._ensure_cache(worker_context)

        if self.driver.get_state(worker_context) == states.GONE:
            self.log.debug('%s driver reported its state is GONE',
                           self.driver.RESOURCE_NAME)
            self.state = states.GONE
            return self.state

        if self.instance_info is None:
            self.log.info(_LI('no backing instance, marking as down'))
            self.state = states.DOWN
            return self.state

        for i in xrange(cfg.CONF.max_retries):
            if self.driver.is_alive(self.instance_info.management_address):
                if self.state != states.CONFIGURED:
                    self.state = states.UP
                break
            if not silent:
                self.log.debug('Alive check failed. Attempt %d of %d',
                               i,
                               cfg.CONF.max_retries)
            time.sleep(cfg.CONF.retry_delay)
        else:
            old_state = self.state
            self._check_boot_timeout()

            # If the instance isn't responding, make sure Nova knows about it
            instance = worker_context.nova_client.get_instance_for_obj(self.id)
            if instance is None and self.state != states.ERROR:
                self.log.info('No instance was found; rebooting')
                self.state = states.DOWN
                self.instance_info = None

            # update_state() is called from Alive() to check the
            # status of the router. If we can't talk to the API at
            # that point, the router should be considered missing and
            # we should reboot it, so mark it states.DOWN if we think it was
            # configured before.
            if old_state == states.CONFIGURED and self.state != states.ERROR:
                self.log.debug('Instance not alive, marking it as DOWN')
                self.state = states.DOWN

        # After the instance is all the way up, record how long it took
        # to boot and accept a configuration.
        if self.instance_info.booting and self.state == states.CONFIGURED:
            # If we didn't boot the instance (because we were restarted
            # while it remained running, for example), we won't have a
            # duration to log.
            self.instance_info.confirm_up()
            if self.instance_info.boot_duration:
                self.log.info('%s booted in %s seconds after %s attempts',
                              self.driver.RESOURCE_NAME,
                              self.instance_info.boot_duration.total_seconds(),
                              self._boot_counter.count)
            # Always reset the boot counter, even if we didn't boot
            # the server ourself, so we don't accidentally think we
            # have an erroring router.
            self._boot_counter.reset()
        return self.state
Example #45
    def _dispatch_command(self, target, message):
        instructions = message.body
        if instructions['command'] == commands.WORKERS_DEBUG:
            self.report_status()

        # NOTE(adam_g): Drop 'router-debug' compat in M.
        elif (instructions['command'] == commands.RESOURCE_DEBUG or
              instructions['command'] == commands.ROUTER_DEBUG):

            resource_id = (instructions.get('resource_id') or
                           instructions.get('router_id'))
            if not resource_id:
                LOG.warning(_LW(
                    'Ignoring instruction to debug resource with no id'))
                return
            reason = instructions.get('reason')
            if resource_id in commands.WILDCARDS:
                LOG.warning(_LW(
                    'Ignoring instruction to debug all resources with %r'),
                    resource_id)
            else:
                LOG.info(_LI('Placing router %s in debug mode (reason: %s)'),
                         resource_id, reason)
                self.db_api.enable_resource_debug(resource_id, reason)

        elif (instructions['command'] == commands.RESOURCE_MANAGE or
              instructions['command'] == commands.ROUTER_MANAGE):
            resource_id = (instructions.get('resource_id') or
                           instructions.get('router_id'))
            if not resource_id:
                LOG.warning(_LW(
                    'Ignoring instruction to manage resource with no id'))
                return
            try:
                self.db_api.disable_resource_debug(resource_id)
                LOG.info(_LI('Resuming management of resource %s'),
                         resource_id)
            except KeyError:
                pass
            try:
                self._resource_locks[resource_id].release()
                LOG.info(_LI('Unlocked resource %s'), resource_id)
            except KeyError:
                pass
            except threading.ThreadError:
                # Already unlocked, that's OK.
                pass

        elif instructions['command'] in EVENT_COMMANDS:
            resource_id = instructions.get('resource_id')
            sm = self._find_state_machine_by_resource_id(resource_id)
            if not sm:
                LOG.debug(
                    'Will not process command, no managed state machine '
                    'found for resource %s', resource_id)
                return
            new_res = event.Resource(
                id=resource_id,
                driver=sm.driver.RESOURCE_NAME,
                tenant_id=sm.tenant_id)
            new_msg = event.Event(
                resource=new_res,
                crud=EVENT_COMMANDS[instructions['command']],
                body=instructions,
            )
            # Use handle_message() to ensure we acquire the lock
            LOG.info(_LI('sending %s instruction to %s'),
                     instructions['command'], new_res)
            self.handle_message(new_msg.resource.tenant_id, new_msg)
            LOG.info(_LI('forced %s for %s complete'),
                     instructions['command'], new_res)

        # NOTE(adam_g): This is here to support the deprecated old format of
        #               sending commands to specific routers and can be
        #               removed once the CLI component is dropped in M.
        elif instructions['command'] in DEPRECATED_ROUTER_COMMANDS:
            new_rsc = event.Resource(
                driver=drivers.router.Router.RESOURCE_NAME,
                id=message.body.get('router_id'),
                tenant_id=message.body.get('tenant_id'),
            )
            new_msg = event.Event(
                resource=new_rsc,
                crud=DEPRECATED_ROUTER_COMMANDS[instructions['command']],
                body=instructions,
            )
            # Use handle_message() to ensure we acquire the lock
            LOG.info(_LI('sending %s instruction to %s'),
                     instructions['command'], new_rsc)
            self.handle_message(new_msg.resource.tenant_id, new_msg)
            LOG.info(_LI('forced %s for %s complete'),
                     instructions['command'], new_rsc)

        elif instructions['command'] == commands.TENANT_DEBUG:
            tenant_id = instructions['tenant_id']
            reason = instructions.get('reason')
            if tenant_id in commands.WILDCARDS:
                LOG.warning(_LW(
                    'Ignoring instruction to debug all tenants with %r'),
                    tenant_id)
            else:
                LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                         tenant_id, reason)
                self.db_api.enable_tenant_debug(tenant_id, reason)

        elif instructions['command'] == commands.TENANT_MANAGE:
            tenant_id = instructions['tenant_id']
            try:
                self.db_api.disable_tenant_debug(tenant_id)
                LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
            except KeyError:
                pass

        elif instructions['command'] == commands.GLOBAL_DEBUG:
            enable = instructions.get('enabled')
            reason = instructions.get('reason')
            if enable == 1:
                LOG.info('Enabling global debug mode (reason: %s)', reason)
                self.db_api.enable_global_debug(reason)
            elif enable == 0:
                LOG.info('Disabling global debug mode')
                self.db_api.disable_global_debug()
            else:
                LOG.warning('Unrecognized global debug command: %s',
                            instructions)
        elif instructions['command'] == commands.CONFIG_RELOAD:
            try:
                cfg.CONF()
            except Exception:
                LOG.exception(_LE('Could not reload configuration'))
            else:
                cfg.CONF.log_opt_values(LOG, INFO)

        else:
            LOG.warning(_LW('Unrecognized command: %s'), instructions)
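
The EVENT_COMMANDS and DEPRECATED_ROUTER_COMMANDS tables consulted above are plain dicts translating admin commands into the CRUD verbs the state machines understand. A minimal sketch of the shape such mappings might take (the specific constant names are illustrative assumptions, not taken from this snippet):

    # Hypothetical module-level mappings: admin command -> CRUD verb.
    # The dispatcher looks the command up here and wraps the result in
    # an event.Event() routed through handle_message().
    EVENT_COMMANDS = {
        commands.RESOURCE_UPDATE: event.UPDATE,
        commands.RESOURCE_REBUILD: event.REBUILD,
    }
    DEPRECATED_ROUTER_COMMANDS = {
        commands.ROUTER_UPDATE: event.UPDATE,
        commands.ROUTER_REBUILD: event.REBUILD,
    }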
Beispiel #46
0
    def _thread_target(self):
        """This method runs in each worker thread.
        """
        my_id = threading.current_thread().name
        LOG.debug('starting thread')
        # Use a separate context from the one we use when receiving
        # messages and talking to the tenant router manager because we
        # are in a different thread and the clients are not
        # thread-safe.
        context = WorkerContext()
        while self._keep_going:
            try:
                # Try to get a state machine from the work queue. If
                # there's nothing to do, we will block for a while.
                self._thread_status[my_id] = 'waiting for task'
                sm = self.work_queue.get(timeout=10)
            except Queue.Empty:
                continue
            if sm is None:
                LOG.info(_LI('received stop message'))
                break

            # Make sure we didn't already have some updates under way
            # for a router we've been told to ignore for debug mode.
            should_ignore, reason = \
                self.db_api.resource_in_debug(sm.resource_id)
            if should_ignore:
                LOG.debug('Skipping update of resource %s in debug mode. '
                          '(reason: %s)', sm.resource_id, reason)
                continue
            # FIXME(dhellmann): Need to look at the router to see if
            # it belongs to a tenant which is in debug mode, but we
            # don't have that data in the sm, yet.
            LOG.debug('performing work on %s for tenant %s',
                      sm.resource_id, sm.tenant_id)
            try:
                self._thread_status[my_id] = 'updating %s' % sm.resource_id
                sm.update(context)
            except Exception:
                LOG.exception(_LE('could not complete update for %s'),
                              sm.resource_id)
            finally:
                self._thread_status[my_id] = (
                    'finalizing task for %s' % sm.resource_id
                )
                self.work_queue.task_done()
                with self.lock:
                    # Release the lock that prevents us from adding
                    # the state machine back into the queue. If we
                    # find more work, we will re-acquire it. If we do
                    # not find more work, we hold the primary work
                    # queue lock so the main thread cannot put the
                    # state machine back into the queue until we
                    # release that lock.
                    self._release_resource_lock(sm)
                    # The state machine has indicated that it is done
                    # by returning. If there is more work for it to
                    # do, reschedule it by placing it at the end of
                    # the queue.
                    if sm.has_more_work():
                        LOG.debug('%s has more work, returning to work queue',
                                  sm.resource_id)
                        self._add_resource_to_work_queue(sm)
                    else:
                        LOG.debug('%s has no more work', sm.resource_id)
        # Return the context object so tests can look at it
        self._thread_status[my_id] = 'exiting'
        return context
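
The loop above implements a classic queue-plus-sentinel worker pool: a blocking get() with a timeout, None as the stop signal, and task_done() in a finally block. A self-contained sketch of the same pattern in miniature (Python 2 style to match the snippet; the MiniPool name is an illustrative assumption):

    import Queue
    import threading

    class MiniPool(object):
        """Minimal illustration of the queue + sentinel pattern above."""

        def __init__(self, num_threads=2):
            self.work_queue = Queue.Queue()
            self.threads = [threading.Thread(target=self._thread_target)
                            for _ in range(num_threads)]
            for t in self.threads:
                t.start()

        def _thread_target(self):
            while True:
                try:
                    task = self.work_queue.get(timeout=10)
                except Queue.Empty:
                    continue
                if task is None:      # stop sentinel, as in the code above
                    break
                try:
                    task()            # stand-in for sm.update(context)
                finally:
                    self.work_queue.task_done()

        def stop(self):
            # One sentinel per worker, then wait for each thread to exit.
            for t in self.threads:
                self.work_queue.put(None)
            for t in self.threads:
                t.join()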
Beispiel #47
0
    def _dispatch_command(self, target, message):
        instructions = message.body
        if instructions['command'] == commands.WORKERS_DEBUG:
            self.report_status()

        elif instructions['command'] == commands.ROUTER_DEBUG:
            router_id = instructions['router_id']
            reason = instructions.get('reason')
            if router_id in commands.WILDCARDS:
                LOG.warning(_LW(
                    'Ignoring instruction to debug all routers with %r'),
                    router_id)
            else:
                LOG.info(_LI('Placing router %s in debug mode (reason: %s)'),
                         router_id, reason)
                self.db_api.enable_router_debug(router_id, reason)

        elif instructions['command'] == commands.ROUTER_MANAGE:
            router_id = instructions['router_id']
            try:
                self.db_api.disable_router_debug(router_id)
                LOG.info(_LI('Resuming management of router %s'), router_id)
            except KeyError:
                pass
            try:
                self._router_locks[router_id].release()
                LOG.info(_LI('Unlocked router %s'), router_id)
            except KeyError:
                pass
            except threading.ThreadError:
                # Already unlocked, that's OK.
                pass

        elif instructions['command'] in self._EVENT_COMMANDS:
            new_msg = event.Event(
                tenant_id=message.tenant_id,
                router_id=message.router_id,
                crud=self._EVENT_COMMANDS[instructions['command']],
                body=instructions,
            )
            # Use handle_message() to ensure we acquire the lock
            LOG.info(_LI('sending %s instruction to %s'),
                     instructions['command'], message.tenant_id)
            self.handle_message(new_msg.tenant_id, new_msg)
            LOG.info(_LI('forced %s for %s complete'),
                     instructions['command'], message.tenant_id)

        elif instructions['command'] == commands.TENANT_DEBUG:
            tenant_id = instructions['tenant_id']
            reason = instructions.get('reason')
            if tenant_id in commands.WILDCARDS:
                LOG.warning(_LW(
                    'Ignoring instruction to debug all tenants with %r'),
                    tenant_id)
            else:
                LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                         tenant_id, reason)
                self.db_api.enable_tenant_debug(tenant_id, reason)

        elif instructions['command'] == commands.TENANT_MANAGE:
            tenant_id = instructions['tenant_id']
            try:
                self.db_api.disable_tenant_debug(tenant_id)
                LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
            except KeyError:
                pass

        elif instructions['command'] == commands.GLOBAL_DEBUG:
            enable = instructions.get('enabled')
            reason = instructions.get('reason')
            if enable == 1:
                LOG.info('Enabling global debug mode (reason: %s)', reason)
                self.db_api.enable_global_debug(reason)
            elif enable == 0:
                LOG.info('Disabling global debug mode')
                self.db_api.disable_global_debug()
            else:
                LOG.warning('Unrecognized global debug command: %s',
                            instructions)
        elif instructions['command'] == commands.CONFIG_RELOAD:
            try:
                cfg.CONF()
            except Exception:
                LOG.exception(_LE('Could not reload configuration'))
            else:
                cfg.CONF.log_opt_values(LOG, INFO)

        else:
            LOG.warning(_LW('Unrecognized command: %s'), instructions)
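
A command message only needs a body whose 'command' key matches one of the branches above. A hedged sketch of driving this dispatcher by hand, with placeholder IDs and an assumed event.COMMAND crud constant (neither is taken from this snippet):

    # Hypothetical invocation: put a single router into debug mode.
    msg = event.Event(
        tenant_id='d6dd9b71-...',    # placeholder tenant UUID
        router_id='31bf5bc2-...',    # placeholder router UUID
        crud=event.COMMAND,          # assumed constant name
        body={'command': commands.ROUTER_DEBUG,
              'router_id': '31bf5bc2-...',
              'reason': 'investigating stuck appliance'},
    )
    worker._dispatch_command(target=None, message=msg)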
Beispiel #48
0
    def _ensure_local_port(self, network_id, subnet_id, network_type,
                           ip_address):
        driver = importutils.import_object(self.conf.interface_driver,
                                           self.conf)

        host_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, socket.gethostname()))

        name = 'AKANDA:RUG:%s' % network_type.upper()

        query_dict = dict(device_owner=DEVICE_OWNER_RUG,
                          device_id=host_id,
                          name=name,
                          network_id=network_id)

        ports = self.api_client.list_ports(**query_dict)['ports']

        if ports:
            port = Port.from_dict(ports[0])
            LOG.info(_LI('already have local %s port, using %r'), network_type,
                     port)
        else:
            LOG.info(_LI('creating a new local %s port'), network_type)
            port_dict = {
                'admin_state_up': True,
                'network_id': network_id,
                # lie about the device owner so Neutron lets us pick an
                # IP address on a SLAAC subnet; corrected below
                'device_owner': DEVICE_OWNER_ROUTER_INT,
                'name': name,
                'device_id': host_id,
                'fixed_ips': [{
                    'ip_address': ip_address.split('/')[0],
                    'subnet_id': subnet_id
                }],
                'binding:host_id': socket.gethostname()
            }
            port = Port.from_dict(
                self.api_client.create_port(dict(port=port_dict))['port'])

            # remove the lie that let us pick an IP on the SLAAC subnet
            self.api_client.update_port(
                port.id,
                {'port': {'device_owner': DEVICE_OWNER_RUG}})
            port.device_owner = DEVICE_OWNER_RUG

            LOG.info(_LI('new local %s port: %r'), network_type, port)

        # create the tap interface if it doesn't already exist
        if not ip_lib.device_exists(driver.get_device_name(port)):
            driver.plug(port.network_id, port.id, driver.get_device_name(port),
                        port.mac_address)

            # sleep briefly to ensure the port is set up before use
            time.sleep(1)

        driver.init_l3(driver.get_device_name(port), [ip_address])
        return port
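
As a usage illustration, a method like this would typically be called once per management network when the agent boots, before any appliance communication starts. The IDs and address below are placeholders, not values from this code:

    # Hypothetical call: make sure this host has a plugged, addressed
    # port on the management network.
    mgt_port = self._ensure_local_port(
        network_id='6c6c9f0e-...',               # placeholder UUID
        subnet_id='a2bd7d7e-...',                # placeholder UUID
        network_type='management',
        ip_address='fdca:3ba5:a17a:acda::1/64',  # placeholder CIDR
    )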
Beispiel #49
0
    def launch_service(self, service, workers=1):
        wrap = ServiceWrapper(service, workers)

        LOG.info(_LI('Starting %d workers'), wrap.workers)
        while self.running and len(wrap.children) < wrap.workers:
            self._start_child(wrap)
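
For context, a hedged sketch of what a _start_child() helper might look like in a fork-based launcher of this shape (the _child_process() helper and the exact bookkeeping are assumptions, not taken from this snippet):

    # Hypothetical companion method: fork, run the service loop in the
    # child, and record the pid so the parent can reap and restart it.
    def _start_child(self, wrap):
        pid = os.fork()
        if pid == 0:
            # Child: run the wrapped service until it stops, then exit.
            status = self._child_process(wrap.service)
            os._exit(status)
        LOG.debug('started child process %d', pid)
        wrap.children.add(pid)
        self.children[pid] = wrap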