Example #1
    def run_child(self):
        def child_hup(*args):
            """Shuts down child processes, existing requests are handled."""
            signal.signal(signal.SIGHUP, signal.SIG_IGN)
            eventlet.wsgi.is_accepting = False
            self.sock.close()

        pid = os.fork()
        if pid == 0:
            signal.signal(signal.SIGHUP, child_hup)
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
            # ignore the interrupt signal to avoid a race whereby
            # a child worker receives the signal before the parent
            # and is respawned unnecessarily as a result
            signal.signal(signal.SIGINT, signal.SIG_IGN)
            # The child has no need to stash the unwrapped
            # socket, and the reference prevents a clean
            # exit on sighup
            self._sock = None
            self.run_server()
            LOG.info(_LI('Child %d exiting normally'), os.getpid())
            # self.pool.waitall() is now called in wsgi's server so
            # it's safe to exit here
            sys.exit(0)
        else:
            LOG.info(_LI('Started child %s'), pid)
            self.children.add(pid)
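
A minimal, standalone sketch of the same fork-and-signal pattern used above (independent of the Server class; the sleep stands in for run_server()):

import os
import signal
import sys
import time

def run_child(children):
    def child_hup(*args):
        # On SIGHUP, stop reacting to further HUPs; in-flight work finishes.
        signal.signal(signal.SIGHUP, signal.SIG_IGN)

    pid = os.fork()
    if pid == 0:
        signal.signal(signal.SIGHUP, child_hup)
        # Ignore SIGINT so only the parent reacts to Ctrl-C.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        time.sleep(60)  # stand-in for run_server()
        sys.exit(0)
    else:
        children.add(pid)  # parent tracks live child pids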
Example #2
    def run_child(self):
        def child_hup(*args):
            """Shuts down child processes, existing requests are handled."""
            signal.signal(signal.SIGHUP, signal.SIG_IGN)
            eventlet.wsgi.is_accepting = False
            self.sock.close()

        pid = os.fork()
        if pid == 0:
            signal.signal(signal.SIGHUP, child_hup)
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
            # ignore the interrupt signal to avoid a race whereby
            # a child worker receives the signal before the parent
            # and is respawned unnecessarily as a result
            signal.signal(signal.SIGINT, signal.SIG_IGN)
            # The child has no need to stash the unwrapped
            # socket, and the reference prevents a clean
            # exit on sighup
            self._sock = None
            self.run_server()
            LOG.info(_LI('Child %d exiting normally'), os.getpid())
            # self.pool.waitall() is now called in wsgi's server so
            # it's safe to exit here
            sys.exit(0)
        else:
            LOG.info(_LI('Started child %s'), pid)
            self.children.add(pid)
Example #3
    def acquire(self, retry=True):
        '''Acquire a lock on the target.

        :param retry: When True, retry if lock was released while stealing.
        :type retry: boolean
        '''

        lock_engine_id = self.lock_create(self.target.id, self.engine_id)
        if lock_engine_id is None:
            LOG.debug("Engine %(engine)s acquired lock on %(target_type)s "
                      "%(target)s" % {'engine': self.engine_id,
                                      'target_type': self.target_type,
                                      'target': self.target.id})
            return

        if lock_engine_id == self.engine_id or \
           self.engine_alive(self.context, lock_engine_id):
            LOG.debug("Lock on %(target_type)s %(target)s is owned by engine "
                      "%(engine)s" % {'target_type': self.target_type,
                                      'target': self.target.id,
                                      'engine': lock_engine_id})
            raise exception.ActionInProgress(target_name=self.target.name,
                                             action=self.target.status)
        else:
            LOG.info(_LI("Stale lock detected on %(target_type)s %(target)s. "
                         "Engine %(engine)s will attempt to steal the lock"),
                     {'target_type': self.target_type,
                      'target': self.target.id,
                      'engine': self.engine_id})

            result = self.lock_steal(self.target.id, lock_engine_id,
                                     self.engine_id)

            if result is None:
                LOG.info(_LI("Engine %(engine)s successfully stole the lock "
                             "on %(target_type)s %(target)s"),
                         {'engine': self.engine_id,
                          'target_type': self.target_type,
                          'target': self.target.id})
                return
            elif result is True:
                if retry:
                    LOG.info(_LI("The lock on %(target_type)s %(target)s was "
                                 "released while engine %(engine)s was "
                                 "stealing it. Trying again"),
                             {'target_type': self.target_type,
                              'target': self.target.id,
                              'engine': self.engine_id})
                    return self.acquire(retry=False)
            else:
                new_lock_engine_id = result
                LOG.info(_LI("Failed to steal lock on %(target_type)s "
                             "%(target)s. Engine %(engine)s stole the "
                             "lock already"), {'target_type': self.target_type,
                                               'target': self.target.id,
                                               'engine': new_lock_engine_id})

            raise exception.ActionInProgress(
                target_name=self.target.name, action=self.target.status)
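
A hedged usage sketch for the acquire() method above; the release() call and the construction of the lock object are assumptions not shown in this example, and exception.ActionInProgress is the same class raised above:

def run_with_lock(lock, work):
    # ActionInProgress is raised when a live engine already owns the lock.
    try:
        lock.acquire()
    except exception.ActionInProgress:
        return False
    try:
        work()
    finally:
        lock.release()  # assumed counterpart to acquire()
    return True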
Example #4
 def stop(self):
     super(Dispatcher, self).stop()
     # Wait for all action threads to be finished
     LOG.info(_LI("Stopping all action threads of engine %s"),
              self.engine_id)
     # Stop ThreadGroup gracefully
     self.TG.stop(True)
     LOG.info(_LI("All action threads have been finished"))
Example #5
 def stop(self):
     super(Dispatcher, self).stop()
     # Wait for all action threads to be finished
     LOG.info(_LI("Stopping all action threads of engine %s"),
              self.engine_id)
     # Stop ThreadGroup gracefully
     self.TG.stop(True)
     LOG.info(_LI("All action threads have been finished"))
Example #6
    def _remove_children(self, pid):

        if pid in self.children:
            self.children.remove(pid)
            LOG.info(_LI('Removed dead child %s'), pid)
        elif pid in self.stale_children:
            self.stale_children.remove(pid)
            LOG.info(_LI('Removed stale child %s'), pid)
        else:
            LOG.warning(_LW('Unrecognized child %s'), pid)
Example #7
    def _remove_children(self, pid):

        if pid in self.children:
            self.children.remove(pid)
            LOG.info(_LI('Removed dead child %s'), pid)
        elif pid in self.stale_children:
            self.stale_children.remove(pid)
            LOG.info(_LI('Removed stale child %s'), pid)
        else:
            LOG.warning(_LW('Unrecognized child %s'), pid)
Example #8
 def run_child(self):
     pid = os.fork()
     if pid == 0:
         signal.signal(signal.SIGHUP, signal.SIG_DFL)
         signal.signal(signal.SIGTERM, signal.SIG_DFL)
         self.run_server()
         self.LOG.info(_LI('Child %d exiting normally'), os.getpid())
         return
     else:
         self.LOG.info(_LI('Started child %s'), pid)
         self.children.append(pid)
Example #9
    def _single_run(self, application, sock):
        """Start a WSGI server in a new green thread."""

        LOG.info(_LI("Starting single process server"))
        eventlet.wsgi.server(sock, application, custom_pool=self.pool,
                             url_length_limit=URL_LENGTH_LIMIT,
                             log=self._logger, debug=cfg.CONF.debug)
Example #10
    def _start_check(self, entry):
        """Routine to call for starting the checking for a cluster.

        @param entry: A dict containing the data associated with the cluster.
        @return: An updated registry entry record.
        """
        if entry['check_type'] == consts.NODE_STATUS_POLLING:
            interval = min(entry['interval'], cfg.CONF.periodic_interval_max)
            timer = self.TG.add_timer(interval, self._poll_cluster, None,
                                      entry['cluster_id'])
            entry['timer'] = timer
        elif entry['check_type'] == consts.VM_LIFECYCLE_EVENTS:
            LOG.info(_LI("Start listening events for cluster (%s)."),
                     entry['cluster_id'])
            listener = self._add_listener(entry['cluster_id'])
            if listener:
                entry['listener'] = listener
            else:
                return None
        else:
            LOG.warn(_LW("Cluster (%(id)s) check type (%(type)s) is invalid."),
                     {'id': entry['cluster_id'], 'type': entry['check_type']})
            return None

        return entry
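
For reference, a hedged sketch of the registry entry dict this routine consumes; the field names come from the code above, while the concrete values are illustrative only:

entry = {
    'cluster_id': 'cluster-uuid',               # cluster being monitored
    'check_type': consts.NODE_STATUS_POLLING,   # or consts.VM_LIFECYCLE_EVENTS
    'interval': 60,                             # seconds between polls
    'params': {},
}
# _start_check(entry) returns the updated entry, or None if it was rejected.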
Example #11
def main():
    try:
        logging.register_options(cfg.CONF)
        cfg.CONF(project='senlin',
                 prog='senlin-api',
                 version=version.version_info.version_string())
        logging.setup(cfg.CONF, 'senlin-api')
        objects.register_all()
        messaging.setup()

        app = wsgi.load_paste_app()

        host = cfg.CONF.senlin_api.bind_host
        port = cfg.CONF.senlin_api.bind_port
        LOG.info(_LI('Starting Senlin API on %(host)s:%(port)s'), {
            'host': host,
            'port': port
        })
        profiler.setup('senlin-api', host)
        server = wsgi.Server('senlin-api', cfg.CONF.senlin_api)
        server.start(app, default_port=port)
        systemd.notify_once()
        server.wait()
    except RuntimeError as ex:
        sys.exit("ERROR: %s" % six.text_type(ex))
Example #12
    def _register_info(self, name, info):
        '''Place the new info in the correct location in the registry.

        :param name: name of the plugin.
        :param info: reference to a PluginInfo data structure; pass None to
                     deregister the named plugin.
        '''
        registry = self._registry
        if info is None:
            # delete this entry.
            msg = _LW("Removing %(item)s from registry")
            LOG.warning(msg, {'item': name})
            registry.pop(name, None)
            return

        if name in registry and isinstance(registry[name], PluginInfo):
            if registry[name] == info:
                return
            details = {
                'name': name,
                'old': str(registry[name].plugin),
                'new': str(info.plugin)
            }
            LOG.warning(_LW('Changing %(name)s from %(old)s to %(new)s'),
                        details)
        else:
            msg = _LI('Registering %(name)s -> %(value)s')
            LOG.info(msg, {'name': name, 'value': str(info.plugin)})

        info.user_provided = not self.is_global
        registry[name] = info
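
A short, hedged usage sketch for _register_info(); the environment object and the PluginInfo value are illustrative assumptions, not code from above:

env._register_info('os.nova.server-1.0', info)   # logs 'Registering ...'
env._register_info('os.nova.server-1.0', info)   # same info again: no-op
env._register_info('os.nova.server-1.0', None)   # logs the removal warning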
Example #13
    def _start_check(self, entry):
        """Routine to call for starting the checking for a cluster.

        @param entry: A dict containing the data associated with the cluster.
        @return: An updated registry entry record.
        """
        if entry['check_type'] == consts.NODE_STATUS_POLLING:
            interval = min(entry['interval'], cfg.CONF.periodic_interval_max)
            timer = self.TG.add_timer(interval, self._poll_cluster, None,
                                      entry['cluster_id'])
            entry['timer'] = timer
        elif entry['check_type'] == consts.VM_LIFECYCLE_EVENTS:
            LOG.info(_LI("Start listening events for cluster (%s)."),
                     entry['cluster_id'])
            listener = self._add_listener(entry['cluster_id'])
            if listener:
                entry['listener'] = listener
            else:
                return None
        else:
            LOG.warning(_LW("Cluster (%(id)s) check type (%(type)s) is "
                            "invalid."),
                        {'id': entry['cluster_id'],
                         'type': entry['check_type']})
            return None

        return entry
Example #14
    def _register_info(self, name, info):
        '''Place the new info in the correct location in the registry.

        :param name: name of the plugin.
        :param info: reference to a PluginInfo data structure; pass None to
                     deregister the named plugin.
        '''
        registry = self._registry
        if info is None:
            # delete this entry.
            LOG.warning(_LW('Removing %(item)s from registry'), {'item': name})
            registry.pop(name, None)
            return

        if name in registry and isinstance(registry[name], PluginInfo):
            if registry[name] == info:
                return
            details = {
                'name': name,
                'old': str(registry[name].plugin),
                'new': str(info.plugin)
            }
            LOG.warning(_LW('Changing %(name)s from %(old)s to %(new)s'),
                        details)
        else:
            LOG.info(_LI('Registering %(name)s -> %(value)s'), {
                'name': name, 'value': str(info.plugin)})

        info.user_provided = not self.is_global
        registry[name] = info
Example #15
def ActionProc(context, action_id, worker_id):
    '''Action process.'''

    # Step 1: lock the action for execution
    timestamp = wallclock()
    result = db_api.action_acquire(context, action_id, worker_id, timestamp)
    if result is None:
        LOG.debug(_('Failed locking action "%s" for execution'), action_id)
        return False

    # Step 2: materialize the action object
    action = Action.load(context, action_id=action_id)

    LOG.info(_LI('Action %(name)s [%(id)s] started'),
             {'name': six.text_type(action.action), 'id': action.id})

    reason = 'Action completed'
    try:
        # Step 3: execute the action
        result, reason = action.execute()

        # NOTE: The following exception report does not give useful
        # information for some reason.
        # except Exception as ex:
        # We catch the exception here to make sure the following logic is
        # executed.
        # result = action.RES_ERROR
        # reason = six.text_type(ex)
        # LOG.error(_('Exception occurred in action execution[%(action)s]: '
        #            '%(reason)s'), {'action': action.action,
        #                            'reason': reason})
    finally:
        # NOTE: the action lock is eventually released by this status update
        action.set_status(result, reason)
Example #16
def node_lock_acquire(context, node_id, action_id, engine=None, forced=False):
    """Try to lock the specified node.

    :param context: the context used for DB operations;
    :param node_id: ID of the node to be locked.
    :param action_id: ID of the action that attempts to lock the node.
    :param engine: ID of the engine that attempts to lock the node.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try to lock the node - if the returned owner_id is the
    #         action id, it was a success
    owner = db_api.node_lock_acquire(node_id, action_id)
    if action_id == owner:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        LOG.debug('Acquire lock for node %s again' % node_id)
        owner = db_api.node_lock_acquire(node_id, action_id)
        if action_id == owner:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    # If this node is locked by a dead engine, try to steal the lock.
    action = db_api.action_get(context, owner)
    if (action and action.owner and action.owner != engine
            and is_engine_dead(context, action.owner)):
        LOG.info(
            _LI('The node %(n)s is locked by dead action %(a)s, '
                'try to steal the lock.'), {
                    'n': node_id,
                    'a': owner
                })
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context,
                                  action.id,
                                  time.time(),
                                  reason=reason)
        db_api.node_lock_steal(node_id, action_id)
        return True

    LOG.error(
        _LE('Node is already locked by action %(old)s, '
            'action %(new)s failed grabbing the lock'), {
                'old': owner,
                'new': action_id
            })

    return False
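
A hedged usage sketch for node_lock_acquire(); node_lock_release() is assumed to be the matching helper and is not shown in this example:

if node_lock_acquire(context, node_id, action.id, engine_id, forced=False):
    try:
        do_node_work(node_id)                  # hypothetical workload
    finally:
        node_lock_release(node_id, action.id)  # assumed counterpart
else:
    LOG.error('Failed to lock node %s for action %s', node_id, action.id)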
Example #17
    def _update_nodes(self, profile_id, nodes_obj):
        # Get batching policy data if any
        fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
        LOG.info(fmt, {'cluster': self.entity.id, 'profile': profile_id})
        pause_time = 0
        plan = []

        pd = self.data.get('update', None)
        if pd:
            pause_time = pd.get('pause_time')
            plan = pd.get('plan')
        else:
            pause_time = 0
            nodes_list = []
            for node in self.entity.nodes:
                nodes_list.append(node.id)
            plan.append(set(nodes_list))

        nodes = []
        for node_set in plan:
            child = []
            nodes = list(node_set)

            for node in nodes:
                kwargs = {
                    'name': 'node_update_%s' % node[:8],
                    'cause': base.CAUSE_DERIVED,
                    'inputs': {
                        'new_profile_id': profile_id,
                    },
                }
                action_id = base.Action.create(self.context, node,
                                               consts.NODE_UPDATE, **kwargs)
                child.append(action_id)

            if child:
                dobj.Dependency.create(self.context, [c for c in child],
                                       self.id)
                for cid in child:
                    ao.Action.update(self.context, cid,
                                     {'status': base.Action.READY})

                dispatcher.start_action()
                # clear the action list
                child = []
                result, new_reason = self._wait_for_dependents()
                if result != self.RES_OK:
                    self.entity.eval_status(self.context,
                                            consts.CLUSTER_UPDATE)
                    return result, _('Failed in updating nodes.')
                # pause time
                if pause_time != 0:
                    self._sleep(pause_time)

        self.entity.profile_id = profile_id
        self.entity.eval_status(self.context,
                                consts.CLUSTER_UPDATE,
                                profile_id=profile_id,
                                updated_at=timeutils.utcnow(True))
        return self.RES_OK, 'Cluster update completed.'
Example #18
def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
                         scope=CLUSTER_SCOPE, forced=False):
    """Try to lock the specified cluster.

    :param cluster_id: ID of the cluster to be locked.
    :param action_id: ID of the action which wants to lock the cluster.
    :param engine: ID of the engine which wants to lock the cluster.
    :param scope: scope of the lock, either a cluster-wide or a node-wide
                  lock.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """

    # Step 1: try to lock the cluster - if the returned owner_id is the
    #         action id, it was a success
    owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
    if action_id in owners:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        LOG.debug('Acquire lock for cluster %s again' % cluster_id)
        owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
        if action_id in owners:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    # Will reach here only because scope == CLUSTER_SCOPE
    action = db_api.action_get(context, owners[0])
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        LOG.info(_LI('The cluster %(c)s is locked by dead action %(a)s, '
                     'try to steal the lock.'), {
            'c': cluster_id,
            'a': owners[0]
        })
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    LOG.error(_LE('Cluster is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': str(owners), 'new': action_id})

    return False
Example #19
    def _single_run(self, application, sock):
        """Start a WSGI server in a new green thread."""

        LOG.info(_LI("Starting single process server"))
        eventlet.wsgi.server(sock,
                             application,
                             custom_pool=self.pool,
                             url_length_limit=URL_LENGTH_LIMIT,
                             log=self._logger,
                             debug=cfg.CONF.debug)
Example #20
def node_lock_acquire(context, node_id, action_id, engine=None,
                      forced=False):
    """Try to lock the specified node.

    :param context: the context used for DB operations;
    :param node_id: ID of the node to be locked.
    :param action_id: ID of the action that attempts to lock the node.
    :param engine: ID of the engine that attempts to lock the node.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try to lock the node - if the returned owner_id is the
    #         action id, it was a success
    owner = db_api.node_lock_acquire(node_id, action_id)
    if action_id == owner:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        LOG.debug('Acquire lock for node %s again' % node_id)
        owner = db_api.node_lock_acquire(node_id, action_id)
        if action_id == owner:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    # If this node is locked by a dead engine, try to steal the lock.
    action = db_api.action_get(context, owner)
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        LOG.info(_LI('The node %(n)s is locked by dead action %(a)s, '
                     'try to steal the lock.'), {
            'n': node_id,
            'a': owner
        })
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        db_api.node_lock_steal(node_id, action_id)
        return True

    LOG.error(_LE('Node is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': owner, 'new': action_id})

    return False
Example #21
def cluster_lock_acquire(context,
                         cluster_id,
                         action_id,
                         engine=None,
                         scope=CLUSTER_SCOPE,
                         forced=False):
    """Try to lock the specified cluster.

    :param cluster_id: ID of the cluster to be locked.
    :param action_id: ID of the action which wants to lock the cluster.
    :param engine: ID of the engine which wants to lock the cluster.
    :param scope: scope of the lock, either a cluster-wide or a node-wide
                  lock.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """

    # Step 1: try to lock the cluster - if the returned owner_id is the
    #         action id, it was a success
    owners = cl_obj.ClusterLock.acquire(cluster_id, action_id, scope)
    if action_id in owners:
        return True

    # Step 2: Last resort is 'forced locking', if requested by the caller
    if forced:
        owners = cl_obj.ClusterLock.steal(cluster_id, action_id)
        return action_id in owners

    # Step 3: check if the owner is a dead engine, if so, steal the lock.
    # Will reach here only because scope == CLUSTER_SCOPE
    action = ao.Action.get(context, owners[0])
    if (action and action.owner and action.owner != engine
            and utils.is_engine_dead(context, action.owner)):
        LOG.info(
            _LI('The cluster %(c)s is locked by dead action %(a)s, '
                'try to steal the lock.'), {
                    'c': cluster_id,
                    'a': owners[0]
                })
        reason = _('Engine died when executing this action.')
        owners = cl_obj.ClusterLock.steal(cluster_id, action_id)
        # Mark the old action to failed.
        ao.Action.mark_failed(context, action.id, time.time(), reason)
        return action_id in owners

    LOG.error(
        _LE('Cluster is already locked by action %(old)s, '
            'action %(new)s failed grabbing the lock'), {
                'old': str(owners),
                'new': action_id
            })

    return False
Example #22
    def start_wsgi(self):
        if self.conf.workers == 0:
            # Useful for profiling, testing, debugging, etc.
            self.pool = eventlet.GreenPool(size=self.threads)
            self.pool.spawn_n(self._single_run, self.application, self.sock)
            return

        LOG.info(_LI("Starting %d workers") % self.conf.workers)
        signal.signal(signal.SIGTERM, self.kill_children)
        signal.signal(signal.SIGINT, self.kill_children)
        signal.signal(signal.SIGHUP, self.hup)
        while len(self.children) < self.conf.workers:
            self.run_child()
Example #23
    def start_wsgi(self):
        if self.conf.workers == 0:
            # Useful for profiling, testing, debugging, etc.
            self.pool = eventlet.GreenPool(size=self.threads)
            self.pool.spawn_n(self._single_run, self.application, self.sock)
            return

        LOG.info(_LI("Starting %d workers") % self.conf.workers)
        signal.signal(signal.SIGTERM, self.kill_children)
        signal.signal(signal.SIGINT, self.kill_children)
        signal.signal(signal.SIGHUP, self.hup)
        while len(self.children) < self.conf.workers:
            self.run_child()
Example #24
    def _verify_and_respawn_children(self, pid, status):
        if len(self.stale_children) == 0:
            LOG.debug('No stale children')

        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != 0:
            LOG.error(_LE('Not respawning child %d, cannot '
                          'recover from termination'), pid)
            if not self.children and not self.stale_children:
                LOG.info(_LI('All workers have terminated. Exiting'))
                self.running = False
        else:
            if len(self.children) < self.conf.workers:
                self.run_child()
Example #25
def info(context, entity, action, status=None, status_reason=None,
         timestamp=None):
    timestamp = timestamp or timeutils.utcnow(True)
    event = Event(timestamp, logging.INFO, entity,
                  action=action, status=status, status_reason=status_reason,
                  user=context.user, project=context.project)
    event.store(context)
    LOG.info(_LI('%(name)s [%(id)s] %(action)s - %(status)s: %(reason)s'),
             {'name': event.oname,
              'id': event.oid and event.oid[:8],
              'action': action,
              'status': status,
              'reason': status_reason})
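
A hedged calling sketch for the info() helper above; the cluster object and the action name are placeholders:

info(ctx, cluster, 'CLUSTER_CREATE',
     status='ACTIVE', status_reason='Cluster creation succeeded')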
Example #26
def load_dispatcher():
    """Load dispatchers."""
    global dispatchers

    LOG.debug("Loading dispatchers")
    dispatchers = named.NamedExtensionManager(
        namespace="senlin.dispatchers",
        names=cfg.CONF.event_dispatchers,
        invoke_on_load=True,
        propagate_map_exceptions=True)
    if not list(dispatchers):
        LOG.warning(_LW("No dispatchers configured for 'senlin.dispatchers'"))
    else:
        LOG.info(_LI("Loaded dispatchers: %s"), dispatchers.names())
Example #27
    def _verify_and_respawn_children(self, pid, status):
        if len(self.stale_children) == 0:
            LOG.debug('No stale children')

        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != 0:
            LOG.error(
                _LE('Not respawning child %d, cannot '
                    'recover from termination'), pid)
            if not self.children and not self.stale_children:
                LOG.info(_LI('All workers have terminated. Exiting'))
                self.running = False
        else:
            if len(self.children) < self.conf.workers:
                self.run_child()
Example #28
    def set_status(self, result, reason=None):
        '''Set action status based on return value from execute.'''

        timestamp = wallclock()

        if result == self.RES_OK:
            status = self.SUCCEEDED
            msg = _LI('Action %(name)s [%(id)s] completed with SUCCESS.')
            db_api.action_mark_succeeded(self.context, self.id, timestamp)

        elif result == self.RES_ERROR:
            status = self.FAILED
            msg = _LI('Action %(name)s [%(id)s] failed with ERROR.')
            db_api.action_mark_failed(self.context, self.id, timestamp,
                                      reason=reason or 'ERROR')

        elif result == self.RES_TIMEOUT:
            status = self.FAILED
            msg = _LI('Action %(name)s [%(id)s] failed with TIMEOUT.')
            db_api.action_mark_failed(self.context, self.id, timestamp,
                                      reason=reason or 'TIMEOUT')

        elif result == self.RES_CANCEL:
            status = self.CANCELLED
            msg = _LI('Action %(name)s [%(id)s] was cancelled.')
            db_api.action_mark_cancelled(self.context, self.id, timestamp)

        else:  # result == self.RES_RETRY:
            status = self.READY
            # Action failed at the moment, but can be retried
            # We abandon it and then notify other dispatchers to execute it
            db_api.action_abandon(self.context, self.id)
            msg = _LI('Action %(name)s [%(id)s] aborted with RETRY.')

        LOG.info(msg, {'name': self.action, 'id': self.id, 'status': status})
        self.status = status
        self.status_reason = reason
Example #29
def node_lock_acquire(context, node_id, action_id, engine=None, forced=False):
    """Try to lock the specified node.

    :param context: the context used for DB operations;
    :param node_id: ID of the node to be locked.
    :param action_id: ID of the action that attempts to lock the node.
    :param engine: ID of the engine that attempts to lock the node.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try to lock the node - if the returned owner_id is the
    #         action id, it was a success
    owner = nl_obj.NodeLock.acquire(node_id, action_id)
    if action_id == owner:
        return True

    # Step 2: Last resort is 'forced locking', if requested by the caller
    if forced:
        owner = nl_obj.NodeLock.steal(node_id, action_id)
        return action_id == owner

    # Step 3: Try to steal the lock if its owner is a dead engine.
    action = ao.Action.get(context, owner)
    if (action and action.owner and action.owner != engine
            and utils.is_engine_dead(context, action.owner)):
        LOG.info(
            _LI('The node %(n)s is locked by dead action %(a)s, '
                'try to steal the lock.'), {
                    'n': node_id,
                    'a': owner
                })
        reason = _('Engine died when executing this action.')
        nl_obj.NodeLock.steal(node_id, action_id)
        ao.Action.mark_failed(context, action.id, time.time(), reason)
        return True

    LOG.error(
        _LE('Node is already locked by action %(old)s, '
            'action %(new)s failed grabbing the lock'), {
                'old': owner,
                'new': action_id
            })

    return False
Example #30
 def wait_on_children(self):
     while self.running:
         try:
             pid, status = os.wait()
             if os.WIFEXITED(status) or os.WIFSIGNALED(status):
                 self.LOG.error(_LE('Removing dead child %s'), pid)
                 self.children.remove(pid)
                 self.run_child()
         except OSError as err:
             if err.errno not in (errno.EINTR, errno.ECHILD):
                 raise
         except KeyboardInterrupt:
             self.LOG.info(_LI('Caught keyboard interrupt. Exiting.'))
             os.killpg(0, signal.SIGTERM)
             break
     eventlet.greenio.shutdown_safe(self.sock)
     self.sock.close()
     self.LOG.debug('Exited')
Example #31
 def info(self, ctxt, publisher_id, event_type, payload, metadata):
     meta = payload['metadata']
     if meta.get('cluster_id') == self.cluster_id:
         if event_type not in self.VM_FAILURE_EVENTS:
             return
         params = {
             'event': self.VM_FAILURE_EVENTS[event_type],
             'state': payload.get('state', 'Unknown'),
             'instance_id': payload.get('instance_id', 'Unknown'),
             'timestamp': metadata['timestamp'],
             'publisher': publisher_id,
         }
         node_id = meta.get('cluster_node_id')
         if node_id:
             LOG.info(_LI("Requesting node recovery: %s"), node_id)
             ctx_value = context.get_service_context(
                 project=self.project_id, user=payload['user_id'])
             ctx = context.RequestContext(**ctx_value)
             self.rpc.node_recover(ctx, node_id, params)
Example #32
def url_fetch(url, allowed_schemes=('http', 'https')):
    '''Get the data at the specified URL.

    The URL must use the http: or https: schemes.
    The file: scheme is also supported if you override
    the allowed_schemes argument.
    Raise an IOError if getting the data fails.
    '''
    LOG.info(_LI('Fetching data from %s'), url)

    components = urllib.parse.urlparse(url)

    if components.scheme not in allowed_schemes:
        raise URLFetchError(_('Invalid URL scheme %s') % components.scheme)

    if components.scheme == 'file':
        try:
            return urllib.request.urlopen(url).read()
        except urllib.error.URLError as uex:
            raise URLFetchError(_('Failed to retrieve data: %s') % uex)

    try:
        resp = requests.get(url, stream=True)
        resp.raise_for_status()

        # We cannot use resp.text here because it would download the entire
        # file, and a large enough file would bring down the engine.  The
        # 'Content-Length' header could be faked, so it's necessary to
        # download the content in chunks until max_response_size is reached.
        # The chunk_size we use needs to balance CPU-intensive string
        # concatenation with accuracy (e.g. it's possible to fetch 1000 bytes
        # greater than max_response_size with a chunk_size of 1000).
        reader = resp.iter_content(chunk_size=1000)
        result = ""
        for chunk in reader:
            result += chunk
            if len(result) > cfg.CONF.max_response_size:
                raise URLFetchError("Data exceeds maximum allowed size (%s"
                                    " bytes)" % cfg.CONF.max_response_size)
        return result

    except exceptions.RequestException as ex:
        raise URLFetchError(_('Failed to retrieve data: %s') % ex)
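
A short usage sketch for url_fetch(); URLFetchError is the same exception class raised above:

try:
    body = url_fetch('https://example.com/template.yaml')
except URLFetchError as ex:
    LOG.warning('Download failed: %s', ex)
    body = None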
Example #33
    def _load_runtime_registry(self):
        """Load the initial runtime registry with a DB scan."""
        db_registries = objects.HealthRegistry.claim(self.ctx, self.engine_id)

        for cluster in db_registries:
            entry = {
                'cluster_id': cluster.cluster_id,
                'check_type': cluster.check_type,
                'interval': cluster.interval,
                'params': cluster.params,
                'enabled': True,
            }

            LOG.info(_LI("Loading cluster %s for health monitoring"),
                     cluster.cluster_id)

            entry = self._start_check(entry)
            if entry:
                self.rt['registries'].append(entry)
Example #34
def url_fetch(url, allowed_schemes=('http', 'https')):
    '''Get the data at the specified URL.

    The URL must use the http: or https: schemes.
    The file: scheme is also supported if you override
    the allowed_schemes argument.
    Raise an IOError if getting the data fails.
    '''
    LOG.info(_LI('Fetching data from %s'), url)

    components = urllib.parse.urlparse(url)

    if components.scheme not in allowed_schemes:
        raise URLFetchError(_('Invalid URL scheme %s') % components.scheme)

    if components.scheme == 'file':
        try:
            return urllib.request.urlopen(url).read()
        except urllib.error.URLError as uex:
            raise URLFetchError(_('Failed to retrieve data: %s') % uex)

    try:
        resp = requests.get(url, stream=True)
        resp.raise_for_status()

        # We cannot use resp.text here because it would download the entire
        # file, and a large enough file would bring down the engine.  The
        # 'Content-Length' header could be faked, so it's necessary to
        # download the content in chunks until max_response_size is reached.
        # The chunk_size we use needs to balance CPU-intensive string
        # concatenation with accuracy (e.g. it's possible to fetch 1000 bytes
        # greater than max_response_size with a chunk_size of 1000).
        reader = resp.iter_content(chunk_size=1000)
        result = ""
        for chunk in reader:
            result += chunk
            if len(result) > cfg.CONF.max_response_size:
                raise URLFetchError("Data exceeds maximum allowed size (%s"
                                    " bytes)" % cfg.CONF.max_response_size)
        return result

    except exceptions.RequestException as ex:
        raise URLFetchError(_('Failed to retrieve data: %s') % ex)
Example #35
def main():
    try:
        logging.register_options(cfg.CONF)
        cfg.CONF(project='senlin', prog='senlin-api',
                 version=version.version_info.version_string())
        logging.setup(cfg.CONF, 'senlin-api')
        messaging.setup()

        app = wsgi.load_paste_app()

        host = cfg.CONF.senlin_api.bind_host
        port = cfg.CONF.senlin_api.bind_port
        LOG.info(_LI('Starting Senlin API on %(host)s:%(port)s'),
                 {'host': host, 'port': port})
        server = wsgi.Server('senlin-api', cfg.CONF.senlin_api)
        server.start(app, default_port=port)
        systemd.notify_once()
        server.wait()
    except RuntimeError as ex:
        sys.exit("ERROR: %s" % six.text_type(ex))
Example #36
    def start(self, application, conf, default_port):
        '''Run a WSGI server with the given application.

        :param application: The application to run in the WSGI server
        :param conf: a cfg.ConfigOpts object
        :param default_port: Port to bind to if none is specified in conf
        '''
        def kill_children(*args):
            """Kills the entire process group."""
            self.LOG.error(_LE('SIGTERM received'))
            signal.signal(signal.SIGTERM, signal.SIG_IGN)
            self.running = False
            os.killpg(0, signal.SIGTERM)

        def hup(*args):
            # Shuts down the server(s), but allows running requests to complete

            self.LOG.error(_LE('SIGHUP received'))
            signal.signal(signal.SIGHUP, signal.SIG_IGN)
            os.killpg(0, signal.SIGHUP)
            signal.signal(signal.SIGHUP, hup)

        # Note: may need to make this configurable
        eventlet.wsgi.MAX_HEADER_LINE = 16384
        self.application = application
        self.sock = get_socket(conf, default_port)

        self.LOG = logging.getLogger('eventlet.wsgi.server')

        if conf.workers == 0:
            # Useful for profiling, testing, debugging, etc.
            self.pool = eventlet.GreenPool(size=self.threads)
            self.pool.spawn_n(self._single_run, application, self.sock)
            return

        self.LOG.info(_LI("Starting %d workers") % conf.workers)
        signal.signal(signal.SIGTERM, kill_children)
        signal.signal(signal.SIGHUP, hup)
        while len(self.children) < conf.workers:
            self.run_child()
Example #37
    def wait_on_children(self):
        """Wait on children exit."""

        while self.running:
            try:
                pid, status = os.wait()
                if os.WIFEXITED(status) or os.WIFSIGNALED(status):
                    self._remove_children(pid)
                    self._verify_and_respawn_children(pid, status)
            except OSError as err:
                if err.errno not in (errno.EINTR, errno.ECHILD):
                    raise
            except KeyboardInterrupt:
                LOG.info(_LI('Caught keyboard interrupt. Exiting.'))
                os.killpg(0, signal.SIGTERM)
                break
            except exception.SIGHUPInterrupt:
                self.reload()
                continue

        eventlet.greenio.shutdown_safe(self.sock)
        self.sock.close()
        LOG.debug('Exited')
Example #38
    def wait_on_children(self):
        """Wait on children exit."""

        while self.running:
            try:
                pid, status = os.wait()
                if os.WIFEXITED(status) or os.WIFSIGNALED(status):
                    self._remove_children(pid)
                    self._verify_and_respawn_children(pid, status)
            except OSError as err:
                if err.errno not in (errno.EINTR, errno.ECHILD):
                    raise
            except KeyboardInterrupt:
                LOG.info(_LI('Caught keyboard interrupt. Exiting.'))
                os.killpg(0, signal.SIGTERM)
                break
            except exception.SIGHUPInterrupt:
                self.reload()
                continue

        eventlet.greenio.shutdown_safe(self.sock)
        self.sock.close()
        LOG.debug('Exited')
Example #39
    def read_global_environment(self):
        '''Read and parse global environment files.'''

        cfg.CONF.import_opt('environment_dir', 'senlin.common.config')
        env_dir = cfg.CONF.environment_dir

        try:
            files = glob.glob(os.path.join(env_dir, '*'))
        except OSError as ex:
            LOG.error(_LE('Failed to read %s'), env_dir)
            LOG.exception(ex)
            return

        for fname in files:
            try:
                with open(fname) as f:
                    LOG.info(_LI('Loading environment from %s'), fname)
                    self.load(self.parse(f.read()))
            except ValueError as vex:
                LOG.error(_LE('Failed to parse %s'), fname)
                LOG.exception(six.text_type(vex))
            except IOError as ioex:
                LOG.error(_LE('Failed to read %s'), fname)
                LOG.exception(six.text_type(ioex))
Example #40
    def read_global_environment(self):
        '''Read and parse global environment files.'''

        cfg.CONF.import_opt('environment_dir', 'senlin.common.config')
        env_dir = cfg.CONF.environment_dir

        try:
            files = glob.glob(os.path.join(env_dir, '*'))
        except OSError as ex:
            LOG.error(_LE('Failed to read %s'), env_dir)
            LOG.exception(ex)
            return

        for fname in files:
            try:
                with open(fname) as f:
                    LOG.info(_LI('Loading environment from %s'), fname)
                    self.load(self.parse(f.read()))
            except ValueError as vex:
                LOG.error(_LE('Failed to parse %s'), fname)
                LOG.exception(six.text_type(vex))
            except IOError as ioex:
                LOG.error(_LE('Failed to read %s'), fname)
                LOG.exception(six.text_type(ioex))
Example #41
    def do_update(self):
        """Handler for CLUSTER_UPDATE action.

        :returns: A tuple containing the result and the corresponding reason.
        """
        res = self.cluster.do_update(self.context)
        if not res:
            reason = _('Cluster update failed.')
            self.cluster.set_status(self.context, self.cluster.ERROR, reason)
            return self.RES_ERROR, reason

        name = self.inputs.get('name')
        metadata = self.inputs.get('metadata')
        timeout = self.inputs.get('timeout')
        profile_id = self.inputs.get('new_profile_id')

        if name is not None:
            self.cluster.name = name
        if metadata is not None:
            self.cluster.metadata = metadata
        if timeout is not None:
            self.cluster.timeout = timeout
        self.cluster.store(self.context)

        reason = _('Cluster update completed.')
        if profile_id is not None:
            fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
            LOG.info(fmt, {'cluster': self.cluster.id, 'profile': profile_id})
            child_actions = []
            for node in self.cluster.nodes:
                kwargs = {
                    'name': 'node_update_%s' % node.id[:8],
                    'cause': base.CAUSE_DERIVED,
                    'inputs': {
                        'new_profile_id': profile_id,
                    },
                    'user': self.context.user,
                    'project': self.context.project,
                    'domain': self.context.domain,
                }
                action = base.Action(node.id, 'NODE_UPDATE', **kwargs)
                action.store(self.context)
                child_actions.append(action)

            if child_actions:
                db_api.dependency_add(self.context,
                                      [c.id for c in child_actions],
                                      self.id)
                for child in child_actions:
                    db_api.action_update(self.context, child.id,
                                         {'status': child.READY})
                    dispatcher.start_action(action_id=child.id)

                result, new_reason = self._wait_for_dependents()
                if result != self.RES_OK:
                    self.cluster.set_status(self.context, self.cluster.WARNING,
                                            new_reason)
                    return result, new_reason
            self.cluster.set_status(self.context, self.cluster.ACTIVE,
                                    reason, profile_id=profile_id)
            return self.RES_OK, reason

        self.cluster.set_status(self.context, self.cluster.ACTIVE, reason)
        return self.RES_OK, reason
Example #42
    def do_update(self):
        """Handler for CLUSTER_UPDATE action.

        :returns: A tuple containing the result and the corresponding reason.
        """
        res = self.cluster.do_update(self.context)
        if not res:
            reason = _('Cluster update failed.')
            self.cluster.set_status(self.context, self.cluster.ERROR, reason)
            return self.RES_ERROR, reason

        name = self.inputs.get('name')
        metadata = self.inputs.get('metadata')
        timeout = self.inputs.get('timeout')
        profile_id = self.inputs.get('new_profile_id')

        if name is not None:
            self.cluster.name = name
        if metadata is not None:
            self.cluster.metadata = metadata
        if timeout is not None:
            self.cluster.timeout = timeout
        self.cluster.store(self.context)

        reason = _('Cluster update completed.')
        if profile_id is None:
            self.cluster.set_status(self.context, self.cluster.ACTIVE, reason)
            return self.RES_OK, reason

        fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
        LOG.info(fmt, {'cluster': self.cluster.id, 'profile': profile_id})
        child = []
        for node in self.cluster.nodes:
            kwargs = {
                'name': 'node_update_%s' % node.id[:8],
                'cause': base.CAUSE_DERIVED,
                'inputs': {
                    'new_profile_id': profile_id,
                },
            }
            action_id = base.Action.create(self.context, node.id,
                                           consts.NODE_UPDATE, **kwargs)
            child.append(action_id)

        if child:
            db_api.dependency_add(self.context, [c for c in child], self.id)
            for cid in child:
                db_api.action_update(self.context, cid,
                                     {'status': base.Action.READY})
                dispatcher.start_action(action_id=cid)

            result, new_reason = self._wait_for_dependents()
            if result != self.RES_OK:
                new_reason = _('Failed in updating nodes.')
                self.cluster.set_status(self.context, self.cluster.WARNING,
                                        new_reason)
                return result, new_reason

        self.cluster.set_status(self.context,
                                self.cluster.ACTIVE,
                                reason,
                                profile_id=profile_id)
        return self.RES_OK, reason
Example #43
def cluster_lock_acquire(context,
                         cluster_id,
                         action_id,
                         engine=None,
                         scope=CLUSTER_SCOPE,
                         forced=False):
    """Try to lock the specified cluster.

    :param cluster_id: ID of the cluster to be locked.
    :param action_id: ID of the action which wants to lock the cluster.
    :param engine: ID of the engine which wants to lock the cluster.
    :param scope: scope of the lock, either a cluster-wide or a node-wide
                  lock.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """

    # Step 1: try to lock the cluster - if the returned owner_id is the
    #         action id, it was a success
    owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
    if action_id in owners:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        LOG.debug('Acquire lock for cluster %s again' % cluster_id)
        owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
        if action_id in owners:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    # Will reach here only because scope == CLUSTER_SCOPE
    action = db_api.action_get(context, owners[0])
    if (action and action.owner and action.owner != engine
            and is_engine_dead(context, action.owner)):
        LOG.info(
            _LI('The cluster %(c)s is locked by dead action %(a)s, '
                'try to steal the lock.'), {
                    'c': cluster_id,
                    'a': owners[0]
                })
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context,
                                  action.id,
                                  time.time(),
                                  reason=reason)
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    LOG.error(
        _LE('Cluster is already locked by action %(old)s, '
            'action %(new)s failed grabbing the lock'), {
                'old': str(owners),
                'new': action_id
            })

    return False
Example #44
    def do_update(self):
        """Handler for CLUSTER_UPDATE action.

        :returns: A tuple containing the result and the corresponding reason.
        """
        res = self.cluster.do_update(self.context)
        if not res:
            reason = _('Cluster update failed.')
            self.cluster.set_status(self.context, self.cluster.ERROR, reason)
            return self.RES_ERROR, reason

        name = self.inputs.get('name')
        metadata = self.inputs.get('metadata')
        timeout = self.inputs.get('timeout')
        profile_id = self.inputs.get('new_profile_id')

        if name is not None:
            self.cluster.name = name
        if metadata is not None:
            self.cluster.metadata = metadata
        if timeout is not None:
            self.cluster.timeout = timeout
        self.cluster.store(self.context)

        reason = _('Cluster update completed.')
        if profile_id is None:
            self.cluster.set_status(self.context, self.cluster.ACTIVE, reason)
            return self.RES_OK, reason

        fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
        LOG.info(fmt, {'cluster': self.cluster.id, 'profile': profile_id})
        child = []
        for node in self.cluster.nodes:
            kwargs = {
                'name': 'node_update_%s' % node.id[:8],
                'cause': base.CAUSE_DERIVED,
                'inputs': {
                    'new_profile_id': profile_id,
                },
            }
            action_id = base.Action.create(self.context, node.id,
                                           consts.NODE_UPDATE, **kwargs)
            child.append(action_id)

        if child:
            dobj.Dependency.create(self.context, [c for c in child], self.id)
            for cid in child:
                ao.Action.update(self.context, cid,
                                 {'status': base.Action.READY})
            dispatcher.start_action()

            result, new_reason = self._wait_for_dependents()
            if result != self.RES_OK:
                new_reason = _('Failed in updating nodes.')
                self.cluster.set_status(self.context, self.cluster.WARNING,
                                        new_reason)
                return result, new_reason

        self.cluster.set_status(self.context, self.cluster.ACTIVE,
                                reason, profile_id=profile_id)
        return self.RES_OK, reason