Beispiel #1
0
    def info(self, ctxt, publisher_id, event_type, payload, metadata):
        """Handle an ``info`` notification and request node recovery.

        The notification is ignored unless its payload metadata names this
        listener's cluster, the event type is a key of
        ``VM_FAILURE_EVENTS`` and the payload metadata carries a
        ``cluster_node_id``.  Otherwise a 'node_recover' RPC is issued for
        that node.

        :param ctxt: Context delivered with the notification (unused).
        :param publisher_id: Identifier of the notification publisher.
        :param event_type: Event type string of the notification.
        :param payload: Dict holding the notification body.
        :param metadata: Dict of message metadata; ``timestamp`` is read.
        :returns: None.
        """
        # A notification with no (or a None) 'metadata' entry cannot refer to
        # this cluster, so treat it as "not ours" instead of raising.
        meta = payload.get('metadata') or {}
        cluster_id = meta.get('cluster_id')
        if not cluster_id or cluster_id != self.cluster_id:
            return

        if event_type not in self.VM_FAILURE_EVENTS:
            return

        # Nothing to recover without a node id; bail out before building the
        # request parameters.
        node_id = meta.get('cluster_node_id')
        if not node_id:
            return

        params = {
            'event': self.VM_FAILURE_EVENTS[event_type],
            'state': payload.get('state', 'Unknown'),
            'instance_id': payload.get('instance_id', 'Unknown'),
            'timestamp': metadata['timestamp'],
            'publisher': publisher_id,
            'operation': self.recover_action['operation'],
        }
        LOG.info("Requesting node recovery: %s", node_id)
        ctx = context.get_service_context(project_id=self.project_id,
                                          user_id=payload['user_id'])
        req = objects.NodeRecoverRequest(identity=node_id, params=params)
        self.rpc.call(ctx, 'node_recover', req)
Beispiel #2
0
    def info(self, ctxt, publisher_id, event_type, payload, metadata):
        """Handle an ``info`` notification about a stack and request recovery.

        The notification is dropped unless the event type is a key of
        ``STACK_FAILURE_EVENTS`` and the payload tags contain both a
        ``cluster_id`` tag whose value equals this listener's cluster and a
        ``cluster_node_id`` tag.  Otherwise a 'node_recover' RPC is issued
        for the tagged node.

        :param ctxt: Context delivered with the notification (unused).
        :param publisher_id: Identifier of the notification publisher.
        :param event_type: Event type string of the notification.
        :param payload: Dict holding the notification body.
        :param metadata: Dict of message metadata; ``timestamp`` is read.
        :returns: None.
        """
        if event_type not in self.STACK_FAILURE_EVENTS:
            return

        tags = payload['tags']
        if tags is None or tags == []:
            return

        matched_cluster = None
        target_node = None
        for tag in tags:
            # The value starts one character past the tag name, i.e. right
            # after the separator character following the name.
            if (matched_cluster is None and tag.startswith('cluster_id')
                    and tag[11:] == self.cluster_id):
                matched_cluster = tag[11:]
            if target_node is None and tag.startswith('cluster_node_id'):
                target_node = tag[16:]

        if matched_cluster is None or target_node is None:
            return

        params = dict(
            event=self.STACK_FAILURE_EVENTS[event_type],
            state=payload.get('state', 'Unknown'),
            stack_id=payload.get('stack_identity', 'Unknown'),
            timestamp=metadata['timestamp'],
            publisher=publisher_id,
        )
        LOG.info("Requesting stack recovery: %s", target_node)
        ctx = context.get_service_context(project=self.project_id,
                                          user=payload['user_identity'])
        request = objects.NodeRecoverRequest(identity=target_node,
                                             params=params)
        self.rpc.call(ctx, 'node_recover', request)
Beispiel #3
0
    def _recover_node(self, ctx, node_id):
        """Issue a 'node_recover' RPC for one node on a best-effort basis.

        The request parameters are taken from ``self.recover_action``.

        :param ctx: Request context passed to the RPC client.
        :param node_id: Identity of the node to recover.
        :returns: The result of the RPC call, or None when building or
                  sending the request raised (the error is logged).
        """
        outcome = None
        try:
            request = objects.NodeRecoverRequest(
                identity=node_id, params=self.recover_action)
            outcome = self.rpc_client.call(ctx, 'node_recover', request)
        except Exception as err:
            # Failures are deliberately not propagated to the caller.
            LOG.error("Error when performing node recovery for %s: %s",
                      node_id, err)
        return outcome
Beispiel #4
0
    def _recover_node(self, node_id, ctx, recover_action):
        """Issue a 'node_recover' RPC for one node on a best-effort basis.

        :param node_id: Identity of the node to recover.
        :param ctx: Request context passed to the RPC client.
        :param recover_action: Value sent as ``params`` of the request.
        :returns: The result of the RPC call, or None when logging, building
                  or sending the request raised (the error is logged).
        """
        outcome = None
        try:
            requester = self.__class__.__name__
            LOG.info("%s is requesting node recovery for %s.",
                     requester, node_id)
            request = objects.NodeRecoverRequest(
                identity=node_id, params=recover_action)
            outcome = self.rpc_client.call(ctx, 'node_recover', request)
        except Exception as err:
            # Failures are deliberately not propagated to the caller.
            LOG.error('Error when performing node recovery for %s: %s',
                      node_id, err)
        return outcome
Beispiel #5
0
    def _poll_cluster(self, cluster_id, timeout, recover_action):
        """Routine to be executed for polling cluster status.

        Triggers a 'cluster_check' RPC, waits for the resulting action and
        then requests recovery of every node whose status is not ACTIVE.
        Failures of the check or of an individual node recovery request are
        logged and do not abort the routine.

        :param cluster_id: The UUID of the cluster to be checked.
        :param timeout: The maximum number of seconds to wait.
        :param recover_action: Value sent as ``params`` of each node
                               recover request.
        :returns: The value of ``_chase_up(start_time, timeout)``, where
                  ``start_time`` is taken when the routine starts.
        """
        start_time = timeutils.utcnow(True)
        cluster = objects.Cluster.get(self.ctx, cluster_id, project_safe=False)
        if not cluster:
            LOG.warning("Cluster (%s) is not found.", cluster_id)
            return _chase_up(start_time, timeout)

        ctx = context.get_service_context(user_id=cluster.user,
                                          project_id=cluster.project)
        params = {'delete_check_action': True}
        try:
            req = objects.ClusterCheckRequest(identity=cluster_id,
                                              params=params)
            action = self.rpc_client.call(ctx, 'cluster_check', req)
        except Exception as ex:
            LOG.warning(
                "Failed in triggering 'cluster_check' RPC for "
                "'%(c)s': %(r)s", {
                    'c': cluster_id,
                    'r': six.text_type(ex)
                })
            return _chase_up(start_time, timeout)

        # wait for action to complete
        res, reason = self._wait_for_action(ctx, action['action'], timeout)
        if not res:
            LOG.warning("%s", reason)
            return _chase_up(start_time, timeout)

        # loop through nodes to trigger recovery
        nodes = objects.Node.get_all_by_cluster(ctx, cluster_id)
        for node in nodes:
            if node.status == consts.NS_ACTIVE:
                continue

            LOG.info("Requesting node recovery: %s", node.id)
            # Guard each request like the 'cluster_check' call above: one
            # failing node must not prevent recovery of the remaining nodes
            # nor skip the return value computed below.
            try:
                req = objects.NodeRecoverRequest(identity=node.id,
                                                 params=recover_action)
                self.rpc_client.call(ctx, 'node_recover', req)
            except Exception as ex:
                LOG.error("Error when performing node recovery for %s: %s",
                          node.id, ex)

        return _chase_up(start_time, timeout)
Beispiel #6
0
    def _check_url_and_recover_node(self, ctx, node, recover_action, params):
        """Routine to check a node status from a url and recovery if necessary

        :param ctx: The request context to use for recovery action
        :param node: The node to be checked.
        :param recover_action: The health policy action name.
        :param params: Parameters specific to poll url or recovery action
        :returns: action if node was triggered for recovery.  Otherwise None.
        """

        url_template = params['poll_url']
        verify_ssl = params['poll_url_ssl_verify']
        expected_resp_str = params['poll_url_healthy_response']
        max_unhealthy_retry = params['poll_url_retry_limit']
        retry_interval = params['poll_url_retry_interval']
        node_update_timeout = params['node_update_timeout']

        url = self._expand_url_template(url_template, node)
        LOG.info("Polling node status from URL: %s", url)

        available_attemps = max_unhealthy_retry
        while available_attemps > 0:
            available_attemps -= 1

            try:
                result = utils.url_fetch(url, verify=verify_ssl)
            except utils.URLFetchError as ex:
                LOG.error(
                    "Error when requesting node health status from"
                    " %s: %s", url, ex)
                return None

            LOG.debug("Node status returned from URL(%s): %s", url, result)
            if re.search(expected_resp_str, result):
                LOG.debug('Node %s is healthy', node.id)
                return None

            if node.status != consts.NS_ACTIVE:
                LOG.info(
                    "Skip node recovery because node %s is not in "
                    "ACTIVE state", node.id)
                return None

            node_last_updated = node.updated_at or node.init_at
            if not timeutils.is_older_than(node_last_updated,
                                           node_update_timeout):
                LOG.info(
                    "Node %s was updated at %s which is less than "
                    "%d secs ago. Skip node recovery.", node.id,
                    node_last_updated, node_update_timeout)
                return None

            LOG.info("Node %s is reported as down (%d retries left)", node.id,
                     available_attemps)
            time.sleep(retry_interval)

        # recover node after exhausting retries
        LOG.info("Requesting node recovery: %s", node.id)
        req = objects.NodeRecoverRequest(identity=node.id,
                                         params=recover_action)

        return self.rpc_client.call(ctx, 'node_recover', req)