def test_cancel_backgound_command(self, read_stack_version_mock, resolve_hook_script_path_mock, resolve_script_path_mock, FileCache_mock,
                                      kill_process_with_children_mock):
    FileCache_mock.return_value = None
    FileCache_mock.cache_dir = MagicMock()
    resolve_hook_script_path_mock.return_value = None
#     shell.kill_process_with_children = MagicMock()
    dummy_controller = MagicMock()
    cfg = AmbariConfig().getConfig()
    cfg.set('agent', 'tolerate_download_failures', 'true')
    cfg.set('agent', 'prefix', '.')
    cfg.set('agent', 'cache_dir', 'background_tasks')

    actionQueue = ActionQueue(cfg, dummy_controller)

    dummy_controller.actionQueue = actionQueue
    orchestrator = CustomServiceOrchestrator(cfg, dummy_controller)
    orchestrator.file_cache = MagicMock()
    def f(a, b):
      return ""
    orchestrator.file_cache.get_service_base_dir = f
    actionQueue.customServiceOrchestrator = orchestrator

    import TestActionQueue
    import copy

    TestActionQueue.patch_output_file(orchestrator.python_executor)
    orchestrator.python_executor.prepare_process_result = MagicMock()
    orchestrator.dump_command_to_json = MagicMock()

    lock = threading.RLock()
    complete_done = threading.Condition(lock)

    complete_was_called = {}
    def command_complete_w(process_condensed_result, handle):
      with lock:
        complete_was_called['visited'] = ''
        complete_done.wait(3)

    actionQueue.on_background_command_complete_callback = TestActionQueue.wraped(actionQueue.on_background_command_complete_callback, command_complete_w, None)
    execute_command = copy.deepcopy(TestActionQueue.TestActionQueue.background_command)
    actionQueue.put([execute_command])
    actionQueue.processBackgroundQueueSafeEmpty()

    time.sleep(.1)

    orchestrator.cancel_command(19,'')
    self.assertTrue(kill_process_with_children_mock.called)
    kill_process_with_children_mock.assert_called_with(33)

    with lock:
      complete_done.notifyAll()

    with lock:
      self.assertTrue(complete_was_called.has_key('visited'))

    time.sleep(.1)

    runningCommand = actionQueue.commandStatuses.get_command_status(19)
    self.assertTrue(runningCommand is not None)
    self.assertEqual(runningCommand['status'], ActionQueue.FAILED_STATUS)
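
The test above relies on TestActionQueue.wraped to run command_complete_w alongside the original completion callback. That helper is not shown on this page; a plausible minimal sketch (assumed behavior, the real test utility may differ) is:

def wraped(func, before=None, after=None):
    # Assumed semantics: call the optional 'before' hook with the same
    # arguments, then the wrapped callback, then the optional 'after' hook.
    def wrapper(*args, **kwargs):
        if before is not None:
            before(*args, **kwargs)
        result = func(*args, **kwargs)
        if after is not None:
            after(*args, **kwargs)
        return result
    return wrapper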
Example No. 2
class ActionQueue(threading.Thread):
    """ Action Queue for the agent. We pick one command at a time from the queue
  and execute it
  Note: Action and command terms in this and related classes are used interchangeably
  """

    # How many actions can be performed in parallel. Feel free to change
    MAX_CONCURRENT_ACTIONS = 5

    # How much time (in seconds) we wait for a new incoming execution command
    # before checking the status command queue
    EXECUTION_COMMAND_WAIT_TIME = 2

    STATUS_COMMAND = 'STATUS_COMMAND'
    EXECUTION_COMMAND = 'EXECUTION_COMMAND'
    AUTO_EXECUTION_COMMAND = 'AUTO_EXECUTION_COMMAND'
    BACKGROUND_EXECUTION_COMMAND = 'BACKGROUND_EXECUTION_COMMAND'
    ROLE_COMMAND_INSTALL = 'INSTALL'
    ROLE_COMMAND_START = 'START'
    ROLE_COMMAND_STOP = 'STOP'
    ROLE_COMMAND_CUSTOM_COMMAND = 'CUSTOM_COMMAND'
    CUSTOM_COMMAND_RESTART = 'RESTART'
    CUSTOM_COMMAND_START = ROLE_COMMAND_START

    IN_PROGRESS_STATUS = 'IN_PROGRESS'
    COMPLETED_STATUS = 'COMPLETED'
    FAILED_STATUS = 'FAILED'

    def __init__(self, config, controller):
        super(ActionQueue, self).__init__()
        self.commandQueue = Queue.Queue()
        self.statusCommandResultQueue = multiprocessing.Queue(
        )  # this queue is filled by StatusCommandsExecutor.
        self.backgroundCommandQueue = Queue.Queue()
        self.commandStatuses = CommandStatusDict(
            callback_action=self.status_update_callback)
        self.config = config
        self.controller = controller
        self.configTags = {}
        self._stop = threading.Event()
        self.tmpdir = config.get('agent', 'prefix')
        self.customServiceOrchestrator = CustomServiceOrchestrator(
            config, controller)
        self.parallel_execution = config.get_parallel_exec_option()
        if self.parallel_execution == 1:
            logger.info(
                "Parallel execution is enabled, will execute agent commands in parallel"
            )
        self.lock = threading.Lock()

    def stop(self):
        self._stop.set()

    def stopped(self):
        return self._stop.isSet()

    def put_status(self, commands):
        self.controller.statusCommandsExecutor.put_commands(commands)

    def put(self, commands):
        for command in commands:
            if not command.has_key('serviceName'):
                command['serviceName'] = "null"
            if not command.has_key('clusterName'):
                command['clusterName'] = 'null'

            logger.info("Adding " + command['commandType'] + " for role " + \
                        command['role'] + " for service " + \
                        command['serviceName'] + " of cluster " + \
                        command['clusterName'] + " to the queue.")
            if command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND:
                self.backgroundCommandQueue.put(
                    self.createCommandHandle(command))
            else:
                self.commandQueue.put(command)

    def cancel(self, commands):
        for command in commands:

            logger.info("Canceling command with taskId = {tid}".format(
                tid=str(command['target_task_id'])))
            logger.debug(pprint.pformat(command))

            task_id = command['target_task_id']
            reason = command['reason']

            # Remove from the command queue by task_id
            queue = self.commandQueue
            self.commandQueue = Queue.Queue()

            while not queue.empty():
                queued_command = queue.get(False)
                if queued_command['taskId'] != task_id:
                    self.commandQueue.put(queued_command)
                else:
                    logger.info("Canceling " + queued_command['commandType'] + \
                                " for service " + queued_command['serviceName'] + \
                                " and role " +  queued_command['role'] + \
                                " with taskId " + str(queued_command['taskId']))

            # Kill if in progress
            self.customServiceOrchestrator.cancel_command(task_id, reason)

    def run(self):
        try:
            while not self.stopped():
                self.processBackgroundQueueSafeEmpty()
                self.controller.get_status_commands_executor().process_results(
                )  # process status commands
                try:
                    if self.parallel_execution == 0:
                        command = self.commandQueue.get(
                            True, self.EXECUTION_COMMAND_WAIT_TIME)
                        self.process_command(command)
                    else:
                        # If parallel execution is enabled, just kick off all available
                        # commands using separate threads
                        while (True):
                            command = self.commandQueue.get(
                                True, self.EXECUTION_COMMAND_WAIT_TIME)
                            # If the command is not retry-enabled, do not start it in parallel.
                            # Checking just one command is enough, as all commands for a stage
                            # are sent at the same time and retry is only enabled for the
                            # initial start/install.
                            retryAble = False
                            if 'commandParams' in command and 'command_retry_enabled' in command[
                                    'commandParams']:
                                retryAble = command['commandParams'][
                                    'command_retry_enabled'] == "true"
                            if retryAble:
                                logger.info(
                                    "Kicking off a thread for the command, id="
                                    + str(command['commandId']) + " taskId=" +
                                    str(command['taskId']))
                                t = threading.Thread(
                                    target=self.process_command,
                                    args=(command, ))
                                t.daemon = True
                                t.start()
                            else:
                                self.process_command(command)
                                break
                            pass
                        pass
                except (Queue.Empty):
                    pass
        except:
            logger.exception("ActionQueue thread failed with exception:")
            raise

        logger.info("ActionQueue thread has successfully finished")

    def processBackgroundQueueSafeEmpty(self):
        while not self.backgroundCommandQueue.empty():
            try:
                command = self.backgroundCommandQueue.get(False)
                if command.has_key(
                        '__handle') and command['__handle'].status == None:
                    self.process_command(command)
            except Queue.Empty:
                pass

    def createCommandHandle(self, command):
        if command.has_key('__handle'):
            raise AgentException("Command already has __handle")
        command['__handle'] = BackgroundCommandExecutionHandle(
            command, command['commandId'], None,
            self.on_background_command_complete_callback)
        return command
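
    # For reference: the code in this listing only touches a few attributes of
    # the handle created above -- handle.status (None until the background
    # command is picked up, see processBackgroundQueueSafeEmpty), plus
    # handle.exitCode and handle.command (both read in
    # on_background_command_complete_callback).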

    def process_command(self, command):
        # make sure we log failures
        commandType = command['commandType']
        logger.debug("Took an element of Queue (command type = %s)." %
                     commandType)
        try:
            if commandType in [
                    self.EXECUTION_COMMAND, self.BACKGROUND_EXECUTION_COMMAND,
                    self.AUTO_EXECUTION_COMMAND
            ]:
                try:
                    if self.controller.recovery_manager.enabled():
                        self.controller.recovery_manager.start_execution_command(
                        )
                    self.execute_command(command)
                finally:
                    if self.controller.recovery_manager.enabled():
                        self.controller.recovery_manager.stop_execution_command(
                        )
            else:
                logger.error("Unrecognized command " + pprint.pformat(command))
        except Exception:
            logger.exception(
                "Exception while processing {0} command".format(commandType))

    def tasks_in_progress_or_pending(self):
        return_val = False
        if not self.commandQueue.empty():
            return_val = True
        if self.controller.recovery_manager.has_active_command():
            return_val = True
        return return_val
        pass

    def execute_command(self, command):
        '''
    Executes commands of type EXECUTION_COMMAND
    '''
        clusterName = command['clusterName']
        commandId = command['commandId']
        isCommandBackground = command[
            'commandType'] == self.BACKGROUND_EXECUTION_COMMAND
        isAutoExecuteCommand = command[
            'commandType'] == self.AUTO_EXECUTION_COMMAND
        message = "Executing command with id = {commandId}, taskId = {taskId} for role = {role} of " \
                  "cluster {cluster}.".format(
                  commandId = str(commandId), taskId = str(command['taskId']),
                  role=command['role'], cluster=clusterName)
        logger.info(message)

        taskId = command['taskId']
        # Preparing 'IN_PROGRESS' report
        in_progress_status = self.commandStatuses.generate_report_template(
            command)
        # The paths of the files that contain the output and error logs use a
        # prefix that the agent advertises to the server. The prefix is defined
        # in agent-config.ini.
        if not isAutoExecuteCommand:
            in_progress_status.update({
                'tmpout':
                self.tmpdir + os.sep + 'output-' + str(taskId) + '.txt',
                'tmperr':
                self.tmpdir + os.sep + 'errors-' + str(taskId) + '.txt',
                'structuredOut':
                self.tmpdir + os.sep + 'structured-out-' + str(taskId) +
                '.json',
                'status':
                self.IN_PROGRESS_STATUS
            })
        else:
            in_progress_status.update({
                'tmpout':
                self.tmpdir + os.sep + 'auto_output-' + str(taskId) + '.txt',
                'tmperr':
                self.tmpdir + os.sep + 'auto_errors-' + str(taskId) + '.txt',
                'structuredOut':
                self.tmpdir + os.sep + 'auto_structured-out-' + str(taskId) +
                '.json',
                'status':
                self.IN_PROGRESS_STATUS
            })

        self.commandStatuses.put_command_status(command, in_progress_status)

        numAttempts = 0
        retryDuration = 0  # even with 0 allow one attempt
        retryAble = False
        delay = 1
        log_command_output = True
        if 'commandParams' in command and 'log_output' in command[
                'commandParams'] and "false" == command['commandParams'][
                    'log_output']:
            log_command_output = False

        if 'commandParams' in command:
            if 'max_duration_for_retries' in command['commandParams']:
                retryDuration = int(
                    command['commandParams']['max_duration_for_retries'])
            if 'command_retry_enabled' in command['commandParams']:
                retryAble = command['commandParams'][
                    'command_retry_enabled'] == "true"
        if isAutoExecuteCommand:
            retryAble = False

        logger.info(
            "Command execution metadata - taskId = {taskId}, retry enabled = {retryAble}, max retry duration (sec) = {retryDuration}, log_output = {log_command_output}"
            .format(taskId=taskId,
                    retryAble=retryAble,
                    retryDuration=retryDuration,
                    log_command_output=log_command_output))
        command_canceled = False
        while retryDuration >= 0:
            numAttempts += 1
            start = 0
            if retryAble:
                start = int(time.time())
            # running command
            commandresult = self.customServiceOrchestrator.runCommand(
                command,
                in_progress_status['tmpout'],
                in_progress_status['tmperr'],
                override_output_files=numAttempts == 1,
                retry=numAttempts > 1)
            end = 1
            if retryAble:
                end = int(time.time())
            retryDuration -= (end - start)

            # dumping results
            if isCommandBackground:
                logger.info(
                    "Command is background command, quit retrying. Exit code: {exitCode}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId,
                            exitCode=commandresult['exitcode'],
                            retryAble=retryAble,
                            retryDuration=retryDuration,
                            delay=delay))
                return
            else:
                if commandresult['exitcode'] == 0:
                    status = self.COMPLETED_STATUS
                else:
                    status = self.FAILED_STATUS
                    if (commandresult['exitcode']
                            == -signal.SIGTERM) or (commandresult['exitcode']
                                                    == -signal.SIGKILL):
                        logger.info(
                            'Command with taskId = {cid} was canceled!'.format(
                                cid=taskId))
                        command_canceled = True
                        break

            if status != self.COMPLETED_STATUS and retryAble and retryDuration > 0:
                delay = self.get_retry_delay(delay)
                if delay > retryDuration:
                    delay = retryDuration
                retryDuration -= delay  # allow one last attempt
                commandresult[
                    'stderr'] += "\n\nCommand failed. Retrying command execution ...\n\n"
                logger.info(
                    "Retrying command with taskId = {cid} after a wait of {delay}"
                    .format(cid=taskId, delay=delay))
                command['commandBeingRetried'] = "true"
                time.sleep(delay)
                continue
            else:
                logger.info(
                    "Quit retrying for command with taskId = {cid}. Status: {status}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId,
                            status=status,
                            retryAble=retryAble,
                            retryDuration=retryDuration,
                            delay=delay))
                break

        # do not fail task which was rescheduled from server
        if command_canceled:
            with self.lock:
                with self.commandQueue.mutex:
                    for com in self.commandQueue.queue:
                        if com['taskId'] == command['taskId']:
                            logger.info(
                                'Command with taskId = {cid} was rescheduled by server. '
                                'Fail report on cancelled command won\'t be sent with heartbeat.'
                                .format(cid=taskId))
                            return

        # final result to stdout
        commandresult[
            'stdout'] += '\n\nCommand completed successfully!\n' if status == self.COMPLETED_STATUS else '\n\nCommand failed after ' + str(
                numAttempts) + ' tries\n'
        logger.info(
            'Command with taskId = {cid} completed successfully!'.format(
                cid=taskId) if status == self.COMPLETED_STATUS else
            'Command with taskId = {cid} failed after {attempts} tries'.
            format(cid=taskId, attempts=numAttempts))

        roleResult = self.commandStatuses.generate_report_template(command)
        roleResult.update({
            'stdout': commandresult['stdout'],
            'stderr': commandresult['stderr'],
            'exitCode': commandresult['exitcode'],
            'status': status,
        })

        if self.config.has_option("logging","log_command_executes") \
            and int(self.config.get("logging", "log_command_executes")) == 1 \
            and log_command_output:

            if roleResult['stdout'] != '':
                logger.info("Begin command output log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])
                self.log_command_output(roleResult['stdout'],
                                        str(command['taskId']))
                logger.info("End command output log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])

            if roleResult['stderr'] != '':
                logger.info("Begin command stderr log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])
                self.log_command_output(roleResult['stderr'],
                                        str(command['taskId']))
                logger.info("End command stderr log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])

        if roleResult['stdout'] == '':
            roleResult['stdout'] = 'None'
        if roleResult['stderr'] == '':
            roleResult['stderr'] = 'None'

        # let ambari know name of custom command
        if command['hostLevelParams'].has_key('custom_command'):
            roleResult['customCommand'] = command['hostLevelParams'][
                'custom_command']

        if 'structuredOut' in commandresult:
            roleResult['structuredOut'] = str(
                json.dumps(commandresult['structuredOut']))
        else:
            roleResult['structuredOut'] = ''

        # let recovery manager know the current state
        if status == self.COMPLETED_STATUS:
            if self.controller.recovery_manager.enabled() and command.has_key('roleCommand') \
                and self.controller.recovery_manager.configured_for_recovery(command['role']):
                if command['roleCommand'] == self.ROLE_COMMAND_START:
                    self.controller.recovery_manager.update_current_status(
                        command['role'], LiveStatus.LIVE_STATUS)
                    self.controller.recovery_manager.update_config_staleness(
                        command['role'], False)
                    logger.info(
                        "After EXECUTION_COMMAND (START), with taskId=" +
                        str(command['taskId']) + ", current state of " +
                        command['role'] + " to " +
                        self.controller.recovery_manager.get_current_status(
                            command['role']))
                elif command[
                        'roleCommand'] == self.ROLE_COMMAND_STOP or command[
                            'roleCommand'] == self.ROLE_COMMAND_INSTALL:
                    self.controller.recovery_manager.update_current_status(
                        command['role'], LiveStatus.DEAD_STATUS)
                    logger.info(
                        "After EXECUTION_COMMAND (STOP/INSTALL), with taskId="
                        + str(command['taskId']) + ", current state of " +
                        command['role'] + " to " +
                        self.controller.recovery_manager.get_current_status(
                            command['role']))
                elif command[
                        'roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND:
                    if command['hostLevelParams'].has_key('custom_command') and \
                            command['hostLevelParams']['custom_command'] == self.CUSTOM_COMMAND_RESTART:
                        self.controller.recovery_manager.update_current_status(
                            command['role'], LiveStatus.LIVE_STATUS)
                        self.controller.recovery_manager.update_config_staleness(
                            command['role'], False)
                        logger.info(
                            "After EXECUTION_COMMAND (RESTART), current state of "
                            + command['role'] + " to " +
                            self.controller.recovery_manager.
                            get_current_status(command['role']))
            pass

            # let ambari know that configuration tags were applied
            configHandler = ActualConfigHandler(self.config, self.configTags)

            #update
            if 'commandParams' in command:
                command_params = command['commandParams']
                if command_params and command_params.has_key(
                        'forceRefreshConfigTags') and len(
                            command_params['forceRefreshConfigTags']) > 0:
                    forceRefreshConfigTags = command_params[
                        'forceRefreshConfigTags'].split(',')
                    logger.info(
                        "Got refresh additional component tags command")

                    for configTag in forceRefreshConfigTags:
                        configHandler.update_component_tag(
                            command['role'], configTag,
                            command['configurationTags'][configTag])

                    roleResult[
                        'customCommand'] = self.CUSTOM_COMMAND_RESTART  # force restart for component to evict stale_config on server side
                    command[
                        'configurationTags'] = configHandler.read_actual_component(
                            command['role'])

            if command.has_key('configurationTags'):
                configHandler.write_actual(command['configurationTags'])
                roleResult['configurationTags'] = command['configurationTags']
            component = {
                'serviceName': command['serviceName'],
                'componentName': command['role']
            }
            if 'roleCommand' in command and \
                (command['roleCommand'] == self.ROLE_COMMAND_START or
                   (command['roleCommand'] == self.ROLE_COMMAND_INSTALL and component in LiveStatus.CLIENT_COMPONENTS) or
                     (command['roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND and
                        'custom_command' in command['hostLevelParams'] and
                            command['hostLevelParams']['custom_command'] in (self.CUSTOM_COMMAND_RESTART, self.CUSTOM_COMMAND_START))):
                configHandler.write_actual_component(
                    command['role'], command['configurationTags'])
                if 'clientsToUpdateConfigs' in command[
                        'hostLevelParams'] and command['hostLevelParams'][
                            'clientsToUpdateConfigs']:
                    configHandler.write_client_components(
                        command['serviceName'], command['configurationTags'],
                        command['hostLevelParams']['clientsToUpdateConfigs'])
                roleResult[
                    'configurationTags'] = configHandler.read_actual_component(
                        command['role'])
        elif status == self.FAILED_STATUS:
            if self.controller.recovery_manager.enabled() and command.has_key('roleCommand') \
                    and self.controller.recovery_manager.configured_for_recovery(command['role']):
                if command['roleCommand'] == self.ROLE_COMMAND_INSTALL:
                    self.controller.recovery_manager.update_current_status(
                        command['role'],
                        self.controller.recovery_manager.INSTALL_FAILED)
                    logger.info(
                        "After EXECUTION_COMMAND (INSTALL), with taskId=" +
                        str(command['taskId']) + ", current state of " +
                        command['role'] + " to " +
                        self.controller.recovery_manager.get_current_status(
                            command['role']))

        self.commandStatuses.put_command_status(command, roleResult)

    def log_command_output(self, text, taskId):
        """
    Logs a message as multiple enumerated log messages, each of which is no
    larger than MAX_SYMBOLS_PER_LOG_MESSAGE.

    This is very useful for logging big messages when logs are redirected to
    syslog (syslog_enabled=1), since syslog usually truncates long messages.
    """
        chunks = split_on_chunks(text, MAX_SYMBOLS_PER_LOG_MESSAGE)
        if len(chunks) > 1:
            for i in range(len(chunks)):
                logger.info(
                    "Cmd log for taskId={0} and chunk {1}/{2} of log for command: \n"
                    .format(taskId, i + 1, len(chunks)) + chunks[i])
        else:
            logger.info("Cmd log for taskId={0}: ".format(taskId) + text)

    def get_retry_delay(self, last_delay):
        """
    Returns an exponentially growing delay. The idea is that if the number of
    retries is high, the failure is probably a host- or environment-specific
    issue that requires longer waits.
    """
        return last_delay * 2
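
        # Worked example (see the retry loop in execute_command above): with
        # the initial delay of 1 and a 30 second retry budget, the successive
        # waits are 2, 4, 8, 16 -- each delay doubles and is then capped at
        # the remaining retryDuration.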

    def command_was_canceled(self):
        # NOTE: effectively a no-op stub -- the bare attribute access below
        # has no effect.
        self.customServiceOrchestrator

    def on_background_command_complete_callback(self, process_condensed_result,
                                                handle):
        logger.debug('Start callback: %s' % process_condensed_result)
        logger.debug('The handle is: %s' % handle)
        status = self.COMPLETED_STATUS if handle.exitCode == 0 else self.FAILED_STATUS

        aborted_postfix = self.customServiceOrchestrator.command_canceled_reason(
            handle.command['taskId'])
        if aborted_postfix:
            status = self.FAILED_STATUS
            logger.debug('Set status to: %s , reason = %s' %
                         (status, aborted_postfix))
        else:
            aborted_postfix = ''

        roleResult = self.commandStatuses.generate_report_template(
            handle.command)

        roleResult.update({
            'stdout':
            process_condensed_result['stdout'] + aborted_postfix,
            'stderr':
            process_condensed_result['stderr'] + aborted_postfix,
            'exitCode':
            process_condensed_result['exitcode'],
            'structuredOut':
            str(json.dumps(process_condensed_result['structuredOut']))
            if 'structuredOut' in process_condensed_result else '',
            'status':
            status,
        })

        self.commandStatuses.put_command_status(handle.command, roleResult)

    def execute_status_command_and_security_status(self, command):
        component_status_result = self.customServiceOrchestrator.requestComponentStatus(
            command)
        return command, component_status_result

    def process_status_command_result(self, result):
        '''
    Executes commands of type STATUS_COMMAND
    '''
        try:
            command, component_status_result = result
            cluster = command['clusterName']
            service = command['serviceName']
            component = command['componentName']
            configurations = command['configurations']
            if configurations.has_key('global'):
                globalConfig = configurations['global']
            else:
                globalConfig = {}

            if not Script.config:
                logger.debug(
                    'Setting Script.config to last status command configuration'
                )
                Script.config = command

            livestatus = LiveStatus(cluster, service, component, globalConfig,
                                    self.config, self.configTags)

            component_extra = None

            if component_status_result['exitcode'] == 0:
                component_status = LiveStatus.LIVE_STATUS
                if self.controller.recovery_manager.enabled() \
                  and self.controller.recovery_manager.configured_for_recovery(component):
                    self.controller.recovery_manager.update_current_status(
                        component, component_status)
            else:
                component_status = LiveStatus.DEAD_STATUS
                if self.controller.recovery_manager.enabled() \
                  and self.controller.recovery_manager.configured_for_recovery(component):
                    if (self.controller.recovery_manager.get_current_status(
                            component) !=
                            self.controller.recovery_manager.INSTALL_FAILED):
                        self.controller.recovery_manager.update_current_status(
                            component, component_status)

            request_execution_cmd = self.controller.recovery_manager.requires_recovery(component) and \
                                      not self.controller.recovery_manager.command_exists(component, ActionQueue.EXECUTION_COMMAND)

            if 'structuredOut' in component_status_result:
                component_extra = component_status_result['structuredOut']

            result = livestatus.build(component_status=component_status)
            if self.controller.recovery_manager.enabled():
                result['sendExecCmdDet'] = str(request_execution_cmd)

            if component_extra is not None and len(component_extra) != 0:
                if component_extra.has_key('alerts'):
                    result['alerts'] = component_extra['alerts']
                    del component_extra['alerts']

                result['extra'] = component_extra

            logger.debug("Got live status for component " + component + \
                         " of service " + str(service) + \
                         " of cluster " + str(cluster))

            logger.debug(pprint.pformat(result))
            if result is not None:
                self.commandStatuses.put_command_status(command, result)
        except Exception, err:
            traceback.print_exc()
            logger.warn(err)
        pass
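
For context, ActionQueue is a thread owned by the agent controller. A hypothetical driver sketch, mirroring how the tests on this page wire things up (cfg and the controller mock are placeholders, not the real agent bootstrap):

from mock import MagicMock

cfg = AmbariConfig().getConfig()
dummy_controller = MagicMock()

action_queue = ActionQueue(cfg, dummy_controller)
action_queue.start()    # kicks off the run() loop above

# ... server-issued commands arrive via put() / put_status() / cancel() ...

action_queue.stop()     # sets the event polled by stopped()
action_queue.join()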
Example No. 3
class ActionQueue(threading.Thread):
  """ Action Queue for the agent. We pick one command at a time from the queue
  and execute it
  Note: Action and command terms in this and related classes are used interchangeably
  """

  # How many actions can be performed in parallel. Feel free to change
  MAX_CONCURRENT_ACTIONS = 5


  # How much time (in seconds) we wait for a new incoming execution command
  # before checking the status command queue
  EXECUTION_COMMAND_WAIT_TIME = 2

  STATUS_COMMAND = 'STATUS_COMMAND'
  EXECUTION_COMMAND = 'EXECUTION_COMMAND'
  AUTO_EXECUTION_COMMAND = 'AUTO_EXECUTION_COMMAND'
  BACKGROUND_EXECUTION_COMMAND = 'BACKGROUND_EXECUTION_COMMAND'
  ROLE_COMMAND_INSTALL = 'INSTALL'
  ROLE_COMMAND_START = 'START'
  ROLE_COMMAND_STOP = 'STOP'
  ROLE_COMMAND_CUSTOM_COMMAND = 'CUSTOM_COMMAND'
  CUSTOM_COMMAND_RESTART = 'RESTART'

  IN_PROGRESS_STATUS = 'IN_PROGRESS'
  COMPLETED_STATUS = 'COMPLETED'
  FAILED_STATUS = 'FAILED'

  def __init__(self, config, controller):
    super(ActionQueue, self).__init__()
    self.commandQueue = Queue.Queue()
    self.statusCommandQueue = Queue.Queue()
    self.backgroundCommandQueue = Queue.Queue()
    self.commandStatuses = CommandStatusDict(callback_action =
      self.status_update_callback)
    self.config = config
    self.controller = controller
    self.configTags = {}
    self._stop = threading.Event()
    self.tmpdir = config.get('agent', 'prefix')
    self.customServiceOrchestrator = CustomServiceOrchestrator(config, controller)
    self.parallel_execution = config.get_parallel_exec_option()
    if self.parallel_execution == 1:
      logger.info("Parallel execution is enabled, will start Agent commands in parallel")

  def stop(self):
    self._stop.set()

  def stopped(self):
    return self._stop.isSet()

  def put_status(self, commands):
    # We are supposed to receive the full set of statuses, so we don't need
    # to keep the old ones
    self.statusCommandQueue.queue.clear()

    for command in commands:
      logger.info("Adding " + command['commandType'] + " for service " + \
                  command['serviceName'] + " of cluster " + \
                  command['clusterName'] + " to the queue.")
      self.statusCommandQueue.put(command)
      logger.debug(pprint.pformat(command))

  def put(self, commands):
    for command in commands:
      if not command.has_key('serviceName'):
        command['serviceName'] = "null"
      if not command.has_key('clusterName'):
        command['clusterName'] = 'null'

      logger.info("Adding " + command['commandType'] + " for role " + \
                  command['role'] + " for service " + \
                  command['serviceName'] + " of cluster " + \
                  command['clusterName'] + " to the queue.")
      if command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND :
        self.backgroundCommandQueue.put(self.createCommandHandle(command))
      else:
        self.commandQueue.put(command)

  def cancel(self, commands):
    for command in commands:

      logger.info("Canceling command {tid}".format(tid = str(command['target_task_id'])))
      logger.debug(pprint.pformat(command))

      task_id = command['target_task_id']
      reason = command['reason']

      # Remove from the command queue by task_id
      queue = self.commandQueue
      self.commandQueue = Queue.Queue()

      while not queue.empty():
        queued_command = queue.get(False)
        if queued_command['taskId'] != task_id:
          self.commandQueue.put(queued_command)
        else:
          logger.info("Canceling " + queued_command['commandType'] + \
                      " for service " + queued_command['serviceName'] + \
                      " of cluster " + queued_command['clusterName'] + ".")

      # Kill if in progress
      self.customServiceOrchestrator.cancel_command(task_id, reason)

  def run(self):
    while not self.stopped():
      self.processBackgroundQueueSafeEmpty()
      self.processStatusCommandQueueSafeEmpty()
      try:
        if self.parallel_execution == 0:
          command = self.commandQueue.get(True, self.EXECUTION_COMMAND_WAIT_TIME)
          self.process_command(command)
        else:
          # If parallel execution is enabled, just kick off all available
          # commands using separate threads
          while (True):
            command = self.commandQueue.get(True, self.EXECUTION_COMMAND_WAIT_TIME)
            logger.info("Kicking off a thread for the command, id=" +
                        str(command['commandId']) + " taskId=" + str(command['taskId']))
            t = threading.Thread(target=self.process_command, args=(command,))
            t.daemon = True
            t.start()
      except (Queue.Empty):
        pass

  def processBackgroundQueueSafeEmpty(self):
    while not self.backgroundCommandQueue.empty():
      try:
        command = self.backgroundCommandQueue.get(False)
        if(command.has_key('__handle') and command['__handle'].status == None):
          self.process_command(command)
      except (Queue.Empty):
        pass

  def processStatusCommandQueueSafeEmpty(self):
    while not self.statusCommandQueue.empty():
      try:
        command = self.statusCommandQueue.get(False)
        self.process_command(command)
      except (Queue.Empty):
        pass


  def createCommandHandle(self, command):
    if(command.has_key('__handle')):
      raise AgentException("Command already has __handle")
    command['__handle'] = BackgroundCommandExecutionHandle(command, command['commandId'], None, self.on_background_command_complete_callback)
    return command

  def process_command(self, command):
    # make sure we log failures
    commandType = command['commandType']
    logger.debug("Took an element of Queue (command type = %s)." % commandType)
    try:
      if commandType in [self.EXECUTION_COMMAND, self.BACKGROUND_EXECUTION_COMMAND, self.AUTO_EXECUTION_COMMAND]:
        try:
          if self.controller.recovery_manager.enabled():
            self.controller.recovery_manager.start_execution_command()
          self.execute_command(command)
        finally:
          if self.controller.recovery_manager.enabled():
            self.controller.recovery_manager.stop_execution_command()
      elif commandType == self.STATUS_COMMAND:
        self.execute_status_command(command)
      else:
        logger.error("Unrecognized command " + pprint.pformat(command))
    except Exception, err:
      # Should not happen
      traceback.print_exc()
      logger.warn(err)
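
The cancel() method above reads two fields from each server-issued cancellation command. An illustrative payload (the field names are the ones the method accesses; the values are made up):

cancel_commands = [{
    'target_task_id': 19,
    'reason': 'Stage was aborted on the server'
}]
action_queue.cancel(cancel_commands)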
Example No. 4
class ActionQueue(threading.Thread):
    """ Action Queue for the agent. We pick one command at a time from the queue
  and execute it
  Note: Action and command terms in this and related classes are used interchangeably
  """

    # How many actions can be performed in parallel. Feel free to change
    MAX_CONCURRENT_ACTIONS = 5

    # How much time (in seconds) we wait for a new incoming execution command
    # before checking the status command queue
    EXECUTION_COMMAND_WAIT_TIME = 2

    STATUS_COMMAND = 'STATUS_COMMAND'
    EXECUTION_COMMAND = 'EXECUTION_COMMAND'
    AUTO_EXECUTION_COMMAND = 'AUTO_EXECUTION_COMMAND'
    BACKGROUND_EXECUTION_COMMAND = 'BACKGROUND_EXECUTION_COMMAND'
    ROLE_COMMAND_INSTALL = 'INSTALL'
    ROLE_COMMAND_START = 'START'
    ROLE_COMMAND_STOP = 'STOP'
    ROLE_COMMAND_CUSTOM_COMMAND = 'CUSTOM_COMMAND'
    CUSTOM_COMMAND_RESTART = 'RESTART'

    IN_PROGRESS_STATUS = 'IN_PROGRESS'
    COMPLETED_STATUS = 'COMPLETED'
    FAILED_STATUS = 'FAILED'

    def __init__(self, config, controller):
        super(ActionQueue, self).__init__()
        self.commandQueue = Queue.Queue()
        self.statusCommandQueue = Queue.Queue()
        self.backgroundCommandQueue = Queue.Queue()
        self.commandStatuses = CommandStatusDict(
            callback_action=self.status_update_callback)
        self.config = config
        self.controller = controller
        self.configTags = {}
        self._stop = threading.Event()
        self.tmpdir = config.get('agent', 'prefix')
        self.customServiceOrchestrator = CustomServiceOrchestrator(
            config, controller)
        self.parallel_execution = config.get_parallel_exec_option()
        if self.parallel_execution == 1:
            logger.info(
                "Parallel execution is enabled, will start Agent commands in parallel"
            )

    def stop(self):
        self._stop.set()

    def stopped(self):
        return self._stop.isSet()

    def put_status(self, commands):
        # We are supposed to receive the full set of statuses, so we don't
        # need to keep the old ones
        self.statusCommandQueue.queue.clear()

        for command in commands:
            logger.info("Adding " + command['commandType'] + " for service " + \
                        command['serviceName'] + " of cluster " + \
                        command['clusterName'] + " to the queue.")
            self.statusCommandQueue.put(command)

    def put(self, commands):
        for command in commands:
            if not command.has_key('serviceName'):
                command['serviceName'] = "null"
            if not command.has_key('clusterName'):
                command['clusterName'] = 'null'

            logger.info("Adding " + command['commandType'] + " for role " + \
                        command['role'] + " for service " + \
                        command['serviceName'] + " of cluster " + \
                        command['clusterName'] + " to the queue.")
            if command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND:
                self.backgroundCommandQueue.put(
                    self.createCommandHandle(command))
            else:
                self.commandQueue.put(command)

    def cancel(self, commands):
        for command in commands:

            logger.info("Canceling command {tid}".format(
                tid=str(command['target_task_id'])))
            logger.debug(pprint.pformat(command))

            task_id = command['target_task_id']
            reason = command['reason']

            # Remove from the command queue by task_id
            queue = self.commandQueue
            self.commandQueue = Queue.Queue()

            while not queue.empty():
                queued_command = queue.get(False)
                if queued_command['taskId'] != task_id:
                    self.commandQueue.put(queued_command)
                else:
                    logger.info("Canceling " + queued_command['commandType'] + \
                                " for service " + queued_command['serviceName'] + \
                                " of cluster " + queued_command['clusterName'] + ".")

            # Kill if in progress
            self.customServiceOrchestrator.cancel_command(task_id, reason)

    def run(self):
        while not self.stopped():
            self.processBackgroundQueueSafeEmpty()
            self.processStatusCommandQueueSafeEmpty()
            try:
                if self.parallel_execution == 0:
                    command = self.commandQueue.get(
                        True, self.EXECUTION_COMMAND_WAIT_TIME)
                    self.process_command(command)
                else:
                    # If parallel execution is enabled, just kick off all available
                    # commands using separate threads
                    while (True):
                        command = self.commandQueue.get(
                            True, self.EXECUTION_COMMAND_WAIT_TIME)
                        logger.info(
                            "Kicking off a thread for the command, id=" +
                            str(command['commandId']) + " taskId=" +
                            str(command['taskId']))
                        t = threading.Thread(target=self.process_command,
                                             args=(command, ))
                        t.daemon = True
                        t.start()
            except (Queue.Empty):
                pass

    def processBackgroundQueueSafeEmpty(self):
        while not self.backgroundCommandQueue.empty():
            try:
                command = self.backgroundCommandQueue.get(False)
                if (command.has_key('__handle')
                        and command['__handle'].status == None):
                    self.process_command(command)
            except (Queue.Empty):
                pass

    def processStatusCommandQueueSafeEmpty(self):
        while not self.statusCommandQueue.empty():
            try:
                command = self.statusCommandQueue.get(False)
                self.process_command(command)
            except (Queue.Empty):
                pass

    def createCommandHandle(self, command):
        if (command.has_key('__handle')):
            raise AgentException("Command already has __handle")
        command['__handle'] = BackgroundCommandExecutionHandle(
            command, command['commandId'], None,
            self.on_background_command_complete_callback)
        return command

    def process_command(self, command):
        # make sure we log failures
        commandType = command['commandType']
        logger.debug("Took an element of Queue (command type = %s)." %
                     commandType)
        try:
            if commandType in [
                    self.EXECUTION_COMMAND, self.BACKGROUND_EXECUTION_COMMAND,
                    self.AUTO_EXECUTION_COMMAND
            ]:
                self.execute_command(command)
            elif commandType == self.STATUS_COMMAND:
                self.execute_status_command(command)
            else:
                logger.error("Unrecognized command " + pprint.pformat(command))
        except Exception, err:
            # Should not happen
            traceback.print_exc()
            logger.warn(err)
Example No. 5
  def test_cancel_command(self, FileCache_mock,
                      run_file_mock, dump_command_to_json_mock,
                      get_hook_base_dir_mock, get_service_base_dir_mock,
                      get_host_scripts_base_dir_mock,
                      resolve_hook_script_path_mock, resolve_script_path_mock,
                      kill_process_with_children_mock):
    FileCache_mock.return_value = None
    command = {
      'role' : 'REGION_SERVER',
      'hostLevelParams' : {
        'stack_name' : 'HDP',
        'stack_version' : '2.0.7',
        'jdk_location' : 'some_location'
      },
      'commandParams': {
        'script_type': 'PYTHON',
        'script': 'scripts/hbase_regionserver.py',
        'command_timeout': '600',
        'service_package_folder' : 'HBASE'
      },
      'taskId' : '3',
      'roleCommand': 'INSTALL'
    }
    
    get_host_scripts_base_dir_mock.return_value = "/host_scripts"
    get_service_base_dir_mock.return_value = "/basedir/"
    resolve_script_path_mock.return_value = "/basedir/scriptpath"
    resolve_hook_script_path_mock.return_value = \
      ('/hooks_dir/prefix-command/scripts/hook.py',
       '/hooks_dir/prefix-command')
    dummy_controller = MagicMock()
    orchestrator = CustomServiceOrchestrator(self.config, dummy_controller)
    unix_process_id = 111
    orchestrator.commands_in_progress = {command['taskId']: unix_process_id}
    get_hook_base_dir_mock.return_value = "/hooks/"
    run_file_mock_return_value = {
      'stdout' : 'killed',
      'stderr' : 'killed',
      'exitcode': 1,
      }
    def side_effect(*args, **kwargs):
      time.sleep(0.2)
      return run_file_mock_return_value
    run_file_mock.side_effect = side_effect

    _, out = tempfile.mkstemp()
    _, err = tempfile.mkstemp()
    pool = ThreadPool(processes=1)
    async_result = pool.apply_async(orchestrator.runCommand, (command, out, err))

    time.sleep(0.1)
    orchestrator.cancel_command(command['taskId'], 'reason')

    ret = async_result.get()

    self.assertEqual(ret['exitcode'], 1)
    self.assertEquals(ret['stdout'], 'killed\nCommand aborted. reason')
    self.assertEquals(ret['stderr'], 'killed\nCommand aborted. reason')

    self.assertTrue(kill_process_with_children_mock.called)
    self.assertFalse(command['taskId'] in orchestrator.commands_in_progress.keys())
    self.assertTrue(os.path.exists(out))
    self.assertTrue(os.path.exists(err))
    try:
      os.remove(out)
      os.remove(err)
    except:
      pass
Example No. 6
    def test_cancel_backgound_command(self, read_stack_version_mock,
                                      resolve_hook_script_path_mock,
                                      resolve_script_path_mock, FileCache_mock,
                                      kill_process_with_children_mock,
                                      get_py_executor_mock):
        FileCache_mock.return_value = None
        FileCache_mock.cache_dir = MagicMock()
        resolve_hook_script_path_mock.return_value = None
        #     shell.kill_process_with_children = MagicMock()
        dummy_controller = MagicMock()
        cfg = AmbariConfig()
        cfg.set('agent', 'tolerate_download_failures', 'true')
        cfg.set('agent', 'prefix', '.')
        cfg.set('agent', 'cache_dir', 'background_tasks')

        actionQueue = ActionQueue(cfg, dummy_controller)

        dummy_controller.actionQueue = actionQueue
        orchestrator = CustomServiceOrchestrator(cfg, dummy_controller)
        orchestrator.file_cache = MagicMock()

        def f(a, b):
            return ""

        orchestrator.file_cache.get_service_base_dir = f
        actionQueue.customServiceOrchestrator = orchestrator

        import TestActionQueue
        import copy

        pyex = PythonExecutor(actionQueue.customServiceOrchestrator.tmp_dir,
                              actionQueue.customServiceOrchestrator.config)
        TestActionQueue.patch_output_file(pyex)
        pyex.prepare_process_result = MagicMock()
        get_py_executor_mock.return_value = pyex
        orchestrator.dump_command_to_json = MagicMock()

        lock = threading.RLock()
        complete_done = threading.Condition(lock)

        complete_was_called = {}

        def command_complete_w(process_condensed_result, handle):
            with lock:
                complete_was_called['visited'] = ''
                complete_done.wait(3)

        actionQueue.on_background_command_complete_callback = TestActionQueue.wraped(
            actionQueue.on_background_command_complete_callback,
            command_complete_w, None)
        execute_command = copy.deepcopy(
            TestActionQueue.TestActionQueue.background_command)
        actionQueue.put([execute_command])
        actionQueue.processBackgroundQueueSafeEmpty()

        time.sleep(.1)

        orchestrator.cancel_command(19, '')
        self.assertTrue(kill_process_with_children_mock.called)
        kill_process_with_children_mock.assert_called_with(33)

        with lock:
            complete_done.notifyAll()

        with lock:
            self.assertTrue(complete_was_called.has_key('visited'))

        time.sleep(.1)

        runningCommand = actionQueue.commandStatuses.get_command_status(19)
        self.assertTrue(runningCommand is not None)
        self.assertEqual(runningCommand['status'], ActionQueue.FAILED_STATUS)
Example No. 7
    def test_cancel_command(
            self, FileCache_mock, run_file_mock, dump_command_to_json_mock,
            get_hook_base_dir_mock, get_service_base_dir_mock,
            get_host_scripts_base_dir_mock, resolve_hook_script_path_mock,
            resolve_script_path_mock, kill_process_with_children_mock):
        FileCache_mock.return_value = None
        command = {
            'role': 'REGION_SERVER',
            'hostLevelParams': {
                'stack_name': 'HDP',
                'stack_version': '2.0.7',
                'jdk_location': 'some_location'
            },
            'commandParams': {
                'script_type': 'PYTHON',
                'script': 'scripts/hbase_regionserver.py',
                'command_timeout': '600',
                'service_package_folder': 'HBASE'
            },
            'taskId': '3',
            'roleCommand': 'INSTALL'
        }

        get_host_scripts_base_dir_mock.return_value = "/host_scripts"
        get_service_base_dir_mock.return_value = "/basedir/"
        resolve_script_path_mock.return_value = "/basedir/scriptpath"
        resolve_hook_script_path_mock.return_value = \
          ('/hooks_dir/prefix-command/scripts/hook.py',
           '/hooks_dir/prefix-command')
        dummy_controller = MagicMock()
        orchestrator = CustomServiceOrchestrator(self.config, dummy_controller)
        unix_process_id = 111
        orchestrator.commands_in_progress = {
            command['taskId']: unix_process_id
        }
        get_hook_base_dir_mock.return_value = "/hooks/"
        run_file_mock_return_value = {
            'stdout': 'killed',
            'stderr': 'killed',
            'exitcode': 1,
        }

        def side_effect(*args, **kwargs):
            time.sleep(0.2)
            return run_file_mock_return_value

        run_file_mock.side_effect = side_effect

        _, out = tempfile.mkstemp()
        _, err = tempfile.mkstemp()
        pool = ThreadPool(processes=1)
        async_result = pool.apply_async(orchestrator.runCommand,
                                        (command, out, err))

        time.sleep(0.1)
        orchestrator.cancel_command(command['taskId'], 'reason')

        ret = async_result.get()

        self.assertEqual(ret['exitcode'], 1)
        self.assertEquals(ret['stdout'], 'killed\nCommand aborted. reason')
        self.assertEquals(ret['stderr'], 'killed\nCommand aborted. reason')

        self.assertTrue(kill_process_with_children_mock.called)
        self.assertFalse(
            command['taskId'] in orchestrator.commands_in_progress.keys())
        self.assertTrue(os.path.exists(out))
        self.assertTrue(os.path.exists(err))
        try:
            os.remove(out)
            os.remove(err)
        except:
            pass
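
Both cancel tests above assume that the orchestrator tracks running commands in commands_in_progress (a taskId -> unix process id map) and kills the process tree on cancel. A hedged sketch of that path (assumed; the real CustomServiceOrchestrator.cancel_command does more bookkeeping, such as recording the cancel reason that gets appended to stdout/stderr):

def cancel_command(self, task_id, reason):
    # Look up the pid registered for the task and kill the whole process
    # tree, as asserted via kill_process_with_children_mock above. The pop
    # matches the assertion that the taskId leaves commands_in_progress.
    if task_id in self.commands_in_progress:
        pid = self.commands_in_progress.pop(task_id)
        shell.kill_process_with_children(pid)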
Example No. 8
class ActionQueue(threading.Thread):
  """ Action Queue for the agent. We pick one command at a time from the queue
  and execute it
  Note: Action and command terms in this and related classes are used interchangeably
  """

  # How many actions can be performed in parallel. Feel free to change
  MAX_CONCURRENT_ACTIONS = 5


  # How much time (in seconds) we wait for a new incoming execution command
  # before checking the status command queue
  EXECUTION_COMMAND_WAIT_TIME = 2

  STATUS_COMMAND = 'STATUS_COMMAND'
  EXECUTION_COMMAND = 'EXECUTION_COMMAND'
  AUTO_EXECUTION_COMMAND = 'AUTO_EXECUTION_COMMAND'
  BACKGROUND_EXECUTION_COMMAND = 'BACKGROUND_EXECUTION_COMMAND'
  ROLE_COMMAND_INSTALL = 'INSTALL'
  ROLE_COMMAND_START = 'START'
  ROLE_COMMAND_STOP = 'STOP'
  ROLE_COMMAND_CUSTOM_COMMAND = 'CUSTOM_COMMAND'
  CUSTOM_COMMAND_RESTART = 'RESTART'

  IN_PROGRESS_STATUS = 'IN_PROGRESS'
  COMPLETED_STATUS = 'COMPLETED'
  FAILED_STATUS = 'FAILED'

  def __init__(self, config, controller):
    super(ActionQueue, self).__init__()
    self.commandQueue = Queue.Queue()
    self.statusCommandQueue = Queue.Queue()
    self.backgroundCommandQueue = Queue.Queue()
    self.commandStatuses = CommandStatusDict(callback_action =
      self.status_update_callback)
    self.config = config
    self.controller = controller
    self.configTags = {}
    self._stop = threading.Event()
    self.tmpdir = config.get('agent', 'prefix')
    self.customServiceOrchestrator = CustomServiceOrchestrator(config, controller)
    self.parallel_execution = config.get_parallel_exec_option()
    if self.parallel_execution == 1:
      logger.info("Parallel execution is enabled, will execute agent commands in parallel")

  def stop(self):
    self._stop.set()

  def stopped(self):
    return self._stop.isSet()

  def put_status(self, commands):
    # It is assumed we received the full set of statuses, so we don't need to
    # keep the old ones.
    self.statusCommandQueue.queue.clear()

    for command in commands:
      logger.info("Adding " + command['commandType'] + " for component " + \
                  command['componentName'] + " of service " + \
                  command['serviceName'] + " of cluster " + \
                  command['clusterName'] + " to the queue.")
      self.statusCommandQueue.put(command)
      logger.debug(pprint.pformat(command))

  def put(self, commands):
    for command in commands:
      if 'serviceName' not in command:
        command['serviceName'] = "null"
      if 'clusterName' not in command:
        command['clusterName'] = 'null'

      logger.info("Adding " + command['commandType'] + " for role " + \
                  command['role'] + " for service " + \
                  command['serviceName'] + " of cluster " + \
                  command['clusterName'] + " to the queue.")
      if command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND:
        self.backgroundCommandQueue.put(self.createCommandHandle(command))
      else:
        self.commandQueue.put(command)
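
  # Routing note: BACKGROUND_EXECUTION_COMMANDs get a
  # BackgroundCommandExecutionHandle attached (see createCommandHandle below)
  # before landing on backgroundCommandQueue; all other commands go straight
  # to commandQueue.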

  def cancel(self, commands):
    for command in commands:

      logger.info("Canceling command {tid}".format(tid = str(command['target_task_id'])))
      logger.debug(pprint.pformat(command))

      task_id = command['target_task_id']
      reason = command['reason']

      # Remove the command from the queue by task_id. Queue.Queue cannot
      # delete an arbitrary element, so drain the old queue into a fresh one,
      # keeping every command except the canceled task.
      queue = self.commandQueue
      self.commandQueue = Queue.Queue()

      while not queue.empty():
        queued_command = queue.get(False)
        if queued_command['taskId'] != task_id:
          self.commandQueue.put(queued_command)
        else:
          logger.info("Canceling " + queued_command['commandType'] + \
                      " for service " + queued_command['serviceName'] + \
                      " and role " +  queued_command['role'] + \
                      " with taskId " + str(queued_command['taskId']))

      # Kill if in progress
      self.customServiceOrchestrator.cancel_command(task_id, reason)

  def run(self):
    try:
      while not self.stopped():
        self.processBackgroundQueueSafeEmpty()
        self.processStatusCommandQueueSafeEmpty()
        try:
          if self.parallel_execution == 0:
            command = self.commandQueue.get(True, self.EXECUTION_COMMAND_WAIT_TIME)
            self.process_command(command)
          else:
            # If parallel execution is enabled, just kick off all available
            # commands using separate threads
            while True:
              command = self.commandQueue.get(True, self.EXECUTION_COMMAND_WAIT_TIME)
              # If the command is not retry-enabled, do not start it in parallel.
              # Checking just one command is enough, as all commands for a stage
              # are sent at the same time and retry is only enabled for the
              # initial start/install.
              retryAble = False
              if 'commandParams' in command and 'command_retry_enabled' in command['commandParams']:
                retryAble = command['commandParams']['command_retry_enabled'] == "true"
              if retryAble:
                logger.info("Kicking off a thread for the command, id=" +
                            str(command['commandId']) + " taskId=" + str(command['taskId']))
                t = threading.Thread(target=self.process_command, args=(command,))
                t.daemon = True
                t.start()
              else:
                self.process_command(command)
                break
        except Queue.Empty:
          pass
    except:
      logger.exception("ActionQueue thread failed with exception:")
      raise
    
    logger.info("ActionQueue thread has successfully finished")

  def processBackgroundQueueSafeEmpty(self):
    while not self.backgroundCommandQueue.empty():
      try:
        command = self.backgroundCommandQueue.get(False)
        if '__handle' in command and command['__handle'].status is None:
          self.process_command(command)
      except Queue.Empty:
        pass

  def processStatusCommandQueueSafeEmpty(self):
    while not self.statusCommandQueue.empty():
      try:
        command = self.statusCommandQueue.get(False)
        self.process_command(command)
      except Queue.Empty:
        pass


  def createCommandHandle(self, command):
    if '__handle' in command:
      raise AgentException("Command already has __handle")
    command['__handle'] = BackgroundCommandExecutionHandle(command, command['commandId'], None, self.on_background_command_complete_callback)
    return command
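
  # The handle created above carries the completion callback: when the
  # background process finishes, on_background_command_complete_callback
  # (further below) turns the condensed process result into a command status
  # report via commandStatuses.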

  def process_command(self, command):
    # make sure we log failures
    commandType = command['commandType']
    logger.debug("Took an element of Queue (command type = %s)." % commandType)
    try:
      if commandType in [self.EXECUTION_COMMAND, self.BACKGROUND_EXECUTION_COMMAND, self.AUTO_EXECUTION_COMMAND]:
        try:
          if self.controller.recovery_manager.enabled():
            self.controller.recovery_manager.start_execution_command()
          self.execute_command(command)
        finally:
          if self.controller.recovery_manager.enabled():
            self.controller.recovery_manager.stop_execution_command()
      elif commandType == self.STATUS_COMMAND:
        self.execute_status_command(command)
      else:
        logger.error("Unrecognized command " + pprint.pformat(command))
    except Exception:
      logger.exception("Exception while processing {0} command".format(commandType))

  def tasks_in_progress_or_pending(self):
    return_val = False
    if not self.commandQueue.empty():
      return_val = True
    if self.controller.recovery_manager.has_active_command():
      return_val = True
    return return_val

  def execute_command(self, command):
    '''
    Executes commands of type EXECUTION_COMMAND
    '''
    clusterName = command['clusterName']
    commandId = command['commandId']
    isCommandBackground = command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND
    isAutoExecuteCommand = command['commandType'] == self.AUTO_EXECUTION_COMMAND
    message = "Executing command with id = {commandId}, taskId = {taskId} for role = {role} of " \
              "cluster {cluster}.".format(
              commandId = str(commandId), taskId = str(command['taskId']),
              role=command['role'], cluster=clusterName)
    logger.info(message)

    taskId = command['taskId']
    # Preparing 'IN_PROGRESS' report
    in_progress_status = self.commandStatuses.generate_report_template(command)
    # The path of the files that contain the output log and error log use a prefix that the agent advertises to the
    # server. The prefix is defined in agent-config.ini
    if not isAutoExecuteCommand:
      in_progress_status.update({
        'tmpout': self.tmpdir + os.sep + 'output-' + str(taskId) + '.txt',
        'tmperr': self.tmpdir + os.sep + 'errors-' + str(taskId) + '.txt',
        'structuredOut' : self.tmpdir + os.sep + 'structured-out-' + str(taskId) + '.json',
        'status': self.IN_PROGRESS_STATUS
      })
    else:
      in_progress_status.update({
        'tmpout': self.tmpdir + os.sep + 'auto_output-' + str(taskId) + '.txt',
        'tmperr': self.tmpdir + os.sep + 'auto_errors-' + str(taskId) + '.txt',
        'structuredOut' : self.tmpdir + os.sep + 'auto_structured-out-' + str(taskId) + '.json',
        'status': self.IN_PROGRESS_STATUS
      })

    self.commandStatuses.put_command_status(command, in_progress_status)

    numAttempts = 0
    retryDuration = 0  # even with 0 allow one attempt
    retryAble = False
    delay = 1
    log_command_output = True
    if 'commandParams' in command and 'log_output' in command['commandParams'] and "false" == command['commandParams']['log_output']:
      log_command_output = False

    if 'commandParams' in command:
      if 'max_duration_for_retries' in command['commandParams']:
        retryDuration = int(command['commandParams']['max_duration_for_retries'])
      if 'command_retry_enabled' in command['commandParams']:
        retryAble = command['commandParams']['command_retry_enabled'] == "true"
    if isAutoExecuteCommand:
      retryAble = False

    logger.info("Command execution metadata - taskId = {taskId}, retry enabled = {retryAble}, max retry duration (sec) = {retryDuration}, log_output = {log_command_output}".
                 format(taskId=taskId, retryAble=retryAble, retryDuration=retryDuration, log_command_output=log_command_output))
    while retryDuration >= 0:
      numAttempts += 1
      start = 0
      if retryAble:
        start = int(time.time())
      # running command
      commandresult = self.customServiceOrchestrator.runCommand(command,
                                                                in_progress_status['tmpout'],
                                                                in_progress_status['tmperr'],
                                                                override_output_files=numAttempts == 1,
                                                                retry=numAttempts > 1)
      end = 1
      if retryAble:
        end = int(time.time())
      retryDuration -= (end - start)

      # dumping results
      if isCommandBackground:
        logger.info("Command is background command, quit retrying. Exit code: {exitCode}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId, exitCode=commandresult['exitcode'], retryAble=retryAble, retryDuration=retryDuration, delay=delay))
        return
      else:
        if commandresult['exitcode'] == 0:
          status = self.COMPLETED_STATUS
        else:
          status = self.FAILED_STATUS
          if (commandresult['exitcode'] == -signal.SIGTERM) or (commandresult['exitcode'] == -signal.SIGKILL):
            logger.info('Command {cid} was canceled!'.format(cid=taskId))
            break

      if status != self.COMPLETED_STATUS and retryAble and retryDuration > 0:
        delay = self.get_retry_delay(delay)
        if delay > retryDuration:
          delay = retryDuration
        retryDuration -= delay  # allow one last attempt
        commandresult['stderr'] += "\n\nCommand failed. Retrying command execution ...\n\n"
        logger.info("Retrying command id {cid} after a wait of {delay}".format(cid=taskId, delay=delay))
        time.sleep(delay)
        continue
      else:
        logger.info("Quit retrying for command id {cid}. Status: {status}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId, status=status, retryAble=retryAble, retryDuration=retryDuration, delay=delay))
        break

    # final result to stdout
    commandresult['stdout'] += '\n\nCommand completed successfully!\n' if status == self.COMPLETED_STATUS else '\n\nCommand failed after ' + str(numAttempts) + ' tries\n'
    logger.info('Command {cid} completed successfully!'.format(cid=taskId) if status == self.COMPLETED_STATUS else 'Command {cid} failed after {attempts} tries'.format(cid=taskId, attempts=numAttempts))

    roleResult = self.commandStatuses.generate_report_template(command)
    roleResult.update({
      'stdout': commandresult['stdout'],
      'stderr': commandresult['stderr'],
      'exitCode': commandresult['exitcode'],
      'status': status,
    })

    if self.config.has_option("logging","log_command_executes") \
        and int(self.config.get("logging", "log_command_executes")) == 1 \
        and log_command_output:

      if roleResult['stdout'] != '':
          logger.info("Begin command output log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])
          self.log_command_output(roleResult['stdout'], str(command['taskId']))
          logger.info("End command output log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])

      if roleResult['stderr'] != '':
          logger.info("Begin command stderr log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])
          self.log_command_output(roleResult['stderr'], str(command['taskId']))
          logger.info("End command stderr log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])

    if roleResult['stdout'] == '':
      roleResult['stdout'] = 'None'
    if roleResult['stderr'] == '':
      roleResult['stderr'] = 'None'

    # let ambari know name of custom command
    if 'custom_command' in command['hostLevelParams']:
      roleResult['customCommand'] = command['hostLevelParams']['custom_command']

    if 'structuredOut' in commandresult:
      roleResult['structuredOut'] = str(json.dumps(commandresult['structuredOut']))
    else:
      roleResult['structuredOut'] = ''

    # let recovery manager know the current state
    if status == self.COMPLETED_STATUS:
      if self.controller.recovery_manager.enabled() and 'roleCommand' in command \
          and self.controller.recovery_manager.configured_for_recovery(command['role']):
        if command['roleCommand'] == self.ROLE_COMMAND_START:
          self.controller.recovery_manager.update_current_status(command['role'], LiveStatus.LIVE_STATUS)
          self.controller.recovery_manager.update_config_staleness(command['role'], False)
          logger.info("After EXECUTION_COMMAND (START), with taskId=" + str(command['taskId']) +
                      ", current state of " + command['role'] + " to " +
                       self.controller.recovery_manager.get_current_status(command['role']) )
        elif command['roleCommand'] == self.ROLE_COMMAND_STOP or command['roleCommand'] == self.ROLE_COMMAND_INSTALL:
          self.controller.recovery_manager.update_current_status(command['role'], LiveStatus.DEAD_STATUS)
          logger.info("After EXECUTION_COMMAND (STOP/INSTALL), with taskId=" + str(command['taskId']) +
                      ", current state of " + command['role'] + " to " +
                       self.controller.recovery_manager.get_current_status(command['role']) )
        elif command['roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND:
          if 'custom_command' in command['hostLevelParams'] and \
                  command['hostLevelParams']['custom_command'] == self.CUSTOM_COMMAND_RESTART:
            self.controller.recovery_manager.update_current_status(command['role'], LiveStatus.LIVE_STATUS)
            self.controller.recovery_manager.update_config_staleness(command['role'], False)
            logger.info("After EXECUTION_COMMAND (RESTART), current state of " + command['role'] + " to " +
                         self.controller.recovery_manager.get_current_status(command['role']) )

      # let ambari know that configuration tags were applied
      configHandler = ActualConfigHandler(self.config, self.configTags)

      if 'configurationTags' in command:
        configHandler.write_actual(command['configurationTags'])
        roleResult['configurationTags'] = command['configurationTags']
      component = {'serviceName':command['serviceName'],'componentName':command['role']}
      if 'roleCommand' in command and \
          (command['roleCommand'] == self.ROLE_COMMAND_START or
             (command['roleCommand'] == self.ROLE_COMMAND_INSTALL and component in LiveStatus.CLIENT_COMPONENTS) or
               (command['roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND and
                  'custom_command' in command['hostLevelParams'] and
                      command['hostLevelParams']['custom_command'] == self.CUSTOM_COMMAND_RESTART)):
        configHandler.write_actual_component(command['role'],
                                             command['configurationTags'])
        if 'clientsToUpdateConfigs' in command['hostLevelParams'] and command['hostLevelParams']['clientsToUpdateConfigs']:
          configHandler.write_client_components(command['serviceName'],
                                                command['configurationTags'],
                                                command['hostLevelParams']['clientsToUpdateConfigs'])
        roleResult['configurationTags'] = configHandler.read_actual_component(
            command['role'])
    elif status == self.FAILED_STATUS:
      if self.controller.recovery_manager.enabled() and 'roleCommand' in command \
              and self.controller.recovery_manager.configured_for_recovery(command['role']):
        if command['roleCommand'] == self.ROLE_COMMAND_INSTALL:
          self.controller.recovery_manager.update_current_status(command['role'], self.controller.recovery_manager.INSTALL_FAILED)
          logger.info("After EXECUTION_COMMAND (INSTALL), with taskId=" + str(command['taskId']) +
                      ", current state of " + command['role'] + " to " +
                      self.controller.recovery_manager.get_current_status(command['role']))

    self.commandStatuses.put_command_status(command, roleResult)

  def log_command_output(self, text, taskId):
    """
    Logs a message as multiple enumerated log messages, each no larger than
    MAX_SYMBOLS_PER_LOG_MESSAGE.

    This is very useful when logs are redirected to syslog (syslog_enabled=1),
    since syslog usually truncates long messages.
    """
    chunks = split_on_chunks(text, MAX_SYMBOLS_PER_LOG_MESSAGE)
    if len(chunks) > 1:
      for i in range(len(chunks)):
        logger.info("Cmd log for taskId={0} and chunk {1}/{2} of log for command: \n".format(taskId, i+1, len(chunks)) + chunks[i])
    else:
      logger.info("Cmd log for taskId={0}: ".format(taskId) + text)

  def get_retry_delay(self, last_delay):
    """
    Returns an exponentially growing delay. The idea is that if the number of
    retries is high, the underlying cause is probably a host- or
    environment-specific issue that requires a longer wait.
    """
    return last_delay * 2

  def command_was_canceled(self):
    # No-op: cancellation state is tracked by customServiceOrchestrator
    # (see command_canceled_reason in on_background_command_complete_callback).
    pass

  def on_background_command_complete_callback(self, process_condensed_result, handle):
    logger.debug('Start callback: %s' % process_condensed_result)
    logger.debug('The handle is: %s' % handle)
    status = self.COMPLETED_STATUS if handle.exitCode == 0 else self.FAILED_STATUS

    aborted_postfix = self.customServiceOrchestrator.command_canceled_reason(handle.command['taskId'])
    if aborted_postfix:
      status = self.FAILED_STATUS
      logger.debug('Set status to: %s , reason = %s' % (status, aborted_postfix))
    else:
      aborted_postfix = ''


    roleResult = self.commandStatuses.generate_report_template(handle.command)

    roleResult.update({
      'stdout': process_condensed_result['stdout'] + aborted_postfix,
      'stderr': process_condensed_result['stderr'] + aborted_postfix,
      'exitCode': process_condensed_result['exitcode'],
      'structuredOut': str(json.dumps(process_condensed_result['structuredOut'])) if 'structuredOut' in process_condensed_result else '',
      'status': status,
    })

    self.commandStatuses.put_command_status(handle.command, roleResult)

  def execute_status_command(self, command):
    '''
    Executes commands of type STATUS_COMMAND
    '''
    try:
      cluster = command['clusterName']
      service = command['serviceName']
      component = command['componentName']
      configurations = command['configurations']
      globalConfig = configurations.get('global', {})

      livestatus = LiveStatus(cluster, service, component,
                              globalConfig, self.config, self.configTags)

      component_extra = None

      # For custom services, responsibility to determine service status is
      # delegated to python scripts
      component_status_result = self.customServiceOrchestrator.requestComponentStatus(command)
      component_security_status_result = self.customServiceOrchestrator.requestComponentSecurityState(command)

      if component_status_result['exitcode'] == 0:
        component_status = LiveStatus.LIVE_STATUS
        if self.controller.recovery_manager.enabled() \
          and self.controller.recovery_manager.configured_for_recovery(component):
          self.controller.recovery_manager.update_current_status(component, component_status)
      else:
        component_status = LiveStatus.DEAD_STATUS
        if self.controller.recovery_manager.enabled() \
          and self.controller.recovery_manager.configured_for_recovery(component):
          if (self.controller.recovery_manager.get_current_status(component) != self.controller.recovery_manager.INSTALL_FAILED):
            self.controller.recovery_manager.update_current_status(component, component_status)

      request_execution_cmd = self.controller.recovery_manager.requires_recovery(component) and \
                                not self.controller.recovery_manager.command_exists(component, ActionQueue.EXECUTION_COMMAND)

      if 'structuredOut' in component_status_result:
        component_extra = component_status_result['structuredOut']

      result = livestatus.build(component_status=component_status)
      if self.controller.recovery_manager.enabled():
        result['sendExecCmdDet'] = str(request_execution_cmd)

      # Add security state to the result
      result['securityState'] = component_security_status_result

      if component_extra is not None and len(component_extra) != 0:
        if 'alerts' in component_extra:
          result['alerts'] = component_extra['alerts']
          del component_extra['alerts']

        result['extra'] = component_extra

      logger.debug("Got live status for component " + component + \
                   " of service " + str(service) + \
                   " of cluster " + str(cluster))

      logger.debug(pprint.pformat(result))
      if result is not None:
        self.commandStatuses.put_command_status(command, result)
    except Exception, err:
      traceback.print_exc()
      logger.warn(err)
    pass