Example #1
 def __init__(self, config, controller):
     super(ActionQueue, self).__init__()
     self.commandQueue = Queue.Queue()
     # The queue this field points to is a multiprocessing.Queue(); it is re-created
     # whenever a new StatusCommandExecutor child process is spawned by Controller.
     self.statusCommandQueue = None
     self.statusCommandResultQueue = multiprocessing.Queue()  # this queue is filled by StatusCommandsExecutor.
     self.backgroundCommandQueue = Queue.Queue()
     self.commandStatuses = CommandStatusDict(
         callback_action=self.status_update_callback)
     self.config = config
     self.controller = controller
     self.configTags = {}
     self._stop = threading.Event()
     self.tmpdir = config.get('agent', 'prefix')
     self.customServiceOrchestrator = CustomServiceOrchestrator(
         config, controller)
     self.parallel_execution = config.get_parallel_exec_option()
     if self.parallel_execution == 1:
         logger.info(
             "Parallel execution is enabled, will execute agent commands in parallel"
         )
     self.lock = threading.Lock()
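
The comments in this example describe a split between the agent process and a StatusCommandsExecutor child process: the child fills statusCommandResultQueue and the parent drains it. Below is a minimal, standalone sketch of that parent/child multiprocessing.Queue pattern; the fake_status_executor worker and its payload are illustrative stand-ins, not the real StatusCommandsExecutor.

import multiprocessing

def fake_status_executor(result_queue):
    # Stand-in for StatusCommandsExecutor: push one made-up status result
    result_queue.put({'componentName': 'DATANODE', 'exitcode': 0})

if __name__ == '__main__':
    statusCommandResultQueue = multiprocessing.Queue()
    child = multiprocessing.Process(target=fake_status_executor,
                                    args=(statusCommandResultQueue,))
    child.start()
    child.join()
    # The parent (playing the ActionQueue role) drains the result queue
    print(statusCommandResultQueue.get())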
Example #2
 def __init__(self, config, controller):
   super(ActionQueue, self).__init__()
   self.commandQueue = Queue.Queue()
   self.statusCommandQueue = Queue.Queue()
   self.backgroundCommandQueue = Queue.Queue()
   self.commandStatuses = CommandStatusDict(callback_action =
     self.status_update_callback)
   self.config = config
   self.controller = controller
   self.configTags = {}
   self._stop = threading.Event()
   self.tmpdir = config.get('agent', 'prefix')
   self.customServiceOrchestrator = CustomServiceOrchestrator(config, controller)
   self.parallel_execution = config.get_parallel_exec_option()
   if self.parallel_execution == 1:
     logger.info("Parallel execution is enabled, will execute agent commands in parallel")
  def execute_status_command(self, command):
    '''
    Executes commands of type STATUS_COMMAND
    '''
    try:
      cluster = command['clusterName']
      service = command['serviceName']
      component = command['componentName']
      reportResult = CommandStatusDict.shouldReportResult(command)
      component_status = self.customServiceOrchestrator.requestComponentStatus(command)

      result = {"componentName": component,
                "msg": "",
                "clusterName": cluster,
                "serviceName": service,
                "reportResult": reportResult,
                "roleCommand": command['roleCommand']
      }

      if 'configurations' in component_status:
        result['configurations'] = component_status['configurations']
      if Constants.EXIT_CODE in component_status:
        result['status'] = component_status[Constants.EXIT_CODE]
        logger.debug("Got live status for component " + component + \
                     " of service " + str(service) + \
                     " of cluster " + str(cluster))
        logger.debug(pprint.pformat(result))

      if result is not None:
        self.commandStatuses.put_command_status(command, result, reportResult)
    except Exception, err:
      traceback.print_exc()
      logger.warn(err)
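
For orientation, here is a sketch of the command dict that execute_status_command above reads; only the key names are taken from the code, the values are made up.

# Hypothetical STATUS_COMMAND payload; the values are illustrative only.
status_command = {
    'commandType': 'STATUS_COMMAND',
    'clusterName': 'c1',
    'serviceName': 'HDFS',
    'componentName': 'DATANODE',
    'roleCommand': 'STATUS',
}
# execute_status_command(status_command) asks the CustomServiceOrchestrator for the
# component status and, if Constants.EXIT_CODE is present in the answer, publishes it
# through commandStatuses.put_command_status(command, result, reportResult).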
Example #4
 def __init__(self, config, controller):
     super(ActionQueue, self).__init__()
     self.commandQueue = Queue.Queue()
     self.commandStatuses = CommandStatusDict(
         callback_action=self.status_update_callback)
     self.config = config
     self.controller = controller
     self.sh = shellRunner()
     self._stop = threading.Event()
     self.tmpdir = config.get('agent', 'prefix')
Example #5
 def __init__(self, config, controller):
     super(ActionQueue, self).__init__()
     self.commandQueue = Queue.Queue()
     self.commandStatuses = CommandStatusDict(
         callback_action=self.status_update_callback)
     self.config = config
     self.controller = controller
     self._stop = threading.Event()
     self.tmpdir = config.getResolvedPath(AgentConfig.APP_TASK_DIR)
     self.customServiceOrchestrator = CustomServiceOrchestrator(
         config, controller)
    def test_generate_progress_report(self, open_mock):
        csd = CommandStatusDict(None)
        report = {}
        report["tmpout"] = None
        report["tmperr"] = None
        report["structuredOut"] = None

        # Make file read calls visible
        def open_side_effect(file, mode):
            if mode == "r":
                file_mock = MagicMock()
                file_mock.read.return_value = "Read from " + str(file)
                return file_mock
            else:
                return self.original_open(file, mode)

        open_mock.side_effect = open_side_effect

        inprogress = csd.generate_in_progress_report(self.auto_hbase_install_command, report)
        expected = {
            "status": "IN_PROGRESS",
            "stderr": "Read from None",
            "stdout": "Read from None",
            "clusterName": u"cc",
            "structuredOut": "{}",
            "reportResult": False,
            "roleCommand": u"INSTALL",
            "serviceName": u"HDFS",
            "role": u"HBASE",
            "actionId": "1-1",
            "taskId": 7,
            "exitcode": 777,
        }
        self.assertEqual(inprogress, expected)

        self.auto_hbase_install_command["auto_generated"] = False
        inprogress = csd.generate_in_progress_report(self.auto_hbase_install_command, report)
        expected["reportResult"] = True
        self.assertEqual(inprogress, expected)
        pass
  def test_generate_progress_report(self, open_mock):
    csd = CommandStatusDict(None)
    report = {}
    report['tmpout'] = None
    report['tmperr'] = None
    report['structuredOut'] = None

    # Make file read calls visible
    def open_side_effect(file, mode):
      if mode == 'r':
        file_mock = MagicMock()
        file_mock.read.return_value = "Read from " + str(file)
        return file_mock
      else:
        return self.original_open(file, mode)

    open_mock.side_effect = open_side_effect

    inprogress = csd.generate_in_progress_report(self.auto_hbase_install_command, report)
    expected = {
      'status': 'IN_PROGRESS',
      'stderr': 'Read from None',
      'stdout': 'Read from None',
      'clusterName': u'cc',
      'structuredOut': '{}',
      'reportResult': False,
      'roleCommand': u'INSTALL',
      'serviceName': u'HDFS',
      'role': u'HBASE',
      'actionId': '1-1',
      'taskId': 7,
      'exitcode': 777}
    self.assertEqual(inprogress, expected)

    self.auto_hbase_install_command['auto_generated'] = False
    inprogress = csd.generate_in_progress_report(self.auto_hbase_install_command, report)
    expected['reportResult'] = True
    self.assertEqual(inprogress, expected)
    pass
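
Both versions of this test take an open_mock argument and keep self.original_open around, which implies the built-in open is patched around the test. Below is a minimal, standalone sketch of that patching pattern; the patch target "__builtin__.open" and the test class are assumptions, not taken from the original test module.

import unittest
from mock import MagicMock, patch

class TestOpenPatching(unittest.TestCase):
    # Hypothetical helper test illustrating the pattern; not part of the source suite.

    def setUp(self):
        self.original_open = open  # keep the real built-in for fall-through

    @patch("__builtin__.open")  # assumed patch target (Python 2 built-ins module)
    def test_read_is_faked(self, open_mock):
        def open_side_effect(file, mode):
            if mode == 'r':
                file_mock = MagicMock()
                file_mock.read.return_value = "Read from " + str(file)
                return file_mock
            else:
                return self.original_open(file, mode)
        open_mock.side_effect = open_side_effect

        self.assertEqual(open("/tmp/missing", "r").read(), "Read from /tmp/missing")

if __name__ == '__main__':
    unittest.main()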
Example #8
 def __init__(self, config, controller):
     super(ActionQueue, self).__init__()
     self.commandQueue = Queue.Queue()
     self.statusCommandQueue = Queue.Queue()
     self.backgroundCommandQueue = Queue.Queue()
     self.commandStatuses = CommandStatusDict(
         callback_action=self.status_update_callback)
     self.config = config
     self.controller = controller
     self.configTags = {}
     self._stop = threading.Event()
     self.tmpdir = config.get('agent', 'prefix')
     self.customServiceOrchestrator = CustomServiceOrchestrator(
         config, controller)
Example #9
    def execute_status_command(self, command):
        '''
    Executes commands of type STATUS_COMMAND
    '''
        try:
            cluster = command['clusterName']
            service = command['serviceName']
            component = command['componentName']
            reportResult = CommandStatusDict.shouldReportResult(command)
            component_status = None
            if self.docker_mode:
                component_status = self.dockerManager.query_status(command)
            elif self.yarn_docker_mode:
                component_status = self.yarnDockerManager.query_status(command)
            else:
                component_status = self.customServiceOrchestrator.requestComponentStatus(
                    command)

            result = {
                "componentName": component,
                "msg": "",
                "clusterName": cluster,
                "serviceName": service,
                "reportResult": reportResult,
                "roleCommand": command['roleCommand']
            }

            if 'configurations' in component_status:
                result['configurations'] = component_status['configurations']
            if Constants.EXIT_CODE in component_status:
                result['status'] = component_status[Constants.EXIT_CODE]
                logger.debug("Got live status for component " + component + \
                             " of service " + str(service) + \
                             " of cluster " + str(cluster))
                logger.debug(pprint.pformat(result))
            if 'ip' in component_status:
                result['ip'] = component_status['ip']
            if 'hostname' in component_status:
                result['hostname'] = component_status['hostname']

            if result is not None:
                logger.debug("result of execute_status_command: " +
                             str(result))
                self.commandStatuses.put_command_status(
                    command, result, reportResult)
        except Exception, err:
            traceback.print_exc()
            logger.warn(err)
Example #10
 def __init__(self, config, controller):
   super(ActionQueue, self).__init__()
   self.commandQueue = Queue.Queue()
   self.statusCommandQueue = Queue.Queue()
   self.backgroundCommandQueue = Queue.Queue()
   self.commandStatuses = CommandStatusDict(callback_action =
     self.status_update_callback)
   self.config = config
   self.controller = controller
   self.configTags = {}
   self._stop = threading.Event()
   self.tmpdir = config.get('agent', 'prefix')
   self.customServiceOrchestrator = CustomServiceOrchestrator(config, controller)
   self.parallel_execution = config.get_parallel_exec_option()
   if self.parallel_execution == 1:
     logger.info("Parallel execution is enabled, will execute agent commands in parallel")
Example #11
 def __init__(self, config, controller, agentToggleLogger):
     super(ActionQueue, self).__init__()
     self.queueOutAgentToggleLogger = agentToggleLogger
     self.queueInAgentToggleLogger = AgentToggleLogger("info")
     self.commandQueue = Queue.Queue()
     self.commandStatuses = CommandStatusDict(
         callback_action=self.status_update_callback)
     self.config = config
     self.controller = controller
     self._stop = threading.Event()
     self.tmpdir = config.getResolvedPath(AgentConfig.APP_TASK_DIR)
     self.componentPackage = ''
     self.customServiceOrchestrator = CustomServiceOrchestrator(
         config, controller, self.queueOutAgentToggleLogger)
     self.dockerManager = DockerManager(self.tmpdir,
                                        config.getWorkRootPath(),
                                        config.getLogPath(),
                                        self.customServiceOrchestrator)
     self.yarnDockerManager = YarnDockerManager(
         self.tmpdir, config.getWorkRootPath(), config.getLogPath(),
         self.customServiceOrchestrator)
Example #12
class ActionQueue(threading.Thread):
    """ Action Queue for the agent. We pick one command at a time from the queue
  and execute it
  Note: Action and command terms in this and related classes are used interchangeably
  """

    # How many actions can be performed in parallel. Feel free to change
    MAX_CONCURRENT_ACTIONS = 5

    # How much time (in seconds) we need to wait for a new incoming execution command
    # before checking the status command queue
    EXECUTION_COMMAND_WAIT_TIME = 2

    STATUS_COMMAND = 'STATUS_COMMAND'
    EXECUTION_COMMAND = 'EXECUTION_COMMAND'
    AUTO_EXECUTION_COMMAND = 'AUTO_EXECUTION_COMMAND'
    BACKGROUND_EXECUTION_COMMAND = 'BACKGROUND_EXECUTION_COMMAND'
    ROLE_COMMAND_INSTALL = 'INSTALL'
    ROLE_COMMAND_START = 'START'
    ROLE_COMMAND_STOP = 'STOP'
    ROLE_COMMAND_CUSTOM_COMMAND = 'CUSTOM_COMMAND'
    CUSTOM_COMMAND_RESTART = 'RESTART'
    CUSTOM_COMMAND_START = ROLE_COMMAND_START

    IN_PROGRESS_STATUS = 'IN_PROGRESS'
    COMPLETED_STATUS = 'COMPLETED'
    FAILED_STATUS = 'FAILED'

    def __init__(self, config, controller):
        super(ActionQueue, self).__init__()
        self.commandQueue = Queue.Queue()
        self.statusCommandResultQueue = multiprocessing.Queue()  # this queue is filled by StatusCommandsExecutor.
        self.backgroundCommandQueue = Queue.Queue()
        self.commandStatuses = CommandStatusDict(
            callback_action=self.status_update_callback)
        self.config = config
        self.controller = controller
        self.configTags = {}
        self._stop = threading.Event()
        self.tmpdir = config.get('agent', 'prefix')
        self.customServiceOrchestrator = CustomServiceOrchestrator(
            config, controller)
        self.parallel_execution = config.get_parallel_exec_option()
        if self.parallel_execution == 1:
            logger.info(
                "Parallel execution is enabled, will execute agent commands in parallel"
            )
        self.lock = threading.Lock()

    def stop(self):
        self._stop.set()

    def stopped(self):
        return self._stop.isSet()

    def put_status(self, commands):
        self.controller.statusCommandsExecutor.put_commands(commands)

    def put(self, commands):
        for command in commands:
            if not command.has_key('serviceName'):
                command['serviceName'] = "null"
            if not command.has_key('clusterName'):
                command['clusterName'] = 'null'

            logger.info("Adding " + command['commandType'] + " for role " + \
                        command['role'] + " for service " + \
                        command['serviceName'] + " of cluster " + \
                        command['clusterName'] + " to the queue.")
            if command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND:
                self.backgroundCommandQueue.put(
                    self.createCommandHandle(command))
            else:
                self.commandQueue.put(command)

    def cancel(self, commands):
        for command in commands:

            logger.info("Canceling command with taskId = {tid}".format(
                tid=str(command['target_task_id'])))
            logger.debug(pprint.pformat(command))

            task_id = command['target_task_id']
            reason = command['reason']

            # Remove from the command queue by task_id
            queue = self.commandQueue
            self.commandQueue = Queue.Queue()

            while not queue.empty():
                queued_command = queue.get(False)
                if queued_command['taskId'] != task_id:
                    self.commandQueue.put(queued_command)
                else:
                    logger.info("Canceling " + queued_command['commandType'] + \
                                " for service " + queued_command['serviceName'] + \
                                " and role " +  queued_command['role'] + \
                                " with taskId " + str(queued_command['taskId']))

            # Kill if in progress
            self.customServiceOrchestrator.cancel_command(task_id, reason)

    def run(self):
        try:
            while not self.stopped():
                self.processBackgroundQueueSafeEmpty()
                self.controller.get_status_commands_executor().process_results(
                )  # process status commands
                try:
                    if self.parallel_execution == 0:
                        command = self.commandQueue.get(
                            True, self.EXECUTION_COMMAND_WAIT_TIME)
                        self.process_command(command)
                    else:
                        # If parallel execution is enabled, just kick off all available
                        # commands using separate threads
                        while (True):
                            command = self.commandQueue.get(
                                True, self.EXECUTION_COMMAND_WAIT_TIME)
                            # If the command is not retry_enabled then do not start it in parallel.
                            # Checking just one command is enough, as all commands for a stage are sent
                            # at the same time and retry is only enabled for the initial start/install.
                            retryAble = False
                            if 'commandParams' in command and 'command_retry_enabled' in command[
                                    'commandParams']:
                                retryAble = command['commandParams'][
                                    'command_retry_enabled'] == "true"
                            if retryAble:
                                logger.info(
                                    "Kicking off a thread for the command, id="
                                    + str(command['commandId']) + " taskId=" +
                                    str(command['taskId']))
                                t = threading.Thread(
                                    target=self.process_command,
                                    args=(command, ))
                                t.daemon = True
                                t.start()
                            else:
                                self.process_command(command)
                                break
                            pass
                        pass
                except (Queue.Empty):
                    pass
        except:
            logger.exception("ActionQueue thread failed with exception:")
            raise

        logger.info("ActionQueue thread has successfully finished")

    def processBackgroundQueueSafeEmpty(self):
        while not self.backgroundCommandQueue.empty():
            try:
                command = self.backgroundCommandQueue.get(False)
                if command.has_key(
                        '__handle') and command['__handle'].status == None:
                    self.process_command(command)
            except Queue.Empty:
                pass

    def createCommandHandle(self, command):
        if command.has_key('__handle'):
            raise AgentException("Command already has __handle")
        command['__handle'] = BackgroundCommandExecutionHandle(
            command, command['commandId'], None,
            self.on_background_command_complete_callback)
        return command

    def process_command(self, command):
        # make sure we log failures
        commandType = command['commandType']
        logger.debug("Took an element of Queue (command type = %s)." %
                     commandType)
        try:
            if commandType in [
                    self.EXECUTION_COMMAND, self.BACKGROUND_EXECUTION_COMMAND,
                    self.AUTO_EXECUTION_COMMAND
            ]:
                try:
                    if self.controller.recovery_manager.enabled():
                        self.controller.recovery_manager.start_execution_command(
                        )
                    self.execute_command(command)
                finally:
                    if self.controller.recovery_manager.enabled():
                        self.controller.recovery_manager.stop_execution_command(
                        )
            else:
                logger.error("Unrecognized command " + pprint.pformat(command))
        except Exception:
            logger.exception(
                "Exception while processing {0} command".format(commandType))

    def tasks_in_progress_or_pending(self):
        return_val = False
        if not self.commandQueue.empty():
            return_val = True
        if self.controller.recovery_manager.has_active_command():
            return_val = True
        return return_val
        pass

    def execute_command(self, command):
        '''
    Executes commands of type EXECUTION_COMMAND
    '''
        clusterName = command['clusterName']
        commandId = command['commandId']
        isCommandBackground = command[
            'commandType'] == self.BACKGROUND_EXECUTION_COMMAND
        isAutoExecuteCommand = command[
            'commandType'] == self.AUTO_EXECUTION_COMMAND
        message = "Executing command with id = {commandId}, taskId = {taskId} for role = {role} of " \
                  "cluster {cluster}.".format(
                  commandId = str(commandId), taskId = str(command['taskId']),
                  role=command['role'], cluster=clusterName)
        logger.info(message)

        taskId = command['taskId']
        # Preparing 'IN_PROGRESS' report
        in_progress_status = self.commandStatuses.generate_report_template(
            command)
        # The path of the files that contain the output log and error log use a prefix that the agent advertises to the
        # server. The prefix is defined in agent-config.ini
        if not isAutoExecuteCommand:
            in_progress_status.update({
                'tmpout':
                self.tmpdir + os.sep + 'output-' + str(taskId) + '.txt',
                'tmperr':
                self.tmpdir + os.sep + 'errors-' + str(taskId) + '.txt',
                'structuredOut':
                self.tmpdir + os.sep + 'structured-out-' + str(taskId) +
                '.json',
                'status':
                self.IN_PROGRESS_STATUS
            })
        else:
            in_progress_status.update({
                'tmpout':
                self.tmpdir + os.sep + 'auto_output-' + str(taskId) + '.txt',
                'tmperr':
                self.tmpdir + os.sep + 'auto_errors-' + str(taskId) + '.txt',
                'structuredOut':
                self.tmpdir + os.sep + 'auto_structured-out-' + str(taskId) +
                '.json',
                'status':
                self.IN_PROGRESS_STATUS
            })

        self.commandStatuses.put_command_status(command, in_progress_status)

        numAttempts = 0
        retryDuration = 0  # even with 0 allow one attempt
        retryAble = False
        delay = 1
        log_command_output = True
        if 'commandParams' in command and 'log_output' in command[
                'commandParams'] and "false" == command['commandParams'][
                    'log_output']:
            log_command_output = False

        if 'commandParams' in command:
            if 'max_duration_for_retries' in command['commandParams']:
                retryDuration = int(
                    command['commandParams']['max_duration_for_retries'])
            if 'command_retry_enabled' in command['commandParams']:
                retryAble = command['commandParams'][
                    'command_retry_enabled'] == "true"
        if isAutoExecuteCommand:
            retryAble = False

        logger.info(
            "Command execution metadata - taskId = {taskId}, retry enabled = {retryAble}, max retry duration (sec) = {retryDuration}, log_output = {log_command_output}"
            .format(taskId=taskId,
                    retryAble=retryAble,
                    retryDuration=retryDuration,
                    log_command_output=log_command_output))
        command_canceled = False
        while retryDuration >= 0:
            numAttempts += 1
            start = 0
            if retryAble:
                start = int(time.time())
            # running command
            commandresult = self.customServiceOrchestrator.runCommand(
                command,
                in_progress_status['tmpout'],
                in_progress_status['tmperr'],
                override_output_files=numAttempts == 1,
                retry=numAttempts > 1)
            end = 1
            if retryAble:
                end = int(time.time())
            retryDuration -= (end - start)

            # dumping results
            if isCommandBackground:
                logger.info(
                    "Command is background command, quit retrying. Exit code: {exitCode}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId,
                            exitCode=commandresult['exitcode'],
                            retryAble=retryAble,
                            retryDuration=retryDuration,
                            delay=delay))
                return
            else:
                if commandresult['exitcode'] == 0:
                    status = self.COMPLETED_STATUS
                else:
                    status = self.FAILED_STATUS
                    if (commandresult['exitcode']
                            == -signal.SIGTERM) or (commandresult['exitcode']
                                                    == -signal.SIGKILL):
                        logger.info(
                            'Command with taskId = {cid} was canceled!'.format(
                                cid=taskId))
                        command_canceled = True
                        break

            if status != self.COMPLETED_STATUS and retryAble and retryDuration > 0:
                delay = self.get_retry_delay(delay)
                if delay > retryDuration:
                    delay = retryDuration
                retryDuration -= delay  # allow one last attempt
                commandresult[
                    'stderr'] += "\n\nCommand failed. Retrying command execution ...\n\n"
                logger.info(
                    "Retrying command with taskId = {cid} after a wait of {delay}"
                    .format(cid=taskId, delay=delay))
                command['commandBeingRetried'] = "true"
                time.sleep(delay)
                continue
            else:
                logger.info(
                    "Quit retrying for command with taskId = {cid}. Status: {status}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId,
                            status=status,
                            retryAble=retryAble,
                            retryDuration=retryDuration,
                            delay=delay))
                break

        # do not fail task which was rescheduled from server
        if command_canceled:
            with self.lock:
                with self.commandQueue.mutex:
                    for com in self.commandQueue.queue:
                        if com['taskId'] == command['taskId']:
                            logger.info(
                                'Command with taskId = {cid} was rescheduled by server. '
                                'Fail report on cancelled command won\'t be sent with heartbeat.'
                                .format(cid=taskId))
                            return

        # final result to stdout
        commandresult[
            'stdout'] += '\n\nCommand completed successfully!\n' if status == self.COMPLETED_STATUS else '\n\nCommand failed after ' + str(
                numAttempts) + ' tries\n'
        logger.info(
            'Command with taskId = {cid} completed successfully!'.format(
                cid=taskId) if status == self.COMPLETED_STATUS else
            'Command with taskId = {cid} failed after {attempts} tries'.
            format(cid=taskId, attempts=numAttempts))

        roleResult = self.commandStatuses.generate_report_template(command)
        roleResult.update({
            'stdout': commandresult['stdout'],
            'stderr': commandresult['stderr'],
            'exitCode': commandresult['exitcode'],
            'status': status,
        })

        if self.config.has_option("logging","log_command_executes") \
            and int(self.config.get("logging", "log_command_executes")) == 1 \
            and log_command_output:

            if roleResult['stdout'] != '':
                logger.info("Begin command output log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])
                self.log_command_output(roleResult['stdout'],
                                        str(command['taskId']))
                logger.info("End command output log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])

            if roleResult['stderr'] != '':
                logger.info("Begin command stderr log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])
                self.log_command_output(roleResult['stderr'],
                                        str(command['taskId']))
                logger.info("End command stderr log for command with id = " +
                            str(command['taskId']) + ", role = " +
                            command['role'] + ", roleCommand = " +
                            command['roleCommand'])

        if roleResult['stdout'] == '':
            roleResult['stdout'] = 'None'
        if roleResult['stderr'] == '':
            roleResult['stderr'] = 'None'

        # let ambari know name of custom command
        if command['hostLevelParams'].has_key('custom_command'):
            roleResult['customCommand'] = command['hostLevelParams'][
                'custom_command']

        if 'structuredOut' in commandresult:
            roleResult['structuredOut'] = str(
                json.dumps(commandresult['structuredOut']))
        else:
            roleResult['structuredOut'] = ''

        # let recovery manager know the current state
        if status == self.COMPLETED_STATUS:
            if self.controller.recovery_manager.enabled() and command.has_key('roleCommand') \
                and self.controller.recovery_manager.configured_for_recovery(command['role']):
                if command['roleCommand'] == self.ROLE_COMMAND_START:
                    self.controller.recovery_manager.update_current_status(
                        command['role'], LiveStatus.LIVE_STATUS)
                    self.controller.recovery_manager.update_config_staleness(
                        command['role'], False)
                    logger.info(
                        "After EXECUTION_COMMAND (START), with taskId=" +
                        str(command['taskId']) + ", current state of " +
                        command['role'] + " to " +
                        self.controller.recovery_manager.get_current_status(
                            command['role']))
                elif command[
                        'roleCommand'] == self.ROLE_COMMAND_STOP or command[
                            'roleCommand'] == self.ROLE_COMMAND_INSTALL:
                    self.controller.recovery_manager.update_current_status(
                        command['role'], LiveStatus.DEAD_STATUS)
                    logger.info(
                        "After EXECUTION_COMMAND (STOP/INSTALL), with taskId="
                        + str(command['taskId']) + ", current state of " +
                        command['role'] + " to " +
                        self.controller.recovery_manager.get_current_status(
                            command['role']))
                elif command[
                        'roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND:
                    if command['hostLevelParams'].has_key('custom_command') and \
                            command['hostLevelParams']['custom_command'] == self.CUSTOM_COMMAND_RESTART:
                        self.controller.recovery_manager.update_current_status(
                            command['role'], LiveStatus.LIVE_STATUS)
                        self.controller.recovery_manager.update_config_staleness(
                            command['role'], False)
                        logger.info(
                            "After EXECUTION_COMMAND (RESTART), current state of "
                            + command['role'] + " to " +
                            self.controller.recovery_manager.
                            get_current_status(command['role']))
            pass

            # let ambari know that configuration tags were applied
            configHandler = ActualConfigHandler(self.config, self.configTags)

            #update
            if 'commandParams' in command:
                command_params = command['commandParams']
                if command_params and command_params.has_key(
                        'forceRefreshConfigTags') and len(
                            command_params['forceRefreshConfigTags']) > 0:
                    forceRefreshConfigTags = command_params[
                        'forceRefreshConfigTags'].split(',')
                    logger.info(
                        "Got refresh additional component tags command")

                    for configTag in forceRefreshConfigTags:
                        configHandler.update_component_tag(
                            command['role'], configTag,
                            command['configurationTags'][configTag])

                    roleResult[
                        'customCommand'] = self.CUSTOM_COMMAND_RESTART  # force restart for component to evict stale_config on server side
                    command[
                        'configurationTags'] = configHandler.read_actual_component(
                            command['role'])

            if command.has_key('configurationTags'):
                configHandler.write_actual(command['configurationTags'])
                roleResult['configurationTags'] = command['configurationTags']
            component = {
                'serviceName': command['serviceName'],
                'componentName': command['role']
            }
            if 'roleCommand' in command and \
                (command['roleCommand'] == self.ROLE_COMMAND_START or
                   (command['roleCommand'] == self.ROLE_COMMAND_INSTALL and component in LiveStatus.CLIENT_COMPONENTS) or
                     (command['roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND and
                        'custom_command' in command['hostLevelParams'] and
                            command['hostLevelParams']['custom_command'] in (self.CUSTOM_COMMAND_RESTART, self.CUSTOM_COMMAND_START))):
                configHandler.write_actual_component(
                    command['role'], command['configurationTags'])
                if 'clientsToUpdateConfigs' in command[
                        'hostLevelParams'] and command['hostLevelParams'][
                            'clientsToUpdateConfigs']:
                    configHandler.write_client_components(
                        command['serviceName'], command['configurationTags'],
                        command['hostLevelParams']['clientsToUpdateConfigs'])
                roleResult[
                    'configurationTags'] = configHandler.read_actual_component(
                        command['role'])
        elif status == self.FAILED_STATUS:
            if self.controller.recovery_manager.enabled() and command.has_key('roleCommand') \
                    and self.controller.recovery_manager.configured_for_recovery(command['role']):
                if command['roleCommand'] == self.ROLE_COMMAND_INSTALL:
                    self.controller.recovery_manager.update_current_status(
                        command['role'],
                        self.controller.recovery_manager.INSTALL_FAILED)
                    logger.info(
                        "After EXECUTION_COMMAND (INSTALL), with taskId=" +
                        str(command['taskId']) + ", current state of " +
                        command['role'] + " to " +
                        self.controller.recovery_manager.get_current_status(
                            command['role']))

        self.commandStatuses.put_command_status(command, roleResult)

    def log_command_output(self, text, taskId):
        """
    Logs a message as multiple enumerated log messages, each of which is no larger than MAX_SYMBOLS_PER_LOG_MESSAGE.

    If logs are redirected to syslog (syslog_enabled=1), this is very useful for logging big messages,
    as syslog usually truncates long messages.
    """
        chunks = split_on_chunks(text, MAX_SYMBOLS_PER_LOG_MESSAGE)
        if len(chunks) > 1:
            for i in range(len(chunks)):
                logger.info(
                    "Cmd log for taskId={0} and chunk {1}/{2} of log for command: \n"
                    .format(taskId, i + 1, len(chunks)) + chunks[i])
        else:
            logger.info("Cmd log for taskId={0}: ".format(taskId) + text)

    def get_retry_delay(self, last_delay):
        """
    Returns an exponentially growing delay. The idea is that if the number of retries is high, the reason
    for retrying is probably a host- or environment-specific issue requiring longer waits.
    """
        return last_delay * 2
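    # Worked example (derived from execute_command above): with the initial delay of 1,
    # the waits between failed attempts grow as get_retry_delay(1) = 2, then 4, 8, 16, ...
    # seconds; each wait is capped at the remaining max_duration_for_retries and
    # subtracted from it, so retries stop once that duration is exhausted.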

    def command_was_canceled(self):
        self.customServiceOrchestrator

    def on_background_command_complete_callback(self, process_condensed_result,
                                                handle):
        logger.debug('Start callback: %s' % process_condensed_result)
        logger.debug('The handle is: %s' % handle)
        status = self.COMPLETED_STATUS if handle.exitCode == 0 else self.FAILED_STATUS

        aborted_postfix = self.customServiceOrchestrator.command_canceled_reason(
            handle.command['taskId'])
        if aborted_postfix:
            status = self.FAILED_STATUS
            logger.debug('Set status to: %s , reason = %s' %
                         (status, aborted_postfix))
        else:
            aborted_postfix = ''

        roleResult = self.commandStatuses.generate_report_template(
            handle.command)

        roleResult.update({
            'stdout':
            process_condensed_result['stdout'] + aborted_postfix,
            'stderr':
            process_condensed_result['stderr'] + aborted_postfix,
            'exitCode':
            process_condensed_result['exitcode'],
            'structuredOut':
            str(json.dumps(process_condensed_result['structuredOut']))
            if 'structuredOut' in process_condensed_result else '',
            'status':
            status,
        })

        self.commandStatuses.put_command_status(handle.command, roleResult)

    def execute_status_command_and_security_status(self, command):
        component_status_result = self.customServiceOrchestrator.requestComponentStatus(
            command)
        return command, component_status_result

    def process_status_command_result(self, result):
        '''
    Processes the result of a STATUS_COMMAND
    '''
        try:
            command, component_status_result = result
            cluster = command['clusterName']
            service = command['serviceName']
            component = command['componentName']
            configurations = command['configurations']
            if configurations.has_key('global'):
                globalConfig = configurations['global']
            else:
                globalConfig = {}

            if not Script.config:
                logger.debug(
                    'Setting Script.config to last status command configuration'
                )
                Script.config = command

            livestatus = LiveStatus(cluster, service, component, globalConfig,
                                    self.config, self.configTags)

            component_extra = None

            if component_status_result['exitcode'] == 0:
                component_status = LiveStatus.LIVE_STATUS
                if self.controller.recovery_manager.enabled() \
                  and self.controller.recovery_manager.configured_for_recovery(component):
                    self.controller.recovery_manager.update_current_status(
                        component, component_status)
            else:
                component_status = LiveStatus.DEAD_STATUS
                if self.controller.recovery_manager.enabled() \
                  and self.controller.recovery_manager.configured_for_recovery(component):
                    if (self.controller.recovery_manager.get_current_status(
                            component) !=
                            self.controller.recovery_manager.INSTALL_FAILED):
                        self.controller.recovery_manager.update_current_status(
                            component, component_status)

            request_execution_cmd = self.controller.recovery_manager.requires_recovery(component) and \
                                      not self.controller.recovery_manager.command_exists(component, ActionQueue.EXECUTION_COMMAND)

            if 'structuredOut' in component_status_result:
                component_extra = component_status_result['structuredOut']

            result = livestatus.build(component_status=component_status)
            if self.controller.recovery_manager.enabled():
                result['sendExecCmdDet'] = str(request_execution_cmd)

            if component_extra is not None and len(component_extra) != 0:
                if component_extra.has_key('alerts'):
                    result['alerts'] = component_extra['alerts']
                    del component_extra['alerts']

                result['extra'] = component_extra

            logger.debug("Got live status for component " + component + \
                         " of service " + str(service) + \
                         " of cluster " + str(cluster))

            logger.debug(pprint.pformat(result))
            if result is not None:
                self.commandStatuses.put_command_status(command, result)
        except Exception, err:
            traceback.print_exc()
            logger.warn(err)
        pass
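
The cancel() method in the example above removes a queued command by swapping in a fresh queue and re-adding every entry except the target taskId. Here is a standalone sketch of that drain-and-requeue pattern; the command dicts are made up and carry only the fields the pattern needs.

import Queue

commandQueue = Queue.Queue()
for task_id in (5, 6, 7):
    commandQueue.put({'taskId': task_id, 'commandType': 'EXECUTION_COMMAND'})

target_task_id = 6
old_queue, commandQueue = commandQueue, Queue.Queue()
while not old_queue.empty():
    queued_command = old_queue.get(False)
    if queued_command['taskId'] != target_task_id:
        commandQueue.put(queued_command)  # keep every other command, in order

print([c['taskId'] for c in list(commandQueue.queue)])  # -> [5, 7]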
Example #13
  def execute_command(self, command):
    '''
    Executes commands of type EXECUTION_COMMAND
    '''
    clusterName = command['clusterName']
    commandId = command['commandId']

    message = "Executing command with id = {commandId} for role = {role} of " \
              "cluster {cluster}".format(
      commandId=str(commandId), role=command['role'],
      cluster=clusterName)
    logger.info(message)
    logger.debug(pprint.pformat(command))

    taskId = command['taskId']

    # if auto generated then do not report result
    reportResult = CommandStatusDict.shouldReportResult(command)

    # Preparing 'IN_PROGRESS' report
    in_progress_status = self.commandStatuses.generate_report_template(command)
    in_progress_status.update({
      'tmpout': self.tmpdir + os.sep + 'output-' + str(taskId) + '.txt',
      'tmperr': self.tmpdir + os.sep + 'errors-' + str(taskId) + '.txt',
      'structuredOut': self.tmpdir + os.sep + 'structured-out-' + str(
        taskId) + '.json',
      'status': self.IN_PROGRESS_STATUS,
      'reportResult': reportResult
    })
    self.commandStatuses.put_command_status(command, in_progress_status, reportResult)

    store_config = False
    if ActionQueue.STORE_APPLIED_CONFIG in command['commandParams']:
      store_config = 'true' == command['commandParams'][ActionQueue.STORE_APPLIED_CONFIG]
    store_command = False
    if 'roleParams' in command and ActionQueue.AUTO_RESTART in command['roleParams']:
      store_command = 'true' == command['roleParams'][ActionQueue.AUTO_RESTART]

    if store_command:
      logger.info("Component has indicated auto-restart. Saving details from START command.")

    # running command
    commandresult = self.customServiceOrchestrator.runCommand(command,
                                                              in_progress_status[
                                                                'tmpout'],
                                                              in_progress_status[
                                                                'tmperr'],
                                                              True,
                                                              store_config or store_command)
    # If command is STOP then set flag to indicate stop has been triggered.
    # In future we might check status of STOP command and take other measures
    # if graceful STOP fails (like force kill the processes)
    if command['roleCommand'] == 'STOP':
      self.controller.appGracefulStopTriggered = True

    # dumping results
    status = self.COMPLETED_STATUS
    if commandresult[Constants.EXIT_CODE] != 0:
      status = self.FAILED_STATUS
    roleResult = self.commandStatuses.generate_report_template(command)
    roleResult.update({
      'stdout': commandresult['stdout'],
      'stderr': commandresult['stderr'],
      Constants.EXIT_CODE: commandresult[Constants.EXIT_CODE],
      'status': status,
      'reportResult': reportResult
    })
    if roleResult['stdout'] == '':
      roleResult['stdout'] = 'None'
    if roleResult['stderr'] == '':
      roleResult['stderr'] = 'None'

    if 'structuredOut' in commandresult:
      roleResult['structuredOut'] = str(commandresult['structuredOut'])
    else:
      roleResult['structuredOut'] = ''
      # let server know that configuration tags were applied
    if status == self.COMPLETED_STATUS:
      if 'configurationTags' in command:
        roleResult['configurationTags'] = command['configurationTags']
      if Constants.ALLOCATED_PORTS in commandresult:
        roleResult['allocatedPorts'] = commandresult[Constants.ALLOCATED_PORTS]
      if Constants.FOLDERS in commandresult:
        roleResult['folders'] = commandresult[Constants.FOLDERS]
    self.commandStatuses.put_command_status(command, roleResult, reportResult)
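
In this variant, store_config and store_command are driven by two string flags in the incoming command. A small illustrative fragment of those fields follows; the constant names ActionQueue.STORE_APPLIED_CONFIG and ActionQueue.AUTO_RESTART come from the code above, while their literal string values are defined elsewhere in the class and the surrounding fields are made up.

# Illustrative fragment only; the literal key strings are defined by the class constants.
command_fragment = {
    'commandParams': {ActionQueue.STORE_APPLIED_CONFIG: 'true'},  # -> store_config = True
    'roleParams': {ActionQueue.AUTO_RESTART: 'true'},             # -> store_command = True
}
# Either flag being 'true' makes execute_command pass store_config or store_command as the
# final argument of customServiceOrchestrator.runCommand(...).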
Example #14
    def execute_command(self, command):
        '''
    Executes commands of type EXECUTION_COMMAND
    '''
        clusterName = command['clusterName']
        commandId = command['commandId']

        message = "Executing command with id = {commandId} for role = {role} of " \
                  "cluster {cluster}".format(
          commandId=str(commandId), role=command['role'],
          cluster=clusterName)
        logger.info(message)
        logger.debug(pprint.pformat(command))

        taskId = command['taskId']

        # if auto generated then do not report result
        reportResult = CommandStatusDict.shouldReportResult(command)

        # Preparing 'IN_PROGRESS' report
        in_progress_status = self.commandStatuses.generate_report_template(
            command)
        in_progress_status.update({
            'tmpout':
            self.tmpdir + os.sep + 'output-' + str(taskId) + '.txt',
            'tmperr':
            self.tmpdir + os.sep + 'errors-' + str(taskId) + '.txt',
            'structuredOut':
            self.tmpdir + os.sep + 'structured-out-' + str(taskId) + '.json',
            'status':
            self.IN_PROGRESS_STATUS,
            'reportResult':
            reportResult
        })
        self.commandStatuses.put_command_status(command, in_progress_status,
                                                reportResult)

        store_config = False
        if ActionQueue.STORE_APPLIED_CONFIG in command['commandParams']:
            store_config = 'true' == command['commandParams'][
                ActionQueue.STORE_APPLIED_CONFIG]
        store_command = False
        if 'roleParams' in command and ActionQueue.AUTO_RESTART in command[
                'roleParams']:
            logger.info(
                "Component has indicated auto-restart. Saving details from START command."
            )
            store_command = 'true' == command['roleParams'][
                ActionQueue.AUTO_RESTART]

        # running command
        commandresult = self.customServiceOrchestrator.runCommand(
            command, in_progress_status['tmpout'],
            in_progress_status['tmperr'], True, store_config or store_command)
        # dumping results
        status = self.COMPLETED_STATUS
        if commandresult[Constants.EXIT_CODE] != 0:
            status = self.FAILED_STATUS
        roleResult = self.commandStatuses.generate_report_template(command)
        roleResult.update({
            'stdout':
            commandresult['stdout'],
            'stderr':
            commandresult['stderr'],
            Constants.EXIT_CODE:
            commandresult[Constants.EXIT_CODE],
            'status':
            status,
            'reportResult':
            reportResult
        })
        if roleResult['stdout'] == '':
            roleResult['stdout'] = 'None'
        if roleResult['stderr'] == '':
            roleResult['stderr'] = 'None'

        if 'structuredOut' in commandresult:
            roleResult['structuredOut'] = str(commandresult['structuredOut'])
        else:
            roleResult['structuredOut'] = ''
            # let server know that configuration tags were applied
        if status == self.COMPLETED_STATUS:
            if 'configurationTags' in command:
                roleResult['configurationTags'] = command['configurationTags']
            if Constants.ALLOCATED_PORTS in commandresult:
                roleResult['allocatedPorts'] = commandresult[
                    Constants.ALLOCATED_PORTS]
            if Constants.FOLDERS in commandresult:
                roleResult['folders'] = commandresult[Constants.FOLDERS]
        self.commandStatuses.put_command_status(command, roleResult,
                                                reportResult)
Example #15
    def execute_command(self, command):
        '''
    Executes commands of type EXECUTION_COMMAND
    '''
        clusterName = command['clusterName']
        commandId = command['commandId']
        if 'package' in command:
            self.componentPackage = command['package']
        else:
            self.componentPackage = ''

        logger.info("Package received: " + self.componentPackage)

        message = "Executing command with id = {commandId} for role = {role} of " \
                  "cluster {cluster}".format(
          commandId=str(commandId), role=command['role'],
          cluster=clusterName)
        logger.info(message)
        logger.debug(pprint.pformat(command))

        taskId = command['taskId']

        # if auto generated then do not report result
        reportResult = CommandStatusDict.shouldReportResult(command)

        # Preparing 'IN_PROGRESS' report
        in_progress_status = self.commandStatuses.generate_report_template(
            command)
        in_progress_status.update({
            'tmpout':
            self.tmpdir + os.sep + 'output-' + str(taskId) + '.txt',
            'tmperr':
            self.tmpdir + os.sep + 'errors-' + str(taskId) + '.txt',
            'structuredOut':
            self.tmpdir + os.sep + 'structured-out-' + str(taskId) + '.json',
            'status':
            self.IN_PROGRESS_STATUS,
            'reportResult':
            reportResult
        })
        self.commandStatuses.put_command_status(command, in_progress_status,
                                                reportResult)

        store_config = False
        if ActionQueue.STORE_APPLIED_CONFIG in command['commandParams']:
            store_config = 'true' == command['commandParams'][
                ActionQueue.STORE_APPLIED_CONFIG]
        store_command = False
        if 'roleParams' in command and command[
                'roleParams'] is not None and ActionQueue.AUTO_RESTART in command[
                    'roleParams']:
            store_command = 'true' == command['roleParams'][
                ActionQueue.AUTO_RESTART]

        if store_command:
            logger.info(
                "Component has indicated auto-restart. Saving details from START command."
            )

        logger.info("Running command: " + str(command))

        if 'configurations' in command and 'yarnDockerMode' in command and command[
                'yarnDockerMode'] == True:
            self.yarn_docker_mode = True
            commandresult = self.yarnDockerManager.execute_command(
                command, store_config or store_command)
        elif 'configurations' in command and 'docker' in command[
                'configurations']:
            self.docker_mode = True
            commandresult = self.dockerManager.execute_command(
                command, store_config or store_command)
        else:
            # running command
            commandresult = self.customServiceOrchestrator.runCommand(
                command, in_progress_status['tmpout'],
                in_progress_status['tmperr'], True, store_config
                or store_command)
        # If command is STOP then set flag to indicate stop has been triggered.
        # In future we might check status of STOP command and take other measures
        # if graceful STOP fails (like force kill the processes)
        if command['roleCommand'] == 'STOP':
            logger.info("Stop command received")
            self.controller.appGracefulStopTriggered = True

        # dumping results
        status = self.COMPLETED_STATUS
        if commandresult[Constants.EXIT_CODE] != 0:
            status = self.FAILED_STATUS
        roleResult = self.commandStatuses.generate_report_template(command)
        roleResult.update({
            'stdout':
            commandresult['stdout'],
            'stderr':
            commandresult['stderr'],
            Constants.EXIT_CODE:
            commandresult[Constants.EXIT_CODE],
            'status':
            status,
            'reportResult':
            reportResult
        })
        if roleResult['stdout'] == '':
            roleResult['stdout'] = 'None'
        if roleResult['stderr'] == '':
            roleResult['stderr'] = 'None'

        if 'structuredOut' in commandresult:
            roleResult['structuredOut'] = str(commandresult['structuredOut'])
        else:
            roleResult['structuredOut'] = ''
            # let server know that configuration tags were applied
        if status == self.COMPLETED_STATUS:
            if 'configurationTags' in command:
                roleResult['configurationTags'] = command['configurationTags']
            if Constants.ALLOCATED_PORTS in commandresult:
                roleResult['allocatedPorts'] = commandresult[
                    Constants.ALLOCATED_PORTS]
            if Constants.FOLDERS in commandresult:
                roleResult['folders'] = commandresult[Constants.FOLDERS]
        self.commandStatuses.put_command_status(command, roleResult,
                                                reportResult)
Example #16
class ActionQueue(threading.Thread):
  """ Action Queue for the agent. We pick one command at a time from the queue
  and execute it
  Note: Action and command terms in this and related classes are used interchangeably
  """

  # How many actions can be performed in parallel. Feel free to change
  MAX_CONCURRENT_ACTIONS = 5


  # How much time (in seconds) we need to wait for a new incoming execution command
  # before checking the status command queue
  EXECUTION_COMMAND_WAIT_TIME = 2

  STATUS_COMMAND = 'STATUS_COMMAND'
  EXECUTION_COMMAND = 'EXECUTION_COMMAND'
  AUTO_EXECUTION_COMMAND = 'AUTO_EXECUTION_COMMAND'
  BACKGROUND_EXECUTION_COMMAND = 'BACKGROUND_EXECUTION_COMMAND'
  ROLE_COMMAND_INSTALL = 'INSTALL'
  ROLE_COMMAND_START = 'START'
  ROLE_COMMAND_STOP = 'STOP'
  ROLE_COMMAND_CUSTOM_COMMAND = 'CUSTOM_COMMAND'
  CUSTOM_COMMAND_RESTART = 'RESTART'

  IN_PROGRESS_STATUS = 'IN_PROGRESS'
  COMPLETED_STATUS = 'COMPLETED'
  FAILED_STATUS = 'FAILED'

  def __init__(self, config, controller):
    super(ActionQueue, self).__init__()
    self.commandQueue = Queue.Queue()
    self.statusCommandQueue = Queue.Queue()
    self.backgroundCommandQueue = Queue.Queue()
    self.commandStatuses = CommandStatusDict(callback_action =
      self.status_update_callback)
    self.config = config
    self.controller = controller
    self.configTags = {}
    self._stop = threading.Event()
    self.tmpdir = config.get('agent', 'prefix')
    self.customServiceOrchestrator = CustomServiceOrchestrator(config, controller)
    self.parallel_execution = config.get_parallel_exec_option()
    if self.parallel_execution == 1:
      logger.info("Parallel execution is enabled, will execute agent commands in parallel")

  def stop(self):
    self._stop.set()

  def stopped(self):
    return self._stop.isSet()

  def put_status(self, commands):
    # We assume we received the full set of statuses, so we don't need to keep the old ones
    self.statusCommandQueue.queue.clear()

    for command in commands:
      logger.info("Adding " + command['commandType'] + " for component " + \
                  command['componentName'] + " of service " + \
                  command['serviceName'] + " of cluster " + \
                  command['clusterName'] + " to the queue.")
      self.statusCommandQueue.put(command)
      logger.debug(pprint.pformat(command))

  def put(self, commands):
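    """
    Queues incoming execution commands. Background execution commands get a
    command handle and go to the background queue; everything else goes to the
    main command queue.
    """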
    for command in commands:
      if not command.has_key('serviceName'):
        command['serviceName'] = "null"
      if not command.has_key('clusterName'):
        command['clusterName'] = 'null'

      logger.info("Adding " + command['commandType'] + " for role " + \
                  command['role'] + " for service " + \
                  command['serviceName'] + " of cluster " + \
                  command['clusterName'] + " to the queue.")
      if command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND :
        self.backgroundCommandQueue.put(self.createCommandHandle(command))
      else:
        self.commandQueue.put(command)

  def cancel(self, commands):
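    """
    Cancels the given commands: removes any matching taskId from the command
    queue and asks the orchestrator to kill the command if it is already running.
    """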
    for command in commands:

      logger.info("Canceling command {tid}".format(tid = str(command['target_task_id'])))
      logger.debug(pprint.pformat(command))

      task_id = command['target_task_id']
      reason = command['reason']

      # Remove from the command queue by task_id
      queue = self.commandQueue
      self.commandQueue = Queue.Queue()

      while not queue.empty():
        queued_command = queue.get(False)
        if queued_command['taskId'] != task_id:
          self.commandQueue.put(queued_command)
        else:
          logger.info("Canceling " + queued_command['commandType'] + \
                      " for service " + queued_command['serviceName'] + \
                      " and role " +  queued_command['role'] + \
                      " with taskId " + str(queued_command['taskId']))

      # Kill if in progress
      self.customServiceOrchestrator.cancel_command(task_id, reason)

  def run(self):
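    """
    Main loop: drains the background and status command queues, then takes
    execution commands one at a time, or in separate threads when parallel
    execution is enabled and the command is retry-enabled.
    """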
    try:
      while not self.stopped():
        self.processBackgroundQueueSafeEmpty()
        self.processStatusCommandQueueSafeEmpty()
        try:
          if self.parallel_execution == 0:
            command = self.commandQueue.get(True, self.EXECUTION_COMMAND_WAIT_TIME)
            self.process_command(command)
          else:
            # If parallel execution is enabled, just kick off all available
            # commands using separate threads
            while True:
              command = self.commandQueue.get(True, self.EXECUTION_COMMAND_WAIT_TIME)
              # If the command is not retry-enabled, do not start it in parallel.
              # Checking just one command is enough, as all commands for a stage are
              # sent at the same time and retry is only enabled for initial start/install.
              retryAble = False
              if 'commandParams' in command and 'command_retry_enabled' in command['commandParams']:
                retryAble = command['commandParams']['command_retry_enabled'] == "true"
              if retryAble:
                logger.info("Kicking off a thread for the command, id=" +
                            str(command['commandId']) + " taskId=" + str(command['taskId']))
                t = threading.Thread(target=self.process_command, args=(command,))
                t.daemon = True
                t.start()
              else:
                self.process_command(command)
                break
        except (Queue.Empty):
          pass
    except:
      logger.exception("ActionQueue thread failed with exception:")
      raise
    
    logger.info("ActionQueue thread has successfully finished")

  def processBackgroundQueueSafeEmpty(self):
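    """
    Starts any queued background execution commands that have not been started yet.
    """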
    while not self.backgroundCommandQueue.empty():
      try:
        command = self.backgroundCommandQueue.get(False)
        if command.has_key('__handle') and command['__handle'].status == None:
          self.process_command(command)
      except Queue.Empty:
        pass

  def processStatusCommandQueueSafeEmpty(self):
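    """
    Processes every status command currently in the status command queue.
    """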
    while not self.statusCommandQueue.empty():
      try:
        command = self.statusCommandQueue.get(False)
        self.process_command(command)
      except Queue.Empty:
        pass


  def createCommandHandle(self, command):
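    """
    Attaches a BackgroundCommandExecutionHandle to the command so that its
    completion can be reported through on_background_command_complete_callback.
    Raises AgentException if the command already has a handle.
    """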
    if command.has_key('__handle'):
      raise AgentException("Command already has __handle")
    command['__handle'] = BackgroundCommandExecutionHandle(command, command['commandId'], None, self.on_background_command_complete_callback)
    return command

  def process_command(self, command):
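    """
    Dispatches a single command to the appropriate handler based on its commandType;
    failures are logged rather than propagated.
    """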
    # make sure we log failures
    commandType = command['commandType']
    logger.debug("Took an element of Queue (command type = %s)." % commandType)
    try:
      if commandType in [self.EXECUTION_COMMAND, self.BACKGROUND_EXECUTION_COMMAND, self.AUTO_EXECUTION_COMMAND]:
        try:
          if self.controller.recovery_manager.enabled():
            self.controller.recovery_manager.start_execution_command()
          self.execute_command(command)
        finally:
          if self.controller.recovery_manager.enabled():
            self.controller.recovery_manager.stop_execution_command()
      elif commandType == self.STATUS_COMMAND:
        self.execute_status_command(command)
      else:
        logger.error("Unrecognized command " + pprint.pformat(command))
    except Exception:
      logger.exception("Exception while processing {0} command".format(commandType))

  def tasks_in_progress_or_pending(self):
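    """
    Returns True if execution commands are queued or the recovery manager
    reports an active command.
    """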
    return (not self.commandQueue.empty()
            or self.controller.recovery_manager.has_active_command())

  def execute_command(self, command):
    '''
    Executes commands of type EXECUTION_COMMAND
    '''
    clusterName = command['clusterName']
    commandId = command['commandId']
    isCommandBackground = command['commandType'] == self.BACKGROUND_EXECUTION_COMMAND
    isAutoExecuteCommand = command['commandType'] == self.AUTO_EXECUTION_COMMAND
    message = "Executing command with id = {commandId}, taskId = {taskId} for role = {role} of " \
              "cluster {cluster}.".format(
              commandId = str(commandId), taskId = str(command['taskId']),
              role=command['role'], cluster=clusterName)
    logger.info(message)

    taskId = command['taskId']
    # Preparing 'IN_PROGRESS' report
    in_progress_status = self.commandStatuses.generate_report_template(command)
    # The paths of the files that contain the output log and error log use a prefix that the agent
    # advertises to the server. The prefix is defined in agent-config.ini
    if not isAutoExecuteCommand:
      in_progress_status.update({
        'tmpout': self.tmpdir + os.sep + 'output-' + str(taskId) + '.txt',
        'tmperr': self.tmpdir + os.sep + 'errors-' + str(taskId) + '.txt',
        'structuredOut' : self.tmpdir + os.sep + 'structured-out-' + str(taskId) + '.json',
        'status': self.IN_PROGRESS_STATUS
      })
    else:
      in_progress_status.update({
        'tmpout': self.tmpdir + os.sep + 'auto_output-' + str(taskId) + '.txt',
        'tmperr': self.tmpdir + os.sep + 'auto_errors-' + str(taskId) + '.txt',
        'structuredOut' : self.tmpdir + os.sep + 'auto_structured-out-' + str(taskId) + '.json',
        'status': self.IN_PROGRESS_STATUS
      })

    self.commandStatuses.put_command_status(command, in_progress_status)

    numAttempts = 0
    retryDuration = 0  # even with 0 allow one attempt
    retryAble = False
    delay = 1
    log_command_output = True
    if 'commandParams' in command and 'log_output' in command['commandParams'] and "false" == command['commandParams']['log_output']:
      log_command_output = False

    if 'commandParams' in command:
      if 'max_duration_for_retries' in command['commandParams']:
        retryDuration = int(command['commandParams']['max_duration_for_retries'])
      if 'command_retry_enabled' in command['commandParams']:
        retryAble = command['commandParams']['command_retry_enabled'] == "true"
    if isAutoExecuteCommand:
      retryAble = False

    logger.info("Command execution metadata - taskId = {taskId}, retry enabled = {retryAble}, max retry duration (sec) = {retryDuration}, log_output = {log_command_output}".
                 format(taskId=taskId, retryAble=retryAble, retryDuration=retryDuration, log_command_output=log_command_output))
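    # Retry loop: the elapsed time of each retryable attempt is charged against
    # retryDuration; one attempt is always allowed even when the budget is 0.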
    while retryDuration >= 0:
      numAttempts += 1
      start = 0
      if retryAble:
        start = int(time.time())
      # running command
      commandresult = self.customServiceOrchestrator.runCommand(command,
                                                                in_progress_status['tmpout'],
                                                                in_progress_status['tmperr'],
                                                                override_output_files=numAttempts == 1,
                                                                retry=numAttempts > 1)
      end = 1
      if retryAble:
        end = int(time.time())
      retryDuration -= (end - start)

      # dumping results
      if isCommandBackground:
        logger.info("Command is background command, quit retrying. Exit code: {exitCode}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId, exitCode=commandresult['exitcode'], retryAble=retryAble, retryDuration=retryDuration, delay=delay))
        return
      else:
        if commandresult['exitcode'] == 0:
          status = self.COMPLETED_STATUS
        else:
          status = self.FAILED_STATUS
          if (commandresult['exitcode'] == -signal.SIGTERM) or (commandresult['exitcode'] == -signal.SIGKILL):
            logger.info('Command {cid} was canceled!'.format(cid=taskId))
            break

      if status != self.COMPLETED_STATUS and retryAble and retryDuration > 0:
        delay = self.get_retry_delay(delay)
        if delay > retryDuration:
          delay = retryDuration
        retryDuration -= delay  # allow one last attempt
        commandresult['stderr'] += "\n\nCommand failed. Retrying command execution ...\n\n"
        logger.info("Retrying command id {cid} after a wait of {delay}".format(cid=taskId, delay=delay))
        time.sleep(delay)
        continue
      else:
        logger.info("Quit retrying for command id {cid}. Status: {status}, retryAble: {retryAble}, retryDuration (sec): {retryDuration}, last delay (sec): {delay}"
                    .format(cid=taskId, status=status, retryAble=retryAble, retryDuration=retryDuration, delay=delay))
        break

    # final result to stdout
    commandresult['stdout'] += '\n\nCommand completed successfully!\n' if status == self.COMPLETED_STATUS else '\n\nCommand failed after ' + str(numAttempts) + ' tries\n'
    logger.info('Command {cid} completed successfully!'.format(cid=taskId) if status == self.COMPLETED_STATUS else 'Command {cid} failed after {attempts} tries'.format(cid=taskId, attempts=numAttempts))

    roleResult = self.commandStatuses.generate_report_template(command)
    roleResult.update({
      'stdout': commandresult['stdout'],
      'stderr': commandresult['stderr'],
      'exitCode': commandresult['exitcode'],
      'status': status,
    })

    if self.config.has_option("logging","log_command_executes") \
        and int(self.config.get("logging", "log_command_executes")) == 1 \
        and log_command_output:

      if roleResult['stdout'] != '':
          logger.info("Begin command output log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])
          self.log_command_output(roleResult['stdout'], str(command['taskId']))
          logger.info("End command output log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])

      if roleResult['stderr'] != '':
          logger.info("Begin command stderr log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])
          self.log_command_output(roleResult['stderr'], str(command['taskId']))
          logger.info("End command stderr log for command with id = " + str(command['taskId']) + ", role = "
                      + command['role'] + ", roleCommand = " + command['roleCommand'])

    if roleResult['stdout'] == '':
      roleResult['stdout'] = 'None'
    if roleResult['stderr'] == '':
      roleResult['stderr'] = 'None'

    # let ambari know name of custom command
    if command['hostLevelParams'].has_key('custom_command'):
      roleResult['customCommand'] = command['hostLevelParams']['custom_command']

    if 'structuredOut' in commandresult:
      roleResult['structuredOut'] = str(json.dumps(commandresult['structuredOut']))
    else:
      roleResult['structuredOut'] = ''

    # let recovery manager know the current state
    if status == self.COMPLETED_STATUS:
      if self.controller.recovery_manager.enabled() and command.has_key('roleCommand') \
          and self.controller.recovery_manager.configured_for_recovery(command['role']):
        if command['roleCommand'] == self.ROLE_COMMAND_START:
          self.controller.recovery_manager.update_current_status(command['role'], LiveStatus.LIVE_STATUS)
          self.controller.recovery_manager.update_config_staleness(command['role'], False)
          logger.info("After EXECUTION_COMMAND (START), with taskId=" + str(command['taskId']) +
                      ", current state of " + command['role'] + " to " +
                       self.controller.recovery_manager.get_current_status(command['role']) )
        elif command['roleCommand'] == self.ROLE_COMMAND_STOP or command['roleCommand'] == self.ROLE_COMMAND_INSTALL:
          self.controller.recovery_manager.update_current_status(command['role'], LiveStatus.DEAD_STATUS)
          logger.info("After EXECUTION_COMMAND (STOP/INSTALL), with taskId=" + str(command['taskId']) +
                      ", current state of " + command['role'] + " to " +
                       self.controller.recovery_manager.get_current_status(command['role']) )
        elif command['roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND:
          if command['hostLevelParams'].has_key('custom_command') and \
                  command['hostLevelParams']['custom_command'] == self.CUSTOM_COMMAND_RESTART:
            self.controller.recovery_manager.update_current_status(command['role'], LiveStatus.LIVE_STATUS)
            self.controller.recovery_manager.update_config_staleness(command['role'], False)
            logger.info("After EXECUTION_COMMAND (RESTART), current state of " + command['role'] + " to " +
                         self.controller.recovery_manager.get_current_status(command['role']) )
      pass

      # let ambari know that configuration tags were applied
      configHandler = ActualConfigHandler(self.config, self.configTags)

      if command.has_key('configurationTags'):
        configHandler.write_actual(command['configurationTags'])
        roleResult['configurationTags'] = command['configurationTags']
      component = {'serviceName':command['serviceName'],'componentName':command['role']}
      if 'roleCommand' in command and \
          (command['roleCommand'] == self.ROLE_COMMAND_START or
             (command['roleCommand'] == self.ROLE_COMMAND_INSTALL and component in LiveStatus.CLIENT_COMPONENTS) or
               (command['roleCommand'] == self.ROLE_COMMAND_CUSTOM_COMMAND and
                  'custom_command' in command['hostLevelParams'] and
                      command['hostLevelParams']['custom_command'] == self.CUSTOM_COMMAND_RESTART)):
        configHandler.write_actual_component(command['role'],
                                             command['configurationTags'])
        if 'clientsToUpdateConfigs' in command['hostLevelParams'] and command['hostLevelParams']['clientsToUpdateConfigs']:
          configHandler.write_client_components(command['serviceName'],
                                                command['configurationTags'],
                                                command['hostLevelParams']['clientsToUpdateConfigs'])
        roleResult['configurationTags'] = configHandler.read_actual_component(
            command['role'])
    elif status == self.FAILED_STATUS:
      if self.controller.recovery_manager.enabled() and command.has_key('roleCommand') \
              and self.controller.recovery_manager.configured_for_recovery(command['role']):
        if command['roleCommand'] == self.ROLE_COMMAND_INSTALL:
          self.controller.recovery_manager.update_current_status(command['role'], self.controller.recovery_manager.INSTALL_FAILED)
          logger.info("After EXECUTION_COMMAND (INSTALL), with taskId=" + str(command['taskId']) +
                      ", current state of " + command['role'] + " to " +
                      self.controller.recovery_manager.get_current_status(command['role']))

    self.commandStatuses.put_command_status(command, roleResult)

  def log_command_output(self, text, taskId):
    """
    Logs a message as multiple enumerated log messages every of which is not larger than MAX_SYMBOLS_PER_LOG_MESSAGE.

    If logs are redirected to syslog (syslog_enabled=1), this is very useful for logging big messages.
    As syslog usually truncates long messages.
    """
    chunks = split_on_chunks(text, MAX_SYMBOLS_PER_LOG_MESSAGE)
    if len(chunks) > 1:
      for i in range(len(chunks)):
        logger.info("Cmd log for taskId={0} and chunk {1}/{2} of log for command: \n".format(taskId, i+1, len(chunks)) + chunks[i])
    else:
      logger.info("Cmd log for taskId={0}: ".format(taskId) + text)

  def get_retry_delay(self, last_delay):
    """
    Returns exponentially growing delay. The idea being if number of retries is high then the reason to retry
    is probably a host or environment specific issue requiring longer waits
    """
    return last_delay * 2

  def command_was_canceled(self):
    # NOTE: no-op stub; the bare attribute reference below has no effect.
    self.customServiceOrchestrator

  def on_background_command_complete_callback(self, process_condensed_result, handle):
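    """
    Builds and stores the final report for a finished background command; if the
    command was canceled, it is marked FAILED and the cancellation reason is
    appended to its output.
    """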
    logger.debug('Start callback: %s' % process_condensed_result)
    logger.debug('The handle is: %s' % handle)
    status = self.COMPLETED_STATUS if handle.exitCode == 0 else self.FAILED_STATUS

    aborted_postfix = self.customServiceOrchestrator.command_canceled_reason(handle.command['taskId'])
    if aborted_postfix:
      status = self.FAILED_STATUS
      logger.debug('Set status to: %s , reason = %s' % (status, aborted_postfix))
    else:
      aborted_postfix = ''


    roleResult = self.commandStatuses.generate_report_template(handle.command)

    roleResult.update({
      'stdout': process_condensed_result['stdout'] + aborted_postfix,
      'stderr': process_condensed_result['stderr'] + aborted_postfix,
      'exitCode': process_condensed_result['exitcode'],
      'structuredOut': str(json.dumps(process_condensed_result['structuredOut'])) if 'structuredOut' in process_condensed_result else '',
      'status': status,
    })

    self.commandStatuses.put_command_status(handle.command, roleResult)

  def execute_status_command(self, command):
    '''
    Executes commands of type STATUS_COMMAND
    '''
    try:
      cluster = command['clusterName']
      service = command['serviceName']
      component = command['componentName']
      configurations = command['configurations']
      if configurations.has_key('global'):
        globalConfig = configurations['global']
      else:
        globalConfig = {}

      livestatus = LiveStatus(cluster, service, component,
                              globalConfig, self.config, self.configTags)

      component_extra = None

      # For custom services, the responsibility for determining service status
      # is delegated to Python scripts
      component_status_result = self.customServiceOrchestrator.requestComponentStatus(command)
      component_security_status_result = self.customServiceOrchestrator.requestComponentSecurityState(command)

      if component_status_result['exitcode'] == 0:
        component_status = LiveStatus.LIVE_STATUS
        if self.controller.recovery_manager.enabled() \
          and self.controller.recovery_manager.configured_for_recovery(component):
          self.controller.recovery_manager.update_current_status(component, component_status)
      else:
        component_status = LiveStatus.DEAD_STATUS
        if self.controller.recovery_manager.enabled() \
          and self.controller.recovery_manager.configured_for_recovery(component):
          if (self.controller.recovery_manager.get_current_status(component) != self.controller.recovery_manager.INSTALL_FAILED):
            self.controller.recovery_manager.update_current_status(component, component_status)

      request_execution_cmd = self.controller.recovery_manager.requires_recovery(component) and \
                                not self.controller.recovery_manager.command_exists(component, ActionQueue.EXECUTION_COMMAND)

      if 'structuredOut' in component_status_result:
        component_extra = component_status_result['structuredOut']

      result = livestatus.build(component_status=component_status)
      if self.controller.recovery_manager.enabled():
        result['sendExecCmdDet'] = str(request_execution_cmd)

      # Add security state to the result
      result['securityState'] = component_security_status_result

      if component_extra is not None and len(component_extra) != 0:
        if component_extra.has_key('alerts'):
          result['alerts'] = component_extra['alerts']
          del component_extra['alerts']

        result['extra'] = component_extra

      logger.debug("Got live status for component " + component + \
                   " of service " + str(service) + \
                   " of cluster " + str(cluster))

      logger.debug(pprint.pformat(result))
      if result is not None:
        self.commandStatuses.put_command_status(command, result)
    except Exception, err:
      traceback.print_exc()
      logger.warn(err)
    pass
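
# Illustrative driver sketch, assuming `config` and `controller` are the agent's
# already-constructed config and Controller objects (placeholder names, not
# defined in this example):
#
#   action_queue = ActionQueue(config, controller)
#   action_queue.start()                      # ActionQueue is a Thread; starts run()
#   action_queue.put(execution_commands)      # list of EXECUTION_COMMAND dicts
#   action_queue.put_status(status_commands)  # replaces any queued STATUS_COMMANDs
#   action_queue.stop()                       # run() exits after the next queue poll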