class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :arg bool test: it tells if to run in test (no subprocesses) mode."""


        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)


        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 20,
                                   logger = self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]
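        # Hedged note: each name listed in config.TaskWorker.recurringActions (for example a
        # hypothetical 'RemovetmpDir') is resolved by getRecurringActionInst below, which imports
        # TaskWorker.Actions.Recurring.<name> and instantiates the class of the same name found
        # in that module.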


    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()


    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""

        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (prameters are %s)", configreq)
            return False

        return True


    def _getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during _getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _getWork request (prameters are %s)", configreq)
        return pendingwork


    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True


    def updateWork(self, taskname, command, status):
        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (prameters are %s)", configreq)


    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method put those
            task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self._getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)


    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Starting main loop.")
        while(not self.STOP):
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                continue

            pendingwork = self._getWork(limit=limit, getstatus='HOLDING')

            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))

            toInject = []
            for task in pendingwork:
                worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                toInject.append((worktype, task, failstatus, None))

            for task in pendingwork:
                self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED')

            self.slaves.injectWorks(toInject)

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")
Example #2
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :arg bool test: it tells if to run in test (no subprocesses) mode."""


        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)


        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s." % os.getpid())
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 2)
        self.logger.debug("Hostcert: %s, hostkey: %s" %(str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]


    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()


    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not available.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                                  "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                                  "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" %(str(traceback.format_exc()))
                msg += "\turl: %s\n" %(getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" %(getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())

        return True


    def _getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            self.logger.error("HTTP Error during _getWork: %s" % str(hte))
            self.logger.error("Could not get any work from the server: \n" +
                              "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +
                              "\treason: %s" %(hte.headers.get('X-Error-Detail', 'unknown')))
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " %(str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" %(getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" %(getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" %(str(exc)))
            self.logger.error(traceback.format_exc())
        return pendingwork


    def quit(self, code, traceback_):
        self.logger.info("Received kill request. Waiting for the workers...")
        self.STOP = True


    def updateWork(self, taskname, status):
        configreq = {'workflow': taskname, 'status': status, 'subresource': 'state'}
        retry = True
        while retry:
            try:
                self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
                retry = False
            except HTTPException as hte:
                #Using a msg variable and only one self.logger.error so that messages do not get shuffled
                msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
                msg += "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown'))
                msg += "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "\turl: %s\n" %(getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" %(getattr(hte, 'result', 'unknown'))
                msg += "%s \n" %(str(traceback.format_exc()))
                self.logger.error(msg)
                retry = False
                if int(hte.headers.get('X-Error-Http', '0')) == 503:
                    #503 - Database/Service unavailable. Maybe Intervention of CMSWEB ongoing?
                    retry = True
                    time_sleep = 30 + random.randint(10, 30)
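                    # i.e. sleep a randomized 40-60 seconds before retrying the POST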
                    self.logger.info("Sleeping %s seconds and will try to update again." % str(time_sleep))
                    time.sleep(time_sleep)
            except Exception as exc:
                msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
                self.logger.error(msg + traceback.format_exc())
                retry = False


    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Starting")
        while(not self.STOP):
            for status, worktype, failstatus in states():
                limit = self.slaves.queueableTasks()
                if not self._lockWork(limit=limit, getstatus=status, setstatus='HOLDING'):
                    continue
                ## Warning: If we fail to retrieve tasks on HOLDING (e.g. because cmsweb is down)
                ## we may end up executing the wrong worktype later on. A solution would be to
                ## save the previous task state in a new column of the TaskDB.
                pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
                self.logger.info("Retrieved a total of %d %s works" %(len(pendingwork), worktype))
                self.logger.debug("Retrieved the following works: \n%s" %(str(pendingwork)))
                self.slaves.injectWorks([(worktype, task, failstatus, None) for task in pendingwork])
                for task in pendingwork:
                    self.updateWork(task['tm_taskname'], 'QUEUED')

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s" %(str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d' % self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d' % self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d' % self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            finished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")
Example #3
class MasterWorker(object):
    """I am the master of the TaskWorker"""
    def __init__(self,
                 config,
                 logWarning,
                 logDebug,
                 sequential=False,
                 console=False,
                 name='master'):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if a verbose logger is needed
        :arg bool sequential: it tells if to run in sequential (no subprocesses) mode.
        :arg bool console: it tells if to log to console.
        :arg string name: defines a name for the log of this master process"""
        def createLogdir(dirname):
            """ Create the directory dirname ignoring errors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" %
                          dirname)
                    sys.exit(1)

        def createAndCleanLogDirectories(logsDir):
            # the logs directory may be named with the time stamp of when a TW started
            createLogdir(logsDir)
            createLogdir(logsDir + '/tasks')
            currentProcessesDir = logsDir + '/processes/'
            createLogdir(currentProcessesDir)
            # when running inside a container, process logs will start with the same
            # process numbers, i.e. the same names, at every container restart.
            # To avoid clashes and confusion, we put away all previous process
            # logs when a TW instance starts. To this end each TW run creates a
            # directory into which the next run will move its predecessor's logs,
            # so first identify the existing LatestLogs-<timestamp> directory, if any
            latestLogDir = None  # the logs directory could be empty
            files = os.listdir(currentProcessesDir)
            files.sort(
                reverse=True
            )  # if there are multiple Latest*, will hit the latest first
            for f in files:
                if f.startswith('Latest'):
                    latestLogDir = currentProcessesDir + f
                    break
            if files and latestLogDir:
                # rename from Latest to Old
                oldLogsDir = latestLogDir.replace('Latest', 'Old')
                shutil.move(latestLogDir, oldLogsDir)
            else:
                print(
                    "LatestLogDir not found in logs/processes, create a dummy dir to store old files"
                )
                oldLogsDir = currentProcessesDir + 'OldLog-Unknown'
                createLogdir(oldLogsDir)
            # move process logs for latest TW run to old directory
            for f in files:
                if f.startswith('proc.c3id'):
                    shutil.move(currentProcessesDir + f, oldLogsDir)

            # create a new LatestLogs directory where to store logs from this TaskWorker
            YYMMDD_HHMMSS = time.strftime('%y%m%d_%H%M%S', time.localtime())
            myDir = currentProcessesDir + 'LatestLogs-' + YYMMDD_HHMMSS
            createLogdir(myDir)
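
            # Resulting layout (illustrative): <logsDir>/tasks/, <logsDir>/processes/ with the
            # previous run's proc.c3id* logs moved into an Old*/ subdirectory, and a fresh
            # <logsDir>/processes/LatestLogs-YYMMDD_HHMMSS/ directory for the current run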

        def setRootLogger(logWarning, logDebug, console, name):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logsDir/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool logWarning: it tells if a quiet logger is needed
            :arg bool logDebug: it tells if a verbose logger is needed
            :arg bool console: it tells if to log to console
            :arg string name: define a name for the log file of this master process
            :return logger: a logger with the appropriate logger level."""

            # this must only be done for the real Master, not when it is used by TapeRecallStatus
            logsDir = config.TaskWorker.logsDir
            if name == 'master':
                createAndCleanLogDirectories(logsDir)

            if console:
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(logsDir + '/twlog.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if logWarning:
                loglevel = logging.WARNING
            if logDebug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger(name, logsDir)
            logger.info("PID %s.", os.getpid())
            logger.info("Logging level initialized to %s.", loglevel)
            return logger

        def logVersionAndConfig(config=None, logger=None):
            """
            Log the version number and the main configuration parameters.
            :arg config: a configuration object loaded from file
            :arg logger: the logger instance to use
            """
            twstartDict = {}
            twstartDict['version'] = __version__
            twstartDict['DBSHostName'] = config.Services.DBSHostName
            twstartDict['name'] = config.TaskWorker.name
            twstartDict['instance'] = config.TaskWorker.instance
            if config.TaskWorker.instance == 'other':
                twstartDict['restHost'] = config.TaskWorker.restHost
                twstartDict['dbInstance'] = config.TaskWorker.dbInstance
            twstartDict['nslaves'] = config.TaskWorker.nslaves
            twstartDict[
                'recurringActions'] = config.TaskWorker.recurringActions
            # one line for automatic parsing
            logger.info('TWSTART: %s', json.dumps(twstartDict))
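            # Hedged example of the resulting one-line record (all values illustrative):
            #   TWSTART: {"version": "x.y.z", "DBSHostName": "cmsweb.cern.ch", "name": "some-tw",
            #             "instance": "prod", "nslaves": 6, "recurringActions": ["SomeAction"]}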
            # multiple lines for humans to read
            for k, v in twstartDict.items():
                logger.info('%s: %s', k, v)
            return

        self.STOP = False
        self.TEST = sequential
        self.logger = setRootLogger(logWarning, logDebug, console, name)
        self.config = config
        self.restHost = None
        dbInstance = None

        logVersionAndConfig(self.config, self.logger)

        try:
            instance = self.config.TaskWorker.instance
        except:
            msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            self.restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                self.restHost = self.config.TaskWorker.restHost
                dbInstance = self.config.TaskWorker.dbInstance
            except:
                msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
                raise ConfigException(msg)
        self.dbInstance = dbInstance

        self.logger.info('Will connect via URL: https://%s/%s', self.restHost,
                         self.dbInstance)

        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.crabserver = CRABRest(self.restHost,
                                   self.config.TaskWorker.cmscert,
                                   self.config.TaskWorker.cmskey,
                                   retry=20,
                                   logger=self.logger,
                                   userAgent='CRABTaskWorker')
        self.crabserver.setDbInstance(self.dbInstance)
        self.logger.debug("Hostcert: %s, hostkey: %s",
                          str(self.config.TaskWorker.cmscert),
                          str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [
                retry * 20 * 2
                for retry in range(self.config.TaskWorker.max_retry)
            ]
        if not len(self.config.TaskWorker.retry_interval
                   ) == self.config.TaskWorker.max_retry:
            raise ConfigException(
                "No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry."
            )
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(
            X509_USER_CERT=self.config.TaskWorker.cmscert,
            X509_USER_KEY=self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, self.restHost,
                                     self.dbInstance)
        else:
            self.slaves = Worker(self.config, self.restHost, self.dbInstance)
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker,
                                        'recurringActions', [])
        self.recurringActions = [
            self.getRecurringActionInst(name) for name in recurringActionsNames
        ]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName,
                         fromlist=actionName)
        return getattr(mod, actionName)(self.config.TaskWorker.logsDir)

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""

        configreq = {
            'subresource': 'process',
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus,
            'limit': limit,
            'status': setstatus
        }
        try:
            #self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
            self.crabserver.post(api='workflowdb',
                                 data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the _lockWork request (prameters are %s)",
                configreq)
            return False

        return True

    def getWork(self, limit, getstatus, ignoreTWName=False):
        configreq = {
            'limit': limit,
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus
        }
        if ignoreTWName:
            configreq['workername'] = '%'

        pendingwork = []
        try:
            #pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data=configreq)[0]['result']
            pendingwork = self.crabserver.get(api='workflowdb',
                                              data=configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the getWork request (prameters are %s)",
                configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info(
            "Received kill request. Setting STOP flag in the master process..."
        )
        self.STOP = True

    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeeded, False otherwise
        """

        configreq = {
            'workflow': taskname,
            'command': command,
            'status': status,
            'subresource': 'state'
        }
        try:
            #self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
            self.crabserver.post(api='workflowdb',
                                 data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the updateWork request (prameters are %s)",
                configreq)
        else:
            return True  #success
        return False  #failure

    def restartQueuedTasks(self):
        """ This method is used at the TW startup and it restarts QUEUED tasks
            setting them  back again to NEW.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Restarting QUEUED task %s",
                                  task['tm_taskname'])
                self.updateWork(task['tm_taskname'], task['tm_task_command'],
                                'NEW')
            if not pendingwork:
                self.logger.info("Finished restarting QUEUED tasks (total %s)",
                                 total)
                break  #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info(
                    "Restarted %s tasks (limit %s), getting next chunk of tasks",
                    len(pendingwork), limit)

    def failBannedTask(self, task):
        """ This method is used at the TW startup and it fails NEW tasks which I do not like
        The method put those task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
         the command field.
        Initial implementation bans based on a list of usernames, other task attributes can
          be checked if needed by adding a bit of code
        Returns:
            True : if the task was declared bad and was failed
            False: for normal (good) tasks
        """
        bannedUsernames = getattr(self.config.TaskWorker, 'bannedUsernames',
                                  [])
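        # Hedged example: the configuration may carry something like
        #   config.TaskWorker.bannedUsernames = ['someUser', 'anotherUser']  # hypothetical names
        # when the attribute is absent, no username is banned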
        if task['tm_username'] in bannedUsernames:
            self.logger.debug("Forcefully failing task %s",
                              task['tm_taskname'])
            if task['tm_task_command']:
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[
                    task['tm_task_command']]
            else:
                failstatus = 'FAILED'
            self.updateWork(task['tm_taskname'], task['tm_task_command'],
                            failstatus)
            # TODO look into logging a message for the user
            return True
        return False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Restarting QUEUED tasks before startup.")
        self.restartQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while not self.STOP:
            limit = self.slaves.queueableTasks()
            if not self._lockWork(
                    limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue

            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')

            if pendingwork:
                keys = ['tm_task_command', 'tm_taskname']
                tasksInfo = [{k: v
                              for k, v in task.items() if k in keys}
                             for task in pendingwork]
                self.logger.info("Retrieved a total of %d works",
                                 len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s",
                                  str(tasksInfo))

            toInject = []
            for task in pendingwork:
                if self.failBannedTask(task):
                    continue
                if self.updateWork(task['tm_taskname'],
                                   task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[
                        task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info(
                        "Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration",
                        task['tm_taskname'])

            self.slaves.injectWorks(toInject)

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s",
                                      (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {
                        'tm_username': '******',
                        'tm_taskname': action.__module__
                    }, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d',
                             self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d',
                             self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle.")
Example #4
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :arg bool test: it tells if to run in test (no subprocesses) mode."""


        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)


        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger


        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 20,
                                   logger = self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT = self.config.TaskWorker.cmscert,
                                                         X509_USER_KEY  = self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]


    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()


    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""

        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (prameters are %s)", configreq)
            return False

        return True


    def getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the getWork request (prameters are %s)", configreq)
        return pendingwork


    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True


    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeeded, False otherwise
        """

        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (prameters are %s)", configreq)
        else:
            return True #success
        return False #failure


    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method put those
            task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                if task['tm_task_command']:
                    dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                else:
                    failstatus = 'FAILED'
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)


    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while(not self.STOP):
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue

            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')

            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))

            toInject = []
            for task in pendingwork:
                if self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info("Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration", task['tm_taskname'])

            self.slaves.injectWorks(toInject)

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_username': '******', 'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle.")
Example #5
class MasterWorker(object):
    """I am the master of the TaskWorker"""
    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :arg bool test: it tells if to run in test (no subprocesses) mode."""
        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" %
                          dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s." % os.getpid())
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if self.config.TaskWorker.mode not in MODEURL:
            raise ConfigException(
                "No mode provided: need to specify config.TaskWorker.mode in the configuration"
            )
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[
                self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl  #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[
                self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException(
                "No correct mode provided: need to specify config.TaskWorker.mode in the configuration"
            )
        self.server = HTTPRequests(resthost,
                                   self.config.TaskWorker.cmscert,
                                   self.config.TaskWorker.cmskey,
                                   retry=2)
        self.logger.debug("Hostcert: %s, hostkey: %s" %
                          (str(self.config.TaskWorker.cmscert),
                           str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
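            # default schedule: 0s, 40s, 80s, ... (retry index * 40 seconds)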
            self.config.TaskWorker.retry_interval = [
                retry * 20 * 2
                for retry in range(self.config.TaskWorker.max_retry)
            ]
        if len(self.config.TaskWorker.retry_interval) != self.config.TaskWorker.max_retry:
            raise ConfigException(
                "No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry."
            )
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost,
                                     self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost,
                                 self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker,
                                        'recurringActions', [])
        self.recurringActions = [
            self.getRecurringActionInst(name) for name in recurringActionsNames
        ]

    def getRecurringActionInst(self, actionName):
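        # import TaskWorker.Actions.Recurring.<actionName> and instantiate the class of the same name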
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName,
                         fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {
            'subresource': 'process',
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus,
            'limit': limit,
            'status': setstatus
        }
        try:
            self.server.post(self.restURInoAPI + '/workflowdb',
                             data=urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (
                str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                                  "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                                  "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" % (str(traceback.format_exc()))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (
                str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())

        return True

    def _getWork(self, limit, getstatus):
        configreq = {
            'limit': limit,
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus
        }
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb',
                                          data=configreq)[0]['result']
        except HTTPException as hte:
            self.logger.error("HTTP Error during _getWork: %s" % str(hte))
            self.logger.error("Could not get any work from the server: \n" +
                              "\tstatus: %s\n" %
                              (hte.headers.get('X-Error-Http', 'unknown')) +
                              "\treason: %s" %
                              (hte.headers.get('X-Error-Detail', 'unknown')))
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error(
                    "Task Worker could not acquire any work from the server:")
                self.logger.error("%s " % (str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" %
                                  (getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" %
                                  (getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" %
                              (str(exc)))
            self.logger.error(traceback.format_exc())
        return pendingwork

    def quit(self, code, traceback_):
        self.logger.info("Received kill request. Waiting for the workers...")
        self.STOP = True

    def updateWork(self, taskname, status):
        configreq = {
            'workflow': taskname,
            'status': status,
            'subresource': 'state'
        }
        retry = True
        while retry:
            try:
                self.server.post(self.restURInoAPI + '/workflowdb',
                                 data=urllib.urlencode(configreq))
                retry = False
            except HTTPException as hte:
                #Using a msg variable and only one self.logger.error so that messages do not get shuffled
                msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (
                    str(hte), configreq)
                msg += "\tstatus: %s\n" % (hte.headers.get(
                    'X-Error-Http', 'unknown'))
                msg += "\treason: %s\n" % (hte.headers.get(
                    'X-Error-Detail', 'unknown'))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
                msg += "%s \n" % (str(traceback.format_exc()))
                self.logger.error(msg)
                retry = False
                if int(hte.headers.get('X-Error-Http', '0')) == 503:
                    #503 - Database/Service unavailable. Maybe Intervention of CMSWEB ongoing?
                    retry = True
                    time_sleep = 30 + random.randint(10, 30)
                    self.logger.info(
                        "Sleeping %s seconds and will try to update again." %
                        str(time_sleep))
                    time.sleep(time_sleep)
            except Exception as exc:
                msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (
                    str(exc), configreq)
                self.logger.error(msg + traceback.format_exc())
                retry = False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Starting")
        while (not self.STOP):
            for status, worktype, failstatus in states():
                limit = self.slaves.queueableTasks()
                if not self._lockWork(
                        limit=limit, getstatus=status, setstatus='HOLDING'):
                    continue
                ## Warning: If we fail to retrieve tasks on HOLDING (e.g. because cmsweb is down)
                ## we may end up executing the wrong worktype later on. A solution would be to
                ## save the previous task state in a new column of the TaskDB.
                pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
                self.logger.info("Retrieved a total of %d %s works" %
                                 (len(pendingwork), worktype))
                self.logger.debug("Retrieved the following works: \n%s" %
                                  (str(pendingwork)))
                self.slaves.injectWorks([(worktype, task, failstatus, None)
                                         for task in pendingwork])
                for task in pendingwork:
                    self.updateWork(task['tm_taskname'], 'QUEUED')

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s" %
                                      (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {
                        'tm_taskname': action.__module__
                    }, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d' % self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d' %
                             self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d' %
                             self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            finished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")