Example #1
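This listing omits its module header. A minimal, hedged sketch of the imports and module-level names it relies on (the module paths, logger name, and period values are assumptions based on the usual pandaharvester layout, not part of the original listing):

import datetime
import time

from pandaharvester.harvesterconfig import harvester_config
from pandaharvester.harvestercore import core_utils
from pandaharvester.harvestercore.command_spec import CommandSpec
from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestercore.pilot_errors import PilotErrors
from pandaharvester.harvesterbody.agent_base import AgentBase

# module-level logger and throttling periods referenced in run() (assumed values)
_logger = core_utils.setup_logger('propagator')
STATS_PERIOD = 300      # seconds between bulk worker-stats reports
METRICS_PERIOD = 300    # seconds between service-metrics reports
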
class Propagator(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self._last_stats_update = None
        self._last_metrics_update = None

    # main loop
    def run(self):
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting jobs to propagate')
            sw = core_utils.get_stopwatch()
            jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs,
                                                          harvester_config.propagator.lockInterval,
                                                          harvester_config.propagator.updateInterval,
                                                          self.get_pid())
            mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time()))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            hbSuppressMap = dict()
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                retList = []
                for tmpJobSpec in jobList:
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite,
                                                                       tmpJobSpec.configID)
                        hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
                    # heartbeat is suppressed
                    if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \
                            not tmpJobSpec.not_suppress_heartbeat():
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            jobListToSkip.append(tmpJobSpec)
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                sw.reset()
                retList += self.communicator.check_jobs(jobListToCheck)
                mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time()))
                sw.reset()
                retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid())
                mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate),
                                                                              sw.get_elapsed_time()))
                # logging
                for tmpJobSpec, tmpRet in zip(jobListToSkip+jobListToCheck+jobListToUpdate, retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                  tmpJobSpec.status))
                        else:
                            mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                        tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                            tmpJobSpec.modificationTime = datetime.datetime.utcnow()
                        elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done():
                            # trigger next propagation to update remaining events
                            tmpJobSpec.trigger_propagation()
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec)
                                if tmpEvStat:
                                    if tmpEvRet is not None:
                                        tmpJobSpec.nRemainingEvents = tmpEvRet
                                    if tmpEvRet == 0:
                                        mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID))
                                        tmpRet['command'] = 'tobekilled'
                            # got kill command
                            if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                                nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID)
                                if nWorkers == 0:
                                    # no workers
                                    tmpJobSpec.status = 'cancelled'
                                    tmpJobSpec.subStatus = 'killed'
                                    tmpJobSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                               PilotErrors.pilotError[PilotErrors.ERR_PANDAKILL])
                                    tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                                    tmpJobSpec.trigger_propagation()
                        self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()})
                    else:
                        mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                       tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            sw.reset()
            workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers,
                                                              harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time()))
            # update workers in central database
            sw.reset()
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            while iWorkers < len(workSpecs):
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error('failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                   tmpWorkSpec.status))
                            # update logs
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj,
                                                                                    logOffset, logSize)
                                    if tmpStat:
                                        tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset+logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                            tmpWorkSpec.status))
            mainLog.debug('update_workers for {0} workers took {1}'.format(
                iWorkers, sw.get_elapsed_time()))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats):
                    # get worker stats
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error('failed to get worker stats for {0}'.format(siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats)
                        if tmpRet:
                            mainLog.debug('updated worker stats (command) for {0}'.format(siteName))
                        else:
                            mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(siteName,
                                                                                                           tmpStr))

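            # periodic bulk report of worker stats for all sites, throttled by STATS_PERIOD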
            if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:

                # get active UPS queues. PanDA server needs to know about them and which harvester instance is taking
                # care of them
                active_ups_queues = self.queueConfigMapper.get_active_ups_queues()

                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues)
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                                 worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(site_name,
                                                                                                        tmp_str))

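            # periodic report of service metrics to the server, throttled by METRICS_PERIOD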
            if not self._last_metrics_update \
                    or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD):
                # get latest metrics from DB
                service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update)
                if not service_metrics_list:
                    mainLog.error('failed to get service metrics')
                    self._last_metrics_update = datetime.datetime.utcnow()
                else:
                    tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list)
                    if tmp_ret:
                        mainLog.debug('update of service metrics OK')
                        self._last_metrics_update = datetime.datetime.utcnow()
                    else:
                        mainLog.error('failed to update service metrics err={0}'.format(tmp_str))

            # send dialog messages
            mainLog.debug('getting dialog messages to propagate')
            try:
                maxDialogs = harvester_config.propagator.maxDialogs
            except Exception:
                maxDialogs = 50
            diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs,
                                                                 harvester_config.propagator.lockInterval)
            mainLog.debug('got {0} dialogs'.format(len(diagSpecs)))
            if len(diagSpecs) > 0:
                tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs)
                if tmpStat:
                    diagIDs = [diagSpec.diagID for diagSpec in diagSpecs]
                    self.dbProxy.delete_dialog_messages(diagIDs)
                    mainLog.debug('sent {0} dialogs'.format(len(diagSpecs)))
                else:
                    mainLog.error('failed to send dialogs err={0}'.format(tmpStr))
            if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval:
                mainLog.warning('a single cycle was longer than lockInterval. done' + sw_main.get_elapsed_time())
            else:
                mainLog.debug('done' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
Example #2
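As above, the module header is not part of the listing; a hedged sketch of what this Submitter needs (module paths and the iteritems source are assumptions):

import datetime

from future.utils import iteritems

from pandaharvester.harvesterconfig import harvester_config
from pandaharvester.harvestercore import core_utils
from pandaharvester.harvestercore.command_spec import CommandSpec
from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestercore.work_spec import WorkSpec
from pandaharvester.harvestercore.plugin_factory import PluginFactory
from pandaharvester.harvesterbody.agent_base import AgentBase
from pandaharvester.harvesterbody.worker_maker import WorkerMaker
from pandaharvester.harvesterbody.worker_adjuster import WorkerAdjuster

_logger = core_utils.setup_logger('submitter')
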
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(lockedBy),
                                             method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime)
            mainLog.debug('got {0} queues for site {1}'.format(
                len(curWorkers), siteName))
            # get commands
            if siteName is not None:
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} commands'.format(len(commandSpecs)))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName]['nNewWorkers'] = tmpNewVal
            # define number of new workers
            if len(curWorkers) == 0:
                nWorkersPerQueue = dict()
            else:
                nWorkersPerQueue = self.workerAdjuster.define_num_workers(
                    curWorkers, siteName)
            if nWorkersPerQueue is None:
                mainLog.error(
                    'WorkerAdjuster failed to define the number of workers')
            elif len(nWorkersPerQueue) == 0:
                pass
            else:
                # loop over all queues
                for queueName, tmpVal in iteritems(nWorkersPerQueue):
                    tmpLog = core_utils.make_logger(
                        _logger,
                        'queue={0}'.format(queueName),
                        method_name='run')
                    tmpLog.debug('start')
                    nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                    nReady = tmpVal['nReady']
                    # check queue
                    if not self.queueConfigMapper.has_queue(queueName):
                        tmpLog.error('config not found')
                        continue
                    # no new workers
                    if nWorkers == 0:
                        tmpLog.debug(
                            'skipped since no new worker is needed based on current stats'
                        )
                        continue
                    # get queue
                    queueConfig = self.queueConfigMapper.get_queue(queueName)
                    # actions based on mapping type
                    if queueConfig.mapType == WorkSpec.MT_NoJob:
                        # workers without jobs
                        jobChunks = []
                        for i in range(nWorkers):
                            jobChunks.append([])
                    elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                        # one worker per one job
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, 1, None,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval, lockedBy)
                    elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                        # one worker for multiple jobs
                        nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                            queueConfig, nWorkers)
                        tmpLog.debug(
                            'nJobsPerWorker={0}'.format(nJobsPerWorker))
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, nJobsPerWorker, None,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval, lockedBy,
                            queueConfig.allowJobMixture)
                    elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                        # multiple workers for one job
                        nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                            queueConfig, nWorkers)
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, None, nWorkersPerJob,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval, lockedBy)
                    else:
                        tmpLog.error('unknown mapType={0}'.format(
                            queueConfig.mapType))
                        continue
                    tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                    if len(jobChunks) == 0:
                        continue
                    # make workers
                    okChunks, ngChunks = self.workerMaker.make_workers(
                        jobChunks, queueConfig, nReady)
                    if len(ngChunks) == 0:
                        tmpLog.debug('successfully made {0} workers'.format(
                            len(okChunks)))
                    else:
                        tmpLog.debug(
                            'made {0} workers, while {1} workers failed'.
                            format(len(okChunks), len(ngChunks)))
                    timeNow = datetime.datetime.utcnow()
                    # NG: job chunks that failed to become workers
                    for ngJobs in ngChunks:
                        for jobSpec in ngJobs:
                            jobSpec.status = 'failed'
                            jobSpec.subStatus = 'failedtomake'
                            jobSpec.stateChangeTime = timeNow
                            jobSpec.lockedBy = None
                            jobSpec.trigger_propagation()
                            self.dbProxy.update_job(jobSpec, {
                                'lockedBy': lockedBy,
                                'subStatus': 'prepared'
                            })
                    # OK: job chunks successfully made into workers
                    pandaIDs = set()
                    workSpecList = []
                    if len(okChunks) > 0:
                        for workSpec, okJobs in okChunks:
                            # has job
                            if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                    or queueConfig.mapType == WorkSpec.MT_NoJob:
                                workSpec.hasJob = 0
                            else:
                                workSpec.hasJob = 1
                                if workSpec.nJobsToReFill in [None, 0]:
                                    workSpec.set_jobspec_list(okJobs)
                                else:
                                    # refill free slots during the worker is running
                                    workSpec.set_jobspec_list(
                                        okJobs[:workSpec.nJobsToReFill])
                                    workSpec.nJobsToReFill = None
                                    for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                        pandaIDs.add(jobSpec.PandaID)
                            # map type
                            workSpec.mapType = queueConfig.mapType
                            # queue name
                            workSpec.computingSite = queueConfig.queueName
                            # set access point
                            workSpec.accessPoint = queueConfig.messenger['accessPoint']
                            # events
                            if len(okJobs) > 0 and (
                                    'eventService' in okJobs[0].jobParams
                                    or 'cloneJob' in okJobs[0].jobParams):
                                workSpec.eventsRequest = WorkSpec.EV_useEvents
                            workSpecList.append(workSpec)
                    if len(workSpecList) > 0:
                        # get plugin for submitter
                        submitterCore = self.pluginFactory.get_plugin(
                            queueConfig.submitter)
                        if submitterCore is None:
                            # not found
                            tmpLog.error(
                                'submitter plugin for {0} not found'.format(
                                    queueConfig.queueName))
                            continue
                        # get plugin for messenger
                        messenger = self.pluginFactory.get_plugin(
                            queueConfig.messenger)
                        if messenger is None:
                            # not found
                            tmpLog.error(
                                'messenger plugin for {0} not found'.format(
                                    queueConfig.queueName))
                            continue
                        # setup access points
                        messenger.setup_access_points(workSpecList)
                        # feed jobs
                        for workSpec in workSpecList:
                            if workSpec.hasJob == 1:
                                tmpStat = messenger.feed_jobs(
                                    workSpec, workSpec.get_jobspec_list())
                                if tmpStat is False:
                                    tmpLog.error(
                                        'failed to send jobs to workerID={0}'.
                                        format(workSpec.workerID))
                                else:
                                    tmpLog.debug(
                                        'sent jobs to workerID={0} with {1}'.
                                        format(workSpec.workerID, tmpStat))
                        # submit
                        tmpLog.debug('submitting {0} workers'.format(
                            len(workSpecList)))
                        workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                            submitterCore, workSpecList)
                        for iWorker, (tmpRet, tmpStr) in enumerate(
                                zip(tmpRetList, tmpStrList)):
                            workSpec, jobList = okChunks[iWorker]
                            # use associated job list since it can be truncated for re-filling
                            jobList = workSpec.get_jobspec_list()
                            # set status
                            if not tmpRet:
                                # failed submission
                                tmpLog.error(
                                    'failed to submit a workerID={0} with {1}'.
                                    format(workSpec.workerID, tmpStr))
                                workSpec.set_status(WorkSpec.ST_missed)
                                jobList = []
                            elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                # directly go to running after feeding jobs for late biding
                                workSpec.set_status(WorkSpec.ST_running)
                            else:
                                # normal successful submission
                                workSpec.set_status(WorkSpec.ST_submitted)
                            workSpec.submitTime = timeNow
                            workSpec.modificationTime = timeNow
                            # prefetch events
                            if tmpRet and workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents:
                                workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                eventsRequestParams = dict()
                                for jobSpec in jobList:
                                    eventsRequestParams[jobSpec.PandaID] = {
                                        'pandaID': jobSpec.PandaID,
                                        'taskID': jobSpec.taskID,
                                        'jobsetID': jobSpec.jobParams['jobsetID'],
                                        'nRanges': jobSpec.jobParams['coreCount'],
                                    }
                                workSpec.eventsRequestParams = eventsRequestParams
                            # register worker
                            tmpStat = self.dbProxy.register_worker(
                                workSpec, jobList, lockedBy)
                            if jobList is not None:
                                for jobSpec in jobList:
                                    pandaIDs.add(jobSpec.PandaID)
                                    if tmpStat:
                                        tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                        tmpLog.info(
                                            tmpStr.format(
                                                workSpec.workerID,
                                                jobSpec.PandaID,
                                                workSpec.batchID))
                                    else:
                                        tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                        tmpLog.error(
                                            tmpStr.format(
                                                jobSpec.PandaID,
                                                workSpec.batchID))
                    # release jobs
                    self.dbProxy.release_jobs(pandaIDs, lockedBy)
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.submitter.sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status == WorkSpec.ST_ready:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
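
Note on submit_workers above: workers already in ST_ready are not resubmitted; they are placed first in the returned lists with a True status and an empty message, and the actually submitted workers (with the plugin's per-worker results) are appended after them, so all three returned lists share that reordered sequence rather than the input order.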
Example #3
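Again a hedged import sketch (assumed paths) for the names this Preparator version references:

import datetime

from pandaharvester.harvesterconfig import harvester_config
from pandaharvester.harvestercore import core_utils
from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestercore.pilot_errors import PilotErrors
from pandaharvester.harvestercore.plugin_factory import PluginFactory
from pandaharvester.harvesterbody.agent_base import AgentBase

_logger = core_utils.setup_logger('preparator')
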
class Preparator(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()


    # main loop
    def run(self):
        lockedBy = 'preparator-{0}'.format(self.ident)
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get jobs to check')
            # get jobs to check preparation
            jobsToCheck = self.dbProxy.get_jobs_in_sub_status('preparing',
                                                              harvester_config.preparator.maxJobsToCheck,
                                                              'preparatorTime', 'lockedBy',
                                                              harvester_config.preparator.checkInterval,
                                                              harvester_config.preparator.lockInterval,
                                                              lockedBy)
            mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
            # loop over all jobs
            for jobSpec in jobsToCheck:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('start checking')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    oldSubStatus = jobSpec.subStatus
                    # get plugin
                    preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                    if preparatorCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    tmpStat, tmpStr = preparatorCore.check_status(jobSpec)
                    # still running
                    if tmpStat is None:
                        # update job
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('try to check later since still preparing with {0}'.format(tmpStr))
                        continue
                    # succeeded
                    if tmpStat is True:
                        # resolve path
                        tmpStat, tmpStr = preparatorCore.resolve_input_paths(jobSpec)
                        if tmpStat is False:
                            jobSpec.lockedBy = None
                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                              'subStatus': oldSubStatus})
                            tmpLog.error('failed to resolve input file paths : {0}'.format(tmpStr))
                            continue
                        # update job
                        jobSpec.subStatus = 'prepared'
                        jobSpec.lockedBy = None
                        jobSpec.preparatorTime = None
                        jobSpec.set_all_input_ready()
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus},
                                                update_in_file=True)
                        tmpLog.debug('succeeded')
                    else:
                        # update job
                        jobSpec.status = 'failed'
                        jobSpec.subStatus = 'failed_to_prepare'
                        jobSpec.lockedBy = None
                        jobSpec.preparatorTime = None
                        jobSpec.stateChangeTime = datetime.datetime.utcnow()
                        errStr = 'stage-in failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr)
                        jobSpec.trigger_propagation()
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.error('failed with {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to trigger preparation
            mainLog.debug('try to get jobs to prepare')
            jobsToTrigger = self.dbProxy.get_jobs_in_sub_status('fetched',
                                                                harvester_config.preparator.maxJobsToTrigger,
                                                                'preparatorTime', 'lockedBy',
                                                                harvester_config.preparator.triggerInterval,
                                                                harvester_config.preparator.lockInterval,
                                                                lockedBy,
                                                                'preparing')
            mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger)))
            # loop over all jobs
            fileStatMap = dict()
            for jobSpec in jobsToTrigger:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to trigger preparation')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    oldSubStatus = jobSpec.subStatus
                    # get plugin
                    preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                    if preparatorCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # check file status
                    if queueConfig.ddmEndpointIn not in fileStatMap:
                        fileStatMap[queueConfig.ddmEndpointIn] = dict()
                    newFileStatusData = []
                    toWait = False
                    for fileSpec in jobSpec.inFiles:
                        if fileSpec.status == 'preparing':
                            updateStatus = False
                            if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]:
                                fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \
                                    = self.dbProxy.get_file_status(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn,
                                                                   'starting')
                            if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                                # the file is ready
                                fileSpec.status = 'ready'
                                # set group info if any
                                groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, 'input',
                                                                            queueConfig.ddmEndpointIn)
                                if groupInfo is not None:
                                    fileSpec.groupID = groupInfo['groupID']
                                    fileSpec.groupStatus = groupInfo['groupStatus']
                                    fileSpec.groupUpdateTime = groupInfo['groupUpdateTime']
                                updateStatus = True
                            elif 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                                # the file is being prepared by another
                                toWait = True
                            else:
                                # change file status if the file is not prepared by another
                                fileSpec.status = 'to_prepare'
                                updateStatus = True
                            # set new status
                            if updateStatus:
                                newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status))
                                if fileSpec.status not in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                                    fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] = 0
                                fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] += 1
                    if len(newFileStatusData) > 0:
                        self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy)
                    # wait since files are being prepared by another
                    if toWait:
                        # update job
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('wait since files are being prepared by another job')
                        continue
                    # trigger preparation
                    tmpStat, tmpStr = preparatorCore.trigger_preparation(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        jobSpec.subStatus = 'preparing'
                        jobSpec.lockedBy = None
                        jobSpec.preparatorTime = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus},
                                                update_in_file=True)
                        tmpLog.debug('triggered')
                    elif tmpStat is False:
                        # fatal error
                        jobSpec.status = 'failed'
                        jobSpec.subStatus = 'failed_to_prepare'
                        jobSpec.lockedBy = None
                        jobSpec.preparatorTime = None
                        jobSpec.stateChangeTime = datetime.datetime.utcnow()
                        errStr = 'stage-in failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr)
                        jobSpec.trigger_propagation()
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('failed to trigger with {0}'.format(tmpStr))
                    else:
                        # temporary error
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('try to prepare later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.preparator.sleepTime):
                mainLog.debug('terminated')
                return
Example #4
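This newer Preparator variant additionally references JobSpec and FileSpec constants; a hedged import sketch with the same caveats as above (the exact source of the PilotErrors codes used here is an assumption):

import datetime

from pandaharvester.harvesterconfig import harvester_config
from pandaharvester.harvestercore import core_utils
from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestercore.file_spec import FileSpec
from pandaharvester.harvestercore.job_spec import JobSpec
from pandaharvester.harvestercore.pilot_errors import PilotErrors
from pandaharvester.harvestercore.plugin_factory import PluginFactory
from pandaharvester.harvesterbody.agent_base import AgentBase

_logger = core_utils.setup_logger('preparator')
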
class Preparator(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'preparator-{0}'.format(self.get_pid())
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('try to get jobs to check')
            # get jobs to check preparation
            try:
                maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToCheck
                if maxFilesPerJob <= 0:
                    maxFilesPerJob = None
            except Exception:
                maxFilesPerJob = None
            jobsToCheck = self.dbProxy.get_jobs_in_sub_status(
                'preparing',
                harvester_config.preparator.maxJobsToCheck,
                'preparatorTime',
                'lockedBy',
                harvester_config.preparator.checkInterval,
                harvester_config.preparator.lockInterval,
                lockedBy,
                max_files_per_job=maxFilesPerJob,
                ng_file_status_list=['ready'])
            mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
            # loop over all jobs
            for jobSpec in jobsToCheck:
                tmpLog = self.make_logger(_logger,
                                          'PandaID={0}'.format(
                                              jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('start checking')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(
                            jobSpec.computingSite, configID):
                        tmpLog.error(
                            'queue config for {0}/{1} not found'.format(
                                jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(
                        jobSpec.computingSite, configID)
                    oldSubStatus = jobSpec.subStatus
                    # get plugin
                    if jobSpec.auxInput in [None, JobSpec.AUX_allTriggered]:
                        preparatorCore = self.pluginFactory.get_plugin(
                            queueConfig.preparator)
                    else:
                        preparatorCore = self.pluginFactory.get_plugin(
                            queueConfig.aux_preparator)
                    if preparatorCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(
                            jobSpec.computingSite))
                        continue
                    tmpLog.debug("plugin={0}".format(
                        preparatorCore.__class__.__name__))
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(
                        jobSpec.PandaID, 'preparatorTime', 'lockedBy',
                        lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    tmpStat, tmpStr = preparatorCore.check_stage_in_status(
                        jobSpec)
                    # still running
                    if tmpStat is None:
                        # update job
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {
                            'lockedBy': lockedBy,
                            'subStatus': oldSubStatus
                        })
                        tmpLog.debug(
                            'try to check later since still preparing with {0}'
                            .format(tmpStr))
                        continue
                    # succeeded
                    if tmpStat is True:
                        # resolve path
                        tmpStat, tmpStr = preparatorCore.resolve_input_paths(
                            jobSpec)
                        if tmpStat is False:
                            jobSpec.lockedBy = None
                            self.dbProxy.update_job(jobSpec, {
                                'lockedBy': lockedBy,
                                'subStatus': oldSubStatus
                            })
                            tmpLog.error(
                                'failed to resolve input file paths : {0}'.
                                format(tmpStr))
                            continue
                        # manipulate container-related job params
                        jobSpec.manipulate_job_params_for_container()
                        # update job
                        jobSpec.lockedBy = None
                        jobSpec.set_all_input_ready()
                        if (maxFilesPerJob is None and jobSpec.auxInput is None) or \
                                (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inReady]):
                            # all done
                            allDone = True
                            jobSpec.subStatus = 'prepared'
                            jobSpec.preparatorTime = None
                            if jobSpec.auxInput is not None:
                                jobSpec.auxInput = JobSpec.AUX_allReady
                        else:
                            # immediate next lookup since there could be more files to check
                            allDone = False
                            jobSpec.trigger_preparation()
                            # change auxInput flag to check auxiliary inputs
                            if len(jobSpec.inFiles) == 0 and \
                                    jobSpec.auxInput == JobSpec.AUX_allTriggered:
                                jobSpec.auxInput = JobSpec.AUX_inReady
                        self.dbProxy.update_job(jobSpec, {
                            'lockedBy': lockedBy,
                            'subStatus': oldSubStatus
                        },
                                                update_in_file=True)
                        if allDone:
                            tmpLog.debug('succeeded')
                        else:
                            tmpLog.debug('partially succeeded')
                    else:
                        # update job
                        jobSpec.status = 'failed'
                        jobSpec.subStatus = 'failed_to_prepare'
                        jobSpec.lockedBy = None
                        jobSpec.preparatorTime = None
                        jobSpec.stateChangeTime = datetime.datetime.utcnow()
                        errStr = 'stage-in failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED,
                                                errStr)
                        jobSpec.trigger_propagation()
                        self.dbProxy.update_job(jobSpec, {
                            'lockedBy': lockedBy,
                            'subStatus': oldSubStatus
                        })
                        tmpLog.error('failed with {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to trigger preparation
            mainLog.debug('try to get jobs to prepare')
            try:
                maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToPrepare
                if maxFilesPerJob <= 0:
                    maxFilesPerJob = None
            except Exception:
                maxFilesPerJob = None
            jobsToTrigger = self.dbProxy.get_jobs_in_sub_status(
                'fetched',
                harvester_config.preparator.maxJobsToTrigger,
                'preparatorTime',
                'lockedBy',
                harvester_config.preparator.triggerInterval,
                harvester_config.preparator.lockInterval,
                lockedBy,
                'preparing',
                max_files_per_job=maxFilesPerJob,
                ng_file_status_list=['triggered', 'ready'])
            mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger)))
            # loop over all jobs
            fileStatMap = dict()
            for jobSpec in jobsToTrigger:
                tmpLog = self.make_logger(_logger,
                                          'PandaID={0}'.format(
                                              jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to trigger preparation')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(
                            jobSpec.computingSite, configID):
                        tmpLog.error(
                            'queue config for {0}/{1} not found'.format(
                                jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(
                        jobSpec.computingSite, configID)
                    oldSubStatus = jobSpec.subStatus
                    # get plugin
                    if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]:
                        preparatorCore = self.pluginFactory.get_plugin(
                            queueConfig.preparator)
                        fileType = 'input'
                    else:
                        preparatorCore = self.pluginFactory.get_plugin(
                            queueConfig.aux_preparator)
                        fileType = FileSpec.AUX_INPUT
                    if preparatorCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(
                            jobSpec.computingSite))
                        continue
                    tmpLog.debug("plugin={0}".format(
                        preparatorCore.__class__.__name__))
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(
                        jobSpec.PandaID, 'preparatorTime', 'lockedBy',
                        lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # check file status
                    if queueConfig.ddmEndpointIn not in fileStatMap:
                        fileStatMap[queueConfig.ddmEndpointIn] = dict()
                    # check if has to_prepare
                    hasToPrepare = False
                    for fileSpec in jobSpec.inFiles:
                        if fileSpec.status == 'to_prepare':
                            hasToPrepare = True
                            break
                    newFileStatusData = []
                    toWait = False
                    newInFiles = []
                    for fileSpec in jobSpec.inFiles:
                        if fileSpec.status in ['preparing', 'to_prepare']:
                            newInFiles.append(fileSpec)
                            updateStatus = False
                            if fileSpec.lfn not in fileStatMap[
                                    queueConfig.ddmEndpointIn]:
                                fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \
                                    = self.dbProxy.get_file_status(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn,
                                                                   'starting')
                            if 'ready' in fileStatMap[
                                    queueConfig.ddmEndpointIn][fileSpec.lfn]:
                                # the file is ready
                                fileSpec.status = 'ready'
                                if fileStatMap[queueConfig.ddmEndpointIn][
                                        fileSpec.lfn]['ready']['path']:
                                    fileSpec.path = list(
                                        fileStatMap[queueConfig.ddmEndpointIn][
                                            fileSpec.lfn]['ready']['path'])[0]
                                # set group info if any
                                groupInfo = self.dbProxy.get_group_for_file(
                                    fileSpec.lfn, fileType,
                                    queueConfig.ddmEndpointIn)
                                if groupInfo is not None:
                                    fileSpec.groupID = groupInfo['groupID']
                                    fileSpec.groupStatus = groupInfo[
                                        'groupStatus']
                                    fileSpec.groupUpdateTime = groupInfo[
                                        'groupUpdateTime']
                                updateStatus = True
                            elif (not hasToPrepare and
                                  'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \
                                  'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                                # the file is being prepared by another
                                toWait = True
                                if fileSpec.status != 'preparing':
                                    fileSpec.status = 'preparing'
                                    updateStatus = True
                            else:
                                # change file status if the file is not prepared by another
                                if fileSpec.status != 'to_prepare':
                                    fileSpec.status = 'to_prepare'
                                    updateStatus = True
                            # set new status
                            if updateStatus:
                                newFileStatusData.append(
                                    (fileSpec.fileID, fileSpec.lfn,
                                     fileSpec.status))
                                fileStatMap[queueConfig.ddmEndpointIn][
                                    fileSpec.lfn].setdefault(
                                        fileSpec.status, None)
                    if len(newFileStatusData) > 0:
                        self.dbProxy.change_file_status(
                            jobSpec.PandaID, newFileStatusData, lockedBy)
                    # wait since files are being prepared by another
                    if toWait:
                        # update job
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {
                            'lockedBy': lockedBy,
                            'subStatus': oldSubStatus
                        })
                        tmpLog.debug(
                            'wait since files are being prepared by another job'
                        )
                        continue
                    # trigger preparation
                    tmpStat, tmpStr = preparatorCore.trigger_preparation(
                        jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        jobSpec.lockedBy = None
                        if (maxFilesPerJob is None and jobSpec.auxInput is None) or \
                                (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inTriggered]):
                            # all done
                            allDone = True
                            jobSpec.subStatus = 'preparing'
                            jobSpec.preparatorTime = None
                            if jobSpec.auxInput is not None:
                                jobSpec.auxInput = JobSpec.AUX_allTriggered
                        else:
                            # change file status but not change job sub status since
                            # there could be more files to prepare
                            allDone = False
                            for fileSpec in jobSpec.inFiles:
                                if fileSpec.status == 'to_prepare':
                                    fileSpec.status = 'triggered'
                            # immediate next lookup
                            jobSpec.trigger_preparation()
                            # change auxInput flag to prepare auxiliary inputs
                            if len(jobSpec.inFiles) == 0 and \
                                    jobSpec.auxInput == JobSpec.AUX_hasAuxInput:
                                jobSpec.auxInput = JobSpec.AUX_inTriggered
                        self.dbProxy.update_job(jobSpec, {
                            'lockedBy': lockedBy,
                            'subStatus': oldSubStatus
                        },
                                                update_in_file=True)
                        if allDone:
                            tmpLog.debug('triggered')
                        else:
                            tmpLog.debug('partially triggered')
                    elif tmpStat is False:
                        # fatal error
                        jobSpec.status = 'failed'
                        jobSpec.subStatus = 'failed_to_prepare'
                        jobSpec.lockedBy = None
                        jobSpec.preparatorTime = None
                        jobSpec.stateChangeTime = datetime.datetime.utcnow()
                        errStr = 'stage-in failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED,
                                                errStr)
                        jobSpec.trigger_propagation()
                        self.dbProxy.update_job(jobSpec, {
                            'lockedBy': lockedBy,
                            'subStatus': oldSubStatus
                        })
                        tmpLog.debug(
                            'failed to trigger with {0}'.format(tmpStr))
                    else:
                        # temporary error
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {
                            'lockedBy': lockedBy,
                            'subStatus': oldSubStatus
                        })
                        tmpLog.debug(
                            'try to prepare later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.preparator.sleepTime):
                mainLog.debug('terminated')
                return
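
The loop above caches per-endpoint file lookups in fileStatMap so that each LFN is queried at most once per cycle, and the cached entry decides whether a file is already ready, is being staged in by another job (wait), or still has to be triggered. The stand-alone sketch below mirrors only that decision logic; decide_file_actions, status_lookup and the toy data are illustrative stand-ins, not harvester APIs.

# Sketch of the fileStatMap caching/decision pattern (assumed helper names).
def decide_file_actions(lfns, endpoint, status_lookup, file_stat_map, has_to_prepare=False):
    """Return {lfn: new_status}, querying each LFN at most once per endpoint."""
    cache = file_stat_map.setdefault(endpoint, {})
    decisions = {}
    for lfn in lfns:
        if lfn not in cache:
            cache[lfn] = status_lookup(lfn, endpoint)
        stat = cache[lfn]
        if 'ready' in stat:
            decisions[lfn] = 'ready'
        elif (not has_to_prepare and 'to_prepare' in stat) or 'triggered' in stat:
            # another job is already staging the file in, so just wait
            decisions[lfn] = 'preparing'
        else:
            # nobody is working on it yet, so this job has to trigger the stage-in
            decisions[lfn] = 'to_prepare'
    return decisions

if __name__ == '__main__':
    fake_db = {'a.root': {'ready': {'path': ['/data/a.root']}},
               'b.root': {'triggered': {}},
               'c.root': {}}
    file_stat_map = {}
    print(decide_file_actions(['a.root', 'b.root', 'c.root'], 'DDM_ENDPOINT_IN',
                              lambda lfn, ep: fake_db[lfn], file_stat_map))
    # {'a.root': 'ready', 'b.root': 'preparing', 'c.root': 'to_prepare'}
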
Example #5
class Propagator(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self._last_stats_update = None

    # main loop
    def run(self):
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(self.ident),
                                             method_name='run')
            mainLog.debug('getting jobs to propagate')
            jobSpecs = self.dbProxy.get_jobs_to_propagate(
                harvester_config.propagator.maxJobs,
                harvester_config.propagator.lockInterval,
                harvester_config.propagator.updateInterval, self.ident)
            mainLog.debug('got {0} jobs'.format(len(jobSpecs)))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            hbSuppressMap = dict()
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                retList = []
                for tmpJobSpec in jobList:
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(
                            tmpJobSpec.computingSite)
                        hbSuppressMap[tmpJobSpec.computingSite] = \
                            queueConfig.get_no_heartbeat_status()
                    # heartbeat is suppressed
                    if tmpJobSpec.status in hbSuppressMap[
                            tmpJobSpec.computingSite]:
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            jobListToSkip.append(tmpJobSpec)
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                retList += self.communicator.check_jobs(jobListToCheck)
                retList += self.communicator.update_jobs(jobListToUpdate)
                # logging
                for tmpJobSpec, tmpRet in zip(
                        jobListToSkip + jobListToCheck + jobListToUpdate,
                        retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug(
                                'updated PandaID={0} status={1}'.format(
                                    tmpJobSpec.PandaID, tmpJobSpec.status))
                        else:
                            mainLog.debug(
                                'skip updating PandaID={0} status={1}'.format(
                                    tmpJobSpec.PandaID, tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        if tmpJobSpec.is_final_status() and \
                                tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(
                                    tmpJobSpec)
                                if tmpEvStat and tmpEvRet == 0:
                                    mainLog.debug(
                                        'kill PandaID={0} due to no event'.format(
                                            tmpJobSpec.PandaID))
                                    tmpRet['command'] = 'tobekilled'
                            # got kill command
                            if 'command' in tmpRet and \
                                    tmpRet['command'] in ['tobekilled']:
                                nWorkers = self.dbProxy.kill_workers_with_job(
                                    tmpJobSpec.PandaID)
                                if nWorkers == 0:
                                    # no remaining workers
                                    tmpJobSpec.status = 'cancelled'
                                    tmpJobSpec.subStatus = 'killed'
                                    tmpJobSpec.stateChangeTime = \
                                        datetime.datetime.utcnow()
                                    tmpJobSpec.trigger_propagation()
                        self.dbProxy.update_job(tmpJobSpec,
                                                {'propagatorLock': self.ident})
                    else:
                        mainLog.error(
                            'failed to update PandaID={0} status={1}'.format(
                                tmpJobSpec.PandaID, tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            workSpecs = self.dbProxy.get_workers_to_propagate(
                harvester_config.propagator.maxWorkers,
                harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers'.format(len(workSpecs)))
            # update workers in central database
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            while iWorkers < len(workSpecs):
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error(
                        'failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug(
                                'updated workerID={0} status={1}'.format(
                                    tmpWorkSpec.workerID, tmpWorkSpec.status))
                            # update logs
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(
                                        logRemoteName, logFileObj, logOffset,
                                        logSize)
                                    if tmpStat:
                                        tmpWorkSpec.update_log_files_to_upload(
                                            logFilePath, logOffset + logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(
                                tmpWorkSpec,
                                {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error(
                                'failed to update workerID={0} status={1}'.format(
                                    tmpWorkSpec.workerID, tmpWorkSpec.status))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                if commandSpec.command.startswith(
                        CommandSpec.COM_reportWorkerStats):
                    # get worker stats
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error(
                            'failed to get worker stats for {0}'.format(
                                siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(
                            siteName, workerStats)
                        if tmpRet:
                            mainLog.debug(
                                'updated worker stats (command) for {0}'.format(
                                    siteName))
                        else:
                            mainLog.error(
                                'failed to update worker stats (command) for {0} err={1}'
                                .format(siteName, tmpStr))

            if not self._last_stats_update or \
                    time.time() - self._last_stats_update > STATS_PERIOD:
                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk()
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(
                            site_name, worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug(
                                'update of worker stats (bulk) for {0}'.format(
                                    site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error(
                                'failed to update worker stats (bulk) for {0} err={1}'
                                .format(site_name, tmp_str))

            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
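
The propagator above pushes job and worker updates to the communicator in fixed-size slices (nJobsInBulk / nWorkersInBulk) instead of one call per spec. Below is a minimal, dependency-free sketch of that slicing pattern under assumed names (propagate_in_bulk and the fake updater are not harvester code).

def propagate_in_bulk(specs, bulk_size, update_fn):
    """Call update_fn on consecutive slices of specs and collect all results."""
    results = []
    index = 0
    while index < len(specs):
        chunk = specs[index:index + bulk_size]
        index += bulk_size
        results += update_fn(chunk)
    return results

if __name__ == '__main__':
    jobs = ['job{0}'.format(n) for n in range(7)]
    # fake communicator: every update succeeds with StatusCode 0
    rets = propagate_in_bulk(jobs, 3, lambda chunk: [{'StatusCode': 0}] * len(chunk))
    print(len(rets), rets[0])  # 7 {'StatusCode': 0}
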
Example #6
class Submitter(AgentBase):
    # fifos
    monitor_fifo = MonitorFIFO()

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        while True:
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(
                    len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(
                    commandSpecs, comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource][
                                    'nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(
                        curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error(
                        'WorkerAdjuster failed to define the number of workers'
                    )
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(
                                n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(
                                _logger,
                                'id={0} queue={1} rtype={2}'.format(
                                    lockedBy, queueName, resource_type),
                                method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal[
                                    'nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(
                                        queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug(
                                        'skipped since no new worker is needed based on current stats'
                                    )
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(
                                    queueName)
                                workerMakerCore = self.workerMaker.get_plugin(
                                    queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and \
                                        workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(
                                        queueConfig, resource_type,
                                        workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' %
                                                 numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore,
                                                   'staticWorkers'):
                                            nQRWorkers = tmpVal[
                                                'nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug(
                                                'staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                (workerMakerCore.staticWorkers,
                                                 nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug(
                                                    'No left static workers, skip'
                                                )
                                                continue
                                            else:
                                                nWorkers = min(
                                                    workerMakerCore.staticWorkers - nQRWorkers,
                                                    nWorkers)
                                                tmpLog.debug(
                                                    'staticWorkers: %s, nWorkers: %s' %
                                                    (workerMakerCore.staticWorkers,
                                                     nWorkers))
                                        else:
                                            tmpLog.debug(
                                                'skip since no resources are ready'
                                            )
                                            continue
                                    else:
                                        nWorkers = min(nWorkers,
                                                       numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and \
                                        workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(
                                        nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady,
                                        nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(
                                        nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, None,
                                        nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(
                                        queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(
                                    len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(
                                    jobChunks,
                                    queueConfig,
                                    nReady,
                                    resource_type,
                                    maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug(
                                        'successfully made {0} workers'.format(
                                            len(okChunks)))
                                else:
                                    tmpLog.debug(
                                        'made {0} workers, while {1} workers failed'
                                        .format(len(okChunks), len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(
                                                jobSpec, {
                                                    'lockedBy': lockedBy,
                                                    'subStatus': 'prepared'
                                                })
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots during the worker is running;
                                                # jobs beyond the free slots are released below
                                                nJobsToReFill = workSpec.nJobsToReFill
                                                workSpec.set_jobspec_list(
                                                    okJobs[:nJobsToReFill])
                                                workSpec.nJobsToReFill = None
                                                for jobSpec in okJobs[nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger[
                                            'accessPoint']
                                        # sync level
                                        workSpec.syncLevel = \
                                            queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(
                                        queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'
                                            .format(jobSpec.computingSite))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(
                                        queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'
                                            .format(jobSpec.computingSite))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(
                                                workSpec,
                                                workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'
                                                    .format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'
                                                    .format(
                                                        workSpec.workerID,
                                                        tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(
                                        workSpecList, lockedBy)
                                    # submit
                                    tmpLog.info(
                                        'submitting {0} workers'.format(
                                            len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                        submitterCore, workSpecList)
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(
                                                WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(
                                                            jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late biding
                                            workSpec.set_status(
                                                WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(
                                                WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({
                                                'lastCheckAt':
                                                timeNow_timestamp
                                            })
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(
                                            workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(
                                                            tmpStr.format(
                                                                workSpec.workerID,
                                                                jobSpec.PandaID,
                                                                workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(
                                                            tmpStr.format(
                                                                workSpec.workerID,
                                                                jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(
                                                        tmpStr.format(
                                                            jobSpec.PandaID,
                                                            workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        monitor_fifo.put(
                                            (queueName, workSpecsToEnqueue))
                                        mainLog.debug(
                                            'put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter,
                                         'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + \
                            datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute(
                            'submitTime', newTime, site_name=siteName)
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
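
The submit_workers wrapper above pre-fills success entries for workers that are already ready or running and hands only the remaining ones to the submitter plugin, then appends the plugin results so the three returned lists stay aligned. A self-contained illustration of that contract follows; DummySubmitter and the plain status strings are stand-ins, not harvester classes.

class DummySubmitter(object):
    """Stand-in for a submitter plugin; pretends every submission succeeds."""
    def submit_workers(self, workspec_list):
        return [(True, 'batchID={0}'.format(i)) for i, _ in enumerate(workspec_list)]

def submit_skipping_ready(submitter_core, workers, skip_statuses=('ready', 'running')):
    newSpecList, retList, strList = [], [], []
    workersToSubmit = []
    for status, name in workers:
        if status in skip_statuses:
            # already ready/running: record a dummy success without resubmitting
            newSpecList.append(name)
            retList.append(True)
            strList.append('')
        else:
            workersToSubmit.append(name)
    for tmpRet, tmpStr in submitter_core.submit_workers(workersToSubmit):
        retList.append(tmpRet)
        strList.append(tmpStr)
    newSpecList += workersToSubmit
    return newSpecList, retList, strList

if __name__ == '__main__':
    workers = [('ready', 'w1'), ('new', 'w2'), ('new', 'w3')]
    print(submit_skipping_ready(DummySubmitter(), workers))
    # (['w1', 'w2', 'w3'], [True, True, True], ['', 'batchID=0', 'batchID=1'])
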
Example #7
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
                                    harvester_config.submitter.lockInterval)
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
                                                                             harvester_config.submitter.lookupTime,
                                                                             harvester_config.submitter.lockInterval,
                                                                             lockedBy, queueLockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy,
                                                                                                   queueName,
                                                                                                   resource_type),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug('skipped since no new worker is needed based on current stats')
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(queueName)
                                workerMakerCore = self.workerMaker.get_plugin(queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(queueConfig,
                                                                                             resource_type,
                                                                                             workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('No left static workers, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
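                                # okChunks is a list of (workSpec, jobs) pairs for workers that were made; ngChunks holds the job lists that could not get a worker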
                                okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                                   nReady, resource_type,
                                                                                   maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                                else:
                                    tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks),
                                                                                                     len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
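                                # jobs in failed chunks are either released for another attempt (skipOnFail) or marked as failed and propagated to the server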
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                              'subStatus': 'prepared'})
                                # OK
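                                # finalize the WorkSpec of each successfully made worker and attach its jobs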
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots while the worker is running
                                                nJobsToReFill = workSpec.nJobsToReFill
                                                workSpec.set_jobspec_list(okJobs[:nJobsToReFill])
                                                workSpec.nJobsToReFill = None
                                                # release the jobs that do not fit into the refill slots
                                                for jobSpec in okJobs[nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'.format(queueConfig.queueName))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'.format(queueConfig.queueName))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'.format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'.format(workSpec.workerID,
                                                                                                tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
                                                                                               workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(len(workSpecList))
                                                    + sw.get_elapsed_time())
                                    # collect successful jobs
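                                    # PandaIDs that got at least one worker submitted; used below to avoid counting a failed attempt against them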
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
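                                        # missed on failed submission, running when jobs were already fed for late binding, submitted otherwise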
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID,
                                                tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
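                                                    # keep the job attached to the failed worker when tmpRet is False (permanent error) or the attempt limit is reached; otherwise just record the extra attempt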
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late binding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
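                                        # build per-PandaID event request parameters; nRanges is the worker cores per job, at least the job's coreCount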
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
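                                        # persist the worker with its (possibly truncated) job list; tmpStat tells whether registration succeeded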
                                        tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                                  jobSpec.PandaID,
                                                                                  workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(workSpec.workerID,
                                                                                   jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID))
                                    # enqueue to monitor fifo
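                                    # submitted or running workers get their first monitor check after the shorter of the two configured check intervals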
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        check_delay = min(
                                                        getattr(harvester_config.monitor, 'eventBasedCheckInterval',
                                                                harvester_config.monitor.checkInterval),
                                                        getattr(harvester_config.monitor, 'fifoCheckInterval',
                                                                harvester_config.monitor.checkInterval))
                                        monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay)
                                        tmpLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                # release the site
                self.dbProxy.release_site(siteName, lockedBy)
                if sw_main.get_elapsed_time_in_sec() > queueLockInterval:
                    mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval)
                                    + sw_main.get_elapsed_time())
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName)

            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
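    # workers already in ready/running state bypass the submitter plugin and are reported as successfully
    # submitted; they are placed first in every returned list so the three lists stay aligned by index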
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)

        # register the new workers with the APF monitoring service (apfmon)
        self.apfmon.create_workers(workersToSubmit)

        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
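
Note on the submitter-plugin contract used above: submit_workers() passes the workers that still need submission to the plugin configured as queueConfig.submitter and expects one (status, message) tuple per worker, in the same order, with the plugin filling in batchID on success. The following is a minimal sketch of such a plugin, not harvester's actual implementation: the class name SketchSubmitter and the uuid-based batchID are invented for illustration, and the PluginBase import path is assumed from the usual harvester plugin layout.

import uuid

from pandaharvester.harvestercore.plugin_base import PluginBase


class SketchSubmitter(PluginBase):
    # toy submitter: pretends every worker was handed to a batch system
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)

    # called by Submitter.submit_workers with the workers that still need submission
    def submit_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:
            # a real plugin would contact the batch system here and could return
            # (False, msg) for permanent failures or (None, msg) for temporary ones
            workSpec.batchID = 'sketch.{0}'.format(uuid.uuid4().hex)
            retList.append((True, ''))
        return retList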