def get_work_attributes(self, workspec):
        """Collect job attributes and job reports for all jobs of a worker.

        For each PandaID in workspec.pandaid_list, look under the job's
        access point for the attributes JSON file (jsonAttrsFileName) and
        the job report JSON file (jsonJobReport). Missing or unparsable
        files are tolerated: the job simply gets an empty (or partial)
        attribute dict, and the problem is logged at debug level.

        :param workspec: worker spec whose pandaid_list is scanned
        :return: dict mapping PandaID -> attribute dict; the job report,
                 when present, is stored under the 'metaData' key
        """
        # get logger
        tmpLog = core_utils.make_logger(_logger,
                                        'workerID={0}'.format(
                                            workspec.workerID),
                                        method_name='get_work_attributes')
        allRetDict = dict()
        # count of job report files successfully read (for the timing summary)
        numofreads = 0
        # stopwatch over the whole scan
        sw_readreports = core_utils.get_stopwatch()
        for pandaID in workspec.pandaid_list:
            # look for the json just under the access point
            accessPoint = self.get_access_point(workspec, pandaID)
            jsonFilePath = os.path.join(accessPoint, jsonAttrsFileName)
            tmpLog.debug(
                'looking for attributes file {0}'.format(jsonFilePath))
            retDict = dict()
            if not os.path.exists(jsonFilePath):
                # not found
                tmpLog.debug('not found attributes file')
            else:
                try:
                    with open(jsonFilePath) as jsonFile:
                        retDict = json.load(jsonFile)
                except Exception:
                    # best-effort: a corrupt/partial file is skipped, not fatal
                    tmpLog.debug('failed to load {0}'.format(jsonFilePath))
            # look for job report
            jsonFilePath = os.path.join(accessPoint, jsonJobReport)
            tmpLog.debug(
                'looking for job report file {0}'.format(jsonFilePath))
            # stopwatch for the existence check + read of this job report
            sw_checkjobrep = core_utils.get_stopwatch()
            if not os.path.exists(jsonFilePath):
                # not found
                tmpLog.debug('not found job report file')
            else:
                try:
                    sw_readrep = core_utils.get_stopwatch()
                    with open(jsonFilePath) as jsonFile:
                        tmpDict = json.load(jsonFile)
                    # job report is attached under the reserved 'metaData' key
                    retDict['metaData'] = tmpDict
                    tmpLog.debug('got {0} kB of job report. {1} sec.'.format(
                        os.stat(jsonFilePath).st_size / 1024,
                        sw_readrep.get_elapsed_time()))
                    numofreads += 1
                except Exception:
                    tmpLog.debug('failed to load {0}'.format(jsonFilePath))
            tmpLog.debug("Check file and read file time: {0} sec.".format(
                sw_checkjobrep.get_elapsed_time()))
            allRetDict[pandaID] = retDict

        tmpLog.debug("Reading {0} job report files {1}".format(
            numofreads, sw_readreports.get_elapsed_time()))
        return allRetDict
    def get_work_attributes(self, workspec):
        # get logger
        tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='get_work_attributes')
        allRetDict = dict()
        numofreads = 0
        sw_readreports = core_utils.get_stopwatch()
        for pandaID in workspec.pandaid_list:
            # look for the json just under the access point
            accessPoint = self.get_access_point(workspec, pandaID)
            jsonFilePath = os.path.join(accessPoint, jsonAttrsFileName)
            tmpLog.debug('looking for attributes file {0}'.format(jsonFilePath))
            retDict = dict()
            if not os.path.exists(jsonFilePath):
                # not found
                tmpLog.debug('not found attributes file')
            else:
                try:
                    with open(jsonFilePath) as jsonFile:
                        retDict = json.load(jsonFile)
                except Exception:
                    tmpLog.debug('failed to load {0}'.format(jsonFilePath))
            # look for job report
            jsonFilePath = os.path.join(accessPoint, jsonJobReport)
            tmpLog.debug('looking for job report file {0}'.format(jsonFilePath))
            sw_checkjobrep = core_utils.get_stopwatch()
            if not os.path.exists(jsonFilePath):
                # not found
                tmpLog.debug('not found job report file')
            else:
                try:
                    sw_readrep = core_utils.get_stopwatch()
                    with open(jsonFilePath) as jsonFile:
                        tmpDict = json.load(jsonFile)
                    retDict['metaData'] = tmpDict
                    tmpLog.debug('got {0} kB of job report. {1} sec.'.format(os.stat(jsonFilePath).st_size / 1024,
                                                                             sw_readrep.get_elapsed_time()))
                    numofreads += 1
                except Exception:
                    tmpLog.debug('failed to load {0}'.format(jsonFilePath))
            tmpLog.debug("Check file and read file time: {0} sec.".format(sw_checkjobrep.get_elapsed_time()))
            allRetDict[pandaID] = retDict

        tmpLog.debug("Reading {0} job report files {1}".format(numofreads, sw_readreports.get_elapsed_time()))
        return allRetDict
Example #3
0
 def post_ssl(self, path, data, cert=None, base_url=None):
     """POST data to a PanDA server endpoint over SSL.

     :param path: URL path appended to the base URL
     :param data: payload to POST
     :param cert: (cert_file, key_file) pair; defaults to the client
                  certificate from harvester_config.pandacon
     :param base_url: base URL; defaults to
                      harvester_config.pandacon.pandaURLSSL
     :return: (True, response object) on HTTP 200, otherwise
              (False, error message string)
     """
     try:
         tmpLog = None
         if self.verbose:
             tmpLog = self.make_logger(method_name='post_ssl')
             # unique ID per invocation so request/response log lines can be
             # correlated; with useInspect it is prefixed by the caller's
             # function name. NOTE: the original code overwrote the inspect
             # result unconditionally, making the useInspect branch dead.
             tmpExec = str(uuid.uuid4())
             if self.useInspect:
                 tmpExec = inspect.stack()[1][3] + '/' + tmpExec
         if base_url is None:
             base_url = harvester_config.pandacon.pandaURLSSL
         url = '{0}/{1}'.format(base_url, path)
         if self.verbose:
             tmpLog.debug('exec={0} URL={1} data={2}'.format(
                 tmpExec, url, str(data)))
         if cert is None:
             cert = (harvester_config.pandacon.cert_file,
                     harvester_config.pandacon.key_file)
         # session with randomized DNS resolution to spread server load
         session = get_http_adapter_with_random_dns_resolution()
         sw = core_utils.get_stopwatch()
         res = session.post(url,
                            data=data,
                            headers={
                                "Accept": "application/json",
                                "Connection": "close"
                            },
                            timeout=harvester_config.pandacon.timeout,
                            verify=harvester_config.pandacon.ca_cert,
                            cert=cert)
         if self.verbose:
             tmpLog.debug('exec={0} code={1} {3}. return={2}'.format(
                 tmpExec, res.status_code, res.text, sw.get_elapsed_time()))
         if res.status_code == 200:
             return True, res
         else:
             errMsg = 'StatusCode={0} {1}'.format(res.status_code, res.text)
     except Exception:
         # report the exception type/value and the traceback in the message
         errType, errValue = sys.exc_info()[:2]
         errMsg = "failed to post with {0}:{1} ".format(errType, errValue)
         errMsg += traceback.format_exc()
     return False, errMsg
 def get_jobs(self, site_name, node_name, prod_source_label,
              computing_element, n_jobs, additional_criteria):
     """Request up to n_jobs jobs from the PanDA server via the getJob API.

     :param site_name: PanDA site name
     :param node_name: worker node name
     :param prod_source_label: production source label
     :param computing_element: CE name
     :param n_jobs: number of jobs to request
     :param additional_criteria: optional iterable of (key, value) pairs
                                 merged into the request payload
     :return: (list of job dicts, error string); error string is 'OK'
              on success
     """
     tmp_log = self.make_logger('siteName={0}'.format(site_name),
                                method_name='get_jobs')
     tmp_log.debug('try to get {0} jobs'.format(n_jobs))
     # build the request payload
     payload = {
         'siteName': site_name,
         'node': node_name,
         'prodSourceLabel': prod_source_label,
         'computingElement': computing_element,
         'nJobs': n_jobs,
         'schedulerID': 'harvester-{0}'.format(
             harvester_config.master.harvester_id),
     }
     if additional_criteria is not None:
         for crit_key, crit_val in additional_criteria:
             payload[crit_key] = crit_val
     sw_post = core_utils.get_stopwatch()
     tmp_stat, tmp_res = self.post_ssl('getJob', payload)
     tmp_log.debug('getJob for {0} jobs {1}'.format(n_jobs,
                                                    sw_post.get_elapsed_time()))
     err_str = 'OK'
     if tmp_stat is False:
         # communication-level failure
         err_str = core_utils.dump_error_message(tmp_log, tmp_res)
         return [], err_str
     try:
         res_dict = tmp_res.json()
         tmp_log.debug('StatusCode={0}'.format(res_dict['StatusCode']))
         if res_dict['StatusCode'] == 0:
             tmp_log.debug('got {0} jobs'.format(len(res_dict['jobs'])))
             return res_dict['jobs'], err_str
         # server-side failure: extract the server's error message if any
         if 'errorDialog' in res_dict:
             err_str = res_dict['errorDialog']
         else:
             err_str = "StatusCode={0}".format(res_dict['StatusCode'])
     except Exception:
         err_str = core_utils.dump_error_message(tmp_log, tmp_res)
     return [], err_str
 def get_jobs(self, site_name, node_name, prod_source_label, computing_element, n_jobs,
              additional_criteria):
     """Request up to n_jobs jobs from the PanDA server via the getJob API.

     :param site_name: PanDA site name
     :param node_name: worker node name
     :param prod_source_label: production source label
     :param computing_element: CE name
     :param n_jobs: number of jobs to request
     :param additional_criteria: optional iterable of (key, value) pairs
                                 merged into the request payload
     :return: (list of job dicts, error string); error string is 'OK' on success
     """
     # get logger
     tmpLog = self.make_logger('siteName={0}'.format(site_name), method_name='get_jobs')
     tmpLog.debug('try to get {0} jobs'.format(n_jobs))
     # build request payload
     data = {}
     data['siteName'] = site_name
     data['node'] = node_name
     data['prodSourceLabel'] = prod_source_label
     data['computingElement'] = computing_element
     data['nJobs'] = n_jobs
     data['schedulerID'] = 'harvester-{0}'.format(harvester_config.master.harvester_id)
     if additional_criteria is not None:
         for tmpKey, tmpVal in additional_criteria:
             data[tmpKey] = tmpVal
     sw = core_utils.get_stopwatch()
     tmpStat, tmpRes = self.post_ssl('getJob', data)
     tmpLog.debug('getJob for {0} jobs {1}'.format(n_jobs, sw.get_elapsed_time()))
     errStr = 'OK'
     if tmpStat is False:
         # communication-level failure
         errStr = core_utils.dump_error_message(tmpLog, tmpRes)
     else:
         try:
             tmpDict = tmpRes.json()
             tmpLog.debug('StatusCode={0}'.format(tmpDict['StatusCode']))
             if tmpDict['StatusCode'] == 0:
                 tmpLog.debug('got {0} jobs'.format(len(tmpDict['jobs'])))
                 return tmpDict['jobs'], errStr
             else:
                 # server-side failure: prefer the server's error dialog
                 if 'errorDialog' in tmpDict:
                     errStr = tmpDict['errorDialog']
                 else:
                     errStr = "StatusCode={0}".format(tmpDict['StatusCode'])
             return [], errStr
         except Exception:
             # response body was not valid JSON
             errStr = core_utils.dump_error_message(tmpLog, tmpRes)
     return [], errStr
 def post_ssl(self, path, data, cert=None):
     """POST data to the PanDA server over SSL.

     :param path: URL path appended to harvester_config.pandacon.pandaURLSSL
     :param data: payload to POST
     :param cert: (cert_file, key_file) pair; defaults to the client
                  certificate from harvester_config.pandacon
     :return: (True, response object) on HTTP 200, otherwise
              (False, error message string)
     """
     try:
         tmpLog = None
         if self.verbose:
             tmpLog = self.make_logger(method_name='post_ssl')
             # unique ID per invocation so request/response log lines can be
             # correlated; with useInspect it is prefixed by the caller's
             # function name. NOTE: the original code overwrote the inspect
             # result unconditionally, making the useInspect branch dead.
             tmpExec = str(uuid.uuid4())
             if self.useInspect:
                 tmpExec = inspect.stack()[1][3] + '/' + tmpExec
         url = '{0}/{1}'.format(harvester_config.pandacon.pandaURLSSL, path)
         if self.verbose:
             tmpLog.debug('exec={0} URL={1} data={2}'.format(tmpExec, url, str(data)))
         if cert is None:
             cert = (harvester_config.pandacon.cert_file,
                     harvester_config.pandacon.key_file)
         sw = core_utils.get_stopwatch()
         res = requests.post(url,
                             data=data,
                             headers={"Accept": "application/json",
                                      "Connection": "close"},
                             timeout=harvester_config.pandacon.timeout,
                             verify=harvester_config.pandacon.ca_cert,
                             cert=cert)
         if self.verbose:
             tmpLog.debug('exec={0} code={1} {3}. return={2}'.format(tmpExec, res.status_code, res.text,
                                                                     sw.get_elapsed_time()))
         if res.status_code == 200:
             return True, res
         else:
             errMsg = 'StatusCode={0} {1}'.format(res.status_code,
                                                  res.text)
     except Exception:
         # report the exception type/value and the traceback in the message
         errType, errValue = sys.exc_info()[:2]
         errMsg = "failed to post with {0}:{1} ".format(errType, errValue)
         errMsg += traceback.format_exc()
     return False, errMsg
Example #7
0
 def run(self):
     """Main loop of the preparator agent.

     Repeats two phases until terminated:
     1. Check jobs in 'preparing' sub-status: ask the preparator plugin for
        stage-in status and move the job to 'prepared' (resolving input
        paths) or 'failed_to_prepare'.
     2. Trigger preparation for jobs in 'fetched' sub-status: resolve the
        per-file stage-in state (ready / to_prepare / being prepared by
        another job) and call the plugin's trigger_preparation.
     Jobs are locked via the DB proxy before any state change; all
     per-job exceptions are caught and logged so the loop keeps running.
     """
     lockedBy = 'preparator-{0}'.format(self.ident)
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
         mainLog.debug('try to get jobs to check')
         # get jobs to check preparation
         jobsToCheck = self.dbProxy.get_jobs_in_sub_status('preparing',
                                                           harvester_config.preparator.maxJobsToCheck,
                                                           'preparatorTime', 'lockedBy',
                                                           harvester_config.preparator.checkInterval,
                                                           harvester_config.preparator.lockInterval,
                                                           lockedBy)
         mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
         # loop over all jobs
         for jobSpec in jobsToCheck:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('start checking')
                 # configID is only meaningful when dynamic plugin change is enabled
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 # use the normalized configID so the lookup matches the
                 # has_queue check above (and the trigger loop below);
                 # previously jobSpec.configID was passed here
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 oldSubStatus = jobSpec.subStatus
                 # get plugin
                 preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                 if preparatorCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 tmpStat, tmpStr = preparatorCore.check_status(jobSpec)
                 # still running
                 if tmpStat is None:
                     # update job
                     jobSpec.lockedBy = None
                     self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                       'subStatus': oldSubStatus})
                     tmpLog.debug('try to check later since still preparing with {0}'.format(tmpStr))
                     continue
                 # succeeded
                 if tmpStat is True:
                     # resolve path
                     tmpStat, tmpStr = preparatorCore.resolve_input_paths(jobSpec)
                     if tmpStat is False:
                         jobSpec.lockedBy = None
                         self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                           'subStatus': oldSubStatus})
                         tmpLog.error('failed to resolve input file paths : {0}'.format(tmpStr))
                         continue
                     # update job
                     jobSpec.subStatus = 'prepared'
                     jobSpec.lockedBy = None
                     jobSpec.preparatorTime = None
                     jobSpec.set_all_input_ready()
                     self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                       'subStatus': oldSubStatus},
                                             update_in_file=True)
                     tmpLog.debug('succeeded')
                 else:
                     # fatal stage-in error: fail the job and propagate
                     jobSpec.status = 'failed'
                     jobSpec.subStatus = 'failed_to_prepare'
                     jobSpec.lockedBy = None
                     jobSpec.preparatorTime = None
                     jobSpec.stateChangeTime = datetime.datetime.utcnow()
                     errStr = 'stage-in failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr)
                     jobSpec.trigger_propagation()
                     self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                       'subStatus': oldSubStatus})
                     tmpLog.error('failed with {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to trigger preparation
         mainLog.debug('try to get jobs to prepare')
         jobsToTrigger = self.dbProxy.get_jobs_in_sub_status('fetched',
                                                             harvester_config.preparator.maxJobsToTrigger,
                                                             'preparatorTime', 'lockedBy',
                                                             harvester_config.preparator.triggerInterval,
                                                             harvester_config.preparator.lockInterval,
                                                             lockedBy,
                                                             'preparing')
         mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger)))
         # loop over all jobs
         # fileStatMap caches per-DDM-endpoint file status within this cycle
         fileStatMap = dict()
         for jobSpec in jobsToTrigger:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to trigger preparation')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 oldSubStatus = jobSpec.subStatus
                 # get plugin
                 preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                 if preparatorCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # check file status
                 if queueConfig.ddmEndpointIn not in fileStatMap:
                     fileStatMap[queueConfig.ddmEndpointIn] = dict()
                 newFileStatusData = []
                 toWait = False
                 for fileSpec in jobSpec.inFiles:
                     if fileSpec.status == 'preparing':
                         updateStatus = False
                         if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]:
                             fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \
                                 = self.dbProxy.get_file_status(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn,
                                                                'starting')
                         if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                             # the file is ready
                             fileSpec.status = 'ready'
                             # set group info if any
                             groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, 'input',
                                                                         queueConfig.ddmEndpointIn)
                             if groupInfo is not None:
                                 fileSpec.groupID = groupInfo['groupID']
                                 fileSpec.groupStatus = groupInfo['groupStatus']
                                 fileSpec.groupUpdateTime = groupInfo['groupUpdateTime']
                             updateStatus = True
                         elif 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                             # the file is being prepared by another
                             toWait = True
                         else:
                             # change file status if the file is not prepared by another
                             fileSpec.status = 'to_prepare'
                             updateStatus = True
                         # set new status
                         if updateStatus:
                             newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status))
                             if fileSpec.status not in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                                 fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] = 0
                             fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] += 1
                 if len(newFileStatusData) > 0:
                     self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy)
                 # wait since files are being prepared by another
                 if toWait:
                     # update job
                     jobSpec.lockedBy = None
                     self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                       'subStatus': oldSubStatus})
                     tmpLog.debug('wait since files are being prepared by another job')
                     continue
                 # trigger preparation
                 tmpStat, tmpStr = preparatorCore.trigger_preparation(jobSpec)
                 # check result
                 if tmpStat is True:
                     # succeeded
                     jobSpec.subStatus = 'preparing'
                     jobSpec.lockedBy = None
                     jobSpec.preparatorTime = None
                     self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                       'subStatus': oldSubStatus},
                                             update_in_file=True)
                     tmpLog.debug('triggered')
                 elif tmpStat is False:
                     # fatal error
                     jobSpec.status = 'failed'
                     jobSpec.subStatus = 'failed_to_prepare'
                     jobSpec.lockedBy = None
                     jobSpec.preparatorTime = None
                     jobSpec.stateChangeTime = datetime.datetime.utcnow()
                     errStr = 'stage-in failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr)
                     jobSpec.trigger_propagation()
                     self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                       'subStatus': oldSubStatus})
                     tmpLog.debug('failed to trigger with {0}'.format(tmpStr))
                 else:
                     # temporary error
                     jobSpec.lockedBy = None
                     self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                       'subStatus': oldSubStatus})
                     tmpLog.debug('try to prepare later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.preparator.sleepTime):
             mainLog.debug('terminated')
             return
Example #8
0
 def run(self):
     lockedBy = 'preparator-{0}'.format(self.get_pid())
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger,
                                    'id={0}'.format(lockedBy),
                                    method_name='run')
         mainLog.debug('try to get jobs to check')
         # get jobs to check preparation
         try:
             maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToCheck
             if maxFilesPerJob <= 0:
                 maxFilesPerJob = None
         except Exception:
             maxFilesPerJob = None
         jobsToCheck = self.dbProxy.get_jobs_in_sub_status(
             'preparing',
             harvester_config.preparator.maxJobsToCheck,
             'preparatorTime',
             'lockedBy',
             harvester_config.preparator.checkInterval,
             harvester_config.preparator.lockInterval,
             lockedBy,
             max_files_per_job=maxFilesPerJob,
             ng_file_status_list=['ready'])
         mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
         # loop over all jobs
         for jobSpec in jobsToCheck:
             tmpLog = self.make_logger(_logger,
                                       'PandaID={0}'.format(
                                           jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('start checking')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(
                         jobSpec.computingSite, configID):
                     tmpLog.error(
                         'queue config for {0}/{1} not found'.format(
                             jobSpec.computingSite, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     jobSpec.computingSite, jobSpec.configID)
                 oldSubStatus = jobSpec.subStatus
                 # get plugin
                 if jobSpec.auxInput in [None, JobSpec.AUX_allTriggered]:
                     preparatorCore = self.pluginFactory.get_plugin(
                         queueConfig.preparator)
                 else:
                     preparatorCore = self.pluginFactory.get_plugin(
                         queueConfig.aux_preparator)
                 if preparatorCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(
                         jobSpec.computingSite))
                     continue
                 tmpLog.debug("plugin={0}".format(
                     preparatorCore.__class__.__name__))
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(
                     jobSpec.PandaID, 'preparatorTime', 'lockedBy',
                     lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 tmpStat, tmpStr = preparatorCore.check_stage_in_status(
                     jobSpec)
                 # still running
                 if tmpStat is None:
                     # update job
                     jobSpec.lockedBy = None
                     self.dbProxy.update_job(jobSpec, {
                         'lockedBy': lockedBy,
                         'subStatus': oldSubStatus
                     })
                     tmpLog.debug(
                         'try to check later since still preparing with {0}'
                         .format(tmpStr))
                     continue
                 # succeeded
                 if tmpStat is True:
                     # resolve path
                     tmpStat, tmpStr = preparatorCore.resolve_input_paths(
                         jobSpec)
                     if tmpStat is False:
                         jobSpec.lockedBy = None
                         self.dbProxy.update_job(jobSpec, {
                             'lockedBy': lockedBy,
                             'subStatus': oldSubStatus
                         })
                         tmpLog.error(
                             'failed to resolve input file paths : {0}'.
                             format(tmpStr))
                         continue
                     # manipulate container-related job params
                     jobSpec.manipulate_job_params_for_container()
                     # update job
                     jobSpec.lockedBy = None
                     jobSpec.set_all_input_ready()
                     if (maxFilesPerJob is None and jobSpec.auxInput is None) or \
                             (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inReady]):
                         # all done
                         allDone = True
                         jobSpec.subStatus = 'prepared'
                         jobSpec.preparatorTime = None
                         if jobSpec.auxInput is not None:
                             jobSpec.auxInput = JobSpec.AUX_allReady
                     else:
                         # immediate next lookup since there could be more files to check
                         allDone = False
                         jobSpec.trigger_preparation()
                         # change auxInput flag to check auxiliary inputs
                         if len(
                                 jobSpec.inFiles
                         ) == 0 and jobSpec.auxInput == JobSpec.AUX_allTriggered:
                             jobSpec.auxInput = JobSpec.AUX_inReady
                     self.dbProxy.update_job(jobSpec, {
                         'lockedBy': lockedBy,
                         'subStatus': oldSubStatus
                     },
                                             update_in_file=True)
                     if allDone:
                         tmpLog.debug('succeeded')
                     else:
                         tmpLog.debug('partially succeeded')
                 else:
                     # update job
                     jobSpec.status = 'failed'
                     jobSpec.subStatus = 'failed_to_prepare'
                     jobSpec.lockedBy = None
                     jobSpec.preparatorTime = None
                     jobSpec.stateChangeTime = datetime.datetime.utcnow()
                     errStr = 'stage-in failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED,
                                             errStr)
                     jobSpec.trigger_propagation()
                     self.dbProxy.update_job(jobSpec, {
                         'lockedBy': lockedBy,
                         'subStatus': oldSubStatus
                     })
                     tmpLog.error('failed with {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to trigger preparation
         mainLog.debug('try to get jobs to prepare')
         try:
             maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToPrepare
             if maxFilesPerJob <= 0:
                 maxFilesPerJob = None
         except Exception:
             maxFilesPerJob = None
         jobsToTrigger = self.dbProxy.get_jobs_in_sub_status(
             'fetched',
             harvester_config.preparator.maxJobsToTrigger,
             'preparatorTime',
             'lockedBy',
             harvester_config.preparator.triggerInterval,
             harvester_config.preparator.lockInterval,
             lockedBy,
             'preparing',
             max_files_per_job=maxFilesPerJob,
             ng_file_status_list=['triggered', 'ready'])
         mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger)))
         # loop over all jobs
         fileStatMap = dict()
         for jobSpec in jobsToTrigger:
             tmpLog = self.make_logger(_logger,
                                       'PandaID={0}'.format(
                                           jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to trigger preparation')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(
                         jobSpec.computingSite, configID):
                     tmpLog.error(
                         'queue config for {0}/{1} not found'.format(
                             jobSpec.computingSite, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     jobSpec.computingSite, configID)
                 oldSubStatus = jobSpec.subStatus
                 # get plugin
                 if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]:
                     preparatorCore = self.pluginFactory.get_plugin(
                         queueConfig.preparator)
                     fileType = 'input'
                 else:
                     preparatorCore = self.pluginFactory.get_plugin(
                         queueConfig.aux_preparator)
                     fileType = FileSpec.AUX_INPUT
                 if preparatorCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(
                         jobSpec.computingSite))
                     continue
                 tmpLog.debug("plugin={0}".format(
                     preparatorCore.__class__.__name__))
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(
                     jobSpec.PandaID, 'preparatorTime', 'lockedBy',
                     lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # check file status
                 if queueConfig.ddmEndpointIn not in fileStatMap:
                     fileStatMap[queueConfig.ddmEndpointIn] = dict()
                 # check if has to_prepare
                 hasToPrepare = False
                 for fileSpec in jobSpec.inFiles:
                     if fileSpec.status == 'to_prepare':
                         hasToPrepare = True
                         break
                 newFileStatusData = []
                 toWait = False
                 newInFiles = []
                 for fileSpec in jobSpec.inFiles:
                     if fileSpec.status in ['preparing', 'to_prepare']:
                         newInFiles.append(fileSpec)
                         updateStatus = False
                         if fileSpec.lfn not in fileStatMap[
                                 queueConfig.ddmEndpointIn]:
                             fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \
                                 = self.dbProxy.get_file_status(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn,
                                                                'starting')
                         if 'ready' in fileStatMap[
                                 queueConfig.ddmEndpointIn][fileSpec.lfn]:
                             # the file is ready
                             fileSpec.status = 'ready'
                             if fileStatMap[queueConfig.ddmEndpointIn][
                                     fileSpec.lfn]['ready']['path']:
                                 fileSpec.path = list(
                                     fileStatMap[queueConfig.ddmEndpointIn][
                                         fileSpec.lfn]['ready']['path'])[0]
                             # set group info if any
                             groupInfo = self.dbProxy.get_group_for_file(
                                 fileSpec.lfn, fileType,
                                 queueConfig.ddmEndpointIn)
                             if groupInfo is not None:
                                 fileSpec.groupID = groupInfo['groupID']
                                 fileSpec.groupStatus = groupInfo[
                                     'groupStatus']
                                 fileSpec.groupUpdateTime = groupInfo[
                                     'groupUpdateTime']
                             updateStatus = True
                         elif (not hasToPrepare and
                               'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \
                               'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                             # the file is being prepared by another
                             toWait = True
                             if fileSpec.status != 'preparing':
                                 fileSpec.status = 'preparing'
                                 updateStatus = True
                         else:
                             # change file status if the file is not prepared by another
                             if fileSpec.status != 'to_prepare':
                                 fileSpec.status = 'to_prepare'
                                 updateStatus = True
                         # set new status
                         if updateStatus:
                             newFileStatusData.append(
                                 (fileSpec.fileID, fileSpec.lfn,
                                  fileSpec.status))
                             fileStatMap[queueConfig.ddmEndpointIn][
                                 fileSpec.lfn].setdefault(
                                     fileSpec.status, None)
                 if len(newFileStatusData) > 0:
                     self.dbProxy.change_file_status(
                         jobSpec.PandaID, newFileStatusData, lockedBy)
                 # wait since files are being prepared by another
                 if toWait:
                     # update job
                     jobSpec.lockedBy = None
                     self.dbProxy.update_job(jobSpec, {
                         'lockedBy': lockedBy,
                         'subStatus': oldSubStatus
                     })
                     tmpLog.debug(
                         'wait since files are being prepared by another job'
                     )
                     continue
                 # trigger preparation
                 tmpStat, tmpStr = preparatorCore.trigger_preparation(
                     jobSpec)
                 # check result
                 if tmpStat is True:
                     # succeeded
                     jobSpec.lockedBy = None
                     if (maxFilesPerJob is None and jobSpec.auxInput is None) or \
                             (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inTriggered]):
                         # all done
                         allDone = True
                         jobSpec.subStatus = 'preparing'
                         jobSpec.preparatorTime = None
                         if jobSpec.auxInput is not None:
                             jobSpec.auxInput = JobSpec.AUX_allTriggered
                     else:
                         # change file status but not change job sub status since
                         # there could be more files to prepare
                         allDone = False
                         for fileSpec in jobSpec.inFiles:
                             if fileSpec.status == 'to_prepare':
                                 fileSpec.status = 'triggered'
                         # immediate next lookup
                         jobSpec.trigger_preparation()
                         # change auxInput flag to prepare auxiliary inputs
                         if len(
                                 jobSpec.inFiles
                         ) == 0 and jobSpec.auxInput == JobSpec.AUX_hasAuxInput:
                             jobSpec.auxInput = JobSpec.AUX_inTriggered
                     self.dbProxy.update_job(jobSpec, {
                         'lockedBy': lockedBy,
                         'subStatus': oldSubStatus
                     },
                                             update_in_file=True)
                     if allDone:
                         tmpLog.debug('triggered')
                     else:
                         tmpLog.debug('partially triggered')
                 elif tmpStat is False:
                     # fatal error
                     jobSpec.status = 'failed'
                     jobSpec.subStatus = 'failed_to_prepare'
                     jobSpec.lockedBy = None
                     jobSpec.preparatorTime = None
                     jobSpec.stateChangeTime = datetime.datetime.utcnow()
                     errStr = 'stage-in failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED,
                                             errStr)
                     jobSpec.trigger_propagation()
                     self.dbProxy.update_job(jobSpec, {
                         'lockedBy': lockedBy,
                         'subStatus': oldSubStatus
                     })
                     tmpLog.debug(
                         'failed to trigger with {0}'.format(tmpStr))
                 else:
                     # temporary error
                     jobSpec.lockedBy = None
                     self.dbProxy.update_job(jobSpec, {
                         'lockedBy': lockedBy,
                         'subStatus': oldSubStatus
                     })
                     tmpLog.debug(
                         'try to prepare later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.preparator.sleepTime):
             mainLog.debug('terminated')
             return
 def update_jobs(self, jobspec_list, id):
     """Propagate job status and event updates to the PanDA server.

     First pushes event-range updates for each job, then posts the job
     attributes in bulk chunks of 100 via the 'updateJobsInBulk' command.

     :param jobspec_list: list of JobSpec objects to update
     :param id: caller/thread identifier, used only for log tagging
     :return: list with one return map per job; on communication or
              decode failure each entry carries StatusCode 999 and an
              ErrorDiag string
     """
     sw = core_utils.get_stopwatch()
     tmpLogG = self.make_logger('id={0}'.format(id),
                                method_name='update_jobs')
     tmpLogG.debug('update {0} jobs'.format(len(jobspec_list)))
     retList = []
     # update events
     for jobSpec in jobspec_list:
         eventRanges, eventSpecs = jobSpec.to_event_data(max_events=10000)
         if eventRanges != []:
             tmpLogG.debug('update {0} events for PandaID={1}'.format(
                 len(eventSpecs), jobSpec.PandaID))
             tmpRet = self.update_event_ranges(eventRanges, tmpLogG)
             if tmpRet['StatusCode'] == 0:
                 # mark events accepted by the server (True/False return)
                 # as done once they reached a final status
                 for eventSpec, retVal in zip(eventSpecs,
                                              tmpRet['Returns']):
                     if retVal in [True, False
                                   ] and eventSpec.is_final_status():
                         eventSpec.subStatus = 'done'
     # update jobs in bulk
     nLookup = 100
     iLookup = 0
     while iLookup < len(jobspec_list):
         dataList = []
         jobSpecSubList = jobspec_list[iLookup:iLookup + nLookup]
         for jobSpec in jobSpecSubList:
             data = jobSpec.get_job_attributes_for_panda()
             data['jobId'] = jobSpec.PandaID
             data['siteName'] = jobSpec.computingSite
             data['state'] = jobSpec.get_status()
             data['attemptNr'] = jobSpec.attemptNr
             data['jobSubStatus'] = jobSpec.subStatus
             # change cancelled to failed to be accepted by panda server
             if data['state'] in ['cancelled', 'missed']:
                 if jobSpec.is_pilot_closed():
                     data['jobSubStatus'] = 'pilot_closed'
                 else:
                     data['jobSubStatus'] = data['state']
                 data['state'] = 'failed'
             if jobSpec.startTime is not None and 'startTime' not in data:
                 data['startTime'] = jobSpec.startTime.strftime(
                     '%Y-%m-%d %H:%M:%S')
             if jobSpec.endTime is not None and 'endTime' not in data:
                 data['endTime'] = jobSpec.endTime.strftime(
                     '%Y-%m-%d %H:%M:%S')
             if 'coreCount' not in data and jobSpec.nCore is not None:
                 data['coreCount'] = jobSpec.nCore
             # attach metadata and output-file XML only on the final update
             if jobSpec.is_final_status(
             ) and jobSpec.status == jobSpec.get_status():
                 if jobSpec.metaData is not None:
                     data['metaData'] = json.dumps(jobSpec.metaData)
                 if jobSpec.outputFilesToReport is not None:
                     data['xml'] = jobSpec.outputFilesToReport
             dataList.append(data)
         harvester_id = harvester_config.master.harvester_id
         tmpData = {
             'jobList': json.dumps(dataList),
             'harvester_id': harvester_id
         }
         tmpStat, tmpRes = self.post_ssl('updateJobsInBulk', tmpData)
         retMaps = None
         errStr = ''
         if tmpStat is False:
             errStr = core_utils.dump_error_message(tmpLogG, tmpRes)
         else:
             try:
                 tmpStat, retMaps = tmpRes.json()
                 if tmpStat is False:
                     tmpLogG.error(
                         'updateJobsInBulk failed with {0}'.format(retMaps))
                     retMaps = None
             except Exception:
                 errStr = core_utils.dump_error_message(tmpLogG)
         if retMaps is None:
             # FIX: build fallback entries shaped like real server replies —
             # dicts whose 'content' holds a JSON-encoded flat map — so that
             # json.loads(retMap['content']) below succeeds and errStr is
             # propagated. The previous code produced bare JSON strings,
             # which made retMap['content'] raise and always took the
             # generic except path, losing the original error diagnostic.
             retMap = {
                 'content': json.dumps({
                     'StatusCode': 999,
                     'ErrorDiag': errStr
                 })
             }
             retMaps = [retMap] * len(jobSpecSubList)
         for jobSpec, retMap, data in zip(jobSpecSubList, retMaps,
                                          dataList):
             tmpLog = self.make_logger('id={0} PandaID={1}'.format(
                 id, jobSpec.PandaID),
                                       method_name='update_jobs')
             try:
                 retMap = json.loads(retMap['content'])
             except Exception:
                 # FIX: corrected typo 'falied' -> 'failed'
                 errStr = 'failed to load json'
                 retMap = {}
                 retMap['StatusCode'] = 999
                 retMap['ErrorDiag'] = errStr
             tmpLog.debug('data={0}'.format(str(data)))
             tmpLog.debug('done with {0}'.format(str(retMap)))
             retList.append(retMap)
         iLookup += nLookup
     tmpLogG.debug('done' + sw.get_elapsed_time())
     return retList
Example #10
0
    def run(self):
        """Main loop of the propagator agent.

        Each cycle: updates job statuses in the central database (with
        per-site heartbeat suppression and lost-heartbeat checking),
        propagates worker statuses and uploads their log files, handles
        propagator commands, and periodically reports worker statistics
        in bulk. Loops until terminated.
        """
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(self.ident),
                                             method_name='run')
            mainLog.debug('getting jobs to propagate')
            jobSpecs = self.dbProxy.get_jobs_to_propagate(
                harvester_config.propagator.maxJobs,
                harvester_config.propagator.lockInterval,
                harvester_config.propagator.updateInterval, self.ident)
            mainLog.debug('got {0} jobs'.format(len(jobSpecs)))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            hbSuppressMap = dict()
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                retList = []
                for tmpJobSpec in jobList:
                    # cache the per-site list of no-heartbeat statuses
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(
                            tmpJobSpec.computingSite)
                        hbSuppressMap[
                            tmpJobSpec.
                            computingSite] = queueConfig.get_no_heartbeat_status(
                            )
                    # heartbeat is suppressed
                    if tmpJobSpec.status in hbSuppressMap[
                            tmpJobSpec.computingSite]:
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            jobListToSkip.append(tmpJobSpec)
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                # retList order must stay skip + check + update to match
                # the zip below
                retList += self.communicator.check_jobs(jobListToCheck)
                retList += self.communicator.update_jobs(jobListToUpdate)
                # logging
                for tmpJobSpec, tmpRet in zip(
                        jobListToSkip + jobListToCheck + jobListToUpdate,
                        retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug(
                                'updated PandaID={0} status={1}'.format(
                                    tmpJobSpec.PandaID, tmpJobSpec.status))
                        else:
                            mainLog.debug(
                                'skip updating PandaID={0} status={1}'.format(
                                    tmpJobSpec.PandaID, tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        if tmpJobSpec.is_final_status(
                        ) and tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(
                                    tmpJobSpec)
                                if tmpEvStat and tmpEvRet == 0:
                                    mainLog.debug(
                                        'kill PandaID={0} due to no event'.
                                        format(tmpJobSpec.PandaID))
                                    tmpRet['command'] = 'tobekilled'
                            # got kill command
                            if 'command' in tmpRet and tmpRet['command'] in [
                                    'tobekilled'
                            ]:
                                nWorkers = self.dbProxy.kill_workers_with_job(
                                    tmpJobSpec.PandaID)
                                if nWorkers == 0:
                                    # no remaining workers
                                    tmpJobSpec.status = 'cancelled'
                                    tmpJobSpec.subStatus = 'killed'
                                    tmpJobSpec.stateChangeTime = datetime.datetime.utcnow(
                                    )
                                    tmpJobSpec.trigger_propagation()
                        self.dbProxy.update_job(tmpJobSpec,
                                                {'propagatorLock': self.ident})
                    else:
                        mainLog.error(
                            'failed to update PandaID={0} status={1}'.format(
                                tmpJobSpec.PandaID, tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            workSpecs = self.dbProxy.get_workers_to_propagate(
                harvester_config.propagator.maxWorkers,
                harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers'.format(len(workSpecs)))
            # update workers in central database
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            while iWorkers < len(workSpecs):
                # FIX: slice with nWorkers (was nJobs); since the index
                # advances by nWorkers, slicing by the job bulk size could
                # skip or re-send workers when the two config values differ
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error(
                        'failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug(
                                'updated workerID={0} status={1}'.format(
                                    tmpWorkSpec.workerID, tmpWorkSpec.status))
                            # update logs
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(
                                        logRemoteName, logFileObj, logOffset,
                                        logSize)
                                    # advance the upload offset on success
                                    if tmpStat:
                                        tmpWorkSpec.update_log_files_to_upload(
                                            logFilePath, logOffset + logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(
                                tmpWorkSpec,
                                {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error(
                                'failed to update workerID={0} status={1}'.
                                format(tmpWorkSpec.workerID,
                                       tmpWorkSpec.status))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                if commandSpec.command.startswith(
                        CommandSpec.COM_reportWorkerStats):
                    # get worker stats
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error(
                            'failed to get worker stats for {0}'.format(
                                siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(
                            siteName, workerStats)
                        if tmpRet:
                            mainLog.debug(
                                'updated worker stats (command) for {0}'.
                                format(siteName))
                        else:
                            mainLog.error(
                                'failed to update worker stats (command) for {0} err={1}'
                                .format(siteName, tmpStr))

            # periodic bulk worker-stats report, throttled by STATS_PERIOD
            if not self._last_stats_update or time.time(
            ) - self._last_stats_update > STATS_PERIOD:
                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk()
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(
                            site_name, worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug(
                                'update of worker stats (bulk) for {0}'.format(
                                    site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error(
                                'failed to update worker stats (bulk) for {0} err={1}'
                                .format(site_name, tmp_str))

            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
Example #11
0
 def run(self):
     lockedBy = 'sweeper-{0}'.format(self.get_pid())
     while True:
         sw_main = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger,
                                    'id={0}'.format(lockedBy),
                                    method_name='run')
         # get commands to kill
         sw_getcomm = core_utils.get_stopwatch()
         mainLog.debug('try to get commands')
         comStr = CommandSpec.COM_killWorkers
         commandSpecs = self.dbProxy.get_commands_for_receiver(
             'sweeper', comStr)
         mainLog.debug('got {0} {1} commands'.format(
             len(commandSpecs), comStr))
         for commandSpec in commandSpecs:
             n_to_kill = self.dbProxy.kill_workers_by_query(
                 commandSpec.params)
             mainLog.debug('will kill {0} workers with {1}'.format(
                 n_to_kill, commandSpec.params))
         mainLog.debug('done handling commands' +
                       sw_getcomm.get_elapsed_time())
         # killing stage
         sw_kill = core_utils.get_stopwatch()
         mainLog.debug('try to get workers to kill')
         # get workers to kill
         workersToKill = self.dbProxy.get_workers_to_kill(
             harvester_config.sweeper.maxWorkers,
             harvester_config.sweeper.checkInterval)
         mainLog.debug('got {0} queues to kill workers'.format(
             len(workersToKill)))
         # loop over all workers
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(workersToKill):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 try:
                     sweeperCore = self.pluginFactory.get_plugin(
                         queueConfig.sweeper)
                 except Exception:
                     mainLog.error(
                         'failed to launch sweeper plugin for {0}/{1}'.
                         format(queueName, configID))
                     core_utils.dump_error_message(mainLog)
                     continue
                 sw.reset()
                 n_workers = len(workspec_list)
                 try:
                     # try bulk method
                     tmpLog = self.make_logger(_logger,
                                               'id={0}'.format(lockedBy),
                                               method_name='run')
                     tmpLog.debug('start killing')
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpLog.debug('start killing one worker')
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                             tmpLog.debug(
                                 'done killing with status={0} diag={1}'.
                                 format(tmpStat, tmpOut))
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 else:
                     # bulk method
                     n_killed = 0
                     for workspec, (tmpStat,
                                    tmpOut) in zip(workspec_list, tmpList):
                         tmpLog.debug(
                             'done killing workerID={0} with status={1} diag={2}'
                             .format(workspec.workerID, tmpStat, tmpOut))
                         if tmpStat:
                             n_killed += 1
                     tmpLog.debug('killed {0}/{1} workers'.format(
                         n_killed, n_workers))
                 mainLog.debug(
                     'done killing {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
         # cleanup stage
         sw_cleanup = core_utils.get_stopwatch()
         # timeout for missed
         try:
             keepMissed = harvester_config.sweeper.keepMissed
         except Exception:
             keepMissed = 24
         try:
             keepPending = harvester_config.sweeper.keepPending
         except Exception:
             keepPending = 24
         # get workers for cleanup
         statusTimeoutMap = {
             'finished': harvester_config.sweeper.keepFinished,
             'failed': harvester_config.sweeper.keepFailed,
             'cancelled': harvester_config.sweeper.keepCancelled,
             'missed': keepMissed,
             'pending': keepPending
         }
         workersForCleanup = self.dbProxy.get_workers_for_cleanup(
             harvester_config.sweeper.maxWorkers, statusTimeoutMap)
         mainLog.debug('got {0} queues for workers cleanup'.format(
             len(workersForCleanup)))
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(
                 workersForCleanup):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(
                     queueConfig.sweeper)
                 messenger = self.pluginFactory.get_plugin(
                     queueConfig.messenger)
                 sw.reset()
                 n_workers = len(workspec_list)
                 # make sure workers to clean up are all terminated
                 mainLog.debug(
                     'making sure workers to clean up are all terminated')
                 try:
                     # try bulk method
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 mainLog.debug(
                     'made sure workers to clean up are all terminated')
                 # start cleanup
                 for workspec in workspec_list:
                     tmpLog = self.make_logger(_logger,
                                               'workerID={0}'.format(
                                                   workspec.workerID),
                                               method_name='run')
                     try:
                         tmpLog.debug('start cleaning up one worker')
                         # sweep worker
                         tmpStat, tmpOut = sweeperCore.sweep_worker(
                             workspec)
                         tmpLog.debug(
                             'swept_worker with status={0} diag={1}'.format(
                                 tmpStat, tmpOut))
                         tmpLog.debug('start messenger cleanup')
                         mc_tmpStat, mc_tmpOut = messenger.clean_up(
                             workspec)
                         tmpLog.debug(
                             'messenger cleaned up with status={0} diag={1}'
                             .format(mc_tmpStat, mc_tmpOut))
                         if tmpStat:
                             self.dbProxy.delete_worker(workspec.workerID)
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
                 mainLog.debug(
                     'done cleaning up {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
         # old-job-deletion stage
         sw_delete = core_utils.get_stopwatch()
         mainLog.debug('delete old jobs')
         jobTimeout = max(statusTimeoutMap.values()) + 1
         self.dbProxy.delete_old_jobs(jobTimeout)
         # delete orphaned job info
         self.dbProxy.delete_orphaned_job_info()
         mainLog.debug('done deletion of old jobs' +
                       sw_delete.get_elapsed_time())
         # disk cleanup
         if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \
                 hasattr(harvester_config.sweeper, 'diskHighWatermark'):
             locked = self.dbProxy.get_process_lock(
                 'sweeper', self.get_pid(),
                 harvester_config.sweeper.diskCleanUpInterval * 60 * 60)
             if locked:
                 try:
                     all_active_files = None
                     for item in harvester_config.sweeper.diskHighWatermark.split(
                             ','):
                         # dir name and watermark in GB
                         dir_name, watermark = item.split('|')
                         mainLog.debug(
                             'checking {0} for cleanup with watermark {1} GB'
                             .format(dir_name, watermark))
                         watermark = int(watermark) * 10**9
                         total_size = 0
                         file_dict = {}
                         # scan dir
                         for root, dirs, filenames in walk(dir_name):
                             for base_name in filenames:
                                 full_name = os.path.join(root, base_name)
                                 f_size = os.path.getsize(full_name)
                                 total_size += f_size
                                 mtime = os.path.getmtime(full_name)
                                 file_dict.setdefault(mtime, set())
                                 file_dict[mtime].add(
                                     (base_name, full_name, f_size))
                         # delete if necessary
                         if total_size < watermark:
                             mainLog.debug(
                                 'skip cleanup {0} due to total_size {1} GB < watermark {2} GB'
                                 .format(dir_name, total_size // (10**9),
                                         watermark // (10**9)))
                         else:
                             mainLog.debug(
                                 'cleanup {0} due to total_size {1} GB >= watermark {2} GB'
                                 .format(dir_name, total_size // (10**9),
                                         watermark // (10**9)))
                             # get active input files
                             if all_active_files is None:
                                 all_active_files = self.dbProxy.get_all_active_input_files(
                                 )
                             deleted_size = 0
                             mtimes = sorted(file_dict.keys())
                             for mtime in mtimes:
                                 for base_name, full_name, f_size in file_dict[
                                         mtime]:
                                     # keep if active
                                     if base_name in all_active_files:
                                         continue
                                     try:
                                         os.remove(full_name)
                                     except Exception:
                                         core_utils.dump_error_message(
                                             mainLog)
                                     deleted_size += f_size
                                     if total_size - deleted_size < watermark:
                                         break
                                 if total_size - deleted_size < watermark:
                                     break
                 except Exception:
                     core_utils.dump_error_message(mainLog)
         # time the cycle
         mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.sweeper.sleepTime):
             mainLog.debug('terminated')
             return
Example #12
0
 def run(self):
     """Main loop of the monitor agent.

     Repeatedly pulls locked chunks of workers from the local DB,
     checks their status through each queue's monitor plugin (via
     self.check_workers), propagates the results to the workers and
     their associated jobs, writes everything back to the DB, and
     acknowledges events/files to the messenger. Loops until
     terminated() signals shutdown.
     """
     # unique lock owner string for DB row locking
     lockedBy = 'monitor-{0}'.format(self.ident)
     # init messengers
     for queueConfig in self.queueConfigMapper.get_all_queues().values():
         # just import for module initialization
         self.pluginFactory.get_plugin(queueConfig.messenger)
     # main
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = core_utils.make_logger(_logger,
                                          'id={0}'.format(lockedBy),
                                          method_name='run')
         mainLog.debug('getting workers to monitor')
         # workers come back grouped per queue, as a list of chunks per queue
         workSpecsPerQueue = self.dbProxy.get_workers_to_update(
             harvester_config.monitor.maxWorkers,
             harvester_config.monitor.checkInterval,
             harvester_config.monitor.lockInterval, lockedBy)
         mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
         # loop over all workers
         for queueName, workSpecsList in iteritems(workSpecsPerQueue):
             tmpQueLog = core_utils.make_logger(_logger,
                                                'id={0} queue={1}'.format(
                                                    lockedBy, queueName),
                                                method_name='run')
             # check queue
             if not self.queueConfigMapper.has_queue(queueName):
                 tmpQueLog.error('config not found')
                 continue
             # get queue
             queueConfig = self.queueConfigMapper.get_queue(queueName)
             # get plugins
             monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
             messenger = self.pluginFactory.get_plugin(
                 queueConfig.messenger)
             # check workers: flatten the per-chunk lists into one list
             # so the plugin can be driven in bulk
             allWorkers = [
                 item for sublist in workSpecsList for item in sublist
             ]
             tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
             # tmpRetMap is keyed by workerID (see lookup below)
             tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                            queueConfig, tmpQueLog)
             # loop over all worker chunks
             tmpQueLog.debug('update jobs and workers')
             iWorker = 0
             for workSpecs in workSpecsList:
                 # per-chunk accumulators
                 jobSpecs = None
                 filesToStageOut = dict()
                 pandaIDsList = []
                 eventsToUpdateList = []
                 filesToStageOutList = []
                 for workSpec in workSpecs:
                     tmpLog = core_utils.make_logger(_logger,
                                                     'workerID={0}'.format(
                                                         workSpec.workerID),
                                                     method_name='run')
                     # unpack the check result for this worker
                     tmpOut = tmpRetMap[workSpec.workerID]
                     newStatus = tmpOut['newStatus']
                     monStatus = tmpOut['monStatus']
                     diagMessage = tmpOut['diagMessage']
                     workAttributes = tmpOut['workAttributes']
                     eventsToUpdate = tmpOut['eventsToUpdate']
                     filesToStageOut = tmpOut['filesToStageOut']
                     eventsRequestParams = tmpOut['eventsRequestParams']
                     nJobsToReFill = tmpOut['nJobsToReFill']
                     pandaIDs = tmpOut['pandaIDs']
                     tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                     tmpStr += 'postProcessed={3} files={4}'
                     tmpLog.debug(
                         tmpStr.format(newStatus, monStatus, diagMessage,
                                       workSpec.is_post_processed(),
                                       str(filesToStageOut)))
                     iWorker += 1
                     # check status: skip workers reporting an unknown status
                     if newStatus not in WorkSpec.ST_LIST:
                         tmpLog.error(
                             'unknown status={0}'.format(newStatus))
                         continue
                     # update worker
                     workSpec.set_status(newStatus)
                     workSpec.set_work_attributes(workAttributes)
                     # request events
                     if eventsRequestParams != {}:
                         workSpec.eventsRequest = WorkSpec.EV_requestEvents
                         workSpec.eventsRequestParams = eventsRequestParams
                     # jobs to refill
                     if nJobsToReFill is not None:
                         workSpec.nJobsToReFill = nJobsToReFill
                     # get associated jobs for the worker chunk;
                     # fetched lazily, once per chunk (jobSpecs is shared
                     # by all workers in the chunk)
                     if workSpec.hasJob == 1 and jobSpecs is None:
                         jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                             workSpec.workerID, None, only_running=True)
                     # pandaIDs for push
                     pandaIDsList.append(pandaIDs)
                     if len(eventsToUpdate) > 0:
                         eventsToUpdateList.append(eventsToUpdate)
                     if len(filesToStageOut) > 0:
                         filesToStageOutList.append(filesToStageOut)
                 # update jobs and workers
                 if jobSpecs is not None:
                     tmpQueLog.debug(
                         'updating {0} jobs with {1} workers'.format(
                             len(jobSpecs), len(workSpecs)))
                     core_utils.update_job_attributes_with_workers(
                         queueConfig.mapType, jobSpecs, workSpecs,
                         filesToStageOutList, eventsToUpdateList)
                     for jobSpec in jobSpecs:
                         tmpLog = core_utils.make_logger(
                             _logger,
                             'PandaID={0}'.format(jobSpec.PandaID),
                             method_name='run')
                         tmpLog.debug(
                             'new status={0} subStatus={1} status_in_metadata={2}'
                             .format(
                                 jobSpec.status, jobSpec.subStatus,
                                 jobSpec.get_job_status_from_attributes()))
                 # update local database
                 tmpRet = self.dbProxy.update_jobs_workers(
                     jobSpecs, workSpecs, lockedBy, pandaIDsList)
                 if not tmpRet:
                     # DB update failed, most likely because another process
                     # stole the lock after lockInterval expired
                     for workSpec in workSpecs:
                         tmpLog = core_utils.make_logger(
                             _logger,
                             'workerID={0}'.format(workSpec.workerID),
                             method_name='run')
                         tmpLog.error(
                             'failed to update the DB. lockInterval may be too short'
                         )
                 # send ACK to workers for events and files
                 if len(eventsToUpdateList) > 0 or len(
                         filesToStageOutList) > 0:
                     for workSpec in workSpecs:
                         messenger.acknowledge_events_files(workSpec)
             tmpQueLog.debug('done')
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.monitor.sleepTime):
             mainLog.debug('terminated')
             return
def fifo_benchmark(arguments):
    """Benchmark put/get/clear throughput of the configured FIFO plugin.

    Runs multi-threaded rounds of put, plain get, protective get and
    clear against a BenchmarkFIFO, then prints a per-object timing
    summary. `arguments` must carry `n_object` and `n_thread`.
    """
    n_object = arguments.n_object
    n_thread = arguments.n_thread
    queue = harvesterFifos.BenchmarkFIFO()
    timer = core_utils.get_stopwatch()
    # accumulated timings in seconds; put_n counts the number of put rounds
    totals = {
        'put_n': 0,
        'put_time': 0.0,
        'get_time': 0.0,
        'get_protective_time': 0.0,
        'clear_time': 0.0,
    }

    def _enqueue_one(idx):
        # build a small workspec with pseudo-random attributes and enqueue it
        spec = WorkSpec()
        spec.workerID = idx
        spec.workAttributes = {'random': [(idx**2) % 2**16, random.random()]}
        queue.put(spec)

    def _dequeue_one(idx):
        return queue.get(timeout=3, protective=False)

    def _dequeue_one_protective(idx):
        return queue.get(timeout=3, protective=True)

    def put_test():
        timer.reset()
        multithread_executer(_enqueue_one, n_object, n_thread)
        totals['put_time'] += timer.get_elapsed_time_in_sec(True)
        totals['put_n'] += 1
        print('Put {0} objects by {1} threads'.format(n_object, n_thread) + timer.get_elapsed_time())
        print('Now fifo size is {0}'.format(queue.size()))

    def get_test():
        timer.reset()
        multithread_executer(_dequeue_one, n_object, n_thread)
        totals['get_time'] = timer.get_elapsed_time_in_sec(True)
        print('Get {0} objects by {1} threads'.format(n_object, n_thread) + timer.get_elapsed_time())
        print('Now fifo size is {0}'.format(queue.size()))

    def get_protective_test():
        timer.reset()
        multithread_executer(_dequeue_one_protective, n_object, n_thread)
        totals['get_protective_time'] = timer.get_elapsed_time_in_sec(True)
        print('Get {0} objects protective dequeue by {1} threads'.format(n_object, n_thread) + timer.get_elapsed_time())
        print('Now fifo size is {0}'.format(queue.size()))

    def clear_test():
        timer.reset()
        queue.fifo.clear()
        totals['clear_time'] = timer.get_elapsed_time_in_sec(True)
        print('Cleared fifo' + timer.get_elapsed_time())
        print('Now fifo size is {0}'.format(queue.size()))

    # benchmark sequence: each drain method is preceded by a fresh put round
    print('Start fifo benchmark ...')
    queue.fifo.clear()
    print('Cleared fifo')
    put_test()
    get_test()
    put_test()
    get_protective_test()
    put_test()
    clear_test()
    print('Finished fifo benchmark')
    # summary: report average ms per object for each operation
    print('Summary:')
    print('FIFO plugin is: {0}'.format(queue.fifo.__class__.__name__))
    print('Benchmark with {0} objects by {1} threads'.format(n_object, n_thread))
    print('Put            : {0:.3f} ms / obj'.format(1000. * totals['put_time']/(totals['put_n']*n_object)))
    print('Get            : {0:.3f} ms / obj'.format(1000. * totals['get_time']/n_object))
    print('Get protective : {0:.3f} ms / obj'.format(1000. * totals['get_protective_time']/n_object))
    print('Clear          : {0:.3f} ms / obj'.format(1000. * totals['clear_time']/n_object))
Example #14
0
 def run(self):
     """Main loop of the sweeper agent.

     Each cycle: (1) processes kill-worker commands from the DB,
     (2) kills workers flagged for killing, (3) cleans up workers whose
     keep timeout expired — making sure they are terminated, sweeping
     them with the sweeper plugin, cleaning up the messenger side and
     deleting their DB records — and (4) purges old and orphaned job
     records. Loops until terminated() signals shutdown.
     """
     # unique lock owner string for DB operations
     lockedBy = 'sweeper-{0}'.format(self.get_pid())
     while True:
         sw_main = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger,
                                    'id={0}'.format(lockedBy),
                                    method_name='run')
         # get commands to kill
         sw_getcomm = core_utils.get_stopwatch()
         mainLog.debug('try to get commands')
         comStr = CommandSpec.COM_killWorkers
         commandSpecs = self.dbProxy.get_commands_for_receiver(
             'sweeper', comStr)
         mainLog.debug('got {0} {1} commands'.format(
             len(commandSpecs), comStr))
         for commandSpec in commandSpecs:
             # flag matching workers in the DB; the actual kill happens
             # in the killing stage below
             n_to_kill = self.dbProxy.kill_workers_by_query(
                 commandSpec.params)
             mainLog.debug('will kill {0} workers with {1}'.format(
                 n_to_kill, commandSpec.params))
         mainLog.debug('done handling commands' +
                       sw_getcomm.get_elapsed_time())
         # killing stage
         sw_kill = core_utils.get_stopwatch()
         mainLog.debug('try to get workers to kill')
         # get workers to kill
         workersToKill = self.dbProxy.get_workers_to_kill(
             harvester_config.sweeper.maxWorkers,
             harvester_config.sweeper.checkInterval)
         mainLog.debug('got {0} queues to kill workers'.format(
             len(workersToKill)))
         # loop over all workers, grouped by queue and configID
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(workersToKill):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(
                     queueConfig.sweeper)
                 sw.reset()
                 n_workers = len(workspec_list)
                 try:
                     # try bulk method; AttributeError means the plugin
                     # does not implement kill_workers
                     tmpLog = self.make_logger(_logger,
                                               'id={0}'.format(lockedBy),
                                               method_name='run')
                     tmpLog.debug('start killing')
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpLog.debug('start killing one worker')
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                             tmpLog.debug(
                                 'done killing with status={0} diag={1}'.
                                 format(tmpStat, tmpOut))
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 else:
                     # bulk method succeeded; log per-worker results
                     n_killed = 0
                     for workspec, (tmpStat,
                                    tmpOut) in zip(workspec_list, tmpList):
                         tmpLog.debug(
                             'done killing workerID={0} with status={1} diag={2}'
                             .format(workspec.workerID, tmpStat, tmpOut))
                         if tmpStat:
                             n_killed += 1
                     tmpLog.debug('killed {0}/{1} workers'.format(
                         n_killed, n_workers))
                 mainLog.debug(
                     'done killing {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
         # cleanup stage
         sw_cleanup = core_utils.get_stopwatch()
         # timeout for missed; fall back to 24 when not configured
         try:
             keepMissed = harvester_config.sweeper.keepMissed
         except Exception:
             keepMissed = 24
         try:
             keepPending = harvester_config.sweeper.keepPending
         except Exception:
             keepPending = 24
         # get workers for cleanup: keep timeout per terminal status
         statusTimeoutMap = {
             'finished': harvester_config.sweeper.keepFinished,
             'failed': harvester_config.sweeper.keepFailed,
             'cancelled': harvester_config.sweeper.keepCancelled,
             'missed': keepMissed,
             'pending': keepPending
         }
         workersForCleanup = self.dbProxy.get_workers_for_cleanup(
             harvester_config.sweeper.maxWorkers, statusTimeoutMap)
         mainLog.debug('got {0} queues for workers cleanup'.format(
             len(workersForCleanup)))
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(
                 workersForCleanup):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(
                     queueConfig.sweeper)
                 messenger = self.pluginFactory.get_plugin(
                     queueConfig.messenger)
                 sw.reset()
                 n_workers = len(workspec_list)
                 # make sure workers to clean up are all terminated
                 mainLog.debug(
                     'making sure workers to clean up are all terminated')
                 try:
                     # try bulk method; AttributeError means the plugin
                     # does not implement kill_workers
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 mainLog.debug(
                     'made sure workers to clean up are all terminated')
                 # start cleanup
                 for workspec in workspec_list:
                     tmpLog = self.make_logger(_logger,
                                               'workerID={0}'.format(
                                                   workspec.workerID),
                                               method_name='run')
                     try:
                         tmpLog.debug('start cleaning up one worker')
                         # sweep worker
                         tmpStat, tmpOut = sweeperCore.sweep_worker(
                             workspec)
                         tmpLog.debug(
                             'swept_worker with status={0} diag={1}'.format(
                                 tmpStat, tmpOut))
                         tmpLog.debug('start messenger cleanup')
                         mc_tmpStat, mc_tmpOut = messenger.clean_up(
                             workspec)
                         tmpLog.debug(
                             'messenger cleaned up with status={0} diag={1}'
                             .format(mc_tmpStat, mc_tmpOut))
                         # delete the DB record only when the sweep succeeded
                         if tmpStat:
                             self.dbProxy.delete_worker(workspec.workerID)
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
                 mainLog.debug(
                     'done cleaning up {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
         # old-job-deletion stage
         sw_delete = core_utils.get_stopwatch()
         mainLog.debug('delete old jobs')
         # keep jobs slightly longer than the longest worker keep timeout
         jobTimeout = max(statusTimeoutMap.values()) + 1
         self.dbProxy.delete_old_jobs(jobTimeout)
         # delete orphaned job info
         self.dbProxy.delete_orphaned_job_info()
         mainLog.debug('done deletion of old jobs' +
                       sw_delete.get_elapsed_time())
         # time the cycle
         mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.sweeper.sleepTime):
             mainLog.debug('terminated')
             return
Example #15
0
 def run(self):
     lockedBy = 'sweeper-{0}'.format(self.get_pid())
     while True:
         sw_main = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
         # killing stage
         sw_kill = core_utils.get_stopwatch()
         mainLog.debug('try to get workers to kill')
         # get workers to kill
         workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                          harvester_config.sweeper.checkInterval)
         mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
         # loop over all workers
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(workersToKill):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(queueName, configID):
                     mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                 sw.reset()
                 n_workers = len(workspec_list)
                 try:
                     # try bulk method
                     tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                     tmpLog.debug('start killing')
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpLog.debug('start killing one worker')
                             tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                             tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 else:
                     # bulk method
                     n_killed = 0
                     for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                         tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                                         workspec.workerID, tmpStat, tmpOut))
                         if tmpStat:
                             n_killed += 1
                     tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                 mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
         mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
         # cleanup stage
         sw_cleanup = core_utils.get_stopwatch()
         # timeout for missed
         try:
             keepMissed = harvester_config.sweeper.keepMissed
         except Exception:
             keepMissed = 24
         try:
             keepPending = harvester_config.sweeper.keepPending
         except Exception:
             keepPending = 24
         # get workers for cleanup
         statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                             'failed': harvester_config.sweeper.keepFailed,
                             'cancelled': harvester_config.sweeper.keepCancelled,
                             'missed': keepMissed,
                             'pending': keepPending
                             }
         workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                  statusTimeoutMap)
         mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(queueName, configID):
                     mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                 messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                 sw.reset()
                 n_workers = len(workspec_list)
                 # make sure workers to clean up are all terminated
                 mainLog.debug('making sure workers to clean up are all terminated')
                 try:
                     # try bulk method
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 mainLog.debug('made sure workers to clean up are all terminated')
                 # start cleanup
                 for workspec in workspec_list:
                     tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                               method_name='run')
                     try:
                         tmpLog.debug('start cleaning up one worker')
                         # sweep worker
                         tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                         tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                         tmpLog.debug('start messenger cleanup')
                         mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                         tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmpStat, mc_tmpOut))
                         if tmpStat:
                             self.dbProxy.delete_worker(workspec.workerID)
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
                 mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
         mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
         # old-job-deletion stage
         sw_delete = core_utils.get_stopwatch()
         mainLog.debug('delete old jobs')
         jobTimeout = max(statusTimeoutMap.values()) + 1
         self.dbProxy.delete_old_jobs(jobTimeout)
         # delete orphaned job info
         self.dbProxy.delete_orphaned_job_info()
         mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
         # time the cycle
         mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.sweeper.sleepTime):
             mainLog.debug('terminated')
             return
 def update_jobs(self, jobspec_list, id):
     sw = core_utils.get_stopwatch()
     tmpLogG = self.make_logger('id={0}'.format(id), method_name='update_jobs')
     tmpLogG.debug('update {0} jobs'.format(len(jobspec_list)))
     retList = []
     # update events
     for jobSpec in jobspec_list:
         eventRanges, eventSpecs = jobSpec.to_event_data(max_events=10000)
         if eventRanges != []:
             tmpLogG.debug('update {0} events for PandaID={1}'.format(len(eventSpecs), jobSpec.PandaID))
             tmpRet = self.update_event_ranges(eventRanges, tmpLogG)
             if tmpRet['StatusCode'] == 0:
                 for eventSpec, retVal in zip(eventSpecs, tmpRet['Returns']):
                     if retVal in [True, False] and eventSpec.is_final_status():
                         eventSpec.subStatus = 'done'
     # update jobs in bulk
     nLookup = 100
     iLookup = 0
     while iLookup < len(jobspec_list):
         dataList = []
         jobSpecSubList = jobspec_list[iLookup:iLookup+nLookup]
         for jobSpec in jobSpecSubList:
             data = jobSpec.get_job_attributes_for_panda()
             data['jobId'] = jobSpec.PandaID
             data['siteName'] = jobSpec.computingSite
             data['state'] = jobSpec.get_status()
             data['attemptNr'] = jobSpec.attemptNr
             data['jobSubStatus'] = jobSpec.subStatus
             # change cancelled to failed to be accepted by panda server
             if data['state'] in ['cancelled', 'missed']:
                 if jobSpec.is_pilot_closed():
                     data['jobSubStatus'] = 'pilot_closed'
                 else:
                     data['jobSubStatus'] = data['state']
                 data['state'] = 'failed'
             if jobSpec.startTime is not None and 'startTime' not in data:
                 data['startTime'] = jobSpec.startTime.strftime('%Y-%m-%d %H:%M:%S')
             if jobSpec.endTime is not None and 'endTime' not in data:
                 data['endTime'] = jobSpec.endTime.strftime('%Y-%m-%d %H:%M:%S')
             if 'coreCount' not in data and jobSpec.nCore is not None:
                 data['coreCount'] = jobSpec.nCore
             if jobSpec.is_final_status() and jobSpec.status == jobSpec.get_status():
                 if jobSpec.metaData is not None:
                     data['metaData'] = json.dumps(jobSpec.metaData)
                 if jobSpec.outputFilesToReport is not None:
                     data['xml'] = jobSpec.outputFilesToReport
             dataList.append(data)
         harvester_id = harvester_config.master.harvester_id
         tmpData = {'jobList': json.dumps(dataList), 'harvester_id': harvester_id}
         tmpStat, tmpRes = self.post_ssl('updateJobsInBulk', tmpData)
         retMaps = None
         errStr = ''
         if tmpStat is False:
             errStr = core_utils.dump_error_message(tmpLogG, tmpRes)
         else:
             try:
                 tmpStat, retMaps = tmpRes.json()
                 if tmpStat is False:
                     tmpLogG.error('updateJobsInBulk failed with {0}'.format(retMaps))
                     retMaps = None
             except Exception:
                 errStr = core_utils.dump_error_message(tmpLogG)
         if retMaps is None:
             retMap = {}
             retMap['content'] = {}
             retMap['content']['StatusCode'] = 999
             retMap['content']['ErrorDiag'] = errStr
             retMaps = [json.dumps(retMap)] * len(jobSpecSubList)
         for jobSpec, retMap, data in zip(jobSpecSubList, retMaps, dataList):
             tmpLog = self.make_logger('id={0} PandaID={1}'.format(id, jobSpec.PandaID),
                                       method_name='update_jobs')
             try:
                 retMap = json.loads(retMap['content'])
             except Exception:
                 errStr = 'falied to load json'
                 retMap = {}
                 retMap['StatusCode'] = 999
                 retMap['ErrorDiag'] = errStr
             tmpLog.debug('data={0}'.format(str(data)))
             tmpLog.debug('done with {0}'.format(str(retMap)))
             retList.append(retMap)
         iLookup += nLookup
     tmpLogG.debug('done' + sw.get_elapsed_time())
     return retList
# ---- Example #17 ----
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
                                    harvester_config.submitter.lockInterval)
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
                                                                             harvester_config.submitter.lookupTime,
                                                                             harvester_config.submitter.lockInterval,
                                                                             lockedBy, queueLockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy,
                                                                                                   queueName,
                                                                                                   resource_type),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug('skipped since no new worker is needed based on current stats')
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(queueName)
                                workerMakerCore = self.workerMaker.get_plugin(queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(queueConfig,
                                                                                             resource_type,
                                                                                             workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('No left static workers, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                                   nReady, resource_type,
                                                                                   maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                                else:
                                    tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks),
                                                                                                     len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                              'subStatus': 'prepared'})
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots during the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                workSpec.nJobsToReFill = None
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'.format(jobSpec.computingSite))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'.format(jobSpec.computingSite))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'.format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'.format(workSpec.workerID,
                                                                                                tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
                                                                                               workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(len(workSpecList))
                                                    + sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID,
                                                tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late biding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                                  jobSpec.PandaID,
                                                                                  workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(workSpec.workerID,
                                                                                   jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        check_delay = min(
                                                        getattr(harvester_config.monitor, 'eventBasedCheckInterval',
                                                                harvester_config.monitor.checkInterval),
                                                        getattr(harvester_config.monitor, 'fifoCheckInterval',
                                                                harvester_config.monitor.checkInterval))
                                        monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                # release the site
                self.dbProxy.release_site(siteName, lockedBy)
                if sw_main.get_elapsed_time_in_sec() > queueLockInterval:
                    mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval)
                                    + sw_main.get_elapsed_time())
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName)

            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return
# Example #18
# 0
    def run(self) -> None:
        """Main loop of the propagator agent.

        Each cycle pushes local harvester state to the central PanDA server:
        job status updates (or lost-heartbeat checks for heartbeat-suppressed
        running jobs), worker status updates including log-file uploads,
        worker statistics (on explicit command and periodically in bulk),
        service metrics, and queued dialog messages. The loop runs until
        self.terminated() reports shutdown.
        """
        while True:
            # stopwatch for the whole cycle; per-phase timings use 'sw' below
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting jobs to propagate')
            sw = core_utils.get_stopwatch()
            # fetch a locked batch of jobs whose status needs to be reported upstream
            jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs,
                                                          harvester_config.propagator.lockInterval,
                                                          harvester_config.propagator.updateInterval,
                                                          self.get_pid())
            mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time()))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            # per-site cache of no-heartbeat status lists, to avoid repeated
            # queue-config lookups inside the loop
            hbSuppressMap = dict()
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                retList = []
                for tmpJobSpec in jobList:
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite,
                                                                       tmpJobSpec.configID)
                        hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
                    # heartbeat is suppressed
                    if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \
                            not tmpJobSpec.not_suppress_heartbeat():
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            # skipped jobs get a synthetic OK result so the
                            # zip() over results below stays aligned
                            jobListToSkip.append(tmpJobSpec)
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                sw.reset()
                retList += self.communicator.check_jobs(jobListToCheck)
                mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time()))
                sw.reset()
                retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid())
                mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate),
                                                                              sw.get_elapsed_time()))
                # logging
                # NOTE: the concatenation order (skip, check, update) must
                # match the order results were appended to retList above
                for tmpJobSpec, tmpRet in zip(jobListToSkip+jobListToCheck+jobListToUpdate, retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                  tmpJobSpec.status))
                        else:
                            mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                        tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        # final status fully propagated (DB status equals the
                        # effective status): mark done and stop propagating
                        if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                            tmpJobSpec.modificationTime = datetime.datetime.utcnow()
                        elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done():
                            # trigger next propagation to update remaining events
                            tmpJobSpec.trigger_propagation()
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec)
                                if tmpEvStat:
                                    if tmpEvRet is not None:
                                        tmpJobSpec.nRemainingEvents = tmpEvRet
                                    # no events left for an event-service job:
                                    # convert the result into a kill command
                                    if tmpEvRet == 0:
                                        mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID))
                                        tmpRet['command'] = 'tobekilled'
                            # got kill command
                            if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                                nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID)
                                if nWorkers == 0:
                                    # no workers: nothing will report back, so
                                    # finalize the job as killed right here
                                    tmpJobSpec.status = 'cancelled'
                                    tmpJobSpec.subStatus = 'killed'
                                    tmpJobSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                               PilotErrors.pilotError[PilotErrors.ERR_PANDAKILL])
                                    tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                                    tmpJobSpec.trigger_propagation()
                        # persist changes; the criteria dict guards against
                        # another propagator having taken the lock meanwhile
                        self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()})
                    else:
                        mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                       tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            sw.reset()
            workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers,
                                                              harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time()))
            # update workers in central database
            sw.reset()
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            while iWorkers < len(workSpecs):
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error('failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                   tmpWorkSpec.status))
                            # update logs
                            # upload pending log segments; offset is only
                            # advanced when the upload succeeded
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj,
                                                                                    logOffset, logSize)
                                    if tmpStat:
                                        tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset+logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                            tmpWorkSpec.status))
            mainLog.debug('update_workers for {0} workers took {1}'.format(iWorkers,
                                                                      sw.get_elapsed_time()))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                # command format is presumably '<COM_reportWorkerStats>:<siteName>'
                # (the site name is taken from the last ':'-separated token)
                if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats):
                    # get worker stats
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error('failed to get worker stats for {0}'.format(siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats)
                        if tmpRet:
                            mainLog.debug('updated worker stats (command) for {0}'.format(siteName))
                        else:
                            mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(siteName,
                                                                                                           tmpStr))

            # periodic bulk report of worker stats, throttled by STATS_PERIOD
            if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:

                # get active UPS queues. PanDA server needs to know about them and which harvester instance is taking
                # care of them
                active_ups_queues = self.queueConfigMapper.get_active_ups_queues()

                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues)
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                                 worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(site_name,
                                                                                                        tmp_str))

            # periodic report of service metrics, throttled by METRICS_PERIOD
            if not self._last_metrics_update \
                    or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD):
                # get latest metrics from DB
                service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update)
                if not service_metrics_list:
                    mainLog.error('failed to get service metrics')
                    # timestamp is advanced even on failure — presumably to
                    # avoid retrying every cycle; confirm this is intended
                    self._last_metrics_update = datetime.datetime.utcnow()
                else:
                    tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list)
                    if tmp_ret:
                        mainLog.debug('update of service metrics OK')
                        self._last_metrics_update = datetime.datetime.utcnow()
                    else:
                        mainLog.error('failed to update service metrics err={0}'.format(tmp_str))

            # send dialog messages
            mainLog.debug('getting dialog messages to propagate')
            try:
                maxDialogs = harvester_config.propagator.maxDialogs
            except Exception:
                # default cap when the config attribute is absent
                maxDialogs = 50
            diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs,
                                                                 harvester_config.propagator.lockInterval)
            mainLog.debug('got {0} dialogs'.format(len(diagSpecs)))
            if len(diagSpecs) > 0:
                tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs)
                if tmpStat:
                    # delete messages only after successful delivery
                    diagIDs = [diagSpec.diagID for diagSpec in diagSpecs]
                    self.dbProxy.delete_dialog_messages(diagIDs)
                    mainLog.debug('sent {0} dialogs'.format(len(diagSpecs)))

                else:
                    mainLog.error('failed to send dialogs err={0}'.format(tmpStr))
            if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval:
                mainLog.warning('a single cycle was longer than lockInterval. done' + sw_main.get_elapsed_time())
            else:
                mainLog.debug('done' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
# Example #19
# 0
    def run(self):
        lockedBy = 'monitor-{0}'.format(self.get_pid())
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        try:
            fifoSleepTimeMilli = harvester_config.monitor.fifoSleepTimeMilli
        except AttributeError:
            fifoSleepTimeMilli = 5000
        try:
            fifoCheckDuration = harvester_config.monitor.fifoCheckDuration
        except AttributeError:
            fifoCheckDuration = 30
        try:
            fifoMaxWorkersPerChunk = harvester_config.monitor.fifoMaxWorkersPerChunk
        except AttributeError:
            fifoMaxWorkersPerChunk = 500
        try:
            fifoProtectiveDequeue = harvester_config.monitor.fifoProtectiveDequeue
        except AttributeError:
            fifoProtectiveDequeue = True
        last_DB_cycle_timestamp = 0
        monitor_fifo = self.monitor_fifo
        sleepTime = (fifoSleepTimeMilli / 1000.0) \
                        if monitor_fifo.enabled else harvester_config.monitor.sleepTime
        adjusted_sleepTime = sleepTime
        if monitor_fifo.enabled:
            monitor_fifo.restore()
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('start a monitor cycle')
            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \
                    not (monitor_fifo.enabled and self.singleMode):
                # run with workers from DB
                sw_db = core_utils.get_stopwatch()
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                    harvester_config.monitor.maxWorkers,
                    harvester_config.monitor.checkInterval,
                    harvester_config.monitor.lockInterval, lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(
                        workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(
                            configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy,
                                                         queueName,
                                                         workSpecsList,
                                                         config_id=configID)
                        if monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueue), score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object from FIFO: {0}'.
                                        format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueueToHead),
                                        score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object from FIFO head: {0}'
                                        .format(errStr))
                last_DB_cycle_timestamp = time.time()
                if sw_db.get_elapsed_time_in_sec(
                ) > harvester_config.monitor.lockInterval:
                    mainLog.warning(
                        'a single DB cycle was longer than lockInterval ' +
                        sw_db.get_elapsed_time())
                else:
                    mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time())
                mainLog.debug('ended run with DB')
            elif monitor_fifo.enabled:
                # run with workers from FIFO
                sw = core_utils.get_stopwatch()
                n_loops = 0
                n_loops_hit = 0
                last_fifo_cycle_timestamp = time.time()
                to_break = False
                obj_dequeued_id_list = []
                obj_to_enqueue_dict = collections.defaultdict(
                    lambda: [[], 0, 0])
                obj_to_enqueue_to_head_dict = collections.defaultdict(
                    lambda: [[], 0, 0])
                remaining_obj_to_enqueue_dict = {}
                remaining_obj_to_enqueue_to_head_dict = {}
                n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0
                while time.time(
                ) < last_fifo_cycle_timestamp + fifoCheckDuration:
                    sw.reset()
                    n_loops += 1
                    retVal, overhead_time = monitor_fifo.to_check_workers()
                    if overhead_time is not None:
                        n_chunk_peeked_stat += 1
                        sum_overhead_time_stat += overhead_time
                    if retVal:
                        # check fifo size
                        fifo_size = monitor_fifo.size()
                        mainLog.debug('FIFO size is {0}'.format(fifo_size))
                        mainLog.debug('starting run with FIFO')
                        try:
                            obj_gotten = monitor_fifo.get(
                                timeout=1, protective=fifoProtectiveDequeue)
                        except Exception as errStr:
                            mainLog.error(
                                'failed to get object from FIFO: {0}'.format(
                                    errStr))
                        else:
                            if obj_gotten is not None:
                                sw_fifo = core_utils.get_stopwatch()
                                if fifoProtectiveDequeue:
                                    obj_dequeued_id_list.append(obj_gotten.id)
                                queueName, workSpecsList = obj_gotten.item
                                mainLog.debug(
                                    'got a chunk of {0} workers of {1} from FIFO'
                                    .format(len(workSpecsList), queueName) +
                                    sw.get_elapsed_time())
                                sw.reset()
                                configID = None
                                for workSpecs in workSpecsList:
                                    if configID is None and len(workSpecs) > 0:
                                        configID = workSpecs[0].configID
                                    for workSpec in workSpecs:
                                        if workSpec.pandaid_list is None:
                                            _jobspec_list = workSpec.get_jobspec_list(
                                            )
                                            if _jobspec_list is not None:
                                                workSpec.pandaid_list = [
                                                    j.PandaID
                                                    for j in workSpec.
                                                    get_jobspec_list()
                                                ]
                                            else:
                                                workSpec.pandaid_list = []
                                            workSpec.force_update(
                                                'pandaid_list')
                                retVal = self.monitor_agent_core(
                                    lockedBy,
                                    queueName,
                                    workSpecsList,
                                    from_fifo=True,
                                    config_id=configID)
                                if retVal is not None:
                                    workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                    try:
                                        if len(obj_to_enqueue_dict[queueName]
                                               [0]) + len(
                                                   workSpecsToEnqueue
                                               ) <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_dict[queueName][
                                                0].extend(workSpecsToEnqueue)
                                            obj_to_enqueue_dict[queueName][
                                                1] = max(
                                                    obj_to_enqueue_dict[
                                                        queueName][1],
                                                    timeNow_timestamp)
                                            obj_to_enqueue_dict[queueName][
                                                2] = max(
                                                    obj_to_enqueue_dict[
                                                        queueName][2],
                                                    fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_dict[
                                                queueName] = [
                                                    workSpecsToEnqueue,
                                                    timeNow_timestamp,
                                                    fifoCheckInterval
                                                ]
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to gather workers for FIFO: {0}'
                                            .format(errStr))
                                        to_break = True
                                    try:
                                        if len(obj_to_enqueue_to_head_dict[
                                                queueName][0]) + len(
                                                    workSpecsToEnqueueToHead
                                                ) <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_to_head_dict[
                                                queueName][0].extend(
                                                    workSpecsToEnqueueToHead)
                                            obj_to_enqueue_to_head_dict[
                                                queueName][1] = max(
                                                    obj_to_enqueue_to_head_dict[
                                                        queueName][1],
                                                    timeNow_timestamp)
                                            obj_to_enqueue_to_head_dict[
                                                queueName][2] = max(
                                                    obj_to_enqueue_to_head_dict[
                                                        queueName][2],
                                                    fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_to_head_dict[
                                                queueName] = [
                                                    workSpecsToEnqueueToHead,
                                                    timeNow_timestamp,
                                                    fifoCheckInterval
                                                ]
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to gather workers for FIFO head: {0}'
                                            .format(errStr))
                                        to_break = True
                                    mainLog.debug(
                                        'checked {0} workers from FIFO'.format(
                                            len(workSpecsList)) +
                                        sw.get_elapsed_time())
                                else:
                                    mainLog.debug(
                                        'monitor_agent_core returned None. Skipped putting to FIFO'
                                    )
                                if sw_fifo.get_elapsed_time_in_sec(
                                ) > harvester_config.monitor.lockInterval:
                                    mainLog.warning(
                                        'a single FIFO cycle was longer than lockInterval '
                                        + sw_fifo.get_elapsed_time())
                                else:
                                    mainLog.debug('done a FIFO cycle' +
                                                  sw_fifo.get_elapsed_time())
                                    n_loops_hit += 1
                                if to_break:
                                    break
                            else:
                                mainLog.debug('got nothing in FIFO')
                    else:
                        mainLog.debug(
                            'workers in FIFO too young to check. Skipped')
                        if self.singleMode:
                            break
                        if overhead_time is not None:
                            time.sleep(
                                max(-overhead_time * random.uniform(0.1, 1),
                                    adjusted_sleepTime))
                        else:
                            time.sleep(
                                max(fifoCheckDuration * random.uniform(0.1, 1),
                                    adjusted_sleepTime))
                mainLog.debug(
                    'run {0} loops, including {1} FIFO cycles'.format(
                        n_loops, n_loops_hit))

                # enqueue to fifo
                sw.reset()
                n_chunk_put = 0
                mainLog.debug('putting worker chunks to FIFO')
                for _dct in (obj_to_enqueue_dict,
                             remaining_obj_to_enqueue_dict):
                    for queueName, obj_to_enqueue in iteritems(_dct):
                        try:
                            workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue
                            if workSpecsToEnqueue:
                                score = fifoCheckInterval + timeNow_timestamp
                                monitor_fifo.put(
                                    (queueName, workSpecsToEnqueue), score)
                                n_chunk_put += 1
                                mainLog.info(
                                    'put a chunk of {0} workers of {1} to FIFO with score {2}'
                                    .format(len(workSpecsToEnqueue), queueName,
                                            score))
                        except Exception as errStr:
                            mainLog.error(
                                'failed to put object from FIFO: {0}'.format(
                                    errStr))
                mainLog.debug('putting worker chunks to FIFO head')
                for _dct in (obj_to_enqueue_to_head_dict,
                             remaining_obj_to_enqueue_to_head_dict):
                    for queueName, obj_to_enqueue_to_head in iteritems(_dct):
                        try:
                            workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head
                            if workSpecsToEnqueueToHead:
                                score = fifoCheckInterval + timeNow_timestamp - 2**32
                                monitor_fifo.put(
                                    (queueName, workSpecsToEnqueueToHead),
                                    score)
                                n_chunk_put += 1
                                mainLog.info(
                                    'put a chunk of {0} workers of {1} to FIFO with score {2}'
                                    .format(len(workSpecsToEnqueueToHead),
                                            queueName, score))
                        except Exception as errStr:
                            mainLog.error(
                                'failed to put object from FIFO head: {0}'.
                                format(errStr))
                # release protective dequeued objects
                if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0:
                    monitor_fifo.release(ids=obj_dequeued_id_list)
                mainLog.debug(
                    'put {0} worker chunks into FIFO'.format(n_chunk_put) +
                    sw.get_elapsed_time())
                # adjust adjusted_sleepTime
                if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime:
                    speedup_factor = (sum_overhead_time_stat - sleepTime) / (
                        n_chunk_peeked_stat *
                        harvester_config.monitor.checkInterval)
                    speedup_factor = max(speedup_factor, 0)
                    adjusted_sleepTime = adjusted_sleepTime / (1. +
                                                               speedup_factor)
                elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0:
                    adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2
                mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format(
                    adjusted_sleepTime))
                # end run with fifo
                mainLog.debug('ended run with FIFO')
            # time the cycle
            mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time())

            # check if being terminated
            if self.terminated(adjusted_sleepTime):
                mainLog.debug('terminated')
                return
# ---- Example #20 ----
 def run(self):
     """Main loop of the stager agent.

     Each cycle makes three passes over jobs fetched from the DB proxy:
       1. check the status of jobs already in 'transferring',
       2. trigger stage-out for 'to_transfer' jobs with output,
       3. zip output for 'to_transfer' jobs flagged for zip output,
     delegating the actual work to the queue's stager plugin and recording
     the resulting substatus via update_job_for_stage_out.
     Returns when self.terminated(...) signals shutdown.
     """
     # lock-owner identifier so concurrent stager threads/processes
     # do not pick up the same job
     lockedBy = 'stager-{0}'.format(self.get_pid())
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
         mainLog.debug('try to get jobs to check')
         # get jobs in 'transferring' whose stage-out status needs checking
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
         except Exception:
             # config attribute is optional; None means no per-job file limit
             maxFilesPerJob = None
         jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck,
                                                           harvester_config.stager.checkInterval,
                                                           harvester_config.stager.lockInterval,
                                                           lockedBy, 'transferring',
                                                           JobSpec.HO_hasTransfer,
                                                           max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
         # loop over all jobs
         for jobSpec in jobsToCheck:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('start checking')
                 # configID is only honored when dynamic plugin change is enabled
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # re-take the DB lock right before acting, in case another
                 # thread grabbed the job since the fetch above
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                 # check result: True=done, False=fatal, None/other=still on-going
                 if tmpStat is True:
                     # succeeded
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                     tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus))
                 elif tmpStat is False:
                     # fatal error
                     tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr))
                     # mark all not-yet-finished output files as failed
                     for fileSpec in jobSpec.outFiles:
                         if fileSpec.status != 'finished':
                             fileSpec.status = 'failed'
                     errStr = 'stage-out failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                     jobSpec.trigger_propagation()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                     tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                 else:
                     # on-going
                     tmpLog.debug('try to check later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to trigger stage-out
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
         except Exception:
             # optional config attribute; None means no per-job file limit
             maxFilesPerJob = None
         jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger,
                                                             harvester_config.stager.triggerInterval,
                                                             harvester_config.stager.lockInterval,
                                                             lockedBy, 'to_transfer',
                                                             JobSpec.HO_hasOutput,
                                                             JobSpec.HO_hasZipOutput,
                                                             max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
         # loop over all jobs
         for jobSpec in jobsToTrigger:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to trigger stage-out')
                 # configID is only honored when dynamic plugin change is enabled
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # re-take the DB lock right before acting
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # trigger stage-out
                 tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                 # check result: True=triggered, False=fatal, None/other=retry later
                 if tmpStat is True:
                     # succeeded
                     jobSpec.all_files_triggered_to_stage_out()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                     tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus))
                 elif tmpStat is False:
                     # fatal error
                     tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr))
                     # mark all not-yet-finished output files as failed
                     for fileSpec in jobSpec.outFiles:
                         if fileSpec.status != 'finished':
                             fileSpec.status = 'failed'
                     errStr = 'stage-out failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                     jobSpec.trigger_propagation()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                     tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                 else:
                     # temporary error
                     tmpLog.debug('try to trigger later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to zip output
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
         except Exception:
             # optional config attribute; None means no per-job file limit
             maxFilesPerJob = None
         try:
             zipInterval = harvester_config.stager.zipInterval
         except Exception:
             # fall back to the trigger interval when zipInterval is not configured
             zipInterval = harvester_config.stager.triggerInterval
         jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip,
                                                         zipInterval,
                                                         harvester_config.stager.lockInterval,
                                                         lockedBy, 'to_transfer',
                                                         JobSpec.HO_hasZipOutput,
                                                         JobSpec.HO_hasOutput,
                                                         max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
         # loop over all jobs
         for jobSpec in jobsToZip:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to zip output')
                 # configID is only honored when dynamic plugin change is enabled
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # re-take the DB lock right before acting
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # zip output files
                 tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                 # succeeded
                 if tmpStat is True:
                     # update job; second arg False here (unlike check/trigger passes above)
                     jobSpec.all_files_zipped()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy)
                     tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus))
                 else:
                     # failed
                     tmpLog.debug('failed to zip with {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.stager.sleepTime):
             mainLog.debug('terminated')
             return
# ---- Example #21 ----
 def run(self):
     """Main loop of the job fetcher agent.

     Each cycle asks the DB proxy how many jobs each queue needs, fetches
     that many job dictionaries from the communicator, converts them into
     JobSpec/FileSpec objects (attaching input-file specs and optional
     auxiliary inputs from an extractor plugin), and inserts them into the
     local DB. Returns when self.terminated(...) signals shutdown.
     """
     while True:
         mainLog = self.make_logger(_logger,
                                    'id={0}'.format(self.get_pid()),
                                    method_name='run')
         mainLog.debug('getting number of jobs to be fetched')
         # get number of jobs to be fetched per queue
         nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(
             harvester_config.jobfetcher.nQueues,
             harvester_config.jobfetcher.lookupTime)
         mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))
         # loop over all queues
         for queueName, nJobs in iteritems(nJobsPerQueue):
             # skip queues with no local configuration
             if not self.queueConfigMapper.has_queue(queueName):
                 continue
             tmpLog = self.make_logger(_logger,
                                       'queueName={0}'.format(queueName),
                                       method_name='run')
             # get queue
             queueConfig = self.queueConfigMapper.get_queue(queueName)
             # cap the request at the configured maximum
             if nJobs > harvester_config.jobfetcher.maxJobs:
                 nJobs = harvester_config.jobfetcher.maxJobs
             # pick a prodSourceLabel at random, weighted per-mille by
             # queueConfig.prodSourceLabelRandomWeightsPermille (if set)
             default_prodSourceLabel = queueConfig.get_source_label()
             pdpm = getattr(queueConfig,
                            'prodSourceLabelRandomWeightsPermille', {})
             choice_list = core_utils.make_choice_list(
                 pdpm=pdpm, default=default_prodSourceLabel)
             prodSourceLabel = random.choice(choice_list)
             tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(
                 nJobs, prodSourceLabel))
             sw = core_utils.get_stopwatch()
             siteName = queueConfig.siteName
             jobs, errStr = self.communicator.get_jobs(
                 siteName, self.nodeName, prodSourceLabel, self.nodeName,
                 nJobs, queueConfig.getJobCriteria)
             tmpLog.info('got {0} jobs with {1} {2}'.format(
                 len(jobs), errStr, sw.get_elapsed_time()))
             # convert to JobSpec
             if len(jobs) > 0:
                 # get extractor plugin (optional; supplies auxiliary inputs)
                 if hasattr(queueConfig, 'extractor'):
                     extractorCore = self.pluginFactory.get_plugin(
                         queueConfig.extractor)
                 else:
                     extractorCore = None
                 jobSpecs = []
                 # cache of per-LFN file status to avoid repeated DB lookups
                 # within this batch of jobs
                 fileStatMap = dict()
                 sw_startconvert = core_utils.get_stopwatch()
                 for job in jobs:
                     timeNow = datetime.datetime.utcnow()
                     jobSpec = JobSpec()
                     jobSpec.convert_job_json(job)
                     jobSpec.computingSite = queueName
                     jobSpec.status = 'starting'
                     jobSpec.subStatus = 'fetched'
                     jobSpec.creationTime = timeNow
                     jobSpec.stateChangeTime = timeNow
                     jobSpec.configID = queueConfig.configID
                     jobSpec.set_one_attribute(
                         'schedulerID', 'harvester-{0}'.format(
                             harvester_config.master.harvester_id))
                     # queue-level zipPerMB applies only when the job did not set one
                     if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                         jobSpec.zipPerMB = queueConfig.zipPerMB
                     # regular input files, plus aux inputs from the extractor if any
                     fileGroupDictList = [
                         jobSpec.get_input_file_attributes()
                     ]
                     if extractorCore is not None:
                         fileGroupDictList.append(
                             extractorCore.get_aux_inputs(jobSpec))
                     for fileGroupDict in fileGroupDictList:
                         for tmpLFN, fileAttrs in iteritems(fileGroupDict):
                             # check file status (cached per LFN)
                             if tmpLFN not in fileStatMap:
                                 fileStatMap[
                                     tmpLFN] = self.dbProxy.get_file_status(
                                         tmpLFN, 'input',
                                         queueConfig.ddmEndpointIn,
                                         'starting')
                             # make file spec
                             fileSpec = FileSpec()
                             fileSpec.PandaID = jobSpec.PandaID
                             fileSpec.taskID = jobSpec.taskID
                             fileSpec.lfn = tmpLFN
                             fileSpec.endpoint = queueConfig.ddmEndpointIn
                             fileSpec.scope = fileAttrs['scope']
                             # set preparing to skip stage-in if the file is (being) taken care of by another job
                             if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \
                                     or 'to_prepare' in fileStatMap[tmpLFN]:
                                 fileSpec.status = 'preparing'
                             else:
                                 fileSpec.status = 'to_prepare'
                             # keep the cached status counts in sync with the choice above
                             if fileSpec.status not in fileStatMap[tmpLFN]:
                                 fileStatMap[tmpLFN][fileSpec.status] = 0
                             fileStatMap[tmpLFN][fileSpec.status] += 1
                             # INTERNAL_* attributes mark auxiliary inputs from the extractor
                             if 'INTERNAL_FileType' in fileAttrs:
                                 fileSpec.fileType = fileAttrs[
                                     'INTERNAL_FileType']
                                 jobSpec.auxInput = JobSpec.AUX_hasAuxInput
                             else:
                                 fileSpec.fileType = 'input'
                             if 'INTERNAL_URL' in fileAttrs:
                                 fileSpec.url = fileAttrs['INTERNAL_URL']
                             jobSpec.add_in_file(fileSpec)
                     jobSpec.trigger_propagation()
                     jobSpecs.append(jobSpec)
                 # insert to DB
                 tmpLog.debug("Converting of {0} jobs {1}".format(
                     len(jobs), sw_startconvert.get_elapsed_time()))
                 sw_insertdb = core_utils.get_stopwatch()
                 self.dbProxy.insert_jobs(jobSpecs)
                 tmpLog.debug('Insert of {0} jobs {1}'.format(
                     len(jobSpecs), sw_insertdb.get_elapsed_time()))
         mainLog.debug('done')
         # check if being terminated
         if self.terminated(harvester_config.jobfetcher.sleepTime):
             mainLog.debug('terminated')
             return
# ---- Example #22 ----
 def run(self):
     lockedBy = 'stager-{0}'.format(self.ident)
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
         mainLog.debug('try to get jobs to check')
         # get jobs to check preparation
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
         except Exception:
             maxFilesPerJob = None
         jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck,
                                                           harvester_config.stager.checkInterval,
                                                           harvester_config.stager.lockInterval,
                                                           lockedBy, 'transferring',
                                                           JobSpec.HO_hasTransfer,
                                                           max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
         # loop over all jobs
         for jobSpec in jobsToCheck:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('start checking')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                 # check result
                 if tmpStat is True:
                     # succeeded
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True)
                     tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus))
                 elif tmpStat is False:
                     # fatal error
                     tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr))
                     # update job
                     for fileSpec in jobSpec.outFiles:
                         if fileSpec.status != 'finished':
                             fileSpec.status = 'failed'
                     errStr = 'stage-out failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                     jobSpec.trigger_propagation()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True)
                     tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                 else:
                     # on-going
                     tmpLog.debug('try to check later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to trigger stage-out
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
         except Exception:
             maxFilesPerJob = None
         jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger,
                                                             harvester_config.stager.triggerInterval,
                                                             harvester_config.stager.lockInterval,
                                                             lockedBy, 'to_transfer',
                                                             JobSpec.HO_hasOutput,
                                                             JobSpec.HO_hasZipOutput,
                                                             max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
         # loop over all jobs
         for jobSpec in jobsToTrigger:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to trigger stage-out')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # trigger stage-out
                 tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                 # check result
                 if tmpStat is True:
                     # succeeded
                     jobSpec.all_files_triggered_to_stage_out()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True)
                     tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus))
                 elif tmpStat is False:
                     # fatal error
                     tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr))
                     # update job
                     for fileSpec in jobSpec.outFiles:
                         if fileSpec.status != 'finished':
                             fileSpec.status = 'failed'
                     errStr = 'stage-out failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                     jobSpec.trigger_propagation()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True)
                     tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                 else:
                     # temporary error
                     tmpLog.debug('try to trigger later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to zip output
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
         except Exception:
             maxFilesPerJob = None
         try:
             zipInterval = harvester_config.stager.zipInterval
         except Exception:
             zipInterval = harvester_config.stager.triggerInterval
         jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip,
                                                         zipInterval,
                                                         harvester_config.stager.lockInterval,
                                                         lockedBy, 'to_transfer',
                                                         JobSpec.HO_hasZipOutput,
                                                         JobSpec.HO_hasOutput,
                                                         max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
         # loop over all jobs
         for jobSpec in jobsToZip:
             tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to zip output')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                     tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                              configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # trigger preparation
                 tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                 # succeeded
                 if tmpStat is True:
                     # update job
                     jobSpec.all_files_zipped()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False)
                     tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus))
                 else:
                     # failed
                     tmpLog.debug('failed to zip with {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.stager.sleepTime):
             mainLog.debug('terminated')
             return
Example #23
0
 def run(self):
     """Main agent loop: fetch new jobs for queues that need them.

     Each cycle asks the DB how many jobs every active queue still needs,
     pulls that many jobs from the communicator, converts them to JobSpec
     objects (attaching input FileSpecs), and bulk-inserts them into the DB.
     Runs until terminated() signals shutdown.
     """
     while True:
         mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
         mainLog.debug('getting number of jobs to be fetched')
         # per-queue demand: how many jobs each queue should fetch this cycle
         jobs_per_queue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues,
                                                             harvester_config.jobfetcher.lookupTime)
         mainLog.debug('got {0} queues'.format(len(jobs_per_queue)))
         for queue_name, n_jobs in iteritems(jobs_per_queue):
             # skip queues without a configuration
             if not self.queueConfigMapper.has_queue(queue_name):
                 continue
             tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queue_name),
                                       method_name='run')
             queue_config = self.queueConfigMapper.get_queue(queue_name)
             # cap the request at the configured maximum
             n_jobs = min(n_jobs, harvester_config.jobfetcher.maxJobs)
             tmpLog.debug('getting {0} jobs'.format(n_jobs))
             sw = core_utils.get_stopwatch()
             site_name = queue_config.siteName
             jobs, err_str = self.communicator.get_jobs(site_name, self.nodeName,
                                                        queue_config.get_source_label(),
                                                        self.nodeName, n_jobs,
                                                        queue_config.getJobCriteria)
             tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), err_str, sw.get_elapsed_time()))
             if jobs:
                 # convert raw job dicts into JobSpec objects
                 job_specs = []
                 # cache of per-LFN file status, shared across jobs in this batch
                 file_stat_map = dict()
                 sw_convert = core_utils.get_stopwatch()
                 for job in jobs:
                     now = datetime.datetime.utcnow()
                     job_spec = JobSpec()
                     job_spec.convert_job_json(job)
                     job_spec.computingSite = queue_name
                     job_spec.status = 'starting'
                     job_spec.subStatus = 'fetched'
                     job_spec.creationTime = now
                     job_spec.stateChangeTime = now
                     job_spec.configID = queue_config.configID
                     job_spec.set_one_attribute('schedulerID',
                                                'harvester-{0}'.format(harvester_config.master.harvester_id))
                     if queue_config.zipPerMB is not None and job_spec.zipPerMB is None:
                         job_spec.zipPerMB = queue_config.zipPerMB
                     for lfn, file_attrs in iteritems(job_spec.get_input_file_attributes()):
                         # look up the file status once per LFN
                         if lfn not in file_stat_map:
                             file_stat_map[lfn] = self.dbProxy.get_file_status(lfn, 'input',
                                                                               queue_config.ddmEndpointIn,
                                                                               'starting')
                         # build the input FileSpec
                         file_spec = FileSpec()
                         file_spec.PandaID = job_spec.PandaID
                         file_spec.taskID = job_spec.taskID
                         file_spec.lfn = lfn
                         file_spec.endpoint = queue_config.ddmEndpointIn
                         file_spec.scope = file_attrs['scope']
                         # set preparing to skip stage-in if the file is (being) taken care of by another job
                         if any(st in file_stat_map[lfn] for st in ('ready', 'preparing', 'to_prepare')):
                             file_spec.status = 'preparing'
                         else:
                             file_spec.status = 'to_prepare'
                         file_stat_map[lfn].setdefault(file_spec.status, 0)
                         file_stat_map[lfn][file_spec.status] += 1
                         file_spec.fileType = 'input'
                         job_spec.add_in_file(file_spec)
                     job_spec.trigger_propagation()
                     job_specs.append(job_spec)
                 # bulk insert into the DB
                 tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs), sw_convert.get_elapsed_time()))
                 sw_insert = core_utils.get_stopwatch()
                 self.dbProxy.insert_jobs(job_specs)
                 tmpLog.debug('Insert of {0} jobs {1}'.format(len(job_specs), sw_insert.get_elapsed_time()))
         mainLog.debug('done')
         # check if being terminated
         if self.terminated(harvester_config.jobfetcher.sleepTime):
             mainLog.debug('terminated')
             return
Example #24
0
    def _put_to_monitor_fifo(self, main_log, queue_name, work_specs, score, head=False):
        """Put a batch of workers into the monitor FIFO with the given score.

        :param main_log: logger used for debug/info/error reporting
        :param queue_name: name of the queue the workers belong to
        :param work_specs: list of worker-spec lists to enqueue
        :param score: FIFO priority score (lower is checked earlier)
        :param head: True when targeting the FIFO head (log messages only)
        """
        # suffix keeps the emitted log text identical to the historical messages
        suffix = ' head' if head else ''
        main_log.debug('putting workers to FIFO{0}'.format(suffix))
        try:
            self.monitor_fifo.put((queue_name, work_specs), score)
            main_log.info('put workers of {0} to FIFO with score {1}'.format(queue_name, score))
        except Exception as errStr:
            main_log.error('failed to put object from FIFO{0}: {1}'.format(suffix, errStr))

    def run(self):
        """Monitor agent main loop.

        Alternates between two cycles:
        * DB cycle (at most once per monitor.sleepTime): fetch workers to
          update from the DB, run monitor_agent_core on them, and optionally
          enqueue them into the monitor FIFO for finer-grained checking.
        * FIFO cycle (when the FIFO is enabled): pop a worker batch from the
          FIFO, monitor it, and re-enqueue as directed by monitor_agent_core.

        Runs until terminated() signals shutdown.
        """
        lockedBy = 'monitor-{0}'.format(self.ident)
        # init messengers (just import for module initialization)
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        last_DB_cycle_timestamp = 0
        monitor_fifo = self.monitor_fifo
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')

            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime:
                # run with workers from DB
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                    harvester_config.monitor.maxWorkers,
                    harvester_config.monitor.checkInterval,
                    harvester_config.monitor.lockInterval, lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy,
                                                         queueName,
                                                         workSpecsList,
                                                         config_id=configID)
                        if self.monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                self._put_to_monitor_fifo(mainLog, queueName, workSpecsToEnqueue,
                                                          fifoCheckInterval + timeNow_timestamp)
                            if workSpecsToEnqueueToHead:
                                self._put_to_monitor_fifo(mainLog, queueName, workSpecsToEnqueueToHead,
                                                          fifoCheckInterval - timeNow_timestamp,
                                                          head=True)
                last_DB_cycle_timestamp = time.time()
                mainLog.debug('ended run with DB')
            elif self.monitor_fifo.enabled:
                # run with workers from FIFO
                if monitor_fifo.to_check_workers():
                    # check fifo size
                    fifo_size = monitor_fifo.size()
                    mainLog.debug('FIFO size is {0}'.format(fifo_size))
                    mainLog.debug('starting run with FIFO')
                    try:
                        obj_gotten = monitor_fifo.get(timeout=1)
                    except Exception as errStr:
                        mainLog.error('failed to get object from FIFO: {0}'.format(errStr))
                    else:
                        if obj_gotten is not None:
                            queueName, workSpecsList = obj_gotten
                            mainLog.debug('got {0} workers of {1}'.format(
                                len(workSpecsList), queueName))
                            configID = workSpecsList[0][0].configID
                            # backfill pandaid_list for workers that lack it
                            for workSpecs in workSpecsList:
                                for workSpec in workSpecs:
                                    if workSpec.pandaid_list is None:
                                        _jobspec_list = workSpec.get_jobspec_list()
                                        if _jobspec_list is not None:
                                            # reuse the list already fetched above
                                            workSpec.pandaid_list = [j.PandaID for j in _jobspec_list]
                                        else:
                                            workSpec.pandaid_list = []
                                        workSpec.force_update('pandaid_list')
                            retVal = self.monitor_agent_core(
                                lockedBy,
                                queueName,
                                workSpecsList,
                                from_fifo=True,
                                config_id=configID)
                            if retVal is not None:
                                workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                if workSpecsToEnqueue:
                                    self._put_to_monitor_fifo(mainLog, queueName, workSpecsToEnqueue,
                                                              fifoCheckInterval + timeNow_timestamp)
                                if workSpecsToEnqueueToHead:
                                    self._put_to_monitor_fifo(mainLog, queueName, workSpecsToEnqueueToHead,
                                                              fifoCheckInterval - timeNow_timestamp,
                                                              head=True)
                            else:
                                mainLog.debug(
                                    'monitor_agent_core returned None. Skipped putting to FIFO'
                                )
                        else:
                            mainLog.debug('got nothing in FIFO')
                    mainLog.debug('ended run with FIFO')
                else:
                    mainLog.debug(
                        'workers in FIFO too young to check. Skipped')

            # warn when a single cycle exceeded the lock interval
            if sw.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                mainLog.warning(
                    'a single cycle was longer than lockInterval ' +
                    sw.get_elapsed_time())
            else:
                mainLog.debug('done' + sw.get_elapsed_time())

            # check if being terminated
            sleepTime = (harvester_config.monitor.fifoSleepTimeMilli / 1000.0) \
                            if self.monitor_fifo.enabled else harvester_config.monitor.sleepTime
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return