def get_work_attributes(self, workspec):
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='get_work_attributes')
    allRetDict = dict()
    numofreads = 0
    sw_readreports = core_utils.get_stopwatch()
    for pandaID in workspec.pandaid_list:
        # look for the json just under the access point
        accessPoint = self.get_access_point(workspec, pandaID)
        jsonFilePath = os.path.join(accessPoint, jsonAttrsFileName)
        tmpLog.debug('looking for attributes file {0}'.format(jsonFilePath))
        retDict = dict()
        if not os.path.exists(jsonFilePath):
            # not found
            tmpLog.debug('not found attributes file')
        else:
            try:
                with open(jsonFilePath) as jsonFile:
                    retDict = json.load(jsonFile)
            except Exception:
                tmpLog.debug('failed to load {0}'.format(jsonFilePath))
        # look for job report
        jsonFilePath = os.path.join(accessPoint, jsonJobReport)
        tmpLog.debug('looking for job report file {0}'.format(jsonFilePath))
        sw_checkjobrep = core_utils.get_stopwatch()
        if not os.path.exists(jsonFilePath):
            # not found
            tmpLog.debug('not found job report file')
        else:
            try:
                sw_readrep = core_utils.get_stopwatch()
                with open(jsonFilePath) as jsonFile:
                    tmpDict = json.load(jsonFile)
                retDict['metaData'] = tmpDict
                tmpLog.debug('got {0} kB of job report. {1} sec.'.format(
                    os.stat(jsonFilePath).st_size / 1024, sw_readrep.get_elapsed_time()))
                numofreads += 1
            except Exception:
                tmpLog.debug('failed to load {0}'.format(jsonFilePath))
        tmpLog.debug("Check file and read file time: {0} sec.".format(sw_checkjobrep.get_elapsed_time()))
        allRetDict[pandaID] = retDict
    tmpLog.debug("Reading {0} job report files {1}".format(numofreads, sw_readreports.get_elapsed_time()))
    return allRetDict
def post_ssl(self, path, data, cert=None, base_url=None):
    try:
        tmpLog = None
        if self.verbose:
            tmpLog = self.make_logger(method_name='post_ssl')
            if self.useInspect:
                tmpExec = inspect.stack()[1][3]
                tmpExec += '/'
            tmpExec = str(uuid.uuid4())
        if base_url is None:
            base_url = harvester_config.pandacon.pandaURLSSL
        url = '{0}/{1}'.format(base_url, path)
        if self.verbose:
            tmpLog.debug('exec={0} URL={1} data={2}'.format(tmpExec, url, str(data)))
        if cert is None:
            cert = (harvester_config.pandacon.cert_file,
                    harvester_config.pandacon.key_file)
        session = get_http_adapter_with_random_dns_resolution()
        sw = core_utils.get_stopwatch()
        res = session.post(url,
                           data=data,
                           headers={"Accept": "application/json", "Connection": "close"},
                           timeout=harvester_config.pandacon.timeout,
                           verify=harvester_config.pandacon.ca_cert,
                           cert=cert)
        if self.verbose:
            tmpLog.debug('exec={0} code={1} {3}. return={2}'.format(
                tmpExec, res.status_code, res.text, sw.get_elapsed_time()))
        if res.status_code == 200:
            return True, res
        else:
            errMsg = 'StatusCode={0} {1}'.format(res.status_code, res.text)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        errMsg = "failed to post with {0}:{1} ".format(errType, errValue)
        errMsg += traceback.format_exc()
    return False, errMsg
def get_jobs(self, site_name, node_name, prod_source_label, computing_element, n_jobs,
             additional_criteria):
    # get logger
    tmpLog = self.make_logger('siteName={0}'.format(site_name), method_name='get_jobs')
    tmpLog.debug('try to get {0} jobs'.format(n_jobs))
    data = {}
    data['siteName'] = site_name
    data['node'] = node_name
    data['prodSourceLabel'] = prod_source_label
    data['computingElement'] = computing_element
    data['nJobs'] = n_jobs
    data['schedulerID'] = 'harvester-{0}'.format(harvester_config.master.harvester_id)
    if additional_criteria is not None:
        for tmpKey, tmpVal in additional_criteria:
            data[tmpKey] = tmpVal
    sw = core_utils.get_stopwatch()
    tmpStat, tmpRes = self.post_ssl('getJob', data)
    tmpLog.debug('getJob for {0} jobs {1}'.format(n_jobs, sw.get_elapsed_time()))
    errStr = 'OK'
    if tmpStat is False:
        errStr = core_utils.dump_error_message(tmpLog, tmpRes)
    else:
        try:
            tmpDict = tmpRes.json()
            tmpLog.debug('StatusCode={0}'.format(tmpDict['StatusCode']))
            if tmpDict['StatusCode'] == 0:
                tmpLog.debug('got {0} jobs'.format(len(tmpDict['jobs'])))
                return tmpDict['jobs'], errStr
            else:
                if 'errorDialog' in tmpDict:
                    errStr = tmpDict['errorDialog']
                else:
                    errStr = "StatusCode={0}".format(tmpDict['StatusCode'])
            return [], errStr
        except Exception:
            errStr = core_utils.dump_error_message(tmpLog, tmpRes)
    return [], errStr
def post_ssl(self, path, data, cert=None):
    try:
        tmpLog = None
        if self.verbose:
            tmpLog = self.make_logger(method_name='post_ssl')
            if self.useInspect:
                tmpExec = inspect.stack()[1][3]
                tmpExec += '/'
            tmpExec = str(uuid.uuid4())
        url = '{0}/{1}'.format(harvester_config.pandacon.pandaURLSSL, path)
        if self.verbose:
            tmpLog.debug('exec={0} URL={1} data={2}'.format(tmpExec, url, str(data)))
        if cert is None:
            cert = (harvester_config.pandacon.cert_file,
                    harvester_config.pandacon.key_file)
        sw = core_utils.get_stopwatch()
        res = requests.post(url,
                            data=data,
                            headers={"Accept": "application/json", "Connection": "close"},
                            timeout=harvester_config.pandacon.timeout,
                            verify=harvester_config.pandacon.ca_cert,
                            cert=cert)
        if self.verbose:
            tmpLog.debug('exec={0} code={1} {3}. return={2}'.format(
                tmpExec, res.status_code, res.text, sw.get_elapsed_time()))
        if res.status_code == 200:
            return True, res
        else:
            errMsg = 'StatusCode={0} {1}'.format(res.status_code, res.text)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        errMsg = "failed to post with {0}:{1} ".format(errType, errValue)
        errMsg += traceback.format_exc()
    return False, errMsg
def run(self):
    lockedBy = 'preparator-{0}'.format(self.ident)
    while True:
        sw = core_utils.get_stopwatch()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        mainLog.debug('try to get jobs to check')
        # get jobs to check preparation
        jobsToCheck = self.dbProxy.get_jobs_in_sub_status('preparing',
                                                          harvester_config.preparator.maxJobsToCheck,
                                                          'preparatorTime', 'lockedBy',
                                                          harvester_config.preparator.checkInterval,
                                                          harvester_config.preparator.lockInterval,
                                                          lockedBy)
        mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
        # loop over all jobs
        for jobSpec in jobsToCheck:
            tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
            try:
                tmpLog.debug('start checking')
                # configID
                configID = jobSpec.configID
                if not core_utils.dynamic_plugin_change():
                    configID = None
                # get queue
                if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                    tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, jobSpec.configID)
                oldSubStatus = jobSpec.subStatus
                # get plugin
                preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                if preparatorCore is None:
                    # not found
                    tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                    continue
                # lock job again
                lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                if not lockedAgain:
                    tmpLog.debug('skip since locked by another thread')
                    continue
                tmpStat, tmpStr = preparatorCore.check_status(jobSpec)
                # still running
                if tmpStat is None:
                    # update job
                    jobSpec.lockedBy = None
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('try to check later since still preparing with {0}'.format(tmpStr))
                    continue
                # succeeded
                if tmpStat is True:
                    # resolve path
                    tmpStat, tmpStr = preparatorCore.resolve_input_paths(jobSpec)
                    if tmpStat is False:
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                        tmpLog.error('failed to resolve input file paths : {0}'.format(tmpStr))
                        continue
                    # update job
                    jobSpec.subStatus = 'prepared'
                    jobSpec.lockedBy = None
                    jobSpec.preparatorTime = None
                    jobSpec.set_all_input_ready()
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus},
                                            update_in_file=True)
                    tmpLog.debug('succeeded')
                else:
                    # update job
                    jobSpec.status = 'failed'
                    jobSpec.subStatus = 'failed_to_prepare'
                    jobSpec.lockedBy = None
                    jobSpec.preparatorTime = None
                    jobSpec.stateChangeTime = datetime.datetime.utcnow()
                    errStr = 'stage-in failed with {0}'.format(tmpStr)
                    jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr)
                    jobSpec.trigger_propagation()
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.error('failed with {0}'.format(tmpStr))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        # get jobs to trigger preparation
        mainLog.debug('try to get jobs to prepare')
        jobsToTrigger = self.dbProxy.get_jobs_in_sub_status('fetched',
                                                            harvester_config.preparator.maxJobsToTrigger,
                                                            'preparatorTime', 'lockedBy',
                                                            harvester_config.preparator.triggerInterval,
                                                            harvester_config.preparator.lockInterval,
                                                            lockedBy, 'preparing')
        mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger)))
        # loop over all jobs
        fileStatMap = dict()
        for jobSpec in jobsToTrigger:
            tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
            try:
                tmpLog.debug('try to trigger preparation')
                # configID
                configID = jobSpec.configID
                if not core_utils.dynamic_plugin_change():
                    configID = None
                # get queue
                if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                    tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                oldSubStatus = jobSpec.subStatus
                # get plugin
                preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                if preparatorCore is None:
                    # not found
                    tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                    continue
                # lock job again
                lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                if not lockedAgain:
                    tmpLog.debug('skip since locked by another thread')
                    continue
                # check file status
                if queueConfig.ddmEndpointIn not in fileStatMap:
                    fileStatMap[queueConfig.ddmEndpointIn] = dict()
                newFileStatusData = []
                toWait = False
                for fileSpec in jobSpec.inFiles:
                    if fileSpec.status == 'preparing':
                        updateStatus = False
                        if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]:
                            fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \
                                = self.dbProxy.get_file_status(fileSpec.lfn, 'input',
                                                               queueConfig.ddmEndpointIn, 'starting')
                        if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                            # the file is ready
                            fileSpec.status = 'ready'
                            # set group info if any
                            groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, 'input',
                                                                        queueConfig.ddmEndpointIn)
                            if groupInfo is not None:
                                fileSpec.groupID = groupInfo['groupID']
                                fileSpec.groupStatus = groupInfo['groupStatus']
                                fileSpec.groupUpdateTime = groupInfo['groupUpdateTime']
                            updateStatus = True
                        elif 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                            # the file is being prepared by another
                            toWait = True
                        else:
                            # change file status if the file is not prepared by another
                            fileSpec.status = 'to_prepare'
                            updateStatus = True
                        # set new status
                        if updateStatus:
                            newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status))
                            if fileSpec.status not in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                                fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] = 0
                            fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] += 1
                if len(newFileStatusData) > 0:
                    self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy)
                # wait since files are being prepared by another
                if toWait:
                    # update job
                    jobSpec.lockedBy = None
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('wait since files are being prepared by another job')
                    continue
                # trigger preparation
                tmpStat, tmpStr = preparatorCore.trigger_preparation(jobSpec)
                # check result
                if tmpStat is True:
                    # succeeded
                    jobSpec.subStatus = 'preparing'
                    jobSpec.lockedBy = None
                    jobSpec.preparatorTime = None
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus},
                                            update_in_file=True)
                    tmpLog.debug('triggered')
                elif tmpStat is False:
                    # fatal error
                    jobSpec.status = 'failed'
                    jobSpec.subStatus = 'failed_to_prepare'
                    jobSpec.lockedBy = None
                    jobSpec.preparatorTime = None
                    jobSpec.stateChangeTime = datetime.datetime.utcnow()
                    errStr = 'stage-in failed with {0}'.format(tmpStr)
                    jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr)
                    jobSpec.trigger_propagation()
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('failed to trigger with {0}'.format(tmpStr))
                else:
                    # temporary error
                    jobSpec.lockedBy = None
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('try to prepare later since {0}'.format(tmpStr))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        mainLog.debug('done' + sw.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.preparator.sleepTime):
            mainLog.debug('terminated')
            return
def run(self):
    lockedBy = 'preparator-{0}'.format(self.get_pid())
    while True:
        sw = core_utils.get_stopwatch()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        mainLog.debug('try to get jobs to check')
        # get jobs to check preparation
        try:
            maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToCheck
            if maxFilesPerJob <= 0:
                maxFilesPerJob = None
        except Exception:
            maxFilesPerJob = None
        jobsToCheck = self.dbProxy.get_jobs_in_sub_status('preparing',
                                                          harvester_config.preparator.maxJobsToCheck,
                                                          'preparatorTime', 'lockedBy',
                                                          harvester_config.preparator.checkInterval,
                                                          harvester_config.preparator.lockInterval,
                                                          lockedBy,
                                                          max_files_per_job=maxFilesPerJob,
                                                          ng_file_status_list=['ready'])
        mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
        # loop over all jobs
        for jobSpec in jobsToCheck:
            tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
            try:
                tmpLog.debug('start checking')
                # configID
                configID = jobSpec.configID
                if not core_utils.dynamic_plugin_change():
                    configID = None
                # get queue
                if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                    tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, jobSpec.configID)
                oldSubStatus = jobSpec.subStatus
                # get plugin
                if jobSpec.auxInput in [None, JobSpec.AUX_allTriggered]:
                    preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                else:
                    preparatorCore = self.pluginFactory.get_plugin(queueConfig.aux_preparator)
                if preparatorCore is None:
                    # not found
                    tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                    continue
                tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
                # lock job again
                lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                if not lockedAgain:
                    tmpLog.debug('skip since locked by another thread')
                    continue
                tmpStat, tmpStr = preparatorCore.check_stage_in_status(jobSpec)
                # still running
                if tmpStat is None:
                    # update job
                    jobSpec.lockedBy = None
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('try to check later since still preparing with {0}'.format(tmpStr))
                    continue
                # succeeded
                if tmpStat is True:
                    # resolve path
                    tmpStat, tmpStr = preparatorCore.resolve_input_paths(jobSpec)
                    if tmpStat is False:
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                        tmpLog.error('failed to resolve input file paths : {0}'.format(tmpStr))
                        continue
                    # manipulate container-related job params
                    jobSpec.manipulate_job_params_for_container()
                    # update job
                    jobSpec.lockedBy = None
                    jobSpec.set_all_input_ready()
                    if (maxFilesPerJob is None and jobSpec.auxInput is None) or \
                            (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inReady]):
                        # all done
                        allDone = True
                        jobSpec.subStatus = 'prepared'
                        jobSpec.preparatorTime = None
                        if jobSpec.auxInput is not None:
                            jobSpec.auxInput = JobSpec.AUX_allReady
                    else:
                        # immediate next lookup since there could be more files to check
                        allDone = False
                        jobSpec.trigger_preparation()
                        # change auxInput flag to check auxiliary inputs
                        if len(jobSpec.inFiles) == 0 and jobSpec.auxInput == JobSpec.AUX_allTriggered:
                            jobSpec.auxInput = JobSpec.AUX_inReady
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus},
                                            update_in_file=True)
                    if allDone:
                        tmpLog.debug('succeeded')
                    else:
                        tmpLog.debug('partially succeeded')
                else:
                    # update job
                    jobSpec.status = 'failed'
                    jobSpec.subStatus = 'failed_to_prepare'
                    jobSpec.lockedBy = None
                    jobSpec.preparatorTime = None
                    jobSpec.stateChangeTime = datetime.datetime.utcnow()
                    errStr = 'stage-in failed with {0}'.format(tmpStr)
                    jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr)
                    jobSpec.trigger_propagation()
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.error('failed with {0}'.format(tmpStr))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        # get jobs to trigger preparation
        mainLog.debug('try to get jobs to prepare')
        try:
            maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToPrepare
            if maxFilesPerJob <= 0:
                maxFilesPerJob = None
        except Exception:
            maxFilesPerJob = None
        jobsToTrigger = self.dbProxy.get_jobs_in_sub_status('fetched',
                                                            harvester_config.preparator.maxJobsToTrigger,
                                                            'preparatorTime', 'lockedBy',
                                                            harvester_config.preparator.triggerInterval,
                                                            harvester_config.preparator.lockInterval,
                                                            lockedBy, 'preparing',
                                                            max_files_per_job=maxFilesPerJob,
                                                            ng_file_status_list=['triggered', 'ready'])
        mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger)))
        # loop over all jobs
        fileStatMap = dict()
        for jobSpec in jobsToTrigger:
            tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
            try:
                tmpLog.debug('try to trigger preparation')
                # configID
                configID = jobSpec.configID
                if not core_utils.dynamic_plugin_change():
                    configID = None
                # get queue
                if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                    tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                oldSubStatus = jobSpec.subStatus
                # get plugin
                if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]:
                    preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator)
                    fileType = 'input'
                else:
                    preparatorCore = self.pluginFactory.get_plugin(queueConfig.aux_preparator)
                    fileType = FileSpec.AUX_INPUT
                if preparatorCore is None:
                    # not found
                    tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                    continue
                tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
                # lock job again
                lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy)
                if not lockedAgain:
                    tmpLog.debug('skip since locked by another thread')
                    continue
                # check file status
                if queueConfig.ddmEndpointIn not in fileStatMap:
                    fileStatMap[queueConfig.ddmEndpointIn] = dict()
                # check if has to_prepare
                hasToPrepare = False
                for fileSpec in jobSpec.inFiles:
                    if fileSpec.status == 'to_prepare':
                        hasToPrepare = True
                        break
                newFileStatusData = []
                toWait = False
                newInFiles = []
                for fileSpec in jobSpec.inFiles:
                    if fileSpec.status in ['preparing', 'to_prepare']:
                        newInFiles.append(fileSpec)
                        updateStatus = False
                        if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]:
                            fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \
                                = self.dbProxy.get_file_status(fileSpec.lfn, fileType,
                                                               queueConfig.ddmEndpointIn, 'starting')
                        if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                            # the file is ready
                            fileSpec.status = 'ready'
                            if fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path']:
                                fileSpec.path = list(
                                    fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path'])[0]
                            # set group info if any
                            groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, fileType,
                                                                        queueConfig.ddmEndpointIn)
                            if groupInfo is not None:
                                fileSpec.groupID = groupInfo['groupID']
                                fileSpec.groupStatus = groupInfo['groupStatus']
                                fileSpec.groupUpdateTime = groupInfo['groupUpdateTime']
                            updateStatus = True
                        elif (not hasToPrepare and
                              'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \
                                'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]:
                            # the file is being prepared by another
                            toWait = True
                            if fileSpec.status != 'preparing':
                                fileSpec.status = 'preparing'
                                updateStatus = True
                        else:
                            # change file status if the file is not prepared by another
                            if fileSpec.status != 'to_prepare':
                                fileSpec.status = 'to_prepare'
                                updateStatus = True
                        # set new status
                        if updateStatus:
                            newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status))
                            fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn].setdefault(fileSpec.status, None)
                if len(newFileStatusData) > 0:
                    self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy)
                # wait since files are being prepared by another
                if toWait:
                    # update job
                    jobSpec.lockedBy = None
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('wait since files are being prepared by another job')
                    continue
                # trigger preparation
                tmpStat, tmpStr = preparatorCore.trigger_preparation(jobSpec)
                # check result
                if tmpStat is True:
                    # succeeded
                    jobSpec.lockedBy = None
                    if (maxFilesPerJob is None and jobSpec.auxInput is None) or \
                            (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inTriggered]):
                        # all done
                        allDone = True
                        jobSpec.subStatus = 'preparing'
                        jobSpec.preparatorTime = None
                        if jobSpec.auxInput is not None:
                            jobSpec.auxInput = JobSpec.AUX_allTriggered
                    else:
                        # change file status but not change job sub status since
                        # there could be more files to prepare
                        allDone = False
                        for fileSpec in jobSpec.inFiles:
                            if fileSpec.status == 'to_prepare':
                                fileSpec.status = 'triggered'
                        # immediate next lookup
                        jobSpec.trigger_preparation()
                        # change auxInput flag to prepare auxiliary inputs
                        if len(jobSpec.inFiles) == 0 and jobSpec.auxInput == JobSpec.AUX_hasAuxInput:
                            jobSpec.auxInput = JobSpec.AUX_inTriggered
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus},
                                            update_in_file=True)
                    if allDone:
                        tmpLog.debug('triggered')
                    else:
                        tmpLog.debug('partially triggered')
                elif tmpStat is False:
                    # fatal error
                    jobSpec.status = 'failed'
                    jobSpec.subStatus = 'failed_to_prepare'
                    jobSpec.lockedBy = None
                    jobSpec.preparatorTime = None
                    jobSpec.stateChangeTime = datetime.datetime.utcnow()
                    errStr = 'stage-in failed with {0}'.format(tmpStr)
                    jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr)
                    jobSpec.trigger_propagation()
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('failed to trigger with {0}'.format(tmpStr))
                else:
                    # temporary error
                    jobSpec.lockedBy = None
                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus})
                    tmpLog.debug('try to prepare later since {0}'.format(tmpStr))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        mainLog.debug('done' + sw.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.preparator.sleepTime):
            mainLog.debug('terminated')
            return
def update_jobs(self, jobspec_list, id):
    sw = core_utils.get_stopwatch()
    tmpLogG = self.make_logger('id={0}'.format(id), method_name='update_jobs')
    tmpLogG.debug('update {0} jobs'.format(len(jobspec_list)))
    retList = []
    # update events
    for jobSpec in jobspec_list:
        eventRanges, eventSpecs = jobSpec.to_event_data(max_events=10000)
        if eventRanges != []:
            tmpLogG.debug('update {0} events for PandaID={1}'.format(len(eventSpecs), jobSpec.PandaID))
            tmpRet = self.update_event_ranges(eventRanges, tmpLogG)
            if tmpRet['StatusCode'] == 0:
                for eventSpec, retVal in zip(eventSpecs, tmpRet['Returns']):
                    if retVal in [True, False] and eventSpec.is_final_status():
                        eventSpec.subStatus = 'done'
    # update jobs in bulk
    nLookup = 100
    iLookup = 0
    while iLookup < len(jobspec_list):
        dataList = []
        jobSpecSubList = jobspec_list[iLookup:iLookup + nLookup]
        for jobSpec in jobSpecSubList:
            data = jobSpec.get_job_attributes_for_panda()
            data['jobId'] = jobSpec.PandaID
            data['siteName'] = jobSpec.computingSite
            data['state'] = jobSpec.get_status()
            data['attemptNr'] = jobSpec.attemptNr
            data['jobSubStatus'] = jobSpec.subStatus
            # change cancelled to failed to be accepted by panda server
            if data['state'] in ['cancelled', 'missed']:
                if jobSpec.is_pilot_closed():
                    data['jobSubStatus'] = 'pilot_closed'
                else:
                    data['jobSubStatus'] = data['state']
                data['state'] = 'failed'
            if jobSpec.startTime is not None and 'startTime' not in data:
                data['startTime'] = jobSpec.startTime.strftime('%Y-%m-%d %H:%M:%S')
            if jobSpec.endTime is not None and 'endTime' not in data:
                data['endTime'] = jobSpec.endTime.strftime('%Y-%m-%d %H:%M:%S')
            if 'coreCount' not in data and jobSpec.nCore is not None:
                data['coreCount'] = jobSpec.nCore
            if jobSpec.is_final_status() and jobSpec.status == jobSpec.get_status():
                if jobSpec.metaData is not None:
                    data['metaData'] = json.dumps(jobSpec.metaData)
                if jobSpec.outputFilesToReport is not None:
                    data['xml'] = jobSpec.outputFilesToReport
            dataList.append(data)
        harvester_id = harvester_config.master.harvester_id
        tmpData = {'jobList': json.dumps(dataList), 'harvester_id': harvester_id}
        tmpStat, tmpRes = self.post_ssl('updateJobsInBulk', tmpData)
        retMaps = None
        errStr = ''
        if tmpStat is False:
            errStr = core_utils.dump_error_message(tmpLogG, tmpRes)
        else:
            try:
                tmpStat, retMaps = tmpRes.json()
                if tmpStat is False:
                    tmpLogG.error('updateJobsInBulk failed with {0}'.format(retMaps))
                    retMaps = None
            except Exception:
                errStr = core_utils.dump_error_message(tmpLogG)
        if retMaps is None:
            retMap = {}
            retMap['content'] = {}
            retMap['content']['StatusCode'] = 999
            retMap['content']['ErrorDiag'] = errStr
            retMaps = [json.dumps(retMap)] * len(jobSpecSubList)
        for jobSpec, retMap, data in zip(jobSpecSubList, retMaps, dataList):
            tmpLog = self.make_logger('id={0} PandaID={1}'.format(id, jobSpec.PandaID),
                                      method_name='update_jobs')
            try:
                retMap = json.loads(retMap['content'])
            except Exception:
                errStr = 'failed to load json'
                retMap = {}
                retMap['StatusCode'] = 999
                retMap['ErrorDiag'] = errStr
            tmpLog.debug('data={0}'.format(str(data)))
            tmpLog.debug('done with {0}'.format(str(retMap)))
            retList.append(retMap)
        iLookup += nLookup
    tmpLogG.debug('done' + sw.get_elapsed_time())
    return retList
def run(self):
    while True:
        sw = core_utils.get_stopwatch()
        mainLog = core_utils.make_logger(_logger, 'id={0}'.format(self.ident), method_name='run')
        mainLog.debug('getting jobs to propagate')
        jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs,
                                                      harvester_config.propagator.lockInterval,
                                                      harvester_config.propagator.updateInterval,
                                                      self.ident)
        mainLog.debug('got {0} jobs'.format(len(jobSpecs)))
        # update jobs in central database
        iJobs = 0
        nJobs = harvester_config.propagator.nJobsInBulk
        hbSuppressMap = dict()
        while iJobs < len(jobSpecs):
            jobList = jobSpecs[iJobs:iJobs + nJobs]
            iJobs += nJobs
            # collect jobs to update or check
            jobListToSkip = []
            jobListToUpdate = []
            jobListToCheck = []
            retList = []
            for tmpJobSpec in jobList:
                if tmpJobSpec.computingSite not in hbSuppressMap:
                    queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite)
                    hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
                # heartbeat is suppressed
                if tmpJobSpec.status in hbSuppressMap[tmpJobSpec.computingSite]:
                    # check running job to detect lost heartbeat
                    if tmpJobSpec.status == 'running':
                        jobListToCheck.append(tmpJobSpec)
                    else:
                        jobListToSkip.append(tmpJobSpec)
                        retList.append({'StatusCode': 0, 'command': None})
                else:
                    jobListToUpdate.append(tmpJobSpec)
            retList += self.communicator.check_jobs(jobListToCheck)
            retList += self.communicator.update_jobs(jobListToUpdate)
            # logging
            for tmpJobSpec, tmpRet in zip(jobListToSkip + jobListToCheck + jobListToUpdate, retList):
                if tmpRet['StatusCode'] == 0:
                    if tmpJobSpec in jobListToUpdate:
                        mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                              tmpJobSpec.status))
                    else:
                        mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                    tmpJobSpec.status))
                    # release job
                    tmpJobSpec.propagatorLock = None
                    if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status():
                        # unset to disable further updating
                        tmpJobSpec.propagatorTime = None
                        tmpJobSpec.subStatus = 'done'
                    else:
                        # check event availability
                        if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                tmpJobSpec.subStatus != 'submitted':
                            tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec)
                            if tmpEvStat and tmpEvRet == 0:
                                mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID))
                                tmpRet['command'] = 'tobekilled'
                        # got kill command
                        if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                            nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID)
                            if nWorkers == 0:
                                # no remaining workers
                                tmpJobSpec.status = 'cancelled'
                                tmpJobSpec.subStatus = 'killed'
                                tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                                tmpJobSpec.trigger_propagation()
                    self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.ident})
                else:
                    mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                   tmpJobSpec.status))
        mainLog.debug('getting workers to propagate')
        workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers,
                                                          harvester_config.propagator.updateInterval)
        mainLog.debug('got {0} workers'.format(len(workSpecs)))
        # update workers in central database
        iWorkers = 0
        nWorkers = harvester_config.propagator.nWorkersInBulk
        while iWorkers < len(workSpecs):
            workList = workSpecs[iWorkers:iWorkers + nWorkers]
            iWorkers += nWorkers
            retList, tmpErrStr = self.communicator.update_workers(workList)
            # logging
            if retList is None:
                mainLog.error('failed to update workers with {0}'.format(tmpErrStr))
            else:
                for tmpWorkSpec, tmpRet in zip(workList, retList):
                    if tmpRet:
                        mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                               tmpWorkSpec.status))
                        # update logs
                        for logFilePath, logOffset, logSize, logRemoteName in \
                                tmpWorkSpec.get_log_files_to_upload():
                            with open(logFilePath, 'rb') as logFileObj:
                                tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj,
                                                                                logOffset, logSize)
                                if tmpStat:
                                    tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset + logSize)
                        # disable further update
                        if tmpWorkSpec.is_final_status():
                            tmpWorkSpec.disable_propagation()
                        self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})
                    else:
                        mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                        tmpWorkSpec.status))
        mainLog.debug('getting commands')
        commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
        mainLog.debug('got {0} commands'.format(len(commandSpecs)))
        for commandSpec in commandSpecs:
            if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats):
                # get worker stats
                siteName = commandSpec.command.split(':')[-1]
                workerStats = self.dbProxy.get_worker_stats(siteName)
                if len(workerStats) == 0:
                    mainLog.error('failed to get worker stats for {0}'.format(siteName))
                else:
                    # report worker stats
                    tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats)
                    if tmpRet:
                        mainLog.debug('updated worker stats (command) for {0}'.format(siteName))
                    else:
                        mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(
                            siteName, tmpStr))
        if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:
            # update worker stats for all sites
            worker_stats_bulk = self.dbProxy.get_worker_stats_bulk()
            if not worker_stats_bulk:
                mainLog.error('failed to get worker stats in bulk')
            else:
                for site_name in worker_stats_bulk:
                    tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                             worker_stats_bulk[site_name])
                    if tmp_ret:
                        mainLog.debug('updated worker stats (bulk) for {0}'.format(site_name))
                        self._last_stats_update = time.time()
                    else:
                        mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(
                            site_name, tmp_str))
        mainLog.debug('done' + sw.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.propagator.sleepTime):
            mainLog.debug('terminated')
            return
def run(self):
    lockedBy = 'sweeper-{0}'.format(self.get_pid())
    while True:
        sw_main = core_utils.get_stopwatch()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        # get commands to kill
        sw_getcomm = core_utils.get_stopwatch()
        mainLog.debug('try to get commands')
        comStr = CommandSpec.COM_killWorkers
        commandSpecs = self.dbProxy.get_commands_for_receiver('sweeper', comStr)
        mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
        for commandSpec in commandSpecs:
            n_to_kill = self.dbProxy.kill_workers_by_query(commandSpec.params)
            mainLog.debug('will kill {0} workers with {1}'.format(n_to_kill, commandSpec.params))
        mainLog.debug('done handling commands' + sw_getcomm.get_elapsed_time())
        # killing stage
        sw_kill = core_utils.get_stopwatch()
        mainLog.debug('try to get workers to kill')
        # get workers to kill
        workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                         harvester_config.sweeper.checkInterval)
        mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
        # loop over all workers
        sw = core_utils.get_stopwatch()
        for queueName, configIdWorkSpecList in iteritems(workersToKill):
            for configID, workspec_list in iteritems(configIdWorkSpecList):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName, configID):
                    mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                try:
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                except Exception:
                    mainLog.error('failed to launch sweeper plugin for {0}/{1}'.format(queueName, configID))
                    core_utils.dump_error_message(mainLog)
                    continue
                sw.reset()
                n_workers = len(workspec_list)
                try:
                    # try bulk method
                    tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                    tmpLog.debug('start killing')
                    tmpList = sweeperCore.kill_workers(workspec_list)
                except AttributeError:
                    # fall back to single-worker method
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start killing one worker')
                            tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                except Exception:
                    core_utils.dump_error_message(mainLog)
                else:
                    # bulk method
                    n_killed = 0
                    for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                        tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                            workspec.workerID, tmpStat, tmpOut))
                        if tmpStat:
                            n_killed += 1
                    tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
        mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
        # cleanup stage
        sw_cleanup = core_utils.get_stopwatch()
        # timeout for missed
        try:
            keepMissed = harvester_config.sweeper.keepMissed
        except Exception:
            keepMissed = 24
        try:
            keepPending = harvester_config.sweeper.keepPending
        except Exception:
            keepPending = 24
        # get workers for cleanup
        statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                            'failed': harvester_config.sweeper.keepFailed,
                            'cancelled': harvester_config.sweeper.keepCancelled,
                            'missed': keepMissed,
                            'pending': keepPending}
        workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                 statusTimeoutMap)
        mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
        sw = core_utils.get_stopwatch()
        for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
            for configID, workspec_list in iteritems(configIdWorkSpecList):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName, configID):
                    mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                sw.reset()
                n_workers = len(workspec_list)
                # make sure workers to clean up are all terminated
                mainLog.debug('making sure workers to clean up are all terminated')
                try:
                    # try bulk method
                    tmpList = sweeperCore.kill_workers(workspec_list)
                except AttributeError:
                    # fall back to single-worker method
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                except Exception:
                    core_utils.dump_error_message(mainLog)
                mainLog.debug('made sure workers to clean up are all terminated')
                # start cleanup
                for workspec in workspec_list:
                    tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                              method_name='run')
                    try:
                        tmpLog.debug('start cleaning up one worker')
                        # sweep worker
                        tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                        tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                        tmpLog.debug('start messenger cleanup')
                        mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                        tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmpStat,
                                                                                            mc_tmpOut))
                        if tmpStat:
                            self.dbProxy.delete_worker(workspec.workerID)
                    except Exception:
                        core_utils.dump_error_message(tmpLog)
                mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
        mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
        # old-job-deletion stage
        sw_delete = core_utils.get_stopwatch()
        mainLog.debug('delete old jobs')
        jobTimeout = max(statusTimeoutMap.values()) + 1
        self.dbProxy.delete_old_jobs(jobTimeout)
        # delete orphaned job info
        self.dbProxy.delete_orphaned_job_info()
        mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
        # disk cleanup
        if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \
                hasattr(harvester_config.sweeper, 'diskHighWatermark'):
            locked = self.dbProxy.get_process_lock('sweeper', self.get_pid(),
                                                   harvester_config.sweeper.diskCleanUpInterval * 60 * 60)
            if locked:
                try:
                    all_active_files = None
                    for item in harvester_config.sweeper.diskHighWatermark.split(','):
                        # dir name and watermark in GB
                        dir_name, watermark = item.split('|')
                        mainLog.debug('checking {0} for cleanup with watermark {1} GB'.format(dir_name,
                                                                                              watermark))
                        watermark = int(watermark) * 10**9
                        total_size = 0
                        file_dict = {}
                        # scan dir
                        for root, dirs, filenames in walk(dir_name):
                            for base_name in filenames:
                                full_name = os.path.join(root, base_name)
                                f_size = os.path.getsize(full_name)
                                total_size += f_size
                                mtime = os.path.getmtime(full_name)
                                file_dict.setdefault(mtime, set())
                                file_dict[mtime].add((base_name, full_name, f_size))
                        # delete if necessary
                        if total_size < watermark:
                            mainLog.debug('skip cleanup {0} due to total_size {1} GB < watermark {2} GB'.format(
                                dir_name, total_size // (10**9), watermark // (10**9)))
                        else:
                            mainLog.debug('cleanup {0} due to total_size {1} GB >= watermark {2} GB'.format(
                                dir_name, total_size // (10**9), watermark // (10**9)))
                            # get active input files
                            if all_active_files is None:
                                all_active_files = self.dbProxy.get_all_active_input_files()
                            deleted_size = 0
                            mtimes = sorted(file_dict.keys())
                            for mtime in mtimes:
                                for base_name, full_name, f_size in file_dict[mtime]:
                                    # keep if active
                                    if base_name in all_active_files:
                                        continue
                                    try:
                                        os.remove(full_name)
                                    except Exception:
                                        core_utils.dump_error_message(mainLog)
                                    deleted_size += f_size
                                    if total_size - deleted_size < watermark:
                                        break
                                if total_size - deleted_size < watermark:
                                    break
                except Exception:
                    core_utils.dump_error_message(mainLog)
        # time the cycle
        mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.sweeper.sleepTime):
            mainLog.debug('terminated')
            return
def run(self):
    lockedBy = 'monitor-{0}'.format(self.ident)
    # init messengers
    for queueConfig in self.queueConfigMapper.get_all_queues().values():
        # just import for module initialization
        self.pluginFactory.get_plugin(queueConfig.messenger)
    # main
    while True:
        sw = core_utils.get_stopwatch()
        mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        mainLog.debug('getting workers to monitor')
        workSpecsPerQueue = self.dbProxy.get_workers_to_update(harvester_config.monitor.maxWorkers,
                                                               harvester_config.monitor.checkInterval,
                                                               harvester_config.monitor.lockInterval,
                                                               lockedBy)
        mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
        # loop over all workers
        for queueName, workSpecsList in iteritems(workSpecsPerQueue):
            tmpQueLog = core_utils.make_logger(_logger, 'id={0} queue={1}'.format(lockedBy, queueName),
                                               method_name='run')
            # check queue
            if not self.queueConfigMapper.has_queue(queueName):
                tmpQueLog.error('config not found')
                continue
            # get queue
            queueConfig = self.queueConfigMapper.get_queue(queueName)
            # get plugins
            monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
            messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
            # check workers
            allWorkers = [item for sublist in workSpecsList for item in sublist]
            tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
            tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog)
            # loop over all worker chunks
            tmpQueLog.debug('update jobs and workers')
            iWorker = 0
            for workSpecs in workSpecsList:
                jobSpecs = None
                filesToStageOut = dict()
                pandaIDsList = []
                eventsToUpdateList = []
                filesToStageOutList = []
                for workSpec in workSpecs:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    diagMessage = tmpOut['diagMessage']
                    workAttributes = tmpOut['workAttributes']
                    eventsToUpdate = tmpOut['eventsToUpdate']
                    filesToStageOut = tmpOut['filesToStageOut']
                    eventsRequestParams = tmpOut['eventsRequestParams']
                    nJobsToReFill = tmpOut['nJobsToReFill']
                    pandaIDs = tmpOut['pandaIDs']
                    tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                    tmpStr += 'postProcessed={3} files={4}'
                    tmpLog.debug(tmpStr.format(newStatus, monStatus, diagMessage,
                                               workSpec.is_post_processed(), str(filesToStageOut)))
                    iWorker += 1
                    # check status
                    if newStatus not in WorkSpec.ST_LIST:
                        tmpLog.error('unknown status={0}'.format(newStatus))
                        continue
                    # update worker
                    workSpec.set_status(newStatus)
                    workSpec.set_work_attributes(workAttributes)
                    # request events
                    if eventsRequestParams != {}:
                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                        workSpec.eventsRequestParams = eventsRequestParams
                    # jobs to refill
                    if nJobsToReFill is not None:
                        workSpec.nJobsToReFill = nJobsToReFill
                    # get associated jobs for the worker chunk
                    if workSpec.hasJob == 1 and jobSpecs is None:
                        jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, None,
                                                                        only_running=True)
                    # pandaIDs for push
                    pandaIDsList.append(pandaIDs)
                    if len(eventsToUpdate) > 0:
                        eventsToUpdateList.append(eventsToUpdate)
                    if len(filesToStageOut) > 0:
                        filesToStageOutList.append(filesToStageOut)
                # update jobs and workers
                if jobSpecs is not None:
                    tmpQueLog.debug('updating {0} jobs with {1} workers'.format(len(jobSpecs), len(workSpecs)))
                    core_utils.update_job_attributes_with_workers(queueConfig.mapType, jobSpecs, workSpecs,
                                                                  filesToStageOutList, eventsToUpdateList)
                    for jobSpec in jobSpecs:
                        tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                                        method_name='run')
                        tmpLog.debug('new status={0} subStatus={1} status_in_metadata={2}'.format(
                            jobSpec.status, jobSpec.subStatus, jobSpec.get_job_status_from_attributes()))
                # update local database
                tmpRet = self.dbProxy.update_jobs_workers(jobSpecs, workSpecs, lockedBy, pandaIDsList)
                if not tmpRet:
                    for workSpec in workSpecs:
                        tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                        method_name='run')
                        tmpLog.error('failed to update the DB. lockInterval may be too short')
                # send ACK to workers for events and files
                if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                    for workSpec in workSpecs:
                        messenger.acknowledge_events_files(workSpec)
            tmpQueLog.debug('done')
        mainLog.debug('done' + sw.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.monitor.sleepTime):
            mainLog.debug('terminated')
            return
def fifo_benchmark(arguments):
    n_object = arguments.n_object
    n_thread = arguments.n_thread
    mq = harvesterFifos.BenchmarkFIFO()
    sw = core_utils.get_stopwatch()
    sum_dict = {
        'put_n': 0,
        'put_time': 0.0,
        'get_time': 0.0,
        'get_protective_time': 0.0,
        'clear_time': 0.0,
    }

    def _put_object(i_index):
        workspec = WorkSpec()
        workspec.workerID = i_index
        data = {'random': [(i_index**2) % 2**16, random.random()]}
        workspec.workAttributes = data
        mq.put(workspec)

    def _get_object(i_index):
        return mq.get(timeout=3, protective=False)

    def _get_object_protective(i_index):
        return mq.get(timeout=3, protective=True)

    def put_test():
        sw.reset()
        multithread_executer(_put_object, n_object, n_thread)
        sum_dict['put_time'] += sw.get_elapsed_time_in_sec(True)
        sum_dict['put_n'] += 1
        print('Put {0} objects by {1} threads'.format(n_object, n_thread) + sw.get_elapsed_time())
        print('Now fifo size is {0}'.format(mq.size()))

    def get_test():
        sw.reset()
        multithread_executer(_get_object, n_object, n_thread)
        sum_dict['get_time'] = sw.get_elapsed_time_in_sec(True)
        print('Get {0} objects by {1} threads'.format(n_object, n_thread) + sw.get_elapsed_time())
        print('Now fifo size is {0}'.format(mq.size()))

    def get_protective_test():
        sw.reset()
        multithread_executer(_get_object_protective, n_object, n_thread)
        sum_dict['get_protective_time'] = sw.get_elapsed_time_in_sec(True)
        print('Get {0} objects protective dequeue by {1} threads'.format(n_object, n_thread)
              + sw.get_elapsed_time())
        print('Now fifo size is {0}'.format(mq.size()))

    def clear_test():
        sw.reset()
        mq.fifo.clear()
        sum_dict['clear_time'] = sw.get_elapsed_time_in_sec(True)
        print('Cleared fifo' + sw.get_elapsed_time())
        print('Now fifo size is {0}'.format(mq.size()))

    # Benchmark
    print('Start fifo benchmark ...')
    mq.fifo.clear()
    print('Cleared fifo')
    put_test()
    get_test()
    put_test()
    get_protective_test()
    put_test()
    clear_test()
    print('Finished fifo benchmark')
    # summary
    print('Summary:')
    print('FIFO plugin is: {0}'.format(mq.fifo.__class__.__name__))
    print('Benchmark with {0} objects by {1} threads'.format(n_object, n_thread))
    print('Put : {0:.3f} ms / obj'.format(1000. * sum_dict['put_time'] / (sum_dict['put_n'] * n_object)))
    print('Get : {0:.3f} ms / obj'.format(1000. * sum_dict['get_time'] / n_object))
    print('Get protective : {0:.3f} ms / obj'.format(1000. * sum_dict['get_protective_time'] / n_object))
    print('Clear : {0:.3f} ms / obj'.format(1000. * sum_dict['clear_time'] / n_object))
def run(self): lockedBy = 'sweeper-{0}'.format(self.get_pid()) while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') # killing stage sw_kill = core_utils.get_stopwatch() mainLog.debug('try to get workers to kill') # get workers to kill workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers, harvester_config.sweeper.checkInterval) mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill))) # loop over all workers sw = core_utils.get_stopwatch() for queueName, configIdWorkSpecList in iteritems(workersToKill): for configID, workspec_list in iteritems(configIdWorkSpecList): # get sweeper if not self.queueConfigMapper.has_queue(queueName, configID): mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID)) continue queueConfig = self.queueConfigMapper.get_queue(queueName, configID) sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper) sw.reset() n_workers = len(workspec_list) try: # try bulk method tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') tmpLog.debug('start killing') tmpList = sweeperCore.kill_workers(workspec_list) except AttributeError: # fall back to single-worker method for workspec in workspec_list: tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='run') try: tmpLog.debug('start killing one worker') tmpStat, tmpOut = sweeperCore.kill_worker(workspec) tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut)) except Exception: core_utils.dump_error_message(tmpLog) except Exception: core_utils.dump_error_message(mainLog) else: # bulk method n_killed = 0 for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList): tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format( workspec.workerID, tmpStat, tmpOut)) if tmpStat: n_killed += 1 tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers)) mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time()) mainLog.debug('done all killing' + sw_kill.get_elapsed_time()) # cleanup stage sw_cleanup = core_utils.get_stopwatch() # timeout for missed try: keepMissed = harvester_config.sweeper.keepMissed except Exception: keepMissed = 24 try: keepPending = harvester_config.sweeper.keepPending except Exception: keepPending = 24 # get workers for cleanup statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished, 'failed': harvester_config.sweeper.keepFailed, 'cancelled': harvester_config.sweeper.keepCancelled, 'missed': keepMissed, 'pending': keepPending } workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers, statusTimeoutMap) mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup))) sw = core_utils.get_stopwatch() for queueName, configIdWorkSpecList in iteritems(workersForCleanup): for configID, workspec_list in iteritems(configIdWorkSpecList): # get sweeper if not self.queueConfigMapper.has_queue(queueName, configID): mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID)) continue queueConfig = self.queueConfigMapper.get_queue(queueName, configID) sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper) messenger = self.pluginFactory.get_plugin(queueConfig.messenger) sw.reset() n_workers = len(workspec_list) # make sure workers to clean up are all terminated mainLog.debug('making sure workers to clean up are all terminated') try: # try bulk method 
tmpList = sweeperCore.kill_workers(workspec_list) except AttributeError: # fall back to single-worker method for workspec in workspec_list: tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='run') try: tmpStat, tmpOut = sweeperCore.kill_worker(workspec) except Exception: core_utils.dump_error_message(tmpLog) except Exception: core_utils.dump_error_message(mainLog) mainLog.debug('made sure workers to clean up are all terminated') # start cleanup for workspec in workspec_list: tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='run') try: tmpLog.debug('start cleaning up one worker') # sweep worker tmpStat, tmpOut = sweeperCore.sweep_worker(workspec) tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut)) tmpLog.debug('start messenger cleanup') mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec) tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmpStat, mc_tmpOut)) if tmpStat: self.dbProxy.delete_worker(workspec.workerID) except Exception: core_utils.dump_error_message(tmpLog) mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time()) mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time()) # old-job-deletion stage sw_delete = core_utils.get_stopwatch() mainLog.debug('delete old jobs') jobTimeout = max(statusTimeoutMap.values()) + 1 self.dbProxy.delete_old_jobs(jobTimeout) # delete orphaned job info self.dbProxy.delete_orphaned_job_info() mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time()) # time the cycle mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.sweeper.sleepTime): mainLog.debug('terminated') return
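# Illustrative sketch (not part of the harvester sources): the sweeper above first
# tries a plugin's bulk kill_workers() and falls back to per-worker kill_worker()
# when the plugin does not provide the bulk method (AttributeError). The plugin
# class and worker ids below are hypothetical stand-ins for that pattern.
class _LegacySweeperPlugin(object):
    """Hypothetical plugin that only implements the single-worker API."""
    def kill_worker(self, workspec):
        return True, 'killed {0}'.format(workspec)

def kill_all(plugin, workspec_list):
    try:
        # preferred bulk call, if the plugin implements it
        return plugin.kill_workers(workspec_list)
    except AttributeError:
        # fall back to killing workers one by one
        return [plugin.kill_worker(workspec) for workspec in workspec_list]

if __name__ == '__main__':
    print(kill_all(_LegacySweeperPlugin(), ['worker-1', 'worker-2']))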
def update_jobs(self, jobspec_list, id):
    sw = core_utils.get_stopwatch()
    tmpLogG = self.make_logger('id={0}'.format(id), method_name='update_jobs')
    tmpLogG.debug('update {0} jobs'.format(len(jobspec_list)))
    retList = []
    # update events
    for jobSpec in jobspec_list:
        eventRanges, eventSpecs = jobSpec.to_event_data(max_events=10000)
        if eventRanges != []:
            tmpLogG.debug('update {0} events for PandaID={1}'.format(len(eventSpecs), jobSpec.PandaID))
            tmpRet = self.update_event_ranges(eventRanges, tmpLogG)
            if tmpRet['StatusCode'] == 0:
                for eventSpec, retVal in zip(eventSpecs, tmpRet['Returns']):
                    if retVal in [True, False] and eventSpec.is_final_status():
                        eventSpec.subStatus = 'done'
    # update jobs in bulk
    nLookup = 100
    iLookup = 0
    while iLookup < len(jobspec_list):
        dataList = []
        jobSpecSubList = jobspec_list[iLookup:iLookup+nLookup]
        for jobSpec in jobSpecSubList:
            data = jobSpec.get_job_attributes_for_panda()
            data['jobId'] = jobSpec.PandaID
            data['siteName'] = jobSpec.computingSite
            data['state'] = jobSpec.get_status()
            data['attemptNr'] = jobSpec.attemptNr
            data['jobSubStatus'] = jobSpec.subStatus
            # change cancelled to failed to be accepted by panda server
            if data['state'] in ['cancelled', 'missed']:
                if jobSpec.is_pilot_closed():
                    data['jobSubStatus'] = 'pilot_closed'
                else:
                    data['jobSubStatus'] = data['state']
                data['state'] = 'failed'
            if jobSpec.startTime is not None and 'startTime' not in data:
                data['startTime'] = jobSpec.startTime.strftime('%Y-%m-%d %H:%M:%S')
            if jobSpec.endTime is not None and 'endTime' not in data:
                data['endTime'] = jobSpec.endTime.strftime('%Y-%m-%d %H:%M:%S')
            if 'coreCount' not in data and jobSpec.nCore is not None:
                data['coreCount'] = jobSpec.nCore
            if jobSpec.is_final_status() and jobSpec.status == jobSpec.get_status():
                if jobSpec.metaData is not None:
                    data['metaData'] = json.dumps(jobSpec.metaData)
                if jobSpec.outputFilesToReport is not None:
                    data['xml'] = jobSpec.outputFilesToReport
            dataList.append(data)
        harvester_id = harvester_config.master.harvester_id
        tmpData = {'jobList': json.dumps(dataList), 'harvester_id': harvester_id}
        tmpStat, tmpRes = self.post_ssl('updateJobsInBulk', tmpData)
        retMaps = None
        errStr = ''
        if tmpStat is False:
            errStr = core_utils.dump_error_message(tmpLogG, tmpRes)
        else:
            try:
                tmpStat, retMaps = tmpRes.json()
                if tmpStat is False:
                    tmpLogG.error('updateJobsInBulk failed with {0}'.format(retMaps))
                    retMaps = None
            except Exception:
                errStr = core_utils.dump_error_message(tmpLogG)
        if retMaps is None:
            retMap = {}
            retMap['content'] = {}
            retMap['content']['StatusCode'] = 999
            retMap['content']['ErrorDiag'] = errStr
            retMaps = [json.dumps(retMap)] * len(jobSpecSubList)
        for jobSpec, retMap, data in zip(jobSpecSubList, retMaps, dataList):
            tmpLog = self.make_logger('id={0} PandaID={1}'.format(id, jobSpec.PandaID), method_name='update_jobs')
            try:
                retMap = json.loads(retMap['content'])
            except Exception:
                errStr = 'failed to load json'
                retMap = {}
                retMap['StatusCode'] = 999
                retMap['ErrorDiag'] = errStr
            tmpLog.debug('data={0}'.format(str(data)))
            tmpLog.debug('done with {0}'.format(str(retMap)))
            retList.append(retMap)
        iLookup += nLookup
    tmpLogG.debug('done' + sw.get_elapsed_time())
    return retList
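# Illustrative sketch (hypothetical helper, not harvester code): update_jobs() above
# sends its updates to the PanDA server in chunks of nLookup=100 jobs per
# 'updateJobsInBulk' call. The generator below reproduces just that slicing pattern.
def chunked(items, size):
    """Yield consecutive slices of items, each at most size elements long."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

if __name__ == '__main__':
    fake_jobs = list(range(250))
    for chunk in chunked(fake_jobs, 100):
        # in the agent, each chunk becomes one POST to updateJobsInBulk
        print('would update {0} jobs in one call'.format(len(chunk)))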
def run(self): lockedBy = 'submitter-{0}'.format(self.get_pid()) monitor_fifo = self.monitor_fifo queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval', harvester_config.submitter.lockInterval) while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime, harvester_config.submitter.lockInterval, lockedBy, queueLockInterval) submitted = False if siteName is not None: mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName)) # get commands comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr) mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr)) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): # if available, overwrite new worker value with the command from panda server if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: n_workers_per_queue_and_rt = dict() else: n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName) if n_workers_per_queue_and_rt is None: mainLog.error('WorkerAdjuster failed to define the number of workers') elif len(n_workers_per_queue_and_rt) == 0: pass else: # loop over all queues and resource types for queueName in n_workers_per_queue_and_rt: for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]): tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy, queueName, resource_type), method_name='run') try: tmpLog.debug('start') tmpLog.debug('workers status: %s' % tmpVal) nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue(queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug('skipped since no new worker is needed based on current stats') continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) workerMakerCore = self.workerMaker.get_plugin(queueConfig) # check if resource is ready if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: numReadyResources = self.workerMaker.num_ready_resources(queueConfig, resource_type, workerMakerCore) tmpLog.debug('numReadyResources: %s' % numReadyResources) if not numReadyResources: if hasattr(workerMakerCore, 'staticWorkers'): nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning'] tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' % (workerMakerCore.staticWorkers, nQRWorkers)) if nQRWorkers >= workerMakerCore.staticWorkers: tmpLog.debug('No left static workers, skip') continue else: nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) tmpLog.debug('staticWorkers: %s, nWorkers: %s' % (workerMakerCore.staticWorkers, nWorkers)) else: tmpLog.debug('skip since no resources are ready') continue else: nWorkers = min(nWorkers, numReadyResources) # post action of worker maker if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: 
skipOnFail = True else: skipOnFail = False # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig, nWorkers, resource_type, maker=workerMakerCore) tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig, nWorkers, resource_type, maker=workerMakerCore) maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( queueConfig, resource_type, maker=workerMakerCore) maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( queueConfig, resource_type, maker=workerMakerCore) tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, max_workers_per_job_in_total=maxWorkersPerJob, max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) else: tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format(len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig, nReady, resource_type, maker=workerMakerCore) if len(ngChunks) == 0: tmpLog.debug('successfully made {0} workers'.format(len(okChunks))) else: tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() pandaIDs = set() # NG (=not good) for ngJobs in ngChunks: for jobSpec in ngJobs: if skipOnFail: # release jobs when workers are not made pandaIDs.add(jobSpec.PandaID) else: jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_make' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None errStr = 'failed to make a worker' jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': 'prepared'}) # OK workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [None, 0]: workSpec.set_jobspec_list(okJobs) else: # refill free slots during the worker is running workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill]) workSpec.nJobsToReFill = None for jobSpec in okJobs[workSpec.nJobsToReFill:]: pandaIDs.add(jobSpec.PandaID) workSpec.set_num_jobs_with_list() # map type workSpec.mapType = queueConfig.mapType # 
queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger['accessPoint'] # sync level workSpec.syncLevel = queueConfig.get_synchronization_level() # events if len(okJobs) > 0 and \ ('eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: sw = core_utils.get_stopwatch() # get plugin for submitter submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found'.format(jobSpec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin(queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found'.format(jobSpec.computingSite)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}'.format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}'.format(workSpec.workerID, tmpStat)) # insert workers self.dbProxy.insert_workers(workSpecList, lockedBy) # submit sw.reset() tmpLog.info('submitting {0} workers'.format(len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, workSpecList) tmpLog.debug('done submitting {0} workers'.format(len(workSpecList)) + sw.get_elapsed_time()) # collect successful jobs okPandaIDs = set() for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): if tmpRet: workSpec, jobList = okChunks[iWorker] jobList = workSpec.get_jobspec_list() if jobList is not None: for jobSpec in jobList: okPandaIDs.add(jobSpec.PandaID) # loop over all workers for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # set harvesterHost workSpec.harvesterHost = socket.gethostname() # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission errStr = 'failed to submit a workerID={0} with {1}'.format( workSpec.workerID, tmpStr) tmpLog.error(errStr) workSpec.set_status(WorkSpec.ST_missed) workSpec.set_dialog_message(tmpStr) workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) if jobList is not None: # increment attempt number newJobList = [] for jobSpec in jobList: # skip if successful with another worker if jobSpec.PandaID in okPandaIDs: continue if jobSpec.submissionAttempts is None: jobSpec.submissionAttempts = 0 jobSpec.submissionAttempts += 1 # max attempt or permanent error if tmpRet is False or \ jobSpec.submissionAttempts >= \ queueConfig.maxSubmissionAttempts: newJobList.append(jobSpec) else: self.dbProxy.increment_submission_attempt( jobSpec.PandaID, jobSpec.submissionAttempts) jobList = newJobList elif queueConfig.useJobLateBinding and workSpec.hasJob == 1: # directly go to running after feeding jobs for late biding workSpec.set_status(WorkSpec.ST_running) else: # normal successful submission workSpec.set_status(WorkSpec.ST_submitted) workSpec.submitTime = timeNow workSpec.modificationTime = timeNow workSpec.checkTime = timeNow if self.monitor_fifo.enabled: workSpec.set_work_params({'lastCheckAt': timeNow_timestamp}) # prefetch events if tmpRet and workSpec.hasJob == 1 and \ 
workSpec.eventsRequest == WorkSpec.EV_useEvents and \ queueConfig.prefetchEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for jobSpec in jobList: eventsRequestParams[jobSpec.PandaID] = \ {'pandaID': jobSpec.PandaID, 'taskID': jobSpec.taskID, 'jobsetID': jobSpec.jobParams['jobsetID'], 'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))), jobSpec.jobParams['coreCount']), } workSpec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy) if jobList is not None: for jobSpec in jobList: pandaIDs.add(jobSpec.PandaID) if tmpStat: if tmpRet: tmpStr = \ 'submitted a workerID={0} for PandaID={1} with batchID={2}' tmpLog.info(tmpStr.format(workSpec.workerID, jobSpec.PandaID, workSpec.batchID)) else: tmpStr = 'failed to submit a workerID={0} for PandaID={1}' tmpLog.error(tmpStr.format(workSpec.workerID, jobSpec.PandaID)) else: tmpStr = \ 'failed to register a worker for PandaID={0} with batchID={1}' tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID)) # enqueue to monitor fifo if self.monitor_fifo.enabled \ and queueConfig.mapType != WorkSpec.MT_MultiWorkers: workSpecsToEnqueue = \ [[w] for w in workSpecList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] check_delay = min( getattr(harvester_config.monitor, 'eventBasedCheckInterval', harvester_config.monitor.checkInterval), getattr(harvester_config.monitor, 'fifoCheckInterval', harvester_config.monitor.checkInterval)) monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay) mainLog.debug('put workers to monitor FIFO') submitted = True # release jobs self.dbProxy.release_jobs(pandaIDs, lockedBy) tmpLog.info('done') except Exception: core_utils.dump_error_message(tmpLog) # release the site self.dbProxy.release_site(siteName, lockedBy) if sw_main.get_elapsed_time_in_sec() > queueLockInterval: mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval) + sw_main.get_elapsed_time()) mainLog.debug('done') # define sleep interval if siteName is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval) self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName) # time the cycle mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(sleepTime): mainLog.debug('terminated') return
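# Illustrative sketch (made-up numbers): when prefetching events, the submitter above
# sizes the per-job request as nRanges = max(ceil(nCore / nJobs), coreCount), i.e. at
# least one event range per core the worker can devote to each job, and never fewer
# than the job's own coreCount. The helper below restates that arithmetic in isolation;
# it uses float division to make the intent explicit.
import math

def n_ranges(n_core_worker, n_jobs, core_count_job):
    return max(int(math.ceil(n_core_worker / float(n_jobs))), core_count_job)

if __name__ == '__main__':
    # e.g. an 8-core worker shared by 2 jobs that each declare coreCount=1
    print(n_ranges(8, 2, 1))  # -> 4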
def run(self): while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run') mainLog.debug('getting jobs to propagate') sw = core_utils.get_stopwatch() jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs, harvester_config.propagator.lockInterval, harvester_config.propagator.updateInterval, self.get_pid()) mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time())) # update jobs in central database iJobs = 0 nJobs = harvester_config.propagator.nJobsInBulk hbSuppressMap = dict() while iJobs < len(jobSpecs): jobList = jobSpecs[iJobs:iJobs + nJobs] iJobs += nJobs # collect jobs to update or check jobListToSkip = [] jobListToUpdate = [] jobListToCheck = [] retList = [] for tmpJobSpec in jobList: if tmpJobSpec.computingSite not in hbSuppressMap: queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite, tmpJobSpec.configID) hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status() # heartbeat is suppressed if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \ not tmpJobSpec.not_suppress_heartbeat(): # check running job to detect lost heartbeat if tmpJobSpec.status == 'running': jobListToCheck.append(tmpJobSpec) else: jobListToSkip.append(tmpJobSpec) retList.append({'StatusCode': 0, 'command': None}) else: jobListToUpdate.append(tmpJobSpec) sw.reset() retList += self.communicator.check_jobs(jobListToCheck) mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time())) sw.reset() retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid()) mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate), sw.get_elapsed_time())) # logging for tmpJobSpec, tmpRet in zip(jobListToSkip+jobListToCheck+jobListToUpdate, retList): if tmpRet['StatusCode'] == 0: if tmpJobSpec in jobListToUpdate: mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID, tmpJobSpec.status)) else: mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID, tmpJobSpec.status)) # release job tmpJobSpec.propagatorLock = None if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status(): # unset to disable further updating tmpJobSpec.propagatorTime = None tmpJobSpec.subStatus = 'done' tmpJobSpec.modificationTime = datetime.datetime.utcnow() elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done(): # trigger next propagation to update remaining events tmpJobSpec.trigger_propagation() else: # check event availability if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \ tmpJobSpec.subStatus != 'submitted': tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec) if tmpEvStat: if tmpEvRet is not None: tmpJobSpec.nRemainingEvents = tmpEvRet if tmpEvRet == 0: mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID)) tmpRet['command'] = 'tobekilled' # got kill command if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']: nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID) if nWorkers == 0: # no workers tmpJobSpec.status = 'cancelled' tmpJobSpec.subStatus = 'killed' tmpJobSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL, PilotErrors.pilotError[PilotErrors.ERR_PANDAKILL]) tmpJobSpec.stateChangeTime = datetime.datetime.utcnow() tmpJobSpec.trigger_propagation() self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()}) else: 
mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID, tmpJobSpec.status)) mainLog.debug('getting workers to propagate') sw.reset() workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers, harvester_config.propagator.updateInterval) mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time())) # update workers in central database sw.reset() iWorkers = 0 nWorkers = harvester_config.propagator.nWorkersInBulk while iWorkers < len(workSpecs): workList = workSpecs[iWorkers:iWorkers + nWorkers] iWorkers += nWorkers retList, tmpErrStr = self.communicator.update_workers(workList) # logging if retList is None: mainLog.error('failed to update workers with {0}'.format(tmpErrStr)) else: for tmpWorkSpec, tmpRet in zip(workList, retList): if tmpRet: mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID, tmpWorkSpec.status)) # update logs for logFilePath, logOffset, logSize, logRemoteName in \ tmpWorkSpec.get_log_files_to_upload(): with open(logFilePath, 'rb') as logFileObj: tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj, logOffset, logSize) if tmpStat: tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset+logSize) # disable further update if tmpWorkSpec.is_final_status(): tmpWorkSpec.disable_propagation() self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID}) else: mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID, tmpWorkSpec.status)) mainLog.debug('update_workers for {0} workers took {1}'.format(iWorkers, sw.get_elapsed_time())) mainLog.debug('getting commands') commandSpecs = self.dbProxy.get_commands_for_receiver('propagator') mainLog.debug('got {0} commands'.format(len(commandSpecs))) for commandSpec in commandSpecs: if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats): # get worker stats siteName = commandSpec.command.split(':')[-1] workerStats = self.dbProxy.get_worker_stats(siteName) if len(workerStats) == 0: mainLog.error('failed to get worker stats for {0}'.format(siteName)) else: # report worker stats tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats) if tmpRet: mainLog.debug('updated worker stats (command) for {0}'.format(siteName)) else: mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(siteName, tmpStr)) if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD: # get active UPS queues. 
PanDA server needs to know about them and which harvester instance is taking # care of them active_ups_queues = self.queueConfigMapper.get_active_ups_queues() # update worker stats for all sites worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues) if not worker_stats_bulk: mainLog.error('failed to get worker stats in bulk') else: for site_name in worker_stats_bulk: tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name, worker_stats_bulk[site_name]) if tmp_ret: mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name)) self._last_stats_update = time.time() else: mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(site_name, tmp_str)) if not self._last_metrics_update \ or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD): # get latest metrics from DB service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update) if not service_metrics_list: mainLog.error('failed to get service metrics') self._last_metrics_update = datetime.datetime.utcnow() else: tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list) if tmp_ret: mainLog.debug('update of service metrics OK') self._last_metrics_update = datetime.datetime.utcnow() else: mainLog.error('failed to update service metrics err={0}'.format(tmp_str)) # send dialog messages mainLog.debug('getting dialog messages to propagate') try: maxDialogs = harvester_config.propagator.maxDialogs except Exception: maxDialogs = 50 diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs, harvester_config.propagator.lockInterval) mainLog.debug('got {0} dialogs'.format(len(diagSpecs))) if len(diagSpecs) > 0: tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs) if tmpStat: diagIDs = [diagSpec.diagID for diagSpec in diagSpecs] self.dbProxy.delete_dialog_messages(diagIDs) mainLog.debug('sent {0} dialogs'.format(len(diagSpecs))) else: mainLog.error('failed to send dialogs err={0}'.format(tmpStr)) if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval: mainLog.warning('a single cycle was longer than lockInterval. done' + sw_main.get_elapsed_time()) else: mainLog.debug('done' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.propagator.sleepTime): mainLog.debug('terminated') return
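# Illustrative sketch (hypothetical class, not harvester code): the propagator above
# gates its bulk worker-stats and service-metrics reports on elapsed time
# (STATS_PERIOD / METRICS_PERIOD). The same "run at most every N seconds" pattern
# in isolation:
import time

class PeriodicTask(object):
    def __init__(self, period):
        self.period = period
        self.last_run = None  # None means the task has never run

    def due(self):
        return self.last_run is None or time.time() - self.last_run > self.period

    def mark_done(self):
        self.last_run = time.time()

if __name__ == '__main__':
    stats_report = PeriodicTask(300)
    if stats_report.due():
        print('reporting worker stats in bulk')
        stats_report.mark_done()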
def run(self): lockedBy = 'monitor-{0}'.format(self.get_pid()) # init messengers for queueConfig in self.queueConfigMapper.get_all_queues().values(): # just import for module initialization self.pluginFactory.get_plugin(queueConfig.messenger) # main try: fifoSleepTimeMilli = harvester_config.monitor.fifoSleepTimeMilli except AttributeError: fifoSleepTimeMilli = 5000 try: fifoCheckDuration = harvester_config.monitor.fifoCheckDuration except AttributeError: fifoCheckDuration = 30 try: fifoMaxWorkersPerChunk = harvester_config.monitor.fifoMaxWorkersPerChunk except AttributeError: fifoMaxWorkersPerChunk = 500 try: fifoProtectiveDequeue = harvester_config.monitor.fifoProtectiveDequeue except AttributeError: fifoProtectiveDequeue = True last_DB_cycle_timestamp = 0 monitor_fifo = self.monitor_fifo sleepTime = (fifoSleepTimeMilli / 1000.0) \ if monitor_fifo.enabled else harvester_config.monitor.sleepTime adjusted_sleepTime = sleepTime if monitor_fifo.enabled: monitor_fifo.restore() while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('start a monitor cycle') if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \ not (monitor_fifo.enabled and self.singleMode): # run with workers from DB sw_db = core_utils.get_stopwatch() mainLog.debug('starting run with DB') mainLog.debug('getting workers to monitor') workSpecsPerQueue = self.dbProxy.get_workers_to_update( harvester_config.monitor.maxWorkers, harvester_config.monitor.checkInterval, harvester_config.monitor.lockInterval, lockedBy) mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) # loop over all workers for queueName, configIdWorkSpecs in iteritems( workSpecsPerQueue): for configID, workSpecsList in iteritems( configIdWorkSpecs): retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID) if monitor_fifo.enabled and retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: mainLog.debug('putting workers to FIFO') try: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}'. 
format(errStr)) if workSpecsToEnqueueToHead: mainLog.debug('putting workers to FIFO head') try: score = fifoCheckInterval - timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}' .format(errStr)) last_DB_cycle_timestamp = time.time() if sw_db.get_elapsed_time_in_sec( ) > harvester_config.monitor.lockInterval: mainLog.warning( 'a single DB cycle was longer than lockInterval ' + sw_db.get_elapsed_time()) else: mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time()) mainLog.debug('ended run with DB') elif monitor_fifo.enabled: # run with workers from FIFO sw = core_utils.get_stopwatch() n_loops = 0 n_loops_hit = 0 last_fifo_cycle_timestamp = time.time() to_break = False obj_dequeued_id_list = [] obj_to_enqueue_dict = collections.defaultdict( lambda: [[], 0, 0]) obj_to_enqueue_to_head_dict = collections.defaultdict( lambda: [[], 0, 0]) remaining_obj_to_enqueue_dict = {} remaining_obj_to_enqueue_to_head_dict = {} n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0 while time.time( ) < last_fifo_cycle_timestamp + fifoCheckDuration: sw.reset() n_loops += 1 retVal, overhead_time = monitor_fifo.to_check_workers() if overhead_time is not None: n_chunk_peeked_stat += 1 sum_overhead_time_stat += overhead_time if retVal: # check fifo size fifo_size = monitor_fifo.size() mainLog.debug('FIFO size is {0}'.format(fifo_size)) mainLog.debug('starting run with FIFO') try: obj_gotten = monitor_fifo.get( timeout=1, protective=fifoProtectiveDequeue) except Exception as errStr: mainLog.error( 'failed to get object from FIFO: {0}'.format( errStr)) else: if obj_gotten is not None: sw_fifo = core_utils.get_stopwatch() if fifoProtectiveDequeue: obj_dequeued_id_list.append(obj_gotten.id) queueName, workSpecsList = obj_gotten.item mainLog.debug( 'got a chunk of {0} workers of {1} from FIFO' .format(len(workSpecsList), queueName) + sw.get_elapsed_time()) sw.reset() configID = None for workSpecs in workSpecsList: if configID is None and len(workSpecs) > 0: configID = workSpecs[0].configID for workSpec in workSpecs: if workSpec.pandaid_list is None: _jobspec_list = workSpec.get_jobspec_list( ) if _jobspec_list is not None: workSpec.pandaid_list = [ j.PandaID for j in workSpec. 
get_jobspec_list() ] else: workSpec.pandaid_list = [] workSpec.force_update( 'pandaid_list') retVal = self.monitor_agent_core( lockedBy, queueName, workSpecsList, from_fifo=True, config_id=configID) if retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal try: if len(obj_to_enqueue_dict[queueName] [0]) + len( workSpecsToEnqueue ) <= fifoMaxWorkersPerChunk: obj_to_enqueue_dict[queueName][ 0].extend(workSpecsToEnqueue) obj_to_enqueue_dict[queueName][ 1] = max( obj_to_enqueue_dict[ queueName][1], timeNow_timestamp) obj_to_enqueue_dict[queueName][ 2] = max( obj_to_enqueue_dict[ queueName][2], fifoCheckInterval) else: to_break = True remaining_obj_to_enqueue_dict[ queueName] = [ workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval ] except Exception as errStr: mainLog.error( 'failed to gather workers for FIFO: {0}' .format(errStr)) to_break = True try: if len(obj_to_enqueue_to_head_dict[ queueName][0]) + len( workSpecsToEnqueueToHead ) <= fifoMaxWorkersPerChunk: obj_to_enqueue_to_head_dict[ queueName][0].extend( workSpecsToEnqueueToHead) obj_to_enqueue_to_head_dict[ queueName][1] = max( obj_to_enqueue_to_head_dict[ queueName][1], timeNow_timestamp) obj_to_enqueue_to_head_dict[ queueName][2] = max( obj_to_enqueue_to_head_dict[ queueName][2], fifoCheckInterval) else: to_break = True remaining_obj_to_enqueue_to_head_dict[ queueName] = [ workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval ] except Exception as errStr: mainLog.error( 'failed to gather workers for FIFO head: {0}' .format(errStr)) to_break = True mainLog.debug( 'checked {0} workers from FIFO'.format( len(workSpecsList)) + sw.get_elapsed_time()) else: mainLog.debug( 'monitor_agent_core returned None. Skipped putting to FIFO' ) if sw_fifo.get_elapsed_time_in_sec( ) > harvester_config.monitor.lockInterval: mainLog.warning( 'a single FIFO cycle was longer than lockInterval ' + sw_fifo.get_elapsed_time()) else: mainLog.debug('done a FIFO cycle' + sw_fifo.get_elapsed_time()) n_loops_hit += 1 if to_break: break else: mainLog.debug('got nothing in FIFO') else: mainLog.debug( 'workers in FIFO too young to check. 
Skipped') if self.singleMode: break if overhead_time is not None: time.sleep( max(-overhead_time * random.uniform(0.1, 1), adjusted_sleepTime)) else: time.sleep( max(fifoCheckDuration * random.uniform(0.1, 1), adjusted_sleepTime)) mainLog.debug( 'run {0} loops, including {1} FIFO cycles'.format( n_loops, n_loops_hit)) # enqueue to fifo sw.reset() n_chunk_put = 0 mainLog.debug('putting worker chunks to FIFO') for _dct in (obj_to_enqueue_dict, remaining_obj_to_enqueue_dict): for queueName, obj_to_enqueue in iteritems(_dct): try: workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue if workSpecsToEnqueue: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) n_chunk_put += 1 mainLog.info( 'put a chunk of {0} workers of {1} to FIFO with score {2}' .format(len(workSpecsToEnqueue), queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}'.format( errStr)) mainLog.debug('putting worker chunks to FIFO head') for _dct in (obj_to_enqueue_to_head_dict, remaining_obj_to_enqueue_to_head_dict): for queueName, obj_to_enqueue_to_head in iteritems(_dct): try: workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head if workSpecsToEnqueueToHead: score = fifoCheckInterval + timeNow_timestamp - 2**32 monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) n_chunk_put += 1 mainLog.info( 'put a chunk of {0} workers of {1} to FIFO with score {2}' .format(len(workSpecsToEnqueueToHead), queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}'. format(errStr)) # release protective dequeued objects if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0: monitor_fifo.release(ids=obj_dequeued_id_list) mainLog.debug( 'put {0} worker chunks into FIFO'.format(n_chunk_put) + sw.get_elapsed_time()) # adjust adjusted_sleepTime if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime: speedup_factor = (sum_overhead_time_stat - sleepTime) / ( n_chunk_peeked_stat * harvester_config.monitor.checkInterval) speedup_factor = max(speedup_factor, 0) adjusted_sleepTime = adjusted_sleepTime / (1. + speedup_factor) elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0: adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2 mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format( adjusted_sleepTime)) # end run with fifo mainLog.debug('ended run with FIFO') # time the cycle mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(adjusted_sleepTime): mainLog.debug('terminated') return
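# Illustrative sketch: a stripped-down restatement of the sleep-time adaptation at
# the end of the FIFO cycle above. When the accumulated FIFO overhead exceeds the
# nominal sleep time the monitor shortens its sleep; when nothing was peeked it
# drifts halfway back toward the configured value. The numbers in __main__ are
# made up.
def adjust_sleep(sleep_time, adjusted_sleep_time, n_chunks_peeked,
                 sum_overhead_time, check_interval):
    if n_chunks_peeked > 0 and sum_overhead_time > sleep_time:
        speedup = (sum_overhead_time - sleep_time) / (n_chunks_peeked * check_interval)
        speedup = max(speedup, 0)
        return adjusted_sleep_time / (1.0 + speedup)
    elif n_chunks_peeked == 0 or sum_overhead_time < 0:
        return (sleep_time + adjusted_sleep_time) / 2.0
    return adjusted_sleep_time

if __name__ == '__main__':
    print(adjust_sleep(5.0, 5.0, n_chunks_peeked=10,
                       sum_overhead_time=20.0, check_interval=60))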
def run(self): lockedBy = 'stager-{0}'.format(self.get_pid()) while True: sw = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('try to get jobs to check') # get jobs to check preparation try: maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck except Exception: maxFilesPerJob = None jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck, harvester_config.stager.checkInterval, harvester_config.stager.lockInterval, lockedBy, 'transferring', JobSpec.HO_hasTransfer, max_files_per_job=maxFilesPerJob) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('start checking') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin stagerCore = self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue tmpStat, tmpStr = stagerCore.check_status(jobSpec) # check result if tmpStat is True: # succeeded newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus)) elif tmpStat is False: # fatal error tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: if fileSpec.status != 'finished': fileSpec.status = 'failed' errStr = 'stage-out failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) else: # on-going tmpLog.debug('try to check later since {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger stage-out try: maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger except Exception: maxFilesPerJob = None jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger, harvester_config.stager.triggerInterval, harvester_config.stager.lockInterval, lockedBy, 'to_transfer', JobSpec.HO_hasOutput, JobSpec.HO_hasZipOutput, max_files_per_job=maxFilesPerJob) mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger))) # loop over all jobs for jobSpec in jobsToTrigger: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to trigger stage-out') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin 
stagerCore = self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # trigger stage-out tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec) # check result if tmpStat is True: # succeeded jobSpec.all_files_triggered_to_stage_out() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus)) elif tmpStat is False: # fatal error tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: if fileSpec.status != 'finished': fileSpec.status = 'failed' errStr = 'stage-out failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) else: # temporary error tmpLog.debug('try to trigger later since {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to zip output try: maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip except Exception: maxFilesPerJob = None try: zipInterval = harvester_config.stager.zipInterval except Exception: zipInterval = harvester_config.stager.triggerInterval jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip, zipInterval, harvester_config.stager.lockInterval, lockedBy, 'to_transfer', JobSpec.HO_hasZipOutput, JobSpec.HO_hasOutput, max_files_per_job=maxFilesPerJob) mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip))) # loop over all jobs for jobSpec in jobsToZip: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to zip output') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin stagerCore = self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # trigger preparation tmpStat, tmpStr = stagerCore.zip_output(jobSpec) # succeeded if tmpStat is True: # update job jobSpec.all_files_zipped() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy) tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus)) else: # failed tmpLog.debug('failed to zip with {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) mainLog.debug('done' + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.stager.sleepTime): mainLog.debug('terminated') return
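# Illustrative sketch (hypothetical config object): the stager above reads optional
# configuration attributes (maxFilesPerJobToCheck, zipInterval, ...) with a
# try/except fallback; getattr() with a default gives the same effect when the
# attribute is simply missing.
class _StagerConfig(object):
    triggerInterval = 300  # zipInterval deliberately left undefined

def zip_interval(cfg):
    # fall back to triggerInterval when zipInterval is not configured
    return getattr(cfg, 'zipInterval', cfg.triggerInterval)

if __name__ == '__main__':
    print(zip_interval(_StagerConfig()))  # -> 300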
def run(self): while True: mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run') mainLog.debug('getting number of jobs to be fetched') # get number of jobs to be fetched nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch( harvester_config.jobfetcher.nQueues, harvester_config.jobfetcher.lookupTime) mainLog.debug('got {0} queues'.format(len(nJobsPerQueue))) # loop over all queues for queueName, nJobs in iteritems(nJobsPerQueue): # check queue if not self.queueConfigMapper.has_queue(queueName): continue tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), method_name='run') # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) # upper limit if nJobs > harvester_config.jobfetcher.maxJobs: nJobs = harvester_config.jobfetcher.maxJobs # get jobs default_prodSourceLabel = queueConfig.get_source_label() pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {}) choice_list = core_utils.make_choice_list( pdpm=pdpm, default=default_prodSourceLabel) prodSourceLabel = random.choice(choice_list) tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format( nJobs, prodSourceLabel)) sw = core_utils.get_stopwatch() siteName = queueConfig.siteName jobs, errStr = self.communicator.get_jobs( siteName, self.nodeName, prodSourceLabel, self.nodeName, nJobs, queueConfig.getJobCriteria) tmpLog.info('got {0} jobs with {1} {2}'.format( len(jobs), errStr, sw.get_elapsed_time())) # convert to JobSpec if len(jobs) > 0: # get extractor plugin if hasattr(queueConfig, 'extractor'): extractorCore = self.pluginFactory.get_plugin( queueConfig.extractor) else: extractorCore = None jobSpecs = [] fileStatMap = dict() sw_startconvert = core_utils.get_stopwatch() for job in jobs: timeNow = datetime.datetime.utcnow() jobSpec = JobSpec() jobSpec.convert_job_json(job) jobSpec.computingSite = queueName jobSpec.status = 'starting' jobSpec.subStatus = 'fetched' jobSpec.creationTime = timeNow jobSpec.stateChangeTime = timeNow jobSpec.configID = queueConfig.configID jobSpec.set_one_attribute( 'schedulerID', 'harvester-{0}'.format( harvester_config.master.harvester_id)) if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None: jobSpec.zipPerMB = queueConfig.zipPerMB fileGroupDictList = [ jobSpec.get_input_file_attributes() ] if extractorCore is not None: fileGroupDictList.append( extractorCore.get_aux_inputs(jobSpec)) for fileGroupDict in fileGroupDictList: for tmpLFN, fileAttrs in iteritems(fileGroupDict): # check file status if tmpLFN not in fileStatMap: fileStatMap[ tmpLFN] = self.dbProxy.get_file_status( tmpLFN, 'input', queueConfig.ddmEndpointIn, 'starting') # make file spec fileSpec = FileSpec() fileSpec.PandaID = jobSpec.PandaID fileSpec.taskID = jobSpec.taskID fileSpec.lfn = tmpLFN fileSpec.endpoint = queueConfig.ddmEndpointIn fileSpec.scope = fileAttrs['scope'] # set preparing to skip stage-in if the file is (being) taken care of by another job if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \ or 'to_prepare' in fileStatMap[tmpLFN]: fileSpec.status = 'preparing' else: fileSpec.status = 'to_prepare' if fileSpec.status not in fileStatMap[tmpLFN]: fileStatMap[tmpLFN][fileSpec.status] = 0 fileStatMap[tmpLFN][fileSpec.status] += 1 if 'INTERNAL_FileType' in fileAttrs: fileSpec.fileType = fileAttrs[ 'INTERNAL_FileType'] jobSpec.auxInput = JobSpec.AUX_hasAuxInput else: fileSpec.fileType = 'input' if 'INTERNAL_URL' in fileAttrs: fileSpec.url = fileAttrs['INTERNAL_URL'] jobSpec.add_in_file(fileSpec) 
jobSpec.trigger_propagation() jobSpecs.append(jobSpec) # insert to DB tmpLog.debug("Conversion of {0} jobs {1}".format( len(jobs), sw_startconvert.get_elapsed_time())) sw_insertdb = core_utils.get_stopwatch() self.dbProxy.insert_jobs(jobSpecs) tmpLog.debug('Insert of {0} jobs {1}'.format( len(jobSpecs), sw_insertdb.get_elapsed_time())) mainLog.debug('done') # check if being terminated if self.terminated(harvester_config.jobfetcher.sleepTime): mainLog.debug('terminated') return
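# Illustrative sketch (hypothetical re-implementation, not harvester's own
# core_utils.make_choice_list): the job fetcher above builds a weighted choice list
# from prodSourceLabelRandomWeightsPermille so that random.choice() returns each
# label with roughly the configured permille probability, with the default label
# filling the remainder.
import random

def make_weighted_choices(weights_permille, default):
    choices = []
    for label, permille in weights_permille.items():
        choices += [label] * permille
    choices += [default] * max(1000 - len(choices), 0)
    return choices

if __name__ == '__main__':
    choices = make_weighted_choices({'user': 100}, default='managed')
    print(random.choice(choices))  # ~10% 'user', ~90% 'managed'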
def run(self): lockedBy = 'stager-{0}'.format(self.ident) while True: sw = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('try to get jobs to check') # get jobs to check preparation try: maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck except Exception: maxFilesPerJob = None jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck, harvester_config.stager.checkInterval, harvester_config.stager.lockInterval, lockedBy, 'transferring', JobSpec.HO_hasTransfer, max_files_per_job=maxFilesPerJob) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('start checking') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin stagerCore = self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue tmpStat, tmpStr = stagerCore.check_status(jobSpec) # check result if tmpStat is True: # succeeded newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True) tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus)) elif tmpStat is False: # fatal error tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: if fileSpec.status != 'finished': fileSpec.status = 'failed' errStr = 'stage-out failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True) tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) else: # on-going tmpLog.debug('try to check later since {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger stage-out try: maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger except Exception: maxFilesPerJob = None jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger, harvester_config.stager.triggerInterval, harvester_config.stager.lockInterval, lockedBy, 'to_transfer', JobSpec.HO_hasOutput, JobSpec.HO_hasZipOutput, max_files_per_job=maxFilesPerJob) mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger))) # loop over all jobs for jobSpec in jobsToTrigger: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to trigger stage-out') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin stagerCore = 
self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # trigger stage-out tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec) # check result if tmpStat is True: # succeeded jobSpec.all_files_triggered_to_stage_out() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True) tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus)) elif tmpStat is False: # fatal error tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: if fileSpec.status != 'finished': fileSpec.status = 'failed' errStr = 'stage-out failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True) tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) else: # temporary error tmpLog.debug('try to trigger later since {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to zip output try: maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip except Exception: maxFilesPerJob = None try: zipInterval = harvester_config.stager.zipInterval except Exception: zipInterval = harvester_config.stager.triggerInterval jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip, zipInterval, harvester_config.stager.lockInterval, lockedBy, 'to_transfer', JobSpec.HO_hasZipOutput, JobSpec.HO_hasOutput, max_files_per_job=maxFilesPerJob) mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip))) # loop over all jobs for jobSpec in jobsToZip: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to zip output') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin stagerCore = self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # trigger preparation tmpStat, tmpStr = stagerCore.zip_output(jobSpec) # succeeded if tmpStat is True: # update job jobSpec.all_files_zipped() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False) tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus)) else: # failed tmpLog.debug('failed to zip with {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) mainLog.debug('done' + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.stager.sleepTime): mainLog.debug('terminated') return
def run(self): while True: mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run') mainLog.debug('getting number of jobs to be fetched') # get number of jobs to be fetched nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues, harvester_config.jobfetcher.lookupTime) mainLog.debug('got {0} queues'.format(len(nJobsPerQueue))) # loop over all queues for queueName, nJobs in iteritems(nJobsPerQueue): # check queue if not self.queueConfigMapper.has_queue(queueName): continue tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), method_name='run') # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) # upper limit if nJobs > harvester_config.jobfetcher.maxJobs: nJobs = harvester_config.jobfetcher.maxJobs # get jobs tmpLog.debug('getting {0} jobs'.format(nJobs)) sw = core_utils.get_stopwatch() siteName = queueConfig.siteName jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, queueConfig.get_source_label(), self.nodeName, nJobs, queueConfig.getJobCriteria) tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), errStr, sw.get_elapsed_time())) # convert to JobSpec if len(jobs) > 0: jobSpecs = [] fileStatMap = dict() sw_startconvert = core_utils.get_stopwatch() for job in jobs: timeNow = datetime.datetime.utcnow() jobSpec = JobSpec() jobSpec.convert_job_json(job) jobSpec.computingSite = queueName jobSpec.status = 'starting' jobSpec.subStatus = 'fetched' jobSpec.creationTime = timeNow jobSpec.stateChangeTime = timeNow jobSpec.configID = queueConfig.configID jobSpec.set_one_attribute('schedulerID', 'harvester-{0}'.format(harvester_config.master.harvester_id)) if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None: jobSpec.zipPerMB = queueConfig.zipPerMB for tmpLFN, fileAttrs in iteritems(jobSpec.get_input_file_attributes()): # check file status if tmpLFN not in fileStatMap: fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, 'input', queueConfig.ddmEndpointIn, 'starting') # make file spec fileSpec = FileSpec() fileSpec.PandaID = jobSpec.PandaID fileSpec.taskID = jobSpec.taskID fileSpec.lfn = tmpLFN fileSpec.endpoint = queueConfig.ddmEndpointIn fileSpec.scope = fileAttrs['scope'] # set preparing to skip stage-in if the file is (being) taken care of by another job if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \ or 'to_prepare' in fileStatMap[tmpLFN]: fileSpec.status = 'preparing' else: fileSpec.status = 'to_prepare' if fileSpec.status not in fileStatMap[tmpLFN]: fileStatMap[tmpLFN][fileSpec.status] = 0 fileStatMap[tmpLFN][fileSpec.status] += 1 fileSpec.fileType = 'input' jobSpec.add_in_file(fileSpec) jobSpec.trigger_propagation() jobSpecs.append(jobSpec) # insert to DB tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs),sw_startconvert.get_elapsed_time())) sw_insertdb =core_utils.get_stopwatch() self.dbProxy.insert_jobs(jobSpecs) tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time())) mainLog.debug('done') # check if being terminated if self.terminated(harvester_config.jobfetcher.sleepTime): mainLog.debug('terminated') return
def run(self):
    lockedBy = 'monitor-{0}'.format(self.ident)
    # init messengers
    for queueConfig in self.queueConfigMapper.get_all_queues().values():
        # just import for module initialization
        self.pluginFactory.get_plugin(queueConfig.messenger)
    # main
    last_DB_cycle_timestamp = 0
    monitor_fifo = self.monitor_fifo
    while True:
        sw = core_utils.get_stopwatch()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime:
            # run with workers from DB
            mainLog.debug('starting run with DB')
            mainLog.debug('getting workers to monitor')
            workSpecsPerQueue = self.dbProxy.get_workers_to_update(harvester_config.monitor.maxWorkers,
                                                                   harvester_config.monitor.checkInterval,
                                                                   harvester_config.monitor.lockInterval,
                                                                   lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue):
                for configID, workSpecsList in iteritems(configIdWorkSpecs):
                    retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID)
                    if self.monitor_fifo.enabled and retVal is not None:
                        workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                        if workSpecsToEnqueue:
                            mainLog.debug('putting workers to FIFO')
                            try:
                                score = fifoCheckInterval + timeNow_timestamp
                                monitor_fifo.put((queueName, workSpecsToEnqueue), score)
                                mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                            except Exception as errStr:
                                mainLog.error('failed to put object to FIFO: {0}'.format(errStr))
                        if workSpecsToEnqueueToHead:
                            mainLog.debug('putting workers to FIFO head')
                            try:
                                score = fifoCheckInterval - timeNow_timestamp
                                monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score)
                                mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                            except Exception as errStr:
                                mainLog.error('failed to put object to FIFO head: {0}'.format(errStr))
            last_DB_cycle_timestamp = time.time()
            mainLog.debug('ended run with DB')
        elif self.monitor_fifo.enabled:
            # run with workers from FIFO
            if monitor_fifo.to_check_workers():
                # check fifo size
                fifo_size = monitor_fifo.size()
                mainLog.debug('FIFO size is {0}'.format(fifo_size))
                mainLog.debug('starting run with FIFO')
                try:
                    obj_gotten = monitor_fifo.get(timeout=1)
                except Exception as errStr:
                    mainLog.error('failed to get object from FIFO: {0}'.format(errStr))
                else:
                    if obj_gotten is not None:
                        queueName, workSpecsList = obj_gotten
                        mainLog.debug('got {0} workers of {1}'.format(len(workSpecsList), queueName))
                        configID = workSpecsList[0][0].configID
                        for workSpecs in workSpecsList:
                            for workSpec in workSpecs:
                                if workSpec.pandaid_list is None:
                                    _jobspec_list = workSpec.get_jobspec_list()
                                    if _jobspec_list is not None:
                                        workSpec.pandaid_list = [j.PandaID for j in workSpec.get_jobspec_list()]
                                    else:
                                        workSpec.pandaid_list = []
                                    workSpec.force_update('pandaid_list')
                        retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList,
                                                         from_fifo=True, config_id=configID)
                        if retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put((queueName, workSpecsToEnqueue), score)
                                    mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error('failed to put object to FIFO: {0}'.format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score)
                                    mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error('failed to put object to FIFO head: {0}'.format(errStr))
                        else:
                            mainLog.debug('monitor_agent_core returned None. Skipped putting to FIFO')
                    else:
                        mainLog.debug('got nothing in FIFO')
                mainLog.debug('ended run with FIFO')
            else:
                mainLog.debug('workers in FIFO too young to check. Skipped')
        if sw.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
            mainLog.warning('a single cycle was longer than lockInterval ' + sw.get_elapsed_time())
        else:
            mainLog.debug('done' + sw.get_elapsed_time())
        # check if being terminated
        sleepTime = (harvester_config.monitor.fifoSleepTimeMilli / 1000.0) \
            if self.monitor_fifo.enabled else harvester_config.monitor.sleepTime
        if self.terminated(sleepTime):
            mainLog.debug('terminated')
            return
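# The monitor re-enqueues workers with score = fifoCheckInterval + timeNow_timestamp
# and uses score = fifoCheckInterval - timeNow_timestamp for head insertion, i.e. a
# much smaller score. The sketch below shows one way such a score-ordered FIFO can
# work, with the smallest score retrieved first; ScoredFifo is a hypothetical
# stand-in, not harvester's MonitorFIFO.
import heapq
import time


class ScoredFifo(object):
    """Items are popped in ascending score order, so smaller scores come out first."""

    def __init__(self):
        self._heap = []

    def put(self, obj, score):
        heapq.heappush(self._heap, (score, obj))

    def get(self):
        if not self._heap:
            return None
        return heapq.heappop(self._heap)[1]

    def size(self):
        return len(self._heap)


if __name__ == '__main__':
    fifo = ScoredFifo()
    check_interval = 300  # assumed check interval in seconds
    now = time.time()
    fifo.put('workers_of_queue_A', check_interval + now)  # normal re-check entry
    fifo.put('workers_of_queue_B', check_interval - now)  # head entry, far smaller score
    print(fifo.get())  # workers_of_queue_B
    print(fifo.get())  # workers_of_queue_A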