# constructor
def __init__(self, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    # get module and class names
    moduleNames = self.get_list(harvester_config.credmanager.moduleName)
    classNames = self.get_list(harvester_config.credmanager.className)
    # file names of original certificates
    if hasattr(harvester_config.credmanager, 'inCertFile'):
        inCertFiles = self.get_list(harvester_config.credmanager.inCertFile)
    else:
        inCertFiles = self.get_list(harvester_config.credmanager.certFile)
    # file names of certificates to be generated
    if hasattr(harvester_config.credmanager, 'outCertFile'):
        outCertFiles = self.get_list(harvester_config.credmanager.outCertFile)
    else:
        # use the file name of the certificate for panda connection as output name
        outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
    # VOMS
    vomses = self.get_list(harvester_config.credmanager.voms)
    # get plugin
    self.exeCores = []
    for moduleName, className, inCertFile, outCertFile, voms in \
            zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
        pluginPar = {'module': moduleName,
                     'name': className,
                     'inCertFile': inCertFile,
                     'outCertFile': outCertFile,
                     'voms': voms}
        exeCore = self.pluginFactory.get_plugin(pluginPar)
        self.exeCores.append(exeCore)
# run in a child process: read parameters from stdin, execute the requested
# plugin method, and return the pickled result as JSON
def run(self):
    tmpLog = _logger
    try:
        # get parameters
        param_dict = json.load(sys.stdin)
        plugin_config = param_dict['plugin_config']
        function_name = param_dict['function_name']
        tmpLog = core_utils.make_logger(_logger, 'pid={0}'.format(os.getpid()),
                                        method_name=function_name)
        tmpLog.debug('start')
        args = pickle.loads(str(param_dict['args']))
        kwargs = pickle.loads(str(param_dict['kwargs']))
        # get plugin
        pluginFactory = PluginFactory(no_db=True)
        core = pluginFactory.get_plugin(plugin_config)
        # execute
        ret = getattr(core, function_name)(*args, **kwargs)
        # make return
        return_dict = {'return': pickle.dumps(ret),
                       'args': pickle.dumps(args),
                       'kwargs': pickle.dumps(kwargs)}
        tmpLog.debug('done')
    except Exception as e:
        errMsg = core_utils.dump_error_message(tmpLog)
        return_dict = {'exception': pickle.dumps(e),
                       'dialog': pickle.dumps(errMsg)}
    return json.dumps(return_dict)
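# For reference, a minimal sketch (not part of harvester) of the payload a parent
# process would pipe to this child's stdin; the key names match what run() reads
# above, while the plugin module/class and method name are placeholders.
import json
import pickle

payload = {
    'plugin_config': {'module': 'some.plugin.module', 'name': 'SomePlugin'},  # placeholders
    'function_name': 'check_status',  # hypothetical plugin method
    # args/kwargs travel as pickled strings, mirroring pickle.loads(str(...)) above
    'args': pickle.dumps(('worker-123',), protocol=0).decode('latin-1'),
    'kwargs': pickle.dumps({}, protocol=0).decode('latin-1'),
}
print(json.dumps(payload))  # would be written to the child's stdin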
# constructor
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.dbProxy = DBProxy()
    self.communicator = communicator
    self.nodeName = socket.gethostname()
    self.queueConfigMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
def test():
    '''test submission'''
    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvestercore.plugin_factory import PluginFactory
    import json

    queuename = 'ARC-TEST'
    queueconfmapper = QueueConfigMapper()
    queueconf = queueconfmapper.get_queue(queuename)
    pluginfactory = PluginFactory()

    pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}'
    pandajob = json.loads(pandajob)
    jspec = JobSpec()
    jspec.convert_job_json(pandajob)
    jspec.computingSite = queuename
    jspeclist = [jspec]

    maker = pluginfactory.get_plugin(queueconf.workerMaker)
    wspec = maker.make_worker(jspeclist, queueconf)
    wspec.hasJob = 1
    wspec.set_jobspec_list(jspeclist)

    sub = ARCSubmitter()
    print(sub.submit_workers([wspec]))
    print(wspec.batchID)
def _initialize_fifo(self, force_enable=False):
    self.fifoName = '{0}_fifo'.format(self.titleName)
    self.config = getattr(harvester_config, self.titleName)
    if force_enable:
        self.enabled = True
    elif hasattr(self.config, 'fifoEnable') and self.config.fifoEnable:
        self.enabled = True
    else:
        self.enabled = False
        return
    pluginConf = vars(self.config).copy()
    pluginConf.update({'titleName': self.titleName})
    if hasattr(self.config, 'fifoModule') and hasattr(self.config, 'fifoClass'):
        # module/class defined in the agent's own config section
        pluginConf.update({'module': self.config.fifoModule,
                           'name': self.config.fifoClass})
    else:
        # fall back to the global fifo section
        if not hasattr(harvester_config, 'fifo'):
            return
        pluginConf.update({'module': harvester_config.fifo.fifoModule,
                           'name': harvester_config.fifo.fifoClass})
    pluginFactory = PluginFactory()
    self.fifo = pluginFactory.get_plugin(pluginConf)
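# A hedged sketch of the dict that reaches PluginFactory.get_plugin() when the
# global fifo section is used; the module/class values below are assumptions,
# not values mandated by harvester, and the real ones come from harvester_config.
pluginConf = {
    'titleName': 'monitor',  # name of the agent section owning this FIFO
    'module': 'pandaharvester.harvesterfifo.sqlite_fifo',  # assumed fifoModule
    'name': 'SqliteFifo',                                  # assumed fifoClass
    # ...plus every attribute of the agent's own config section via vars(self.config)
}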
# constructor
def __init__(self, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.workerMaker = WorkerMaker()
    self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
    self.pluginFactory = PluginFactory()
# constructor
def __init__(self, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.pluginFactory = PluginFactory()
    self.startTimestamp = time.time()
    self.monitor_fifo = MonitorFIFO()
    self.apfmon = Apfmon(self.queueConfigMapper)
# constructor
def __init__(self, **kwarg):
    FIFOBase.__init__(self, **kwarg)
    self.fifoName = '{0}_fifo'.format(self.titleName)
    pluginConf = {}
    pluginConf.update({'titleName': self.titleName})
    pluginConf.update({'module': harvester_config.fifo.fifoModule,
                       'name': harvester_config.fifo.fifoClass})
    pluginFactory = PluginFactory()
    self.fifo = pluginFactory.get_plugin(pluginConf)
# constructor
def __init__(self, queue_config_mapper):
    self.queue_configMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    self.throttlerMap = dict()
    self.apf_mon = Apfmon(self.queue_configMapper)
    try:
        self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
    except AttributeError:
        self.maxNewWorkers = None
# constructor
def __init__(self, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.queue_config_mapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    # plugin cores
    self.exeCores = []
    self.queue_exe_cores = []
    # get plugin from harvester config
    self.get_cores_from_harvester_config()
    # update plugin cores from queue config
    self.update_cores_from_queue_config()
# constructor
def __init__(self, **kwarg):
    BaseWorkerMaker.__init__(self, **kwarg)
    self.pluginFactory = PluginFactory()
    self.queue_config_mapper = QueueConfigMapper()
    tmpLog = self.make_logger(baseLogger, method_name='__init__')
    tmpLog.info("Multinode workermaker: created.")
    tmpLog.debug("Queue name: {0}".format(self.queueName))
    if self.mode == "static":
        tmpLog.info("Static configuration")
    elif self.mode == "dynamic":
        tmpLog.info("Dynamic configuration")
        self.nNodes, self.walltimelimit = self.get_resources()
    self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
# constructor
def __init__(self, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.pluginFactory = PluginFactory()
    self.startTimestamp = time.time()
    self.monitor_fifo = MonitorFIFO()
    if self.monitor_fifo.enabled:
        self.monitor_event_fifo = MonitorEventFIFO()
    else:
        self.monitor_event_fifo = None
    self.apfmon = Apfmon(self.queueConfigMapper)
    self.eventBasedMonCoreList = []
    if getattr(harvester_config.monitor, 'eventBasedEnable', False):
        for pluginConf in harvester_config.monitor.eventBasedPlugins:
            pluginFactory = PluginFactory()
            self.eventBasedMonCoreList.append(pluginFactory.get_plugin(pluginConf))
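# The loop above iterates harvester_config.monitor.eventBasedPlugins and hands
# each entry straight to PluginFactory.get_plugin(), so each entry is a plugin
# configuration with at least 'module' and 'name'. A hypothetical example
# (module/class names and the extra option are placeholders, not harvester API):
eventBasedPlugins = [
    {'module': 'pandaharvester.harvestermonitor.some_event_monitor',  # placeholder
     'name': 'SomeEventMonitor',                                      # placeholder
     'checkInterval': 300},                                           # assumed extra option
]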
# agent class to feed events to workers
class EventFeeder(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'eventfeeder-{0}'.format(self.ident)
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(
                harvester_config.eventfeeder.maxWorkers,
                harvester_config.eventfeeder.lockInterval)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # update worker
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec)
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
# constructor
def __init__(self, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.workerMaker = WorkerMaker()
    self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
    self.pluginFactory = PluginFactory()
    self.monitor_fifo = MonitorFIFO()
    self.apfmon = Apfmon(self.queueConfigMapper)
# constructor
def __init__(self, queue_config_mapper):
    self.queueConfigMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    self.throttlerMap = dict()
    self.apf_mon = Apfmon(self.queueConfigMapper)
    try:
        self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
    except AttributeError:
        self.maxNewWorkers = None
# class to define the number of workers to submit
class WorkerAdjuster:
    # constructor
    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        self.throttlerMap = dict()

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name),
                                        method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data
            # define num of new workers
            for queueName in static_num_workers:
                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))
                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and \
                            queueStat[queueName]['status'] in ['offline', 'standby', 'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        continue
                    # get queue
                    queueConfig = self.queueConfigMapper.get_queue(queueName)
                    # get throttler
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler
                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg)
                            tmpLog.debug(retMsg)
                            continue
                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    nQueueLimit = queueConfig.nQueueLimitWorker
                    maxWorkers = queueConfig.maxWorkers
                    if queueConfig.runMode == 'slave':
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None
                    # define num of new workers based on static site config
                    nNewWorkers = 0
                    if nQueue >= nQueueLimit > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format(nQueue, nQueueLimit)
                        tmpLog.debug(retMsg)
                        pass
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(
                            nQueue, nReady, nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                        pass
                    else:
                        maxQueuedWorkers = None
                        if nQueueLimit > 0:
                            # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimit
                        if nNewWorkersDef is not None:
                            # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave
                        if maxQueuedWorkers is None:
                            # no value found, use default value
                            maxQueuedWorkers = 1
                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'
                                     .format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'
                                         .format(nNewWorkers))
                    if queueConfig.maxNewWorkersPerCycle > 0:
                        nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'
                                     .format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except:
            # dump error
            errMsg = core_utils.dump_error_message(tmpLog)
            return None
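# To make the sizing logic above concrete, a standalone sketch of the same
# min/max chain with made-up numbers; this is illustrative arithmetic only,
# not harvester API.
nQueue, nReady, nRunning = 5, 2, 10
nQueueLimit, maxWorkers, maxNewWorkersPerCycle = 20, 30, 8

nNewWorkers = 0
if nQueue >= nQueueLimit > 0:
    pass  # enough queued workers already
elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
    pass  # enough workers in the system
else:
    maxQueuedWorkers = nQueueLimit if nQueueLimit > 0 else 1
    nNewWorkers = max(maxQueuedWorkers - nQueue, 0)                     # 20 - 5 = 15
    if maxWorkers > 0:
        nNewWorkers = min(nNewWorkers,
                          max(maxWorkers - nQueue - nReady - nRunning, 0))  # min(15, 13) = 13
if maxNewWorkersPerCycle > 0:
    nNewWorkers = min(nNewWorkers, maxNewWorkersPerCycle)               # min(13, 8) = 8
print(nNewWorkers)  # 8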
from pandaharvester.harvestercore.work_spec import WorkSpec
from pandaharvester.harvestercore.plugin_factory import PluginFactory
from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
from pandaharvester.harvestercore.job_spec import JobSpec
from pandaharvester.harvestermisc import signal_utils

fork_child_pid = os.fork()
if fork_child_pid != 0:
    # parent: install suicide handler and wait for the child
    signal_utils.set_suicide_handler(None)
    os.wait()
else:
    queueName = sys.argv[1]
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(queueName)
    pluginFactory = PluginFactory()
    com = CommunicatorPool()
    # get job
    jobSpecList = []
    if queueConfig.mapType != WorkSpec.MT_NoJob:
        jobs, errStr = com.get_jobs(queueConfig.queueName, 'nodeName',
                                    queueConfig.prodSourceLabel,
                                    'computingElement', 1, None)
        if len(jobs) == 0:
            print("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr))
            sys.exit(0)
        jobSpec = JobSpec()
        jobSpec.convert_job_json(jobs[0])
# constructor
def __init__(self, **kwarg):
    PluginBase.__init__(self, **kwarg)
    self.pluginFactory = PluginFactory()
    self.queue_config_mapper = QueueConfigMapper()
    tmpLog = self.make_logger(baseLogger, method_name='__init__')
    tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))
# agent class to stage out jobs
class Stager(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'stager-{0}'.format(self.get_pid())
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get jobs to check')
            # get jobs to check preparation
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
            except Exception:
                maxFilesPerJob = None
            jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck,
                                                              harvester_config.stager.checkInterval,
                                                              harvester_config.stager.lockInterval,
                                                              lockedBy, 'transferring',
                                                              JobSpec.HO_hasTransfer,
                                                              max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
            # loop over all jobs
            for jobSpec in jobsToCheck:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
                try:
                    tmpLog.debug('start checking')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr))
                        # update job
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # on-going
                        tmpLog.debug('try to check later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to trigger stage-out
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
            except Exception:
                maxFilesPerJob = None
            jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger,
                                                                harvester_config.stager.triggerInterval,
                                                                harvester_config.stager.lockInterval,
                                                                lockedBy, 'to_transfer',
                                                                JobSpec.HO_hasOutput,
                                                                JobSpec.HO_hasZipOutput,
                                                                max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
            # loop over all jobs
            for jobSpec in jobsToTrigger:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
                try:
                    tmpLog.debug('try to trigger stage-out')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger stage-out
                    tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        jobSpec.all_files_triggered_to_stage_out()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr))
                        # update job
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # temporary error
                        tmpLog.debug('try to trigger later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to zip output
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
            except Exception:
                maxFilesPerJob = None
            try:
                zipInterval = harvester_config.stager.zipInterval
            except Exception:
                zipInterval = harvester_config.stager.triggerInterval
            jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip,
                                                            zipInterval,
                                                            harvester_config.stager.lockInterval,
                                                            lockedBy, 'to_transfer',
                                                            JobSpec.HO_hasZipOutput,
                                                            JobSpec.HO_hasOutput,
                                                            max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
            # loop over all jobs
            for jobSpec in jobsToZip:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
                try:
                    tmpLog.debug('try to zip output')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger preparation
                    tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                    # succeeded
                    if tmpStat is True:
                        # update job
                        jobSpec.all_files_zipped()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy)
                        tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus))
                    else:
                        # failed
                        tmpLog.debug('failed to zip with {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.stager.sleepTime):
                mainLog.debug('terminated')
                return
# agent class to kill and clean up workers
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            for queueName, workSpecs in iteritems(workersToKill):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName):
                    mainLog.error('queue config for {0} not found'.format(queueName))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    tmpLog.debug('start killing')
                    tmpStat, tmpOut = sweeperCore.kill_worker(workSpec)
                    tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
            mainLog.debug('done kill')
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except:
                keepMissed = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed}
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            for queueName, workSpecs in iteritems(workersForCleanup):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName):
                    mainLog.error('queue config for {0} not found'.format(queueName))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    tmpLog.debug('start cleanup')
                    tmpStat, tmpOut = sweeperCore.sweep_worker(workSpec)
                    tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                    if tmpStat:
                        # delete from DB
                        self.dbProxy.delete_worker(workSpec.workerID)
            mainLog.debug('done cleanup')
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
# monitor plugin using the SAGA job API
class SAGAMonitor(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.

        :param workspec_list: a list of work spec instances
        :return: a tuple of return code (True for success, False otherwise) and a list of worker's statuses
        :rtype: (bool, [string,])
        """
        try:
            job_service = saga.job.Service(self.adaptor)
        except saga.SagaException as ex:
            # retry once after a grace period; return the result so it is not discarded
            time.sleep(10)
            return self.check_workers(workspec_list)
        sagadateformat_str = '%a %b %d %H:%M:%S %Y'
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug('SAGA State for submission with batchid: {0} is: {1}'.format(
                        workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state with batchid: {0} is: {1} exit code: {2}'.format(
                        workSpec.batchID, harvester_job_state, worker.exit_code))
                    workSpec.set_status(harvester_job_state)
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.strptime(worker.created, sagadateformat_str)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.strptime(worker.started, sagadateformat_str)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.strptime(worker.finished, sagadateformat_str)
                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(
                            workSpec.status, workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:
                            # let's try to find exit code, exit message etc...
                            tmpLog.info("Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, \
                                starttime, endtime, errStr = self.deep_checkjob(workSpec.batchID,
                                                                                workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                        tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                            workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))
                    if worker.state == saga.job.PENDING:
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info("Queue time {0} is longer than limit {1}, worker will be canceled".format(
                                queue_time, self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(
                                harvester_job_state, workSpec.nativeExitCode))
                            # proper processing of jobs for the worker will be required, to avoid 'fake' fails
                except saga.SagaException as ex:
                    tmpLog.info('An exception occurred while retrieving worker information {0}'.format(
                        workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # 'finished' is probably not the proper state in this case; 'undefined' looks a bit better
                    # some more work for SAGA to get the proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, \
                        starttime, endtime, errStr = self.deep_checkjob(workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                f = open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w')
                f.write(workSpec.status)
                f.close()
            else:
                tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))
        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))
        return True, retList

    # deep check of the job
    def deep_checkjob(self, batchid, workerid):
        """Get job state, exit code and some more parameters from resource-dependent sources.

        :param batchid:
        :return: harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missing for: {0}".format(self.queueName))
            resource_utils = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
            if batchjob_info:
                tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
                harvester_job_state = batchjob_info['status']
                nativeexitcode = batchjob_info['nativeExitCode']
                nativestatus = batchjob_info['nativeStatus']
                diagmessage = batchjob_info['nativeExitMsg']
                if batchjob_info['start_time']:
                    starttime = batchjob_info['start_time']
                if batchjob_info['finish_time']:
                    endtime = batchjob_info['finish_time']
        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
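# deep_checkjob() above only reads a fixed set of keys from the resource plugin's
# reply, so a conforming get_batchjob_info() return value would look roughly like
# this sketch; all values are illustrative, not from a real batch system.
from datetime import datetime

batchjob_info = {
    'status': 'failed',            # harvester-level job state
    'nativeExitCode': 137,         # exit code reported by the batch system
    'nativeStatus': 'TIMEOUT',     # raw batch-system state string
    'nativeExitMsg': 'walltime limit exceeded',
    'start_time': datetime(2018, 1, 1, 12, 0, 0),   # may be None/empty
    'finish_time': datetime(2018, 1, 1, 18, 0, 0),  # may be None/empty
}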
# constructor
def __init__(self, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.dbProxy = DBProxy()
    self.queueConfigMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
# class to make workers
class WorkerMaker(object):
    # constructor
    def __init__(self):
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()

    # get plugin
    def get_plugin(self, queue_config):
        return self.pluginFactory.get_plugin(queue_config.workerMaker)

    # make workers
    def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None):
        tmpLog = core_utils.make_logger(_logger,
                                        'queue={0} rtype={1}'.format(queue_config.queueName, resource_type),
                                        method_name='make_workers')
        tmpLog.debug('start')
        try:
            # get plugin
            if maker is None:
                maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
            if maker is None:
                # not found
                tmpLog.error('plugin for {0} not found'.format(queue_config.queueName))
                return [], jobchunk_list
            # get ready workers
            readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready)
            # loop over all chunks
            okChunks = []
            ngChunks = []
            for iChunk, jobChunk in enumerate(jobchunk_list):
                # make a worker
                if iChunk >= n_ready:
                    workSpec = maker.make_worker(jobChunk, queue_config, resource_type)
                else:
                    # use ready worker
                    if iChunk < len(readyWorkers):
                        workSpec = readyWorkers[iChunk]
                    else:
                        workSpec = None
                # failed
                if workSpec is None:
                    ngChunks.append(jobChunk)
                    continue
                # set workerID
                if workSpec.workerID is None:
                    workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID')
                    workSpec.configID = queue_config.configID
                    workSpec.isNew = True
                okChunks.append((workSpec, jobChunk))
            # dump
            tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks), len(ngChunks)))
            return okChunks, ngChunks
        except Exception:
            # dump error
            core_utils.dump_error_message(tmpLog)
            return [], jobchunk_list

    # get number of jobs per worker
    def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_jobs_per_worker(n_workers)

    # get number of workers per job
    def get_num_workers_per_job(self, queue_config, n_workers, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_workers_per_job(n_workers)

    # check number of ready resources
    def num_ready_resources(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.num_ready_resources()

    # get upper limit on the cumulative total of workers per job
    def get_max_workers_per_job_in_total(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_in_total()

    # get upper limit on the number of new workers per job in a cycle
    def get_max_workers_per_job_per_cycle(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_per_cycle()
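# A hedged usage sketch of make_workers() from an agent's point of view;
# job_chunks and queue_config are placeholders obtained elsewhere (e.g. from
# DBProxy and QueueConfigMapper), and 'SCORE' is just an example resource type.
worker_maker = WorkerMaker()
ok_chunks, ng_chunks = worker_maker.make_workers(job_chunks, queue_config,
                                                 n_ready=0, resource_type='SCORE')
for work_spec, job_chunk in ok_chunks:
    print('worker {0} wraps {1} jobs'.format(work_spec.workerID, len(job_chunk)))
print('{0} chunks failed to become workers'.format(len(ng_chunks)))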
# credential manager
class CredManager(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queue_config_mapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # plugin cores
        self.exeCores = []
        self.queue_exe_cores = []
        # get plugin from harvester config
        self.get_cores_from_harvester_config()
        # update plugin cores from queue config
        self.update_cores_from_queue_config()

    # get list
    def get_list(self, data):
        if isinstance(data, list):
            return data
        else:
            return [data]

    # get plugin cores from harvester config
    def get_cores_from_harvester_config(self):
        # get module and class names
        if hasattr(harvester_config.credmanager, 'moduleName'):
            moduleNames = self.get_list(harvester_config.credmanager.moduleName)
        else:
            moduleNames = []
        if hasattr(harvester_config.credmanager, 'className'):
            classNames = self.get_list(harvester_config.credmanager.className)
        else:
            classNames = []
        # file names of original certificates
        if hasattr(harvester_config.credmanager, 'inCertFile'):
            inCertFiles = self.get_list(harvester_config.credmanager.inCertFile)
        elif hasattr(harvester_config.credmanager, 'certFile'):
            inCertFiles = self.get_list(harvester_config.credmanager.certFile)
        else:
            inCertFiles = []
        # file names of certificates to be generated
        if hasattr(harvester_config.credmanager, 'outCertFile'):
            outCertFiles = self.get_list(harvester_config.credmanager.outCertFile)
        else:
            # use the file name of the certificate for panda connection as output name
            outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
        # VOMS
        if hasattr(harvester_config.credmanager, 'voms'):
            vomses = self.get_list(harvester_config.credmanager.voms)
        else:
            vomses = []
        # direct and merged plugin configuration in json
        if hasattr(harvester_config.credmanager, 'pluginConfigs'):
            pluginConfigs = harvester_config.credmanager.pluginConfigs
        else:
            pluginConfigs = []
        # from traditional attributes
        for moduleName, className, inCertFile, outCertFile, voms in \
                zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
            pluginPar = {'module': moduleName,
                         'name': className,
                         'inCertFile': inCertFile,
                         'outCertFile': outCertFile,
                         'voms': voms}
            try:
                exeCore = self.pluginFactory.get_plugin(pluginPar)
                self.exeCores.append(exeCore)
            except Exception:
                _logger.error('failed to launch credmanager with traditional attributes for {0}'.format(pluginPar))
                core_utils.dump_error_message(_logger)
        # from pluginConfigs
        for pc in pluginConfigs:
            try:
                setup_maps = pc['configs']
                for setup_name, setup_map in setup_maps.items():
                    try:
                        pluginPar = {'module': pc['module'],
                                     'name': pc['name'],
                                     'setup_name': setup_name}
                        pluginPar.update(setup_map)
                        exeCore = self.pluginFactory.get_plugin(pluginPar)
                        self.exeCores.append(exeCore)
                    except Exception:
                        _logger.error('failed to launch credmanager in pluginConfigs for {0}'.format(pluginPar))
                        core_utils.dump_error_message(_logger)
            except Exception:
                _logger.error('failed to parse pluginConfigs {0}'.format(pc))
                core_utils.dump_error_message(_logger)

    # update plugin cores from queue config
    def update_cores_from_queue_config(self):
        self.queue_exe_cores = []
        for queue_name, queue_config in self.queue_config_mapper.get_all_queues().items():
            if queue_config.queueStatus == 'offline' \
                    or not hasattr(queue_config, 'credmanagers') \
                    or not isinstance(queue_config.credmanagers, list):
                continue
            for cm_setup in queue_config.credmanagers:
                try:
                    pluginPar = {'module': cm_setup['module'],
                                 'name': cm_setup['name'],
                                 'setup_name': queue_name}
                    for k, v in cm_setup.items():
                        if k in ('module', 'name'):
                            pass
                        if isinstance(v, str) and '$' in v:
                            # replace placeholders
                            value = v
                            patts = re.findall(r'\$\{([a-zA-Z\d_.]+)\}', v)
                            for patt in patts:
                                tmp_ph = '${' + patt + '}'
                                tmp_val = None
                                if patt == 'harvesterID':
                                    tmp_val = harvester_config.master.harvester_id
                                elif patt == 'queueName':
                                    tmp_val = queue_name
                                elif patt.startswith('common.'):
                                    # values from common blocks
                                    attr = patt.replace('common.', '')
                                    if hasattr(queue_config, 'common') and attr in queue_config.common:
                                        tmp_val = queue_config.common[attr]
                                if tmp_val is not None:
                                    value = value.replace(tmp_ph, tmp_val)
                            # fill in
                            pluginPar[k] = value
                        else:
                            # fill in
                            pluginPar[k] = v
                    exe_core = self.pluginFactory.get_plugin(pluginPar)
                    self.queue_exe_cores.append(exe_core)
                except Exception:
                    _logger.error('failed to launch credmanager for queue={0} with {1}'.format(
                        queue_name, pluginPar))
                    core_utils.dump_error_message(_logger)

    # main loop
    def run(self):
        while True:
            # update plugin cores from queue config
            self.update_cores_from_queue_config()
            # execute
            self.execute()  # this is the main run
            # check if being terminated
            if self.terminated(harvester_config.credmanager.sleepTime, randomize=False):
                return

    # main
    def execute(self):
        # get lock
        locked = self.dbProxy.get_process_lock('credmanager', self.get_pid(),
                                               harvester_config.credmanager.sleepTime)
        if not locked:
            return
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue
            # make logger
            credmanager_name = ''
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile, exeCore.outCertFile)
            mainLog = self.make_logger(_logger,
                                       '{0} {1}'.format(exeCore.__class__.__name__, credmanager_name),
                                       method_name='execute')
            try:
                # check credential
                mainLog.debug('check credential')
                isValid = exeCore.check_credential()
                if isValid:
                    mainLog.debug('valid')
                elif not isValid:
                    # renew it if necessary
                    mainLog.debug('invalid')
                    mainLog.debug('renew credential')
                    tmpStat, tmpOut = exeCore.renew_credential()
                    if not tmpStat:
                        mainLog.error('failed : {0}'.format(tmpOut))
                        continue
            except Exception:
                core_utils.dump_error_message(mainLog)
            mainLog.debug('done')

    # monit main
    def execute_monit(self):
        self.update_cores_from_queue_config()
        metrics = {}
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue
            # make logger
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile, exeCore.outCertFile)
            subLog = self.make_logger(_logger,
                                      '{0} {1}'.format(exeCore.__class__.__name__, credmanager_name),
                                      method_name='execute_monit')
            try:
                # check credential
                subLog.debug('check credential lifetime')
                lifetime = exeCore.check_credential_lifetime()
                if lifetime is not None:
                    metrics[exeCore.outCertFile] = lifetime
            except Exception:
                core_utils.dump_error_message(subLog)
            subLog.debug('done')
        return metrics
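# get_cores_from_harvester_config() above reads pc['module'], pc['name'] and
# pc['configs'] (a mapping of setup_name to extra plugin parameters), so a
# credmanager.pluginConfigs entry would presumably be shaped like this sketch;
# the module/class names and paths are assumptions, not harvester-mandated values.
pluginConfigs = [
    {
        'module': 'pandaharvester.harvestercredmanager.no_voms_cred_manager',  # assumed
        'name': 'NoVomsCredManager',                                           # assumed
        'configs': {
            'main_proxy': {                    # setup_name
                'inCertFile': '/path/to/in.pem',
                'outCertFile': '/path/to/out.pem',
                'voms': 'atlas',
            },
        },
    },
]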
# agent class to kill and clean up workers
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method
                        tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                                tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method
                        n_killed = 0
                        for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                                workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                    mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending}
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated
                    mainLog.debug('making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug('made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                            tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                            tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(
                                mc_tmpStat, mc_tmpOut))
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
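# The agent above first tries a bulk kill_workers() and falls back to the
# per-worker kill_worker() on AttributeError, then calls sweep_worker() during
# cleanup. A minimal sketch of a plugin honoring that calling convention;
# DummySweeper is a made-up name, and real plugins would talk to a batch system.
from pandaharvester.harvestercore.plugin_base import PluginBase

class DummySweeper(PluginBase):
    # kill a single worker; returns (status, diagnostic message)
    def kill_worker(self, workspec):
        return True, ''

    # optional bulk API; the agent falls back to kill_worker() if this is absent
    def kill_workers(self, workspec_list):
        return [self.kill_worker(w) for w in workspec_list]

    # clean up leftovers of a terminated worker
    def sweep_worker(self, workspec):
        return True, ''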
# optional command-line overrides (kept for reference)
#if len(sys.argv) > 2:
#    begin_job_id = int(sys.argv[2])
#if len(sys.argv) > 3:
#    end_job_id = int(sys.argv[3])
#if len(sys.argv) > 4:
#    globus_sleep_time = int(sys.argv[4])

queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_preparator = queueConfig.preparator
queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator'
queueConfig.preparator['name'] = 'GlobusBulkPreparator'
modified_queueConfig_preparator = queueConfig.preparator

pluginFactory = PluginFactory()
# get preparator plugin
preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)

# logger
_logger = core_utils.setup_logger('further_testing_go_bulk_preparator')
tmpLog = core_utils.make_logger(_logger, method_name='further_testing_go_bulk_preparator')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
    #print "loggerName - {}".format(loggerName)
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
# agent class to feed events to workers
class EventFeeder(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'eventfeeder-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(
                harvester_config.eventfeeder.maxWorkers,
                harvester_config.eventfeeder.lockInterval,
                lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents:
                    scattered = True
                else:
                    scattered = False
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped since locked by another')
                        continue
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams, scattered)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped before feeding since locked by another')
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # dump
                    for pandaID, eventList in iteritems(events):
                        try:
                            nRanges = workSpec.eventsRequestParams[pandaID]['nRanges']
                        except Exception:
                            nRanges = None
                        tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(
                            len(eventList), pandaID, nRanges))
                        # disable multi workers
                        if workSpec.mapType == WorkSpec.MT_MultiWorkers:
                            if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges):
                                tmpStat = self.dbProxy.disable_multi_workers(pandaID)
                                if tmpStat == 1:
                                    tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID)
                                    tmpLog.debug(tmpStr)
                    # update worker
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    workSpec.eventFeedLock = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy})
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
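# The dump loop above indexes workSpec.eventsRequestParams by PandaID and reads
# 'nRanges', so the request parameters presumably have a shape along these lines;
# this is an illustrative sketch, and keys other than 'nRanges' follow the PanDA
# event-range request conventions rather than anything defined in this file.
eventsRequestParams = {
    3487584273: {               # PandaID
        'pandaID': 3487584273,
        'taskID': 11364822,
        'nRanges': 10,          # number of event ranges requested
    },
}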
class Submitter(AgentBase): # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.workerMaker = WorkerMaker() self.workerAdjuster = WorkerAdjuster(queue_config_mapper) self.pluginFactory = PluginFactory() self.monitor_fifo = MonitorFIFO() self.apfmon = Apfmon(self.queueConfigMapper) # main loop def run(self): lockedBy = 'submitter-{0}'.format(self.get_pid()) monitor_fifo = self.monitor_fifo queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval', harvester_config.submitter.lockInterval) while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime, harvester_config.submitter.lockInterval, lockedBy, queueLockInterval) submitted = False if siteName is not None: mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName)) # get commands comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr) mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr)) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): # if available, overwrite new worker value with the command from panda server if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: n_workers_per_queue_and_rt = dict() else: n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName) if n_workers_per_queue_and_rt is None: mainLog.error('WorkerAdjuster failed to define the number of workers') elif len(n_workers_per_queue_and_rt) == 0: pass else: # loop over all queues and resource types for queueName in n_workers_per_queue_and_rt: for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]): tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy, queueName, resource_type), method_name='run') try: tmpLog.debug('start') tmpLog.debug('workers status: %s' % tmpVal) nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue(queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug('skipped since no new worker is needed based on current stats') continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) workerMakerCore = self.workerMaker.get_plugin(queueConfig) # check if resource is ready if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: numReadyResources = self.workerMaker.num_ready_resources(queueConfig, resource_type, workerMakerCore) tmpLog.debug('numReadyResources: %s' % numReadyResources) if not numReadyResources: if hasattr(workerMakerCore, 'staticWorkers'): nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning'] tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' % (workerMakerCore.staticWorkers, nQRWorkers)) if nQRWorkers >= workerMakerCore.staticWorkers: tmpLog.debug('No 
more static workers left, skip') continue else: nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) tmpLog.debug('staticWorkers: %s, nWorkers: %s' % (workerMakerCore.staticWorkers, nWorkers)) else: tmpLog.debug('skip since no resources are ready') continue else: nWorkers = min(nWorkers, numReadyResources) # check the skip-on-fail option of the worker maker if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: skipOnFail = True else: skipOnFail = False # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig, nWorkers, resource_type, maker=workerMakerCore) tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig, nWorkers, resource_type, maker=workerMakerCore) maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( queueConfig, resource_type, maker=workerMakerCore) maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( queueConfig, resource_type, maker=workerMakerCore) tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, max_workers_per_job_in_total=maxWorkersPerJob, max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) else: tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format(len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig, nReady, resource_type, maker=workerMakerCore) if len(ngChunks) == 0: tmpLog.debug('successfully made {0} workers'.format(len(okChunks))) else: tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() pandaIDs = set() # NG (=not good) for ngJobs in ngChunks: for jobSpec in ngJobs: if skipOnFail: # release jobs when workers are not made pandaIDs.add(jobSpec.PandaID) else: jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_make' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None errStr = 'failed to make a worker' jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': 'prepared'}) # OK workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == 
WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [None, 0]: workSpec.set_jobspec_list(okJobs) else: # refill free slots during the worker is running workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill]) workSpec.nJobsToReFill = None for jobSpec in okJobs[workSpec.nJobsToReFill:]: pandaIDs.add(jobSpec.PandaID) workSpec.set_num_jobs_with_list() # map type workSpec.mapType = queueConfig.mapType # queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger['accessPoint'] # sync level workSpec.syncLevel = queueConfig.get_synchronization_level() # events if len(okJobs) > 0 and \ ('eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: sw = core_utils.get_stopwatch() # get plugin for submitter submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found'.format(jobSpec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin(queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found'.format(jobSpec.computingSite)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}'.format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}'.format(workSpec.workerID, tmpStat)) # insert workers self.dbProxy.insert_workers(workSpecList, lockedBy) # submit sw.reset() tmpLog.info('submitting {0} workers'.format(len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, workSpecList) tmpLog.debug('done submitting {0} workers'.format(len(workSpecList)) + sw.get_elapsed_time()) # collect successful jobs okPandaIDs = set() for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): if tmpRet: workSpec, jobList = okChunks[iWorker] jobList = workSpec.get_jobspec_list() if jobList is not None: for jobSpec in jobList: okPandaIDs.add(jobSpec.PandaID) # loop over all workers for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # set harvesterHost workSpec.harvesterHost = socket.gethostname() # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission errStr = 'failed to submit a workerID={0} with {1}'.format( workSpec.workerID, tmpStr) tmpLog.error(errStr) workSpec.set_status(WorkSpec.ST_missed) workSpec.set_dialog_message(tmpStr) workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) if jobList is not None: # increment attempt number newJobList = [] for jobSpec in jobList: # skip if successful with another worker if jobSpec.PandaID in okPandaIDs: continue if jobSpec.submissionAttempts is None: jobSpec.submissionAttempts = 0 jobSpec.submissionAttempts += 1 # max attempt or permanent error if tmpRet is False or \ jobSpec.submissionAttempts >= \ queueConfig.maxSubmissionAttempts: newJobList.append(jobSpec) else: self.dbProxy.increment_submission_attempt( jobSpec.PandaID, jobSpec.submissionAttempts) jobList = newJobList elif queueConfig.useJobLateBinding and 
workSpec.hasJob == 1: # directly go to running after feeding jobs for late binding workSpec.set_status(WorkSpec.ST_running) else: # normal successful submission workSpec.set_status(WorkSpec.ST_submitted) workSpec.submitTime = timeNow workSpec.modificationTime = timeNow workSpec.checkTime = timeNow if self.monitor_fifo.enabled: workSpec.set_work_params({'lastCheckAt': timeNow_timestamp}) # prefetch events if tmpRet and workSpec.hasJob == 1 and \ workSpec.eventsRequest == WorkSpec.EV_useEvents and \ queueConfig.prefetchEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for jobSpec in jobList: eventsRequestParams[jobSpec.PandaID] = \ {'pandaID': jobSpec.PandaID, 'taskID': jobSpec.taskID, 'jobsetID': jobSpec.jobParams['jobsetID'], 'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))), jobSpec.jobParams['coreCount']), } workSpec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy) if jobList is not None: for jobSpec in jobList: pandaIDs.add(jobSpec.PandaID) if tmpStat: if tmpRet: tmpStr = \ 'submitted a workerID={0} for PandaID={1} with batchID={2}' tmpLog.info(tmpStr.format(workSpec.workerID, jobSpec.PandaID, workSpec.batchID)) else: tmpStr = 'failed to submit a workerID={0} for PandaID={1}' tmpLog.error(tmpStr.format(workSpec.workerID, jobSpec.PandaID)) else: tmpStr = \ 'failed to register a worker for PandaID={0} with batchID={1}' tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID)) # enqueue to monitor fifo if self.monitor_fifo.enabled \ and queueConfig.mapType != WorkSpec.MT_MultiWorkers: workSpecsToEnqueue = \ [[w] for w in workSpecList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] check_delay = min( getattr(harvester_config.monitor, 'eventBasedCheckInterval', harvester_config.monitor.checkInterval), getattr(harvester_config.monitor, 'fifoCheckInterval', harvester_config.monitor.checkInterval)) monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay) mainLog.debug('put workers to monitor FIFO') submitted = True # release jobs self.dbProxy.release_jobs(pandaIDs, lockedBy) tmpLog.info('done') except Exception: core_utils.dump_error_message(tmpLog) # release the site self.dbProxy.release_site(siteName, lockedBy) if sw_main.get_elapsed_time_in_sec() > queueLockInterval: mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval) + sw_main.get_elapsed_time()) mainLog.debug('done') # define sleep interval if siteName is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval) self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName) # time the cycle mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(sleepTime): mainLog.debug('terminated') return # wrapper for submitWorkers to skip ready workers def submit_workers(self, submitter_core, workspec_list): retList = [] strList = [] newSpecList = [] workersToSubmit = [] for workSpec in workspec_list: if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: newSpecList.append(workSpec) retList.append(True) strList.append('') else: workersToSubmit.append(workSpec) tmpRetList = 
submitter_core.submit_workers(workersToSubmit) # register the workers with the APFMon monitoring self.apfmon.create_workers(workersToSubmit) for tmpRet, tmpStr in tmpRetList: retList.append(tmpRet) strList.append(tmpStr) newSpecList += workersToSubmit return newSpecList, retList, strList
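# A toy illustration of the submit_workers wrapper contract above: workers
# already in ready/running state bypass the plugin and are reported as
# successful with an empty dialog, while the rest go to the submitter
# plugin in one batch. DummyCore is a hypothetical stand-in for a submitter
# plugin, not a harvester class.
class DummyCore(object):
    def submit_workers(self, workers):
        return [(True, 'submitted {0}'.format(w)) for w in workers]

def submit_skipping_ready(core, workers, ready):
    results, to_submit = [], []
    for w in workers:
        if w in ready:
            results.append((w, True, ''))  # skipped, counted as success
        else:
            to_submit.append(w)
    for w, (ok, msg) in zip(to_submit, core.submit_workers(to_submit)):
        results.append((w, ok, msg))
    return results

print(submit_skipping_ready(DummyCore(), ['w1', 'w2', 'w3'], ready={'w2'}))
# [('w2', True, ''), ('w1', True, 'submitted w1'), ('w3', True, 'submitted w3')]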
from future.utils import iteritems from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_factory import PluginFactory # Define a helper function - get list def get_list(data): if isinstance(data, list): return data else: return [data] pluginFactory = PluginFactory() # get the configuration details - from the harvester config file # get module and class names moduleNames = get_list(harvester_config.credmanager.moduleName) classNames = get_list(harvester_config.credmanager.className) # file names of original certificates if hasattr(harvester_config.credmanager, 'inCertFile'): inCertFiles = get_list(harvester_config.credmanager.inCertFile) else: inCertFiles = get_list(harvester_config.credmanager.certFile) # file names of certificates to be generated if hasattr(harvester_config.credmanager, 'outCertFile'): outCertFiles = get_list(harvester_config.credmanager.outCertFile) else:
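# A quick usage sketch of the get_list helper above: it normalizes config
# values that may be either a scalar or a list, so the later zip() over
# module/class/certificate lists can iterate uniformly.
assert get_list('proxy.pem') == ['proxy.pem']
assert get_list(['a.pem', 'b.pem']) == ['a.pem', 'b.pem']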
def __init__(self, queue_config_mapper): self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() self.dbProxy = DBProxy() self.throttlerMap = dict()
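# The throttlerMap initialized above acts as a per-queue lazy cache: the
# throttler plugin is instantiated on first use and reused on every later
# cycle (see the WorkerAdjuster below). A generic sketch of the same
# pattern; get_or_create is an illustrative name, not a harvester API.
def get_or_create(cache, key, factory):
    if key not in cache:
        cache[key] = factory()
    return cache[key]

plugins = {}
first = get_or_create(plugins, 'QUEUE_A', lambda: object())
assert get_or_create(plugins, 'QUEUE_A', lambda: object()) is first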
class WorkerAdjuster(object): # constructor def __init__(self, queue_config_mapper): self.queue_configMapper = queue_config_mapper self.pluginFactory = PluginFactory() self.dbProxy = DBProxy() self.throttlerMap = dict() self.apf_mon = Apfmon(self.queue_configMapper) try: self.maxNewWorkers = harvester_config.submitter.maxNewWorkers except AttributeError: self.maxNewWorkers = None # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name): tmp_log = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') tmp_log.debug('start') tmp_log.debug('static_num_workers: {0}'.format(static_num_workers)) dyn_num_workers = copy.deepcopy(static_num_workers) try: # get queue status queue_stat = self.dbProxy.get_cache("panda_queues.json", None) if queue_stat is None: queue_stat = dict() else: queue_stat = queue_stat.data # get job statistics job_stats = self.dbProxy.get_cache("job_statistics.json", None) if job_stats is None: job_stats = dict() else: job_stats = job_stats.data # define num of new workers for queue_name in static_num_workers: # get queue queue_config = self.queue_configMapper.get_queue(queue_name) worker_limits_dict = self.dbProxy.get_worker_limits(queue_name) max_workers = worker_limits_dict.get('maxWorkers', 0) n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0) n_queue_limit_per_rt = worker_limits_dict[ 'nQueueLimitWorkerPerRT'] n_queue_total, n_ready_total, n_running_total = 0, 0, 0 apf_msg = None apf_data = None for job_type, jt_values in iteritems( static_num_workers[queue_name]): for resource_type, tmp_val in iteritems(jt_values): tmp_log.debug( 'Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}' .format(queue_name, job_type, resource_type, tmp_val)) # set 0 to num of new workers when the queue is disabled if queue_name in queue_stat and queue_stat[queue_name][ 'status'] in [ 'offline', 'standby', 'maintenance' ]: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 since status={0}'.format( queue_stat[queue_name]['status']) tmp_log.debug(ret_msg) apf_msg = 'Not submitting workers since queue status = {0}'.format( queue_stat[queue_name]['status']) continue # protection against not-up-to-date queue config if queue_config is None: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 due to missing queue_config' tmp_log.debug(ret_msg) apf_msg = 'Not submitting workers because of missing queue_config' continue # get throttler if queue_name not in self.throttlerMap: if hasattr(queue_config, 'throttler'): throttler = self.pluginFactory.get_plugin( queue_config.throttler) else: throttler = None self.throttlerMap[queue_name] = throttler # check throttler throttler = self.throttlerMap[queue_name] if throttler is not None: to_throttle, tmp_msg = throttler.to_be_throttled( queue_config) if to_throttle: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 by {0}:{1}'.format( throttler.__class__.__name__, tmp_msg) tmp_log.debug(ret_msg) continue # check stats n_queue = tmp_val['nQueue'] n_ready = tmp_val['nReady'] n_running = tmp_val['nRunning'] if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None: n_queue_total += n_queue n_ready_total += n_ready n_running_total += n_running if queue_config.runMode == 'slave': n_new_workers_def = tmp_val['nNewWorkers'] if n_new_workers_def == 0: 
dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 by panda in slave mode' tmp_log.debug(ret_msg) continue else: n_new_workers_def = None # define num of new workers based on static site config n_new_workers = 0 if n_queue >= n_queue_limit_per_rt > 0: # enough queued workers ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format( n_queue, n_queue_limit_per_rt) tmp_log.debug(ret_msg) pass elif (n_queue + n_ready + n_running) >= max_workers > 0: # enough workers in the system ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format( n_queue, n_ready, n_running) ret_msg += '>= max_workers({0})'.format( max_workers) tmp_log.debug(ret_msg) pass else: max_queued_workers = None if n_queue_limit_per_rt > 0: # there is a limit set for the queue max_queued_workers = n_queue_limit_per_rt # Reset the maxQueueWorkers according to particular if n_new_workers_def is not None: # don't surpass limits given centrally maxQueuedWorkers_slave = n_new_workers_def + n_queue if max_queued_workers is not None: max_queued_workers = min( maxQueuedWorkers_slave, max_queued_workers) else: max_queued_workers = maxQueuedWorkers_slave elif queue_config.mapType == 'NoJob': # for pull mode, limit to activated jobs # limit the queue to the number of activated jobs to avoid empty pilots try: n_activated = max( job_stats[queue_name]['activated'], 1) # avoid no activity queues queue_limit = max_queued_workers max_queued_workers = min( n_activated, max_queued_workers) tmp_log.debug( 'limiting max_queued_workers to min(n_activated={0}, queue_limit={1})' .format(n_activated, queue_limit)) except KeyError: tmp_log.warning( 'n_activated not defined, defaulting to configured queue limits' ) pass if max_queued_workers is None: # no value found, use default value max_queued_workers = 1 # new workers n_new_workers = max(max_queued_workers - n_queue, 0) tmp_log.debug( 'setting n_new_workers to {0} in max_queued_workers calculation' .format(n_new_workers)) if max_workers > 0: n_new_workers = min( n_new_workers, max( max_workers - n_queue - n_ready - n_running, 0)) tmp_log.debug( 'setting n_new_workers to {0} to respect max_workers' .format(n_new_workers)) if queue_config.maxNewWorkersPerCycle > 0: n_new_workers = min( n_new_workers, queue_config.maxNewWorkersPerCycle) tmp_log.debug( 'setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle' .format(n_new_workers)) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: n_new_workers = min(n_new_workers, self.maxNewWorkers) tmp_log.debug( 'setting n_new_workers to {0} in order to respect universal maxNewWorkers' .format(n_new_workers)) dyn_num_workers[queue_name][job_type][resource_type][ 'nNewWorkers'] = n_new_workers # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers if queue_config is None: max_new_workers_per_cycle = 0 ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config' tmp_log.debug(ret_msg) else: max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle if len(dyn_num_workers[queue_name]) > 1: total_new_workers_rts = 0 for _jt in dyn_num_workers[queue_name]: for _rt in dyn_num_workers[queue_name][_jt]: if _jt != 'ANY' and _rt != 'ANY': total_new_workers_rts = total_new_workers_rts + dyn_num_workers[ queue_name][_jt][_rt]['nNewWorkers'] n_new_workers_max_agg = min( max(n_queue_limit - n_queue_total, 0), max( max_workers - n_queue_total - n_ready_total - 
n_running_total, 0)) if max_new_workers_per_cycle >= 0: n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: n_new_workers_max_agg = min(n_new_workers_max_agg, self.maxNewWorkers) # exceeded max, to adjust if total_new_workers_rts > n_new_workers_max_agg: if n_new_workers_max_agg == 0: for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[ queue_name][job_type]: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 tmp_log.debug( 'No n_new_workers since n_new_workers_max_agg=0 for UCORE' ) else: tmp_log.debug( 'n_new_workers_max_agg={0} for UCORE'.format( n_new_workers_max_agg)) _d = dyn_num_workers[queue_name].copy() del _d['ANY'] # TODO: needs to be recalculated simple_rt_nw_list = [] for job_type in _d: # jt: job type for resource_type in _d[ job_type]: # rt: resource type simple_rt_nw_list.append([ (resource_type, job_type), _d[job_type][resource_type].get( 'nNewWorkers', 0), 0 ]) _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: (resource_type, job_type), n_new_workers_orig, _r = _rt_list n_new_workers, remainder = divmod( n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) dyn_num_workers[queue_name][ job_type].setdefault( resource_type, { 'nReady': 0, 'nRunning': 0, 'nQueue': 0, 'nNewWorkers': 0 }) dyn_num_workers[queue_name][job_type][ resource_type][ 'nNewWorkers'] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) for ( resource_type, job_type ), n_new_workers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] += 1 _countdown -= 1 for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[queue_name][ job_type]: if job_type == 'ANY' or resource_type == 'ANY': continue n_new_workers = dyn_num_workers[queue_name][ job_type][resource_type]['nNewWorkers'] tmp_log.debug( 'setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE' .format(n_new_workers, job_type, resource_type)) if not apf_msg: apf_data = copy.deepcopy(dyn_num_workers[queue_name]) self.apf_mon.update_label(queue_name, apf_msg, apf_data) # dump tmp_log.debug('defined {0}'.format(str(dyn_num_workers))) return dyn_num_workers except Exception: # dump error err_msg = core_utils.dump_error_message(tmp_log) return None
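# The UCORE adjustment above is a largest-remainder apportionment: each
# (job_type, resource_type) slot first receives floor(n_orig * cap / total),
# and the workers lost to rounding are handed out one by one to the slots
# with the largest division remainders. A self-contained sketch of the same
# arithmetic with illustrative names:
def apportion(requests, cap):
    total = sum(requests.values())
    if total <= cap:
        return dict(requests)
    scaled, remainders = {}, []
    for key, n in requests.items():
        q, r = divmod(n * cap, total)  # integer quotient and remainder
        scaled[key] = q
        remainders.append((r, key))
    leftover = cap - sum(scaled.values())
    # hand the leftover slots to the largest remainders first
    for _, key in sorted(remainders, reverse=True)[:leftover]:
        scaled[key] += 1
    return scaled

print(apportion({'SCORE': 5, 'MCORE': 3, 'SCORE_HIMEM': 2}, cap=7))
# {'SCORE': 4, 'MCORE': 2, 'SCORE_HIMEM': 1}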
class JobFetcher(AgentBase): # constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.dbProxy = DBProxy() self.communicator = communicator self.nodeName = socket.gethostname() self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() # main loop def run(self): while True: mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run') mainLog.debug('getting number of jobs to be fetched') # get number of jobs to be fetched nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch( harvester_config.jobfetcher.nQueues, harvester_config.jobfetcher.lookupTime) mainLog.debug('got {0} queues'.format(len(nJobsPerQueue))) # loop over all queues for queueName, nJobs in iteritems(nJobsPerQueue): # check queue if not self.queueConfigMapper.has_queue(queueName): continue tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), method_name='run') # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) # upper limit if nJobs > harvester_config.jobfetcher.maxJobs: nJobs = harvester_config.jobfetcher.maxJobs # get jobs default_prodSourceLabel = queueConfig.get_source_label() pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {}) choice_list = core_utils.make_choice_list( pdpm=pdpm, default=default_prodSourceLabel) prodSourceLabel = random.choice(choice_list) tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format( nJobs, prodSourceLabel)) sw = core_utils.get_stopwatch() siteName = queueConfig.siteName jobs, errStr = self.communicator.get_jobs( siteName, self.nodeName, prodSourceLabel, self.nodeName, nJobs, queueConfig.getJobCriteria) tmpLog.info('got {0} jobs with {1} {2}'.format( len(jobs), errStr, sw.get_elapsed_time())) # convert to JobSpec if len(jobs) > 0: # get extractor plugin if hasattr(queueConfig, 'extractor'): extractorCore = self.pluginFactory.get_plugin( queueConfig.extractor) else: extractorCore = None jobSpecs = [] fileStatMap = dict() sw_startconvert = core_utils.get_stopwatch() for job in jobs: timeNow = datetime.datetime.utcnow() jobSpec = JobSpec() jobSpec.convert_job_json(job) jobSpec.computingSite = queueName jobSpec.status = 'starting' jobSpec.subStatus = 'fetched' jobSpec.creationTime = timeNow jobSpec.stateChangeTime = timeNow jobSpec.configID = queueConfig.configID jobSpec.set_one_attribute( 'schedulerID', 'harvester-{0}'.format( harvester_config.master.harvester_id)) if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None: jobSpec.zipPerMB = queueConfig.zipPerMB fileGroupDictList = [ jobSpec.get_input_file_attributes() ] if extractorCore is not None: fileGroupDictList.append( extractorCore.get_aux_inputs(jobSpec)) for fileGroupDict in fileGroupDictList: for tmpLFN, fileAttrs in iteritems(fileGroupDict): # check file status if tmpLFN not in fileStatMap: fileStatMap[ tmpLFN] = self.dbProxy.get_file_status( tmpLFN, 'input', queueConfig.ddmEndpointIn, 'starting') # make file spec fileSpec = FileSpec() fileSpec.PandaID = jobSpec.PandaID fileSpec.taskID = jobSpec.taskID fileSpec.lfn = tmpLFN fileSpec.endpoint = queueConfig.ddmEndpointIn fileSpec.scope = fileAttrs['scope'] # set preparing to skip stage-in if the file is (being) taken care of by another job if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \ or 'to_prepare' in fileStatMap[tmpLFN]: fileSpec.status = 'preparing' else: fileSpec.status = 'to_prepare' if fileSpec.status not in fileStatMap[tmpLFN]: fileStatMap[tmpLFN][fileSpec.status] 
= 0 fileStatMap[tmpLFN][fileSpec.status] += 1 if 'INTERNAL_FileType' in fileAttrs: fileSpec.fileType = fileAttrs[ 'INTERNAL_FileType'] jobSpec.auxInput = JobSpec.AUX_hasAuxInput else: fileSpec.fileType = 'input' if 'INTERNAL_URL' in fileAttrs: fileSpec.url = fileAttrs['INTERNAL_URL'] jobSpec.add_in_file(fileSpec) jobSpec.trigger_propagation() jobSpecs.append(jobSpec) # insert to DB tmpLog.debug("Converting of {0} jobs {1}".format( len(jobs), sw_startconvert.get_elapsed_time())) sw_insertdb = core_utils.get_stopwatch() self.dbProxy.insert_jobs(jobSpecs) tmpLog.debug('Insert of {0} jobs {1}'.format( len(jobSpecs), sw_insertdb.get_elapsed_time())) mainLog.debug('done') # check if being terminated if self.terminated(harvester_config.jobfetcher.sleepTime): mainLog.debug('terminated') return
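# The fileStatMap above caches per-LFN status lookups so a file shared by
# several fetched jobs costs only one DB query; later jobs see the cached
# counts and mark their copy 'preparing' when another job is already (or
# was already) taking care of stage-in. A self-contained sketch of that
# dedup logic; lookup_status is an illustrative stand-in for the DB call.
def choose_file_status(lfn, stat_cache, lookup_status):
    if lfn not in stat_cache:
        stat_cache[lfn] = lookup_status(lfn)  # one DB hit per LFN
    counts = stat_cache[lfn]
    if any(s in counts for s in ('ready', 'preparing', 'to_prepare')):
        status = 'preparing'   # skip stage-in, someone else handles it
    else:
        status = 'to_prepare'  # this job triggers stage-in
    counts[status] = counts.get(status, 0) + 1
    return status

cache = {}
print(choose_file_status('EVNT._001.pool.root', cache, lambda lfn: {}))  # to_prepare
print(choose_file_status('EVNT._001.pool.root', cache, lambda lfn: {}))  # preparing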
assFileSpec.path = os.getcwd() + '/' + assFileSpec.lfn oFile = open(assFileSpec.lfn, 'w') oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) oFile.close() fileSpec.add_associated_file(assFileSpec) jobSpec = JobSpec() jobSpec.jobParams = {'outFiles': fileSpec.lfn + ',log', 'scopeOut': 'panda', 'scopeLog': 'panda', 'logFile': 'log', 'realDatasets': 'panda.' + fileSpec.lfn, 'ddmEndPointOut': 'BNL-OSG2_DATADISK', } jobSpec.add_out_file(fileSpec) pluginFactory = PluginFactory() # get stage-out plugin stagerCore = pluginFactory.get_plugin(queueConfig.stager) print ("plugin={0}".format(stagerCore.__class__.__name__)) print ("testing zip") tmpStat, tmpOut = stagerCore.zip_output(jobSpec) if tmpStat: print (" OK") else: print (" NG {0}".format(tmpOut)) print () print ("testing stage-out")
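# The dummy output file above is filled character by character with
# random.choice, which gets slow for realistic fsize values. A bytes-based
# sketch that produces an equivalent random payload in one call
# (write_dummy_file is an illustrative helper, not part of the test):
import os

def write_dummy_file(path, size_bytes):
    # os.urandom avoids the per-character Python loop for large test files
    with open(path, 'wb') as f:
        f.write(os.urandom(size_bytes))

write_dummy_file('dummy.out', 1024)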
class Monitor(AgentBase): # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.pluginFactory = PluginFactory() self.startTimestamp = time.time() self.monitor_fifo = MonitorFIFO() self.apfmon = Apfmon(self.queueConfigMapper) # main loop def run(self): lockedBy = 'monitor-{0}'.format(self.get_pid()) # init messengers for queueConfig in self.queueConfigMapper.get_all_queues().values(): # just import for module initialization self.pluginFactory.get_plugin(queueConfig.messenger) # main try: fifoSleepTimeMilli = harvester_config.monitor.fifoSleepTimeMilli except AttributeError: fifoSleepTimeMilli = 5000 try: fifoCheckDuration = harvester_config.monitor.fifoCheckDuration except AttributeError: fifoCheckDuration = 30 try: fifoMaxWorkersPerChunk = harvester_config.monitor.fifoMaxWorkersPerChunk except AttributeError: fifoMaxWorkersPerChunk = 500 try: fifoProtectiveDequeue = harvester_config.monitor.fifoProtectiveDequeue except AttributeError: fifoProtectiveDequeue = True last_DB_cycle_timestamp = 0 monitor_fifo = self.monitor_fifo sleepTime = (fifoSleepTimeMilli / 1000.0) \ if monitor_fifo.enabled else harvester_config.monitor.sleepTime adjusted_sleepTime = sleepTime if monitor_fifo.enabled: monitor_fifo.restore() while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('start a monitor cycle') if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \ not (monitor_fifo.enabled and self.singleMode): # run with workers from DB sw_db = core_utils.get_stopwatch() mainLog.debug('starting run with DB') mainLog.debug('getting workers to monitor') workSpecsPerQueue = self.dbProxy.get_workers_to_update( harvester_config.monitor.maxWorkers, harvester_config.monitor.checkInterval, harvester_config.monitor.lockInterval, lockedBy) mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) # loop over all workers for queueName, configIdWorkSpecs in iteritems( workSpecsPerQueue): for configID, workSpecsList in iteritems( configIdWorkSpecs): retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID) if monitor_fifo.enabled and retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: mainLog.debug('putting workers to FIFO') try: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}'. 
format(errStr)) if workSpecsToEnqueueToHead: mainLog.debug('putting workers to FIFO head') try: score = fifoCheckInterval - timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}' .format(errStr)) last_DB_cycle_timestamp = time.time() if sw_db.get_elapsed_time_in_sec( ) > harvester_config.monitor.lockInterval: mainLog.warning( 'a single DB cycle was longer than lockInterval ' + sw_db.get_elapsed_time()) else: mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time()) mainLog.debug('ended run with DB') elif monitor_fifo.enabled: # run with workers from FIFO sw = core_utils.get_stopwatch() n_loops = 0 n_loops_hit = 0 last_fifo_cycle_timestamp = time.time() to_break = False obj_dequeued_id_list = [] obj_to_enqueue_dict = collections.defaultdict( lambda: [[], 0, 0]) obj_to_enqueue_to_head_dict = collections.defaultdict( lambda: [[], 0, 0]) remaining_obj_to_enqueue_dict = {} remaining_obj_to_enqueue_to_head_dict = {} n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0 while time.time( ) < last_fifo_cycle_timestamp + fifoCheckDuration: sw.reset() n_loops += 1 retVal, overhead_time = monitor_fifo.to_check_workers() if overhead_time is not None: n_chunk_peeked_stat += 1 sum_overhead_time_stat += overhead_time if retVal: # check fifo size fifo_size = monitor_fifo.size() mainLog.debug('FIFO size is {0}'.format(fifo_size)) mainLog.debug('starting run with FIFO') try: obj_gotten = monitor_fifo.get( timeout=1, protective=fifoProtectiveDequeue) except Exception as errStr: mainLog.error( 'failed to get object from FIFO: {0}'.format( errStr)) else: if obj_gotten is not None: sw_fifo = core_utils.get_stopwatch() if fifoProtectiveDequeue: obj_dequeued_id_list.append(obj_gotten.id) queueName, workSpecsList = obj_gotten.item mainLog.debug( 'got a chunk of {0} workers of {1} from FIFO' .format(len(workSpecsList), queueName) + sw.get_elapsed_time()) sw.reset() configID = None for workSpecs in workSpecsList: if configID is None and len(workSpecs) > 0: configID = workSpecs[0].configID for workSpec in workSpecs: if workSpec.pandaid_list is None: _jobspec_list = workSpec.get_jobspec_list( ) if _jobspec_list is not None: workSpec.pandaid_list = [ j.PandaID for j in workSpec. 
get_jobspec_list() ] else: workSpec.pandaid_list = [] workSpec.force_update( 'pandaid_list') retVal = self.monitor_agent_core( lockedBy, queueName, workSpecsList, from_fifo=True, config_id=configID) if retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal try: if len(obj_to_enqueue_dict[queueName] [0]) + len( workSpecsToEnqueue ) <= fifoMaxWorkersPerChunk: obj_to_enqueue_dict[queueName][ 0].extend(workSpecsToEnqueue) obj_to_enqueue_dict[queueName][ 1] = max( obj_to_enqueue_dict[ queueName][1], timeNow_timestamp) obj_to_enqueue_dict[queueName][ 2] = max( obj_to_enqueue_dict[ queueName][2], fifoCheckInterval) else: to_break = True remaining_obj_to_enqueue_dict[ queueName] = [ workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval ] except Exception as errStr: mainLog.error( 'failed to gather workers for FIFO: {0}' .format(errStr)) to_break = True try: if len(obj_to_enqueue_to_head_dict[ queueName][0]) + len( workSpecsToEnqueueToHead ) <= fifoMaxWorkersPerChunk: obj_to_enqueue_to_head_dict[ queueName][0].extend( workSpecsToEnqueueToHead) obj_to_enqueue_to_head_dict[ queueName][1] = max( obj_to_enqueue_to_head_dict[ queueName][1], timeNow_timestamp) obj_to_enqueue_to_head_dict[ queueName][2] = max( obj_to_enqueue_to_head_dict[ queueName][2], fifoCheckInterval) else: to_break = True remaining_obj_to_enqueue_to_head_dict[ queueName] = [ workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval ] except Exception as errStr: mainLog.error( 'failed to gather workers for FIFO head: {0}' .format(errStr)) to_break = True mainLog.debug( 'checked {0} workers from FIFO'.format( len(workSpecsList)) + sw.get_elapsed_time()) else: mainLog.debug( 'monitor_agent_core returned None. Skipped putting to FIFO' ) if sw_fifo.get_elapsed_time_in_sec( ) > harvester_config.monitor.lockInterval: mainLog.warning( 'a single FIFO cycle was longer than lockInterval ' + sw_fifo.get_elapsed_time()) else: mainLog.debug('done a FIFO cycle' + sw_fifo.get_elapsed_time()) n_loops_hit += 1 if to_break: break else: mainLog.debug('got nothing in FIFO') else: mainLog.debug( 'workers in FIFO too young to check. 
Skipped') if self.singleMode: break if overhead_time is not None: time.sleep( max(-overhead_time * random.uniform(0.1, 1), adjusted_sleepTime)) else: time.sleep( max(fifoCheckDuration * random.uniform(0.1, 1), adjusted_sleepTime)) mainLog.debug( 'run {0} loops, including {1} FIFO cycles'.format( n_loops, n_loops_hit)) # enqueue to fifo sw.reset() n_chunk_put = 0 mainLog.debug('putting worker chunks to FIFO') for _dct in (obj_to_enqueue_dict, remaining_obj_to_enqueue_dict): for queueName, obj_to_enqueue in iteritems(_dct): try: workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue if workSpecsToEnqueue: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) n_chunk_put += 1 mainLog.info( 'put a chunk of {0} workers of {1} to FIFO with score {2}' .format(len(workSpecsToEnqueue), queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}'.format( errStr)) mainLog.debug('putting worker chunks to FIFO head') for _dct in (obj_to_enqueue_to_head_dict, remaining_obj_to_enqueue_to_head_dict): for queueName, obj_to_enqueue_to_head in iteritems(_dct): try: workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head if workSpecsToEnqueueToHead: score = fifoCheckInterval + timeNow_timestamp - 2**32 monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) n_chunk_put += 1 mainLog.info( 'put a chunk of {0} workers of {1} to FIFO with score {2}' .format(len(workSpecsToEnqueueToHead), queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}'. format(errStr)) # release protective dequeued objects if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0: monitor_fifo.release(ids=obj_dequeued_id_list) mainLog.debug( 'put {0} worker chunks into FIFO'.format(n_chunk_put) + sw.get_elapsed_time()) # adjust adjusted_sleepTime if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime: speedup_factor = (sum_overhead_time_stat - sleepTime) / ( n_chunk_peeked_stat * harvester_config.monitor.checkInterval) speedup_factor = max(speedup_factor, 0) adjusted_sleepTime = adjusted_sleepTime / (1. 
+ speedup_factor) elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0: adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2 mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format( adjusted_sleepTime)) # end run with fifo mainLog.debug('ended run with FIFO') # time the cycle mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(adjusted_sleepTime): mainLog.debug('terminated') return # core of monitor agent to check workers in workSpecsList of queueName def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False, config_id=None): tmpQueLog = self.make_logger(_logger, 'id={0} queue={1}'.format( lockedBy, queueName), method_name='run') # check queue if not self.queueConfigMapper.has_queue(queueName, config_id): tmpQueLog.error('config not found') return None # get queue queueConfig = self.queueConfigMapper.get_queue(queueName, config_id) try: apfmon_status_updates = self.queueConfigMapper.queueConfig[ queueName].monitor['apfmon_status_updates'] except Exception: apfmon_status_updates = False tmpQueLog.debug( 'apfmon_status_updates: {0}'.format(apfmon_status_updates)) # get plugins monCore = self.pluginFactory.get_plugin(queueConfig.monitor) messenger = self.pluginFactory.get_plugin(queueConfig.messenger) # workspec chunk of active workers workSpecsToEnqueue_dict = {} workSpecsToEnqueueToHead_dict = {} timeNow_timestamp = time.time() # get fifoCheckInterval for PQ and other fifo attributes try: fifoCheckInterval = monCore.fifoCheckInterval except Exception: if hasattr(harvester_config.monitor, 'fifoCheckInterval'): fifoCheckInterval = harvester_config.monitor.fifoCheckInterval else: fifoCheckInterval = harvester_config.monitor.checkInterval try: forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval except AttributeError: forceEnqueueInterval = 3600 try: fifoMaxPreemptInterval = harvester_config.monitor.fifoMaxPreemptInterval except AttributeError: fifoMaxPreemptInterval = 60 # check workers allWorkers = [item for sublist in workSpecsList for item in sublist] tmpQueLog.debug('checking {0} workers'.format(len(allWorkers))) tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog, from_fifo) if tmpStat: # loop over all worker chunks tmpQueLog.debug('update jobs and workers') iWorker = 0 for workSpecs in workSpecsList: jobSpecs = None pandaIDsList = [] eventsToUpdateList = [] filesToStageOutList = dict() isCheckedList = [] mapType = workSpecs[0].mapType # loop over workSpecs for workSpec in workSpecs: tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format( lockedBy, workSpec.workerID), method_name='run') tmpOut = tmpRetMap[workSpec.workerID] oldStatus = tmpOut['oldStatus'] newStatus = tmpOut['newStatus'] monStatus = tmpOut['monStatus'] diagMessage = tmpOut['diagMessage'] workAttributes = tmpOut['workAttributes'] eventsToUpdate = tmpOut['eventsToUpdate'] filesToStageOut = tmpOut['filesToStageOut'] eventsRequestParams = tmpOut['eventsRequestParams'] nJobsToReFill = tmpOut['nJobsToReFill'] pandaIDs = tmpOut['pandaIDs'] isChecked = tmpOut['isChecked'] tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} ' tmpStr += 'postProcessed={3} files={4}' tmpLog.debug( tmpStr.format(newStatus, monStatus, diagMessage, workSpec.is_post_processed(), str(filesToStageOut))) iWorker += 1 # check status if newStatus not in WorkSpec.ST_LIST: tmpLog.error('unknown status={0}'.format(newStatus)) return # update worker workSpec.set_status(newStatus) 
workSpec.set_work_attributes(workAttributes) workSpec.set_dialog_message(diagMessage) if isChecked: workSpec.checkTime = datetime.datetime.utcnow() isCheckedList.append(isChecked) if monStatus == WorkSpec.ST_failed: if not workSpec.has_pilot_error(): workSpec.set_pilot_error( PilotErrors.ERR_GENERALERROR, diagMessage) elif monStatus == WorkSpec.ST_cancelled: if not workSpec.has_pilot_error(): workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL, diagMessage) if monStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: workSpec.set_work_params({'finalMonStatus': monStatus}) # request events if eventsRequestParams != {}: workSpec.eventsRequest = WorkSpec.EV_requestEvents workSpec.eventsRequestParams = eventsRequestParams # jobs to refill if nJobsToReFill is not None: workSpec.nJobsToReFill = nJobsToReFill # get associated jobs for the worker chunk if workSpec.hasJob == 1 and jobSpecs is None: jobSpecs = self.dbProxy.get_jobs_with_worker_id( workSpec.workerID, None, only_running=True, slim=True) # pandaIDs for push pandaIDsList.append(pandaIDs) if len(eventsToUpdate) > 0: eventsToUpdateList.append(eventsToUpdate) if len(filesToStageOut) > 0: filesToStageOutList[ workSpec.workerID] = filesToStageOut # apfmon status update if apfmon_status_updates and newStatus != oldStatus: tmpQueLog.debug( 'apfmon_status_updates: {0} newStatus: {1} monStatus: {2} oldStatus: {3} workSpecStatus: {4}' .format(apfmon_status_updates, newStatus, monStatus, oldStatus, workSpec.status)) self.apfmon.update_worker(workSpec, monStatus) # lock workers for fifo if from_fifo: # collect some attributes to be updated when workers are locked worker_id_list = dict() for workSpec, isChecked in zip(workSpecs, isCheckedList): attrs = dict() if isChecked: attrs['checkTime'] = workSpec.checkTime workSpec.force_not_update('checkTime') if workSpec.has_updated_attributes(): attrs['lockedBy'] = lockedBy workSpec.lockedBy = lockedBy workSpec.force_not_update('lockedBy') else: attrs['lockedBy'] = None worker_id_list[workSpec.workerID] = attrs temRetLockWorker = self.dbProxy.lock_workers( worker_id_list, harvester_config.monitor.lockInterval) # skip if not locked if not temRetLockWorker: continue # update jobs and workers if jobSpecs is not None and len(jobSpecs) > 0: tmpQueLog.debug( 'updating {0} jobs with {1} workers'.format( len(jobSpecs), len(workSpecs))) core_utils.update_job_attributes_with_workers( mapType, jobSpecs, workSpecs, filesToStageOutList, eventsToUpdateList) # update local database tmpRet = self.dbProxy.update_jobs_workers( jobSpecs, workSpecs, lockedBy, pandaIDsList) if not tmpRet: for workSpec in workSpecs: tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format( lockedBy, workSpec.workerID), method_name='run') if from_fifo: tmpLog.info( 'failed to update the DB. Maybe locked by other thread running with DB' ) else: if workSpec.status in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled, WorkSpec.ST_missed ]: tmpLog.info( 'worker already in final status. Skipped') else: tmpLog.error( 'failed to update the DB. 
lockInterval may be too short' ) else: if jobSpecs is not None: for jobSpec in jobSpecs: tmpLog = self.make_logger( _logger, 'id={0} PandaID={1}'.format( lockedBy, jobSpec.PandaID), method_name='run') tmpLog.debug( 'new status={0} subStatus={1} status_in_metadata={2}' .format( jobSpec.status, jobSpec.subStatus, jobSpec.get_job_status_from_attributes())) # send ACK to workers for events and files if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0: for workSpec in workSpecs: try: messenger.acknowledge_events_files(workSpec) except Exception: core_utils.dump_error_message(tmpQueLog) tmpQueLog.error( 'failed to send ACK to workerID={0}'.format( workSpec.workerID)) # active workers for fifo if self.monitor_fifo.enabled and workSpecs: workSpec = workSpecs[0] tmpOut = tmpRetMap[workSpec.workerID] newStatus = tmpOut['newStatus'] monStatus = tmpOut['monStatus'] if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] \ and workSpec.mapType != WorkSpec.MT_MultiWorkers \ and workSpec.workAttributes is not None: timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() # get lastCheckAt _bool, lastCheckAt = workSpec.get_work_params( 'lastCheckAt') try: last_check_period = timeNow_timestamp - lastCheckAt except TypeError: last_check_period = forceEnqueueInterval + 1.0 # get lastForceEnqueueAt _bool, lastForceEnqueueAt = workSpec.get_work_params( 'lastForceEnqueueAt') if not (_bool and lastForceEnqueueAt is not None): lastForceEnqueueAt = 0 # notification intolerable_delay = max( forceEnqueueInterval * 2, harvester_config.monitor.checkInterval * 4) if _bool and lastCheckAt is not None and last_check_period > harvester_config.monitor.checkInterval \ and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp: if last_check_period > intolerable_delay: tmpQueLog.error( 'last check period of workerID={0} is {1} sec, intolerably longer than monitor checkInterval. Will NOT enqueue worker by force. Please check why monitor checks worker slowly' .format(workSpec.workerID, last_check_period)) else: tmpQueLog.warning( 'last check period of workerID={0} is {1} sec, longer than monitor checkInterval' .format(workSpec.workerID, last_check_period)) # preparation to enqueue to fifo if (from_fifo) \ or (not from_fifo and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp and last_check_period > forceEnqueueInterval and last_check_period < intolerable_delay and timeNow_timestamp - lastForceEnqueueAt > 86400 + forceEnqueueInterval): if not from_fifo: # in DB cycle tmpQueLog.warning( 'last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force' .format(workSpec.workerID, last_check_period)) workSpec.set_work_params( {'lastForceEnqueueAt': timeNow_timestamp}) workSpec.set_work_params( {'lastCheckAt': timeNow_timestamp}) workSpec.lockedBy = None workSpec.force_update('lockedBy') if monStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: # for post-processing _bool, startFifoPreemptAt = workSpec.get_work_params( 'startFifoPreemptAt') if not _bool or startFifoPreemptAt is None: startFifoPreemptAt = timeNow_timestamp workSpec.set_work_params({ 'startFifoPreemptAt': startFifoPreemptAt }) tmpQueLog.debug( 'workerID={0}, startFifoPreemptAt: {1}'. 
format(workSpec.workerID, startFifoPreemptAt)) if timeNow_timestamp - startFifoPreemptAt < fifoMaxPreemptInterval: workSpecsToEnqueueToHead_dict[ workSpec.workerID] = workSpecs else: workSpec.set_work_params({ 'startFifoPreemptAt': timeNow_timestamp }) workSpec.modificationTime = timeNow workSpec.force_update('modificationTime') workSpecsToEnqueue_dict[ workSpec.workerID] = workSpecs else: workSpec.modificationTime = timeNow workSpec.force_update('modificationTime') workSpecsToEnqueue_dict[ workSpec.workerID] = workSpecs else: tmpQueLog.error('failed to check workers') workSpecsToEnqueue = list(workSpecsToEnqueue_dict.values()) workSpecsToEnqueueToHead = list(workSpecsToEnqueueToHead_dict.values()) retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval tmpQueLog.debug('done') return retVal # wrapper for checkWorkers def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, from_fifo): # check timeout value try: checkTimeout = mon_core.checkTimeout except Exception: try: checkTimeout = harvester_config.monitor.checkTimeout except Exception: checkTimeout = None try: workerQueueTimeLimit = harvester_config.monitor.workerQueueTimeLimit except AttributeError: workerQueueTimeLimit = 172800 workersToCheck = [] thingsToPostProcess = [] retMap = dict() for workSpec in all_workers: eventsRequestParams = {} eventsToUpdate = [] pandaIDs = [] workStatus = None workAttributes = None filesToStageOut = [] nJobsToReFill = None if workSpec.has_work_params('finalMonStatus'): # to post-process _bool, finalMonStatus = workSpec.get_work_params( 'finalMonStatus') _thing = (workSpec, (finalMonStatus, '')) thingsToPostProcess.append(_thing) else: # job-level late binding if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob: # check if job is requested jobRequested = messenger.job_requested(workSpec) if jobRequested: # set ready when job is requested workStatus = WorkSpec.ST_ready else: workStatus = workSpec.status elif workSpec.nJobsToReFill in [0, None]: # check if job is requested to refill free slots jobRequested = messenger.job_requested(workSpec) if jobRequested: nJobsToReFill = jobRequested workersToCheck.append(workSpec) else: workersToCheck.append(workSpec) # add retMap[workSpec.workerID] = { 'oldStatus': workSpec.status, 'newStatus': workStatus, 'monStatus': workStatus, 'workAttributes': workAttributes, 'filesToStageOut': filesToStageOut, 'eventsRequestParams': eventsRequestParams, 'eventsToUpdate': eventsToUpdate, 'diagMessage': '', 'pandaIDs': pandaIDs, 'nJobsToReFill': nJobsToReFill, 'isChecked': True } # check workers tmp_log.debug('checking workers with plugin') try: if workersToCheck: tmpStat, tmpOut = mon_core.check_workers(workersToCheck) if not tmpStat: tmp_log.error( 'failed to check workers with: {0}'.format(tmpOut)) workersToCheck = [] tmpOut = [] else: tmp_log.debug('checked') else: tmp_log.debug('Nothing to be checked with plugin') tmpOut = [] timeNow = datetime.datetime.utcnow() for workSpec, (newStatus, diagMessage) in itertools.chain( zip(workersToCheck, tmpOut), thingsToPostProcess): workerID = workSpec.workerID tmp_log.debug('Going to check workerID={0}'.format(workerID)) pandaIDs = [] if workerID in retMap: # failed to check status if newStatus is None: tmp_log.warning( 'Failed to check workerID={0} with {1}'.format( workerID, diagMessage)) retMap[workerID]['isChecked'] = False # set status if workSpec.checkTime is not None and checkTimeout is not None and \ timeNow - workSpec.checkTime > 
datetime.timedelta(seconds=checkTimeout): # kill due to timeout tmp_log.debug( 'kill workerID={0} due to consecutive check failures' .format(workerID)) self.dbProxy.kill_worker(workSpec.workerID) newStatus = WorkSpec.ST_cancelled diagMessage = 'Killed by Harvester due to consecutive worker check failures. ' + diagMessage workSpec.set_pilot_error( PilotErrors.ERR_FAILEDBYSERVER, diagMessage) else: # use original status newStatus = workSpec.status # request kill if messenger.kill_requested(workSpec): tmp_log.debug( 'kill workerID={0} as requested'.format(workerID)) self.dbProxy.kill_worker(workSpec.workerID) # stuck queuing for too long if workSpec.status == WorkSpec.ST_submitted \ and timeNow > workSpec.submitTime + datetime.timedelta(seconds=workerQueueTimeLimit): tmp_log.debug( 'kill workerID={0} due to queuing longer than {1} seconds' .format(workerID, workerQueueTimeLimit)) self.dbProxy.kill_worker(workSpec.workerID) diagMessage = 'Killed by Harvester due to worker queuing too long' + diagMessage workSpec.set_pilot_error( PilotErrors.ERR_FAILEDBYSERVER, diagMessage) # expired heartbeat - only when requested in the configuration try: # check if the queue configuration requires checking for worker heartbeat worker_heartbeat_limit = int( queue_config.messenger['worker_heartbeat']) except (AttributeError, KeyError): worker_heartbeat_limit = None tmp_log.debug( 'workerID={0} heartbeat limit is configured to {1}'. format(workerID, worker_heartbeat_limit)) if worker_heartbeat_limit: if messenger.is_alive(workSpec, worker_heartbeat_limit): tmp_log.debug( 'heartbeat for workerID={0} is valid'.format( workerID)) else: tmp_log.debug( 'heartbeat for workerID={0} expired: sending kill request' .format(workerID)) self.dbProxy.kill_worker(workSpec.workerID) diagMessage = 'Killed by Harvester due to worker heartbeat expired. 
' + diagMessage workSpec.set_pilot_error( PilotErrors.ERR_FAILEDBYSERVER, diagMessage) # get work attributes workAttributes = messenger.get_work_attributes(workSpec) retMap[workerID]['workAttributes'] = workAttributes # get output files filesToStageOut = messenger.get_files_to_stage_out( workSpec) retMap[workerID]['filesToStageOut'] = filesToStageOut # get events to update if workSpec.eventsRequest in [ WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents ]: eventsToUpdate = messenger.events_to_update(workSpec) retMap[workerID]['eventsToUpdate'] = eventsToUpdate # request events if workSpec.eventsRequest == WorkSpec.EV_useEvents: eventsRequestParams = messenger.events_requested( workSpec) retMap[workerID][ 'eventsRequestParams'] = eventsRequestParams # get PandaIDs for pull model if workSpec.mapType == WorkSpec.MT_NoJob: pandaIDs = messenger.get_panda_ids(workSpec) retMap[workerID]['pandaIDs'] = pandaIDs # keep original new status retMap[workerID]['monStatus'] = newStatus # set running or idle while there are events to update or files to stage out if newStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: if len(retMap[workerID]['filesToStageOut']) > 0 or \ len(retMap[workerID]['eventsToUpdate']) > 0: if workSpec.status == WorkSpec.ST_running: newStatus = WorkSpec.ST_running else: newStatus = WorkSpec.ST_idle elif not workSpec.is_post_processed(): if not queue_config.is_no_heartbeat_status( newStatus): # post processing unless heartbeat is suppressed jobSpecs = self.dbProxy.get_jobs_with_worker_id( workSpec.workerID, None, True, only_running=True, slim=True) # post processing messenger.post_processing( workSpec, jobSpecs, workSpec.mapType) workSpec.post_processed() if workSpec.status == WorkSpec.ST_running: newStatus = WorkSpec.ST_running else: newStatus = WorkSpec.ST_idle # reset modification time to immediately trigger subsequent lookup if not self.monitor_fifo.enabled: workSpec.trigger_next_lookup() retMap[workerID]['newStatus'] = newStatus retMap[workerID]['diagMessage'] = diagMessage else: tmp_log.debug( 'workerID={0} not in retMap'.format(workerID)) return True, retMap except Exception: core_utils.dump_error_message(tmp_log) return False, None
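# A sketch of the final-status holdback rule implemented above: a worker
# reported finished/failed/cancelled is kept in running (or idle) as long
# as it still has events to update or files to stage out, so propagation
# drains them before the terminal status is committed. effective_status is
# an illustrative condensation of that branch, not a harvester function.
def effective_status(reported, current, events_pending, files_pending):
    if reported in ('finished', 'failed', 'cancelled') and (events_pending or files_pending):
        return 'running' if current == 'running' else 'idle'
    return reported

assert effective_status('finished', 'running', True, False) == 'running'
assert effective_status('finished', 'idle', False, False) == 'finished'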
class WorkerAdjuster(object): # constructor def __init__(self, queue_config_mapper): self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() self.dbProxy = DBProxy() self.throttlerMap = dict() self.apf_mon = Apfmon(self.queueConfigMapper) try: self.maxNewWorkers = harvester_config.submitter.maxNewWorkers except AttributeError: self.maxNewWorkers = None # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name): tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') tmpLog.debug('start') tmpLog.debug('static_num_workers: {0}'.format(static_num_workers)) dyn_num_workers = copy.deepcopy(static_num_workers) try: # get queue status queueStat = self.dbProxy.get_cache("panda_queues.json", None) if queueStat is None: queueStat = dict() else: queueStat = queueStat.data # get job statistics job_stats = self.dbProxy.get_cache("job_statistics.json", None) if job_stats is None: job_stats = dict() else: job_stats = job_stats.data # define num of new workers for queueName in static_num_workers: # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) workerLimits_dict = self.dbProxy.get_worker_limits(queueName) maxWorkers = workerLimits_dict.get('maxWorkers', 0) nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0) nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT'] nQueue_total, nReady_total, nRunning_total = 0, 0, 0 apf_msg = None apf_data = None for resource_type, tmpVal in iteritems(static_num_workers[queueName]): tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'. format(queueName, resource_type, tmpVal)) # set 0 to num of new workers when the queue is disabled if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby', 'maintenance']: dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status']) tmpLog.debug(retMsg) apf_msg = 'Not submitting workers since queue status = {0}'.format(queueStat[queueName]['status']) continue # protection against not-up-to-date queue config if queueConfig is None: dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 retMsg = 'set nNewWorkers=0 due to missing queueConfig' tmpLog.debug(retMsg) apf_msg = 'Not submitting workers because of missing queueConfig' continue # get throttler if queueName not in self.throttlerMap: if hasattr(queueConfig, 'throttler'): throttler = self.pluginFactory.get_plugin(queueConfig.throttler) else: throttler = None self.throttlerMap[queueName] = throttler # check throttler throttler = self.throttlerMap[queueName] if throttler is not None: toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig) if toThrottle: dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg) tmpLog.debug(retMsg) continue # check stats nQueue = tmpVal['nQueue'] nReady = tmpVal['nReady'] nRunning = tmpVal['nRunning'] if resource_type != 'ANY': nQueue_total += nQueue nReady_total += nReady nRunning_total += nRunning if queueConfig.runMode == 'slave': nNewWorkersDef = tmpVal['nNewWorkers'] if nNewWorkersDef == 0: dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 retMsg = 'set nNewWorkers=0 by panda in slave mode' tmpLog.debug(retMsg) continue else: nNewWorkersDef = None # define num of new workers based on static site config nNewWorkers = 0 if nQueue >= 
nQueueLimitPerRT > 0: # enough queued workers retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue, nQueueLimitPerRT) tmpLog.debug(retMsg) pass elif (nQueue + nReady + nRunning) >= maxWorkers > 0: # enough workers in the system retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue, nReady, nRunning) retMsg += '>= maxWorkers({0})'.format(maxWorkers) tmpLog.debug(retMsg) pass else: maxQueuedWorkers = None if nQueueLimitPerRT > 0: # there is a limit set for the queue maxQueuedWorkers = nQueueLimitPerRT # reset maxQueuedWorkers according to central limits (slave mode) or the number of activated jobs (pull mode) if nNewWorkersDef is not None: # don't surpass limits given centrally maxQueuedWorkers_slave = nNewWorkersDef + nQueue if maxQueuedWorkers is not None: maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers) else: maxQueuedWorkers = maxQueuedWorkers_slave elif queueConfig.mapType == 'NoJob': # for pull mode, limit the queue to the number of activated jobs to avoid empty pilots try: n_activated = max(job_stats[queueName]['activated'], 1) # avoid queues with no activity queue_limit = maxQueuedWorkers maxQueuedWorkers = min(n_activated, maxQueuedWorkers) tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'. format(n_activated, queue_limit)) except KeyError: tmpLog.warning('n_activated not defined, defaulting to configured queue limits') pass if maxQueuedWorkers is None: # no value found, use default value maxQueuedWorkers = 1 # new workers nNewWorkers = max(maxQueuedWorkers - nQueue, 0) tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation' .format(nNewWorkers)) if maxWorkers > 0: nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0)) tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers' .format(nNewWorkers)) if queueConfig.maxNewWorkersPerCycle > 0: nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle) tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle' .format(nNewWorkers)) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: nNewWorkers = min(nNewWorkers, self.maxNewWorkers) tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers' .format(nNewWorkers)) dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers if queueConfig is None: maxNewWorkersPerCycle = 0 retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig' tmpLog.debug(retMsg) else: maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle if len(dyn_num_workers[queueName]) > 1: total_new_workers_rts = sum( dyn_num_workers[queueName][_rt]['nNewWorkers'] if _rt != 'ANY' else 0 for _rt in dyn_num_workers[queueName] ) nNewWorkers_max_agg = min( max(nQueueLimit - nQueue_total, 0), max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0), ) if maxNewWorkersPerCycle >= 0: nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers) # exceeded the max, adjust if total_new_workers_rts > nNewWorkers_max_agg: if nNewWorkers_max_agg == 0: for resource_type in dyn_num_workers[queueName]: dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE') else: tmpLog.debug('nNewWorkers_max_agg={0} 
for UCORE'.format(nNewWorkers_max_agg)) _d = dyn_num_workers[queueName].copy() del _d['ANY'] simple_rt_nw_list = [ [_rt, _d[_rt].get('nNewWorkers', 0), 0] for _rt in _d ] _countdown = nNewWorkers_max_agg for _rt_list in simple_rt_nw_list: resource_type, nNewWorkers_orig, _r = _rt_list nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg, total_new_workers_rts) dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers _rt_list[2] = remainder _countdown -= nNewWorkers _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1 _countdown -= 1 for resource_type in dyn_num_workers[queueName]: if resource_type == 'ANY': continue nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers'] tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE' .format(nNewWorkers, resource_type)) if not apf_msg: apf_data = copy.deepcopy(dyn_num_workers[queueName]) self.apf_mon.update_label(queueName, apf_msg, apf_data) # dump tmpLog.debug('defined {0}'.format(str(dyn_num_workers))) return dyn_num_workers except Exception: # dump error errMsg = core_utils.dump_error_message(tmpLog) return None
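The UCORE branch above caps the summed per-resource-type requests at nNewWorkers_max_agg and redistributes them proportionally, handing leftover slots to the entries with the largest divmod remainders. A self-contained sketch of that apportionment idea (function and variable names here are illustrative, not Harvester APIs, and the tie-breaking is simplified relative to the code above):

# Largest-remainder apportionment: scale per-type requests down to a total cap,
# then hand the leftover units to the types with the largest remainders.
def apportion(requests, cap):
    total = sum(requests.values())
    if total <= cap:
        return dict(requests)
    scaled = {}
    remainders = []
    given = 0
    for rt, n in requests.items():
        q, r = divmod(n * cap, total)
        scaled[rt] = q
        given += q
        remainders.append((r, rt))
    for r, rt in sorted(remainders, reverse=True):
        if given >= cap:
            break
        scaled[rt] += 1
        given += 1
    return scaled

# e.g. apportion({'SCORE': 5, 'MCORE': 3}, 4) -> {'SCORE': 3, 'MCORE': 1}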
def __init__(self): self.pluginFactory = PluginFactory() self.dbProxy = DBProxy()
class WorkerMaker: # constructor def __init__(self): self.pluginFactory = PluginFactory() self.dbProxy = DBProxy() # make workers def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type): tmpLog = core_utils.make_logger(_logger, 'queue={0}'.format( queue_config.queueName), method_name='make_workers') tmpLog.debug('start') try: # get plugin maker = self.pluginFactory.get_plugin(queue_config.workerMaker) if maker is None: # not found tmpLog.error('plugin for {0} not found'.format( queue_config.queueName)) return [], jobchunk_list # get ready workers readyWorkers = self.dbProxy.get_ready_workers( queue_config.queueName, n_ready) # loop over all chunks okChunks = [] ngChunks = [] for iChunk, jobChunk in enumerate(jobchunk_list): # make a worker if iChunk >= n_ready: workSpec = maker.make_worker(jobChunk, queue_config, resource_type) else: # use ready worker if iChunk < len(readyWorkers): workSpec = readyWorkers[iChunk] else: workSpec = None # failed if workSpec is None: ngChunks.append(jobChunk) continue # set workerID if workSpec.workerID is None: workSpec.workerID = self.dbProxy.get_next_seq_number( 'SEQ_workerID') workSpec.configID = queue_config.configID workSpec.isNew = True okChunks.append((workSpec, jobChunk)) # dump tmpLog.debug('made {0} workers while {1} chunks failed'.format( len(okChunks), len(ngChunks))) return okChunks, ngChunks except Exception: # dump error core_utils.dump_error_message(tmpLog) return [], jobchunk_list # get number of jobs per worker def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type): # get plugin maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_jobs_per_worker(n_workers) # get number of workers per job def get_num_workers_per_job(self, queue_config, n_workers, resource_type): # get plugin maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_workers_per_job(n_workers)
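The worker-maker plugin interface exercised above is small: make_worker maps one job chunk to a WorkSpec (or None on failure), while the two get_num_* hooks size the chunks. A bare-bones sketch of a conforming plugin; real makers usually derive from BaseWorkerMaker, as MultiNodeWorkerMaker does further below, and this one is purely illustrative:

from pandaharvester.harvestercore.work_spec import WorkSpec

class SimpleWorkerMaker(object):
    """Illustrative maker: one single-core worker per job chunk."""
    def make_worker(self, jobchunk, queue_config, resource_type):
        workSpec = WorkSpec()
        workSpec.nCore = 1
        workSpec.mapType = queue_config.mapType
        return workSpec

    # one job per worker
    def get_num_jobs_per_worker(self, n_workers):
        return 1

    # one worker per job
    def get_num_workers_per_job(self, n_workers):
        return 1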
def on_connect(self, conn): self.pluginFactory = PluginFactory(no_db=True)
class Preparator(AgentBase): # constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.dbProxy = DBProxy() self.communicator = communicator self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'preparator-{0}'.format(self.get_pid()) while True: sw = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('try to get jobs to check') # get jobs to check preparation try: maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToCheck if maxFilesPerJob <= 0: maxFilesPerJob = None except Exception: maxFilesPerJob = None jobsToCheck = self.dbProxy.get_jobs_in_sub_status( 'preparing', harvester_config.preparator.maxJobsToCheck, 'preparatorTime', 'lockedBy', harvester_config.preparator.checkInterval, harvester_config.preparator.lockInterval, lockedBy, max_files_per_job=maxFilesPerJob, ng_file_status_list=['ready']) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format( jobSpec.PandaID), method_name='run') try: tmpLog.debug('start checking') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue( jobSpec.computingSite, configID): tmpLog.error( 'queue config for {0}/{1} not found'.format( jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue( jobSpec.computingSite, jobSpec.configID) oldSubStatus = jobSpec.subStatus # get plugin if jobSpec.auxInput in [None, JobSpec.AUX_allTriggered]: preparatorCore = self.pluginFactory.get_plugin( queueConfig.preparator) else: preparatorCore = self.pluginFactory.get_plugin( queueConfig.aux_preparator) if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format( jobSpec.computingSite)) continue tmpLog.debug("plugin={0}".format( preparatorCore.__class__.__name__)) # lock job again lockedAgain = self.dbProxy.lock_job_again( jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue tmpStat, tmpStr = preparatorCore.check_stage_in_status( jobSpec) # still running if tmpStat is None: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.debug( 'try to check later since still preparing with {0}' .format(tmpStr)) continue # succeeded if tmpStat is True: # resolve path tmpStat, tmpStr = preparatorCore.resolve_input_paths( jobSpec) if tmpStat is False: jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.error( 'failed to resolve input file paths : {0}'. 
format(tmpStr)) continue # manipulate container-related job params jobSpec.manipulate_job_params_for_container() # update job jobSpec.lockedBy = None jobSpec.set_all_input_ready() if (maxFilesPerJob is None and jobSpec.auxInput is None) or \ (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inReady]): # all done allDone = True jobSpec.subStatus = 'prepared' jobSpec.preparatorTime = None if jobSpec.auxInput is not None: jobSpec.auxInput = JobSpec.AUX_allReady else: # immediate next lookup since there could be more files to check allDone = False jobSpec.trigger_preparation() # change auxInput flag to check auxiliary inputs if len( jobSpec.inFiles ) == 0 and jobSpec.auxInput == JobSpec.AUX_allTriggered: jobSpec.auxInput = JobSpec.AUX_inReady self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }, update_in_file=True) if allDone: tmpLog.debug('succeeded') else: tmpLog.debug('partially succeeded') else: # update job jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.error('failed with {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger preparation mainLog.debug('try to get jobs to prepare') try: maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToPrepare if maxFilesPerJob <= 0: maxFilesPerJob = None except Exception: maxFilesPerJob = None jobsToTrigger = self.dbProxy.get_jobs_in_sub_status( 'fetched', harvester_config.preparator.maxJobsToTrigger, 'preparatorTime', 'lockedBy', harvester_config.preparator.triggerInterval, harvester_config.preparator.lockInterval, lockedBy, 'preparing', max_files_per_job=maxFilesPerJob, ng_file_status_list=['triggered', 'ready']) mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger))) # loop over all jobs fileStatMap = dict() for jobSpec in jobsToTrigger: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format( jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to trigger preparation') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue( jobSpec.computingSite, configID): tmpLog.error( 'queue config for {0}/{1} not found'.format( jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue( jobSpec.computingSite, configID) oldSubStatus = jobSpec.subStatus # get plugin if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]: preparatorCore = self.pluginFactory.get_plugin( queueConfig.preparator) fileType = 'input' else: preparatorCore = self.pluginFactory.get_plugin( queueConfig.aux_preparator) fileType = FileSpec.AUX_INPUT if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format( jobSpec.computingSite)) continue tmpLog.debug("plugin={0}".format( preparatorCore.__class__.__name__)) # lock job again lockedAgain = self.dbProxy.lock_job_again( jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # check file status if queueConfig.ddmEndpointIn not in fileStatMap: fileStatMap[queueConfig.ddmEndpointIn] = dict() # check if has to_prepare hasToPrepare = False for 
fileSpec in jobSpec.inFiles: if fileSpec.status == 'to_prepare': hasToPrepare = True break newFileStatusData = [] toWait = False newInFiles = [] for fileSpec in jobSpec.inFiles: if fileSpec.status in ['preparing', 'to_prepare']: newInFiles.append(fileSpec) updateStatus = False if fileSpec.lfn not in fileStatMap[ queueConfig.ddmEndpointIn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ = self.dbProxy.get_file_status(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn, 'starting') if 'ready' in fileStatMap[ queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' if fileStatMap[queueConfig.ddmEndpointIn][ fileSpec.lfn]['ready']['path']: fileSpec.path = list( fileStatMap[queueConfig.ddmEndpointIn][ fileSpec.lfn]['ready']['path'])[0] # set group info if any groupInfo = self.dbProxy.get_group_for_file( fileSpec.lfn, fileType, queueConfig.ddmEndpointIn) if groupInfo is not None: fileSpec.groupID = groupInfo['groupID'] fileSpec.groupStatus = groupInfo[ 'groupStatus'] fileSpec.groupUpdateTime = groupInfo[ 'groupUpdateTime'] updateStatus = True elif (not hasToPrepare and 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \ 'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is being prepared by another toWait = True if fileSpec.status != 'preparing': fileSpec.status = 'preparing' updateStatus = True else: # change file status if the file is not prepared by another if fileSpec.status != 'to_prepare': fileSpec.status = 'to_prepare' updateStatus = True # set new status if updateStatus: newFileStatusData.append( (fileSpec.fileID, fileSpec.lfn, fileSpec.status)) fileStatMap[queueConfig.ddmEndpointIn][ fileSpec.lfn].setdefault( fileSpec.status, None) if len(newFileStatusData) > 0: self.dbProxy.change_file_status( jobSpec.PandaID, newFileStatusData, lockedBy) # wait since files are being prepared by another if toWait: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.debug( 'wait since files are being prepared by another job' ) continue # trigger preparation tmpStat, tmpStr = preparatorCore.trigger_preparation( jobSpec) # check result if tmpStat is True: # succeeded jobSpec.lockedBy = None if (maxFilesPerJob is None and jobSpec.auxInput is None) or \ (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inTriggered]): # all done allDone = True jobSpec.subStatus = 'preparing' jobSpec.preparatorTime = None if jobSpec.auxInput is not None: jobSpec.auxInput = JobSpec.AUX_allTriggered else: # change file status but not change job sub status since # there could be more files to prepare allDone = False for fileSpec in jobSpec.inFiles: if fileSpec.status == 'to_prepare': fileSpec.status = 'triggered' # immediate next lookup jobSpec.trigger_preparation() # change auxInput flag to prepare auxiliary inputs if len( jobSpec.inFiles ) == 0 and jobSpec.auxInput == JobSpec.AUX_hasAuxInput: jobSpec.auxInput = JobSpec.AUX_inTriggered self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }, update_in_file=True) if allDone: tmpLog.debug('triggered') else: tmpLog.debug('partially triggered') elif tmpStat is False: # fatal error jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr) 
jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.debug( 'failed to trigger with {0}'.format(tmpStr)) else: # temporary error jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.debug( 'try to prepare later since {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) mainLog.debug('done' + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.preparator.sleepTime): mainLog.debug('terminated') return
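A preparator plugin, as driven by the loops above, implements trigger_preparation, check_stage_in_status and resolve_input_paths, each returning a (status, message) pair where True means done, False means a fatal error and None means try again later. A skeletal no-op plugin illustrating the contract (a sketch only, assuming JobSpec.set_input_file_paths as used by the stock dummy preparator; real plugins do the actual transfers):

from pandaharvester.harvestercore.plugin_base import PluginBase

class NoOpPreparator(PluginBase):
    """Illustrative preparator that treats every input file as already staged in."""
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)

    def trigger_preparation(self, jobspec):
        # True = triggered, False = fatal error, None = temporary error
        return True, ''

    def check_stage_in_status(self, jobspec):
        # True = done, False = fatal error, None = still in progress
        return True, ''

    def resolve_input_paths(self, jobspec):
        # publish local access paths for the input files
        path_info = {f.lfn: {'path': f.path} for f in jobspec.inFiles}
        jobspec.set_input_file_paths(path_info)
        return True, ''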
class RpcBot(rpyc.Service): # initialization action def on_connect(self, conn): self.pluginFactory = PluginFactory(no_db=True) ###################### # submitter section # submit workers def exposed_submit_workers(self, plugin_config, workspec_list): core = self.pluginFactory.get_plugin(plugin_config) return core.submit_workers(workspec_list) ###################### # monitor section # check workers def exposed_check_workers(self, plugin_config, workspec_list): core = self.pluginFactory.get_plugin(plugin_config) return core.check_workers(workspec_list) ###################### # messenger section # setup access points def exposed_setup_access_points(self, plugin_config, workspec_list): core = self.pluginFactory.get_plugin(plugin_config) return core.setup_access_points(workspec_list) # feed jobs def exposed_feed_jobs(self, plugin_config, workspec, jobspec_list): core = self.pluginFactory.get_plugin(plugin_config) return core.feed_jobs(workspec, jobspec_list) # request job def exposed_job_requested(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.job_requested(workspec) # request kill def exposed_kill_requested(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.kill_requested(workspec) # is alive def exposed_is_alive(self, plugin_config, workspec, worker_heartbeat_limit): core = self.pluginFactory.get_plugin(plugin_config) return core.is_alive(workspec, worker_heartbeat_limit) # get work attributes def exposed_get_work_attributes(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.get_work_attributes(workspec) # get output files def exposed_get_files_to_stage_out(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.get_files_to_stage_out(workspec) # feed events def exposed_feed_events(self, plugin_config, workspec, events_dict): core = self.pluginFactory.get_plugin(plugin_config) return core.feed_events(workspec, events_dict) # get events def exposed_events_to_update(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.events_to_update(workspec) # request events def exposed_events_requested(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.events_requested(workspec) # get PandaIDs def exposed_get_panda_ids(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.get_panda_ids(workspec) # post processing def exposed_post_processing(self, plugin_config, workspec, jobspec_list, map_type): core = self.pluginFactory.get_plugin(plugin_config) return core.post_processing(workspec, jobspec_list, map_type) # send ACK def exposed_acknowledge_events_files(self, plugin_config, workspec): core = self.pluginFactory.get_plugin(plugin_config) return core.acknowledge_events_files(workspec)
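rpyc serves each exposed_* method to clients under conn.root without the exposed_ prefix, so a Harvester component on another host can drive these plugins remotely. A minimal client-side sketch; the host, port and plugin parameters are placeholders, with the module path assuming the dummy submitter shipped with Harvester:

import rpyc

# connect to a running RpcBot; host and port are illustrative
conn = rpyc.connect('rpc-host.example.org', 18861)
plugin_config = {'module': 'pandaharvester.harvestersubmitter.dummy_submitter',
                 'name': 'DummySubmitter'}
workspec_list = []  # WorkSpec instances prepared by the caller
# exposed_submit_workers is reachable without the 'exposed_' prefix
ret_list = conn.root.submit_workers(plugin_config, workspec_list)
conn.close()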
class Preparator(AgentBase): # constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.dbProxy = DBProxy() self.communicator = communicator self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'preparator-{0}'.format(self.ident) while True: sw = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('try to get jobs to check') # get jobs to check preparation jobsToCheck = self.dbProxy.get_jobs_in_sub_status('preparing', harvester_config.preparator.maxJobsToCheck, 'preparatorTime', 'lockedBy', harvester_config.preparator.checkInterval, harvester_config.preparator.lockInterval, lockedBy) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('start checking') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, jobSpec.configID) oldSubStatus = jobSpec.subStatus # get plugin preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator) if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue tmpStat, tmpStr = preparatorCore.check_status(jobSpec) # still running if tmpStat is None: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.debug('try to check later since still preparing with {0}'.format(tmpStr)) continue # succeeded if tmpStat is True: # resolve path tmpStat, tmpStr = preparatorCore.resolve_input_paths(jobSpec) if tmpStat is False: jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.error('failed to resolve input file paths : {0}'.format(tmpStr)) continue # update job jobSpec.subStatus = 'prepared' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.set_all_input_ready() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}, update_in_file=True) tmpLog.debug('succeeded') else: # update job jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.error('failed with {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger preparation mainLog.debug('try to get jobs to prepare') jobsToTrigger = self.dbProxy.get_jobs_in_sub_status('fetched', harvester_config.preparator.maxJobsToTrigger, 'preparatorTime', 'lockedBy', harvester_config.preparator.triggerInterval, harvester_config.preparator.lockInterval, 
lockedBy, 'preparing') mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger))) # loop over all jobs fileStatMap = dict() for jobSpec in jobsToTrigger: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to trigger preparation') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) oldSubStatus = jobSpec.subStatus # get plugin preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator) if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # check file status if queueConfig.ddmEndpointIn not in fileStatMap: fileStatMap[queueConfig.ddmEndpointIn] = dict() newFileStatusData = [] toWait = False for fileSpec in jobSpec.inFiles: if fileSpec.status == 'preparing': updateStatus = False if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ = self.dbProxy.get_file_status(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn, 'starting') if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' # set group info if any groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn) if groupInfo is not None: fileSpec.groupID = groupInfo['groupID'] fileSpec.groupStatus = groupInfo['groupStatus'] fileSpec.groupUpdateTime = groupInfo['groupUpdateTime'] updateStatus = True elif 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is being prepared by another toWait = True else: # change file status if the file is not prepared by another fileSpec.status = 'to_prepare' updateStatus = True # set new status if updateStatus: newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status)) if fileSpec.status not in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] = 0 fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] += 1 if len(newFileStatusData) > 0: self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy) # wait since files are being prepared by another if toWait: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.debug('wait since files are being prepared by another job') continue # trigger preparation tmpStat, tmpStr = preparatorCore.trigger_preparation(jobSpec) # check result if tmpStat is True: # succeeded jobSpec.subStatus = 'preparing' jobSpec.lockedBy = None jobSpec.preparatorTime = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}, update_in_file=True) tmpLog.debug('triggered') elif tmpStat is False: # fatal error jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) 
jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.debug('failed to trigger with {0}'.format(tmpStr)) else: # temporary error jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.debug('try to prepare later since {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) mainLog.debug('done' + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.preparator.sleepTime): mainLog.debug('terminated') return
class Submitter(AgentBase): # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.workerMaker = WorkerMaker() self.workerAdjuster = WorkerAdjuster(queue_config_mapper) self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'submitter-{0}'.format(self.ident) while True: mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit( harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime) mainLog.debug('got {0} queues for site {1}'.format( len(curWorkers), siteName)) # get commands if siteName is not None: comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver( 'submitter', comStr) mainLog.debug('got {0} commands'.format(len(commandSpecs))) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit( siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][ 'nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: nWorkersPerQueue = dict() else: nWorkersPerQueue = self.workerAdjuster.define_num_workers( curWorkers, siteName) if nWorkersPerQueue is None: mainLog.error( 'WorkerAdjuster failed to define the number of workers') elif len(nWorkersPerQueue) == 0: pass else: # loop over all queues for queueName, tmpVal in iteritems(nWorkersPerQueue): tmpLog = core_utils.make_logger( _logger, 'queue={0}'.format(queueName), method_name='run') tmpLog.debug('start') nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue(queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug( 'skipped since no new worker is needed based on current stats' ) continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( queueConfig, nWorkers) tmpLog.debug( 'nJobsPerWorker={0}'.format(nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job( queueConfig, nWorkers) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, 
harvester_config.submitter.lockInterval, lockedBy) else: tmpLog.error('unknown mapType={0}'.format( queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format(len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers( jobChunks, queueConfig, nReady) if len(ngChunks) == 0: tmpLog.debug('successfully made {0} workers'.format( len(okChunks))) else: tmpLog.debug( 'made {0} workers, while {1} workers failed'. format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() # NG for ngJobs in ngChunks: for jobSpec in ngJobs: jobSpec.status = 'failed' jobSpec.subStatus = 'failedtomake' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': 'prepared' }) # OK pandaIDs = set() workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [None, 0]: workSpec.set_jobspec_list(okJobs) else: # refill free slots while the worker is running nJobsToReFill = workSpec.nJobsToReFill workSpec.set_jobspec_list(okJobs[:nJobsToReFill]) workSpec.nJobsToReFill = None # release the jobs that do not fit into this worker for jobSpec in okJobs[nJobsToReFill:]: pandaIDs.add(jobSpec.PandaID) # map type workSpec.mapType = queueConfig.mapType # queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger[ 'accessPoint'] # events if len(okJobs) > 0 and ( 'eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: # get plugin for submitter submitterCore = self.pluginFactory.get_plugin( queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found'.format( queueConfig.queueName)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin( queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found'.format( queueConfig.queueName)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs( workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}'. format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}'. format(workSpec.workerID, tmpStat)) # submit tmpLog.debug('submitting {0} workers'.format( len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers( submitterCore, workSpecList) for iWorker, (tmpRet, tmpStr) in enumerate( zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission tmpLog.error( 'failed to submit a workerID={0} with {1}'. 
format(workSpec.workerID, tmpStr)) workSpec.set_status(WorkSpec.ST_missed) jobList = [] elif queueConfig.useJobLateBinding and workSpec.hasJob == 1: # directly go to running after feeding jobs for late binding workSpec.set_status(WorkSpec.ST_running) else: # normal successful submission workSpec.set_status(WorkSpec.ST_submitted) workSpec.submitTime = timeNow workSpec.modificationTime = timeNow # prefetch events if tmpRet and workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for jobSpec in jobList: eventsRequestParams[jobSpec.PandaID] = { 'pandaID': jobSpec.PandaID, 'taskID': jobSpec.taskID, 'jobsetID': jobSpec.jobParams['jobsetID'], 'nRanges': jobSpec.jobParams['coreCount'], } workSpec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker( workSpec, jobList, lockedBy) if jobList is not None: for jobSpec in jobList: pandaIDs.add(jobSpec.PandaID) if tmpStat: tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}' tmpLog.info( tmpStr.format( workSpec.workerID, jobSpec.PandaID, workSpec.batchID)) else: tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}' tmpLog.error( tmpStr.format( jobSpec.PandaID, workSpec.batchID)) # release jobs self.dbProxy.release_jobs(pandaIDs, lockedBy) mainLog.debug('done') # check if being terminated if self.terminated(harvester_config.submitter.sleepTime): mainLog.debug('terminated') return # wrapper for submitWorkers to skip ready workers def submit_workers(self, submitter_core, workspec_list): retList = [] strList = [] newSpecList = [] workersToSubmit = [] for workSpec in workspec_list: if workSpec.status == WorkSpec.ST_ready: newSpecList.append(workSpec) retList.append(True) strList.append('') else: workersToSubmit.append(workSpec) tmpRetList = submitter_core.submit_workers(workersToSubmit) for tmpRet, tmpStr in tmpRetList: retList.append(tmpRet) strList.append(tmpStr) newSpecList += workersToSubmit return newSpecList, retList, strList
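The only method a submitter plugin must provide for the flow above is submit_workers, returning one (status, diagnostics) tuple per WorkSpec in input order; the wrapper then prepends (True, '') entries for workers that were already ready. A minimal conforming sketch, illustrative only:

from pandaharvester.harvestercore.plugin_base import PluginBase

class EchoSubmitter(PluginBase):
    """Illustrative submitter that assigns a fake batch ID instead of submitting."""
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)

    def submit_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:
            # a real plugin would submit to the batch system here
            workSpec.batchID = 'fake.{0}'.format(workSpec.workerID)
            retList.append((True, ''))
        return retList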
queueName = sys.argv[1] if len(sys.argv) > 2: begin_job_id = int(sys.argv[2]) if len(sys.argv) > 3: end_job_id = int(sys.argv[3]) if len(sys.argv) > 4: globus_sleep_time = int(sys.argv[4]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_preparator = queueConfig.preparator queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator' queueConfig.preparator['name'] = 'GlobusBulkPreparator' modified_queueConfig_preparator = queueConfig.preparator pluginFactory = PluginFactory() # get stage-in plugin preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) # logger _logger = core_utils.setup_logger('stageInTest_go_bulk_preparator') tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_bulk_preparator') tmpLog.debug('start') for loggerName, loggerObj in logging.Logger.manager.loggerDict.items(): # print("loggerName - {}".format(loggerName)) if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue
'outFiles': fileSpec.lfn + ',log', 'scopeOut': 'panda', 'scopeLog': 'panda', 'logFile': 'log', 'realDatasets': 'panda.' + fileSpec.lfn, 'ddmEndPointOut': 'BNL-OSG2_DATADISK', } jobSpec.computingSite = queueName jobSpec.PandaID = job_id jobSpec.add_out_file(fileSpec) print("file to transfer - {}".format(assFileSpec.path)) print("dump(jobSpec)") # dump(jobSpec) pluginFactory = PluginFactory() # get stage-out plugin stagerCore = pluginFactory.get_plugin(queueConfig.stager) print("plugin={0}".format(stagerCore.__class__.__name__)) print("testing zip") tmpStat, tmpOut = stagerCore.zip_output(jobSpec) if tmpStat: print(" OK") else: print(" NG {0}".format(tmpOut)) print() print("testing stage-out")
def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) self.pluginFactory = PluginFactory() self.queue_config_mapper = QueueConfigMapper() tmpLog = self.make_logger(baseLogger, method_name='__init__') tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))
class EventFeeder(AgentBase): # constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.dbProxy = DBProxy() self.queueConfigMapper = queue_config_mapper self.communicator = communicator self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'eventfeeder-{0}'.format(self.get_pid()) while True: mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting workers to feed events') workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers, harvester_config.eventfeeder.lockInterval, lockedBy) mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) # loop over all workers for queueName, workSpecList in iteritems(workSpecsPerQueue): tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run') # check queue if not self.queueConfigMapper.has_queue(queueName): tmpQueLog.error('config not found') continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents: scattered = True else: scattered = False # get plugin messenger = self.pluginFactory.get_plugin(queueConfig.messenger) # loop over all workers for workSpec in workSpecList: tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID), method_name='run') # lock worker again lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy) if not lockedFlag: tmpLog.debug('skipped since locked by another') continue # get events tmpLog.debug('get events') tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams, scattered, workSpec.get_access_point()) # failed if tmpStat is False: tmpLog.error('failed to get events with {0}'.format(events)) continue # lock worker again lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy) if not lockedFlag: tmpLog.debug('skipped before feeding since locked by another') continue tmpStat = messenger.feed_events(workSpec, events) # failed if tmpStat is False: tmpLog.error('failed to feed events') continue # dump for pandaID, eventList in iteritems(events): try: nRanges = workSpec.eventsRequestParams[pandaID]['nRanges'] except Exception: nRanges = None tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(len(eventList), pandaID, nRanges)) # disable multi workers if workSpec.mapType == WorkSpec.MT_MultiWorkers: if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges): tmpStat = self.dbProxy.disable_multi_workers(pandaID) if tmpStat == 1: tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID) tmpLog.debug(tmpStr) # update worker workSpec.eventsRequest = WorkSpec.EV_useEvents workSpec.eventsRequestParams = None workSpec.eventFeedTime = None workSpec.eventFeedLock = None # update local database tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy}) tmpLog.debug('done with {0}'.format(tmpStat)) tmpQueLog.debug('done') mainLog.debug('done') # check if being terminated if self.terminated(harvester_config.eventfeeder.sleepTime): mainLog.debug('terminated') return
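The eventsRequestParams consumed here is the per-PandaID dictionary that the Submitter attaches to the worker at submission time (see the prefetch-events block above); its shape, with placeholder values:

# keyed by PandaID; built by the Submitter, consumed by the EventFeeder
eventsRequestParams = {
    1234: {
        'pandaID': 1234,
        'taskID': 5678,
        'jobsetID': 1,
        'nRanges': 8,   # used above to detect under-filled multi-worker jobs
    },
}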
class SAGAMonitor(PluginBase): # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) self.pluginFactory = PluginFactory() self.queue_config_mapper = QueueConfigMapper() tmpLog = self.make_logger(baseLogger, method_name='__init__') tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor)) # check workers def check_workers(self, workspec_list): """Check status of workers. This method takes a list of WorkSpecs as its input argument and returns a list of workers' statuses. :param workspec_list: a list of work spec instances :return: A tuple of return code (True for success, False otherwise) and a list of workers' statuses. :rtype: (bool, [string,]) """ try: job_service = rs.job.Service(self.adaptor) except rs.SagaException: # transient SAGA error: wait and retry time.sleep(10) return self.check_workers(workspec_list) retList = [] for workSpec in workspec_list: # make logger errStr = '' tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), method_name='check_workers') tmpLog.debug("SAGA monitor started") if workSpec.batchID: saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID) try: worker = job_service.get_job(saga_submission_id) tmpLog.debug( 'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state)) harvester_job_state = SAGASubmitter.status_translator(worker.state) workSpec.nativeStatus = worker.state workSpec.set_status(harvester_job_state) tmpLog.debug( 'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID, harvester_job_state, worker.exit_code)) if worker.created: tmpLog.debug("Worker created (SAGA): {0}".format(worker.created)) workSpec.submitTime = datetime.utcfromtimestamp(worker.created) if worker.started: tmpLog.debug("Worker started (SAGA): {0}".format(worker.started)) workSpec.startTime = datetime.utcfromtimestamp(worker.started) if worker.finished: tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished)) workSpec.endTime = datetime.utcfromtimestamp(worker.finished) if workSpec.is_final_status(): workSpec.nativeExitCode = worker.exit_code tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status, workSpec.nativeExitCode)) if workSpec.nativeExitCode != 0: # let's try to find exit code, exit message etc... 
tmpLog.info("Deep check to find exit code and exit status required") harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob( workSpec.batchID, workSpec.workerID) if harvester_job_state == "": harvester_job_state = workSpec.ST_finished if not workSpec.startTime: workSpec.startTime = starttime if endtime: workSpec.endTime = endtime workSpec.set_status(harvester_job_state) # jsonFilePath = os.path.join(workSpec.get_access_point(), harvester_config.payload_interaction.killWorkerFile) # tmpLog.debug('Going to request kill worker via file {0}.'.format(jsonFilePath)) # try: # os.utime(jsonFilePath, None) # except OSError: # open(jsonFilePath, 'a').close() tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format( workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state)) tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished)) if worker.state == rs.job.PENDING: queue_time = (datetime.now() - workSpec.submitTime).total_seconds() tmpLog.info("Worker queued for {0} sec.".format(queue_time)) if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime: tmpLog.info( "Queue time {0} is longer than limit {1} worker will be canceled".format(queue_time, self.maxqueuetime)) worker.cancel() worker.wait() workSpec.nativeExitCode = worker.exit_code cur_time = datetime.now() workSpec.startTime = cur_time workSpec.endTime = cur_time workSpec.set_pilot_closed() workSpec.set_status(workSpec.ST_cancelled) harvester_job_state = workSpec.ST_cancelled tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state, workSpec.nativeExitCode)) # proper processing of jobs for worker will be required, to avoid 'fake' fails if worker.state == rs.job.RUNNING: tmpLog.info("Going to check that all jobs of the worker are in the final status.") dbProxy = DBProxy() job_spec_list = dbProxy.get_jobs_with_worker_id(workSpec.workerID, None, only_running=False, slim=False) allFinal = True for job_spec in job_spec_list: if not job_spec.is_final_status(): allFinal = False tmpLog.info("Not all jobs are in the final status, skip till the next monitoring cycle.") break if allFinal: tmpLog.info("All jobs are in the final status, going to cancel the worker.") worker.cancel() worker.wait() workSpec.nativeExitCode = 0 cur_time = datetime.utcnow() workSpec.endTime = cur_time jsonFilePath = os.path.join(workSpec.get_access_point(), harvester_config.payload_interaction.killWorkerFile) tmpLog.debug('Going to request kill worker via file {0}.'.format(jsonFilePath)) try: os.utime(jsonFilePath, None) except OSError: open(jsonFilePath, 'a').close() workSpec.set_status(workSpec.ST_finished) harvester_job_state = workSpec.ST_finished tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state, workSpec.nativeExitCode)) except rs.SagaException as ex: tmpLog.info('An exception occured during retriving worker information {0}'.format(workSpec.batchID)) tmpLog.info(ex.get_message()) # probably 'fnished' is not proper state in this case, 'undefined' looks a bit better # some more work for SAGA to get proper state harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob( workSpec.batchID, workSpec.workerID) if harvester_job_state == "": harvester_job_state = workSpec.ST_finished if not workSpec.startTime: workSpec.startTime = starttime if endtime: workSpec.endTime = endtime workSpec.set_status(harvester_job_state) 
tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state)) retList.append((harvester_job_state, errStr)) # for compatibility with dummy monitor f = open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') f.write(workSpec.status) f.close() else: tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID)) job_service.close() tmpLog.debug('Results: {0}'.format(retList)) return True, retList def deep_checkjob(self, batchid, workerid): """ Get job state, exit code and some more parameters from resource-dependent sources :param batchid: :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage """ tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob') harvester_job_state = None nativeexitcode = None nativestatus = None diagmessage = "" starttime = None endtime = None queue_config = self.queue_config_mapper.get_queue(self.queueName) if hasattr(queue_config, 'resource'): resource_utils = self.pluginFactory.get_plugin(queue_config.resource) else: tmpLog.debug("Resource configuration missing for: {0}".format(self.queueName)) resource_utils = None if resource_utils: batchjob_info = resource_utils.get_batchjob_info(batchid) if batchjob_info: tmpLog.info('Batch job info collected: {0}'.format(batchjob_info)) harvester_job_state = batchjob_info['status'] nativeexitcode = batchjob_info['nativeExitCode'] nativestatus = batchjob_info['nativeStatus'] diagmessage = batchjob_info['nativeExitMsg'] if batchjob_info['start_time']: starttime = batchjob_info['start_time'] if batchjob_info['finish_time']: endtime = batchjob_info['finish_time'] return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
jobType = sys.argv[2] else: print('value for jobType not valid, defaulted to {0}'.format( jobType)) # resourceType should be 'SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'. If not specified defaults to single core if sys.argv[3] in ('SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'): resourceType = sys.argv[3] else: print('value for resourceType not valid, defaulted to {0}'.format( resourceType)) print('Running with queueName:{0}, jobType:{1}, resourceType:{2}'.format( queueName, jobType, resourceType)) pluginFactory = PluginFactory() com = CommunicatorPool() # get job jobSpecList = [] if queueConfig.mapType != WorkSpec.MT_NoJob: jobs, errStr = com.get_jobs(queueConfig.queueName, 'nodeName', queueConfig.prodSourceLabel, 'computingElement', 1, None) if len(jobs) == 0: print("Failed to get jobs at {0} due to {1}".format( queueConfig.queueName, errStr)) sys.exit(0) jobSpec = JobSpec()
class MultiNodeWorkerMaker(BaseWorkerMaker): # constructor def __init__(self, **kwarg): BaseWorkerMaker.__init__(self, **kwarg) self.pluginFactory = PluginFactory() self.queue_config_mapper = QueueConfigMapper() tmpLog = self.make_logger(baseLogger, method_name='__init__') tmpLog.info("Multinode workermaker: created.") tmpLog.debug("Queue name: {0}".format(self.queueName)) if self.mode == "static": tmpLog.info("Static configuration") elif self.mode == "dynamic": tmpLog.info("Dynamic configuration") self.nNodes, self.walltimelimit = self.get_resources() self.nJobsPerWorker = self.nNodes * self.nJobsPerNode def _get_executable(self): # return a string with the body of the script for the scheduler: specific environment setup and the executor with its parameters exe_str = "" tmpLog = self.make_logger(baseLogger, method_name='_get_executable') # prepare static environment env_str = "" if self.env not in (None, "NULL"): env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", "))) # prepare executor try: if self.executor == "aprun": # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, self.nCorePerJob) exe_str += self.pilot else: exe_str = self.executor + " " + self.pilot if self.pilot_params: exe_str = " ".join([exe_str, self.pilot_params]) except Exception: tmpLog.error("Unable to build executor command, check configuration") exe_str = "" exe_str = "\n".join([env_str, exe_str]) tmpLog.debug("Shell script body: \n%s" % exe_str) return exe_str # make a worker from jobs def make_worker(self, jobspec_list, queue_config, resource_type): tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName), method_name='make_worker') tmpLog.info("Multi node worker preparation started.") tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit, self.nNodes)) workSpec = WorkSpec() workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode'] workSpec.minRamCount = 0 workSpec.maxDiskCount = 0 workSpec.maxWalltime = self.walltimelimit workSpec.workParams = self._get_executable() if len(jobspec_list) > 0: # push case: we know the jobs and set their parameters for jobSpec in jobspec_list: try: workSpec.minRamCount += jobSpec.jobParams['minRamCount'] except Exception: pass try: workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount'] except Exception: pass #try: # if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"): # workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime']) # else: # workSpec.maxWalltime = queue_config.walltimeLimit #except Exception: # pass tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(self.nNodes, workSpec.maxWalltime, self.nJobsPerWorker)) return workSpec # def get_num_jobs_per_worker(self, n_workers): # """ # Function to set 'size' of worker. 
Define number of jobs per worker # """ # tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName), # method_name='get_num_jobs_per_worker') # tmpLog.info("Get number of jobs per worker") # self.nJobsPerWorker = 1 # if self.mode == "static": # tmpLog.info("Static configuration") # self.nJobsPerWorker = self.nNodes * self.nJobsPerNode # elif self.mode == "dynamic": # tmpLog.info("Dynamic configuration") # self.nNodes, self.walltimelimit = self.get_resources() # self.nJobsPerWorker = self.nNodes * self.nJobsPerNode # # tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit)) # return self.nJobsPerWorker def get_resources(self): """ Function to get resources and map them to the number of jobs """ tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName), method_name='get_resources') walltime = self.walltimelimit queue_config = self.queue_config_mapper.get_queue(self.queueName) resource_utils = self.pluginFactory.get_plugin(queue_config.resource) if resource_utils: nodes, walltime = resource_utils.get_resources() else: tmpLog.info("Resource plugin is not defined") nodes = self.nNodes return nodes, walltime
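Both SAGAMonitor.deep_checkjob and MultiNodeWorkerMaker.get_resources consume the same optional per-queue resource plugin. A stub that satisfies the two call sites seen above; every value below is a placeholder:

class DummyResourceUtils(object):
    """Illustrative resource plugin; values are placeholders."""

    # used by MultiNodeWorkerMaker.get_resources
    def get_resources(self):
        # (number of available nodes, walltime limit in seconds)
        return 2, 3600

    # used by SAGAMonitor.deep_checkjob
    def get_batchjob_info(self, batchid):
        return {
            'status': 'finished',
            'nativeExitCode': 0,
            'nativeStatus': 'COMPLETED',
            'nativeExitMsg': '',
            'start_time': None,
            'finish_time': None,
        }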