def initialize(self):
    """Prepare the agent for running.

    Sets the ``PollingTime`` agent option to 60 and attaches the two service
    clients used later on: ``self.ovc`` (an ``OverlaySystemClient``) and
    ``self.jobmon`` (a ``JobMonitoringClient``).

    :returns: the result of ``S_OK()``
    """
    polling_time = 60
    self.am_setOption("PollingTime", polling_time)

    # Clients are created once here and kept on the instance.
    self.ovc = OverlaySystemClient()
    self.jobmon = JobMonitoringClient()

    return S_OK()
def initialize(self):
    """Configure the agent and create its service clients.

    The ``PollingTime`` option is set to 60; an ``OverlaySystemClient`` is
    stored as ``self.ovc`` and a ``JobMonitoringClient`` as ``self.jobmon``.

    :returns: the result of ``S_OK()``
    """
    option_name, option_value = "PollingTime", 60
    self.am_setOption(option_name, option_value)

    overlay_client = OverlaySystemClient()
    self.ovc = overlay_client
    monitoring_client = JobMonitoringClient()
    self.jobmon = monitoring_client

    return S_OK()
class ResetCounters(AgentModule):
    """Agent that periodically rewrites the per-site job counters.

    Some sites are not updated properly, so from time to time the number of
    jobs registered for each site has to be restored. The value does not
    have to be exact; it only has to clear some of the stale jobs.
    """

    def initialize(self):
        """Set the ``PollingTime`` option and create the service clients.

        :returns: the result of ``S_OK()``
        """
        self.am_setOption("PollingTime", 60)
        self.ovc = OverlaySystemClient()
        self.jobmon = JobMonitoringClient()
        return S_OK()

    def execute(self):
        """Run one agent cycle (called by the Agent Reactor).

        Asks ``self.ovc`` for the list of sites, queries ``self.jobmon`` for
        the job counters of jobs at each site whose application status is
        'Getting overlay files', and pushes the resulting ``Running`` counts
        back through ``self.ovc.setJobsAtSites``.

        :returns: ``S_OK()`` on success; otherwise the failed result returned
                  by ``getSites`` or ``setJobsAtSites``
        """
        sites_result = self.ovc.getSites()
        if not sites_result['OK']:
            return sites_result

        site_names = sites_result['Value']
        gLogger.info("Will update info for sites %s" % site_names)

        running_per_site = {}
        for site_name in site_names:
            selection = {"Site": site_name,
                         "ApplicationStatus": 'Getting overlay files'}
            counters = self.jobmon.getCurrentJobCounters(selection)
            if counters['OK']:
                values = counters['Value']
                # A site with no 'Running' entry is recorded as zero.
                running_per_site[site_name] = values['Running'] if 'Running' in values else 0
            # Sites whose query failed are left out of the update.

        gLogger.info("Setting new values %s" % running_per_site)
        update_result = self.ovc.setJobsAtSites(running_per_site)
        if update_result['OK']:
            return S_OK()

        gLogger.error(update_result['Message'])
        return update_result
class ResetCounters(AgentModule):
    """Reset the number of jobs at all sites.

    Some sites are not updated properly, so once in a while it's needed to
    restore the correct number of jobs. It does not need to be exact, but
    enough to clear some of the jobs.
    """

    def initialize(self):
        """Initialize the agent.

        Sets the ``PollingTime`` option to 60 and creates the
        ``OverlaySystemClient`` (``self.ovc``) and ``JobMonitoringClient``
        (``self.jobmon``) used by :meth:`execute`.

        :returns: the result of ``S_OK()``
        """
        self.am_setOption("PollingTime", 60)
        self.ovc = OverlaySystemClient()
        self.jobmon = JobMonitoringClient()
        return S_OK()

    def execute(self):
        """Run one agent cycle; this is called by the Agent Reactor.

        For every site returned by ``self.ovc.getSites()`` the job counters of
        jobs with application status 'Getting overlay files' are requested,
        and the ``Running`` count (0 when absent) is sent back through
        ``self.ovc.setJobsAtSites``.

        :returns: ``S_OK()`` on success; otherwise the failed result returned
                  by ``getSites`` or ``setJobsAtSites``
        """
        res = self.ovc.getSites()
        if not res['OK']:
            return res
        sitedict = {}
        sites = res['Value']
        gLogger.info("Will update info for sites %s" % sites)
        for site in sites:
            attribdict = {"Site": site, "ApplicationStatus": 'Getting overlay files'}
            res = self.jobmon.getCurrentJobCounters(attribdict)
            if not res['OK']:
                # Best effort: a site whose counters cannot be read is simply
                # left out of this update.
                continue
            # dict.has_key() no longer exists in Python 3; the ``in`` operator
            # works on both Python 2 and Python 3.
            if 'Running' in res['Value']:
                sitedict[site] = res['Value']['Running']
            else:
                sitedict[site] = 0
        gLogger.info("Setting new values %s" % sitedict)
        res = self.ovc.setJobsAtSites(sitedict)
        if not res['OK']:
            gLogger.error(res['Message'])
            return res

        return S_OK()
def __getFilesLocaly(self):
    """Download the overlay (background) files into a local directory.

    Steps, in order:

    1. Work out how many files are wanted from ``self.BXOverlay``,
       ``self.ggtohadint``, the number of signal events per job and
       ``self.nbofeventsperfile``, capped by the ``MaxNbFilesToGet`` option
       (default 20).
    2. Poll ``OverlaySystemClient.canRun(self.site)`` once a minute until it
       returns a true value. Give up after more than 10 consecutive failed
       calls, or after more than 300 negative answers (5 hours).
    3. Create ``./overlayinput_<metaEventType>``, change into it and fetch
       randomly chosen entries of ``self.lfns`` until enough files were
       obtained, more than ``/Overlay/MaxFailedAllowed`` (default 20)
       downloads failed, or every entry has been tried.
    4. Remove leftover ``*.sh`` files, log the directory content, change back
       to ``self.curdir`` and call ``jobDone(self.site)``.

    If every entry of ``self.lfns`` has been tried and at least one file was
    obtained, the method carries on with the files it has and logs a warning;
    if none was obtained it fails.

    :returns: ``S_OK()`` when files were obtained, ``S_ERROR(message)``
              otherwise
    """
    numberofeventstoget = ceil(self.BXOverlay * self.ggtohadint)
    nbfiles = len(self.lfns)
    availableevents = nbfiles * self.nbofeventsperfile
    if availableevents < numberofeventstoget:
        return S_ERROR("Number of %s events available is less than requested" % (self.BkgEvtType))

    if not self.NbSigEvtsPerJob:
        # Compute Nsignal events
        self.NbSigEvtsPerJob = self.nbinputsigfile * self.nbsigeventsperfile
    if not self.NbSigEvtsPerJob:
        return S_ERROR('Could not determine the number of signal events per job')
    LOG.verbose("There are %s signal event" % self.NbSigEvtsPerJob)

    # Now determine how many files are needed to cover all signal events
    totnboffilestoget = int(ceil(self.NbSigEvtsPerJob * numberofeventstoget / self.nbofeventsperfile))

    # Limit ourself to some configuration maximum
    levels = [self.machine, self.energytouse, self.detectormodel, self.BkgEvtType]
    maxNbFilesToGet = getOptionValue(ops=self.ops, basePath="/Overlay", optionName="MaxNbFilesToGet",
                                     defaultValue=20, levels=levels)
    if totnboffilestoget > maxNbFilesToGet:
        totnboffilestoget = maxNbFilesToGet

    self.__disableWatchDog()
    overlaymon = OverlaySystemClient()

    # Now need to check that there are not that many concurrent jobs getting
    # the overlay at the same time
    error_count = 0
    count = 0
    while True:
        if error_count > 10:
            LOG.error('OverlayDB returned too many errors')
            return S_ERROR('Failed to get number of concurrent overlay jobs')
        res = overlaymon.canRun(self.site)
        if not res['OK']:
            error_count += 1
            time.sleep(60)
            continue
        # Only consecutive failures count towards the limit.
        error_count = 0
        if res['Value']:
            break
        count += 1
        if count > 300:
            return S_ERROR("Waited too long: 5h, so marking job as failed")
        if count % 10 == 0:
            self.setApplicationStatus("Overlay standby number %s" % count)
        time.sleep(60)

    self.__enableWatchDog()

    self.setApplicationStatus('Getting overlay files')

    LOG.info('Will obtain %s files for overlay' % totnboffilestoget)

    os.mkdir("./overlayinput_" + self.metaEventType)
    os.chdir("./overlayinput_" + self.metaEventType)
    filesobtained = []
    fail = False
    fail_count = 0

    max_fail_allowed = self.ops.getValue("/Overlay/MaxFailedAllowed", 20)

    # Indices of self.lfns not tried yet, in random order. Drawing from this
    # pool, rather than re-rolling a random index until an unused one turns
    # up, guarantees the loop terminates once every file has been tried.
    untried = list(range(nbfiles))
    random.shuffle(untried)

    while len(filesobtained) != totnboffilestoget:
        if fail_count > max_fail_allowed:
            fail = True
            break

        if not untried:
            # Every candidate has been tried. If no file could be obtained,
            # need to make sure the job fails; otherwise go on with what is
            # there.
            if not filesobtained:
                fail = True
            else:
                LOG.warn("Only %s of the %s requested overlay files could be obtained"
                         % (len(filesobtained), totnboffilestoget))
            break

        fileindex = untried.pop()
        lfn = self.lfns[fileindex]

        triedDataManager = False

        if self.site == 'LCG.CERN.ch':
            res = self.getEOSFile(lfn)
        elif self.site == 'LCG.IN2P3-CC.fr':
            res = self.getLyonFile(lfn)
        elif self.site == 'LCG.UKI-LT2-IC-HEP.uk':
            res = self.getImperialFile(lfn)
        elif self.site == 'LCG.RAL-LCG2.uk':
            res = self.getRALFile(lfn)
        elif self.site == 'LCG.KEK.jp':
            res = self.getKEKFile(lfn)
        else:
            self.__disableWatchDog()
            res = self.datMan.getFile(lfn)
            triedDataManager = True

        # In case the specific copying did not work (mostly because the files
        # do not exist locally) try again to get the file via the DataManager
        if (not res['OK']) and (not triedDataManager):
            res = self.datMan.getFile(lfn)

        if not res['OK']:
            LOG.warn('Could not obtain %s' % lfn)
            fail_count += 1
            # Wait for a random time around 3 minutes
            LOG.verbose("Waste happily some CPU time (on average 3 minutes)")
            resWaste = wasteCPUCycles(60 * random.gauss(3, 0.1))
            if not resWaste['OK']:
                LOG.error("Could not waste as much CPU time as wanted, but whatever!")
            continue

        filesobtained.append(lfn)

    # Remove all scripts remaining
    for script in glob.glob("*.sh"):
        os.remove(script)

    # Print the file list
    mylist = os.listdir(os.getcwd())
    LOG.info("List of Overlay files:")
    LOG.info("\n".join(mylist))
    os.chdir(self.curdir)
    res = overlaymon.jobDone(self.site)
    if not res['OK']:
        LOG.error("Could not declare the job as finished getting the files")
    if fail:
        LOG.error("Did not manage to get all files needed, too many errors")
        return S_ERROR("Failed to get files")
    LOG.info('Got all files needed.')
    return S_OK()