def initializeJobManagerHandler(serviceInfo):
    """Create the module-level database handles for the JobManager service.

    :param serviceInfo: service description handed in by the caller; it is
        not read by this function.
    :return: ``S_OK()`` after the three handles have been assigned.
    """
    global gJobDB
    global gJobLoggingDB
    # NOTE: the lower-case "t" is the existing spelling of this global.
    global gtaskQueueDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()

    return S_OK()
class InputDataValidation(OptimizerExecutor):
    """Optimizer executor built on top of ``OptimizerExecutor``.

    The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
    """

    @classmethod
    def initializeOptimizer(cls):
        """Initialise the class-level state of the optimizer.

        Seeds the ``random`` module, creates two ``DictCache`` instances,
        connects to the JobDB, creates a ``SiteStatus`` helper and sets the
        "FailedStatus" option.

        :return: ``S_OK()`` on success; ``S_ERROR(...)`` when the JobDB module
            cannot be imported or when creating the ``JobDB`` instance raises
            ``RuntimeError``.
        """
        random.seed()
        cls.__SEStatus = DictCache.DictCache()
        cls.__sitesForSE = DictCache.DictCache()

        # "except X as name" is accepted by Python 2.6+ and Python 3, unlike
        # the comma form, which is a syntax error on Python 3.
        try:
            from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
        except ImportError as excp:
            return S_ERROR("Could not import JobDB: %s" % str(excp))

        try:
            cls.__jobDB = JobDB()
        except RuntimeError:
            return S_ERROR("Cannot connect to JobDB")

        cls.__siteStatus = SiteStatus()

        cls.ex_setOption("FailedStatus", "Input Data Not Available")
        return S_OK()
def initializeHandler(cls, serviceInfoDict):
    """Probe the JobDB connection and expose JobState methods on the handler.

    For every name in ``dir(JobState)``:
      - if the handler already defines ``export_<name>`` it is only logged
        (the wrapping code is commented out);
      - otherwise, if ``JobState`` also has ``right_<name>``, the handler gets
        ``auth_<name>``, ``types_<name>`` and ``export_<name>`` attributes, the
        latter bound to ``cls.__mimeticFunction``.

    :param serviceInfoDict: service description; not read by this method.
    :return: ``S_OK()``
    """
    cls.jobDB = JobDB()
    result = cls.jobDB._getConnection()
    if result['OK']:
        # The connection is only a connectivity probe, so release it at once.
        # Close it only on success: a failed result is assumed to carry no
        # connection under 'Value', so closing unconditionally would raise.
        result['Value'].close()
    else:
        cls.log.warn("Could not connect to JobDB (%s). Resorting to RPC" % result['Message'])

    # Try to do magic
    myStuff = dir(cls)
    jobStateStuff = dir(JobState)
    for method in jobStateStuff:
        if "export_%s" % method in myStuff:
            cls.log.info("Wrapping method %s. It's already defined in the Handler" % method)
            # defMeth = getattr( cls, "export_%s" % method )
            # setattr( cls, "_usr_def_%s" % method, defMeth )
            # setattr( cls, "types_%s" % method, [ ( types.IntType, types.LongType ), types.TupleType ] )
            # setattr( cls, "export_%s" % method, cls.__unwrapAndCall )
            continue
        elif 'right_%s' % method in jobStateStuff:
            cls.log.info("Mimicking method %s" % method)
            setattr(cls, "auth_%s" % method, ['all'])
            setattr(cls, "types_%s" % method, [(types.IntType, types.LongType), types.TupleType])
            setattr(cls, "export_%s" % method, cls.__mimeticFunction)
    return S_OK()
def initialize(self):
    """Create the JobDB handle and load the agent options.

    Reads the excluded production types, the batch sizes and the per-status
    removal delays (plain and "HB" variants) from the agent options.

    :return: ``S_OK()``
    """
    self.jobDB = JobDB()

    # Agent-level option wins; otherwise fall back to the Operations value.
    configuredTypes = self.am_getOption("ProductionTypes", [])
    self.prodTypes = configuredTypes or Operations().getValue(
        "Transformations/DataProcessing", ["MCSimulation", "Merge"])
    excluded = ", ".join(self.prodTypes)
    self.log.info(
        "Will exclude the following Production types from cleaning %s" % excluded)

    self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", self.maxJobsAtOnce)

    delayOptions = (
        (JobStatus.DONE, "RemoveStatusDelay/Done", 7),
        (JobStatus.KILLED, "RemoveStatusDelay/Killed", 7),
        (JobStatus.FAILED, "RemoveStatusDelay/Failed", 7),
        ("Any", "RemoveStatusDelay/Any", -1),
    )
    for status, optionName, default in delayOptions:
        self.removeStatusDelay[status] = self.am_getOption(optionName, default)

    delayHBOptions = (
        (JobStatus.DONE, "RemoveStatusDelayHB/Done"),
        (JobStatus.KILLED, "RemoveStatusDelayHB/Killed"),
        (JobStatus.FAILED, "RemoveStatusDelayHB/Failed"),
    )
    for status, optionName in delayHBOptions:
        self.removeStatusDelayHB[status] = self.am_getOption(optionName, -1)

    self.maxHBJobsAtOnce = self.am_getOption("MaxHBJobsAtOnce", 0)
    return S_OK()
def initializeJobMonitoringHandler(serviceInfo):
    """Create the module-level database handles for the JobMonitoring service.

    :param serviceInfo: service description; not read by this function.
    :return: ``S_OK()`` after ``jobDB``, ``jobLoggingDB`` and ``taskQueueDB``
        have been assigned.
    """
    global jobDB
    global jobLoggingDB
    global taskQueueDB

    jobDB = JobDB()
    jobLoggingDB = JobLoggingDB()
    taskQueueDB = TaskQueueDB()

    return S_OK()
def initialize(self):
    """Set the polling time, create the DB handles and load the agent options.

    :return: ``S_OK()``
    """
    self.am_setOption("PollingTime", 120)

    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )

    # Agent-level option wins; otherwise fall back to the Operations value.
    configuredTypes = self.am_getOption('ProductionTypes', [])
    self.prod_types = configuredTypes or Operations().getValue(
        'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    excluded = ', '.join(self.prod_types)
    gLogger.info(
        "Will exclude the following Production types from cleaning %s" % excluded)

    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
    self.jobByJob = self.am_getOption('JobByJob', False)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    delayOptions = (
        ('Done', 'RemoveStatusDelay/Done', 7),
        ('Killed', 'RemoveStatusDelay/Killed', 7),
        ('Failed', 'RemoveStatusDelay/Failed', 7),
        ('Any', 'RemoveStatusDelay/Any', -1),
    )
    for status, optionName, default in delayOptions:
        self.removeStatusDelay[status] = self.am_getOption(optionName, default)

    return S_OK()
def initialize(self, jobDB=False, logDB=False):
    """Initialise the optimizer agent.

    :param jobDB: job database handle to use; a falsy value makes the method
        create its own ``JobDB()``.
    :param logDB: logging database handle to use; a falsy value makes the
        method create its own ``JobLoggingDB()``.
    :return: whatever ``self.initializeOptimizer()`` returns.
    """
    self.jobDB = jobDB if jobDB else JobDB()
    self.logDB = logDB if logDB else JobLoggingDB()

    # The optimizer name is the agent name without a trailing "Agent".
    suffix = "Agent"
    name = self.am_getModuleParam('agentName')
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    self.am_setModuleParam('optimizerName', name)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.startingMajorStatus = "Checking"
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.requiredJobInfo = 'jdl'
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """Constructor.

    Each collaborator may be injected; a falsy argument makes the
    constructor create the default object instead.

    :param pilotAgentsDB: replaces ``PilotAgentsDB()`` when truthy.
    :param jobDB: replaces ``JobDB()`` when truthy.
    :param tqDB: replaces ``TaskQueueDB()`` when truthy.
    :param jlDB: replaces ``JobLoggingDB()`` when truthy.
    :param opsHelper: replaces ``Operations()`` when truthy.
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter shares the job database and operations helper chosen above.
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

    self.siteClient = SiteStatus()
def initializeJobStateUpdateHandler(serviceInfo):
    """Create the module-level database handles for the JobStateUpdate service.

    :param serviceInfo: service description; not read by this function.
    :return: ``S_OK()`` after ``jobDB`` and ``logDB`` have been assigned.
    """
    global jobDB, logDB

    jobDB = JobDB()
    logDB = JobLoggingDB()

    return S_OK()
def initializeJobMonitoringHandler(serviceInfo):
    """Create the module-level database handles for the JobMonitoring service.

    :param serviceInfo: service description; not read by this function.
    :return: ``S_OK()`` after ``gJobDB``, ``gJobLoggingDB`` and
        ``gTaskQueueDB`` have been assigned.
    """
    global gJobDB
    global gJobLoggingDB
    global gTaskQueueDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gTaskQueueDB = TaskQueueDB()

    return S_OK()
def initialize(self, jobDB=None, logDB=None):
    """Initialise the optimizer agent.

    :param jobDB: job database handle to use; ``None`` creates a ``JobDB()``.
    :param logDB: logging database handle to use; ``None`` creates a
        ``JobLoggingDB()``.
    :return: the result of ``self.initializeOptimizer()``, or an error
        structure when the Elastic job-parameters backend cannot be loaded or
        its construction raises ``RuntimeError``.
    """
    self.jobDB = jobDB if jobDB is not None else JobDB()
    if not self.jobDB.isValid():
        dExit(1)

    # Optional Elastic backend, switched on through the Operations flag.
    if Operations().getValue("/Services/JobMonitoring/useESForJobParametersFlag", False):
        try:
            loaded = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.ElasticJobParametersDB",
                "ElasticJobParametersDB")
            if not loaded["OK"]:
                return loaded
            self.elasticJobParametersDB = loaded["Value"]()
        except RuntimeError as excp:
            return S_ERROR("Can't connect to DB: %s" % excp)

    self.logDB = logDB if logDB is not None else JobLoggingDB()

    # The optimizer name is the agent name without a trailing "Agent".
    suffix = "Agent"
    name = self.am_getModuleParam("agentName")
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    self.am_setModuleParam("optimizerName", name)

    self.startingMinorStatus = self.am_getModuleParam("optimizerName")
    self.failedStatus = self.am_getOption("FailedJobStatus", JobStatus.FAILED)
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
def __jobStatePath(self, jid, section, jobState):
    """Build the list of extra path entries required by a job's manifest.

    An entry read from ``<section>/AncestorFiles`` is added when the manifest
    has a non-empty "AncestorDepth"; ``'InputData'`` is added when the
    manifest has non-empty "InputData" and data scheduling is not disabled.
    When data scheduling is disabled the job's input data is reset in the
    JobDB instead.

    :param jid: job identifier, used for logging and for the JobDB reset.
    :param section: configuration section holding the "AncestorFiles" option.
    :param jobState: object providing ``getManifest()``.
    :return: ``S_OK(path)`` on success; the failed ``getManifest()`` result,
        or ``S_ERROR`` when the input data reset fails.
    """
    manifestResult = jobState.getManifest()
    if not manifestResult['OK']:
        return manifestResult
    manifest = manifestResult['Value']

    path = []

    # "Unknown" is treated the same as an empty value.
    ancestorDepth = manifest.getOption('AncestorDepth', '')
    ancestorDepth = ancestorDepth.replace('Unknown', '')
    if ancestorDepth:
        self.log.info('Job %s has specified ancestor depth' % (jid))
        path.append(gConfig.getValue('%s/AncestorFiles' % section, 'AncestorFiles'))

    inputData = manifest.getOption("InputData", '')
    inputData = inputData.replace('Unknown', '')
    if inputData:
        if manifest.getOption('DisableDataScheduling', False):
            self.log.info('Job %s has input data requirement but scheduling via input data is disabled' % (jid))
            resetResult = JobDB().setInputData(jid, [])
            if not resetResult['OK']:
                self.log.error(resetResult)
                return S_ERROR('Could not reset input data to null')
        else:
            self.log.info('Job %s has input data requirement' % (jid))
            path.append('InputData')

    if not path:
        self.log.info('No LHCb specific optimizers to be added')

    return S_OK(path)
def checkDBAccess(cls):
    """Create the shared DB handles on ``JobState.__db`` the first time through.

    Does nothing once ``JobState.__db.checked`` is true.
    """
    # Init DB if there
    if JobState.__db.checked:
        return

    JobState.__db.jobDB = JobDB()
    JobState.__db.logDB = JobLoggingDB()
    JobState.__db.tqDB = TaskQueueDB()
    # Set the flag last, after all three handles exist.
    JobState.__db.checked = True
def initializeMatcherHandler(serviceInfo):
    """Matcher service initialization.

    Creates the module-level database handles, registers the matching
    activities with ``gMonitor``, recalculates the task queue shares once and
    schedules the periodic tasks.

    :param serviceInfo: service description; not read by this function.
    :return: ``S_OK()``
    """
    global gJobDB, gTaskQueueDB, jlDB, pilotAgentsDB

    gJobDB = JobDB()
    gTaskQueueDB = TaskQueueDB()
    jlDB = JobLoggingDB()
    pilotAgentsDB = PilotAgentsDB()

    # (name, description, unit, operation); category and bucket length are
    # the same for every activity.
    activities = (
        ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
        ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
        ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
        ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN),
    )
    for name, description, unit, operation in activities:
        gMonitor.registerActivity(name, description, 'Matching', unit, operation, 300)

    # Run the share recalculation once now, then every 120 seconds.
    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, gTaskQueueDB.recalculateTQSharesForAll)

    # Schedule the task-queue count report every 60 seconds and send one now.
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)
    sendNumTaskQueues()

    return S_OK()
def cleanTaskQueues():
    """Enable all task queues and reschedule the orphan jobs found in them.

    Each orphan job is deleted from the task queue DB, rescheduled in the
    JobDB and given a logging record. A failure in any step is logged and the
    remaining steps for that job are skipped; other jobs are still processed.

    :return: ``S_OK()`` once all orphans were visited, or the failed result of
        ``enableAllTaskQueues()`` / ``findOrphanJobs()``.
    """
    tqDB = TaskQueueDB()
    jobDB = JobDB()
    logDB = JobLoggingDB()

    enabled = tqDB.enableAllTaskQueues()
    if not enabled['OK']:
        return enabled

    orphans = tqDB.findOrphanJobs()
    if not orphans['OK']:
        return orphans

    for jid in orphans['Value']:
        deleted = tqDB.deleteJob(jid)
        if not deleted['OK']:
            gLogger.error("Cannot delete from TQ job %s" % jid, deleted['Message'])
            continue

        rescheduled = jobDB.rescheduleJob(jid)
        if not rescheduled['OK']:
            gLogger.error("Cannot reschedule in JobDB job %s" % jid, rescheduled['Message'])
            continue

        logged = logDB.addLoggingRecord(jid, JobStatus.RECEIVED, "", "", source="JobState")
        if not logged['OK']:
            gLogger.error("Cannot add logging record in JobLoggingDB %s" % jid, logged['Message'])

    return S_OK()
def initialize(self):
    """Create the DB handles, reset the optimizer registry and set polling.

    :return: ``S_OK()``
    """
    self.jobDB = JobDB()
    self.jobLoggingDB = JobLoggingDB()

    # Starts empty; nothing in this method fills it.
    self._optimizers = {}

    self.am_setOption("PollingTime", 30)

    return S_OK()
def initializeTaskManagerHandler(serviceInfo):
    """Create the module-level database handles for the TaskManager service.

    :param serviceInfo: service description; not read by this function.
    :return: ``S_OK()`` after ``gTaskDB`` and ``gJobDB`` have been assigned.
    """
    global gTaskDB
    global gJobDB

    gTaskDB = TaskDB()
    gJobDB = JobDB()

    return S_OK()
def initialize(self):
    """Create the DB handles, load the agent options and start the thread pool.

    The stalled and failed thresholds are computed as the watchdog cycle
    length multiplied by the configured value plus half a cycle.

    :return: ``S_OK()`` on success; ``S_ERROR`` when the WorkloadManagement
        system instance cannot be determined.
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()

    # getting parameters
    enabled = self.am_getOption("Enable", True)
    if not enabled:
        self.log.info("Stalled Job Agent running in disabled mode")

    wms_instance = getSystemInstance("WorkloadManagement")
    if not wms_instance:
        return S_ERROR("Can not get the WorkloadManagement system instance")

    self.stalledJobsTolerantSites = self.am_getOption("StalledJobsTolerantSites", [])
    self.stalledJobsToleranceTime = self.am_getOption("StalledJobsToleranceTime", 0)
    self.stalledJobsToRescheduleSites = self.am_getOption("StalledJobsToRescheduleSites", [])

    self.submittingTime = self.am_getOption("SubmittingTime", self.submittingTime)
    self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
    self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)

    jobWrapperSection = cfgPath("Systems", "WorkloadManagement", wms_instance, "JobWrapper")

    failedCycles = self.am_getOption("FailedTimeHours", 6)

    # The watchdog cycle is never shorter than the configured minimum.
    checkingTime = gConfig.getValue(cfgPath(jobWrapperSection, "CheckingTime"), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(jobWrapperSection, "MinCheckingTime"), 20 * 60)
    cycleLength = max(checkingTime, minCheckingTime)

    stalledCycles = self.am_getOption("StalledTimeHours", 2)
    self.log.verbose("", "StalledTime = %s cycles" % (stalledCycles))
    self.stalledTime = int(cycleLength * (stalledCycles + 0.5))

    self.log.verbose("", "FailedTime = %s cycles" % (failedCycles))
    # Add half cycle to avoid race conditions
    self.failedTime = int(cycleLength * (failedCycles + 0.5))

    stallingMessage = "Stalling for more than %d sec" % self.failedTime
    self.minorStalledStatuses = (JobMinorStatus.STALLED_PILOT_NOT_RUNNING, stallingMessage)

    # setting up the threading
    threadCount = self.am_getOption("MaxNumberOfThreads", 15)
    self.log.verbose("Multithreaded with %d threads" % threadCount)
    self.threadPoolExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=threadCount)

    return S_OK()
def doNew(self, masterParams=None):
    """Compute per-host job efficiency over the last 24 hours and store it.

    Counts jobs per host name and status from the JobDB, keeps only the
    'Done' and 'Failed' counts and derives an efficiency percentage with one
    decimal. Hosts named in ``masterParams`` that produced no such record are
    removed through ``deleteWorkNodeCache``.

    :param masterParams: list of known host names, or ``None``. A passed list
        is modified in place: every host that has a record is removed from it.
    :return: ``S_OK(list of host dicts)`` on success, otherwise the failed
        result of the query, the cache deletion or the store step.
    """
    # With the default masterParams=None, hosts.remove() raised an uncaught
    # AttributeError and len(hosts) a TypeError; fall back to "no known hosts".
    hosts = masterParams if masterParams is not None else []

    # NOTE(review): J.Site is selected but not part of GROUP BY — confirm the
    # database accepts this (e.g. MySQL with ONLY_FULL_GROUP_BY disabled).
    sql = """ select JP.Value, J.Status, J.Site, count(*) from Jobs J, JobParameters JP where J.JobID = JP.JobID and JP.Name = 'HostName' and J.EndExecTime >= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR) group by JP.Value, J.Status """

    jobDB = JobDB()
    queryRes = jobDB._query(sql)
    if not queryRes['OK']:
        return queryRes
    records = queryRes['Value']

    # host name -> {'Site', 'Done', 'Failed'}; other statuses are ignored.
    hostJobs = {}
    for record in records:
        hostName = record[0]
        status = record[1]
        if status not in ('Done', 'Failed'):
            continue
        if hostName not in hostJobs:
            hostJobs[hostName] = {'Site': record[2], 'Done': 0, 'Failed': 0}
        hostJobs[hostName][status] = record[3]

    uniformResult = []
    for host, hostDict in hostJobs.items():
        hostDict['Host'] = host

        # Hosts left in the list afterwards are the ones without a record.
        try:
            hosts.remove(host)
        except ValueError:
            pass

        done = hostDict['Done']
        failed = hostDict['Failed']
        if done == 0 and failed == 0:
            hostDict['Efficiency'] = 0.0
        else:
            # Percentage truncated (not rounded) to one decimal place.
            hostDict['Efficiency'] = math.floor(float(done) / (done + failed) * 1000) / 10
        uniformResult.append(hostDict)

    if len(hosts) != 0:
        deleteRes = self.rmIHEPClient.deleteWorkNodeCache(host=hosts)
        if not deleteRes['OK']:
            return deleteRes

    storeRes = self._storeCommand(uniformResult)
    if not storeRes['OK']:
        return storeRes

    return S_OK(uniformResult)
def initialize(self):
    """Create the DB handles and set the polling time to one hour.

    :return: ``S_OK()``
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()

    self.am_setOption('PollingTime', 60 * 60)

    enabled = self.am_getOption('Enable', True)
    if not enabled:
        self.log.info('Stalled Job Agent running in disabled mode')

    return S_OK()
def execute(self):
    """Main execution method.

    Takes a summary snapshot from the JobDB, turns every row into a record
    dictionary, hands it to each configured datastore backend and finally
    commits every backend.

    :return: ``S_OK()`` on success; ``S_ERROR()`` when the snapshot cannot be
        obtained or a backend commit fails.
    """
    # Get the WMS Snapshot!
    snapshot = JobDB().getSummarySnapshot(self.__jobDBFields)
    now = Time.dateTime()
    if not snapshot['OK']:
        self.log.error(
            "Can't get the JobDB summary",
            "%s: won't commit at this cycle" % snapshot['Message'])
        return S_ERROR()

    # Now we try to commit
    rows = snapshot['Value'][1]
    self.log.info("Start sending records")
    for row in rows:
        # The first column of each row is dropped.
        row = row[1:]

        # Start from the fixed fields, then add the key columns (renamed
        # where a mapping exists) and the integer value columns.
        rD = {fV[0]: fV[1] for fV in self.__summaryDefinedFields}
        for iP, fieldName in enumerate(self.__summaryKeyFieldsMapping):
            rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = row[iP]
        row = row[len(self.__summaryKeyFieldsMapping):]
        for iP, valueField in enumerate(self.__summaryValueFieldsMapping):
            rD[valueField] = int(row[iP])

        for backend in self.datastores:
            backendKind = backend.lower()
            if backendKind == 'monitoring':
                rD['timestamp'] = int(Time.toEpoch(now))
                self.datastores['Monitoring'].addRecord(rD)
            elif backendKind == 'accounting':
                acWMS = WMSHistory()
                acWMS.setStartTime(now)
                acWMS.setEndTime(now)
                acWMS.setValuesFromDict(rD)
                retVal = acWMS.checkValues()
                if retVal['OK']:
                    self.datastores['Accounting'].addRegister(acWMS)
                else:
                    self.log.error("Invalid accounting record ", "%s -> %s" % (retVal['Message'], rD))

    for backend, datastore in self.datastores.items():
        self.log.info("Committing to %s backend" % backend)
        committed = datastore.commit()
        if not committed['OK']:
            self.log.error("Couldn't commit WMS history to %s" % backend, committed['Message'])
            return S_ERROR()
        self.log.verbose("Done committing to %s backend" % backend)

    return S_OK()
def setUp(self):
    """Create a ``JobDB`` instance whose constructor and DB calls are mocked."""

    def fakeInit(instance):
        # Stand-in for JobDB.__init__: sets only these three attributes.
        instance.log = MagicMock()
        instance.logger = MagicMock()
        instance._connected = True

    from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB

    initTarget = MODULE_NAME + ".JobDB.__init__"
    with patch(initTarget, new=fakeInit):
        self.jobDB = JobDB()

    # Replace the low-level DB helpers with mocks.
    self.jobDB._query = MagicMock(name="Query")
    self.jobDB._escapeString = MagicMock(return_value=S_OK())
def initialize(self):
    """Set the default agent options and create the helper objects.

    :return: ``S_OK()``
    """
    defaults = (
        ('PollingTime', 120),
        ('GridEnv', ''),
        ('PilotStalledDays', 3),
    )
    for optionName, value in defaults:
        self.am_setOption(optionName, value)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    return S_OK()
def initializeWMSAdministratorHandler(serviceInfo):
    """WMS Administrator service initialization.

    :param serviceInfo: service description; not read by this function.
    :return: ``S_OK()`` after ``jobDB`` and ``taskQueueDB`` have been assigned.
    """
    global jobDB, taskQueueDB

    jobDB = JobDB()
    taskQueueDB = TaskQueueDB()

    return S_OK()
def initializeHandler(cls, svcInfoDict):
    """Create the class-level DB handles, optionally with the Elastic backend.

    ``cls.gElasticJobParametersDB`` stays ``None`` unless the Operations flag
    ``useESForJobParametersFlag`` is set.

    :param svcInfoDict: service description; not read by this method.
    :return: ``S_OK()``
    """
    cls.gJobDB = JobDB()
    cls.gJobLoggingDB = JobLoggingDB()
    cls.gTaskQueueDB = TaskQueueDB()

    cls.gElasticJobParametersDB = None
    if Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False):
        cls.gElasticJobParametersDB = ElasticJobParametersDB()

    return S_OK()
def __init__(self, args=None, clients=None):
    """Constructor.

    Takes the JobDB and ResourceManagementIHEPClient objects from
    ``self.apis`` when present there, and creates default ones otherwise.

    :param args: forwarded unchanged to the parent constructor.
    :param clients: forwarded unchanged to the parent constructor.
    """
    super(WorkNodeIHEPCommand, self).__init__(args, clients)

    self.jobDB = self.apis['JobDB'] if 'JobDB' in self.apis else JobDB()

    clientKey = 'ResourceManagementIHEPClient'
    self.rmIHEPClient = (
        self.apis['ResourceManagementIHEPClient']
        if clientKey in self.apis
        else ResourceManagementIHEPClient()
    )
def initializeHandler(cls, svcInfoDict):
    """WMS Administrator service initialization.

    ``cls.elasticJobParametersDB`` stays ``None`` unless the Operations flag
    ``useESForJobParametersFlag`` is set.

    :param svcInfoDict: service description; not read by this method.
    :return: ``S_OK()``
    """
    cls.jobDB = JobDB()
    cls.taskQueueDB = TaskQueueDB()

    cls.elasticJobParametersDB = None
    if Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False):
        cls.elasticJobParametersDB = ElasticJobParametersDB()

    return S_OK()
def initialize(self):
    """Create the helper objects and read the pilot clearing delays.

    :return: ``S_OK()``
    """
    self.am_setOption("GridEnv", "")

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    clearDelay = self.am_getOption("ClearPilotsDelay", 30)
    self.clearPilotsDelay = clearDelay
    abortedDelay = self.am_getOption("ClearAbortedPilotsDelay", 7)
    self.clearAbortedDelay = abortedDelay

    self.pilots = PilotManagerClient()

    return S_OK()
def initializeHandler(cls, svcInfoDict):
    """Determine the switching of ElasticSearch and MySQL backends.

    ``cls.elasticJobParametersDB`` stays ``None`` unless the Operations flag
    ``useESForJobParametersFlag`` is set.

    :param svcInfoDict: service description; not read by this method.
    :return: ``S_OK()``
    """
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()

    cls.elasticJobParametersDB = None
    if Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False):
        cls.elasticJobParametersDB = ElasticJobParametersDB()

    return S_OK()
def __updateJobStatus(self, jobID, status, minorstatus=None):
    """Update the job status in the JobDB and add a logging record.

    FIXME: Use the JobStateUpdate service instead of the JobDB

    :param jobID: identifier of the job to update.
    :param status: new value for the 'Status' attribute.
    :param minorstatus: new value for 'MinorStatus'; when ``None`` the
        current minor status is read back and reused in the logging record.
    :return: ``S_OK('DisabledMode')`` when ``self.enabled`` is false,
        otherwise the result of adding the logging record.
    :raises RuntimeError: when the status cannot be set, or when the current
        minor status cannot be read.
    """
    self.log.verbose(
        "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (jobID, status))

    # Leave before touching the database: disabled mode changes nothing, so
    # it should not open a JobDB connection either.
    if not self.enabled:
        return S_OK('DisabledMode')

    from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
    jobDB = JobDB()

    result = jobDB.setJobAttribute(jobID, 'Status', status, update=True)
    if not result['OK']:
        self.log.error("Failed to update job status", result['Message'])
        raise RuntimeError("Failed to update job status")

    if minorstatus is None:
        # Retain last minor status for stalled jobs
        result = jobDB.getJobAttributes(jobID, ['MinorStatus'])
        if not result['OK']:
            self.log.error("Failed to get Minor Status", result['Message'])
            raise RuntimeError("Failed to get Minorstatus")
        minorstatus = result['Value']['MinorStatus']
    else:
        self.log.verbose(
            "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (jobID, minorstatus))
        result = jobDB.setJobAttribute(jobID, 'MinorStatus', minorstatus, update=True)
        if not result['OK']:
            # This result used to be overwritten without being looked at.
            # Report the failure, but do not raise: this step never raised.
            self.log.error("Failed to update job minor status", result['Message'])

    logStatus = status
    from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
    result = JobLoggingDB().addLoggingRecord(
        jobID, status=logStatus, minor=minorstatus, source='DataRecoveryAgent')
    if not result['OK']:
        # just the logging entry, no big loss so no exception
        self.log.warn(result)

    return result