def initSites():
  ''' Initializes site statuses, taking their values from the "SiteMask"
      table of the "JobDB" database.
  '''
  jobClient = JobDB()
  rssClient = ResourceStatusDB()

  sites = jobClient.getAllSiteMaskStatus()
  if not sites['OK']:
    subLogger.error(sites['Message'])
    DIRACExit(1)

  for site, elements in sites['Value'].iteritems():
    table = {'table': 'SiteStatus'}
    insert = {'Status': elements[0],
              'Reason': 'Synchronized',
              'Name': site,
              'DateEffective': elements[1],
              # assumption: the original's undefined name `Datetime` was meant
              # to be a concrete timestamp (from datetime import datetime)
              'TokenExpiration': datetime.utcnow(),
              'ElementType': 'Site',
              'StatusType': 'all',
              'LastCheckTime': None,
              'TokenOwner': elements[2]}

    result = rssClient.addIfNotThere(insert, table)
    if not result['OK']:
      subLogger.error(result['Message'])
      DIRACExit(1)

  return S_OK()
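For context, the snippet relies on DIRAC's result-dictionary convention rather than exceptions; a minimal sketch of that convention (the names match DIRAC.Core.Utilities.ReturnValues, the sample site record is invented):

def S_OK(value=None):
    return {'OK': True, 'Value': value}

def S_ERROR(message=''):
    return {'OK': False, 'Message': message}

# getAllSiteMaskStatus() is expected to return, per site, a tuple whose first
# three elements are (status, dateEffective, author) -- the indices
# elements[0], elements[1] and elements[2] read above.
sites = S_OK({'LCG.Example.org': ('Active', '2020-01-01 00:00:00', 'rs_svc')})
assert sites['OK'] and 'LCG.Example.org' in sites['Value']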
def doNew(self, masterParams=None):
    hosts = masterParams

    sql = """
        select JP.Value, J.Status, J.Site, count(*)
        from Jobs J, JobParameters JP
        where J.JobID = JP.JobID and JP.Name = 'HostName'
        and J.EndExecTime >= DATE_SUB(UTC_TIMESTAMP(), INTERVAL 24 HOUR)
        group by JP.Value, J.Status
        """
    jobDB = JobDB()
    queryRes = jobDB._query(sql)
    if not queryRes["OK"]:
        return queryRes
    records = queryRes["Value"]

    hostJobs = {}
    for record in records:
        hostName = record[0]
        status = record[1]
        if status != "Done" and status != "Failed":
            continue
        if hostName not in hostJobs:
            hostJobs[hostName] = {"Site": record[2], "Done": 0, "Failed": 0}
        hostJobs[hostName][status] = record[3]

    uniformResult = []
    for host, hostDict in hostJobs.items():
        hostDict["Host"] = host
        try:
            hosts.remove(host)
        except ValueError:
            pass
        if hostDict["Done"] == 0 and hostDict["Failed"] == 0:
            hostDict["Efficiency"] = 0.0
        else:
            # Efficiency as a percentage, truncated to one decimal place
            hostDict["Efficiency"] = math.floor(
                float(hostDict["Done"]) / (hostDict["Done"] + hostDict["Failed"]) * 1000) / 10
        uniformResult.append(hostDict)

    # Hosts that reported no finished jobs in the last 24 hours are dropped from the cache
    if len(hosts) != 0:
        deleteRes = self.rmClient.deleteWorkNodeCache(host=hosts)
        if not deleteRes["OK"]:
            return deleteRes

    storeRes = self._storeCommand(uniformResult)
    if not storeRes["OK"]:
        return storeRes

    return S_OK(uniformResult)
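The `math.floor(... * 1000) / 10` expression truncates (rather than rounds) the success rate to one decimal place of a percentage; a quick check of the arithmetic:

import math

done, failed = 7, 2
raw = float(done) / (done + failed)        # 0.7777...
efficiency = math.floor(raw * 1000) / 10   # 77.7, truncated percent
assert efficiency == 77.7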
def mysql_querry(querry):
    if USE_PURE_MYSQL:
        import MySQLdb  # @UnresolvedImport
        db = MySQLdb.connect(host="diracdb2.ihep.ac.cn",  # your host, usually localhost
                             user="******",               # your username
                             passwd="###DIRAC_DB_PASS###",  # your password
                             db="JobDB")
        cur = db.cursor()
        cur.execute(querry)
        data = cur.fetchall()
        cur.close()
        return data
    else:
        db = JobDB()
        return db._query(querry)['Value']
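A usage sketch for `mysql_querry` (the function name's spelling is kept as defined above, since other snippets in this section call it by that name; the query is illustrative):

# Both branches return rows of the same tuple shape, either straight from
# MySQLdb or from JobDB._query()['Value'].
rows = mysql_querry("select Status, count(*) from Jobs group by Status")
statusCounts = dict((row[0], int(row[1])) for row in rows)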
class JobDBTestCase(unittest.TestCase):
  """ Base class for the JobDB test cases
  """

  def setUp(self):
    gLogger.setLevel('DEBUG')
    self.jobDB = JobDB()

  def tearDown(self):
    result = self.jobDB.selectJobs({})
    self.assert_(result['OK'], 'Status after selectJobs')
    jobs = result['Value']
    for job in jobs:
      result = self.jobDB.removeJobFromDB(job)
      self.assert_(result['OK'])
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
  """ c'tor
  """
  if pilotAgentsDB:
    self.pilotAgentsDB = pilotAgentsDB
  else:
    self.pilotAgentsDB = PilotAgentsDB()
  if jobDB:
    self.jobDB = jobDB
  else:
    self.jobDB = JobDB()
  if tqDB:
    self.tqDB = tqDB
  else:
    self.tqDB = TaskQueueDB()
  if jlDB:
    self.jlDB = jlDB
  else:
    self.jlDB = JobLoggingDB()
  if opsHelper:
    self.opsHelper = opsHelper
  else:
    self.opsHelper = Operations()

  self.log = gLogger.getSubLogger("Matcher")

  self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)
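The keyword arguments exist so that every dependency can be injected; a hedged sketch of wiring the Matcher to test doubles (the MagicMock usage mirrors the unit tests elsewhere in this section, the return value is invented):

from mock import MagicMock

fakeJobDB = MagicMock()
fakeJobDB.getJobAttributes.return_value = {'OK': True, 'Value': {'Status': 'Waiting'}}

# No real database connections are opened: every collaborator is a double.
matcher = Matcher(pilotAgentsDB=MagicMock(), jobDB=fakeJobDB,
                  tqDB=MagicMock(), jlDB=MagicMock(), opsHelper=MagicMock())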
class JobDBTest(unittest.TestCase):

  def setUp(self):
    def mockInit(self):
      self.log = MagicMock()
      self.logger = MagicMock()
      self._connected = True

    from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
    with patch(MODULE_NAME + ".JobDB.__init__", new=mockInit):
      self.jobDB = JobDB()
    self.jobDB._query = MagicMock(name="Query")
    self.jobDB._escapeString = MagicMock(return_value=S_OK())

  def tearDown(self):
    pass

  def test_getInputData(self):
    self.jobDB._query.return_value = S_OK((('/vo/user/lfn1',), ('LFN:/vo/user/lfn2',)))
    result = self.jobDB.getInputData(1234)
    print result
    self.assertTrue(result['OK'])
    self.assertEqual(result['Value'], ['/vo/user/lfn1', '/vo/user/lfn2'])
def initialize(self):
  """ Sets defaults
  """
  self.am_setOption("PollingTime", 120)
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption('ProductionTypes', [])
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
  gLogger.info("Will exclude the following Production types from cleaning %s" % (', '.join(self.prod_types)))
  self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
  self.jobByJob = self.am_getOption('JobByJob', False)
  self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

  self.removeStatusDelay['Done'] = self.am_getOption('RemoveStatusDelay/Done', 7)
  self.removeStatusDelay['Killed'] = self.am_getOption('RemoveStatusDelay/Killed', 7)
  self.removeStatusDelay['Failed'] = self.am_getOption('RemoveStatusDelay/Failed', 7)
  self.removeStatusDelay['Any'] = self.am_getOption('RemoveStatusDelay/Any', -1)

  return S_OK()
def initialize(self, jobDB=False, logDB=False):
  """ Initialization of the Optimizer Agent.
  """
  if not jobDB:
    self.jobDB = JobDB()
  else:
    self.jobDB = jobDB
  if not logDB:
    self.logDB = JobLoggingDB()
  else:
    self.logDB = logDB

  trailing = "Agent"
  optimizerName = self.am_getModuleParam('agentName')
  if optimizerName.endswith(trailing):
    optimizerName = optimizerName[:-len(trailing)]
  self.am_setModuleParam('optimizerName', optimizerName)

  self.startingMinorStatus = self.am_getModuleParam('optimizerName')
  self.startingMajorStatus = "Checking"
  self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
  self.requiredJobInfo = 'jdl'
  self.am_setOption("PollingTime", 30)

  return self.initializeOptimizer()
class JobDBTestCase(unittest.TestCase):
  """ Base class for the JobDB test cases
  """

  def setUp(self):
    self.jobDB = JobDB('Test', 20)

  def createJob(self):
    result = self.jobDB.getJobID()
    jobID = result['Value']
    jdlfile = open("test.jdl", "r")
    jdl = jdlfile.read()
    jdlfile.close()
    result = self.jobDB.insertJobIntoDB(jobID, jdl)
    return jobID
def initialize(self):
  """ Standard constructor
  """
  self.jobDB = JobDB()
  self.jobLoggingDB = JobLoggingDB()
  self._optimizers = {}
  self.am_setOption("PollingTime", 30)
  return S_OK()
def initialize(self):
  """Sets default parameters
  """
  self.jobDB = JobDB()
  self.logDB = JobLoggingDB()
  self.am_setOption('PollingTime', 60 * 60)
  if not self.am_getOption('Enable', True):
    self.log.info('Stalled Job Agent running in disabled mode')
  return S_OK()
class JobHistoryAgent(AgentModule):
  """
    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually
                     used for the agent restart
  """

  def initialize(self):
    self.jobDB = JobDB()

    for status in MONITOR_STATUS:
      for site in MONITOR_SITES:
        gLogger.verbose("Registering activity %s-%s" % (status, site))
        gLogger.verbose("Jobs in %s state at %s" % (status, site))
        gMonitor.registerActivity("%s-%s" % (status, site), "Jobs in %s state at %s" % (status, site),
                                  "JobHistoryAgent", "Jobs/minute", gMonitor.OP_MEAN)
    self.last_update = 0
    self.resultDB = None
    self.reportPeriod = 60
    return S_OK()

  def execute(self):
    """ Main execution method
    """
    delta = time.time() - self.last_update
    if delta > self.reportPeriod:
      result = self.jobDB.getCounters('Jobs', ['Status', 'Site'], {}, '')
      if not result['OK']:
        return S_ERROR('Failed to get data from the Job Database')
      self.resultDB = result['Value']
      self.last_update = time.time()

    totalDict = {}
    for status in MONITOR_STATUS:
      totalDict[status] = 0

    for row in self.resultDB:
      site = row[0]['Site']
      status = row[0]['Status']
      count = row[1]
      if site in MONITOR_SITES and status in MONITOR_STATUS:
        gLogger.verbose("Adding mark %s-%s: " % (status, site) + str(count))
        gMonitor.addMark("%s-%s" % (status, site), count)
      if status in totalDict:
        totalDict[status] += count

    for status in MONITOR_STATUS:
      gLogger.verbose("Adding mark %s-All sites: " % status + str(totalDict[status]))
      gMonitor.addMark("%s-All sites" % status, totalDict[status])

    return S_OK()
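`getCounters` returns a list of `(attributeDict, count)` pairs, which is why the loop above indexes `row[0]['Site']` and `row[1]`; a small standalone sketch of aggregating such rows (the sample rows are invented):

rows = [({'Status': 'Running', 'Site': 'LCG.Example.org'}, 12),
        ({'Status': 'Done', 'Site': 'LCG.Example.org'}, 340)]

totalDict = {}
for row in rows:
    totalDict[row[0]['Status']] = totalDict.get(row[0]['Status'], 0) + row[1]

assert totalDict == {'Running': 12, 'Done': 340}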
def initialize(self):
  """Sets defaults
  """
  self.am_setOption('PollingTime', 120)
  self.am_setOption('GridEnv', '')
  self.am_setOption('PilotStalledDays', 3)
  self.pilotDB = PilotAgentsDB()
  self.diracadmin = DiracAdmin()
  self.jobDB = JobDB()
  return S_OK()
def getDataFromDB(self):
    data = {}

    # Week data
    cmd = "select J.Site, JP.Value, sum(1), S.total from Jobs J, JobParameters JP, " \
          "(select JP.Value as host, sum(1) as total from Jobs J, JobParameters JP " \
          "where J.JobID=JP.JobID and JP.Name='HostName' group by JP.Value) S " \
          "where J.JobID=JP.JobID and JP.Name='HostName' and J.Status='Done' and S.host=JP.Value " \
          "group by JP.Value order by J.Site asc"
    result = mysql_querry(cmd)
    for i in result:
        data[(i[0], i[1])] = [int(i[2]), int(i[3])]

    # Two-day data
    cmd = "select J.Site, JP.Value, sum(1), S.total from Jobs J, JobParameters JP, " \
          "(select JP.Value as host, sum(1) as total from Jobs J, JobParameters JP " \
          "where J.JobID=JP.JobID and JP.Name='HostName' " \
          "and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 48 HOUR) group by JP.Value) S " \
          "where J.JobID=JP.JobID and JP.Name='HostName' and J.Status='Done' and S.host=JP.Value " \
          "and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 48 HOUR) " \
          "group by JP.Value order by J.Site asc"
    result = mysql_querry(cmd)
    for i in result:
        if (i[0], i[1]) not in data:
            data[(i[0], i[1])] = [0, 0]
            print "Strange behavior: for ", (i[0], i[1]), " there was no week data"
        data[(i[0], i[1])] += [int(i[2]), int(i[3])]

    # One-day data
    cmd = "select J.Site, JP.Value, sum(1), S.total from Jobs J, JobParameters JP, " \
          "(select JP.Value as host, sum(1) as total from Jobs J, JobParameters JP " \
          "where J.JobID=JP.JobID and JP.Name='HostName' " \
          "and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR) group by JP.Value) S " \
          "where J.JobID=JP.JobID and JP.Name='HostName' and J.Status='Done' and S.host=JP.Value " \
          "and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR) " \
          "group by JP.Value order by J.Site asc"
    result = mysql_querry(cmd)
    for i in result:
        if (i[0], i[1]) not in data:
            data[(i[0], i[1])] = [0, 0]
            print "Strange behavior: for ", (i[0], i[1]), " there was no earlier data"
        data[(i[0], i[1])] += [int(i[2]), int(i[3])]

    # Flatten to [site, host, week_done, week_total, 2d_done, 2d_total, 1d_done, 1d_total],
    # padding missing time windows with zeros
    prejson = [[x[0], x[1]] + data[x] for x in data]
    for i in range(0, len(prejson)):
        prejson[i] += [0 for j in range(0, 8 - len(prejson[i]))]
    return prejson
def __init__(self, args=None, clients=None):
  super(JobCommand, self).__init__(args, clients)

  if 'JobDB' in self.apis:
    self.jobDB = self.apis['JobDB']
  else:
    self.jobDB = JobDB()

  if 'ResourceManagementClient' in self.apis:
    self.rmClient = self.apis['ResourceManagementClient']
  else:
    self.rmClient = ResourceManagementClient()
def doNew(self, masterParams=None):
  sql = """
      select JP.Value, J.Status, J.Site, count(*)
      from Jobs J, JobParameters JP
      where J.JobID = JP.JobID and JP.Name = 'HostName'
      and J.LastUpdateTime >= DATE_SUB(UTC_TIMESTAMP(), INTERVAL 24 HOUR)
      group by JP.Value, J.Status
      """
  jobDB = JobDB()
  queryRes = jobDB._query(sql)
  if not queryRes['OK']:
    return queryRes
  records = queryRes['Value']

  hostJobs = {}
  for record in records:
    hostName = record[0]
    if hostName not in hostJobs:
      hostJobs[hostName] = {'Site': record[2], 'Running': 0, 'Done': 0, 'Failed': 0}
    hostJobs[hostName][record[1]] = record[3]

  uniformResult = []
  for host, hostDict in hostJobs.items():
    hostDict['Host'] = host
    if hostDict['Done'] == 0 and hostDict['Failed'] == 0:
      hostDict['Efficiency'] = 0
    else:
      # Efficiency as a 0-1 fraction of finished jobs that succeeded
      hostDict['Efficiency'] = float(hostDict['Done']) / (hostDict['Done'] + hostDict['Failed'])
    uniformResult.append(hostDict)

  storeRes = self._storeCommand(uniformResult)
  if not storeRes['OK']:
    return storeRes

  return S_OK(uniformResult)
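Note that, unlike the variant earlier in this section that stores a truncated percentage, this version stores the efficiency as a 0-1 fraction; downstream consumers must agree on one convention. A two-line check of the difference:

done, failed = 7, 2
assert abs(float(done) / (done + failed) - 0.7778) < 1e-4   # fraction here, 77.7 (percent) in the other variant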
def initialize(self):
    """Sets defaults
    """
    self.am_setOption('PollingTime', 120)
    self.am_setOption('GridEnv', '')
    self.am_setOption('PilotStalledDays', 3)
    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()
    self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)
    self.WMSAdministrator = WMSAdministratorClient()
    return S_OK()
def initialize(self):
  """Sets defaults
  """
  self.am_setOption("PollingTime", 60)
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  self.prod_types = self.am_getOption('ProductionTypes', ['DataReconstruction', 'DataStripping',
                                                          'MCSimulation', 'Merge', 'production'])
  gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))
  self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200)
  self.jobByJob = self.am_getOption('JobByJob', True)
  self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
  return S_OK()
def initialize(self):
  """ Standard constructor
  """
  self.dsClients = {}
  self.jobDB = JobDB()

  self.reportPeriod = 850
  self.am_setOption("PollingTime", self.reportPeriod)

  self.__jobDBFields = []
  for field in self.__summaryKeyFieldsMapping:
    if field == 'User':
      field = 'Owner'
    elif field == 'UserGroup':
      field = 'OwnerGroup'
    self.__jobDBFields.append(field)

  return S_OK()
class SPTCorrector(BaseCorrector):

    _GLOBAL_MAX_CORRECTION = "MaxGlobalCorrection"
    _SLICE_TIME_SPAN = "TimeSpan"
    _SLICE_WEIGHT = "Weight"
    _SLICE_MAX_CORRECTION = "MaxCorrection"

    def initialize(self):
        self.__jobDB = JobDB()
        return S_OK()

    def applyCorrection(self, entitiesExpectedShare):
        print "AT >>> entitiesExpectedShare", entitiesExpectedShare
        ownerDNs = entitiesExpectedShare.keys()
        group = self.getGroup()
        result = self.__jobDB.getCounters("Jobs", ["OwnerDN"], {"OwnerGroup": group, "Status": "Waiting"})
        if not result["OK"]:
            print "AT >>> result", result
            return entitiesExpectedShare
        ownerDict = {}
        for row in result["Value"]:
            ownerDict[row[0]["OwnerDN"]] = row[1]
        print "AT >>> ownerDict", ownerDict

        # Give the whole share to the owner with the fewest waiting jobs;
        # owners without any waiting jobs count as zero
        resultShare = {}
        minNumber = 1000000000000
        minOwnerDN = ""
        for ownerDN in ownerDNs:
            resultShare[ownerDN] = 0
            if minNumber > ownerDict.get(ownerDN, 0):
                minNumber = ownerDict.get(ownerDN, 0)
                minOwnerDN = ownerDN
        resultShare[minOwnerDN] = 1
        print "AT >>> resultShare", resultShare
        return resultShare

    def updateHistoryKnowledge(self):
        return S_OK()
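A compact sketch of what `applyCorrection` computes: the whole share goes to the owner with the fewest waiting jobs (the sample DNs are invented):

waiting = {'/DC=ex/CN=alice': 40, '/DC=ex/CN=bob': 3}
shares = dict((dn, 0) for dn in waiting)
shares[min(waiting, key=waiting.get)] = 1
assert shares == {'/DC=ex/CN=alice': 0, '/DC=ex/CN=bob': 1}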
def initialize(self):
    """Sets defaults
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption(
        "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
    )
    gLogger.info(
        "Will exclude the following Production types from cleaning %s" % (", ".join(self.prod_types))
    )
    self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
    self.jobByJob = self.am_getOption("JobByJob", True)
    self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
    return S_OK()
def initialize(self, jobDB=None, logDB=None):
    """ Initialization of the Optimizer Agent.
    """
    self.jobDB = JobDB() if jobDB is None else jobDB
    if not self.jobDB.isValid():
        dExit(1)

    self.logDB = JobLoggingDB() if logDB is None else logDB

    optimizerName = self.am_getModuleParam('agentName')
    if optimizerName.endswith('Agent'):
        optimizerName = optimizerName[:-len('Agent')]
    self.am_setModuleParam('optimizerName', optimizerName)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
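The agent-name handling strips a trailing 'Agent' so that, for example, a hypothetical 'JobSchedulingAgent' registers the optimizer name 'JobScheduling'; the same idiom in isolation:

name = 'JobSchedulingAgent'
if name.endswith('Agent'):
    name = name[:-len('Agent')]
assert name == 'JobScheduling'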
def initialize(self):
  """Sets defaults
  """
  self.am_setOption("PollingTime", 60)
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption('ProductionTypes', [])
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
  gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))
  self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 100)
  self.jobByJob = self.am_getOption('JobByJob', True)
  self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
  return S_OK()
def initialize(self):
    """ Standard constructor
    """
    self.jobDB = JobDB()

    self.am_setOption("PollingTime", 900)
    self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory')

    self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory",
                                                 failoverQueueName=self.messageQueue)

    self.__jobDBFields = []
    for field in self.__summaryKeyFieldsMapping:
        if field == 'User':
            field = 'Owner'
        elif field == 'UserGroup':
            field = 'OwnerGroup'
        self.__jobDBFields.append(field)

    return S_OK()
def initialize(self):
  """ Standard constructor
  """
  self.jobDB = JobDB()

  self.reportPeriod = 120
  self.am_setOption("PollingTime", self.reportPeriod)

  self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory")

  self.__jobDBFields = []
  for field in self.__summaryKeyFieldsMapping:
    if field == 'User':
      field = 'Owner'
    elif field == 'UserGroup':
      field = 'OwnerGroup'
    self.__jobDBFields.append(field)

  return S_OK()
def __init__(self, jobDB=None, opsHelper=None):
  """ Constructor
  """
  self.__runningLimitSection = "JobScheduling/RunningLimit"
  self.__matchingDelaySection = "JobScheduling/MatchingDelay"
  self.csDictCache = DictCache()
  self.condCache = DictCache()
  self.delayMem = {}

  if jobDB:
    self.jobDB = jobDB
  else:
    self.jobDB = JobDB()

  self.log = gLogger.getSubLogger("Limiter")

  if opsHelper:
    self.__opsHelper = opsHelper
  else:
    self.__opsHelper = Operations()
def initialize(self):
  """ Standard constructor
  """
  self.jobDB = JobDB()

  self.am_setOption("PollingTime", 120)

  self.__jobDBFields = []
  for field in self.__summaryKeyFieldsMapping:
    if field == 'User':
      field = 'Owner'
    elif field == 'UserGroup':
      field = 'OwnerGroup'
    self.__jobDBFields.append(field)

  result = gConfig.getOption("/Systems/RabbitMQ/User")
  if not result['OK']:
    raise RuntimeError('Failed to get the configuration parameter: User')
  user = result['Value']

  result = gConfig.getOption("/Systems/RabbitMQ/Password")
  if not result['OK']:
    raise RuntimeError('Failed to get the configuration parameter: Password')
  password = result['Value']

  result = gConfig.getOption("/Systems/RabbitMQ/Host")
  if not result['OK']:
    raise RuntimeError('Failed to get the configuration parameter: Host')
  self.host = result['Value']

  self.credentials = pika.PlainCredentials(user, password)
  self.connection = pika.BlockingConnection(
      pika.ConnectionParameters(host=self.host, credentials=self.credentials))
  self.channel = self.connection.channel()
  self.channel.exchange_declare(exchange='mdatabase', exchange_type='fanout')

  return S_OK()
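For completeness, publishing to the fanout exchange declared above could look like the following pika sketch inside a hypothetical execute() method (the message body is an assumption; fanout exchanges ignore the routing key):

import json

def execute(self):
    summary = {'Status': 'Running', 'Site': 'LCG.Example.org', 'Jobs': 12}
    self.channel.basic_publish(exchange='mdatabase',
                               routing_key='',  # ignored by fanout exchanges
                               body=json.dumps(summary))
    return S_OK()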
class Matcher:
    """Logic for matching"""

    def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None, pilotRef=None):
        """c'tor"""
        if pilotAgentsDB:
            self.pilotAgentsDB = pilotAgentsDB
        else:
            self.pilotAgentsDB = PilotAgentsDB()
        if jobDB:
            self.jobDB = jobDB
        else:
            self.jobDB = JobDB()
        if tqDB:
            self.tqDB = tqDB
        else:
            self.tqDB = TaskQueueDB()
        if jlDB:
            self.jlDB = jlDB
        else:
            self.jlDB = JobLoggingDB()
        if opsHelper:
            self.opsHelper = opsHelper
        else:
            self.opsHelper = Operations()

        if pilotRef:
            self.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.pilotAgentsDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.jobDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.tqDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.jlDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
        else:
            self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper, pilotRef=pilotRef)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job matching the resource capacity"""

        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict["MaxRAM"] = resourceDescription["MaxRAM"]
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict["NumberOfProcessors"] = resourceDescription["NumberOfProcessors"]
        toPrintDict["Tag"] = []
        if "Tag" in resourceDict:
            for tag in resourceDict["Tag"]:
                if not tag.endswith("GB") and not tag.endswith("Processors"):
                    toPrintDict["Tag"].append(tag)
        if not toPrintDict["Tag"]:
            toPrintDict.pop("Tag")
        self.log.info("Resource description for matching", printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict["Site"], resourceDict.get("GridCE"))
        result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

        if not result["OK"]:
            raise RuntimeError(result["Message"])
        result = result["Value"]
        if not result["matchFound"]:
            self.log.info("No match found")
            return {}

        jobID = result["jobId"]
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup", "Status"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")
        if not resAtt["Value"]["Status"] == "Waiting":
            self.log.error("Job matched by the TQ is not in Waiting state", str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result["OK"]:
                raise RuntimeError(result["Message"])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result["OK"]:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict["JDL"] = result["Value"]
        resultDict["JobID"] = jobID

        matchTime = time.time() - startTime
        self.log.verbose("Match time", "[%s]" % str(matchTime))

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt["OK"]:
            for key, value in resOpt["Value"].items():
                resultDict[key] = value
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict["Site"], jobID)

        pilotInfoReportedFlag = resourceDict.get("PilotInfoReportedFlag", False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict["DN"] = resAtt["Value"]["OwnerDN"]
        resultDict["Group"] = resAtt["Value"]["OwnerGroup"]
        resultDict["PilotInfoReportedFlag"] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """from resourceDescription to resourceDict (just various mods)"""
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict["JobType"] = "Test"

        self.log.verbose("Resource description")
        for key in resourceDict:
            self.log.debug("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary

        :param resourceDescription: a ceDict coming from a JobAgent, for example.
        :return: updated dictionary of resource description parameters
        """
        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        if resourceDescription.get("Tag"):
            tags = resourceDescription["Tag"]
            resourceDict["Tag"] = (
                tags if isinstance(tags, list) else list({tag.strip("\"' ") for tag in tags.strip("[]").split(",")})
            )
            if "RequiredTag" in resourceDescription:
                requiredTagsList = (
                    list({tag.strip("\"' ") for tag in resourceDescription["RequiredTag"].strip("[]").split(",")})
                    if isinstance(resourceDescription["RequiredTag"], str)
                    else resourceDescription["RequiredTag"]
                )
                resourceDict["RequiredTag"] = requiredTagsList

        if "JobID" in resourceDescription:
            resourceDict["JobID"] = resourceDescription["JobID"]

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get("MaxRAM")
        if maxRAM:
            try:
                maxRAM = int(maxRAM / 1000)
            except (ValueError, TypeError):
                maxRAM = None

        nProcessors = resourceDescription.get("NumberOfProcessors")
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except ValueError:
                nProcessors = None

        for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]:
            if param and param <= 1024:
                paramList = list(range(2, param + 1))
                paramTags = ["%d%s" % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        # Add 'MultiProcessor' to the list of tags
        if nProcessors and nProcessors > 1:
            resourceDict.setdefault("Tag", []).append("MultiProcessor")

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if "Tag" in resourceDict:
            resourceDict["Tag"] = list(set(resourceDict["Tag"]))
        if "RequiredTag" in resourceDict:
            resourceDict["RequiredTag"] = list(set(resourceDict["RequiredTag"]))

        for k in (
            "DIRACVersion",
            "ReleaseVersion",
            "ReleaseProject",
            "VirtualOrganization",
            "PilotReference",
            "PilotBenchmark",
            "PilotInfoReportedFlag",
        ):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
        """
        attNames = ["Status", "MinorStatus", "ApplicationStatus", "Site"]
        attValues = ["Matched", "Assigned", "Unknown", resourceDict["Site"]]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result["OK"]:
            self.log.error("Problem reporting job status",
                           "setJobAttributes, jobID = %s: %s" % (jobID, result["Message"]))
        else:
            self.log.verbose("Set job attributes for jobID", jobID)

        result = self.jlDB.addLoggingRecord(jobID, status=JobStatus.MATCHED, minorStatus="Assigned", source="Matcher")
        if not result["OK"]:
            self.log.error("Problem reporting job status",
                           "addLoggingRecord, jobID = %s: %s" % (jobID, result["Message"]))
        else:
            self.log.verbose("Added logging record for jobID", jobID)

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
        """
        if "Site" not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict["Site"])
        if not result["OK"]:
            self.log.error("Internal error", "siteClient.getUsableSites: %s" % result["Message"])
            raise RuntimeError("Internal error")
        if resourceDict["Site"] not in result["Value"]:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            gridCE = resourceDict.get("GridCE", "Unknown")
            site = resourceDict.get("Site", "Unknown")
            benchmark = resourceDict.get("PilotBenchmark", 0.0)
            self.log.verbose(
                "Reporting pilot info",
                "for %s: gridCE=%s, site=%s, benchmark=%f" % (pilotReference, gridCE, site, benchmark),
            )

            result = self.pilotAgentsDB.setPilotStatus(
                pilotReference, status=PilotStatus.RUNNING, gridSite=site, destination=gridCE, benchmark=benchmark)
            if not result["OK"]:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result["Message"]),
                )

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result["Message"]),
                )
            result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result["Message"]),
                )

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials"""
        if Properties.GENERIC_PILOT in credDict["properties"]:
            # You can only match groups in the same VO
            if credDict["group"] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get("VirtualOrganization", "")
            else:
                vo = Registry.getVOForGroup(credDict["group"])
            if "OwnerGroup" not in resourceDict:
                result = Registry.getGroupsForVO(vo)
                if result["OK"]:
                    resourceDict["OwnerGroup"] = result["Value"]
                else:
                    raise RuntimeError(result["Message"])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict["properties"]:
                self.log.notice("Setting the resource DN to the credentials DN")
                resourceDict["OwnerDN"] = credDict["DN"]
            # If it's a job sharing, the group has to be the same and just check
            # that the DN (if any) belongs to the same group
            elif Properties.JOB_SHARING in credDict["properties"]:
                resourceDict["OwnerGroup"] = credDict["group"]
                self.log.notice("Setting the resource group to the credentials group")
                if "OwnerDN" in resourceDict and resourceDict["OwnerDN"] != credDict["DN"]:
                    ownerDN = resourceDict["OwnerDN"]
                    result = Registry.getGroupsForDN(resourceDict["OwnerDN"])
                    if not result["OK"]:
                        raise RuntimeError(result["Message"])
                    if credDict["group"] not in result["Value"]:
                        # DN is not in the same group! bad boy.
                        self.log.warn(
                            "You cannot request jobs from this DN, as it does not belong to your group!",
                            "(%s)" % ownerDN,
                        )
                        resourceDict["OwnerDN"] = credDict["DN"]
            # Nothing special, group and DN have to be the same
            else:
                resourceDict["OwnerDN"] = credDict["DN"]
                resourceDict["OwnerGroup"] = credDict["group"]

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version"""
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if "ReleaseVersion" not in resourceDict:
                if "DIRACVersion" not in resourceDict:
                    raise PilotVersionError("Version check requested and not provided by Pilot")
                else:
                    pilotVersion = resourceDict["DIRACVersion"]
            else:
                pilotVersion = resourceDict["ReleaseVersion"]

            validVersions = [
                convertToPy3VersionNumber(newStyleVersion)
                for newStyleVersion in self.opsHelper.getValue("Pilot/Version", [])
            ]
            if validVersions and convertToPy3VersionNumber(pilotVersion) not in validVersions:
                raise PilotVersionError(
                    "Pilot version does not match the production version: %s not in ( %s )"
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if "ReleaseProject" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested but expected project %s not received" % validProject)
                if resourceDict["ReleaseProject"] != validProject:
                    raise PilotVersionError(
                        "Version check requested but expected project %s != received %s"
                        % (validProject, resourceDict["ReleaseProject"]))
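To make the tag conversion in `_processResourceDescription` concrete: a node reporting MaxRAM=4000 (MB) and NumberOfProcessors=4 would be matched through tags roughly as follows (a standalone re-derivation of the loop above, not the class itself):

maxRAM = int(4000 / 1000)   # 4 -> "2GB", "3GB", "4GB"
nProcessors = 4             # -> "2Processors", "3Processors", "4Processors"

tags = []
for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]:
    if param and param <= 1024:
        tags.extend("%d%s" % (par, key) for par in range(2, param + 1))
if nProcessors > 1:
    tags.append("MultiProcessor")

assert sorted(tags) == ["2GB", "2Processors", "3GB", "3Processors",
                        "4GB", "4Processors", "MultiProcessor"]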
def __init__(self):
    self.JobDB = JobDB()
    self.custom_timers = {}
def __init__(self):
  """ The standard constructor initializes the base DIRAC JobDB and adds
      'runNumber' to the JDL parameters stored in the database
  """
  DIRACJobDB.__init__(self)
  self.jdl2DBParameters += ['runNumber']
class Limiter(object):

  def __init__(self, jobDB=None, opsHelper=None):
    """ Constructor
    """
    self.__runningLimitSection = "JobScheduling/RunningLimit"
    self.__matchingDelaySection = "JobScheduling/MatchingDelay"
    self.csDictCache = DictCache()
    self.condCache = DictCache()
    self.delayMem = {}

    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()

    self.log = gLogger.getSubLogger("Limiter")

    if opsHelper:
      self.__opsHelper = opsHelper
    else:
      self.__opsHelper = Operations()

  def getNegativeCond(self):
    """ Get negative condition for ALL sites
    """
    orCond = self.condCache.get("GLOBAL")
    if orCond:
      return orCond
    negCond = {}

    # Run Limit
    result = self.__opsHelper.getSections(self.__runningLimitSection)
    sites = []
    if result['OK']:
      sites = result['Value']
    for siteName in sites:
      result = self.__getRunningCondition(siteName)
      if not result['OK']:
        continue
      data = result['Value']
      if data:
        negCond[siteName] = data

    # Delay limit
    result = self.__opsHelper.getSections(self.__matchingDelaySection)
    sites = []
    if result['OK']:
      sites = result['Value']
    for siteName in sites:
      result = self.__getDelayCondition(siteName)
      if not result['OK']:
        continue
      data = result['Value']
      if not data:
        continue
      if siteName in negCond:
        negCond[siteName] = self.__mergeCond(negCond[siteName], data)
      else:
        negCond[siteName] = data

    orCond = []
    for siteName in negCond:
      negCond[siteName]['Site'] = siteName
      orCond.append(negCond[siteName])
    self.condCache.add("GLOBAL", 10, orCond)
    return orCond

  def getNegativeCondForSite(self, siteName):
    """ Generate a negative query based on the limits set on the site
    """
    # Check if Limits are imposed onto the site
    negativeCond = {}
    if self.__opsHelper.getValue("JobScheduling/CheckJobLimits", True):
      result = self.__getRunningCondition(siteName)
      if result['OK']:
        negativeCond = result['Value']
      self.log.verbose('Negative conditions for site %s after checking limits are: %s' %
                       (siteName, str(negativeCond)))

    if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      result = self.__getDelayCondition(siteName)
      if result['OK']:
        delayCond = result['Value']
        self.log.verbose('Negative conditions for site %s after delay checking are: %s' %
                         (siteName, str(delayCond)))
        negativeCond = self.__mergeCond(negativeCond, delayCond)

    if negativeCond:
      self.log.info('Negative conditions for site %s are: %s' % (siteName, str(negativeCond)))

    return negativeCond

  def __mergeCond(self, negCond, addCond):
    """ Merge two negative dicts
    """
    for attr in addCond:
      if attr not in negCond:
        negCond[attr] = []
      for value in addCond[attr]:
        if value not in negCond[attr]:
          negCond[attr].append(value)
    return negCond

  def __extractCSData(self, section):
    """ Extract limiting information from the CS in the form:
        { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    """
    stuffDict = self.csDictCache.get(section)
    if stuffDict:
      return S_OK(stuffDict)

    result = self.__opsHelper.getSections(section)
    if not result['OK']:
      return result
    attribs = result['Value']
    stuffDict = {}
    for attName in attribs:
      result = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
      if not result['OK']:
        return result
      attLimits = result['Value']
      try:
        attLimits = dict([(k, int(attLimits[k])) for k in attLimits])
      except Exception as excp:
        errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
        self.log.error(errMsg)
        return S_ERROR(errMsg)
      stuffDict[attName] = attLimits

    self.csDictCache.add(section, 300, stuffDict)
    return S_OK(stuffDict)

  def __getRunningCondition(self, siteName):
    """ Get extra conditions allowing site throttling
    """
    siteSection = "%s/%s" % (self.__runningLimitSection, siteName)
    result = self.__extractCSData(siteSection)
    if not result['OK']:
      return result
    limitsDict = result['Value']
    # limitsDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    if not limitsDict:
      return S_OK({})
    # Check if the site is exceeding the given limits
    negCond = {}
    for attName in limitsDict:
      if attName not in self.jobDB.jobAttributeNames:
        self.log.error("Attribute %s does not exist. Check the job limits" % attName)
        continue
      cK = "Running:%s:%s" % (siteName, attName)
      data = self.condCache.get(cK)
      if not data:
        result = self.jobDB.getCounters('Jobs', [attName],
                                        {'Site': siteName, 'Status': ['Running', 'Matched', 'Stalled']})
        if not result['OK']:
          return result
        data = result['Value']
        data = dict([(k[0][attName], k[1]) for k in data])
        self.condCache.add(cK, 10, data)
      for attValue in limitsDict[attName]:
        limit = limitsDict[attName][attValue]
        running = data.get(attValue, 0)
        if running >= limit:
          self.log.verbose('Job Limit imposed at %s on %s/%s=%d, %d jobs already deployed' %
                           (siteName, attName, attValue, limit, running))
          if attName not in negCond:
            negCond[attName] = []
          negCond[attName].append(attValue)
    # negCond is something like : {'JobType': ['Merge']}
    return S_OK(negCond)

  def updateDelayCounters(self, siteName, jid):
    # Get the info from the CS
    siteSection = "%s/%s" % (self.__matchingDelaySection, siteName)
    result = self.__extractCSData(siteSection)
    if not result['OK']:
      return result
    delayDict = result['Value']
    # delayDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    if not delayDict:
      return S_OK()
    attNames = []
    for attName in delayDict:
      if attName not in self.jobDB.jobAttributeNames:
        self.log.error("Attribute %s does not exist in the JobDB. Please fix it!" % attName)
      else:
        attNames.append(attName)
    result = self.jobDB.getJobAttributes(jid, attNames)
    if not result['OK']:
      self.log.error("While retrieving attributes coming from %s: %s" % (siteSection, result['Message']))
      return result
    atts = result['Value']
    # Create the DictCache if not there
    if siteName not in self.delayMem:
      self.delayMem[siteName] = DictCache()
    # Update the counters
    delayCounter = self.delayMem[siteName]
    for attName in atts:
      attValue = atts[attName]
      if attValue in delayDict[attName]:
        delayTime = delayDict[attName][attValue]
        self.log.notice("Adding delay for %s/%s=%s of %s secs" % (siteName, attName, attValue, delayTime))
        delayCounter.add((attName, attValue), delayTime)
    return S_OK()

  def __getDelayCondition(self, siteName):
    """ Get extra conditions allowing matching delay
    """
    if siteName not in self.delayMem:
      return S_OK({})
    lastRun = self.delayMem[siteName].getKeys()
    negCond = {}
    for attName, attValue in lastRun:
      if attName not in negCond:
        negCond[attName] = []
      negCond[attName].append(attValue)
    return S_OK(negCond)
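The negative-condition dictionaries that `__mergeCond` combines map job attributes to excluded values; a quick standalone illustration with invented limits:

running = {'JobType': ['Merge']}   # site already at its Merge limit
delayed = {'JobType': ['MCGen']}   # recent MCGen match still inside its delay window

merged = {'JobType': list(running['JobType'])}
for attr in delayed:
    merged.setdefault(attr, [])
    for value in delayed[attr]:
        if value not in merged[attr]:
            merged[attr].append(value)

assert merged == {'JobType': ['Merge', 'MCGen']}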
class Matcher(object):
  """ Logic for matching
  """

  def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ c'tor
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()
    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger("Matcher")

    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

  def selectJob(self, resourceDescription, credDict):
    """ Main job selection function to find the highest priority job matching the resource capacity
    """
    startTime = time.time()

    resourceDict = self._getResourceDict(resourceDescription, credDict)

    negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
    result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info("No match found")
      raise RuntimeError("No match found")

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
    if not resAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not resAtt['Value']:
      raise RuntimeError("No attributes returned for job")
    if not resAtt['Value']['Status'] == 'Waiting':
      self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
      result = self.tqDB.deleteJob(jobID)
      if not result['OK']:
        return result
      raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

    self._reportStatus(resourceDict, jobID)

    result = self.jobDB.getJobJDL(jobID)
    if not result['OK']:
      raise RuntimeError("Failed to get the job JDL")

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info("Match time: [%s]" % str(matchTime))
    gMonitor.addMark("matchTime", matchTime)

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters(jobID)
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
    if not resAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not resAtt['Value']:
      raise RuntimeError('No attributes returned for job')

    if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

    pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False)
    if not pilotInfoReportedFlag:
      self._updatePilotInfo(resourceDict)
    self._updatePilotJobMapping(resourceDict, jobID)

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict

  def _getResourceDict(self, resourceDescription, credDict):
    """ from resourceDescription to resourceDict (just various mods)
    """
    resourceDict = self._processResourceDescription(resourceDescription)
    resourceDict = self._checkCredentials(resourceDict, credDict)
    self._checkPilotVersion(resourceDict)
    if not self._checkMask(resourceDict):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose("Resource description:")
    for key in resourceDict:
      self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

    return resourceDict

  def _processResourceDescription(self, resourceDescription):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """
    resourceDict = {}
    if isinstance(resourceDescription, basestring):
      classAdAgent = ClassAd(resourceDescription)
      if not classAdAgent.isOK():
        raise ValueError('Illegal Resource JDL')
      self.log.verbose(classAdAgent.asJDL())

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute(name):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt(name)
          else:
            resourceDict[name] = classAdAgent.getAttributeString(name)

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute(name):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression(name)
          else:
            resourceDict[name] = classAdAgent.getAttributeString(name)

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute('JobID'):
        resourceDict['JobID'] = classAdAgent.getAttributeInt('JobID')

      for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization'):
        if classAdAgent.lookupAttribute(k):
          resourceDict[k] = classAdAgent.getAttributeString(k)

    else:
      for name in singleValueDefFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag'):
        if k in resourceDescription:
          resourceDict[k] = resourceDescription[k]

    return resourceDict

  def _reportStatus(self, resourceDict, jobID):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Set job attributes for jobID %s" % jobID)

    result = self.jlDB.addLoggingRecord(jobID, status='Matched', minor='Assigned', source='Matcher')
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Added logging record for jobID %s" % jobID)

  def _checkMask(self, resourceDict):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if 'Site' not in resourceDict:
      self.log.error("Missing Site Name in Resource JDL")
      raise RuntimeError("Missing Site Name in Resource JDL")

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask(siteState='Active')
    if not result['OK']:
      self.log.error("Internal error", "getSiteMask: %s" % result['Message'])
      raise RuntimeError("Internal error")
    maskList = result['Value']

    if resourceDict['Site'] not in maskList:
      return False

    return True

  def _updatePilotInfo(self, resourceDict):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      gridCE = resourceDict.get('GridCE', 'Unknown')
      site = resourceDict.get('Site', 'Unknown')
      benchmark = resourceDict.get('PilotBenchmark', 0.0)
      self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' %
                       (pilotReference, gridCE, site, benchmark))

      result = self.pilotAgentsDB.setPilotStatus(pilotReference, status='Running',
                                                 gridSite=site, destination=gridCE,
                                                 benchmark=benchmark)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _updatePilotJobMapping(self, resourceDict, jobID):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message']))
      result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _checkCredentials(self, resourceDict, credDict):
    """ Check if we can get a job given the passed credentials
    """
    if Properties.GENERIC_PILOT in credDict['properties']:
      # You can only match groups in the same VO
      if credDict['group'] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get('VirtualOrganization', '')
      else:
        vo = Registry.getVOForGroup(credDict['group'])
      result = Registry.getGroupsForVO(vo)
      if result['OK']:
        resourceDict['OwnerGroup'] = result['Value']
      else:
        raise RuntimeError(result['Message'])
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict['properties']:
        self.log.notice("Setting the resource DN to the credentials DN")
        resourceDict['OwnerDN'] = credDict['DN']
      # If it's a job sharing, the group has to be the same and just check
      # that the DN (if any) belongs to the same group
      elif Properties.JOB_SHARING in credDict['properties']:
        resourceDict['OwnerGroup'] = credDict['group']
        self.log.notice("Setting the resource group to the credentials group")
        if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
          ownerDN = resourceDict['OwnerDN']
          result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
          if not result['OK']:
            raise RuntimeError(result['Message'])
          if credDict['group'] not in result['Value']:
            # DN is not in the same group! bad boy.
            self.log.notice("You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN)
            resourceDict['OwnerDN'] = credDict['DN']
      # Nothing special, group and DN have to be the same
      else:
        resourceDict['OwnerDN'] = credDict['DN']
        resourceDict['OwnerGroup'] = credDict['group']

    return resourceDict

  def _checkPilotVersion(self, resourceDict):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue("Pilot/CheckVersion", True):
      if 'ReleaseVersion' not in resourceDict:
        if 'DIRACVersion' not in resourceDict:
          raise RuntimeError('Version check requested and not provided by Pilot')
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue("Pilot/Version", [])
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError('Pilot version does not match the production version %s not in ( %s )' %
                           (pilotVersion, ",".join(validVersions)))
      # Check project if requested
      validProject = self.opsHelper.getValue("Pilot/Project", "")
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError("Version check requested but expected project %s not received" % validProject)
        if resourceDict['ReleaseProject'] != validProject:
          raise RuntimeError("Version check requested but expected project %s != received %s" %
                             (validProject, resourceDict['ReleaseProject']))
class StalledJobAgent(AgentModule):
    """
        The specific agents must provide the following methods:
          - initialize() for initial settings
          - beginExecution()
          - execute() - the main method called in the agent cycle
          - endExecution()
          - finalize() - the graceful exit of the method, this one is usually
                         used for the agent restart
    """

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)
        self.jobDB = None
        self.logDB = None
        self.matchedTime = 7200
        self.rescheduledTime = 600
        self.completedTime = 86400
        self.submittingTime = 300

    #############################################################################
    def initialize(self):
        """Sets default parameters
        """
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()
        self.am_setOption('PollingTime', 60 * 60)
        self.stalledJobsTolerantSites = self.am_getOption('StalledJobsTolerantSites', [])
        if not self.am_getOption('Enable', True):
            self.log.info('Stalled Job Agent running in disabled mode')
        return S_OK()

    #############################################################################
    def execute(self):
        """ The main agent execution method
        """
        self.log.verbose('Waking up Stalled Job Agent')

        wms_instance = getSystemInstance('WorkloadManagement')
        if not wms_instance:
            return S_ERROR('Can not get the WorkloadManagement system instance')
        wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance, 'JobWrapper')

        stalledTime = self.am_getOption('StalledTimeHours', 2)
        failedTime = self.am_getOption('FailedTimeHours', 6)
        self.stalledJobsToleranceTime = self.am_getOption('StalledJobsToleranceTime', 0)

        self.submittingTime = self.am_getOption('SubmittingTime', self.submittingTime)
        self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
        self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)
        self.completedTime = self.am_getOption('CompletedTime', self.completedTime)

        self.log.verbose('StalledTime = %s cycles' % (stalledTime))
        self.log.verbose('FailedTime = %s cycles' % (failedTime))

        watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
        watchdogCycle = max(watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60))

        # Add half cycle to avoid race conditions
        stalledTime = watchdogCycle * (stalledTime + 0.5)
        failedTime = watchdogCycle * (failedTime + 0.5)

        result = self.__markStalledJobs(stalledTime)
        if not result['OK']:
            self.log.error('Failed to detect stalled jobs', result['Message'])

        # Note, jobs will be revived automatically during the heartbeat signal phase and
        # subsequent status changes will result in jobs not being selected by the
        # stalled job agent.

        result = self.__failStalledJobs(failedTime)
        if not result['OK']:
            self.log.error('Failed to process stalled jobs', result['Message'])

        result = self.__failCompletedJobs()
        if not result['OK']:
            self.log.error('Failed to process completed jobs', result['Message'])

        result = self.__failSubmittingJobs()
        if not result['OK']:
            self.log.error('Failed to process jobs being submitted', result['Message'])

        result = self.__kickStuckJobs()
        if not result['OK']:
            self.log.error('Failed to kick stuck jobs', result['Message'])

        return S_OK('Stalled Job Agent cycle complete')

    #############################################################################
    def __markStalledJobs(self, stalledTime):
        """ Identifies stalled jobs running without update longer than stalledTime.
        """
        stalledCounter = 0
        runningCounter = 0
        result = self.jobDB.selectJobs({'Status': 'Running'})
        if not result['OK']:
            return result
        if not result['Value']:
            return S_OK()
        jobs = result['Value']
        self.log.info('%s Running jobs will be checked for being stalled' % (len(jobs)))
        jobs.sort()
        # jobs = jobs[:10]  # for debugging
        for job in jobs:
            site = self.jobDB.getJobAttribute(job, 'Site')['Value']
            if site in self.stalledJobsTolerantSites:
                result = self.__getStalledJob(job, stalledTime + self.stalledJobsToleranceTime)
            else:
                result = self.__getStalledJob(job, stalledTime)
            if result['OK']:
                self.log.verbose('Updating status to Stalled for job %s' % (job))
                self.__updateJobStatus(job, 'Stalled')
                stalledCounter += 1
            else:
                self.log.verbose(result['Message'])
                runningCounter += 1
        self.log.info('Total jobs: %s, Stalled job count: %s, Running job count: %s' %
                      (len(jobs), stalledCounter, runningCounter))
        return S_OK()

    #############################################################################
    def __failStalledJobs(self, failedTime):
        """ Changes the Stalled status to Failed for jobs long in the Stalled status
        """
        result = self.jobDB.selectJobs({'Status': 'Stalled'})
        if not result['OK']:
            return result
        jobs = result['Value']

        failedCounter = 0
        minorStalledStatuses = ("Job stalled: pilot not running",
                                'Stalling for more than %d sec' % failedTime)

        if jobs:
            self.log.info('%s Stalled jobs will be checked for failure' % (len(jobs)))

            for job in jobs:
                setFailed = False
                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if not result['OK']:
                    self.log.error('Failed to get pilot status', result['Message'])
                    continue
                pilotStatus = result['Value']
                if pilotStatus != "Running":
                    setFailed = minorStalledStatuses[0]
                else:
                    result = self.__getLatestUpdateTime(job)
                    if not result['OK']:
                        self.log.error('Failed to get job update time', result['Message'])
                        continue
                    elapsedTime = toEpoch() - result['Value']
                    if elapsedTime > failedTime:
                        setFailed = minorStalledStatuses[1]

                # Set the jobs Failed, send them a kill signal in case they are not
                # really dead, and send accounting info
                if setFailed:
                    self.__sendKillCommand(job)
                    self.__updateJobStatus(job, 'Failed', setFailed)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error('Failed to send accounting', result['Message'])

        recoverCounter = 0

        for minor in minorStalledStatuses:
            result = self.jobDB.selectJobs({'Status': 'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False'})
            if not result['OK']:
                return result
            if result['Value']:
                jobs = result['Value']
                self.log.info('%s Stalled jobs will be Accounted' % (len(jobs)))
                for job in jobs:
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error('Failed to send accounting', result['Message'])
                        continue
                    recoverCounter += 1
            if not result['OK']:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)

    #############################################################################
    def __getJobPilotStatus(self, jobID):
        """ Get the job pilot status
        """
        result = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
        if not result['OK']:
            return result
        pilotReference = result['Value'].get('Pilot_Reference')
        if not pilotReference:
            # There is no pilot reference, hence its status is unknown
            return S_OK('NoPilot')

        result = WMSAdministratorClient().getPilotInfo(pilotReference)
        if not result['OK']:
            if "No pilots found" in result['Message']:
                self.log.warn(result['Message'])
                return S_OK('NoPilot')
            self.log.error('Failed to get pilot information',
                           'for job %d: ' % jobID + result['Message'])
            return S_ERROR('Failed to get the pilot status')
        pilotStatus = result['Value'][pilotReference]['Status']

        return S_OK(pilotStatus)

    #############################################################################
    def __getStalledJob(self, job, stalledTime):
        """ Compares the most recent of LastUpdateTime and HeartBeatTime against
            the stalledTime limit.
        """
        result = self.__getLatestUpdateTime(job)
        if not result['OK']:
            return result

        currentTime = toEpoch()
        lastUpdate = result['Value']

        elapsedTime = currentTime - lastUpdate
        self.log.verbose('(CurrentTime - LastUpdate) = %s secs' % (elapsedTime))
        if elapsedTime > stalledTime:
            self.log.info('Job %s is identified as stalled with last update > %s secs ago' % (job, elapsedTime))
            return S_OK('Stalled')

        return S_ERROR('Job %s is running and will be ignored' % job)

    #############################################################################
    def __getLatestUpdateTime(self, job):
        """ Returns the most recent of HeartBeatTime and LastUpdateTime
        """
        result = self.jobDB.getJobAttributes(job, ['HeartBeatTime', 'LastUpdateTime'])
        if not result['OK']:
            self.log.error('Failed to get job attributes', result['Message'])
        if not result['OK'] or not result['Value']:
            self.log.error('Could not get attributes for job', '%s' % job)
            return S_ERROR('Could not get attributes for job')

        self.log.verbose(result)
        latestUpdate = 0
        if not result['Value']['HeartBeatTime'] or result['Value']['HeartBeatTime'] == 'None':
            self.log.verbose('HeartBeatTime is null for job %s' % job)
        else:
            latestUpdate = toEpoch(fromString(result['Value']['HeartBeatTime']))

        if not result['Value']['LastUpdateTime'] or result['Value']['LastUpdateTime'] == 'None':
            self.log.verbose('LastUpdateTime is null for job %s' % job)
        else:
            lastUpdate = toEpoch(fromString(result['Value']['LastUpdateTime']))
            if latestUpdate < lastUpdate:
                latestUpdate = lastUpdate

        if not latestUpdate:
            return S_ERROR('LastUpdate and HeartBeat times are null for job %s' % job)
        else:
            self.log.verbose('Latest update time from epoch for job %s is %s' % (job, latestUpdate))
            return S_OK(latestUpdate)

    #############################################################################
    def __updateJobStatus(self, job, status, minorstatus=None):
        """ This method updates the job status in the JobDB, this should only be
            used to fail jobs due to the optimizer chain.
        """
        self.log.verbose("self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (job, status))
        if self.am_getOption('Enable', True):
            result = self.jobDB.setJobAttribute(job, 'Status', status, update=True)
        else:
            result = S_OK('DisabledMode')

        if result['OK']:
            if minorstatus:
                self.log.verbose("self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (job, minorstatus))
                result = self.jobDB.setJobAttribute(job, 'MinorStatus', minorstatus, update=True)

        if not minorstatus:  # Retain last minor status for stalled jobs
            result = self.jobDB.getJobAttributes(job, ['MinorStatus'])
            if result['OK']:
                minorstatus = result['Value']['MinorStatus']

        logStatus = status
        result = self.logDB.addLoggingRecord(job, status=logStatus, minor=minorstatus,
                                             source='StalledJobAgent')
        if not result['OK']:
            self.log.warn(result)

        return result

    def __getProcessingType(self, jobID):
        """ Get the Processing Type from the JDL, until it is promoted to a real Attribute
        """
        processingType = 'unknown'
        result = self.jobDB.getJobJDL(jobID, original=True)
        if not result['OK']:
            return processingType
        classAdJob = ClassAd(result['Value'])
        if classAdJob.lookupAttribute('ProcessingType'):
            processingType = classAdJob.getAttributeString('ProcessingType')
        return processingType

    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job
        """
        try:
            accountingReport = Job()
            endTime = 'Unknown'
            lastHeartBeatTime = 'Unknown'

            result = self.jobDB.getJobAttributes(jobID)
            if not result['OK']:
                return result
            jobDict = result['Value']

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            result = JobMonitoringClient().getJobParameter(jobID, 'CPUNormalizationFactor')
            if not result['OK'] or not result['Value']:
                self.log.error('Error getting Job Parameter CPUNormalizationFactor, setting 0',
                               result['Message'])
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(result['Value'].get('CPUNormalizationFactor'))

        except Exception as e:
            self.log.exception("Exception in __sendAccounting",
                               "for job=%s: endTime=%s, lastHBTime=%s" %
                               (str(jobID), str(endTime), str(lastHeartBeatTime)),
                               lException=e)
            return S_ERROR("Exception")

        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            'Site': jobDict['Site'],
            'User': jobDict['Owner'],
            'UserGroup': jobDict['OwnerGroup'],
            'JobGroup': jobDict['JobGroup'],
            'JobType': jobDict['JobType'],
            'JobClass': jobDict['JobSplitType'],
            'ProcessingType': processingType,
            'FinalMajorStatus': 'Failed',
            'FinalMinorStatus': 'Stalled',
            'CPUTime': lastCPUTime,
            'NormCPUTime': lastCPUTime * cpuNormalization,
            'ExecTime': lastWallTime,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': 0,
            'OutputDataFiles': 0,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData['ExecTime']:
            acData['ExecTime'] = acData['CPUTime']
        elif acData['ExecTime'] < acData['CPUTime']:
            acData['ExecTime'] = acData['CPUTime']

        self.log.verbose('Accounting Report is:')
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result['OK']:
            self.jobDB.setJobAttribute(jobID, 'AccountedFlag',
'True') else: self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % (int(jobID), result['Message'])) return result def __checkHeartBeat(self, jobID, jobDict): """ Get info from HeartBeat """ result = self.jobDB.getHeartBeatData(jobID) lastCPUTime = 0 lastWallTime = 0 lastHeartBeatTime = jobDict['StartExecTime'] if lastHeartBeatTime == "None": lastHeartBeatTime = 0 if result['OK']: for name, value, heartBeatTime in result['Value']: if name == 'CPUConsumed': try: value = int(float(value)) if value > lastCPUTime: lastCPUTime = value except ValueError: pass if name == 'WallClockTime': try: value = int(float(value)) if value > lastWallTime: lastWallTime = value except ValueError: pass if heartBeatTime > lastHeartBeatTime: lastHeartBeatTime = heartBeatTime return lastCPUTime, lastWallTime, lastHeartBeatTime def __checkLoggingInfo(self, jobID, jobDict): """ Get info from JobLogging """ logList = [] result = self.logDB.getJobLoggingInfo(jobID) if result['OK']: logList = result['Value'] startTime = jobDict['StartExecTime'] if not startTime or startTime == 'None': # status, minor, app, stime, source for items in logList: if items[0] == 'Running': startTime = items[3] break if not startTime or startTime == 'None': startTime = jobDict['SubmissionTime'] if isinstance(startTime, basestring): startTime = fromString(startTime) if startTime is None: self.log.error('Wrong timestamp in DB', items[3]) startTime = dateTime() endTime = dateTime() # status, minor, app, stime, source for items in logList: if items[0] == 'Stalled': endTime = fromString(items[3]) if endTime is None: self.log.error('Wrong timestamp in DB', items[3]) endTime = dateTime() return startTime, endTime def __kickStuckJobs(self): """ Reschedule jobs stuck in initialization status Rescheduled, Matched """ message = '' checkTime = str(dateTime() - self.matchedTime * second) result = self.jobDB.selectJobs({'Status': 'Matched'}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Matched status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: message = 'Failed to reschedule %d jobs stuck in Matched status' % len( result['FailedJobs']) checkTime = str(dateTime() - self.rescheduledTime * second) result = self.jobDB.selectJobs({'Status': 'Rescheduled'}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Rescheduled status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: if message: message += '\n' message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len( result['FailedJobs']) if message: return S_ERROR(message) return S_OK() def __failCompletedJobs(self): """ Failed Jobs stuck in Completed Status for a long time. They are due to pilots being killed during the finalization of the job execution. 
""" # Get old Completed Jobs checkTime = str(dateTime() - self.completedTime * second) result = self.jobDB.selectJobs({'Status': 'Completed'}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if not jobIDs: return S_OK() # Remove those with Minor Status "Pending Requests" for jobID in jobIDs: result = self.jobDB.getJobAttributes(jobID, ['Status', 'MinorStatus']) if not result['OK']: self.log.error('Failed to get job attributes', result['Message']) continue if result['Value']['Status'] != "Completed": continue if result['Value']['MinorStatus'] == "Pending Requests": continue result = self.__updateJobStatus(jobID, 'Failed', "Job died during finalization") result = self.__sendAccounting(jobID) if not result['OK']: self.log.error('Failed to send accounting', result['Message']) continue return S_OK() def __failSubmittingJobs(self): """ Failed Jobs stuck in Submitting Status for a long time. They are due to a failed bulk submission transaction. """ # Get old Submitting Jobs checkTime = str(dateTime() - self.submittingTime * second) result = self.jobDB.selectJobs({'Status': 'Submitting'}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if not jobIDs: return S_OK() for jobID in jobIDs: result = self.__updateJobStatus(jobID, 'Failed') if not result['OK']: self.log.error('Failed to update job status', result['Message']) continue return S_OK() def __sendKillCommand(self, job): """Send a kill signal to the job such that it cannot continue running. :param int job: ID of job to send kill command """ ownerDN = self.jobDB.getJobAttribute(job, 'OwnerDN') ownerGroup = self.jobDB.getJobAttribute(job, 'OwnerGroup') if ownerDN['OK'] and ownerGroup['OK']: wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN['Value'], delegatedGroup=ownerGroup['Value']) resKill = wmsClient.killJob(job) if not resKill['OK']: self.log.error("Failed to send kill command to job", "%s: %s" % (job, resKill['Message'])) else: self.log.error( "Failed to get ownerDN or Group for job:", "%s: %s, %s" % (job, ownerDN.get( 'Message', ''), ownerGroup.get('Message', '')))
class JobCleaningAgent(AgentModule):
    """
    The specific agents must provide the following methods:
      * initialize() for initial settings
      * beginExecution()
      * execute() - the main method called in the agent cycle
      * endExecution()
      * finalize() - the graceful exit of the module, usually used for the agent restart
    """

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)

        # clients
        # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
        self.jobDB = None
        self.taskQueueDB = None
        self.jobLoggingDB = None

        self.maxJobsAtOnce = 100
        self.jobByJob = False
        self.throttlingPeriod = 0.

        self.prodTypes = []

        self.removeStatusDelay = {}

    #############################################################################
    def initialize(self):
        """ Sets defaults
        """
        self.am_setOption("PollingTime", 120)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        self.jobLoggingDB = JobLoggingDB()
        # self.sandboxDB = SandboxDB('SandboxDB')
        agentTSTypes = self.am_getOption('ProductionTypes', [])
        if agentTSTypes:
            self.prodTypes = agentTSTypes
        else:
            self.prodTypes = Operations().getValue('Transformations/DataProcessing',
                                                   ['MCSimulation', 'Merge'])
        gLogger.info("Will exclude the following Production types from cleaning %s" %
                     (', '.join(self.prodTypes)))
        self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
        self.jobByJob = self.am_getOption('JobByJob', False)
        self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

        self.removeStatusDelay['Done'] = self.am_getOption('RemoveStatusDelay/Done', 7)
        self.removeStatusDelay['Killed'] = self.am_getOption('RemoveStatusDelay/Killed', 7)
        self.removeStatusDelay['Failed'] = self.am_getOption('RemoveStatusDelay/Failed', 7)
        self.removeStatusDelay['Any'] = self.am_getOption('RemoveStatusDelay/Any', -1)

        return S_OK()

    def __getAllowedJobTypes(self):
        """ Get valid jobTypes
        """
        result = self.jobDB.getDistinctJobAttributes('JobType')
        if not result['OK']:
            return result
        cleanJobTypes = []
        for jobType in result['Value']:
            if jobType not in self.prodTypes:
                cleanJobTypes.append(jobType)
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    #############################################################################
    def execute(self):
        """ Remove jobs in various statuses
        """
        # Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({'Status': 'Deleted'})
        if not result['OK']:
            return result
        # Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result['OK']:
            return result

        # No jobs in the system subject to removal
        if not result['Value']:
            return S_OK()

        baseCond = {'JobType': result['Value']}
        # Remove jobs with final status
        for status in self.removeStatusDelay:
            delay = self.removeStatusDelay[status]
            if delay < 0:
                # Negative delay means don't delete anything...
                continue
            condDict = dict(baseCond)
            if status != 'Any':
                condDict['Status'] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.removeJobsByStatus(condDict, delTime)
            if not result['OK']:
                gLogger.warn('Failed to remove jobs in status %s' % status)
        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """ Remove jobs matching the given conditions, optionally only those
            older than the given cutoff time
        """
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s day(s)" % (condDict, delay))
            result = self.jobDB.selectJobs(condDict, older=delay, limit=self.maxJobsAtOnce)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)

        if not result['OK']:
            return result

        jobList = result['Value']
        if len(jobList) > self.maxJobsAtOnce:
            jobList = jobList[:self.maxJobsAtOnce]
        if not jobList:
            return S_OK()

        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result['OK']:
            gLogger.error("Cannot unassign jobs to sandboxes", result['Message'])
            return result

        result = self.deleteJobOversizedSandbox(jobList)
        if not result['OK']:
            gLogger.error("Cannot schedule removal of oversized sandboxes", result['Message'])
            return result

        failedJobs = result['Value']['Failed']
        for job in failedJobs:
            jobList.pop(jobList.index(job))

        # TODO: we should not remove a job if it still has requests in the RequestManager.
        # But this logic should go in the client or in the service, and right now
        # no service exposes jobDB.removeJobFromDB

        if self.jobByJob:
            for jobID in jobList:
                resultJobDB = self.jobDB.removeJobFromDB(jobID)
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                resultLogDB = self.jobLoggingDB.deleteJob(jobID)
                errorFlag = False
                if not resultJobDB['OK']:
                    gLogger.warn('Failed to remove job %d from JobDB' % jobID,
                                 resultJobDB['Message'])
                    errorFlag = True
                if not resultTQ['OK']:
                    gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID,
                                 resultTQ['Message'])
                    errorFlag = True
                if not resultLogDB['OK']:
                    gLogger.warn('Failed to remove job %d from JobLoggingDB' % jobID,
                                 resultLogDB['Message'])
                    errorFlag = True
                if errorFlag:
                    error_count += 1
                else:
                    count += 1
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobDB' % len(jobList))

            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ['OK']:
                    gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID,
                                 resultTQ['Message'])
                    error_count += 1
                else:
                    count += 1

            result = self.jobLoggingDB.deleteJob(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList))

        if count > 0 or error_count > 0:
            gLogger.info('Deleted %d jobs from JobDB, %d errors' % (count, error_count))
        return S_OK()

    def deleteJobOversizedSandbox(self, jobIDList):
        """ Delete the job oversized sandbox files from storage elements
        """
        failed = {}
        successful = {}

        lfnDict = {}
        for jobID in jobIDList:
            result = JobMonitoringClient().getJobParameter(jobID, 'OutputSandboxLFN')
            if result['OK']:
                lfn = result['Value'].get('OutputSandboxLFN')
                if lfn:
                    lfnDict[lfn] = jobID
                else:
                    successful[jobID] = 'No oversized sandbox found'
            else:
                gLogger.error('Error interrogating JobDB: %s' % result['Message'])
        if not lfnDict:
            return S_OK({'Successful': successful, 'Failed': failed})

        # Schedule removal of the LFNs now
        for lfn, jobID in lfnDict.items():
            result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
            if not result['OK']:
                failed[jobID] = lfn
                continue
            if not result['Value']:
                failed[jobID] = lfn
                continue

            ownerDN = result['Value']['OwnerDN']
            ownerGroup = result['Value']['OwnerGroup']
            result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if not result['OK']:
                failed[jobID] = lfn
            else:
                successful[jobID] = lfn

        result = {'Successful': successful, 'Failed': failed}
        return S_OK(result)

    def __setRemovalRequest(self, lfn, ownerDN, ownerGroup):
        """ Set removal request with the given credentials
        """
        oRequest = Request()
        oRequest.OwnerDN = ownerDN
        oRequest.OwnerGroup = ownerGroup
        oRequest.RequestName = os.path.basename(lfn).strip() + '_removal_request.xml'
        oRequest.SourceComponent = 'JobCleaningAgent'

        removeFile = Operation()
        removeFile.Type = 'RemoveFile'

        removedFile = File()
        removedFile.LFN = lfn
        removeFile.addFile(removedFile)

        oRequest.addOperation(removeFile)

        return ReqClient().putRequest(oRequest)
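# For reference, a minimal sketch of how a per-status removal delay becomes a
# JobDB selection, as in execute()/removeJobsByStatus above: build the
# condition dict, subtract the delay from now, and select only jobs older
# than the cutoff. selectRemovableJobs is a hypothetical helper and the
# default status, delay, and limit are illustrative.
from DIRAC.Core.Utilities import Time
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB

def selectRemovableJobs(jobTypes, status='Done', delayDays=7, limit=500):
    condDict = {'JobType': jobTypes, 'Status': status}
    cutoff = str(Time.dateTime() - delayDays * Time.day)
    return JobDB().selectJobs(condDict, older=cutoff, limit=limit)

# e.g. selectRemovableJobs(['User', 'test'])['Value'] -> list of job IDs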
class StatesAccountingAgent(AgentModule):
    """
    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the module, usually used for the agent restart
    """

    __summaryKeyFieldsMapping = ['Status',
                                 'Site',
                                 'User',
                                 'UserGroup',
                                 'JobGroup',
                                 'JobType']
    __summaryDefinedFields = [('ApplicationStatus', 'unset'),
                              ('MinorStatus', 'unset')]
    __summaryValueFieldsMapping = ['Jobs',
                                   'Reschedules']
    __renameFieldsMapping = {'JobType': 'JobSplitType'}

    def initialize(self):
        """ Standard initialization
        """
        self.dsClients = {}
        self.jobDB = JobDB()
        self.retryOnce = False
        self.retryValues = []

        self.reportPeriod = 850
        self.am_setOption("PollingTime", self.reportPeriod)
        self.__jobDBFields = []
        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)
        return S_OK()

    def execute(self):
        """ Main execution method
        """
        result = gConfig.getSections("/DIRAC/Setups")
        if not result['OK']:
            return result
        validSetups = result['Value']
        self.log.info("Valid setups for this cycle are %s" % ", ".join(validSetups))
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error("Can't get the JobDB summary",
                           "%s: won't commit at this cycle" % result['Message'])
        else:
            values = result['Value'][1]

            if self.retryOnce:
                self.log.verbose("Adding records not committed within the previous cycle "
                                 "to those to commit")
            acWMSListAdded = []

            for record in values:
                recordSetup = record[0]
                if recordSetup not in validSetups:
                    self.log.error("Setup %s is not valid" % recordSetup)
                    continue
                if recordSetup not in self.dsClients:
                    self.log.info("Creating DataStore client for %s" % recordSetup)
                    self.dsClients[recordSetup] = DataStoreClient(setup=recordSetup,
                                                                  retryGraceTime=900)
                record = record[1:]
                rD = {}
                for fV in self.__summaryDefinedFields:
                    rD[fV[0]] = fV[1]
                for iP in range(len(self.__summaryKeyFieldsMapping)):
                    fieldName = self.__summaryKeyFieldsMapping[iP]
                    rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP]
                record = record[len(self.__summaryKeyFieldsMapping):]
                for iP in range(len(self.__summaryValueFieldsMapping)):
                    rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])

                acWMS = WMSHistory()
                acWMS.setStartTime(now)
                acWMS.setEndTime(now)
                acWMS.setValuesFromDict(rD)
                retVal = acWMS.checkValues()
                if not retVal['OK']:
                    self.log.error("Invalid accounting record ",
                                   "%s -> %s" % (retVal['Message'], rD))
                else:
                    self.dsClients[recordSetup].addRegister(acWMS)
                    acWMSListAdded.append(acWMS)

            if self.retryOnce and self.retryValues:
                for acWMSCumulated in self.retryValues:
                    retVal = acWMSCumulated.checkValues()
                    if not retVal['OK']:
                        self.log.error("Invalid accounting record ", "%s" % (retVal['Message']))
                    else:
                        self.dsClients[recordSetup].addRegister(acWMSCumulated)

            for setup in self.dsClients:
                self.log.info("Sending records for setup %s" % setup)
                result = self.dsClients[setup].commit()
                if not result['OK']:
                    self.log.error("Couldn't commit wms history for setup %s" % setup,
                                   result['Message'])
                    # Re-creating the client: for a new connection, and to avoid
                    # accumulating too large a backlog
                    self.dsClients[setup] = DataStoreClient(setup=setup, retryGraceTime=900)
                    if not self.retryOnce:
                        self.log.info("Will try again at next cycle")
                        self.retryOnce = True
                        self.retryValues = acWMSListAdded
                    else:
                        self.log.warn("Won't retry one more time")
                        self.retryOnce = False
                        self.retryValues = []
                else:
                    self.log.info("Sent %s records for setup %s" % (result['Value'], setup))
                    self.retryOnce = False
        return S_OK()
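# For reference, the record-to-dictionary mapping performed in the snapshot
# loop above can be sketched standalone: a snapshot row is
# [setup, <key fields...>, <value fields...>]; key fields are renamed where
# needed (JobType -> JobSplitType) and value fields are cast to int.
# recordToDict is a hypothetical helper and the sample row is made up.
KEY_FIELDS = ['Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType']
VALUE_FIELDS = ['Jobs', 'Reschedules']
RENAME = {'JobType': 'JobSplitType'}

def recordToDict(record):
    rD = {'ApplicationStatus': 'unset', 'MinorStatus': 'unset'}
    body = record[1:]  # drop the setup column
    for name, value in zip(KEY_FIELDS, body[:len(KEY_FIELDS)]):
        rD[RENAME.get(name, name)] = value
    for name, value in zip(VALUE_FIELDS, body[len(KEY_FIELDS):]):
        rD[name] = int(value)
    return rD

# e.g. recordToDict(['SomeSetup', 'Running', 'LCG.CERN.ch', 'jdoe',
#                    'dirac_user', 'NoGroup', 'User', '42', '3'])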
class MightyOptimizer(AgentModule):
    """
    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the module, usually used for the agent restart
    """

    __jobStates = ['Received', 'Checking']

    def initialize(self):
        """ Standard initialization
        """
        self.jobDB = JobDB()
        self.jobLoggingDB = JobLoggingDB()
        self._optimizers = {}
        self.am_setOption("PollingTime", 30)
        return S_OK()

    def execute(self):
        """ The method called by AgentModule on each iteration
        """
        jobTypeCondition = self.am_getOption("JobTypeRestriction", [])
        jobCond = {'Status': self.__jobStates}
        if jobTypeCondition:
            jobCond['JobType'] = jobTypeCondition
        result = self.jobDB.selectJobs(jobCond)
        if not result['OK']:
            return result
        jobsList = result['Value']
        self.log.info("Got %s jobs for this iteration" % len(jobsList))
        if not jobsList:
            return S_OK()
        result = self.jobDB.getAttributesForJobList(jobsList)
        if not result['OK']:
            return result
        jobsToProcess = result['Value']
        for jobId in jobsToProcess:
            self.log.info("== Processing job %s == " % jobId)
            jobAttrs = jobsToProcess[jobId]
            jobDef = False
            jobOptimized = False
            jobOK = True
            while not jobOptimized:
                result = self.optimizeJob(jobId, jobAttrs, jobDef)
                if not result['OK']:
                    self.log.error("Optimizer %s error" % jobAttrs['MinorStatus'],
                                   "Job %s: %s" % (str(jobId), result['Message']))
                    jobOK = False
                    break
                optResult = result['Value']
                jobOptimized = optResult['done']
                if 'jobDef' in optResult:
                    jobDef = optResult['jobDef']
            if jobOK:
                self.log.info("Finished optimizing job %s" % jobId)
        return S_OK()

    def optimizeJob(self, jobId, jobAttrs, jobDef):
        """ The method called for each Job to be optimized
        """
        # Get the next optimizer
        result = self._getNextOptimizer(jobAttrs)
        if not result['OK']:
            return result
        optimizer = result['Value']
        if not optimizer:
            return S_OK({'done': True})
        # If there's no job def then get it
        if not jobDef:
            result = optimizer.getJobDefinition(jobId, jobDef)
            if not result['OK']:
                optimizer.setFailedJob(jobId, result['Message'])
                return result
            jobDef = result['Value']
        # Does the optimizer require a proxy?
        shifterEnv = False
        if optimizer.am_getModuleParam('shifterProxy'):
            shifterEnv = True
            result = setupShifterProxyInEnv(optimizer.am_getModuleParam('shifterProxy'),
                                            optimizer.am_getShifterProxyLocation())
            if not result['OK']:
                return result
        # Call the initCycle function
        result = self.am_secureCall(optimizer.beginExecution, name="beginExecution")
        if not result['OK']:
            return result
        # Do the work
        result = optimizer.optimizeJob(jobId, jobDef['classad'])
        if not result['OK']:
            return result
        nextOptimizer = result['Value']
        # If there was a shifter proxy, unset it
        if shifterEnv:
            del os.environ['X509_USER_PROXY']
        # Check if the JDL has changed
        newJDL = jobDef['classad'].asJDL()
        if newJDL != jobDef['jdl']:
            jobDef['jdl'] = newJDL
        # If there's a new optimizer set it!
        if nextOptimizer:
            jobAttrs['Status'] = 'Checking'
            jobAttrs['MinorStatus'] = nextOptimizer
            return S_OK({'done': False, 'jobDef': jobDef})
        return S_OK({'done': True, 'jobDef': jobDef})

    def _getNextOptimizer(self, jobAttrs):
        """ Determine next Optimizer in the Path
        """
        if jobAttrs['Status'] == 'Received':
            nextOptimizer = "JobPath"
        else:
            nextOptimizer = jobAttrs['MinorStatus']
        if nextOptimizer in self.am_getOption("FilteredOptimizers",
                                              ["InputData", "BKInputData"]):
            return S_OK(False)
        gLogger.info("Next optimizer for job %s is %s" % (jobAttrs['JobID'], nextOptimizer))
        if nextOptimizer not in self._optimizers:
            result = self.__loadOptimizer(nextOptimizer)
            if not result['OK']:
                return result
            self._optimizers[nextOptimizer] = result['Value']
        return S_OK(self._optimizers[nextOptimizer])

    @gOptimizerLoadSync
    def __loadOptimizer(self, optimizerName):
        """ Need to load an optimizer
        """
        gLogger.info("Loading optimizer %s" % optimizerName)
        try:
            agentName = "%sAgent" % optimizerName
            optimizerModule = __import__('DIRAC.WorkloadManagementSystem.Agent.%s' % agentName,
                                         globals(),
                                         locals(),
                                         agentName)
            optimizerClass = getattr(optimizerModule, agentName)
            optimizer = optimizerClass("WorkloadManagement/%s" % agentName,
                                       self.am_getModuleParam('fullName'))
            result = optimizer.am_initialize(self.jobDB, self.jobLoggingDB)
            if not result['OK']:
                return S_ERROR("Can't initialize optimizer %s: %s" %
                               (optimizerName, result['Message']))
        except Exception as e:
            gLogger.exception("LOADERROR")
            return S_ERROR("Can't load optimizer %s: %s" % (optimizerName, str(e)))
        return S_OK(optimizer)
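# For reference, the dynamic loading in __loadOptimizer above reduces to this
# import pattern: derive the agent module name from the optimizer name and
# pull the class of the same name out of the imported module.
# loadOptimizerClass is a hypothetical helper shown without the error
# handling and agent initialization the method performs.
def loadOptimizerClass(optimizerName):
    agentName = '%sAgent' % optimizerName
    module = __import__('DIRAC.WorkloadManagementSystem.Agent.%s' % agentName,
                        globals(), locals(), agentName)
    return getattr(module, agentName)

# e.g. loadOptimizerClass('JobPath') returns the JobPathAgent class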