def test02Props(self):
    """Assign valid values to Request properties and check what is read back."""
    epoch = datetime.datetime(1970, 1, 1, 0, 0, 0)
    request = Request()

    # (property, value assigned, value expected on read); the string "1"
    # assigned to JobID is expected to read back as the integer 1.
    for propName, assigned, expected in (
        ("RequestID", 1, 1),
        ("RequestName", "test", "test"),
        ("JobID", 1, 1),
        ("JobID", "1", 1),
    ):
        setattr(request, propName, assigned)
        self.assertEqual(getattr(request, propName), expected)

    # Every time stamp is set first from its string form, then from a datetime.
    for propName in ("CreationTime", "SubmitTime", "LastUpdate"):
        for assigned in ("1970-01-01 00:00:00", epoch):
            setattr(request, propName, assigned)
            self.assertEqual(getattr(request, propName), epoch)

    request.Error = ""
def myRequest():
    """Create a file-removal request, validate it and put it to the db.

    A Request holding a single "RemoveFile" Operation with one File is built,
    checked with RequestValidator and then handed to ReqClient.putRequest.
    The result of putRequest is printed, not returned.

    :raises RuntimeError: if RequestValidator reports the request as not valid
    """
    request = Request()
    request.RequestName = 'myAwesomeRemovalRequest.xml'
    request.JobID = 0
    request.SourceComponent = "myScript"

    remove = Operation()
    remove.Type = "RemoveFile"

    lfn = "/ilc/user/s/sailer/test.txt"
    rmFile = File()
    rmFile.LFN = lfn
    remove.addFile(rmFile)

    request.addOperation(remove)

    isValid = RequestValidator().validate(request)
    if not isValid['OK']:
        raise RuntimeError("Failover request is not valid: %s" % isValid['Message'])

    # print() with a single argument behaves the same on Python 2 and Python 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print("It is a GOGOGO")
    requestClient = ReqClient()
    result = requestClient.putRequest(request)
    print(result)
def test_Props():
    """Valid values assigned to Request properties are read back unchanged."""
    epoch = datetime.datetime(1970, 1, 1, 0, 0, 0)
    request = Request()

    # Plain scalar properties: what goes in is what comes out.
    for propName, value in (("RequestID", 1), ("RequestName", "test"), ("JobID", 1)):
        setattr(request, propName, value)
        assert getattr(request, propName) == value

    # Time stamps accept both the string form and a datetime instance.
    for propName in ("CreationTime", "SubmitTime", "LastUpdate"):
        for assigned in ("1970-01-01 00:00:00", epoch):
            setattr(request, propName, assigned)
            assert getattr(request, propName) == epoch

    request.Error = ""
def myRequest():
    """Build a single-file removal request, validate it and submit it.

    Raises RuntimeError when the validator does not accept the request;
    otherwise the request is put through ReqClient and the result is printed.
    """
    req = Request()
    req.RequestName = 'myAwesomeRemovalRequest.xml'
    req.JobID = 0
    req.SourceComponent = "myScript"

    removeOp = Operation()
    removeOp.Type = "RemoveFile"

    fileToRemove = File()
    fileToRemove.LFN = "/ilc/user/s/sailer/test.txt"
    removeOp.addFile(fileToRemove)

    req.addOperation(removeOp)

    validation = RequestValidator().validate(req)
    if not validation['OK']:
        raise RuntimeError("Failover request is not valid: %s" % validation['Message'])

    # Only reached for a valid request.
    print("It is a GOGOGO")
    putResult = ReqClient().putRequest(req)
    print(putResult)
def test02Props(self):
    """Round-trip valid values through the Request properties."""
    epoch = datetime.datetime(1970, 1, 1, 0, 0, 0)
    epochAsString = "1970-01-01 00:00:00"
    request = Request()

    def roundTrip(propName, assigned, expected):
        """Assign *assigned* to *propName* and assert it reads back as *expected*."""
        setattr(request, propName, assigned)
        self.assertEqual(getattr(request, propName), expected)

    roundTrip("RequestID", 1, 1)
    roundTrip("RequestName", "test", "test")
    roundTrip("JobID", 1, 1)
    # A numeric string is expected to read back as an integer.
    roundTrip("JobID", "1", 1)

    # Time stamps: string form first, then a datetime instance.
    for timeProp in ("CreationTime", "SubmitTime", "LastUpdate"):
        roundTrip(timeProp, epochAsString, epoch)
        roundTrip(timeProp, epoch, epoch)

    request.Error = ""
def execute(self):
    """Run one JobAgent cycle: match a job, prepare it and submit it to the CE.

    The cycle ends early, mostly through ``self._finish``, when the drain file
    ``/var/lib/dirac_drain`` exists, when a CE availability / CPU work / CE
    description check does not succeed, when no job is matched, or when the
    matcher information, JDL parameters, proxy setup, software installation
    or submission of the matched job fails.

    :return: the result structure of the step that ended the cycle (either
        returned as is or produced by ``self._finish``), or
        ``S_OK("Job Agent cycle complete")`` when the cycle ran to the end
    """
    # Temporary mechanism to pass a shutdown message to the agent
    if os.path.exists("/var/lib/dirac_drain"):
        return self._finish("Node is being drained by an operator")

    self.log.verbose("Job Agent execution loop")

    # Check that there is enough slots to match a job
    result = self._checkCEAvailability(self.computingElement)
    if not result["OK"]:
        return self._finish(result["Message"])
    # NOTE(review): a truthy value presumably means no slot is free and the
    # cycle should end without an error - confirm against _checkCEAvailability
    if result["OK"] and result["Value"]:
        return result

    # Check that we are allowed to continue and that time left is sufficient
    # (only done once at least one job has been picked up)
    if self.jobCount:
        cpuWorkLeft = self._computeCPUWorkLeft()
        result = self._checkCPUWorkLeft(cpuWorkLeft)
        if not result["OK"]:
            return result
        result = self._setCPUWorkLeft(cpuWorkLeft)
        if not result["OK"]:
            return result

    # Get environment details and enhance them
    result = self._getCEDict(self.computingElement)
    if not result["OK"]:
        return result
    ceDictList = result["Value"]

    for ceDict in ceDictList:
        self._setCEDict(ceDict)

    # Try to match a job
    jobRequest = self._matchAJob(ceDictList)

    self.stopAfterFailedMatches = self.am_getOption(
        "StopAfterFailedMatches", self.stopAfterFailedMatches)
    if not jobRequest["OK"]:
        res = self._checkMatchingIssues(jobRequest)
        if not res["OK"]:
            # _finish is called for its side effects only; the matching
            # issue itself is what gets returned
            self._finish(res["Message"])
            return res

        # if we don't match a job, independently from the reason,
        # we wait a bit longer before trying again
        time.sleep(
            int(self.am_getOption("PollingTime")) * (self.matchFailedCount + 1) * 2)
        return res

    # If we are, we matched a job
    # Reset the Counter
    self.matchFailedCount = 0

    # Check matcher information returned
    matcherParams = ["JDL", "DN", "Group"]
    matcherInfo = jobRequest["Value"]
    jobID = matcherInfo["JobID"]
    jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
    result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
    if not result["OK"]:
        return self._finish(result["Message"])

    # Get matcher information
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get(
            "PilotInfoReportedFlag", False)

    jobJDL = matcherInfo["JDL"]
    jobGroup = matcherInfo["Group"]
    ownerDN = matcherInfo["DN"]
    ceDict = matcherInfo["CEDict"]
    matchTime = matcherInfo["matchTime"]

    # Everything the matcher returned besides JDL/DN/Group is passed on
    # to the submission as optimizer parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    # Get JDL parameters
    parameters = self._getJDLParameters(jobJDL)
    if not parameters["OK"]:
        jobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus="Could Not Extract JDL Parameters")
        self.log.warn("Could Not Extract JDL Parameters", parameters["Message"])
        return self._finish("JDL Problem")

    params = parameters["Value"]
    result = self._extractValuesFromJobParams(params, jobReport)
    if not result["OK"]:
        # Failed results are read through "Message" in every other error
        # branch of this method; "Value" is only read on success
        return self._finish(result["Message"])
    submissionParams = result["Value"]
    jobID = submissionParams["jobID"]
    jobType = submissionParams["jobType"]

    self.log.verbose("Job request successful: \n", jobRequest["Value"])
    self.log.info(
        "Received",
        "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" % (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport.setJobParameter(par_name="MatcherServiceTime",
                                  par_value=str(matchTime),
                                  sendFlag=False)

        if "BOINC_JOB_ID" in os.environ:
            # Report BOINC environment
            for thisp in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                jobReport.setJobParameter(
                    par_name=thisp,
                    par_value=gConfig.getValue("/LocalSite/%s" % thisp, "Unknown"),
                    sendFlag=False)

        jobReport.setJobStatus(minorStatus="Job Received by Agent", sendFlag=False)
        result_setupProxy = self._setupProxy(ownerDN, jobGroup)
        if not result_setupProxy["OK"]:
            result = self._rescheduleFailedJob(
                jobID, result_setupProxy["Message"])
            return self._finish(result["Message"], self.stopOnApplicationFailure)
        proxyChain = result_setupProxy.get("Value")

        # Save the job jdl for external monitoring
        self._saveJobJDLRequest(jobID, jobJDL)

        # Check software and install them if required
        software = self._checkInstallSoftware(jobID, params, ceDict, jobReport)
        if not software["OK"]:
            self.log.error("Failed to install software for job", "%s" % (jobID))
            errorMsg = software["Message"]
            if not errorMsg:
                errorMsg = "Failed software installation"
            result = self._rescheduleFailedJob(jobID, errorMsg)
            return self._finish(result["Message"], self.stopOnApplicationFailure)

        gridCE = gConfig.getValue("/LocalSite/GridCE", "")
        if gridCE:
            jobReport.setJobParameter(par_name="GridCE", par_value=gridCE, sendFlag=False)

        queue = gConfig.getValue("/LocalSite/CEQueue", "")
        if queue:
            jobReport.setJobParameter(par_name="CEQueue", par_value=queue, sendFlag=False)

        self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
        result_submitJob = self._submitJob(
            jobID=jobID,
            jobParams=params,
            resourceParams=ceDict,
            optimizerParams=optimizerParams,
            proxyChain=proxyChain,
            jobReport=jobReport,
            processors=submissionParams["processors"],
            wholeNode=submissionParams["wholeNode"],
            maxNumberOfProcessors=submissionParams["maxNumberOfProcessors"],
            mpTag=submissionParams["mpTag"],
        )

        # Committing the JobReport before evaluating the result of job submission
        res = jobReport.commit()
        if not res["OK"]:
            resFD = jobReport.generateForwardDISET()
            if not resFD["OK"]:
                self.log.error("Error generating ForwardDISET operation", resFD["Message"])
            elif resFD["Value"]:
                # Here we create the Request.
                op = resFD["Value"]
                request = Request()
                requestName = "jobAgent_%s" % jobID
                request.RequestName = requestName.replace('"', "")
                request.JobID = jobID
                request.SourceComponent = "JobAgent_%s" % jobID
                request.addOperation(op)
                # This might fail, but only a message would be printed.
                self._sendFailoverRequest(request)

        if not result_submitJob["OK"]:
            return self._finish(result_submitJob["Message"])
        elif "PayloadFailed" in result_submitJob:
            # Do not keep running and do not overwrite the Payload error
            message = "Payload execution failed with error code %s" % result_submitJob[
                "PayloadFailed"]
            if self.stopOnApplicationFailure:
                return self._finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug("After %sCE submitJob()" % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        # Any unexpected failure while processing the matched job: log it,
        # reschedule the job and end the cycle
        self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
        result = self._rescheduleFailedJob(
            jobID, "Job processing failed with exception", direct=True)
        return self._finish(result["Message"], self.stopOnApplicationFailure)

    return S_OK("Job Agent cycle complete")
def test01fullChain(self):
    """Exercise the request life cycle through the request client.

    Puts ``self.request``, reads it back, runs the query calls, puts a second
    request, deletes both and checks the RequestDB summary after each stage.
    """
    # Local import: numbers.Integral matches int on Python 3 and both
    # int and long on Python 2, where the bare name ``long`` only exists
    # on Python 2.
    import numbers

    put = self.requestClient.putRequest(self.request)
    self.assertTrue(put['OK'], put)

    self.assertIsInstance(put['Value'], numbers.Integral)
    reqID = put['Value']

    # # summary
    ret = RequestDB().getDBSummary()
    self.assertEqual(ret,
                     {'OK': True,
                      'Value': {'Operation': {'ReplicateAndRegister': {'Waiting': 1}},
                                'Request': {'Waiting': 1},
                                'File': {'Waiting': 2}}})

    get = self.requestClient.getRequest(reqID)
    self.assertTrue(get['OK'])
    self.assertEqual(isinstance(get['Value'], Request), True)
    # # summary - the request became "Assigned"
    res = RequestDB().getDBSummary()
    self.assertEqual(res,
                     {'OK': True,
                      'Value': {'Operation': {'ReplicateAndRegister': {'Waiting': 1}},
                                'Request': {'Assigned': 1},
                                'File': {'Waiting': 2}}})

    # Query calls: each one only has to succeed; on failure the returned
    # 'Message' (when present) is used as the assertion message.
    res = self.requestClient.getRequestInfo(reqID)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getRequestFileStatus(reqID, self.file.LFN)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getRequestFileStatus(reqID, [self.file.LFN])
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getDigest(reqID)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.readRequestsForJobs([123])
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')
    self.assertTrue(isinstance(res['Value']['Successful'][123], Request))

    proxyInfo = getProxyInfo()['Value']
    # Adding new request
    request2 = Request()
    request2.RequestName = "RequestManagerHandlerTests-2"
    # NOTE(review): the owner is set on self.request, not on request2 -
    # confirm this is intended
    self.request.OwnerDN = proxyInfo['identity']
    self.request.OwnerGroup = proxyInfo['group']
    request2.JobID = 456
    request2.addOperation(self.operation)

    # # update
    res = self.requestClient.putRequest(request2)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')
    reqID2 = res['Value']

    # # get summary again
    ret = RequestDB().getDBSummary()
    self.assertEqual(ret,
                     {'OK': True,
                      'Value': {'Operation': {'ReplicateAndRegister': {'Waiting': 2}},
                                'Request': {'Waiting': 1, 'Assigned': 1},
                                'File': {'Waiting': 4}}})

    delete = self.requestClient.deleteRequest(reqID)
    self.assertEqual(delete['OK'], True, delete['Message'] if 'Message' in delete else 'OK')
    delete = self.requestClient.deleteRequest(reqID2)
    self.assertEqual(delete['OK'], True, delete['Message'] if 'Message' in delete else 'OK')

    # # should be empty now
    ret = RequestDB().getDBSummary()
    self.assertEqual(ret,
                     {'OK': True,
                      'Value': {'Operation': {},
                                'Request': {},
                                'File': {}}})
self.assert_( res['OK'] ) self.assertEqual( res['Value'], {'Successful': {123L:self.request.RequestName}, 'Failed': {}} ) res = self.requestClient.getRequestNamesList() self.assert_( res['OK'] ) res = self.requestClient.readRequestsForJobs( [123] ) self.assert_( res['OK'] ) self.assert_( isinstance( res['Value']['Successful'][123], Request ) ) # Adding new request request2 = Request() request2.RequestName = "RequestManagerHandlerTests-2" request2.OwnerDN = "/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=cibak/CN=605919/CN=Krzysztof Ciba" request2.OwnerGroup = "dirac_user" request2.JobID = 456 request2.addOperation( self.operation ) # # update res = self.requestClient.putRequest( request2 ) self.assert_( res['OK'] ) # # get summary again ret = RequestDB().getDBSummary() self.assertEqual( ret, { 'OK': True, 'Value': { 'Operation': { 'ReplicateAndRegister': {'Waiting': 2L } }, 'Request': { 'Waiting': 1L, 'Assigned': 1L }, 'File': { 'Waiting': 4L} } } )
def test01fullChain(self):
    """Exercise the request life cycle through the request client.

    Puts ``self.request``, reads it back, runs the query calls, puts a second
    request, deletes both and checks the summary returned by the client's
    ``getDBSummary`` after each stage.
    """
    # Local import: numbers.Integral matches int on Python 3 and both
    # int and long on Python 2, where the bare name ``long`` only exists
    # on Python 2.
    import numbers

    put = self.requestClient.putRequest(self.request)
    self.assertTrue(put['OK'], put)

    self.assertIsInstance(put['Value'], numbers.Integral)
    reqID = put['Value']

    # # summary
    ret = self.requestClient.getDBSummary()
    self.assertTrue(ret['OK'])
    self.assertEqual(ret['Value'],
                     {'Operation': {'ReplicateAndRegister': {'Waiting': 1}},
                      'Request': {'Waiting': 1},
                      'File': {'Waiting': 2}})

    get = self.requestClient.getRequest(reqID)
    self.assertTrue(get['OK'])
    self.assertEqual(isinstance(get['Value'], Request), True)
    # # summary - the request became "Assigned"
    res = self.requestClient.getDBSummary()
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'],
                     {'Operation': {'ReplicateAndRegister': {'Waiting': 1}},
                      'Request': {'Assigned': 1},
                      'File': {'Waiting': 2}})

    # Query calls: each one only has to succeed; on failure the returned
    # 'Message' (when present) is used as the assertion message.
    res = self.requestClient.getRequestInfo(reqID)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getRequestFileStatus(reqID, self.file.LFN)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getRequestFileStatus(reqID, [self.file.LFN])
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getDigest(reqID)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.readRequestsForJobs([123])
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')
    self.assertTrue(isinstance(res['Value']['Successful'][123], Request))

    proxyInfo = getProxyInfo()['Value']
    # Adding new request
    request2 = Request()
    request2.RequestName = "RequestManagerHandlerTests-2"
    # NOTE(review): the owner is set on self.request, not on request2 -
    # confirm this is intended
    self.request.OwnerDN = proxyInfo['identity']
    self.request.OwnerGroup = proxyInfo['group']
    request2.JobID = 456
    request2.addOperation(self.operation)

    # # update
    res = self.requestClient.putRequest(request2)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')
    reqID2 = res['Value']

    # # get summary again
    ret = self.requestClient.getDBSummary()
    self.assertTrue(ret['OK'])
    self.assertEqual(ret['Value'],
                     {'Operation': {'ReplicateAndRegister': {'Waiting': 2}},
                      'Request': {'Waiting': 1, 'Assigned': 1},
                      'File': {'Waiting': 4}})

    delete = self.requestClient.deleteRequest(reqID)
    self.assertEqual(delete['OK'], True, delete['Message'] if 'Message' in delete else 'OK')
    delete = self.requestClient.deleteRequest(reqID2)
    self.assertEqual(delete['OK'], True, delete['Message'] if 'Message' in delete else 'OK')

    # # should be empty now
    ret = self.requestClient.getDBSummary()
    self.assertTrue(ret['OK'])
    self.assertEqual(ret['Value'],
                     {'Operation': {},
                      'Request': {},
                      'File': {}})
'Failed': {} }) res = self.requestClient.getRequestNamesList() self.assert_(res['OK']) res = self.requestClient.readRequestsForJobs([123]) self.assert_(res['OK']) self.assert_(isinstance(res['Value']['Successful'][123], Request)) # Adding new request request2 = Request() request2.RequestName = "RequestManagerHandlerTests-2" request2.OwnerDN = "/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=cibak/CN=605919/CN=Krzysztof Ciba" request2.OwnerGroup = "dirac_user" request2.JobID = 456 request2.addOperation(self.operation) # # update res = self.requestClient.putRequest(request2) self.assert_(res['OK']) # # get summary again ret = RequestDB().getDBSummary() self.assertEqual( ret, { 'OK': True, 'Value': { 'Operation': { 'ReplicateAndRegister': { 'Waiting': 2L
def test01fullChain(self):
    """Exercise the request life cycle through the request client.

    Puts ``self.request``, reads it back, runs the query calls, puts a second
    request and checks the RequestDB summary after each stage.
    """
    # Local import: numbers.Integral matches int on Python 3 and both
    # int and long on Python 2, where the bare name ``long`` only exists
    # on Python 2.
    import numbers

    put = self.requestClient.putRequest(self.request)
    self.assertTrue(put['OK'])

    self.assertIsInstance(put['Value'], numbers.Integral)
    reqID = put['Value']

    # # summary
    # Expected counters are plain int literals: they compare equal to long
    # values on Python 2 and, unlike the ``1L`` spelling, parse on Python 3.
    ret = RequestDB().getDBSummary()
    self.assertEqual(ret,
                     {'OK': True,
                      'Value': {'Operation': {'ReplicateAndRegister': {'Waiting': 1}},
                                'Request': {'Waiting': 1},
                                'File': {'Waiting': 2}}})

    get = self.requestClient.getRequest(reqID)
    self.assertTrue(get['OK'])
    self.assertEqual(isinstance(get['Value'], Request), True)
    # # summary - the request became "Assigned"
    res = RequestDB().getDBSummary()
    self.assertEqual(res,
                     {'OK': True,
                      'Value': {'Operation': {'ReplicateAndRegister': {'Waiting': 1}},
                                'Request': {'Assigned': 1},
                                'File': {'Waiting': 2}}})

    # Query calls: each one only has to succeed; on failure the returned
    # 'Message' (when present) is used as the assertion message.
    res = self.requestClient.getRequestInfo(reqID)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getRequestFileStatus(reqID, self.file.LFN)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getRequestFileStatus(reqID, [self.file.LFN])
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.getDigest(reqID)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')

    res = self.requestClient.readRequestsForJobs([123])
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')
    self.assertTrue(isinstance(res['Value']['Successful'][123], Request))

    # Adding new request
    request2 = Request()
    request2.RequestName = "RequestManagerHandlerTests-2"
    request2.OwnerDN = "/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=cibak/CN=605919/CN=Krzysztof Ciba"
    request2.OwnerGroup = "dirac_user"
    request2.JobID = 456
    request2.addOperation(self.operation)

    # # update
    res = self.requestClient.putRequest(request2)
    self.assertEqual(res['OK'], True, res['Message'] if 'Message' in res else 'OK')
    reqID2 = res['Value']

    # # get summary again
    ret = RequestDB().getDBSummary()
    self.assertEqual(ret,
                     {'OK': True,
                      'Value': {'Operation': {'ReplicateAndRegister': {'Waiting': 2}},
                                'Request': {'Waiting': 1, 'Assigned': 1},
                                'File': {'Waiting': 4}}})
def execute(self):
    """Run one PushJobAgent cycle over the configured queues.

    The items of ``self.queueDict`` are visited in shuffled order. For each
    queue a pilot proxy is attached to the queue's CE, and jobs are matched
    and submitted in a loop for as long as the matcher keeps returning one.
    A problem with a queue or with a matched job increments
    ``self.failedQueues[queueName]`` and moves on to the next queue.

    :return: the result of the local CE availability check when it is not OK
        or carries a truthy value, the failed result of the proxy retrieval,
        or ``S_OK("Push Job Agent cycle complete")`` otherwise
    """
    self.log.verbose("Job Agent execution loop")

    # Shuffle a copy of the queue items so queues are tried in random order
    queueDictItems = list(self.queueDict.items())
    random.shuffle(queueDictItems)

    # Check that there is enough slots locally
    result = self._checkCEAvailability(self.computingElement)
    if not result["OK"] or result["Value"]:
        return result

    for queueName, queueDictionary in queueDictItems:

        # Make sure there is no problem with the queue before trying to submit
        if not self._allowedToSubmit(queueName):
            continue

        # Get a working proxy
        ce = queueDictionary["CE"]
        # Requested proxy duration passed to the proxy manager (3 * 86400)
        cpuTime = 86400 * 3
        self.log.verbose(
            "Getting pilot proxy",
            "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        proxy = result["Value"]
        result = proxy.getRemainingSecs()  # pylint: disable=no-member
        if not result["OK"]:
            return result
        lifetime_secs = result["Value"]
        ce.setProxy(proxy, lifetime_secs)

        # Check that there is enough slots in the remote CE to match a job
        result = self._checkCEAvailability(ce)
        if not result["OK"] or result["Value"]:
            self.failedQueues[queueName] += 1
            continue

        # Get environment details and enhance them
        result = self._getCEDict(ce)
        if not result["OK"]:
            self.failedQueues[queueName] += 1
            continue
        ceDictList = result["Value"]

        for ceDict in ceDictList:
            # Information about number of processors might not be returned in CE.getCEStatus()
            ceDict["NumberOfProcessors"] = ce.ceParameters.get(
                "NumberOfProcessors")
            self._setCEDict(ceDict)

        # Update the configuration with the names of the Site, CE and queue to target
        # This is used in the next stages
        self._updateConfiguration("Site", queueDictionary["Site"])
        self._updateConfiguration("GridCE", queueDictionary["CEName"])
        self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
        self._updateConfiguration("RemoteExecution", True)

        # Try to match a job
        jobRequest = self._matchAJob(ceDictList)
        # Keep going as long as the matcher returns a job for this queue;
        # every ``break`` below abandons the queue for this cycle
        while jobRequest["OK"]:

            # Check matcher information returned
            matcherParams = ["JDL", "DN", "Group"]
            matcherInfo = jobRequest["Value"]
            jobID = matcherInfo["JobID"]
            jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
            result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break

            jobJDL = matcherInfo["JDL"]
            jobGroup = matcherInfo["Group"]
            ownerDN = matcherInfo["DN"]
            ceDict = matcherInfo["CEDict"]
            matchTime = matcherInfo["matchTime"]

            # Everything the matcher returned besides JDL/DN/Group is passed
            # on to the submission as optimizer parameters
            optimizerParams = {}
            for key in matcherInfo:
                if key not in matcherParams:
                    optimizerParams[key] = matcherInfo[key]

            # Get JDL parameters
            parameters = self._getJDLParameters(jobJDL)
            if not parameters["OK"]:
                jobReport.setJobStatus(
                    status=JobStatus.FAILED,
                    minorStatus="Could Not Extract JDL Parameters")
                self.log.warn("Could Not Extract JDL Parameters", parameters["Message"])
                self.failedQueues[queueName] += 1
                break

            params = parameters["Value"]
            result = self._extractValuesFromJobParams(params, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break
            submissionParams = result["Value"]
            jobID = submissionParams["jobID"]
            jobType = submissionParams["jobType"]

            self.log.verbose("Job request successful: \n", jobRequest["Value"])
            self.log.info(
                "Received",
                "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" % (jobID, jobType, ownerDN, jobGroup))
            try:
                jobReport.setJobParameter(par_name="MatcherServiceTime",
                                          par_value=str(matchTime),
                                          sendFlag=False)
                jobReport.setJobStatus(status=JobStatus.MATCHED,
                                       minorStatus="Job Received by Agent",
                                       sendFlag=False)

                # Setup proxy
                result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                if not result_setupProxy["OK"]:
                    result = self._rescheduleFailedJob(
                        jobID, result_setupProxy["Message"])
                    self.failedQueues[queueName] += 1
                    break
                proxyChain = result_setupProxy.get("Value")

                # Check software and install them if required
                software = self._checkInstallSoftware(
                    jobID, params, ceDict, jobReport)
                if not software["OK"]:
                    self.log.error("Failed to install software for job",
                                   "%s" % (jobID))
                    errorMsg = software["Message"]
                    if not errorMsg:
                        errorMsg = "Failed software installation"
                    result = self._rescheduleFailedJob(jobID, errorMsg)
                    self.failedQueues[queueName] += 1
                    break

                # Submit the job to the CE
                self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
                result_submitJob = self._submitJob(
                    jobID=jobID,
                    jobParams=params,
                    resourceParams=ceDict,
                    optimizerParams=optimizerParams,
                    proxyChain=proxyChain,
                    jobReport=jobReport,
                    processors=submissionParams["processors"],
                    wholeNode=submissionParams["wholeNode"],
                    maxNumberOfProcessors=submissionParams[
                        "maxNumberOfProcessors"],
                    mpTag=submissionParams["mpTag"],
                )

                # Committing the JobReport before evaluating the result of job submission
                res = jobReport.commit()
                if not res["OK"]:
                    # The commit failed: wrap the pending report in a
                    # Request and hand it to _sendFailoverRequest
                    resFD = jobReport.generateForwardDISET()
                    if not resFD["OK"]:
                        self.log.error(
                            "Error generating ForwardDISET operation",
                            resFD["Message"])
                    elif resFD["Value"]:
                        # Here we create the Request.
                        op = resFD["Value"]
                        request = Request()
                        requestName = "jobAgent_%s" % jobID
                        request.RequestName = requestName.replace('"', "")
                        request.JobID = jobID
                        request.SourceComponent = "JobAgent_%s" % jobID
                        request.addOperation(op)
                        # This might fail, but only a message would be printed.
                        self._sendFailoverRequest(request)

                if not result_submitJob["OK"]:
                    self.log.error("Error during submission",
                                   result_submitJob["Message"])
                    self.failedQueues[queueName] += 1
                    break
                elif "PayloadFailed" in result_submitJob:
                    # Do not keep running and do not overwrite the Payload error
                    message = "Payload execution failed with error code %s" % result_submitJob[
                        "PayloadFailed"]
                    self.log.info(message)

                self.log.debug("After %sCE submitJob()" % (self.ceName))

                # Check that there is enough slots locally
                result = self._checkCEAvailability(self.computingElement)
                if not result["OK"] or result["Value"]:
                    return result

                # Check that there is enough slots in the remote CE to match a new job
                result = self._checkCEAvailability(ce)
                if not result["OK"] or result["Value"]:
                    self.failedQueues[queueName] += 1
                    break

                # Try to match a new job
                jobRequest = self._matchAJob(ceDictList)
            except Exception as subExcept:  # pylint: disable=broad-except
                # Any unexpected failure while processing the matched job:
                # log it, reschedule the job and give up on this queue
                self.log.exception("Exception in submission",
                                   "",
                                   lException=subExcept,
                                   lExcInfo=True)
                result = self._rescheduleFailedJob(
                    jobID, "Job processing failed with exception")
                self.failedQueues[queueName] += 1
                break

        # Reached after the while loop ends, whether by break or because
        # the last match request was not OK
        if not jobRequest["OK"]:
            self._checkMatchingIssues(jobRequest)
            self.failedQueues[queueName] += 1
            continue

    return S_OK("Push Job Agent cycle complete")
def execute(self):
    """Run one JobAgent cycle: match a job, prepare it and submit it to the CE.

    Steps performed, in order:

    1. abort if the drain flag file ``/var/lib/dirac_drain`` exists;
    2. ask the computing element whether slots are available;
    3. once a first job has been picked up, recompute the CPU work left
       (filling mode only) and publish it to the CE and to the local
       configuration file;
    4. request a job from the Matcher, trying each CE description in turn;
    5. validate the Matcher answer and the JDL parameters;
    6. set up the proxy, check/install the software and submit the job;
    7. commit the job report, sending a failover request when the commit fails.

    :return: an ``S_OK`` / ``S_ERROR`` structure, or whatever ``self.__finish``
        or ``self._rescheduleFailedJob`` return when the cycle is cut short.
    """
    # Temporary mechanism to pass a shutdown message to the agent
    if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')

    # Check if we can match jobs at all
    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
        self.log.info('Resource is not available', result['Message'])
        return self.__finish('CE Not Available')

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
        if runningJobs:
            self.log.info('No available slots', ': %d running jobs' % runningJobs)
            return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
        self.log.info('CE is not available (and there are no running jobs)')
        return self.__finish('CE Not Available')

    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')

        self.timeLeft = self.computeCPUWorkLeft()
        self.log.info('normalized CPU units remaining in slot', self.timeLeft)
        if self.timeLeft <= self.minimumTimeLeft:
            return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join('.', self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
            localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    # If we are here we assume that a job can be matched
    result = self.computingElement.getDescription()
    if not result['OK']:
        return result

    # We can have several prioritized job retrieval strategies
    ceDescription = result['Value']
    if isinstance(ceDescription, dict):
        ceDictList = [ceDescription]
    elif isinstance(ceDescription, list):
        # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
        ceDictList = ceDescription
    else:
        ceDictList = []

    if not ceDictList:
        # Without at least one description the matching loop below would never
        # bind jobRequest / ceDict and the cycle would die on an unbound name.
        self.log.error('Unusable CE description', repr(ceDescription))
        return S_ERROR('CE description must be a dictionary or a non-empty list of dictionaries')

    for ceDict in ceDictList:
        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)
            self.log.info('Requirements:', requirementsDict)

        self.log.verbose('CE dict', ceDict)

        # Here finally calling the matcher
        start = time.time()
        jobRequest = MatcherClient().requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
        if jobRequest['OK']:
            # The first description that yields a job wins; ceDict keeps
            # pointing at it for the rest of the cycle.
            break

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        # If we don't match a job, independently from the reason,
        # we wait a bit longer before trying again
        self.am_setOption("PollingTime", int(self.am_getOption("PollingTime") * 1.5))

        failureMessage = jobRequest['Message']
        if re.search('No match found', failureMessage):
            self.log.notice('Job request OK, but no match found', ': %s' % (failureMessage))
        elif "seconds timeout" in failureMessage:
            self.log.error('Timeout while requesting job', failureMessage)
        elif "Pilot version does not match" in failureMessage:
            # A version mismatch is a hard error: it is reported as S_ERROR and
            # does not count as a failed match.
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, failureMessage.replace(errorMsg, ''))
            return S_ERROR(failureMessage)
        else:
            self.log.notice('Failed to get jobs', ': %s' % (failureMessage))

        # Every other failure counts towards the "stop after N failed matches" limit
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
            return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(failureMessage)

    # Reset the counter
    self.matchFailedCount = 0

    # If we are here it is because we matched a job
    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)

    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            jobReport.setJobStatus(status='Failed',
                                   minor='Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        if not matcherInfo[param]:
            jobReport.setJobStatus(status='Failed',
                                   minor='Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        self.log.verbose('Matcher returned', '%s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the Matcher returned besides the mandatory parameters
    optimizerParams = {key: value for key, value in matcherInfo.items()
                       if key not in matcherParams}

    parameters = self._getJDLParameters(jobJDL)
    if not parameters['OK']:
        jobReport.setJobStatus(status='Failed', minor='Could Not Extract JDL Parameters')
        self.log.warn('Could Not Extract JDL Parameters', parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        jobReport.setJobStatus(status='Failed', minor=msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    # From here on the JobID taken from the JDL replaces the Matcher one
    jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirements for determining the number of processors
    # the minimum number of processors requested
    processors = int(params.get('NumberOfProcessors',
                                int(params.get('MinNumberOfProcessors', 1))))
    # the maximum number of processors allowed to the payload
    maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
    # need or not the whole node for the job
    wholeNode = 'WholeNode' in params
    mpTag = 'MultiProcessor' in params.get('Tags', [])

    if self.extraOptions and 'dirac-jobexec' in params.get('Executable', '').strip():
        params['Arguments'] = (params.get('Arguments', '') + ' ' + self.extraOptions).strip()
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received',
                  'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1

    try:
        jobReport.setJobParameter(par_name='MatcherServiceTime',
                                  par_value=str(matchTime),
                                  sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(par_name=thisp,
                                          par_value=gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus(status='Matched',
                               minor='Job Received by Agent',
                               sendFlag=False)

        result_setupProxy = self._setupProxy(ownerDN, jobGroup)
        if not result_setupProxy['OK']:
            return self._rescheduleFailedJob(jobID, result_setupProxy['Message'],
                                             self.stopOnApplicationFailure)
        proxyChain = result_setupProxy.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self._checkInstallSoftware(jobID, params, ceDict, jobReport)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self._rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
        result_submitJob = self._submitJob(jobID=jobID,
                                           jobParams=params,
                                           resourceParams=ceDict,
                                           optimizerParams=optimizerParams,
                                           proxyChain=proxyChain,
                                           jobReport=jobReport,
                                           processors=processors,
                                           wholeNode=wholeNode,
                                           maxNumberOfProcessors=maxNumberOfProcessors,
                                           mpTag=mpTag)

        # Committing the JobReport before evaluating the result of job submission
        res = jobReport.commit()
        if not res['OK']:
            resFD = jobReport.generateForwardDISET()
            if not resFD['OK']:
                self.log.error("Error generating ForwardDISET operation", resFD['Message'])
            elif resFD['Value']:
                # Only build a failover Request when there is an operation to
                # forward; an empty value must not be added to the request.
                op = resFD['Value']
                request = Request()
                requestName = 'jobAgent_%s' % jobID
                request.RequestName = requestName.replace('"', '')
                request.JobID = jobID
                request.SourceComponent = "JobAgent_%s" % jobID
                request.addOperation(op)
                # This might fail, but only a message would be printed.
                self._sendFailoverRequest(request)

        if not result_submitJob['OK']:
            return self.__finish(result_submitJob['Message'])
        if 'PayloadFailed' in result_submitJob:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % result_submitJob['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
        return self._rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                         self.stopOnApplicationFailure)

    return S_OK('Job Agent cycle complete')
def test01fullChain(self):
    """Put, read back, inspect and delete requests, checking the DB summary at each step.

    The summary is compared against the one taken at the start of the test,
    and must be back to that initial state once both requests are deleted.
    """

    def assertOK(reply):
        # Fail with the service message when the reply is not OK
        self.assertEqual(reply["OK"], True, reply["Message"] if "Message" in reply else "OK")

    summaryReply = self.requestClient.getDBSummary()
    self.assertTrue(summaryReply["OK"])
    initialSummary = summaryReply["Value"]

    putReply = self.requestClient.putRequest(self.request)
    self.assertTrue(putReply["OK"], putReply)
    self.assertIsInstance(putReply["Value"], six.integer_types)
    firstID = putReply["Value"]

    # summary after the first insertion
    summaryReply = self.requestClient.getDBSummary()
    self.assertTrue(summaryReply["OK"])
    expectedAfterPut = [
        ("Operation", "ReplicateAndRegister", "Waiting", 1),
        (None, "Request", "Waiting", 1),
        (None, "File", "Waiting", 2),
    ]
    self._checkSummary(initialSummary, expectedAfterPut)

    getReply = self.requestClient.getRequest(firstID)
    self.assertTrue(getReply["OK"])
    self.assertIsInstance(getReply["Value"], Request)

    # summary - the request became "Assigned"
    expectedAfterGet = [
        ("Operation", "ReplicateAndRegister", "Waiting", 1),
        (None, "Request", "Assigned", 1),
        (None, "File", "Waiting", 2),
    ]
    self._checkSummary(initialSummary, expectedAfterGet)

    # read-only queries on the first request
    assertOK(self.requestClient.getRequestInfo(firstID))
    assertOK(self.requestClient.getRequestFileStatus(firstID, self.file.LFN))
    assertOK(self.requestClient.getRequestFileStatus(firstID, [self.file.LFN]))
    assertOK(self.requestClient.getDigest(firstID))

    jobsReply = self.requestClient.readRequestsForJobs([123])
    assertOK(jobsReply)
    self.assertIsInstance(jobsReply["Value"]["Successful"][123], Request)

    proxyInfo = getProxyInfo()["Value"]

    # Adding new request
    secondRequest = Request()
    secondRequest.RequestName = "RequestManagerHandlerTests-2"
    # NOTE(review): the owner is set on self.request, not on the new request
    # - confirm this is intended.
    self.request.OwnerDN = proxyInfo["identity"]
    self.request.OwnerGroup = proxyInfo["group"]
    secondRequest.JobID = 456
    secondRequest.addOperation(self.operation)

    # update
    secondPut = self.requestClient.putRequest(secondRequest)
    assertOK(secondPut)
    secondID = secondPut["Value"]

    # get summary again
    summaryReply = self.requestClient.getDBSummary()
    self.assertTrue(summaryReply["OK"])
    expectedWithBoth = [
        ("Operation", "ReplicateAndRegister", "Waiting", 2),
        (None, "Request", "Waiting", 1),
        (None, "Request", "Assigned", 1),
        (None, "File", "Waiting", 4),
    ]
    self._checkSummary(initialSummary, expectedWithBoth)

    # delete both requests, in insertion order
    for requestID in (firstID, secondID):
        assertOK(self.requestClient.deleteRequest(requestID))

    # should be empty now
    summaryReply = self.requestClient.getDBSummary()
    self.assertTrue(summaryReply["OK"])
    self.assertEqual(summaryReply["Value"], initialSummary)