def test_JobStateUpdateAndJobMonitoring(self):
    """Verify the JobStateUpdate and JobMonitoring functions on a single job.

    A hello-world job is submitted, its state/parameters/site are modified
    through the JobStateUpdate service, and the result of each change is read
    back through the JobMonitoring client. The job is deleted at the end.

    ``assertTrue`` is used instead of the ``assert_`` alias (removed in
    Python 3.12); the S_ERROR message is passed along so that a failing call
    reports why it failed.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {})
    res = jobMonitor.getJobsParameters([jobID], ['Owner'])
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # NOTE: setJobFlag() / unsetJobFlag() are not exercised by this test
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # NOTE: jobMonitor.traceJobParameter('Site', 1, 'Status') is not exercised by this test

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # bulk status update, then check that only Status/MinorStatus changed
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
def finalizeRequest(self, requestID, jobID, useCertificates=True):
    """Check the request status and perform the job finalization if necessary.

    When the request is 'Done', the job's "PendingRequest" parameter is
    refreshed with the request digest and the job (minor) status is updated.

    :param self: self reference
    :param str requestID: request id
    :param int jobID: job id (anything accepted by ``int()``; it is normalised
                      to an int before being used in the ``%d`` log messages)
    :param bool useCertificates: forwarded to the JobStateUpdate and
                                 JobMonitoring clients
    :return: S_OK(newJobStatus) on success (``newJobStatus`` may be empty when
             only the minor status is updated), S_OK() if the job does not
             exist, otherwise the failing S_ERROR structure
    """
    stateServer = JobStateUpdateClient(useCertificates=useCertificates)

    # Checking if to update the job status - we should fail here, so it will be re-tried later
    # Checking the state, first
    res = self.getRequestStatus(requestID)
    if not res["OK"]:
        self.log.error(
            "finalizeRequest: failed to get request", "request: %s status: %s" % (requestID, res["Message"])
        )
        return res
    if res["Value"] != "Done":
        return S_ERROR(
            "The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
            % (requestID, res["Value"])
        )

    # The request is 'Done', let's update the job status. If we fail, we should re-try later
    monitorServer = JobMonitoringClient(useCertificates=useCertificates)
    # Normalise once: the messages below use "%d", which raises TypeError for a string job id
    jobID = int(jobID)
    res = monitorServer.getJobSummary(jobID)
    if not res["OK"]:
        self.log.error("finalizeRequest: Failed to get job status", "JobID: %d" % jobID)
        return res
    if not res["Value"]:
        self.log.info("finalizeRequest: job %d does not exist (anymore): finalizing" % jobID)
        return S_OK()

    jobStatus = res["Value"]["Status"]
    jobMinorStatus = res["Value"]["MinorStatus"]
    jobAppStatus = ""
    newJobStatus = ""

    if jobStatus == JobStatus.STALLED:
        # If job is stalled, find the previous status from the logging info
        res = monitorServer.getJobLoggingInfo(jobID)
        if not res["OK"]:
            self.log.error("finalizeRequest: Failed to get job logging info", "JobID: %d" % jobID)
            return res
        # Check the last status was Stalled and get the one before
        loggingInfo = res["Value"]
        if len(loggingInfo) >= 2 and loggingInfo[-1][0] == JobStatus.STALLED:
            jobStatus, jobMinorStatus, jobAppStatus = loggingInfo[-2][:3]
            newJobStatus = jobStatus

    # update the job pending request digest in any case since it is modified
    self.log.info("finalizeRequest: Updating request digest for job %d" % jobID)

    digest = self.getDigest(requestID)
    if digest["OK"]:
        digest = digest["Value"]
        self.log.verbose(digest)
        res = stateServer.setJobParameter(jobID, "PendingRequest", digest)
        if not res["OK"]:
            self.log.info("finalizeRequest: Failed to set job %d parameter: %s" % (jobID, res["Message"]))
            return res
    else:
        # A missing digest is only logged: the status update below still goes ahead
        self.log.error(
            "finalizeRequest: Failed to get request digest for %s: %s" % (requestID, digest["Message"])
        )

    if jobStatus == JobStatus.COMPLETED:
        # What to do? Depends on what we have in the minorStatus
        if jobMinorStatus == JobMinorStatus.PENDING_REQUESTS:
            newJobStatus = JobStatus.DONE
        elif jobMinorStatus == JobMinorStatus.APP_ERRORS:
            newJobStatus = JobStatus.FAILED
        elif jobMinorStatus == JobMinorStatus.MARKED_FOR_TERMINATION:
            # If the job has been Killed, set it Killed
            newJobStatus = JobStatus.KILLED
        else:
            self.log.error(
                "finalizeRequest: Unexpected jobMinorStatus", "for %d (got %s)" % (jobID, jobMinorStatus)
            )
            return S_ERROR("Unexpected jobMinorStatus")

    if newJobStatus:
        self.log.info(
            "finalizeRequest: Updating job status",
            "for %d to '%s/%s'" % (jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE),
        )
    else:
        self.log.info(
            "finalizeRequest: Updating job minor status",
            "for %d to '%s' (current status is %s)" % (jobID, JobMinorStatus.REQUESTS_DONE, jobStatus),
        )

    # The status update is sent in both cases; newJobStatus is "" when only the minor status changes
    stateUpdate = stateServer.setJobStatus(jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE, "RMS")
    if jobAppStatus and stateUpdate["OK"]:
        stateUpdate = stateServer.setJobApplicationStatus(jobID, jobAppStatus, "RMS")
    if not stateUpdate["OK"]:
        self.log.error(
            "finalizeRequest: Failed to set job status",
            "JobID: %d, error: %s" % (jobID, stateUpdate["Message"]),
        )
        return stateUpdate

    return S_OK(newJobStatus)
def test_JobStateUpdateAndJobMonitoring(self):
    """Verify the JobStateUpdate and JobMonitoring functions on a single job.

    A hello-world job is submitted, its state/parameters/site are modified
    through the JobStateUpdate service, and the result of each change is read
    back through the JobMonitoring client. The job is deleted at the end.

    ``assertTrue`` is used instead of the ``assert_`` alias (removed in
    Python 3.12); the S_ERROR message is passed along so that a failing call
    reports why it failed.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # NOTE: setJobFlag() / unsetJobFlag() are not exercised by this test
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # NOTE: jobMonitor.traceJobParameter('Site', 1, 'Status') is not exercised by this test

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # bulk status update, then check that only Status/MinorStatus changed
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
class WorkflowTasks(TaskBase):
    """Handles jobs: prepares job objects from transformation tasks, submits
    them through the submission client and follows their status through the
    job monitoring client.
    """

    def __init__(
        self,
        transClient=None,
        logger=None,
        submissionClient=None,
        jobMonitoringClient=None,
        outputDataModule=None,
        jobClass=None,
        opsH=None,
        destinationPlugin=None,
        ownerDN=None,
        ownerGroup=None,
    ):
        """Generates some default objects.

        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present

        Every collaborator that is not passed in is created here with its
        default; ``outputDataModule`` and ``destinationPlugin`` default to the
        Operations values "Transformations/OutputDataModule" and
        "Transformations/DestinationPlugin" (fallbacks "" and "BySE").
        """
        if not logger:
            logger = gLogger.getSubLogger(self.__class__.__name__)

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Certificates are used only when both the owner DN and group are given
        useCertificates = bool(ownerDN and ownerGroup)

        if not submissionClient:
            self.submissionClient = WMSClient(
                useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup
            )
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue("Transformations/DestinationPlugin", "BySE")
        else:
            self.destinationPlugin = destinationPlugin

        # Plugin/module objects are created lazily on first use and then cached
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(
        self, transBody, taskDict, owner="", ownerGroup="", ownerDN="", bulkSubmissionFlag=False
    ):
        """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

        When owner or ownerGroup is missing both are taken from the current
        proxy; when ownerDN is missing it is looked up from the owner name.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res["OK"]:
                return res
            proxyInfo = res["Value"]
            owner = proxyInfo["username"]
            ownerGroup = proxyInfo["group"]

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res["OK"]:
                return res
            ownerDN = res["Value"][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a single job object for bulk submission

        The per-task values are collected as parameter sequences on one job
        object, which is stored in the returned dictionary under the key
        "BulkJobObject".

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasksBulk"
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose("Setting job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        try:
            site = oJob.workflow.findParameter("Site").getValue()
        except AttributeError:
            # findParameter() did not return a parameter object: no site in the template
            site = None
        jobType = oJob.workflow.findParameter("JobType").getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter("JOB_ID"):
            oJob._addParameter(  # pylint: disable=protected-access
                oJob.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID"
            )

        if oJob.workflow.findParameter("PRODUCTION_ID"):
            oJob._setParamValue("PRODUCTION_ID", str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(  # pylint: disable=protected-access
                oJob.workflow,
                "PRODUCTION_ID",
                "string",
                str(transID).zfill(8),
                "Production ID",
            )
        oJob.setType(jobType)
        self._logVerbose("Adding default transformation group of %s" % (transGroup), transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        clinicPath = self._checkSickTransformations(transID)
        if clinicPath:
            self._handleHospital(oJob, clinicPath)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites", transID=transID, method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose("Setting Site: ", str(sites), transID=transID, method=method)
                seqDict["Site"] = sites

            seqDict["JobName"] = self._transTaskName(transID, taskID)
            seqDict["JOB_ID"] = str(taskID).zfill(8)

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # Handle Input Data
            inputData = paramsDict.get("InputData")
            if inputData:
                if isinstance(inputData, six.string_types):
                    inputData = inputData.replace(" ", "").split(";")
                self._logVerbose("Setting input data to %s" % inputData, transID=transID, method=method)
                seqDict["InputData"] = inputData
            elif paramSeqDict.get("InputData") is not None:
                # An earlier task had input data and this one has none
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.items():
                if paramName not in ("InputData", "Site", "TargetSE"):
                    if paramValue:
                        self._logVerbose("Setting %s to %s" % (paramName, paramValue), transID=transID, method=method)
                        seqDict[paramName] = paramValue

            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData(
                    {
                        "Job": oJob._toXML(),  # pylint: disable=protected-access
                        "TransformationID": transID,
                        "TaskID": taskID,
                        "InputData": inputData,
                    }
                )
                if not res["OK"]:
                    self._logError("Failed to generate output data", res["Message"], transID=transID, method=method)
                    continue
                for name, output in res["Value"].items():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    # The workflow parameter becomes a "%(name)s" placeholder for the sequence
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(  # pylint: disable=protected-access
                            oJob.workflow, name, "JDL", "%%(%s)s" % name, name
                        )

            for pName, seq in seqDict.items():
                paramSeqDict.setdefault(pName, []).append(seq)

        # Loop invariant; outputParameterList holds the names from the last task iterated
        workflowParamNames = ["JOB_ID", "PRODUCTION_ID", "InputData"] + outputParameterList
        for paramName, paramSeq in paramSeqDict.items():
            if paramName in workflowParamNames:
                res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res["OK"]:
                return res

        if taskDict:
            self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime)

        taskDict["BulkJobObject"] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a job object per task

        Each task gets a deep copy of a common job template, stored under
        the task's "TaskObject" key; the key is set to "" for tasks that
        could not be prepared.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasks"
        startTime = time.time()

        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        try:
            site = oJobTemplate.workflow.findParameter("Site").getValue()
        except AttributeError:
            # findParameter() did not return a parameter object: no site in the template
            site = None
        jobType = oJobTemplate.workflow.findParameter("JobType").getValue()
        templateOK = False
        getOutputDataTiming = 0.0
        for taskID, paramsDict in taskDict.items():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                self._logVerbose("Job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose(
                    "Adding default transformation group of %s" % (transGroup), transID=transID, method=method
                )
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter("PRODUCTION_ID"):
                    oJobTemplate._setParamValue("PRODUCTION_ID", str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(
                        oJobTemplate.workflow, "PRODUCTION_ID", "string", str(transID).zfill(8), "Production ID"
                    )
                if not oJobTemplate.workflow.findParameter("JOB_ID"):
                    oJobTemplate._addParameter(oJobTemplate.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID")

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose("Setting task name to %s" % constructedName, transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue("JOB_ID", str(taskID).zfill(8))
            # NOTE(review): stays None, so getOutputData() below always receives "InputData": None - confirm intended
            inputData = None

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites", transID=transID, method=method)
                paramsDict["TaskObject"] = ""
                continue
            else:
                self._logDebug("Setting Site: ", str(sites), transID=transID, method=method)
                res = oJob.setDestination(sites)
                if not res["OK"]:
                    self._logError("Could not set the site: %s" % res["Message"], transID=transID, method=method)
                    paramsDict["TaskObject"] = ""
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            clinicPath = self._checkSickTransformations(transID)
            if clinicPath:
                self._handleHospital(oJob, clinicPath)

            # Empty until the job is fully prepared: a failure below leaves the task without a job object
            paramsDict["TaskObject"] = ""
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData(
                    {"Job": oJob._toXML(), "TransformationID": transID, "TaskID": taskID, "InputData": inputData}
                )
                getOutputDataTiming += time.time()
                if not res["OK"]:
                    self._logError("Failed to generate output data", res["Message"], transID=transID, method=method)
                    continue
                for name, output in res["Value"].items():
                    oJob._addJDLParameter(name, ";".join(output))
            paramsDict["TaskObject"] = oJob
        if taskDict:
            self._logVerbose(
                "Average getOutputData time: %.1f per task" % (getOutputDataTiming / len(taskDict)),
                transID=transID,
                method=method,
            )
            self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """Handle Sites and TargetSE in the parameters

        :param dict paramsDict: task parameters; "Site" (";"-separated) is read
                                here and the whole dictionary is handed to the
                                destination plugin

        :return: list of destination sites. An empty list is returned when the
                 destination plugin cannot be instantiated, so that callers,
                 which test the result for emptiness, treat it as "no site
                 found" instead of using an S_ERROR dictionary as a site list.
        """
        try:
            sites = ["ANY"]
            if paramsDict["Site"]:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict["Site"], sepChar=";")
        except KeyError:
            pass

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res["OK"]:
                self._logFatal("Could not generate a destination plugin object")
                # Callers expect a list of sites: a (truthy) S_ERROR dict would be used as one
                return []
            destinationPlugin_o = res["Value"]
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ["ANY"]:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """set job inputs (+ metadata)

        A failure to set the input data is logged only; nothing is returned.
        """
        inputData = paramsDict.get("InputData")
        transID = paramsDict["TransformationID"]
        if inputData:
            self._logVerbose("Setting input data to %s" % inputData, transID=transID, method="_handleInputs")
            res = oJob.setInputData(inputData)
            if not res["OK"]:
                self._logError(
                    "Could not set the inputs: %s" % res["Message"], transID=transID, method="_handleInputs"
                )

    def _handleRest(self, oJob, paramsDict):
        """add as JDL parameters all the other parameters that are not for inputs or destination

        Parameters with an empty (falsy) value are skipped.
        """
        transID = paramsDict["TransformationID"]
        for paramName, paramValue in paramsDict.items():
            if paramName not in ("InputData", "Site", "TargetSE"):
                if paramValue:
                    self._logDebug("Setting %s to %s" % (paramName, paramValue), transID=transID, method="_handleRest")
                    oJob._addJDLParameter(paramName, paramValue)

    def _checkSickTransformations(self, transID):
        """Check if the transformation is in the transformations to be processed at Hospital or Clinic

        :return: the Operations path ("Hospital" or "Hospital/Clinics/<clinic>")
                 whose "Transformations" option lists transID, None otherwise
        """
        transID = int(transID)
        clinicPath = "Hospital"
        if transID in {int(x) for x in self.opsH.getValue(os.path.join(clinicPath, "Transformations"), [])}:
            return clinicPath
        if "Clinics" in self.opsH.getSections("Hospital").get("Value", []):
            basePath = os.path.join("Hospital", "Clinics")
            clinics = self.opsH.getSections(basePath)["Value"]
            for clinic in clinics:
                clinicPath = os.path.join(basePath, clinic)
                if transID in {int(x) for x in self.opsH.getValue(os.path.join(clinicPath, "Transformations"), [])}:
                    return clinicPath
        return None

    def _handleHospital(self, oJob, clinicPath):
        """Optional handle of hospital/clinic jobs

        Sets the job input data policy to "download" and its destination to
        the clinic (or hospital) site, adding "GridCE" when CEs are configured.
        """
        if not clinicPath:
            return
        oJob.setInputDataPolicy("download", dataScheduling=False)

        # Check first for a clinic, if not it must be the general hospital
        hospitalSite = self.opsH.getValue(os.path.join(clinicPath, "ClinicSite"), "")
        hospitalCEs = self.opsH.getValue(os.path.join(clinicPath, "ClinicCE"), [])
        # If not found, get the hospital parameters
        if not hospitalSite:
            hospitalSite = self.opsH.getValue("Hospital/HospitalSite", "DIRAC.JobDebugger.ch")
        if not hospitalCEs:
            hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        oJob.setDestination(hospitalSite)
        if hospitalCEs:
            oJob._addJDLParameter("GridCE", hospitalCEs)

    def __generatePluginObject(self, plugin):
        """This simply instantiates the TaskManagerPlugin class with the relevant plugin name

        :return: S_OK(plugin object), or S_ERROR() when the module in
                 self.pluginLocation cannot be imported or instantiated
        """
        method = "__generatePluginObject"
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(), ["TaskManagerPlugin"])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e), method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule, "TaskManagerPlugin")("%s" % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e), method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """Get the list of job output LFNs from the provided plugin

        The output data module object is created on first use and cached.
        """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance["OK"]:
                return moduleInstance
            self.outputDataModule_o = moduleInstance["Value"]
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """Submit the tasks

        Bulk submission is used when taskDict carries a "BulkJobObject" entry.
        """
        if "BulkJobObject" in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """Submit jobs in one go with one parametric job

        :return: S_OK(taskDict) with "ExternalID" and "Success" set on every
                 task, or S_ERROR if the submission fails or returns a number
                 of job IDs different from the number of tasks
        """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = "__submitTransformationTasksBulk"

        oJob = taskDict.pop("BulkJobObject")
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = list(taskDict.values())[0]["TransformationID"]
        if oJob is None:
            self._logError("no bulk Job object found", transID=transID, method=method)
            return S_ERROR(ETSUKN, "No bulk job object provided for submission")

        result = self.submitTaskToExternal(oJob)
        if not result["OK"]:
            self._logError("Failed to submit tasks to external", transID=transID, method=method)
            return result

        jobIDList = result["Value"]
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task["Success"] = False
            return S_ERROR(ETSUKN, "Submitted less number of jobs than requested tasks")
        # Get back correspondence with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]["ExternalID"] = jobID
            taskDict[taskID]["Success"] = True

        submitted = len(jobIDList)
        self._logInfo(
            "Submitted %d tasks to WMS in %.1f seconds" % (submitted, time.time() - startTime),
            transID=transID,
            method=method,
        )
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """Submit jobs one by one

        Tasks without a "TaskObject" are counted as failed without being
        submitted. Always returns S_OK(taskDict); the outcome of each task is
        recorded in its "Success" (and "ExternalID") entries.
        """
        method = "__submitTransformationTasks"
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.values():
            transID = task["TransformationID"]
            if not task["TaskObject"]:
                task["Success"] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task["TaskObject"])
            if res["OK"]:
                task["ExternalID"] = res["Value"]
                task["Success"] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res["Message"], transID=transID, method=method)
                task["Success"] = False
                failed += 1
        # transID below is the one of the last task iterated
        if submitted:
            self._logInfo(
                "Submitted %d tasks to WMS in %.1f seconds" % (submitted, time.time() - startTime),
                transID=transID,
                method=method,
            )
        if failed:
            self._logError("Failed to submit %d tasks to WMS." % (failed), transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """Submits a single job (which can be a bulk one) to the WMS.

        :param job: a job description string, or an instance of self.jobClass
        :return: result of the submission client's submitJob(), or S_ERROR
                 when no job object can be obtained from ``job``
        """
        if isinstance(job, six.string_types):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", "", x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        """Look up, by job name, the WMS jobs corresponding to the given tasks

        :param list taskDicts: dictionaries with "TransformationID" and "TaskID"
        :return: S_OK({"NoTasks": names with no job summary found,
                       "TaskNameIDs": {job name: WMS job ID}}) or S_ERROR
        """
        transID = None
        jobNames = [
            self._transTaskName(taskDict["TransformationID"], taskDict["TaskID"]) for taskDict in taskDicts
        ]
        res = self.jobMonitoringClient.getJobs({"JobName": jobNames})
        if not res["OK"]:
            self._logError(
                "Failed to get task from WMS",
                res["Message"],
                transID=transID,
                method="updateTransformationReservedTasks",
            )
            return res
        jobNameIDs = {}
        for wmsID in res["Value"]:
            res = self.jobMonitoringClient.getJobSummary(int(wmsID))
            if not res["OK"]:
                # Best effort: a job whose summary cannot be read is left out of the mapping
                self._logWarn(
                    "Failed to get task summary from WMS",
                    res["Message"],
                    transID=transID,
                    method="updateTransformationReservedTasks",
                )
            else:
                jobNameIDs[res["Value"]["JobName"]] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({"NoTasks": noTask, "TaskNameIDs": jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status

        Tasks whose job is not reported by the WMS get the status "Removed",
        which is recorded as "Failed". Tasks with a zero ExternalID are ignored.
        """
        method = "getSubmittedTaskStatus"

        if taskDicts:
            wmsIDs = [int(taskDict["ExternalID"]) for taskDict in taskDicts if int(taskDict["ExternalID"])]
            transID = taskDicts[0]["TransformationID"]
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method)
            return res
        statusDict = res["Value"]
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict["TaskID"]
            wmsID = int(taskDict["ExternalID"])
            if not wmsID:
                continue
            oldStatus = taskDict["ExternalStatus"]
            newStatus = statusDict.get(wmsID, {}).get("Status", "Removed")
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose(
                        "Production/Job %d/%d removed from WMS while it is in %s status"
                        % (transID, taskID, oldStatus),
                        transID=transID,
                        method=method,
                    )
                    newStatus = "Failed"
                self._logVerbose(
                    "Setting job status for Production/Job %d/%d to %s" % (transID, taskID, newStatus),
                    transID=transID,
                    method=method,
                )
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN

        Files of tasks without a WMS job become Unused; files of tasks whose
        job is "Done" or "Completed" become Processed, and Unused when the job
        is "Failed". Only LFNs whose status changes are returned.
        """
        if not fileDicts:
            return S_OK({})

        method = "getSubmittedFileStatus"

        # All files are from the same transformation
        transID = fileDicts[0]["TransformationID"]
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict["TaskID"])
            taskFiles.setdefault(jobName, {})[fileDict["LFN"]] = fileDict["Status"]

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res["OK"]:
            self._logWarn("Failed to obtain taskIDs for files", transID=transID, method=method)
            return res
        noTasks = res["Value"]["NoTasks"]
        taskNameIDs = res["Value"]["TaskNameIDs"]

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].items():
                if oldStatus != TransformationFilesStatus.UNUSED:
                    updateDict[lfn] = TransformationFilesStatus.UNUSED

        res = self.jobMonitoringClient.getJobsStatus(list(taskNameIDs.values()))
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method)
            return res
        statusDict = res["Value"]
        for jobName, wmsID in taskNameIDs.items():
            jobStatus = statusDict.get(wmsID, {}).get("Status")
            # Any other job status leaves the file status untouched
            newFileStatus = {
                "Done": TransformationFilesStatus.PROCESSED,
                "Completed": TransformationFilesStatus.PROCESSED,
                "Failed": TransformationFilesStatus.UNUSED,
            }.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].items():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """
    Submit several jobs (different types and input data, same destination site) and
    check the JobMonitoring selectors and the bulk JobStateUpdate calls against them.

    The status updates at the end are applied to the last submitted job only.
    All submitted jobs are deleted at the end of the test.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    jobIDs = []
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination('DIRAC.Jenkins.ch')
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res['OK'], res.get('Message'))
            jobID = res['Value']
            jobIDs.append(jobID)

    res = jobMonitor.getSites()
    print(res)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch', 'Site'}, msg="Got %s" % res['Value'])
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types), msg="Got %s" % str(sorted(res['Value'])))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    # Show the sorted list on failure (sorting the characters of its repr is unreadable)
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']), msg="Got %s" % str(sorted(res['Value'])))
    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_empty = res['Value']
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanNow = res['Value']
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanOneYear = res['Value']
    self.assertTrue(set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)), resJG_olderThanOneYear)
    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])], res['Value'])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(
        sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'Job Rescheduled'])],
        res['Value'],
    )
    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue({str(x) for x in jobIDs} <= set(res['Value']), res['Value'])
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    # The counters are only compared when both are reported; a missing counter
    # skips the check explicitly instead of relying on a swallowed TypeError
    received = res['Value'].get('Received')
    waiting = res['Value'].get('Waiting')
    if received is not None and waiting is not None:
        self.assertTrue(received + waiting >= len(lfnss) * len(types), res['Value'])
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # Three updates with increasing timestamps, applied to the last submitted job
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow()): {
                'Status': 'Matched',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            },
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)): {
                'Status': 'Running',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            },
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)): {
                'Status': 'Completed',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            }
        })
    self.assertTrue(res['OK'], res.get('Message'))
    # Setting a job *parameter* named 'Status' is expected to leave the job status alone
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Completed')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
def __call__(self):
    """Process the request held by this task.

    The proxy is set up through self.setupProxy(). Then, for as long as the request
    status is "Waiting", the next waiting operation is fetched, its handler is obtained
    and called. Monitoring marks go to gMonitor, or to self.rmsMonitoringReporter when
    self.rmsMonitoring is set. When the request ends up "Done" it is updated through
    self.updateRequest() and, if it has a JobID, finalized through
    self.requestClient.finalizeRequest (at most 10 attempts, sleeping 10 seconds
    between attempts).

    :return: S_OK(self.request) on normal exit, including the case where the proxy could
        not be set up; S_ERROR with the exception text when an exception was caught while
        executing an operation; S_ERROR("Could not finalize request") after 10 failed
        finalize attempts; or the failed result of getWaiting() / updateRequest() as is
    """
    self.log.debug("about to execute request")
    if not self.rmsMonitoring:
        gMonitor.addMark("RequestAtt", 1)
    # # setup proxy for request owner
    setupProxy = self.setupProxy()
    if not setupProxy["OK"]:
        userSuspended = "User is currently suspended"
        self.request.Error = setupProxy["Message"]
        # In case the user does not have proxy
        if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND):
            self.log.error("Error setting proxy. Request set to Failed:", setupProxy["Message"])
            # If user is no longer registered, fail the request
            for operation in self.request:
                for opFile in operation:
                    opFile.Status = "Failed"
                operation.Status = "Failed"
        elif userSuspended in setupProxy["Message"]:
            # If user is suspended, wait for a long time
            self.request.delayNextExecution(6 * 60)
            self.request.Error = userSuspended
            self.log.error("Error setting proxy: " + userSuspended, self.request.OwnerDN)
        else:
            self.log.error("Error setting proxy", setupProxy["Message"])
        # The request itself carries the error; the call is still reported as S_OK
        return S_OK(self.request)
    shifter = setupProxy["Value"]["Shifter"]
    # Text of an exception caught while executing an operation, if any
    error = None
    while self.request.Status == "Waiting":
        # # get waiting operation
        operation = self.request.getWaiting()
        if not operation["OK"]:
            self.log.error("Cannot get waiting operation", operation["Message"])
            return operation
        operation = operation["Value"]
        self.log.info("executing operation", "%s" % operation.Type)
        # # and handler for it
        handler = self.getHandler(operation)
        if not handler["OK"]:
            self.log.error("Unable to process operation", "%s: %s" % (operation.Type, handler["Message"]))
            # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
            operation.Error = handler["Message"]
            break
        handler = handler["Value"]
        # # set shifters list in the handler
        handler.shifter = shifter
        # set rmsMonitoring flag for the RequestOperation
        handler.rmsMonitoring = self.rmsMonitoring
        # # and execute
        pluginName = self.getPluginName(self.handlersDict.get(operation.Type))
        if self.standalone:
            useServerCertificate = gConfig.useServerCertificate()
        else:
            # Always use server certificates if executed within an agent
            useServerCertificate = True
        try:
            if pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Operation",
                        "operationType": pluginName,
                        "objectID": operation.OperationID,
                        "parentID": operation.RequestID,
                        "status": "Attempted",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
            # Always use request owner proxy
            # The UseServerCertificate option is switched off only around the handler call;
            # if the handler raises, the except branch below switches it back on
            if useServerCertificate:
                gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "false")
            exe = handler()
            if useServerCertificate:
                gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "true")
            if not exe["OK"]:
                self.log.error("unable to process operation", "%s: %s" % (operation.Type, exe["Message"]))
                # A failed handler call is counted both for the operation and for the request
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp": int(Time.toEpoch()),
                            "host": Network.getFQDN(),
                            "objectType": "Operation",
                            "operationType": pluginName,
                            "objectID": operation.OperationID,
                            "parentID": operation.RequestID,
                            "status": "Failed",
                            "nbObject": 1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Request",
                        "objectID": operation.RequestID,
                        "status": "Failed",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("RequestFail", 1)
                if self.request.JobID:
                    # Check if the job exists
                    monitorServer = JobMonitoringClient(useCertificates=True)
                    res = monitorServer.getJobSummary(int(self.request.JobID))
                    if not res["OK"]:
                        self.log.error("RequestTask: Failed to get job status", "%d" % self.request.JobID)
                    elif not res["Value"]:
                        # An empty summary is taken as "the job is gone": fail the operation
                        # and its files
                        self.log.warn(
                            "RequestTask: job does not exist (anymore): failed request",
                            "JobID: %d" % self.request.JobID,
                        )
                        for opFile in operation:
                            opFile.Status = "Failed"
                        if operation.Status != "Failed":
                            operation.Status = "Failed"
                        self.request.Error = "Job no longer exists"
        except Exception as e:
            # Remember the error (returned as S_ERROR after the loop), count the failure,
            # restore the certificate option and stop processing operations
            error = str(e)
            self.log.exception("hit by exception:", "%s" % error)
            if pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Operation",
                        "operationType": pluginName,
                        "objectID": operation.OperationID,
                        "parentID": operation.RequestID,
                        "status": "Failed",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Request",
                    "objectID": operation.RequestID,
                    "status": "Failed",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("RequestFail", 1)
            if useServerCertificate:
                gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "true")
            break
        # # operation status check
        if operation.Status == "Done" and pluginName:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Operation",
                    "operationType": pluginName,
                    "objectID": operation.OperationID,
                    "parentID": operation.RequestID,
                    "status": "Successful",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
        elif operation.Status == "Failed" and pluginName:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Operation",
                    "operationType": pluginName,
                    "objectID": operation.OperationID,
                    "parentID": operation.RequestID,
                    "status": "Failed",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
        elif operation.Status in ("Waiting", "Scheduled"):
            # # no update for waiting or all files scheduled
            break
    if not self.rmsMonitoring:
        gMonitor.flush()
    if error:
        return S_ERROR(error)
    # # request done?
    if self.request.Status == "Done":
        # # update request to the RequestDB
        self.log.info("Updating request status:", "%s" % self.request.Status)
        update = self.updateRequest()
        if not update["OK"]:
            self.log.error("Cannot update request status", update["Message"])
            return update
        self.log.info("request is done", "%s" % self.request.RequestName)
        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord({
                "timestamp": int(Time.toEpoch()),
                "host": Network.getFQDN(),
                "objectType": "Request",
                "objectID": getattr(self.request, "RequestID", 0),
                "status": "Successful",
                "nbObject": 1,
            })
        else:
            gMonitor.addMark("RequestOK", 1)
        # # and there is a job waiting for it? finalize!
        if self.request.JobID:
            attempts = 0
            # Retry loop: only the first failure is logged as an error, and the
            # 10th failed attempt gives up with S_ERROR
            while True:
                finalizeRequest = self.requestClient.finalizeRequest(
                    self.request.RequestID, self.request.JobID  # pylint: disable=no-member
                )
                if not finalizeRequest["OK"]:
                    if not attempts:
                        self.log.error(
                            "unable to finalize request, will retry",
                            "ReqName %s:%s" % (self.request.RequestName, finalizeRequest["Message"]),
                        )
                    self.log.debug("Waiting 10 seconds")
                    attempts += 1
                    if attempts == 10:
                        self.log.error("Giving up finalize request")
                        return S_ERROR("Could not finalize request")
                    time.sleep(10)
                else:
                    self.log.info(
                        "request is finalized",
                        "ReqName %s %s" % (self.request.RequestName,
                                           (" after %d attempts" % attempts) if attempts else ""),
                    )
                    break
    # Commit all the data to the ES Backend
    if self.rmsMonitoring:
        self.rmsMonitoringReporter.commit()
    # Request will be updated by the callBack method
    self.log.verbose("RequestTasks exiting", "request %s" % self.request.Status)
    return S_OK(self.request)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit jobs with different types and inputs, then exercise the JobMonitoring
    selectors and the JobStateUpdate bulk/attribute calls on the last submitted job.
    """

    def assertOK(result):
        # Every call must succeed; the service message is shown when it does not
        self.assertTrue(result["OK"], result.get("Message"))

    wms = WMSClient()
    monitor = JobMonitoringClient()
    stateUpdate = JobStateUpdateClient()

    lfnss = [["/a/1.txt", "/a/2.txt"], ["/a/1.txt", "/a/3.txt", "/a/4.txt"], []]
    types = ["User", "Test"]

    # One job per (input data, job type) combination
    jobIDs = []
    for lfns, jobType in [(inputData, kind) for inputData in lfnss for kind in types]:
        job = helloWorldJob()
        job.setDestination("DIRAC.Jenkins.ch")
        job.setInputData(lfns)
        job.setType(jobType)
        jdlFile = createFile(job)
        result = wms.submitJob(job._toJDL(xmlFile=jdlFile))
        assertOK(result)
        jobIDs.append(result["Value"])
    # The status updates further down target the last job submitted
    jobID = jobIDs[-1]

    # --- selectors -------------------------------------------------------
    result = monitor.getSites()
    print(result)
    assertOK(result)
    allowedSites = {"ANY", "DIRAC.Jenkins.ch", "Site"}
    self.assertTrue(set(result["Value"]) <= allowedSites, msg="Got %s" % result["Value"])

    result = monitor.getJobTypes()
    assertOK(result)
    foundTypes = sorted(result["Value"])
    self.assertEqual(foundTypes, sorted(types), msg="Got %s" % str(sorted(result["Value"])))

    result = monitor.getApplicationStates()
    assertOK(result)
    self.assertEqual(result["Value"], ["app status", "Unknown"], msg="Got %s" % str(result["Value"]))

    assertOK(monitor.getOwners())
    assertOK(monitor.getOwnerGroup())
    assertOK(monitor.getProductionIds())

    # --- job groups, with and without a date cut ---------------------------
    result = monitor.getJobGroups()
    assertOK(result)
    groupsNoCut = result["Value"]

    result = monitor.getJobGroups(None, datetime.datetime.utcnow())
    assertOK(result)
    groupsUntilNow = result["Value"]
    self.assertEqual(groupsNoCut, groupsUntilNow)

    oneYearAgo = datetime.datetime.utcnow() - datetime.timedelta(days=365)
    result = monitor.getJobGroups(None, oneYearAgo)
    assertOK(result)
    groupsUntilLastYear = result["Value"]
    self.assertTrue(set(groupsUntilLastYear).issubset(set(groupsUntilNow)), groupsUntilLastYear)

    # --- states ------------------------------------------------------------
    result = monitor.getStates()
    assertOK(result)
    foundStates = sorted(result["Value"])
    acceptedStates = [[JobStatus.RECEIVED], sorted([JobStatus.RECEIVED, JobStatus.KILLED])]
    self.assertTrue(foundStates in acceptedStates, result["Value"])

    result = monitor.getMinorStates()
    assertOK(result)
    foundMinorStates = sorted(result["Value"])
    acceptedMinorStates = [
        ["Job accepted"],
        sorted(["Job accepted", "Job Rescheduled"]),
        sorted(["Job accepted", "Marked for termination"]),
    ]
    self.assertTrue(foundMinorStates in acceptedMinorStates, result["Value"])
    assertOK(result)

    # --- job lists and summaries -------------------------------------------
    result = monitor.getJobs()
    assertOK(result)
    submitted = set(map(str, jobIDs))
    self.assertTrue(submitted <= set(result["Value"]), result["Value"])

    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))

    assertOK(monitor.getJobsSummary(jobIDs))
    assertOK(monitor.getJobPageSummaryWeb({}, [], 0, 100))

    # --- bulk status update: a single record --------------------------------
    now = str(datetime.datetime.utcnow())
    checkingRecord = {"Status": JobStatus.CHECKING, "MinorStatus": "MinorStatus", "Source": "Unknown"}
    assertOK(stateUpdate.setJobStatusBulk(jobID, {now: checkingRecord}, False))

    result = monitor.getJobSummary(int(jobID))
    assertOK(result)
    self.assertEqual(result["Value"]["Status"], JobStatus.CHECKING)
    self.assertEqual(result["Value"]["MinorStatus"], "MinorStatus")

    # --- bulk status update: two records, the later one must win -------------
    inOneHour = str(datetime.datetime.utcnow() + datetime.timedelta(hours=1))
    waitingRecord = {"Status": JobStatus.WAITING, "MinorStatus": "MinorStatus", "Source": "Unknown"}
    inTwoHours = str(datetime.datetime.utcnow() + datetime.timedelta(hours=2))
    matchedRecord = {"Status": JobStatus.MATCHED, "MinorStatus": "MinorStatus-matched", "Source": "Unknown"}
    result = stateUpdate.setJobStatusBulk(jobID, {inOneHour: waitingRecord, inTwoHours: matchedRecord}, False)
    assertOK(result)

    result = monitor.getJobSummary(int(jobID))
    assertOK(result)
    self.assertEqual(result["Value"]["Status"], JobStatus.MATCHED)
    self.assertEqual(result["Value"]["MinorStatus"], "MinorStatus-matched")

    # --- a job parameter must not touch the status ---------------------------
    assertOK(stateUpdate.setJobsParameter({jobID: ["Whatever", "booh"]}))

    result = monitor.getJobSummary(int(jobID))
    assertOK(result)
    self.assertEqual(result["Value"]["Status"], JobStatus.MATCHED)
    self.assertEqual(result["Value"]["MinorStatus"], "MinorStatus-matched")

    # --- setting the Status attribute directly --------------------------------
    assertOK(stateUpdate.setJobAttribute(jobID, "Status", JobStatus.RUNNING))

    result = monitor.getJobSummary(int(jobID))
    assertOK(result)
    self.assertEqual(result["Value"]["Status"], JobStatus.RUNNING)

    # delete the jobs - this will just set its status to "deleted"
    wms.deleteJob(jobIDs)
def test_JobStateUpdateAndJobMonitoring(self):
    """Submit one hello-world job, update it through JobStateUpdate and read
    everything back through JobMonitoring.
    """

    def assertOK(result):
        # Every call must succeed; the service message is shown when it does not
        self.assertTrue(result["OK"], result.get("Message"))

    def assertGot(actual, expected):
        # Equality check that reports the value actually received
        self.assertEqual(actual, expected, msg="Got %s" % str(actual))

    wms = WMSClient()
    monitor = JobMonitoringClient()
    stateUpdate = JobStateUpdateClient()

    # create a job and check stuff
    job = helloWorldJob()
    jdlFile = createFile(job)

    # submitting the job. Checking few stuff
    result = wms.submitJob(job._toJDL(xmlFile=jdlFile))
    assertOK(result)
    jobID = int(result["Value"])
    # jobID = res['JobID']

    assertOK(monitor.getJobJDL(jobID, True))
    assertOK(monitor.getJobJDL(jobID, False))
    assertOK(monitor.getJobsParameters([jobID], []))
    assertOK(monitor.getJobOwner(jobID))

    # Adding stuff

    # forcing the update
    assertOK(stateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "running", "source", None, True))

    parameters = [("par1", "par1Value"), ("par2", "par2Value")]
    result = stateUpdate.setJobParameters(jobID, parameters)
    time.sleep(5)
    assertOK(result)

    assertOK(stateUpdate.setJobApplicationStatus(jobID, "app status", "source"))
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    assertOK(stateUpdate.setJobSite(jobID, "Site"))

    # now checking few things
    result = monitor.getJobsStatus(jobID)
    assertOK(result)
    statuses = result["Value"]
    # On failure the whole status dictionary is reported, not only the one field
    self.assertEqual(statuses[jobID]["Status"], JobStatus.RUNNING, msg="Got %s" % str(statuses))

    result = monitor.getJobParameter(jobID, "par1")
    assertOK(result)
    assertGot(result["Value"], {"par1": "par1Value"})

    result = monitor.getJobParameters(jobID)
    assertOK(result)
    assertGot(result["Value"], {jobID: {"par1": "par1Value", "par2": "par2Value"}})

    result = monitor.getJobParameters(jobID, "par1")
    assertOK(result)
    assertGot(result["Value"], {jobID: {"par1": "par1Value"}})

    result = monitor.getJobAttribute(jobID, "Site")
    assertOK(result)
    assertGot(result["Value"], "Site")

    result = monitor.getJobAttributes(jobID)
    assertOK(result)
    for field, expected in (("ApplicationStatus", "app status"), ("JobName", "helloWorld")):
        assertGot(result["Value"][field], expected)

    result = monitor.getJobSummary(jobID)
    assertOK(result)
    for field, expected in (("ApplicationStatus", "app status"), ("Status", JobStatus.RUNNING)):
        assertGot(result["Value"][field], expected)

    result = monitor.getJobHeartBeatData(jobID)
    assertOK(result)
    assertGot(result["Value"], [])

    result = monitor.getInputData(jobID)
    assertOK(result)
    assertGot(result["Value"], [])

    assertOK(monitor.getJobSummary(jobID))
    assertOK(monitor.getAtticJobParameters(jobID))

    # Moving the job to Done must keep the application status set earlier
    assertOK(stateUpdate.setJobStatus(jobID, JobStatus.DONE, "MinorStatus", "Unknown"))

    result = monitor.getJobSummary(jobID)
    assertOK(result)
    finalExpectations = (
        ("Status", JobStatus.DONE),
        ("MinorStatus", "MinorStatus"),
        ("ApplicationStatus", "app status"),
    )
    for field, expected in finalExpectations:
        assertGot(result["Value"][field], expected)

    assertOK(stateUpdate.sendHeartBeat(jobID, {"bih": "bih"}, {"boh": "boh"}))

    # delete the job - this will just set its status to "deleted"
    wms.deleteJob(jobID)