def test_ParametricChain(self):
    """Submit a parametric job expected to expand into 3 real jobs, then drive them to deletion."""
    wmsClient = WMSClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    jobMonitor = JobMonitoringClient()

    # Build the parametric job description and submit it
    job = parametricJob()
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'])
    jobIDList = res['Value']
    self.assertEqual(len(jobIDList), 3)

    # The expanded jobs carry indexed names
    res = jobMonitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(res['OK'])
    jobNames = [params['JobName'] for params in res['Value'].values()]
    expectedNames = set('parametric_helloWorld_%s' % nJob for nJob in range(3))
    self.assertEqual(set(jobNames), expectedNames)

    # Mark every job Done, then delete the whole list in one call
    for jobID in jobIDList:
        res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(res['OK'])
    res = wmsClient.deleteJob(jobIDList)
    self.assertTrue(res['OK'])

    # Deletion only flags the jobs: they remain visible with status 'Deleted'
    for jobID in jobIDList:
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'])
        self.assertEqual(res['Value'], 'Deleted')
def test_matcher(self):
    """Submit a job, insert a matching TaskQueue entry, and verify the Matcher hands the job out.

    NOTE: a proper DN has to be inserted in the CS for the test to run.
    """
    # Resource description a pilot would normally report when asking for work
    resourceDescription = {'OwnerGroup': 'prod',
                           'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                           'DIRACVersion': 'pippo',
                           'ReleaseVersion': 'blabla',
                           'VirtualOrganization': 'LHCB',
                           'PilotInfoReportedFlag': 'True',
                           'PilotBenchmark': 'anotherPilot',
                           'LHCbPlatform': 'CERTO',
                           'Site': 'DIRAC.Jenkins.org',
                           'CPUTime': 86400}
    matcher = RPCClient('WorkloadManagement/Matcher')
    JobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    wmsClient = WMSClient()

    # Submit a job targeted at the test site
    job = helloWorldJob()
    job.setDestination('DIRAC.Jenkins.org')
    job.setInputData('/a/bbb')
    job.setType('User')
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    # assertTrue replaces the deprecated assert_ alias (removed in recent unittest)
    self.assertTrue(res['OK'])
    jobID = res['Value']

    # The job must be in Waiting state before it can be matched
    res = JobStateUpdate.setJobStatus(jobID, 'Waiting', 'matching', 'source')
    self.assertTrue(res['OK'])

    # Insert a TaskQueue entry compatible with the resource description above
    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup': 'prod',
                 'Setup': 'JenkinsSetup',
                 'CPUTime': 86400}
    res = tqDB.insertJob(jobID, tqDefDict, 10)
    self.assertTrue(res['OK'])

    res = matcher.requestJob(resourceDescription)
    print(res)  # print() works on both Python 2 and 3; 'print res' does not
    self.assertTrue(res['OK'])

    # cleanup
    wmsClient.deleteJob(jobID)
def test_ParametricChain(self):
    """This test will submit a parametric job which should generate 3 actual jobs."""
    wmsClient = WMSClient()
    jobStateUpdate = JobStateUpdateClient()
    jobMonitor = JobMonitoringClient()

    # Create and submit the parametric job
    job = parametricJob()
    jobDescription = createFile(job)
    submitRes = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(submitRes['OK'])
    jobIDList = submitRes['Value']
    self.assertEqual(len(jobIDList), 3)

    # Each expanded job gets an indexed name
    paramRes = jobMonitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(paramRes['OK'])
    obtainedNames = set(info['JobName'] for info in paramRes['Value'].values())
    self.assertEqual(obtainedNames,
                     set('parametric_helloWorld_%s' % nJob for nJob in range(3)))

    # Drive every job to Done, then delete them all at once
    for jobID in jobIDList:
        statusRes = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(statusRes['OK'])
    deleteRes = wmsClient.deleteJob(jobIDList)
    self.assertTrue(deleteRes['OK'])

    # Deleted jobs are only flagged, not removed
    for jobID in jobIDList:
        statusRes = jobMonitor.getJobStatus(jobID)
        self.assertTrue(statusRes['OK'])
        self.assertEqual(statusRes['Value'], 'Deleted')
def test_matcher(self):
    """Submit a job, insert a matching TaskQueue entry, and verify the Matcher hands the job out.

    NOTE: a proper DN has to be inserted in the CS for the test to run.
    """
    # Resource description a pilot would normally report when asking for work
    resourceDescription = {'OwnerGroup': 'prod',
                           'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                           'DIRACVersion': 'pippo',
                           'ReleaseVersion': 'blabla',
                           'VirtualOrganization': 'LHCB',
                           'PilotInfoReportedFlag': 'True',
                           'PilotBenchmark': 'anotherPilot',
                           'LHCbPlatform': 'CERTO',
                           'Site': 'DIRAC.Jenkins.org',
                           'CPUTime': 86400}
    matcher = RPCClient('WorkloadManagement/Matcher')
    JobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    wmsClient = WMSClient()

    # Submit a job targeted at the test site
    job = helloWorldJob()
    job.setDestination('DIRAC.Jenkins.org')
    job.setInputData('/a/bbb')
    job.setType('User')
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    # assertTrue replaces the deprecated assert_ alias (removed in recent unittest)
    self.assertTrue(res['OK'])
    jobID = res['Value']

    # The job must be in Waiting state before it can be matched
    res = JobStateUpdate.setJobStatus(jobID, 'Waiting', 'matching', 'source')
    self.assertTrue(res['OK'])

    # Insert a TaskQueue entry compatible with the resource description above
    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup': 'prod',
                 'Setup': 'dirac-JenkinsSetup',
                 'CPUTime': 86400}
    res = tqDB.insertJob(jobID, tqDefDict, 10)
    self.assertTrue(res['OK'])

    res = matcher.requestJob(resourceDescription)
    print(res)  # print() works on both Python 2 and 3; 'print res' does not
    self.assertTrue(res['OK'])

    # cleanup
    wmsClient.deleteJob(jobID)
def createJob():
    """Submit a simple helloWorld job and return its integer job ID."""
    job = helloWorldJob()
    jobDescription = createFile(job)
    submissionResult = WMSClient().submitJob(job._toJDL(xmlFile=jobDescription))
    assert submissionResult['OK'], submissionResult['Message']
    return int(submissionResult['Value'])
def test_matcher(self):
    """Submit a job, insert the matching TaskQueue entry, and request a job via the Matcher."""
    # NOTE: a proper DN has to be inserted for the test to run
    resourceDescription = {
        "OwnerGroup": "prod",
        "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
        "DIRACVersion": "pippo",
        "GridCE": "some.grid.ce.org",
        "ReleaseVersion": "blabla",
        "VirtualOrganization": "LHCb",
        "PilotInfoReportedFlag": "True",
        "PilotBenchmark": "anotherPilot",
        "Site": "DIRAC.Jenkins.ch",
        "CPUTime": 86400,
    }
    wmsClient = WMSClient()

    # a job bound to the test site, with input data
    job = helloWorldJob()
    job.setDestination("DIRAC.Jenkins.ch")
    job.setInputData("/a/bbb")
    job.setType("User")
    jobDescription = createFile(job)
    result = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(result["OK"], result.get("Message"))
    jobID = result["Value"]

    # forcing the update so the job becomes matchable
    result = JobStateUpdateClient().setJobStatus(jobID, JobStatus.WAITING, "matching", "source", None, True)
    self.assertTrue(result["OK"], result.get("Message"))

    # TaskQueue entry compatible with the resource description above
    tqDefDict = {
        "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
        "OwnerGroup": "prod",
        "Setup": "dirac-JenkinsSetup",
        "CPUTime": 86400,
    }
    result = TaskQueueDB().insertJob(jobID, tqDefDict, 10)
    self.assertTrue(result["OK"], result.get("Message"))

    result = MatcherClient().requestJob(resourceDescription)
    print(result)
    self.assertTrue(result["OK"], result.get("Message"))

    # cleanup
    wmsClient.deleteJob(jobID)
def test_ParametricChain(self):
    """This test will submit a parametric job which should generate 3 actual jobs"""
    wmsClient = WMSClient()
    jobStateUpdate = JobStateUpdateClient()
    jobMonitor = JobMonitoringClient()

    # create and submit the parametric job
    job = parametricJob()
    jobDescription = createFile(job)
    result = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(result["OK"], result.get("Message"))
    jobIDList = result["Value"]
    self.assertEqual(len(jobIDList), 3, msg="Got %s" % str(jobIDList))

    # the three expanded jobs carry indexed names
    result = jobMonitor.getJobsParameters(jobIDList, ["JobName"])
    self.assertTrue(result["OK"], result.get("Message"))
    obtainedNames = {result["Value"][jid]["JobName"] for jid in result["Value"]}
    expectedNames = {"parametric_helloWorld_%s" % nJob for nJob in range(3)}
    self.assertEqual(obtainedNames, expectedNames)

    # move every job to CHECKING, then delete them all in one call
    for jobID in jobIDList:
        result = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING, "checking", "source")
        self.assertTrue(result["OK"], result.get("Message"))
    result = wmsClient.deleteJob(jobIDList)
    self.assertTrue(result["OK"], result.get("Message"))
    print(result)

    # deletion only marks the jobs DELETED, they stay queryable
    for jobID in jobIDList:
        result = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(result["OK"], result.get("Message"))
        self.assertEqual(result["Value"][jobID]["Status"], JobStatus.DELETED,
                         msg="Got %s" % str(result["Value"]))
class WorkflowTasks(TaskBase):
  """ Handles jobs

      Turns transformation tasks into WMS jobs (one per task, or a single
      parametric job in bulk mode), submits them, and tracks their status.
      NOTE: this code uses Python 2 idioms (iteritems, basestring, StringIO,
      dict.values()[0]) and therefore targets Python 2.
  """

  def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
               outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
               ownerDN=None, ownerGroup=None):
    """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """
    if not logger:
      logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    # Certificates (delegation) are used only when both DN and group are supplied
    useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False

    if not submissionClient:
      self.submissionClient = WMSClient(useCertificates=useCertificates,
                                        delegatedDN=ownerDN,
                                        delegatedGroup=ownerGroup)
    else:
      self.submissionClient = submissionClient

    if not jobMonitoringClient:
      self.jobMonitoringClient = JobMonitoringClient()
    else:
      self.jobMonitoringClient = jobMonitoringClient

    if not jobClass:
      self.jobClass = Job
    else:
      self.jobClass = jobClass

    if not opsH:
      self.opsH = Operations()
    else:
      self.opsH = opsH

    # Fall back to CS values when module/plugin are not passed explicitly
    if not outputDataModule:
      self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
    else:
      self.outputDataModule = outputDataModule

    if not destinationPlugin:
      self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
    else:
      self.destinationPlugin = destinationPlugin

    # Lazily-instantiated plugin/module objects (see _handleDestination / getOutputData)
    self.destinationPlugin_o = None
    self.outputDataModule_o = None

  def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='',
                                 bulkSubmissionFlag=False):
    """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param str owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation
    :param bool bulkSubmissionFlag: flag for using bulk submission or not

    :return: S_OK/S_ERROR with updated taskDict
    """
    # Fill in owner/group/DN from the current proxy when not given
    if (not owner) or (not ownerGroup):
      res = getProxyInfo(False, False)
      if not res['OK']:
        return res
      proxyInfo = res['Value']
      owner = proxyInfo['username']
      ownerGroup = proxyInfo['group']

    if not ownerDN:
      res = getDNForUsername(owner)
      if not res['OK']:
        return res
      ownerDN = res['Value'][0]

    if bulkSubmissionFlag:
      return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
    # not a bulk submission
    return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

  def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a single job object for bulk submission

    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param str owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation

    :return: S_OK/S_ERROR with updated taskDict (adds the 'BulkJobObject' key)
    """
    if taskDict:
      transID = taskDict.values()[0]['TransformationID']
    else:
      return S_OK({})

    method = '__prepareTasksBulk'
    startTime = time.time()

    # Prepare the bulk Job object with common parameters
    oJob = self.jobClass(transBody)
    self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                     transID=transID, method=method)
    oJob.setOwner(owner)
    oJob.setOwnerGroup(ownerGroup)
    oJob.setOwnerDN(ownerDN)

    jobType = oJob.workflow.findParameter('JobType').getValue()
    transGroup = str(transID).zfill(8)

    # Verify that the JOB_ID parameter is added to the workflow
    if not oJob.workflow.findParameter('JOB_ID'):
      oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID")

    if oJob.workflow.findParameter('PRODUCTION_ID'):
      oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
    else:
      oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                         'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
    oJob.setType(jobType)
    self._logVerbose('Adding default transformation group of %s' % (transGroup),
                     transID=transID, method=method)
    oJob.setJobGroup(transGroup)

    # Hospital transformations get special routing (see _handleHospital)
    if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
      self._handleHospital(oJob)

    # Collect per job parameters sequences
    paramSeqDict = {}
    # tasks must be sorted because we use bulk submission and we must find the correspondance
    for taskID in sorted(taskDict):
      paramsDict = taskDict[taskID]
      seqDict = {}
      paramsDict['JobType'] = jobType

      # Handle destination site
      sites = self._handleDestination(paramsDict)
      if not sites:
        self._logError('Could not get a list a sites', transID=transID, method=method)
        return S_ERROR(ETSUKN, "Can not evaluate destination site")
      else:
        self._logVerbose('Setting Site: ', str(sites), transID=transID, method=method)
        seqDict['Site'] = sites

      seqDict['JobName'] = self._transTaskName(transID, taskID)
      seqDict['JOB_ID'] = str(taskID).zfill(8)

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # Handle Input Data
      inputData = paramsDict.get('InputData')
      if inputData:
        # 'InputData' can be a ';'-separated string; normalize it to a list
        if isinstance(inputData, basestring):
          inputData = inputData.replace(' ', '').split(';')
        self._logVerbose('Setting input data to %s' % inputData,
                         transID=transID, method=method)
        seqDict['InputData'] = inputData
      elif paramSeqDict.get('InputData') is not None:
        # All tasks in a bulk job must agree on whether they have input data
        self._logError("Invalid mixture of jobs with and without input data")
        return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

      for paramName, paramValue in paramsDict.iteritems():
        if paramName not in ('InputData', 'Site', 'TargetSE'):
          if paramValue:
            self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                             transID=transID, method=method)
            seqDict[paramName] = paramValue

      outputParameterList = []
      if self.outputDataModule:
        res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,  # pylint: disable=protected-access
                                  'TaskID': taskID, 'InputData': inputData})
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          continue
        for name, output in res['Value'].iteritems():
          seqDict[name] = output
          outputParameterList.append(name)
          if oJob.workflow.findParameter(name):
            oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
          else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               name, 'JDL', "%%(%s)s" % name, name)

      for pName, seq in seqDict.iteritems():
        paramSeqDict.setdefault(pName, []).append(seq)

    # NOTE(review): outputParameterList here holds the value from the LAST task
    # iteration only — presumably all tasks produce the same output parameter
    # names; confirm against the output-data module contract.
    for paramName, paramSeq in paramSeqDict.iteritems():
      if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList:
        res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
      else:
        res = oJob.setParameterSequence(paramName, paramSeq)
      if not res['OK']:
        return res

    if taskDict:
      self._logInfo('Prepared %d tasks' % len(taskDict),
                    transID=transID, method=method, reftime=startTime)

    taskDict['BulkJobObject'] = oJob
    return S_OK(taskDict)

  def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a job object per task

    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation

    :return: S_OK/S_ERROR with updated taskDict (adds a 'TaskObject' per task)
    """
    if taskDict:
      transID = taskDict.values()[0]['TransformationID']
    else:
      return S_OK({})

    method = '__prepareTasks'
    startTime = time.time()
    oJobTemplate = self.jobClass(transBody)
    oJobTemplate.setOwner(owner)
    oJobTemplate.setOwnerGroup(ownerGroup)
    oJobTemplate.setOwnerDN(ownerDN)
    try:
      # The template may not carry a 'Site' parameter at all
      site = oJobTemplate.workflow.findParameter('Site').getValue()
    except AttributeError:
      site = None
    jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
    templateOK = False
    getOutputDataTiming = 0.
    for taskID, paramsDict in taskDict.iteritems():
      # Create a job for each task and add it to the taskDict
      if not templateOK:
        templateOK = True
        # Update the template with common information (done once, on first task)
        self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        transGroup = str(transID).zfill(8)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJobTemplate.setJobGroup(transGroup)
        if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
          oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))
        else:
          oJobTemplate._addParameter(oJobTemplate.workflow, 'PRODUCTION_ID',
                                     'string', str(transID).zfill(8), "Production ID")
        if not oJobTemplate.workflow.findParameter('JOB_ID'):
          oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID',
                                     'string', '00000000', "Initial JOB_ID")

      if site is not None:
        paramsDict['Site'] = site
      paramsDict['JobType'] = jobType
      # Now create the job from the template
      oJob = copy.deepcopy(oJobTemplate)
      constructedName = self._transTaskName(transID, taskID)
      self._logVerbose('Setting task name to %s' % constructedName,
                       transID=transID, method=method)
      oJob.setName(constructedName)
      oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
      inputData = None

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # These helper functions do the real job
      sites = self._handleDestination(paramsDict)
      if not sites:
        # An empty 'TaskObject' marks the task as failed-to-prepare
        self._logError('Could not get a list a sites', transID=transID, method=method)
        paramsDict['TaskObject'] = ''
        continue
      else:
        self._logDebug('Setting Site: ', str(sites), transID=transID, method=method)
        res = oJob.setDestination(sites)
        if not res['OK']:
          self._logError('Could not set the site: %s' % res['Message'],
                         transID=transID, method=method)
          paramsDict['TaskObject'] = ''
          continue

      self._handleInputs(oJob, paramsDict)
      self._handleRest(oJob, paramsDict)

      hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
      if int(transID) in hospitalTrans:
        self._handleHospital(oJob)

      paramsDict['TaskObject'] = ''
      if self.outputDataModule:
        # Accumulate wall-clock time spent in getOutputData for the summary log below
        getOutputDataTiming -= time.time()
        res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,
                                  'TaskID': taskID, 'InputData': inputData})
        getOutputDataTiming += time.time()
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          continue
        for name, output in res['Value'].iteritems():
          oJob._addJDLParameter(name, ';'.join(output))
      paramsDict['TaskObject'] = oJob
    if taskDict:
      self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                       transID=transID, method=method)
      self._logInfo('Prepared %d tasks' % len(taskDict),
                    transID=transID, method=method, reftime=startTime)
    return S_OK(taskDict)

  #############################################################################

  def _handleDestination(self, paramsDict):
    """ Handle Sites and TargetSE in the parameters

    :param dict paramsDict: task parameters; may contain 'Site' (';'-separated string)
    :return: list of candidate sites (['ANY'] when unconstrained), or S_ERROR dict
             when the destination plugin cannot be instantiated
    """
    try:
      sites = ['ANY']
      if paramsDict['Site']:
        # 'Site' comes from the XML and therefore is ; separated
        sites = fromChar(paramsDict['Site'], sepChar=';')
    except KeyError:
      pass

    # Instantiate the destination plugin once and cache it on the instance
    if self.destinationPlugin_o:
      destinationPlugin_o = self.destinationPlugin_o
    else:
      res = self.__generatePluginObject(self.destinationPlugin)
      if not res['OK']:
        self._logFatal("Could not generate a destination plugin object")
        return res
      destinationPlugin_o = res['Value']
      self.destinationPlugin_o = destinationPlugin_o

    destinationPlugin_o.setParameters(paramsDict)
    destSites = destinationPlugin_o.run()
    if not destSites:
      return sites

    # Now we need to make the AND with the sites, if defined
    if sites != ['ANY']:
      # Need to get the AND
      destSites &= set(sites)

    return list(destSites)

  def _handleInputs(self, oJob, paramsDict):
    """ set job inputs (+ metadata)

    :param oJob: the Job object being prepared
    :param dict paramsDict: task parameters; reads 'InputData' and 'TransformationID'
    """
    inputData = paramsDict.get('InputData')
    transID = paramsDict['TransformationID']
    if inputData:
      self._logVerbose('Setting input data to %s' % inputData,
                       transID=transID, method='_handleInputs')
      res = oJob.setInputData(inputData)
      if not res['OK']:
        # Best-effort: the error is logged but the task is not aborted here
        self._logError("Could not set the inputs: %s" % res['Message'],
                       transID=transID, method='_handleInputs')

  def _handleRest(self, oJob, paramsDict):
    """ add as JDL parameters all the other parameters that are not for inputs or destination
    """
    transID = paramsDict['TransformationID']
    for paramName, paramValue in paramsDict.iteritems():
      if paramName not in ('InputData', 'Site', 'TargetSE'):
        if paramValue:
          self._logDebug('Setting %s to %s' % (paramName, paramValue),
                         transID=transID, method='_handleRest')
          oJob._addJDLParameter(paramName, paramValue)

  def _handleHospital(self, oJob):
    """ Optional handle of hospital jobs

        Re-routes the job to the configured hospital site/CEs for debugging.
    """
    oJob.setType('Hospital')
    oJob.setInputDataPolicy('download', dataScheduling=False)
    hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
    oJob.setDestination(hospitalSite)
    hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
    if hospitalCEs:
      oJob._addJDLParameter('GridCE', hospitalCEs)

  def __generatePluginObject(self, plugin):
    """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

    :param str plugin: name of the plugin to instantiate
    :return: S_OK(plugin instance) / S_ERROR()
    """
    # NOTE(review): self.pluginLocation is not set in this class's __init__ —
    # presumably provided by TaskBase or a subclass; confirm before relying on it.
    method = '__generatePluginObject'
    try:
      plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
    except ImportError as e:
      self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e), method=method)
      return S_ERROR()
    try:
      plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
      return S_OK(plugin_o)
    except AttributeError as e:
      self._logException("Failed to create %s(): %s." % (plugin, e), method=method)
      return S_ERROR()

  #############################################################################

  def getOutputData(self, paramDict):
    """ Get the list of job output LFNs from the provided plugin

    :param dict paramDict: module arguments ('Job', 'TransformationID', 'TaskID', 'InputData')
    :return: result of the output-data module's execute()
    """
    if not self.outputDataModule_o:
      # Create the module object (cached for subsequent calls)
      moduleFactory = ModuleFactory()

      moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
      if not moduleInstance['OK']:
        return moduleInstance
      self.outputDataModule_o = moduleInstance['Value']
    # This is the "argument" to the module, set it and then execute
    self.outputDataModule_o.paramDict = paramDict
    return self.outputDataModule_o.execute()

  def submitTransformationTasks(self, taskDict):
    """ Submit the tasks

        Dispatches to bulk submission when a 'BulkJobObject' is present
        (set by __prepareTasksBulk), else submits one job per task.
    """
    if 'BulkJobObject' in taskDict:
      return self.__submitTransformationTasksBulk(taskDict)
    return self.__submitTransformationTasks(taskDict)

  def __submitTransformationTasksBulk(self, taskDict):
    """ Submit jobs in one go with one parametric job
    """
    if not taskDict:
      return S_OK(taskDict)
    startTime = time.time()
    method = '__submitTransformationTasksBulk'

    oJob = taskDict.pop('BulkJobObject')
    # we can only do this, once the job has been popped, or we _might_ crash
    transID = taskDict.values()[0]['TransformationID']

    if oJob is None:
      self._logError('no bulk Job object found', transID=transID, method=method)
      return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

    result = self.submitTaskToExternal(oJob)
    if not result['OK']:
      self._logError('Failed to submit tasks to external',
                     transID=transID, method=method)
      return result

    jobIDList = result['Value']
    if len(jobIDList) != len(taskDict):
      # Partial submission is treated as a global failure: mark all tasks failed
      for task in taskDict.values():
        task['Success'] = False
      return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
    # Get back correspondance with tasks sorted by ID
    for jobID, taskID in zip(jobIDList, sorted(taskDict)):
      taskDict[taskID]['ExternalID'] = jobID
      taskDict[taskID]['Success'] = True

    submitted = len(jobIDList)
    self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                  transID=transID, method=method)
    return S_OK(taskDict)

  def __submitTransformationTasks(self, taskDict):
    """ Submit jobs one by one
    """
    method = '__submitTransformationTasks'
    submitted = 0
    failed = 0
    startTime = time.time()
    for task in taskDict.itervalues():
      transID = task['TransformationID']
      if not task['TaskObject']:
        # Task failed at preparation time (empty 'TaskObject' sentinel)
        task['Success'] = False
        failed += 1
        continue
      res = self.submitTaskToExternal(task['TaskObject'])
      if res['OK']:
        task['ExternalID'] = res['Value']
        task['Success'] = True
        submitted += 1
      else:
        self._logError("Failed to submit task to WMS", res['Message'],
                       transID=transID, method=method)
        task['Success'] = False
        failed += 1
    if submitted:
      self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                    transID=transID, method=method)
    if failed:
      self._logError('Failed to submit %d tasks to WMS.' % (failed),
                     transID=transID, method=method)
    return S_OK(taskDict)

  def submitTaskToExternal(self, job):
    """ Submits a single job (which can be a bulk one) to the WMS.

    :param job: either a JDL/workflow string or an instance of self.jobClass
    :return: result of WMSClient.submitJob
    """
    if isinstance(job, basestring):
      try:
        oJob = self.jobClass(job)
      except Exception as x:  # pylint: disable=broad-except
        self._logException("Failed to create job object", '', x)
        return S_ERROR("Failed to create job object")
    elif isinstance(job, self.jobClass):
      oJob = job
    else:
      self._logError("No valid job description found")
      return S_ERROR("No valid job description found")

    workflowFileObject = StringIO.StringIO(oJob._toXML())
    jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
    return self.submissionClient.submitJob(jdl, workflowFileObject)

  def updateTransformationReservedTasks(self, taskDicts):
    """ Resolve WMS job IDs for a list of reserved tasks by their task names.

    :param taskDicts: list of dicts with 'TransformationID' and 'TaskID'
    :return: S_OK({'NoTasks': [names not found in WMS], 'TaskNameIDs': {name: wmsID}})
    """
    transID = None
    jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
                for taskDict in taskDicts]
    res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
    if not res['OK']:
      self._logError("Failed to get task from WMS", res['Message'],
                     transID=transID, method='updateTransformationReservedTasks')
      return res
    jobNameIDs = {}
    for wmsID in res['Value']:
      res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
      if not res['OK']:
        # Best-effort: a failed summary lookup only drops this wmsID
        self._logWarn("Failed to get task summary from WMS", res['Message'],
                      transID=transID, method='updateTransformationReservedTasks')
      else:
        jobNameIDs[res['Value']['JobName']] = int(wmsID)

    noTask = list(set(jobNames) - set(jobNameIDs))
    return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

  def getSubmittedTaskStatus(self, taskDicts):
    """ Check the status of a list of tasks and return lists of taskIDs for each new status

    :param taskDicts: list of dicts with 'ExternalID', 'TaskID', 'ExternalStatus',
                      'TransformationID' (all tasks from the same transformation)
    :return: S_OK({newStatus: [taskIDs]})
    """
    method = 'getSubmittedTaskStatus'

    if taskDicts:
      wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
      transID = taskDicts[0]['TransformationID']
    else:
      return S_OK({})
    res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID, method=method)
      return res
    statusDict = res['Value']
    updateDict = {}
    for taskDict in taskDicts:
      taskID = taskDict['TaskID']
      wmsID = int(taskDict['ExternalID'])
      if not wmsID:
        continue
      oldStatus = taskDict['ExternalStatus']
      # Jobs no longer known to the WMS are reported as 'Removed'
      newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
      if oldStatus != newStatus:
        if newStatus == "Removed":
          self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' % (transID, taskID, oldStatus),
                           transID=transID, method=method)
          newStatus = "Failed"
        self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus),
                         transID=transID, method=method)
        updateDict.setdefault(newStatus, []).append(taskID)
    return S_OK(updateDict)

  def getSubmittedFileStatus(self, fileDicts):
    """ Check the status of a list of files and return the new status of each LFN

    :param fileDicts: list of dicts with 'TransformationID', 'TaskID', 'LFN', 'Status'
                      (all files from the same transformation)
    :return: S_OK({lfn: newStatus}) for LFNs whose status changed
    """
    if not fileDicts:
      return S_OK({})

    method = 'getSubmittedFileStatus'
    # All files are from the same transformation
    transID = fileDicts[0]['TransformationID']
    taskFiles = {}
    for fileDict in fileDicts:
      jobName = self._transTaskName(transID, fileDict['TaskID'])
      taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

    res = self.updateTransformationReservedTasks(fileDicts)
    if not res['OK']:
      self._logWarn("Failed to obtain taskIDs for files",
                    transID=transID, method=method)
      return res
    noTasks = res['Value']['NoTasks']
    taskNameIDs = res['Value']['TaskNameIDs']

    updateDict = {}
    # Files of tasks unknown to the WMS go back to 'Unused'
    for jobName in noTasks:
      for lfn, oldStatus in taskFiles[jobName].iteritems():
        if oldStatus != 'Unused':
          updateDict[lfn] = 'Unused'

    res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID, method=method)
      return res
    statusDict = res['Value']
    for jobName, wmsID in taskNameIDs.iteritems():
      jobStatus = statusDict.get(wmsID, {}).get('Status')
      # Map terminal job states to file states; other states leave files untouched
      newFileStatus = {'Done': 'Processed',
                       'Completed': 'Processed',
                       'Failed': 'Unused'}.get(jobStatus)
      if newFileStatus:
        for lfn, oldStatus in taskFiles[jobName].iteritems():
          if newFileStatus != oldStatus:
            updateDict[lfn] = newFileStatus
    return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoring(self):
    """ Verifying all JobStateUpdate and JobMonitoring functions

        Uses assertTrue instead of the deprecated assert_ alias
        (assert_ was deprecated in unittest and later removed).
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'])
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'])
    # no parameter names requested -> empty result expected
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {})
    res = jobMonitor.getJobsParameters([jobID], ['Owner'])
    self.assertTrue(res['OK'])

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'])
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue(res['OK'])
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'])
    # res = jobMonitor.traceJobParameter('Site', 1, 'Status')
    # self.assertTrue(res['OK'])

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'])

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """# Now, let's submit some jobs. Different sites, types, inputs

    Submits one job per (inputData, jobType) combination, then verifies the
    JobMonitoring aggregation methods and bulk status updates.

    Fixes: removed a leftover debug ``print(res)`` after ``getSites()`` and a
    stray duplicated assertion re-checking the same result after
    ``getMinorStates()``.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()
    jobIDs = []
    lfnss = [["/a/1.txt", "/a/2.txt"], ["/a/1.txt", "/a/3.txt", "/a/4.txt"], []]
    types = ["User", "Test"]
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination("DIRAC.Jenkins.ch")
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res["OK"], res.get("Message"))
            jobID = res["Value"]
            jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertTrue(set(res["Value"]) <= {"ANY", "DIRAC.Jenkins.ch", "Site"}, msg="Got %s" % res["Value"])
    res = jobMonitor.getJobTypes()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(sorted(res["Value"]), sorted(types), msg="Got %s" % str(sorted(res["Value"])))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], ["app status", "Unknown"], msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getOwners()
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res["OK"], res.get("Message"))
    resJG_empty = res["Value"]
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res["OK"], res.get("Message"))
    resJG_olderThanNow = res["Value"]
    # with no cutoff and with a "now" cutoff the same groups must come back
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res["OK"], res.get("Message"))
    resJG_olderThanOneYear = res["Value"]
    self.assertTrue(set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)), resJG_olderThanOneYear)
    res = jobMonitor.getStates()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertTrue(
        sorted(res["Value"]) in [[JobStatus.RECEIVED], sorted([JobStatus.RECEIVED, JobStatus.KILLED])],
        res["Value"])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertTrue(
        sorted(res["Value"]) in [
            ["Job accepted"],
            sorted(["Job accepted", "Job Rescheduled"]),
            sorted(["Job accepted", "Marked for termination"]),
        ],
        res["Value"],
    )
    res = jobMonitor.getJobs()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertTrue(set(str(x) for x in jobIDs) <= set(res["Value"]), res["Value"])
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res["OK"], res.get("Message"))

    # bulk status update, applied to the last job submitted in the loop above
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow()): {
                "Status": JobStatus.CHECKING,
                "MinorStatus": "MinorStatus",
                "Source": "Unknown",
            }
        },
        False,
    )
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.CHECKING)
    self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus")
    # two timestamped updates in one call: the latest one must win
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)): {
                "Status": JobStatus.WAITING,
                "MinorStatus": "MinorStatus",
                "Source": "Unknown",
            },
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)): {
                "Status": JobStatus.MATCHED,
                "MinorStatus": "MinorStatus-matched",
                "Source": "Unknown",
            },
        },
        False,
    )
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
    self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")
    # setting a job parameter must not touch the status
    res = jobStateUpdate.setJobsParameter({jobID: ["Whatever", "booh"]})
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
    self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")
    res = jobStateUpdate.setJobAttribute(jobID, "Status", JobStatus.RUNNING)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.RUNNING)

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
file.file.close() tFile.close() fileNameList.append(name) except Exception,x: exception_counter = 1 c.result = {"success":"false","error":"An EXCEPTION happens during saving your sandbox file(s): %s" % str(x)} if len(fileNameList) > 0 and exception_counter == 0: sndBox = "InputSandbox = {\"" + "\",\"".join(fileNameList) + "\"};" else: sndBox = "" if exception_counter == 0: jdl = jdl + sndBox from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient jobManager = WMSClient(getRPCClient("WorkloadManagement/JobManager"), getRPCClient("WorkloadManagement/SandboxStore"), getTransferClient("WorkloadManagement/SandboxStore")) jdl = str(jdl) gLogger.info("J D L : ",jdl) try: result = jobManager.submitJob(jdl) if result["OK"]: c.result = {"success":"true","result":result["Value"]} else: c.result = {"success":"false","error":result["Message"]} except Exception,x: c.result = {"success":"false","error":"An EXCEPTION happens during job submittion: %s" % str(x)} if clearFS: shutil.rmtree(storePath) return c.result ################################################################################
def test_FullChain(self):
    """This test will
    - call all the WMSClient methods
      that will end up calling all the JobManager service methods
    - use the JobMonitoring to verify few properties
    - call the JobCleaningAgent to eliminate job entries from the DBs

    Fix: removed the dead assignment ``jobID = res["JobID"]`` that was
    immediately overwritten by ``jobID = res["Value"]`` (the two are asserted
    equal just above).
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertTrue(isinstance(res["Value"], int), msg="Got %s" % type(res["Value"]))
    self.assertEqual(res["Value"], res["JobID"], msg="Got %s, expected %s" % (str(res["Value"]), res["JobID"]))
    jobID = res["Value"]

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "Executing Minchiapp", "source")
    self.assertTrue(res["OK"], res.get("Message"))

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.RECEIVED, msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsMinorStatus([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"MinorStatus": "Job Rescheduled"}}, msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsApplicationStatus([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"ApplicationStatus": "Unknown"}}, msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsStates([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(
        res["Value"],
        {jobID: {"Status": JobStatus.RECEIVED, "MinorStatus": "Job Rescheduled", "ApplicationStatus": "Unknown"}},
        msg="Got %s" % str(res["Value"]),
    )

    # updating the status again
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING, "checking", "source")
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.WAITING, "waiting", "source")
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.MATCHED, "matched", "source")
    self.assertTrue(res["OK"], res.get("Message"))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.KILLED, msg="Got %s" % str(res["Value"]))

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.DELETED, msg="Got %s" % str(res["Value"]))
def test_FullChain(self):
    """ This test will
    - call all the WMSClient methods
      that will end up calling all the JobManager service methods
    - use the JobMonitoring to verify few properties
    - call the JobCleaningAgent to eliminate job entries from the DBs

    Fixes: ``self.assert_`` (deprecated alias, removed in Python 3.12)
    replaced by ``assertTrue``; the previously ignored return values of
    ``setJobStatus`` are now checked.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'])
    # self.assertEqual( type( res['Value'] ), int )
    # self.assertEqual( res['Value'], res['JobID'] )
    # jobID = res['JobID']
    jobID = res['Value']

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')
    self.assertTrue(res['OK'])

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res['OK'])

    # reschedule the job - this brings it back to Received
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Received')

    # updating the status again
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'])

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Killed')

    # updating the status aaaagain
    res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
    self.assertTrue(res['OK'])

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Done')  # this time it won't kill... it's done!

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Deleted')
def test_JobStateUpdateAndJobMonitoring(self):
    """ Verifying all JobStateUpdate and JobMonitoring functions

    Fix: ``self.assert_`` is a deprecated alias of ``assertTrue`` (removed in
    Python 3.12) — replaced throughout.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'])
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'])

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'])
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue(res['OK'])
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'])
    # res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
    # self.assertTrue(res['OK'])

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'])
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'])

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """ # Now, let's submit some jobs. Different sites, types, inputs

    Fixes: ``self.assert_`` (deprecated, removed in Python 3.12) replaced by
    ``assertTrue``; Python-2-only ``long()`` replaced by ``int()``; a stray
    duplicated assertion after ``getMinorStates()`` removed.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'])
                jobID = res['Value']
                jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'])
    self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.org']))
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'])
    self.assertEqual(sorted(res['Value']), sorted(types))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'])
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))
    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'])
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'])
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'])
    res = jobMonitor.getStates()
    self.assertTrue(res['OK'])
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'])
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'matching'])])
    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'])
    self.assertTrue(set(str(x) for x in jobIDs) <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'])
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'])
    try:
        # one of the counters may be missing (None): the TypeError is tolerated
        self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting')
                        >= int(len(dests) * len(lfnss) * len(types)))
    except TypeError:
        pass
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'])
    # bulk status update applied to the last job submitted in the loop above
    res = jobStateUpdate.setJobStatusBulk(jobID, {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                                                                    'MinorStatus': 'MinorStatus',
                                                                                    'ApplicationStatus': 'ApplicationStatus',
                                                                                    'Source': 'Unknown'}})
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'])

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
def test_JobStateUpdateAndJobMonitoring(self):
    """Verifying all JobStateUpdate and JobMonitoring functions.

    One job is submitted; each JobStateUpdate setter is called and the
    corresponding JobMonitoring getter is checked against it.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    # Create and submit a single hello-world job
    job = helloWorldJob()
    jobDescription = createFile(job)
    ret = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(ret["OK"], ret.get("Message"))
    jobID = int(ret["Value"])
    # jobID = ret['JobID']

    # Basic retrievals on the freshly submitted job
    ret = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(ret["OK"], ret.get("Message"))
    ret = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(ret["OK"], ret.get("Message"))
    ret = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(ret["OK"], ret.get("Message"))
    ret = jobMonitor.getJobOwner(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))

    # Setters — the first status update is forced
    ret = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "running", "source", None, True)
    self.assertTrue(ret["OK"], ret.get("Message"))
    ret = jobStateUpdate.setJobParameters(jobID, [("par1", "par1Value"), ("par2", "par2Value")])
    time.sleep(5)
    self.assertTrue(ret["OK"], ret.get("Message"))
    ret = jobStateUpdate.setJobApplicationStatus(jobID, "app status", "source")
    self.assertTrue(ret["OK"], ret.get("Message"))
    # ret = jobStateUpdate.setJobFlag()
    # self.assertTrue(ret['OK'], ret.get('Message'))
    # ret = jobStateUpdate.unsetJobFlag()
    # self.assertTrue(ret['OK'], ret.get('Message'))
    ret = jobStateUpdate.setJobSite(jobID, "Site")
    self.assertTrue(ret["OK"], ret.get("Message"))

    # Getters — verify everything that was just set
    ret = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(ret["Value"][jobID]["Status"], JobStatus.RUNNING, msg="Got %s" % str(ret["Value"]))
    ret = jobMonitor.getJobParameter(jobID, "par1")
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(ret["Value"], {"par1": "par1Value"}, msg="Got %s" % str(ret["Value"]))
    ret = jobMonitor.getJobParameters(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(
        ret["Value"],
        {jobID: {"par1": "par1Value", "par2": "par2Value"}},
        msg="Got %s" % str(ret["Value"]))
    ret = jobMonitor.getJobParameters(jobID, "par1")
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(ret["Value"], {jobID: {"par1": "par1Value"}}, msg="Got %s" % str(ret["Value"]))
    ret = jobMonitor.getJobAttribute(jobID, "Site")
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(ret["Value"], "Site", msg="Got %s" % str(ret["Value"]))
    ret = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(
        ret["Value"]["ApplicationStatus"], "app status",
        msg="Got %s" % str(ret["Value"]["ApplicationStatus"]))
    self.assertEqual(ret["Value"]["JobName"], "helloWorld", msg="Got %s" % str(ret["Value"]["JobName"]))
    ret = jobMonitor.getJobSummary(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(
        ret["Value"]["ApplicationStatus"], "app status",
        msg="Got %s" % str(ret["Value"]["ApplicationStatus"]))
    self.assertEqual(ret["Value"]["Status"], JobStatus.RUNNING, msg="Got %s" % str(ret["Value"]["Status"]))
    ret = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(ret["Value"], [], msg="Got %s" % str(ret["Value"]))
    ret = jobMonitor.getInputData(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(ret["Value"], [], msg="Got %s" % str(ret["Value"]))
    ret = jobMonitor.getJobSummary(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    ret = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))

    # Flip the job to Done and verify the summary reflects it
    ret = jobStateUpdate.setJobStatus(jobID, JobStatus.DONE, "MinorStatus", "Unknown")
    self.assertTrue(ret["OK"], ret.get("Message"))
    ret = jobMonitor.getJobSummary(jobID)
    self.assertTrue(ret["OK"], ret.get("Message"))
    self.assertEqual(ret["Value"]["Status"], JobStatus.DONE, msg="Got %s" % str(ret["Value"]["Status"]))
    self.assertEqual(
        ret["Value"]["MinorStatus"], "MinorStatus",
        msg="Got %s" % str(ret["Value"]["MinorStatus"]))
    self.assertEqual(
        ret["Value"]["ApplicationStatus"], "app status",
        msg="Got %s" % str(ret["Value"]["ApplicationStatus"]))
    ret = jobStateUpdate.sendHeartBeat(jobID, {"bih": "bih"}, {"boh": "boh"})
    self.assertTrue(ret["OK"], ret.get("Message"))

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """ # Now, let's submit some jobs. Different sites, types, inputs

    Fixes: ``self.assert_`` (deprecated, removed in Python 3.12) replaced by
    ``assertTrue``; Python-2-only ``long()`` replaced by ``int()``; a stray
    duplicated assertion after ``getMinorStates()`` removed.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'])
                jobID = res['Value']
                jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'])
    self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'])
    self.assertEqual(sorted(res['Value']), sorted(types))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'])
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))
    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'])
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'])
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'])
    res = jobMonitor.getStates()
    self.assertTrue(res['OK'])
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'])
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'matching'])])
    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'])
    self.assertTrue(set(str(x) for x in jobIDs) <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'])
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'])
    try:
        # one of the counters may be missing (None): the TypeError is tolerated
        self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting')
                        >= int(len(dests) * len(lfnss) * len(types)))
    except TypeError:
        pass
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'])
    # bulk status update applied to the last job submitted in the loop above
    res = jobStateUpdate.setJobStatusBulk(
        jobID, {
            str(datetime.datetime.utcnow()): {
                'Status': 'Running',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            }
        })
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'])

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
class WorkflowTasks(TaskBase): """Handles jobs""" def __init__( self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None, outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None, ): """Generates some default objects. jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works: VOs can pass in their job class extension, if present """ if not logger: logger = gLogger.getSubLogger(self.__class__.__name__) super(WorkflowTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not submissionClient: self.submissionClient = WMSClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.submissionClient = submissionClient if not jobMonitoringClient: self.jobMonitoringClient = JobMonitoringClient() else: self.jobMonitoringClient = jobMonitoringClient if not jobClass: self.jobClass = Job else: self.jobClass = jobClass if not opsH: self.opsH = Operations() else: self.opsH = opsH if not outputDataModule: self.outputDataModule = self.opsH.getValue( "Transformations/OutputDataModule", "") else: self.outputDataModule = outputDataModule if not destinationPlugin: self.destinationPlugin = self.opsH.getValue( "Transformations/DestinationPlugin", "BySE") else: self.destinationPlugin = destinationPlugin self.destinationPlugin_o = None self.outputDataModule_o = None def prepareTransformationTasks(self, transBody, taskDict, owner="", ownerGroup="", ownerDN="", bulkSubmissionFlag=False): """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works. 
:param str transBody: transformation job template :param dict taskDict: dictionary of per task parameters :param str owner: owner of the transformation :param str ownerGroup: group of the owner of the transformation :param str ownerDN: DN of the owner of the transformation :param bool bulkSubmissionFlag: flag for using bulk submission or not :return: S_OK/S_ERROR with updated taskDict """ if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res["OK"]: return res proxyInfo = res["Value"] owner = proxyInfo["username"] ownerGroup = proxyInfo["group"] if not ownerDN: res = getDNForUsername(owner) if not res["OK"]: return res ownerDN = res["Value"][0] if bulkSubmissionFlag: return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN) # not a bulk submission return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN) def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN): """Prepare transformation tasks with a single job object for bulk submission :param str transBody: transformation job template :param dict taskDict: dictionary of per task parameters :param str owner: owner of the transformation :param str ownerGroup: group of the owner of the transformation :param str ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if taskDict: transID = list(taskDict.values())[0]["TransformationID"] else: return S_OK({}) method = "__prepareTasksBulk" startTime = time.time() # Prepare the bulk Job object with common parameters oJob = self.jobClass(transBody) self._logVerbose("Setting job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method) oJob.setOwner(owner) oJob.setOwnerGroup(ownerGroup) oJob.setOwnerDN(ownerDN) try: site = oJob.workflow.findParameter("Site").getValue() except AttributeError: site = None jobType = oJob.workflow.findParameter("JobType").getValue() transGroup = str(transID).zfill(8) # Verify that the JOB_ID parameter 
is added to the workflow if not oJob.workflow.findParameter("JOB_ID"): oJob._addParameter(oJob.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID") if oJob.workflow.findParameter("PRODUCTION_ID"): oJob._setParamValue("PRODUCTION_ID", str(transID).zfill(8)) # pylint: disable=protected-access else: oJob._addParameter( oJob.workflow, # pylint: disable=protected-access "PRODUCTION_ID", "string", str(transID).zfill(8), "Production ID", ) oJob.setType(jobType) self._logVerbose("Adding default transformation group of %s" % (transGroup), transID=transID, method=method) oJob.setJobGroup(transGroup) clinicPath = self._checkSickTransformations(transID) if clinicPath: self._handleHospital(oJob, clinicPath) # Collect per job parameters sequences paramSeqDict = {} # tasks must be sorted because we use bulk submission and we must find the correspondance for taskID in sorted(taskDict): paramsDict = taskDict[taskID] seqDict = {} if site is not None: paramsDict["Site"] = site paramsDict["JobType"] = jobType # Handle destination site sites = self._handleDestination(paramsDict) if not sites: self._logError("Could not get a list a sites", transID=transID, method=method) return S_ERROR(ETSUKN, "Can not evaluate destination site") else: self._logVerbose("Setting Site: ", str(sites), transID=transID, method=method) seqDict["Site"] = sites seqDict["JobName"] = self._transTaskName(transID, taskID) seqDict["JOB_ID"] = str(taskID).zfill(8) self._logDebug( "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)), transID=transID, method=method, ) # Handle Input Data inputData = paramsDict.get("InputData") if inputData: if isinstance(inputData, six.string_types): inputData = inputData.replace(" ", "").split(";") self._logVerbose("Setting input data to %s" % inputData, transID=transID, method=method) seqDict["InputData"] = inputData elif paramSeqDict.get("InputData") is not None: self._logError( "Invalid mixture of jobs with and without input data") return S_ERROR( 
ETSDATA, "Invalid mixture of jobs with and without input data") for paramName, paramValue in paramsDict.items(): if paramName not in ("InputData", "Site", "TargetSE"): if paramValue: self._logVerbose("Setting %s to %s" % (paramName, paramValue), transID=transID, method=method) seqDict[paramName] = paramValue outputParameterList = [] if self.outputDataModule: res = self.getOutputData({ "Job": oJob._toXML(), # pylint: disable=protected-access "TransformationID": transID, "TaskID": taskID, "InputData": inputData, }) if not res["OK"]: self._logError("Failed to generate output data", res["Message"], transID=transID, method=method) continue for name, output in res["Value"].items(): seqDict[name] = output outputParameterList.append(name) if oJob.workflow.findParameter(name): oJob._setParamValue(name, "%%(%s)s" % name) # pylint: disable=protected-access else: oJob._addParameter( oJob.workflow, name, "JDL", "%%(%s)s" % name, name # pylint: disable=protected-access ) for pName, seq in seqDict.items(): paramSeqDict.setdefault(pName, []).append(seq) for paramName, paramSeq in paramSeqDict.items(): if paramName in ["JOB_ID", "PRODUCTION_ID", "InputData" ] + outputParameterList: res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName) else: res = oJob.setParameterSequence(paramName, paramSeq) if not res["OK"]: return res if taskDict: self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime) taskDict["BulkJobObject"] = oJob return S_OK(taskDict) def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN): """Prepare transformation tasks with a job object per task :param str transBody: transformation job template :param dict taskDict: dictionary of per task parameters :param owner: owner of the transformation :param str ownerGroup: group of the owner of the transformation :param str ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if taskDict: transID = 
list(taskDict.values())[0]["TransformationID"] else: return S_OK({}) method = "__prepareTasks" startTime = time.time() oJobTemplate = self.jobClass(transBody) oJobTemplate.setOwner(owner) oJobTemplate.setOwnerGroup(ownerGroup) oJobTemplate.setOwnerDN(ownerDN) try: site = oJobTemplate.workflow.findParameter("Site").getValue() except AttributeError: site = None jobType = oJobTemplate.workflow.findParameter("JobType").getValue() templateOK = False getOutputDataTiming = 0.0 for taskID, paramsDict in taskDict.items(): # Create a job for each task and add it to the taskDict if not templateOK: templateOK = True # Update the template with common information self._logVerbose("Job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method) transGroup = str(transID).zfill(8) self._logVerbose("Adding default transformation group of %s" % (transGroup), transID=transID, method=method) oJobTemplate.setJobGroup(transGroup) if oJobTemplate.workflow.findParameter("PRODUCTION_ID"): oJobTemplate._setParamValue("PRODUCTION_ID", str(transID).zfill(8)) else: oJobTemplate._addParameter(oJobTemplate.workflow, "PRODUCTION_ID", "string", str(transID).zfill(8), "Production ID") if not oJobTemplate.workflow.findParameter("JOB_ID"): oJobTemplate._addParameter(oJobTemplate.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID") if site is not None: paramsDict["Site"] = site paramsDict["JobType"] = jobType # Now create the job from the template oJob = copy.deepcopy(oJobTemplate) constructedName = self._transTaskName(transID, taskID) self._logVerbose("Setting task name to %s" % constructedName, transID=transID, method=method) oJob.setName(constructedName) oJob._setParamValue("JOB_ID", str(taskID).zfill(8)) inputData = None self._logDebug( "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)), transID=transID, method=method, ) # These helper functions do the real job sites = self._handleDestination(paramsDict) if not sites: self._logError("Could not get 
a list a sites", transID=transID, method=method) paramsDict["TaskObject"] = "" continue else: self._logDebug("Setting Site: ", str(sites), transID=transID, method=method) res = oJob.setDestination(sites) if not res["OK"]: self._logError("Could not set the site: %s" % res["Message"], transID=transID, method=method) paramsDict["TaskObject"] = "" continue self._handleInputs(oJob, paramsDict) self._handleRest(oJob, paramsDict) clinicPath = self._checkSickTransformations(transID) if clinicPath: self._handleHospital(oJob, clinicPath) paramsDict["TaskObject"] = "" if self.outputDataModule: getOutputDataTiming -= time.time() res = self.getOutputData({ "Job": oJob._toXML(), "TransformationID": transID, "TaskID": taskID, "InputData": inputData }) getOutputDataTiming += time.time() if not res["OK"]: self._logError("Failed to generate output data", res["Message"], transID=transID, method=method) continue for name, output in res["Value"].items(): oJob._addJDLParameter(name, ";".join(output)) paramsDict["TaskObject"] = oJob if taskDict: self._logVerbose( "Average getOutputData time: %.1f per task" % (getOutputDataTiming / len(taskDict)), transID=transID, method=method, ) self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime) return S_OK(taskDict) ############################################################################# def _handleDestination(self, paramsDict): """Handle Sites and TargetSE in the parameters""" try: sites = ["ANY"] if paramsDict["Site"]: # 'Site' comes from the XML and therefore is ; separated sites = fromChar(paramsDict["Site"], sepChar=";") except KeyError: pass if self.destinationPlugin_o: destinationPlugin_o = self.destinationPlugin_o else: res = self.__generatePluginObject(self.destinationPlugin) if not res["OK"]: self._logFatal( "Could not generate a destination plugin object") return res destinationPlugin_o = res["Value"] self.destinationPlugin_o = destinationPlugin_o 
destinationPlugin_o.setParameters(paramsDict) destSites = destinationPlugin_o.run() if not destSites: return sites # Now we need to make the AND with the sites, if defined if sites != ["ANY"]: # Need to get the AND destSites &= set(sites) return list(destSites) def _handleInputs(self, oJob, paramsDict): """set job inputs (+ metadata)""" inputData = paramsDict.get("InputData") transID = paramsDict["TransformationID"] if inputData: self._logVerbose("Setting input data to %s" % inputData, transID=transID, method="_handleInputs") res = oJob.setInputData(inputData) if not res["OK"]: self._logError("Could not set the inputs: %s" % res["Message"], transID=transID, method="_handleInputs") def _handleRest(self, oJob, paramsDict): """add as JDL parameters all the other parameters that are not for inputs or destination""" transID = paramsDict["TransformationID"] for paramName, paramValue in paramsDict.items(): if paramName not in ("InputData", "Site", "TargetSE"): if paramValue: self._logDebug("Setting %s to %s" % (paramName, paramValue), transID=transID, method="_handleRest") oJob._addJDLParameter(paramName, paramValue) def _checkSickTransformations(self, transID): """Check if the transformation is in the transformations to be processed at Hospital or Clinic""" transID = int(transID) clinicPath = "Hospital" if transID in set( int(x) for x in self.opsH.getValue( os.path.join(clinicPath, "Transformations"), [])): return clinicPath if "Clinics" in self.opsH.getSections("Hospital").get("Value", []): basePath = os.path.join("Hospital", "Clinics") clinics = self.opsH.getSections(basePath)["Value"] for clinic in clinics: clinicPath = os.path.join(basePath, clinic) if transID in set( int(x) for x in self.opsH.getValue( os.path.join(clinicPath, "Transformations"), [])): return clinicPath return None def _handleHospital(self, oJob, clinicPath): """Optional handle of hospital/clinic jobs""" if not clinicPath: return oJob.setInputDataPolicy("download", dataScheduling=False) # Check 
first for a clinic, if not it must be the general hospital hospitalSite = self.opsH.getValue( os.path.join(clinicPath, "ClinicSite"), "") hospitalCEs = self.opsH.getValue(os.path.join(clinicPath, "ClinicCE"), []) # If not found, get the hospital parameters if not hospitalSite: hospitalSite = self.opsH.getValue("Hospital/HospitalSite", "DIRAC.JobDebugger.ch") if not hospitalCEs: hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", []) oJob.setDestination(hospitalSite) if hospitalCEs: oJob._addJDLParameter("GridCE", hospitalCEs) def __generatePluginObject(self, plugin): """This simply instantiates the TaskManagerPlugin class with the relevant plugin name""" method = "__generatePluginObject" try: plugModule = __import__(self.pluginLocation, globals(), locals(), ["TaskManagerPlugin"]) except ImportError as e: self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e), method=method) return S_ERROR() try: plugin_o = getattr(plugModule, "TaskManagerPlugin")("%s" % plugin, operationsHelper=self.opsH) return S_OK(plugin_o) except AttributeError as e: self._logException("Failed to create %s(): %s." 
% (plugin, e), method=method) return S_ERROR() ############################################################################# def getOutputData(self, paramDict): """Get the list of job output LFNs from the provided plugin""" if not self.outputDataModule_o: # Create the module object moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(self.outputDataModule, None) if not moduleInstance["OK"]: return moduleInstance self.outputDataModule_o = moduleInstance["Value"] # This is the "argument" to the module, set it and then execute self.outputDataModule_o.paramDict = paramDict return self.outputDataModule_o.execute() def submitTransformationTasks(self, taskDict): """Submit the tasks""" if "BulkJobObject" in taskDict: return self.__submitTransformationTasksBulk(taskDict) return self.__submitTransformationTasks(taskDict) def __submitTransformationTasksBulk(self, taskDict): """Submit jobs in one go with one parametric job""" if not taskDict: return S_OK(taskDict) startTime = time.time() method = "__submitTransformationTasksBulk" oJob = taskDict.pop("BulkJobObject") # we can only do this, once the job has been popped, or we _might_ crash transID = list(taskDict.values())[0]["TransformationID"] if oJob is None: self._logError("no bulk Job object found", transID=transID, method=method) return S_ERROR(ETSUKN, "No bulk job object provided for submission") result = self.submitTaskToExternal(oJob) if not result["OK"]: self._logError("Failed to submit tasks to external", transID=transID, method=method) return result jobIDList = result["Value"] if len(jobIDList) != len(taskDict): for task in taskDict.values(): task["Success"] = False return S_ERROR( ETSUKN, "Submitted less number of jobs than requested tasks") # Get back correspondence with tasks sorted by ID for jobID, taskID in zip(jobIDList, sorted(taskDict)): taskDict[taskID]["ExternalID"] = jobID taskDict[taskID]["Success"] = True submitted = len(jobIDList) self._logInfo( "Submitted %d tasks to WMS in %.1f 
seconds" % (submitted, time.time() - startTime), transID=transID, method=method, ) return S_OK(taskDict) def __submitTransformationTasks(self, taskDict): """Submit jobs one by one""" method = "__submitTransformationTasks" submitted = 0 failed = 0 startTime = time.time() for task in taskDict.values(): transID = task["TransformationID"] if not task["TaskObject"]: task["Success"] = False failed += 1 continue res = self.submitTaskToExternal(task["TaskObject"]) if res["OK"]: task["ExternalID"] = res["Value"] task["Success"] = True submitted += 1 else: self._logError("Failed to submit task to WMS", res["Message"], transID=transID, method=method) task["Success"] = False failed += 1 if submitted: self._logInfo( "Submitted %d tasks to WMS in %.1f seconds" % (submitted, time.time() - startTime), transID=transID, method=method, ) if failed: self._logError("Failed to submit %d tasks to WMS." % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, job): """Submits a single job (which can be a bulk one) to the WMS.""" if isinstance(job, six.string_types): try: oJob = self.jobClass(job) except Exception as x: # pylint: disable=broad-except self._logException("Failed to create job object", "", x) return S_ERROR("Failed to create job object") elif isinstance(job, self.jobClass): oJob = job else: self._logError("No valid job description found") return S_ERROR("No valid job description found") workflowFileObject = StringIO(oJob._toXML()) jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject) return self.submissionClient.submitJob(jdl, workflowFileObject) def updateTransformationReservedTasks(self, taskDicts): transID = None jobNames = [ self._transTaskName(taskDict["TransformationID"], taskDict["TaskID"]) for taskDict in taskDicts ] res = self.jobMonitoringClient.getJobs({"JobName": jobNames}) if not res["OK"]: self._logError( "Failed to get task from WMS", res["Message"], transID=transID, method="updateTransformationReservedTasks", ) 
return res jobNameIDs = {} for wmsID in res["Value"]: res = self.jobMonitoringClient.getJobSummary(int(wmsID)) if not res["OK"]: self._logWarn( "Failed to get task summary from WMS", res["Message"], transID=transID, method="updateTransformationReservedTasks", ) else: jobNameIDs[res["Value"]["JobName"]] = int(wmsID) noTask = list(set(jobNames) - set(jobNameIDs)) return S_OK({"NoTasks": noTask, "TaskNameIDs": jobNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check the status of a list of tasks and return lists of taskIDs for each new status """ method = "getSubmittedTaskStatus" if taskDicts: wmsIDs = [ int(taskDict["ExternalID"]) for taskDict in taskDicts if int(taskDict["ExternalID"]) ] transID = taskDicts[0]["TransformationID"] else: return S_OK({}) res = self.jobMonitoringClient.getJobsStatus(wmsIDs) if not res["OK"]: self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method) return res statusDict = res["Value"] updateDict = {} for taskDict in taskDicts: taskID = taskDict["TaskID"] wmsID = int(taskDict["ExternalID"]) if not wmsID: continue oldStatus = taskDict["ExternalStatus"] newStatus = statusDict.get(wmsID, {}).get("Status", "Removed") if oldStatus != newStatus: if newStatus == "Removed": self._logVerbose( "Production/Job %d/%d removed from WMS while it is in %s status" % (transID, taskID, oldStatus), transID=transID, method=method, ) newStatus = "Failed" self._logVerbose( "Setting job status for Production/Job %d/%d to %s" % (transID, taskID, newStatus), transID=transID, method=method, ) updateDict.setdefault(newStatus, []).append(taskID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check the status of a list of files and return the new status of each LFN """ if not fileDicts: return S_OK({}) method = "getSubmittedFileStatus" # All files are from the same transformation transID = fileDicts[0]["TransformationID"] taskFiles = {} for fileDict in fileDicts: jobName = 
self._transTaskName(transID, fileDict["TaskID"]) taskFiles.setdefault(jobName, {})[fileDict["LFN"]] = fileDict["Status"] res = self.updateTransformationReservedTasks(fileDicts) if not res["OK"]: self._logWarn("Failed to obtain taskIDs for files", transID=transID, method=method) return res noTasks = res["Value"]["NoTasks"] taskNameIDs = res["Value"]["TaskNameIDs"] updateDict = {} for jobName in noTasks: for lfn, oldStatus in taskFiles[jobName].items(): if oldStatus != TransformationFilesStatus.UNUSED: updateDict[lfn] = TransformationFilesStatus.UNUSED res = self.jobMonitoringClient.getJobsStatus(list( taskNameIDs.values())) if not res["OK"]: self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method) return res statusDict = res["Value"] for jobName, wmsID in taskNameIDs.items(): jobStatus = statusDict.get(wmsID, {}).get("Status") newFileStatus = { "Done": TransformationFilesStatus.PROCESSED, "Completed": TransformationFilesStatus.PROCESSED, "Failed": TransformationFilesStatus.UNUSED, }.get(jobStatus) if newFileStatus: for lfn, oldStatus in taskFiles[jobName].items(): if newFileStatus != oldStatus: updateDict[lfn] = newFileStatus return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoringMultuple(self): """ # Now, let's submit some jobs. Different sites, types, inputs """ wmsClient = WMSClient() jobMonitor = JobMonitoringClient() jobStateUpdate = JobStateUpdateClient() jobIDs = [] lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []] types = ['User', 'Test'] for lfns in lfnss: for jobType in types: job = helloWorldJob() job.setDestination('DIRAC.Jenkins.ch') job.setInputData(lfns) job.setType(jobType) jobDescription = createFile(job) res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription)) self.assertTrue(res['OK'], res.get('Message')) jobID = res['Value'] jobIDs.append(jobID) res = jobMonitor.getSites() print(res) self.assertTrue(res['OK'], res.get('Message')) self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'}, msg="Got %s" % res['Value']) res = jobMonitor.getJobTypes() self.assertTrue(res['OK'], res.get('Message')) self.assertEqual(sorted(res['Value']), sorted(types), msg="Got %s" % str(sorted(res['Value']))) res = jobMonitor.getApplicationStates() self.assertTrue(res['OK'], res.get('Message')) self.assertEqual(sorted(res['Value']), sorted(['Unknown']), msg="Got %s" % sorted(str(res['Value']))) res = jobMonitor.getOwners() self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getOwnerGroup() self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getProductionIds() self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getJobGroups() self.assertTrue(res['OK'], res.get('Message')) resJG_empty = res['Value'] res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow()) self.assertTrue(res['OK'], res.get('Message')) resJG_olderThanNow = res['Value'] self.assertEqual(resJG_empty, resJG_olderThanNow) res = jobMonitor.getJobGroups( None, datetime.datetime.utcnow() - datetime.timedelta(days=365)) self.assertTrue(res['OK'], res.get('Message')) resJG_olderThanOneYear = res['Value'] self.assertTrue( 
set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow))) res = jobMonitor.getStates() self.assertTrue(res['OK'], res.get('Message')) self.assertTrue( sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])]) res = jobMonitor.getMinorStates() self.assertTrue(res['OK'], res.get('Message')) self.assertTrue( sorted(res['Value']) in [['Job accepted'], sorted( ['Job accepted', 'Job Rescheduled'])]) self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getJobs() self.assertTrue(res['OK'], res.get('Message')) self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value'])) # res = jobMonitor.getCounters(attrList) # self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getCurrentJobCounters() self.assertTrue(res['OK'], res.get('Message')) try: self.assertTrue( res['Value'].get('Received') + res['Value'].get('Waiting') >= int(len(lfnss) * len(types))) except TypeError: pass res = jobMonitor.getJobsSummary(jobIDs) self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100) self.assertTrue(res['OK'], res.get('Message')) res = jobStateUpdate.setJobStatusBulk( jobID, { str(datetime.datetime.utcnow()): { 'Status': 'Running', 'MinorStatus': 'MinorStatus', 'ApplicationStatus': 'ApplicationStatus', 'Source': 'Unknown' } }) self.assertTrue(res['OK'], res.get('Message')) res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']}) self.assertTrue(res['OK'], res.get('Message')) # delete the jobs - this will just set its status to "deleted" wmsClient.deleteJob(jobIDs)
def test_FullChain(self): """ This test will - call all the WMSClient methods that will end up calling all the JobManager service methods - use the JobMonitoring to verify few properties - call the JobCleaningAgent to eliminate job entries from the DBs """ wmsClient = WMSClient() jobMonitor = JobMonitoringClient() jobStateUpdate = JobStateUpdateClient() # create the job job = helloWorldJob() jobDescription = createFile(job) # submit the job res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription)) self.assertTrue(res['OK'], res.get('Message')) self.assertTrue(isinstance(res['Value'], int), msg="Got %s" % type(res['Value'])) self.assertEqual(res['Value'], res['JobID'], msg="Got %s, expected %s" % (str(res['Value']), res['JobID'])) jobID = res['JobID'] jobID = res['Value'] # updating the status res = jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source') self.assertTrue(res['OK'], res.get('Message')) # reset the job res = wmsClient.resetJob(jobID) self.assertTrue(res['OK'], res.get('Message')) # reschedule the job res = wmsClient.rescheduleJob(jobID) self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getJobStatus(jobID) self.assertTrue(res['OK'], res.get('Message')) self.assertEqual(res['Value'], 'Received', msg="Got %s" % str(res['Value'])) res = jobMonitor.getJobsMinorStatus([jobID]) self.assertTrue(res['OK'], res.get('Message')) self.assertEqual( res['Value'], {jobID: { 'MinorStatus': 'Job Rescheduled', 'JobID': jobID }}, msg="Got %s" % str(res['Value'])) res = jobMonitor.getJobsApplicationStatus([jobID]) self.assertTrue(res['OK'], res.get('Message')) self.assertEqual( res['Value'], {jobID: { 'ApplicationStatus': 'Unknown', 'JobID': jobID }}, msg="Got %s" % str(res['Value'])) # updating the status again res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source') self.assertTrue(res['OK'], res.get('Message')) # kill the job res = wmsClient.killJob(jobID) self.assertTrue(res['OK'], res.get('Message')) res = 
jobMonitor.getJobStatus(jobID) self.assertTrue(res['OK'], res.get('Message')) self.assertEqual(res['Value'], 'Killed', msg="Got %s" % str(res['Value'])) # updating the status aaaagain res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source') self.assertTrue(res['OK'], res.get('Message')) # kill the job res = wmsClient.killJob(jobID) self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getJobStatus(jobID) self.assertTrue(res['OK'], res.get('Message')) self.assertEqual( res['Value'], 'Done', msg="Got %s" % str(res['Value'])) # this time it won't kill... it's done! # delete the job - this will just set its status to "deleted" res = wmsClient.deleteJob(jobID) self.assertTrue(res['OK'], res.get('Message')) res = jobMonitor.getJobStatus(jobID) self.assertTrue(res['OK'], res.get('Message')) self.assertEqual(res['Value'], 'Deleted', msg="Got %s" % str(res['Value']))
def test_FullChain(self): """ This test will - call all the WMSClient methods that will end up calling all the JobManager service methods - use the JobMonitoring to verify few properties - call the JobCleaningAgent to eliminate job entries from the DBs """ wmsClient = WMSClient() jobMonitor = JobMonitoringClient() jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate') # create the job job = helloWorldJob() jobDescription = createFile(job) # submit the job res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription)) self.assert_(res['OK']) # self.assertEqual( type( res['Value'] ), int ) # self.assertEqual( res['Value'], res['JobID'] ) # jobID = res['JobID'] jobID = res['Value'] # updating the status jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source') # reset the job res = wmsClient.resetJob(jobID) self.assert_(res['OK']) # reschedule the job res = wmsClient.rescheduleJob(jobID) self.assert_(res['OK']) res = jobMonitor.getJobStatus(jobID) self.assert_(res['OK']) self.assertEqual(res['Value'], 'Received') # updating the status again jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source') # kill the job res = wmsClient.killJob(jobID) self.assert_(res['OK']) res = jobMonitor.getJobStatus(jobID) self.assert_(res['OK']) self.assertEqual(res['Value'], 'Killed') # updating the status aaaagain jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source') # kill the job res = wmsClient.killJob(jobID) self.assert_(res['OK']) res = jobMonitor.getJobStatus(jobID) self.assert_(res['OK']) self.assertEqual(res['Value'], 'Done') # this time it won't kill... it's done! # delete the job - this will just set its status to "deleted" res = wmsClient.deleteJob(jobID) self.assert_(res['OK']) res = jobMonitor.getJobStatus(jobID) self.assert_(res['OK']) self.assertEqual(res['Value'], 'Deleted')
class WorkflowTasks(TaskBase): """ Handles jobs """ def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None, outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None): """ Generates some default objects. jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works: VOs can pass in their job class extension, if present """ if not logger: logger = gLogger.getSubLogger('WorkflowTasks') super(WorkflowTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not submissionClient: self.submissionClient = WMSClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.submissionClient = submissionClient if not jobMonitoringClient: self.jobMonitoringClient = JobMonitoringClient() else: self.jobMonitoringClient = jobMonitoringClient if not jobClass: self.jobClass = Job else: self.jobClass = jobClass if not opsH: self.opsH = Operations() else: self.opsH = opsH if not outputDataModule: self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "") else: self.outputDataModule = outputDataModule if not destinationPlugin: self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE') else: self.destinationPlugin = destinationPlugin self.destinationPlugin_o = None self.outputDataModule_o = None def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='', bulkSubmissionFlag=False): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works. 
:param transBody: transformation job template :param taskDict: dictionary of per task parameters :param owner: owner of the transformation :param ownerGroup: group of the owner of the transformation :param ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername(owner) if not res['OK']: return res ownerDN = res['Value'][0] if bulkSubmissionFlag: return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN) return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN) def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a single job object for bulk submission """ if taskDict: transID = taskDict.values()[0]['TransformationID'] else: return S_OK({}) # Prepare the bulk Job object with common parameters oJob = self.jobClass(transBody) method = 'prepareTransformationTasksBulk' self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method) oJob.setOwner(owner) oJob.setOwnerGroup(ownerGroup) oJob.setOwnerDN(ownerDN) jobType = oJob.workflow.findParameter('JobType').getValue() transGroup = str(transID).zfill(8) # Verify that the JOB_ID parameter is added to the workflow if not oJob.workflow.findParameter('JOB_ID'): oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID") if oJob.workflow.findParameter('PRODUCTION_ID'): oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8)) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access 'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID") oJob.setType(jobType) self._logVerbose('Adding default transformation group of %s' % (transGroup), 
transID=transID, method=method) oJob.setJobGroup(transGroup) if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]: self._handleHospital(oJob) # Collect per job parameters sequences paramSeqDict = {} # tasks must be sorted because we use bulk submission and we must find the correspondance for taskID in sorted(taskDict): paramsDict = taskDict[taskID] seqDict = {} # Handle destination site sites = self._handleDestination(paramsDict) if not sites: self._logError('Could not get a list a sites', transID=transID) return S_ERROR(ETSUKN, "Can not evaluate destination site") else: self._logVerbose('Setting Site: ', str(sites), transID=transID) seqDict['Site'] = sites seqDict['JobName'] = self._transTaskName(transID, taskID) seqDict['JOB_ID'] = str(taskID).zfill(8) self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)), transID=transID, method=method) # Handle Input Data inputData = paramsDict.get('InputData') if inputData: self._logVerbose('Setting input data to %s' % inputData, transID=transID, method=method) seqDict['InputData'] = inputData elif paramSeqDict.get('InputData') is not None: self._logError("Invalid mixture of jobs with and without input data") return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data") for paramName, paramValue in paramsDict.iteritems(): if paramName not in ('InputData', 'Site', 'TargetSE'): if paramValue: self._logVerbose('Setting %s to %s' % (paramName, paramValue), transID=transID, method=method) seqDict[paramName] = paramValue outputParameterList = [] if self.outputDataModule: res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID, # pylint: disable=protected-access 'TaskID': taskID, 'InputData': inputData}) if not res['OK']: self._logError("Failed to generate output data", res['Message'], transID=transID, method=method) continue for name, output in res['Value'].iteritems(): seqDict[name] = output 
outputParameterList.append(name) if oJob.workflow.findParameter(name): oJob._setParamValue(name, "%%(%s)s" % name) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access name, 'JDL', "%%(%s)s" % name, name) for pName, seq in seqDict.iteritems(): paramSeqDict.setdefault(pName, []).append(seq) for paramName, paramSeq in paramSeqDict.iteritems(): if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList: oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName) else: oJob.setParameterSequence(paramName, paramSeq) taskDict['BulkJobObject'] = oJob return S_OK(taskDict) def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a job object per task """ method = '__prepareTasks' startTime = time.time() oJobTemplate = self.jobClass(transBody) oJobTemplate.setOwner(owner) oJobTemplate.setOwnerGroup(ownerGroup) oJobTemplate.setOwnerDN(ownerDN) site = oJobTemplate.workflow.findParameter('Site').getValue() jobType = oJobTemplate.workflow.findParameter('JobType').getValue() templateOK = False getOutputDataTiming = 0. 
for taskID, paramsDict in taskDict.iteritems(): # Create a job for each task and add it to the taskDict if not templateOK: templateOK = True # Update the template with common information transID = paramsDict['TransformationID'] self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method) transGroup = str(transID).zfill(8) self._logVerbose('Adding default transformation group of %s' % (transGroup), transID=transID, method=method) oJobTemplate.setJobGroup(transGroup) if oJobTemplate.workflow.findParameter('PRODUCTION_ID'): oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8)) else: oJobTemplate._addParameter(oJobTemplate.workflow, 'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID") if not oJobTemplate.workflow.findParameter('JOB_ID'): oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID") paramsDict['Site'] = site paramsDict['JobType'] = jobType # Now create the job from the template oJob = copy.deepcopy(oJobTemplate) constructedName = self._transTaskName(transID, taskID) self._logVerbose('Setting task name to %s' % constructedName, transID=transID, method=method) oJob.setName(constructedName) oJob._setParamValue('JOB_ID', str(taskID).zfill(8)) inputData = None self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)), transID=transID, method=method) # These helper functions do the real job sites = self._handleDestination(paramsDict) if not sites: self._logError('Could not get a list a sites', transID=transID, method=method) paramsDict['TaskObject'] = '' continue else: self._logDebug('Setting Site: ', str(sites), transID=transID, method=method) res = oJob.setDestination(sites) if not res['OK']: self._logError('Could not set the site: %s' % res['Message'], transID=transID, method=method) continue self._handleInputs(oJob, paramsDict) self._handleRest(oJob, paramsDict) hospitalTrans = [int(x) for x in 
self.opsH.getValue("Hospital/Transformations", [])] if int(transID) in hospitalTrans: self._handleHospital(oJob) paramsDict['TaskObject'] = '' if self.outputDataModule: getOutputDataTiming -= time.time() res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID, 'TaskID': taskID, 'InputData': inputData}) getOutputDataTiming += time.time() if not res['OK']: self._logError("Failed to generate output data", res['Message'], transID=transID, method=method) continue for name, output in res['Value'].iteritems(): oJob._addJDLParameter(name, ';'.join(output)) paramsDict['TaskObject'] = oJob if taskDict: self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)), transID=transID, method=method) self._logInfo('Prepared %d tasks' % len(taskDict), transID=transID, method=method, reftime=startTime) return S_OK(taskDict) ############################################################################# def _handleDestination(self, paramsDict): """ Handle Sites and TargetSE in the parameters """ try: sites = ['ANY'] if paramsDict['Site']: # 'Site' comes from the XML and therefore is ; separated sites = fromChar(paramsDict['Site'], sepChar=';') except KeyError: pass if self.destinationPlugin_o: destinationPlugin_o = self.destinationPlugin_o else: res = self.__generatePluginObject(self.destinationPlugin) if not res['OK']: self._logFatal("Could not generate a destination plugin object") return res destinationPlugin_o = res['Value'] self.destinationPlugin_o = destinationPlugin_o destinationPlugin_o.setParameters(paramsDict) destSites = destinationPlugin_o.run() if not destSites: return sites # Now we need to make the AND with the sites, if defined if sites != ['ANY']: # Need to get the AND destSites &= set(sites) return list(destSites) def _handleInputs(self, oJob, paramsDict): """ set job inputs (+ metadata) """ inputData = paramsDict.get('InputData') transID = paramsDict['TransformationID'] if inputData: 
self._logVerbose('Setting input data to %s' % inputData, transID=transID, method='handleInputs') oJob.setInputData(inputData) def _handleRest(self, oJob, paramsDict): """ add as JDL parameters all the other parameters that are not for inputs or destination """ transID = paramsDict['TransformationID'] for paramName, paramValue in paramsDict.iteritems(): if paramName not in ('InputData', 'Site', 'TargetSE'): if paramValue: self._logDebug('Setting %s to %s' % (paramName, paramValue), transID=transID, method='handleRest') oJob._addJDLParameter(paramName, paramValue) def _handleHospital(self, oJob): """ Optional handle of hospital jobs """ oJob.setType('Hospital') oJob.setInputDataPolicy('download', dataScheduling=False) hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch') oJob.setDestination(hospitalSite) hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", []) if hospitalCEs: oJob._addJDLParameter('GridCE', hospitalCEs) def __generatePluginObject(self, plugin): """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name """ try: plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin']) except ImportError as e: self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e)) return S_ERROR() try: plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH) return S_OK(plugin_o) except AttributeError as e: self._logException("Failed to create %s(): %s." 
% (plugin, e)) return S_ERROR() ############################################################################# def getOutputData(self, paramDict): """ Get the list of job output LFNs from the provided plugin """ if not self.outputDataModule_o: # Create the module object moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(self.outputDataModule, None) if not moduleInstance['OK']: return moduleInstance self.outputDataModule_o = moduleInstance['Value'] # This is the "argument" to the module, set it and then execute self.outputDataModule_o.paramDict = paramDict return self.outputDataModule_o.execute() def submitTransformationTasks(self, taskDict): """ Submit the tasks """ if 'BulkJobObject' in taskDict: return self.__submitTransformationTasksBulk(taskDict) return self.__submitTransformationTasks(taskDict) def __submitTransformationTasksBulk(self, taskDict): """ Submit jobs in one go with one parametric job """ if not taskDict: return S_OK(taskDict) startTime = time.time() oJob = taskDict.pop('BulkJobObject') # we can only do this, once the job has been popped, or we _might_ crash transID = taskDict.values()[0]['TransformationID'] if oJob is None: self._logError('no bulk Job object found', transID=transID, method='submitTransformationTasksBulk') return S_ERROR(ETSUKN, 'No bulk job object provided for submission') result = self.submitTaskToExternal(oJob) if not result['OK']: return result jobIDList = result['Value'] if len(jobIDList) != len(taskDict): for task in taskDict.values(): task['Success'] = False return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks') # Get back correspondance with tasks sorted by ID for jobID, taskID in zip(jobIDList, sorted(taskDict)): taskDict[taskID]['ExternalID'] = jobID taskDict[taskID]['Success'] = True submitted = len(jobIDList) self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method='submitTransformationTasksBulk') return 
S_OK(taskDict) def __submitTransformationTasks(self, taskDict): """ Submit jobs one by one """ method = 'submitTransformationTasks' submitted = 0 failed = 0 startTime = time.time() for task in taskDict.itervalues(): transID = task['TransformationID'] if not task['TaskObject']: task['Success'] = False failed += 1 continue res = self.submitTaskToExternal(task['TaskObject']) if res['OK']: task['ExternalID'] = res['Value'] task['Success'] = True submitted += 1 else: self._logError("Failed to submit task to WMS", res['Message'], transID=transID, method=method) task['Success'] = False failed += 1 if submitted: self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method=method) if failed: self._logError('Failed to submit %d tasks to WMS.' % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, job): """ Submits a single job to the WMS. """ if isinstance(job, basestring): try: oJob = self.jobClass(job) except Exception as x: # pylint: disable=broad-except self._logException("Failed to create job object", '', x) return S_ERROR("Failed to create job object") elif isinstance(job, self.jobClass): oJob = job else: self._logError("No valid job description found") return S_ERROR("No valid job description found") workflowFileObject = StringIO.StringIO(oJob._toXML()) jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject) return self.submissionClient.submitJob(jdl, workflowFileObject) def updateTransformationReservedTasks(self, taskDicts): transID = None jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) for taskDict in taskDicts] res = self.jobMonitoringClient.getJobs({'JobName': jobNames}) if not res['OK']: self._logError("Failed to get task from WMS", res['Message'], transID=transID, method='updateTransformationReservedTasks') return res jobNameIDs = {} for wmsID in res['Value']: res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID)) 
if not res['OK']: self._logWarn("Failed to get task summary from WMS", res['Message'], transID=transID, method='updateTransformationReservedTasks') else: jobNameIDs[res['Value']['JobName']] = int(wmsID) noTask = list(set(jobNames) - set(jobNameIDs)) return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check the status of a list of tasks and return lists of taskIDs for each new status """ if taskDicts: wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])] transID = taskDicts[0]['TransformationID'] else: return S_OK({}) res = self.jobMonitoringClient.getJobsStatus(wmsIDs) if not res['OK']: self._logWarn("Failed to get job status from the WMS system", transID=transID) return res statusDict = res['Value'] updateDict = {} for taskDict in taskDicts: taskID = taskDict['TaskID'] wmsID = int(taskDict['ExternalID']) if not wmsID: continue oldStatus = taskDict['ExternalStatus'] newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed') if oldStatus != newStatus: if newStatus == "Removed": self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' % (transID, taskID, oldStatus), transID=transID) newStatus = "Failed" self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus), transID=transID) updateDict.setdefault(newStatus, []).append(taskID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check the status of a list of files and return the new status of each LFN """ if not fileDicts: return S_OK({}) # All files are from the same transformation transID = fileDicts[0]['TransformationID'] taskFiles = {} for fileDict in fileDicts: jobName = self._transTaskName(transID, fileDict['TaskID']) taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status'] res = self.updateTransformationReservedTasks(fileDicts) if not res['OK']: self._logWarn("Failed to obtain taskIDs for files", 
transID=transID) return res noTasks = res['Value']['NoTasks'] taskNameIDs = res['Value']['TaskNameIDs'] updateDict = {} for jobName in noTasks: for lfn, oldStatus in taskFiles[jobName].iteritems(): if oldStatus != 'Unused': updateDict[lfn] = 'Unused' res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values()) if not res['OK']: self._logWarn("Failed to get job status from the WMS system", transID=transID) return res statusDict = res['Value'] for jobName, wmsID in taskNameIDs.iteritems(): jobStatus = statusDict.get(wmsID, {}).get('Status') newFileStatus = {'Done': 'Processed', 'Completed': 'Processed', 'Failed': 'Unused'}.get(jobStatus) if newFileStatus: for lfn, oldStatus in taskFiles[jobName].iteritems(): if newFileStatus != oldStatus: updateDict[lfn] = newFileStatus return S_OK(updateDict)