def test_JobStateUpdateAndJobMonitoring(self):
    """ Verifying all JobStateUpdate and JobMonitoring functions

        Submits a hello-world job, changes its state through the JobStateUpdate RPC service and
        reads every change back through the JobMonitoring client. The job is deleted at the end.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    # (the 'Message' of a failed call is used as assertion message to ease debugging)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {})
    res = jobMonitor.getJobsParameters([jobID], ['Owner'])
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # Not exercised yet:
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue( res['OK'] )
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue( res['OK'] )
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
    # self.assertTrue( res['OK'] )

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # bulk status update, then check that the summary reflects it
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
def test_JobStateUpdateAndJobMonitoring(self):
    """ Verifying all JobStateUpdate and JobMonitoring functions

        Submits a hello-world job, changes its state through the JobStateUpdate RPC service and
        reads every change back through the JobMonitoring client. The job is deleted at the end.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    # (the 'Message' of a failed call is used as assertion message to ease debugging)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # Not exercised yet:
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue( res['OK'] )
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue( res['OK'] )
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
    # self.assertTrue( res['OK'] )

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # bulk status update, then check that the summary reflects it
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
class WorkflowTasks(TaskBase):
    """ Handles jobs

        Builds job objects out of transformation tasks (one job per task, or one single job carrying
        parameter sequences for bulk submission), submits them through the submission client and
        maps the statuses reported by the job monitoring client back onto tasks and files.
    """

    def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
                 outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
                 ownerDN=None, ownerGroup=None):
        """ Generates some default objects.
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
            VOs can pass in their job class extension, if present

            :param transClient: passed to the base class constructor
            :param logger: logger to use, by default the 'WorkflowTasks' sub-logger of gLogger
            :param submissionClient: client used to submit jobs, by default a WMSClient
            :param jobMonitoringClient: client used to query jobs, by default a JobMonitoringClient
            :param outputDataModule: output data module name, by default the value of the
                                     "Transformations/OutputDataModule" operations option
            :param jobClass: class used to build job objects, by default Job
            :param opsH: operations helper, by default Operations()
            :param destinationPlugin: destination plugin name, by default the value of the
                                      'Transformations/DestinationPlugin' operations option ('BySE')
            :param ownerDN: DN given to the default WMSClient as delegatedDN
            :param ownerGroup: group given to the default WMSClient as delegatedGroup
        """
        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # certificates are only used when both the owner DN and the owner group are known
        useCertificates = bool(ownerDN and ownerGroup)

        # "or" is lazy: the default objects are only created when nothing usable was passed in
        self.submissionClient = submissionClient or WMSClient(useCertificates=useCertificates,
                                                              delegatedDN=ownerDN,
                                                              delegatedGroup=ownerGroup)
        self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
        self.jobClass = jobClass or Job
        self.opsH = opsH or Operations()

        self.outputDataModule = outputDataModule or self.opsH.getValue("Transformations/OutputDataModule", "")
        self.destinationPlugin = destinationPlugin or self.opsH.getValue('Transformations/DestinationPlugin',
                                                                         'BySE')

        # plugin/module objects are created lazily, on first use, and then cached here
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='',
                                   bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

            When owner or ownerGroup is missing both are taken from the current proxy; when ownerDN
            is missing the first DN registered for the owner is used.

            :param str transBody: transformation job template
            :param dict taskDict: dictionary of per task parameters
            :param str owner: owner of the transformation
            :param str ownerGroup: group of the owner of the transformation
            :param str ownerDN: DN of the owner of the transformation
            :param bool bulkSubmissionFlag: flag for using bulk submission or not

            :return: S_OK/S_ERROR with updated taskDict
        """
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

            The per task values are collected in sequences (one entry per task, in sorted taskID
            order) that are set on the single job object. The job object is stored in the returned
            taskDict under the 'BulkJobObject' key.

            :param str transBody: transformation job template
            :param dict taskDict: dictionary of per task parameters
            :param str owner: owner of the transformation
            :param str ownerGroup: group of the owner of the transformation
            :param str ownerDN: DN of the owner of the transformation

            :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK({})
        transID = taskDict.values()[0]['TransformationID']

        method = '__prepareTasksBulk'
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               'JOB_ID', 'string', '00000000', "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # Names of the output data parameters, accumulated over ALL tasks: they decide below which
        # sequences are added to the workflow, so they must not be lost when the last task of the
        # loop yields no output data
        outputParameterNames = set()
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            paramsDict['JobType'] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            self._logVerbose('Setting Site: ', str(sites), transID=transID, method=method)
            seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                if isinstance(inputData, basestring):
                    # a string is a ';' separated list of LFNs
                    inputData = inputData.replace(' ', '').split(';')
                self._logVerbose('Setting input data to %s' % inputData, transID=transID, method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                # a previous task had input data, this one has none
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName not in ('InputData', 'Site', 'TargetSE'):
                    if paramValue:
                        self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                                         transID=transID, method=method)
                        seqDict[paramName] = paramValue

            if self.outputDataModule:
                res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                if not res['OK']:
                    # the task is skipped: none of its values enter the sequences
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterNames.add(name)
                    # the workflow parameter becomes a placeholder filled from the sequence
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                                           name, 'JDL', "%%(%s)s" % name, name)

            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        workflowParamNames = set(['JOB_ID', 'PRODUCTION_ID', 'InputData']) | outputParameterNames
        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in workflowParamNames:
                res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res['OK']:
                return res

        self._logInfo('Prepared %d tasks' % len(taskDict),
                      transID=transID, method=method, reftime=startTime)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

            Each task gets its job under the 'TaskObject' key; the key is set to '' when the job
            could not be prepared (no site, site not accepted, output data failure).

            :param str transBody: transformation job template
            :param dict taskDict: dictionary of per task parameters
            :param owner: owner of the transformation
            :param str ownerGroup: group of the owner of the transformation
            :param str ownerDN: DN of the owner of the transformation

            :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK({})
        transID = taskDict.values()[0]['TransformationID']

        method = '__prepareTasks'
        startTime = time.time()
        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        try:
            site = oJobTemplate.workflow.findParameter('Site').getValue()
        except AttributeError:
            # no 'Site' parameter in the template workflow
            site = None
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()

        # Update the template with common information (done once, taskDict is not empty here)
        self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        transGroup = str(transID).zfill(8)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJobTemplate.setJobGroup(transGroup)
        if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
            oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJobTemplate._addParameter(oJobTemplate.workflow,  # pylint: disable=protected-access
                                       'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
        if not oJobTemplate.workflow.findParameter('JOB_ID'):
            oJobTemplate._addParameter(oJobTemplate.workflow,  # pylint: disable=protected-access
                                       'JOB_ID', 'string', '00000000', "Initial JOB_ID")

        # the same for every task of this transformation
        isHospitalTrans = int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]

        getOutputDataTiming = 0.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if site is not None:
                paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))  # pylint: disable=protected-access
            # NOTE(review): the output data module always gets InputData=None here, while the bulk
            # preparation passes the task input data - confirm whether this is intended
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue

            self._logDebug('Setting Site: ', str(sites), transID=transID, method=method)
            res = oJob.setDestination(sites)
            if not res['OK']:
                self._logError('Could not set the site: %s' % res['Message'],
                               transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            if isHospitalTrans:
                self._handleHospital(oJob)

            # stays '' when the output data cannot be generated
            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))  # pylint: disable=protected-access
            paramsDict['TaskObject'] = oJob

        self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                         transID=transID, method=method)
        self._logInfo('Prepared %d tasks' % len(taskDict),
                      transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters

            :param dict paramsDict: task parameters, handed over to the destination plugin
            :return: list of sites. The list is empty when the destination plugin object cannot be
                     created (callers treat an empty list as "no site found")
        """
        # 'Site' comes from the XML and therefore is ; separated
        requestedSite = paramsDict.get('Site')
        sites = fromChar(requestedSite, sepChar=';') if requestedSite else ['ANY']

        destinationPlugin_o = self.destinationPlugin_o
        if not destinationPlugin_o:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal("Could not generate a destination plugin object")
                # Callers expect a list of sites: the S_ERROR structure is a non-empty dictionary
                # that would pass their "if not sites" check and be used as a list of sites
                return []
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)

            A failure to set the input data is logged, not returned.

            :param oJob: job object to update
            :param dict paramsDict: task parameters ('InputData' is optional, 'TransformationID' is not)
        """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if not inputData:
            return

        self._logVerbose('Setting input data to %s' % inputData,
                         transID=transID, method='_handleInputs')
        res = oJob.setInputData(inputData)
        if not res['OK']:
            self._logError("Could not set the inputs: %s" % res['Message'],
                           transID=transID, method='_handleInputs')

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination

            Parameters with an empty value are ignored.

            :param oJob: job object to update
            :param dict paramsDict: task parameters
        """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName in ('InputData', 'Site', 'TargetSE') or not paramValue:
                continue
            self._logDebug('Setting %s to %s' % (paramName, paramValue),
                           transID=transID, method='_handleRest')
            oJob._addJDLParameter(paramName, paramValue)  # pylint: disable=protected-access

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs

            The job gets the 'Hospital' type, the 'download' input data policy, the site given by
            the "Hospital/HospitalSite" option and, if defined, the CEs of "Hospital/HospitalCEs".

            :param oJob: job object to update
        """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)  # pylint: disable=protected-access

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

            :param str plugin: name of the plugin
            :return: S_OK(plugin object) / S_ERROR(message)
        """
        method = '__generatePluginObject'
        try:
            # self.pluginLocation is not set in this class - presumably by the base class
            plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
        except ImportError as e:
            message = "Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e)
            self._logException(message, method=method)
            return S_ERROR(message)
        try:
            plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            message = "Failed to create %s(): %s." % (plugin, e)
            self._logException(message, method=method)
            return S_ERROR(message)

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin

            The output data module object is created on the first call and then reused.

            :param dict paramDict: argument of the module, set as its paramDict attribute
            :return: what the execute() method of the module returns, or the error of the module factory
        """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks

            The bulk submission is used when taskDict contains the 'BulkJobObject' key.

            :param dict taskDict: dictionary of prepared tasks
            :return: S_OK/S_ERROR with updated taskDict
        """
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job

            The 'BulkJobObject' entry is removed from taskDict. The returned job IDs are assigned to
            the tasks in sorted taskID order.

            :param dict taskDict: dictionary of prepared tasks, including the 'BulkJobObject' entry
            :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = '__submitTransformationTasksBulk'

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found', transID=transID, method=method)
            return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            self._logError('Failed to submit tasks to external', transID=transID, method=method)
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            # no way to tell which task got which job
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                      transID=transID, method=method)
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one

            Each task gets a 'Success' flag and, when submitted, its 'ExternalID'.

            :param dict taskDict: dictionary of prepared tasks
            :return: S_OK with updated taskDict
        """
        method = '__submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                # the preparation of this task failed
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res['Message'],
                               transID=transID, method=method)
                task['Success'] = False
                failed += 1
        # transID is only used below when at least one task was looped over
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                          transID=transID, method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job (which can be a bulk one) to the WMS.

            :param job: a job object (instance of the job class) or a string the job class can be
                        built from
            :return: S_ERROR when no job object is available, else the result of the submission client
        """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())  # pylint: disable=protected-access
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)  # pylint: disable=protected-access
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        """ Look for the WMS jobs matching a list of tasks, by job name

            :param list taskDicts: dictionaries with (at least) the 'TransformationID' and 'TaskID' keys
            :return: S_OK with a dictionary: 'NoTasks' is the list of job names without a job,
                     'TaskNameIDs' maps the job names that were found to their job ID. S_ERROR when
                     the jobs cannot be listed
        """
        # no transformation ID is attached to the log messages of this method
        transID = None
        jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
                    for taskDict in taskDicts]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS", res['Message'],
                           transID=transID, method='updateTransformationReservedTasks')
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            summary = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not summary['OK']:
                # the job is then reported as having no task
                self._logWarn("Failed to get task summary from WMS", summary['Message'],
                              transID=transID, method='updateTransformationReservedTasks')
            else:
                jobNameIDs[summary['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """ Check the status of a list of tasks and return lists of taskIDs for each new status

            Tasks with a null 'ExternalID' are ignored. A job without status in the reply is
            considered as removed, and its task gets the "Failed" status.

            :param list taskDicts: dictionaries with the 'TransformationID', 'TaskID', 'ExternalID'
                                   and 'ExternalStatus' keys
            :return: S_OK with a dictionary {newStatus: [taskIDs]}, S_ERROR when the statuses
                     cannot be obtained
        """
        method = 'getSubmittedTaskStatus'

        if not taskDicts:
            return S_OK({})
        wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
        transID = taskDicts[0]['TransformationID']

        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus == newStatus:
                continue
            if newStatus == "Removed":
                self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                                 (transID, taskID, oldStatus), transID=transID, method=method)
                newStatus = "Failed"
            self._logVerbose('Setting job status for Production/Job %d/%d to %s' %
                             (transID, taskID, newStatus), transID=transID, method=method)
            updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """ Check the status of a list of files and return the new status of each LFN

            Files of tasks without a job become 'Unused'. Files of 'Done' or 'Completed' jobs become
            'Processed', files of 'Failed' jobs become 'Unused'. Only files whose status changes
            are returned.

            :param list fileDicts: dictionaries with the 'TransformationID', 'TaskID', 'LFN' and
                                   'Status' keys
            :return: S_OK with a dictionary {LFN: newStatus}, S_ERROR when the jobs cannot be queried
        """
        if not fileDicts:
            return S_OK({})

        method = 'getSubmittedFileStatus'

        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID, method=method)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        # job statuses that change the status of the files; the other ones leave the files untouched
        fileStatusForJobStatus = {'Done': 'Processed',
                                  'Completed': 'Processed',
                                  'Failed': 'Unused'}
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            newFileStatus = fileStatusForJobStatus.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
class WorkflowTasks(TaskBase):
    """ Handles jobs: prepares job objects from transformation tasks, submits them
        through the submission client and maps the job statuses reported by the
        job monitoring client back to task and file statuses.
    """

    def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
                 outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
                 ownerDN=None, ownerGroup=None):
        """ Generates some default objects.

            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
            VOs can pass in their job class extension, if present

            :param transClient: passed through to the base class constructor
            :param logger: logger to use; a 'WorkflowTasks' sub-logger of gLogger by default
            :param submissionClient: client used to submit jobs; a WMSClient by default
            :param jobMonitoringClient: client used to query jobs; a JobMonitoringClient by default
            :param outputDataModule: name of the module computing the output data; read from
                                     the Operations option "Transformations/OutputDataModule" by default
            :param jobClass: class used to build the job objects
            :param opsH: Operations helper; a new Operations() by default
            :param destinationPlugin: name of the destination plugin; read from the Operations
                                      option 'Transformations/DestinationPlugin' ('BySE') by default
            :param ownerDN: DN the default WMSClient is delegated to
            :param ownerGroup: group the default WMSClient is delegated to
        """
        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Certificates are only used when both the DN and the group are given
        useCertificates = bool(ownerDN and ownerGroup)

        if not submissionClient:
            submissionClient = WMSClient(useCertificates=useCertificates,
                                         delegatedDN=ownerDN,
                                         delegatedGroup=ownerGroup)
        self.submissionClient = submissionClient

        if not jobMonitoringClient:
            jobMonitoringClient = JobMonitoringClient()
        self.jobMonitoringClient = jobMonitoringClient

        self.jobClass = jobClass if jobClass else Job

        # The Operations helper must exist before the two defaults below are read from it
        self.opsH = opsH if opsH else Operations()

        if not outputDataModule:
            outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
        self.outputDataModule = outputDataModule

        if not destinationPlugin:
            destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
        self.destinationPlugin = destinationPlugin

        # Plugin/module objects are created lazily on first use and then cached
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='',
                                   bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

            :param transBody: transformation job template
            :param taskDict: dictionary of per task parameters
            :param owner: owner of the transformation
            :param ownerGroup: group of the owner of the transformation
            :param ownerDN: DN of the owner of the transformation
            :param bulkSubmissionFlag: if True, a single job object is prepared for bulk submission
            :return: S_OK/S_ERROR with updated taskDict
        """
        # Fall back to the current proxy when the owner or its group is missing
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

            :param transBody: transformation job template
            :param taskDict: dictionary of per task parameters, keyed by task ID
            :param owner: owner set on the job
            :param ownerGroup: owner group set on the job
            :param ownerDN: owner DN set on the job
            :return: S_OK(taskDict) with the job stored under 'BulkJobObject', S_OK({}) when
                     there is nothing to prepare, S_ERROR otherwise
        """
        if not taskDict:
            return S_OK({})
        transID = taskDict.values()[0]['TransformationID']

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        method = 'prepareTransformationTasksBulk'
        self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # Names of the parameters produced by the output data module. They are accumulated
        # over all the tasks so that the final decision does not depend on the last task only.
        outputParameterNames = set()
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            self._logVerbose('Setting Site: ', str(sites), transID=transID)
            seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                self._logVerbose('Setting input data to %s' % inputData,
                                 transID=transID, method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                # A previous task had input data and this one has none
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName in ('InputData', 'Site', 'TargetSE') or not paramValue:
                    continue
                self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                                 transID=transID, method=method)
                seqDict[paramName] = paramValue

            if self.outputDataModule:
                res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                if not res['OK']:
                    # The task is left out of the sequences
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterNames.add(name)
                    # The workflow parameter is a placeholder resolved per job from the sequence
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                                           name, 'JDL', "%%(%s)s" % name, name)

            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        workflowParameters = set(['JOB_ID', 'PRODUCTION_ID', 'InputData']) | outputParameterNames
        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in workflowParameters:
                oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                oJob.setParameterSequence(paramName, paramSeq)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

            :param transBody: transformation job template
            :param taskDict: dictionary of per task parameters, keyed by task ID
            :param owner: owner set on the jobs
            :param ownerGroup: owner group set on the jobs
            :param ownerDN: owner DN set on the jobs
            :return: S_OK(taskDict); each task gets a 'TaskObject' entry which is the job
                     object, or '' when the job could not be prepared
        """
        method = '__prepareTasks'
        startTime = time.time()
        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        site = oJobTemplate.workflow.findParameter('Site').getValue()
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
        templateReady = False
        getOutputDataTiming = 0.

        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if not templateReady:
                templateReady = True
                # Update the template with common information (taken from the first task)
                transID = paramsDict['TransformationID']
                self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                                 transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose('Adding default transformation group of %s' % (transGroup),
                                 transID=transID, method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
                    oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'PRODUCTION_ID',
                                               'string',
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter('JOB_ID'):
                    oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID', 'string',
                                               '00000000', "Initial JOB_ID")

            paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
            # NOTE(review): the output data module always receives InputData=None here, while
            # the bulk preparation passes the task input data - confirm this is intended.
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites',
                               transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue

            self._logDebug('Setting Site: ', str(sites),
                           transID=transID, method=method)
            res = oJob.setDestination(sites)
            if not res['OK']:
                self._logError('Could not set the site: %s' % res['Message'],
                               transID=transID, method=method)
                # Mark the task as not prepared, as done for all the other failures
                paramsDict['TaskObject'] = ''
                continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            # Stays empty if the output data cannot be generated
            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                # Accumulate the time spent in getOutputData over all the tasks
                getOutputDataTiming -= time.time()
                res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,
                                          'TaskID': taskID, 'InputData': inputData})
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))
            paramsDict['TaskObject'] = oJob

        # transID is only known when at least one task was looped over
        if taskDict:
            self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                             transID=transID, method=method)
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters

            :param paramsDict: task parameters; 'Site', if present and not empty, is a
                               ';' separated string
            :return: list of destination sites. The list is empty when the destination
                     plugin object cannot be created or when the plugin sites and the
                     requested sites have nothing in common.
        """
        sites = ['ANY']
        if paramsDict.get('Site'):
            # 'Site' comes from the XML and therefore is ; separated
            sites = fromChar(paramsDict['Site'], sepChar=';')

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal("Could not generate a destination plugin object")
                # The callers only test the returned list for emptiness: an error structure
                # would be taken for a valid list of sites
                return []
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Build a new set rather than changing the object returned by the plugin
            destSites = set(destSites) & set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)

            :param oJob: job object on which the input data are set
            :param paramsDict: task parameters; 'TransformationID' is mandatory,
                               'InputData' is optional
        """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if not inputData:
            return
        self._logVerbose('Setting input data to %s' % inputData,
                         transID=transID, method='handleInputs')
        oJob.setInputData(inputData)

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination

            :param oJob: job object to which the JDL parameters are added
            :param paramsDict: task parameters; 'TransformationID' is mandatory. Parameters
                               with an empty value are not added.
        """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName in ('InputData', 'Site', 'TargetSE') or not paramValue:
                continue
            self._logDebug('Setting %s to %s' % (paramName, paramValue),
                           transID=transID, method='handleRest')
            oJob._addJDLParameter(paramName, paramValue)

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs

            :param oJob: job object; its type, input data policy, destination and, when
                         the "Hospital/HospitalCEs" option is set, its 'GridCE' are changed
        """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

            :param plugin: plugin name given to the TaskManagerPlugin constructor
            :return: S_OK(plugin object) or S_ERROR
        """
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e))
            return S_ERROR("Failed to import 'TaskManagerPlugin'")
        try:
            plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e))
            return S_ERROR("Failed to create the plugin object")

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin

            :param paramDict: "argument" of the output data module
            :return: what the module execute() returns, or the error of the module creation
        """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks

            :param taskDict: dictionary of tasks, as returned by prepareTransformationTasks.
                             The presence of 'BulkJobObject' selects the bulk submission.
            :return: S_OK(taskDict)/S_ERROR
        """
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job

            :param taskDict: dictionary of tasks, holding the job under 'BulkJobObject'
                             (this entry is removed)
            :return: S_OK(taskDict) with 'ExternalID' and 'Success' set for each task, S_ERROR otherwise
        """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found', transID=transID, method='submitTransformationTasksBulk')
            return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            # Without one job per task the jobs cannot be matched to the tasks
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                      transID=transID, method='submitTransformationTasksBulk')
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one

            :param taskDict: dictionary of tasks; the job of each task is under 'TaskObject'
            :return: S_OK(taskDict) with 'Success' set for each task and 'ExternalID' set
                     for the submitted ones
        """
        method = 'submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            # A missing or empty job object means that the task was not prepared
            if not task.get('TaskObject'):
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res['Message'],
                               transID=transID, method=method)
                task['Success'] = False
                failed += 1
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                          transID=transID, method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job to the WMS.

            :param job: either a string from which the job object is created or an
                        instance of the job class
            :return: result of the submission client submitJob(), or S_ERROR when no job
                     object can be obtained
        """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        """ Look in the WMS for the jobs corresponding to a list of tasks

            :param taskDicts: list of dictionaries with 'TransformationID' and 'TaskID'
            :return: S_OK with a dictionary holding 'NoTasks' (list of the job names not
                     found) and 'TaskNameIDs' (job name -> WMS job ID), S_ERROR otherwise
        """
        method = 'updateTransformationReservedTasks'
        # Only used for logging
        transID = taskDicts[0]['TransformationID'] if taskDicts else None
        jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
                    for taskDict in taskDicts]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS", res['Message'],
                           transID=transID, method=method)
            return res

        jobNameIDs = {}
        for wmsID in res['Value']:
            summary = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not summary['OK']:
                # Such a job ends up in 'NoTasks'
                self._logWarn("Failed to get task summary from WMS", summary['Message'],
                              transID=transID, method=method)
            else:
                jobNameIDs[summary['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """ Check the status of a list of tasks and return lists of taskIDs for each new status

            :param taskDicts: list of dictionaries with 'TransformationID', 'TaskID',
                              'ExternalID' and 'ExternalStatus'
            :return: S_OK with a dictionary new status -> list of task IDs, holding only
                     the tasks whose status changed; S_ERROR otherwise
        """
        if not taskDicts:
            return S_OK({})
        wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
        transID = taskDicts[0]['TransformationID']

        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID)
            return res
        statusDict = res['Value']

        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            # A job unknown to the WMS is considered as removed
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus == newStatus:
                continue
            if newStatus == "Removed":
                self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                                 (transID, taskID, oldStatus), transID=transID)
                newStatus = "Failed"
            self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus),
                             transID=transID)
            updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """ Check the status of a list of files and return the new status of each LFN

            :param fileDicts: list of dictionaries with 'TransformationID', 'TaskID',
                              'LFN' and 'Status'
            :return: S_OK with a dictionary LFN -> new status, holding only the files whose
                     status has to change; S_ERROR otherwise
        """
        if not fileDicts:
            return S_OK({})

        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        # The files of the tasks without a job are set back to Unused
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID)
            return res
        statusDict = res['Value']

        # The other job statuses do not change the file status
        fileStatusForJobStatus = {'Done': 'Processed',
                                  'Completed': 'Processed',
                                  'Failed': 'Unused'}
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            newFileStatus = fileStatusForJobStatus.get(jobStatus)
            if not newFileStatus:
                continue
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if newFileStatus != oldStatus:
                    updateDict[lfn] = newFileStatus
        return S_OK(updateDict)