def _setup_bulk_subjobs(self, dirac_ids, dirac_script):
    """ This is the old bulk submit method which is used to construct the subjobs for a parametric job
    Args:
        dirac_ids (list): This is a list of the Dirac ids which have been created
        dirac_script (str): Name of the dirac script which contains the job jdl
    """
    f = open(dirac_script, 'r')
    parametric_datasets = get_parametric_datasets(f.read().split('\n'))
    f.close()
    if len(parametric_datasets) != len(dirac_ids):
        raise BackendError('Dirac', 'Mismatch between number of datasets defined in dirac API script and those returned by DIRAC')

    from Ganga.GPIDev.Lib.Job.Job import Job
    master_job = self.getJobObject()
    master_job.subjobs = []
    for i in range(len(dirac_ids)):
        j = Job()
        j.copyFrom(master_job)
        j.splitter = None
        j.backend.id = dirac_ids[i]
        j.id = i
        j.inputdata = self._setup_subjob_dataset(parametric_datasets[i])
        j.status = 'submitted'
        j.time.timenow('submitted')
        master_job.subjobs.append(j)
    return True
def kill(self):
    """ Kill a Dirac job"""
    global dirac_ganga_server
    if not self.id:
        return None
    dirac_cmd = 'result = DiracCommands.kill(%d)' % self.id
    result = dirac_ganga_server.execute(dirac_cmd)
    if not result_ok(result):
        raise BackendError('Dirac', 'Could not kill job: %s' % str(result))
    return result['OK']
def kill(self):
    """ Kill a Dirac job"""
    if not self.id:
        return None
    dirac_cmd = 'kill(%d)' % self.id
    result = execute(dirac_cmd)
    if not result_ok(result):
        raise BackendError('Dirac', 'Could not kill job: %s' % str(result))
    return result['OK']
def retrievePandaJobs(job, jIDs):
    ''' Retrieve the Panda jobs with the given Panda job ids (belonging to one jobDefId) '''
    from pandatools import Client

    ick = False
    jstatus = ''
    num_pjobs = 0

    logger.debug("retrievePandaJobs jIDs=%s" % jIDs)

    # get status from Panda server
    rc, jobsStatus = Client.getFullJobStatus(jIDs, False)
    if rc:
        logger.error('Return code %d retrieving job status information.', rc)
        raise BackendError('Jedi', 'Return code %d retrieving job status information.' % rc)

    for status in jobsStatus:
        if not status:
            continue

        jstatus = status.jobStatus
        if status.jobStatus == None:
            logger.warning('No panda jobs expected')
            job.backend.pandajobs = []

        elif status.jobStatus in ["defined", "activated", "running", "failed",
                                  "finished", "holding", "assigned"]:
            logger.debug('Panda jobs are running')
            logger.debug("PandaID: %d" % status.PandaID)

            pjobj = JediPandaJob()
            pjobj.id = status.PandaID
            pjobj.url = 'http://panda.cern.ch/?job=%d' % status.PandaID
            pjobj.jobSpec = dict(zip(status._attributes, status.values()))

            for k in pjobj.jobSpec.keys():
                if type(pjobj.jobSpec[k]) not in [type(''), type(1)]:
                    pjobj.jobSpec[k] = str(pjobj.jobSpec[k])

            if pjobj not in job.backend.pandajobs:
                job.backend.pandajobs.append(pjobj)
            else:
                logger.debug("Panda job %s already exists locally" % pjobj.id)

            num_pjobs += 1
        else:
            logger.warning("getFullJobStatus returned unsupported status %s for Panda job %s "
                           % (status.jobStatus, status.PandaID))

        ick = True

    return (ick, jstatus, num_pjobs)
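# Illustrative sketch (not part of the handler above): the dict(zip(status._attributes, status.values()))
# pattern flattens a Panda status record into a plain dict and then stringifies anything that is not
# already a str or an int. The FakeStatus class and its values below are invented for this example.
import datetime

class FakeStatus(object):
    """Stand-in for a Panda job-status record (illustrative only)."""
    _attributes = ('PandaID', 'jobStatus', 'creationTime')

    def values(self):
        return (4321, 'running', datetime.datetime(2015, 1, 1))

status = FakeStatus()
jobSpec = dict(zip(status._attributes, status.values()))
for k in jobSpec.keys():
    if type(jobSpec[k]) not in [type(''), type(1)]:
        jobSpec[k] = str(jobSpec[k])

print(jobSpec)  # creationTime is now the string '2015-01-01 00:00:00'; the int and str values are untouched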
def command(klass, cmd, soutfile=None, allowed_exit=None):
    if allowed_exit is None:
        allowed_exit = [0]
    rc, soutfile, ef = shell_cmd(cmd, soutfile, allowed_exit)
    if not ef:
        logger.error('Problem submitting batch job. Maybe your chosen batch system is not available or you have configured it wrongly')
        with open(soutfile) as sout_file:
            output = sout_file.read()
        logger.error(output)
        # Note: the error object is constructed here but never raised; the
        # (rc, soutfile) tuple is still returned to the caller below.
        raiseable = BackendError(klass._name,
                                 'It seems that %s commands are not installed properly: %s'
                                 % (klass._name, output.split('\n')[0] if output else ''))
    return rc, soutfile
def createContainer(name):
    from pandatools import Client

    # don't create containers for HC datasets
    if (not configPanda['processingType'].startswith('gangarobot') and
            not configPanda['processingType'].startswith('hammercloud')):
        try:
            Client.createContainer(name, False)
            logger.info('Created output container %s' % name)
        except exceptions.SystemExit:
            raise BackendError('Panda', 'Exception in Client.createContainer %s: %s %s'
                               % (name, sys.exc_info()[0], sys.exc_info()[1]))
def checkForRebrokerage(string):
    import re
    matchObj = re.match(
        'reassigned to another site by rebrokerage. new PandaID=(\d+) JobsetID=(\d+) JobID=(\d+)',
        string)
    if matchObj:
        newPandaID = long(matchObj.group(1))
        newJobsetID = long(matchObj.group(2))
        newJobID = long(matchObj.group(3))
        return newPandaID
    raise BackendError('Jedi', 'Error getting new PandaID for rebrokered job. Report to DA Help')
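# Illustrative sketch (not part of the backend): the rebrokerage message format that
# checkForRebrokerage() parses, with a made-up taskBufferErrorDiag string. Only the new
# PandaID (the first group) is used by the caller.
import re

msg = 'reassigned to another site by rebrokerage. new PandaID=4321 JobsetID=7 JobID=12'
match = re.match(r'reassigned to another site by rebrokerage. '
                 r'new PandaID=(\d+) JobsetID=(\d+) JobID=(\d+)', msg)
if match:
    print(int(match.group(1)))  # -> 4321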
def _submit(self):
    '''Submit the job via the Dirac server.'''
    self.id = None
    self.actualCE = None
    self.status = None
    global dirac_ganga_server
    dirac_cmd = "execfile('%s')" % self._getDiracScript()
    result = dirac_ganga_server.execute(dirac_cmd)

    err_msg = 'Error submitting job to Dirac: %s' % str(result)
    if not result_ok(result) or not result.has_key('Value'):
        logger.error(err_msg)
        raise BackendError('Dirac', err_msg)

    self.id = result['Value']
    j = self.getJobObject()
    gDiracTask.addTaskJob(self.id, j.id)
    return type(self.id) == int
def _common_submit(self, dirac_script):
    '''Submit the job via the Dirac server.
    Args:
        dirac_script (str): filename of the JDL which is to be submitted to DIRAC
    '''
    j = self.getJobObject()
    self.id = None
    self.actualCE = None
    self.status = None
    self.extraInfo = None
    self.statusInfo = ''
    j.been_queued = False
    dirac_cmd = """execfile(\'%s\')""" % dirac_script
    result = execute(dirac_cmd)
    # Could use the below code instead to submit on a thread
    # If submitting many then user may terminate ganga before
    # all jobs submitted
#    def submit_checker(result, job, script):
#        err_msg = 'Error submitting job to Dirac: %s' % str(result)
#        if not result_ok(result) or 'Value' not in result:
#            logger.error(err_msg)
#            raise BackendError('Dirac', err_msg)
#
#        idlist = result['Value']
#        if type(idlist) is list:
#            return job._setup_bulk_subjobs(idlist, script)
#        job.id = idlist
#    server.execute_nonblocking(dirac_cmd, callback_func=submit_checker, args=(self, dirac_script))
#    return True

    err_msg = 'Error submitting job to Dirac: %s' % str(result)
    if not result_ok(result) or 'Value' not in result:
        logger.error(err_msg)
        logger.error("\n\n===\n%s\n===\n" % dirac_script)
        logger.error("\n\n====\n")
        with open(dirac_script, 'r') as file_in:
            logger.error("%s" % file_in.read())
        logger.error("\n====\n")
        raise BackendError('Dirac', err_msg)

    idlist = result['Value']
    if type(idlist) is list:
        return self._setup_bulk_subjobs(idlist, dirac_script)

    self.id = idlist
    return type(self.id) == int
def _setup_bulk_subjobs(self, dirac_ids, dirac_script):
    f = open(dirac_script, 'r')
    parametric_datasets = get_parametric_datasets(f.read().split('\n'))
    f.close()
    if len(parametric_datasets) != len(dirac_ids):
        raise BackendError('Dirac', 'Mismatch between number of datasets defined in dirac API script and those returned by DIRAC')

    from Ganga.GPIDev.Lib.Job.Job import Job
    master_job = self.getJobObject()
    master_job.subjobs = []
    for i in range(len(dirac_ids)):
        j = Job()
        j.copyFrom(master_job)
        j.splitter = None
        j.backend.id = dirac_ids[i]
        j.id = i
        j.inputdata = self._setup_subjob_dataset(parametric_datasets[i])
        j.status = 'submitted'
        j.time.timenow('submitted')
        master_job.subjobs.append(j)
    master_job._commit()
    return True
def submit(self, subjobconfig, master_input_sandbox):
    """Submit a DIRAC job"""
    j = self.getJobObject()

    dirac_script = subjobconfig.script
    dirac_script.name = mangle_job_name(j)
    dirac_script.settings = self.settings
    dirac_script.dirac_opts = self.diracOpts

    sboxname = j.createPackedInputSandbox(subjobconfig.getSandboxFiles())
    script_file = self._getDiracScript()
    dirac_script.input_sandbox = [sboxname[0], master_input_sandbox[0], script_file]

    for lfn in self.inputSandboxLFNs:
        from GangaBoss.Lib.Dataset.PhysicalFile import PhysicalFile
        if type(lfn) is PhysicalFile:
            msg = 'Dirac.inputSandboxLFNs cannot contain a PhysicalFile.'
            logger.error(msg)
            raise BackendError('Dirac', msg)
        dirac_script.input_sandbox.append('LFN:' + lfn.name)

    dirac_script.write(script_file)
    return self._submit()
def checkReport(self, jobDoc):
    job = self.getJobObject()

    config = Config.getConfig('Metrics')
    location = config['location']
    if not os.path.exists(location):
        raise BackendError(0, "Metrics file %s doesn't exist." % (location))

    config = ConfigParser()
    config.read(location)

    PARAMS = [('status', 'status')]
    if config.has_section('report'):
        PARAMS += config.items('report')
    else:
        logger.warning('No report in metrics')

    for n, v in PARAMS:
        if v:
            job.backend.report[v] = jobDoc.getAttribute(v)
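# Illustrative sketch (not part of the backend): the kind of metrics file checkReport() assumes,
# an INI file whose [report] section maps attribute names to report keys. The file contents and
# option names below are invented; note that ConfigParser lower-cases option names by default.
from ConfigParser import ConfigParser  # Python 2, matching the surrounding code
from StringIO import StringIO

metrics_ini = """
[report]
ExeTime = exe_time
NEvents = n_events
"""

config = ConfigParser()
config.readfp(StringIO(metrics_ini))

PARAMS = [('status', 'status')]
if config.has_section('report'):
    PARAMS += config.items('report')
print(PARAMS)  # [('status', 'status'), ('exetime', 'exe_time'), ('nevents', 'n_events')]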
def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
    logger.debug("Prepare")

    inputsandbox, outputsandbox = sandbox_prepare(app, appsubconfig, appmasterconfig, jobmasterconfig)

    job = stripProxy(app).getJobObject()

    if job.inputdata:
        if not job.splitter:
            if len(job.inputdata) > 100:
                raise BackendError(
                    "You're submitting a job to Dirac with no splitter and more than 100 files, please add a splitter and try again!")

    outputfiles = [this_file for this_file in job.outputfiles if isType(this_file, DiracFile)]

    data_str = 'import os\n'
    data_str += 'execfile(\'data.py\')\n'

    if hasattr(job, '_splitter_data'):
        data_str += job._splitter_data
    inputsandbox.append(FileBuffer('data-wrapper.py', data_str))

    input_data = []

    # Can't wait to get rid of this when people no longer specify
    # inputdata in options file
    #######################################################################
    # splitters ensure that subjobs pick up inputdata from job over that in
    # optsfiles but need to take care of unsplit jobs
    if not job.master:
        share_path = os.path.join(get_share_path(app), 'inputdata', 'options_data.pkl')

        if not job.inputdata:
            if os.path.exists(share_path):
                f = open(share_path, 'r+b')
                job.inputdata = pickle.load(f)
                f.close()

    #######################################################################
    # Can't wait to get rid of this when people no longer specify
    # outputsandbox or outputdata in options file
    #######################################################################
    share_path = os.path.join(get_share_path(app), 'output', 'options_parser.pkl')

    if os.path.exists(share_path):
        # if not os.path.exists(share_path):
        #     raise GangaException('could not find the parser')
        f = open(share_path, 'r+b')
        parser = pickle.load(f)
        f.close()

        outbox, outdata = parser.get_output(job)

        from Ganga.GPIDev.Lib.File import FileUtils
        from Ganga.GPIDev.Base.Filters import allComponentFilters

        fileTransform = allComponentFilters['gangafiles']
        outdata_files = [fileTransform(this_file, None) for this_file in outdata
                         if not FileUtils.doesFileExist(this_file, job.outputfiles)]
        job.non_copyable_outputfiles.extend([output_file for output_file in outdata_files
                                             if not isType(output_file, DiracFile)])
        outbox_files = [fileTransform(this_file, None) for this_file in outbox
                        if not FileUtils.doesFileExist(this_file, job.outputfiles)]
        job.non_copyable_outputfiles.extend([outbox_file for outbox_file in outbox_files
                                             if not isType(outbox_file, DiracFile)])

        outputsandbox = [f.namePattern for f in job.non_copyable_outputfiles]
        outputsandbox.extend([f.namePattern for f in job.outputfiles
                              if not isType(f, DiracFile)])
        outputsandbox = unique(outputsandbox)  # + outbox[:])
    #######################################################################

    input_data_dirac, parametricinput_data = dirac_inputdata(job.application)

    if input_data_dirac is not None:
        for f in input_data_dirac:
            if isType(f, DiracFile):
                input_data.append(f.lfn)
            elif isType(f, str):
                input_data.append(f)
            else:
                raise ApplicationConfigurationError(
                    "Don't know how to handle anything other than DiracFiles or strings to LFNs!")

    commandline = "python ./gaudipython-wrapper.py"
    if is_gaudi_child(app):
        commandline = 'gaudirun.py '
        commandline += ' '.join([str(arg) for arg in app.args])
        commandline += ' options.pkl data-wrapper.py'
    logger.debug('Command line: %s: ', commandline)

    gaudi_script_path = os.path.join(job.getInputWorkspace().getPath(), "gaudi-script.py")
    script_generator(gaudi_script_template(),
                     #remove_unreplaced = False,
                     outputfile_path=gaudi_script_path,
                     PLATFORM=app.platform,
                     COMMAND=commandline,
                     XMLSUMMARYPARSING=getXMLSummaryScript()  # ,
                     #OUTPUTFILESINJECTEDCODE = getWNCodeForOutputPostprocessing(job, ' ')
                     )

    #logger.debug( "input_data %s" % str( input_data ) )

    # We want to propagate the ancestor depth to DIRAC when we have
    # inputdata set
    if job.inputdata is not None and isType(job.inputdata, LHCbDataset):

        # As the RT Handler we already know we have a Dirac backend
        if type(job.backend.settings) is not dict:
            raise ApplicationConfigurationError(None, 'backend.settings should be a dict')

        if 'AncestorDepth' in job.backend.settings:
            ancestor_depth = job.backend.settings['AncestorDepth']
        else:
            ancestor_depth = job.inputdata.depth
    else:
        ancestor_depth = 0

    lhcbdirac_script_template = lhcbdiracAPI_script_template()

    lhcb_dirac_outputfiles = lhcbdirac_outputfile_jdl(outputfiles)

    # not necessary to use lhcbdiracAPI_script_template any more as doing our own uploads to Dirac
    # remove after Ganga6 release
    # NOTE special case for replicas: replicate string must be empty for no
    # replication
    dirac_script = script_generator(lhcbdirac_script_template,
                                    DIRAC_IMPORT='from LHCbDIRAC.Interfaces.API.DiracLHCb import DiracLHCb',
                                    DIRAC_JOB_IMPORT='from LHCbDIRAC.Interfaces.API.LHCbJob import LHCbJob',
                                    DIRAC_OBJECT='DiracLHCb()',
                                    JOB_OBJECT='LHCbJob()',
                                    NAME=mangle_job_name(app),
                                    APP_NAME=stripProxy(app).appname,
                                    APP_VERSION=app.version,
                                    APP_SCRIPT=gaudi_script_path,
                                    APP_LOG_FILE='Ganga_%s_%s.log' % (stripProxy(app).appname, app.version),
                                    INPUTDATA=input_data,
                                    PARAMETRIC_INPUTDATA=parametricinput_data,
                                    OUTPUT_SANDBOX=API_nullifier(outputsandbox),
                                    OUTPUTFILESSCRIPT=lhcb_dirac_outputfiles,
                                    # job.fqid, #outputdata_path,
                                    OUTPUT_PATH="",
                                    OUTPUT_SE=getConfig('DIRAC')['DiracOutputDataSE'],
                                    SETTINGS=diracAPI_script_settings(job.application),
                                    DIRAC_OPTS=job.backend.diracOpts,
                                    PLATFORM=app.platform,
                                    REPLICATE='True' if getConfig('DIRAC')['ReplicateOutputData'] else '',
                                    ANCESTOR_DEPTH=ancestor_depth,
                                    ## This is to be modified in the final 'submit' function in the backend
                                    ## The backend also handles the inputfiles DiracFiles as appropriate
                                    INPUT_SANDBOX='##INPUT_SANDBOX##')

    logger.debug("prepare: LHCbGaudiDiracRunTimeHandler")

    return StandardJobConfig(dirac_script,
                             inputbox=unique(inputsandbox),
                             outputbox=unique(outputsandbox))
def _internal_job_finalisation(job, updated_dirac_status):
    logger = getLogger()

    if updated_dirac_status == 'completed':
        # firstly update job to completing
        DiracBase._getStateTime(job, 'completing')
        if job.status in ['removed', 'killed']:
            return
        if (job.master and job.master.status in ['removed', 'killed']):
            return  # user changed it under us

        job.updateStatus('completing')
        if job.master:
            job.master.updateMasterJobStatus()

        import time
        start = time.time()
        # contact dirac for information
        job.backend.normCPUTime = execute('normCPUTime(%d)' % job.backend.id)
        getSandboxResult = execute("getOutputSandbox(%d,'%s')" % (job.backend.id, job.getOutputWorkspace().getPath()))
        file_info_dict = execute('getOutputDataInfo(%d)' % job.backend.id)
        now = time.time()
        logger.debug('Job ' + job.fqid + ' Time for Dirac metadata : ' + str(now - start))

        logger.debug('Job ' + job.fqid + ' OutputDataInfo: ' + str(file_info_dict))
        logger.debug('Job ' + job.fqid + ' OutputSandbox: ' + str(getSandboxResult))

        # Set DiracFile metadata
        wildcards = [f.namePattern for f in job.outputfiles.get(DiracFile) if regex.search(f.namePattern) is not None]

        with open(os.path.join(job.getOutputWorkspace().getPath(), getConfig('Output')['PostProcessLocationsFileName']), 'ab') as postprocesslocationsfile:
            if not hasattr(file_info_dict, 'keys'):
                logger.error("Error understanding OutputDataInfo: %s" % str(file_info_dict))
                from Ganga.Core.exceptions import GangaException
                raise GangaException("Error understanding OutputDataInfo: %s" % str(file_info_dict))

            for file_name in file_info_dict.get('Value', []):
                file_name = os.path.basename(file_name)
                info = file_info_dict.get(file_name)
                logger.debug("file_name: %s,\tinfo: %s" % (str(file_name), str(info)))

                valid_wildcards = [wc for wc in wildcards if fnmatch.fnmatch(file_name, wc)]
                if len(valid_wildcards) == 0:
                    valid_wildcards.append('')

                if not hasattr(info, 'get'):
                    logger.error("Error getting OutputDataInfo for: %s" % str(job.getFQID('.')))
                    logger.error("Please check the Dirac Job still exists or attempt a job.backend.reset() to try again!")
                    logger.error("Err: %s" % str(info))
                    logger.error("file_info_dict: %s" % str(file_info_dict))
                    from Ganga.Core.exceptions import GangaException
                    raise GangaException("Error getting OutputDataInfo")

                for wc in valid_wildcards:
                    logger.debug("wildcard: %s" % str(wc))

                    DiracFileData = 'DiracFile:::%s&&%s->%s:::%s:::%s\n' % (wc,
                                                                            file_name,
                                                                            info.get('LFN', 'Error Getting LFN!'),
                                                                            str(info.get('LOCATIONS', ['NotAvailable'])),
                                                                            info.get('GUID', 'NotAvailable')
                                                                            )
                    logger.debug("DiracFileData: %s" % str(DiracFileData))
                    postprocesslocationsfile.write(DiracFileData)

        # check outputsandbox downloaded correctly
        if not result_ok(getSandboxResult):
            logger.warning('Problem retrieving outputsandbox: %s' % str(getSandboxResult))
            DiracBase._getStateTime(job, 'failed')
            if job.status in ['removed', 'killed']:
                return
            if (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('failed')
            if job.master:
                job.master.updateMasterJobStatus()
            raise BackendError('Problem retrieving outputsandbox: %s' % str(getSandboxResult))

        # finally update job to completed
        DiracBase._getStateTime(job, 'completed')
        if job.status in ['removed', 'killed']:
            return
        if (job.master and job.master.status in ['removed', 'killed']):
            return  # user changed it under us
        job.updateStatus('completed')
        if job.master:
            job.master.updateMasterJobStatus()
        now = time.time()
        logger.debug('Job ' + job.fqid + ' Time for complete update : ' + str(now - start))

    elif updated_dirac_status == 'failed':
        # firstly update status to failed
        DiracBase._getStateTime(job, 'failed')
        if job.status in ['removed', 'killed']:
            return
        if (job.master and job.master.status in ['removed', 'killed']):
            return  # user changed it under us
        job.updateStatus('failed')
        if job.master:
            job.master.updateMasterJobStatus()

        # if requested try downloading outputsandbox anyway
        if getConfig('DIRAC')['failed_sandbox_download']:
            execute("getOutputSandbox(%d,'%s')" % (job.backend.id, job.getOutputWorkspace().getPath()))
    else:
        logger.error("Unexpected dirac status '%s' encountered" % updated_dirac_status)
def _internal_job_finalisation(job, updated_dirac_status):
    """ This method performs the main job finalisation
    Args:
        job (Job): This is the job we want to finalise
        updated_dirac_status (str): String representing the Ganga finalisation state of the job failed/completed
    """
    if updated_dirac_status == 'completed':
        start = time.time()
        # firstly update job to completing
        DiracBase._getStateTime(job, 'completing')
        if job.status in ['removed', 'killed']:
            return
        elif (job.master and job.master.status in ['removed', 'killed']):
            return  # user changed it under us
        job.updateStatus('completing')
        if job.master:
            job.master.updateMasterJobStatus()

        output_path = job.getOutputWorkspace().getPath()

        logger.info('Contacting DIRAC for job: %s' % job.fqid)
        # Contact dirac which knows about the job
        job.backend.normCPUTime, getSandboxResult, file_info_dict, completeTimeResult = execute("finished_job(%d, '%s')" % (job.backend.id, output_path))

        now = time.time()
        logger.info('%0.2fs taken to download output from DIRAC for Job %s' % ((now - start), job.fqid))

        #logger.info('Job ' + job.fqid + ' OutputDataInfo: ' + str(file_info_dict))
        #logger.info('Job ' + job.fqid + ' OutputSandbox: ' + str(getSandboxResult))
        #logger.info('Job ' + job.fqid + ' normCPUTime: ' + str(job.backend.normCPUTime))

        # Set DiracFile metadata
        wildcards = [f.namePattern for f in job.outputfiles.get(DiracFile) if regex.search(f.namePattern) is not None]

        lfn_store = os.path.join(output_path, getConfig('Output')['PostProcessLocationsFileName'])

        # Make the file on disk with a nullop...
        if not os.path.isfile(lfn_store):
            with open(lfn_store, 'w'):
                pass

        if job.outputfiles.get(DiracFile):

            # Now we can iterate over the contents of the file without touching it
            with open(lfn_store, 'ab') as postprocesslocationsfile:
                if not hasattr(file_info_dict, 'keys'):
                    logger.error("Error understanding OutputDataInfo: %s" % str(file_info_dict))
                    from Ganga.Core.exceptions import GangaException
                    raise GangaException("Error understanding OutputDataInfo: %s" % str(file_info_dict))

                ## Caution: it is not clear atm whether this 'Value' is an LHCbism or a bug
                list_of_files = file_info_dict.get('Value', file_info_dict.keys())

                for file_name in list_of_files:
                    file_name = os.path.basename(file_name)
                    info = file_info_dict.get(file_name)
                    #logger.debug("file_name: %s,\tinfo: %s" % (str(file_name), str(info)))

                    if not hasattr(info, 'get'):
                        logger.error("Error getting OutputDataInfo for: %s" % str(job.getFQID('.')))
                        logger.error("Please check the Dirac Job still exists or attempt a job.backend.reset() to try again!")
                        logger.error("Err: %s" % str(info))
                        logger.error("file_info_dict: %s" % str(file_info_dict))
                        from Ganga.Core.exceptions import GangaException
                        raise GangaException("Error getting OutputDataInfo")

                    valid_wildcards = [wc for wc in wildcards if fnmatch.fnmatch(file_name, wc)]
                    if not valid_wildcards:
                        valid_wildcards.append('')

                    for wc in valid_wildcards:
                        #logger.debug("wildcard: %s" % str(wc))

                        DiracFileData = 'DiracFile:::%s&&%s->%s:::%s:::%s\n' % (wc,
                                                                                file_name,
                                                                                info.get('LFN', 'Error Getting LFN!'),
                                                                                str(info.get('LOCATIONS', ['NotAvailable'])),
                                                                                info.get('GUID', 'NotAvailable')
                                                                                )
                        #logger.debug("DiracFileData: %s" % str(DiracFileData))
                        postprocesslocationsfile.write(DiracFileData)
                        postprocesslocationsfile.flush()

            logger.debug("Written: %s" % open(lfn_store, 'r').readlines())

        # check outputsandbox downloaded correctly
        if not result_ok(getSandboxResult):
            logger.warning('Problem retrieving outputsandbox: %s' % str(getSandboxResult))
            DiracBase._getStateTime(job, 'failed')
            if job.status in ['removed', 'killed']:
                return
            elif (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('failed')
            if job.master:
                job.master.updateMasterJobStatus()
            raise BackendError('Problem retrieving outputsandbox: %s' % str(getSandboxResult))

        # finally update job to completed
        DiracBase._getStateTime(job, 'completed', completeTimeResult)
        if job.status in ['removed', 'killed']:
            return
        elif (job.master and job.master.status in ['removed', 'killed']):
            return  # user changed it under us
        job.updateStatus('completed')
        if job.master:
            job.master.updateMasterJobStatus()
        now = time.time()
        logger.debug('Job ' + job.fqid + ' Time for complete update : ' + str(now - start))

    elif updated_dirac_status == 'failed':
        # firstly update status to failed
        DiracBase._getStateTime(job, 'failed')
        if job.status in ['removed', 'killed']:
            return
        if (job.master and job.master.status in ['removed', 'killed']):
            return  # user changed it under us
        job.updateStatus('failed')
        if job.master:
            job.master.updateMasterJobStatus()

        # if requested try downloading outputsandbox anyway
        if configDirac['failed_sandbox_download']:
            execute("getOutputSandbox(%d,'%s')" % (job.backend.id, job.getOutputWorkspace().getPath()))
    else:
        logger.error("Job #%s Unexpected dirac status '%s' encountered" % (job.getFQID('.'), updated_dirac_status))
def _resubmit(self):
    """Resubmit a DIRAC job"""
    j = self.getJobObject()
    parametric = False
    script_path = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py')

    # Check old script
    if j.master is None and not os.path.exists(script_path):
        raise BackendError('Dirac', 'No "dirac-script.py" found in j.inputdir')

    if j.master is not None and not os.path.exists(script_path):
        script_path = os.path.join(j.master.getInputWorkspace().getPath(), 'dirac-script.py')
        if not os.path.exists(script_path):
            raise BackendError('Dirac', 'No "dirac-script.py" found in j.inputdir or j.master.inputdir')
        parametric = True

    # Read old script
    f = open(script_path, 'r')
    script = f.read()
    f.close()

    # Create new script - ##note instead of using get_parametric_dataset
    # could just use j.inputdata.
    if parametric is True:
        parametric_datasets = get_parametric_datasets(script.split('\n'))
        if j.master:
            if len(parametric_datasets) != len(j.master.subjobs):
                raise BackendError('Dirac', 'number of parametric datasets defined in API script doesn\'t match number of master.subjobs')
        if j.inputdata and len(j.inputdata) > 0:
            _input_files = [f for f in j.inputdata if not isType(f, DiracFile)]
        else:
            _input_files = []
        if set(parametric_datasets[j.id]).symmetric_difference(set([f.namePattern for f in _input_files])):
            raise BackendError('Dirac', 'Mismatch between dirac-script and job attributes.')
        script = script.replace('.setParametricInputData(%s)' % str(parametric_datasets),
                                '.setInputData(%s)' % str(parametric_datasets[j.id]))
        script = script.replace('%n', str(j.id))  # name

    start_user_settings = '# <-- user settings\n'
    new_script = script[:script.find(start_user_settings) + len(start_user_settings)]

    job_ident = get_job_ident(script.split('\n'))
    for key, value in self.settings.iteritems():
        if str(key).startswith('set'):
            _key = key[3:]
        else:
            _key = key
        if type(value) is str:
            template = '%s.set%s("%s")\n'
        else:
            template = '%s.set%s(%s)\n'
        new_script += template % (job_ident, str(_key), str(value))
    new_script += script[script.find('# user settings -->'):]

    # Save new script
    new_script_filename = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py')
    f = open(new_script_filename, 'w')
    f.write(new_script)
    f.flush()
    f.close()
    return self._common_submit(new_script_filename)
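# Illustrative sketch (not part of the backend): how the settings-rendering loop in _resubmit()
# turns backend.settings entries into API calls appended to the dirac script. The setting names,
# values and the 'j' identifier below are invented; strings are quoted, everything else is not.
settings = {'setCPUTime': 3600, 'Destination': 'SOME.Site.example'}
job_ident = 'j'  # what get_job_ident() would extract from the script

new_script = ''
for key, value in settings.items():
    _key = key[3:] if str(key).startswith('set') else key
    template = '%s.set%s("%s")\n' if type(value) is str else '%s.set%s(%s)\n'
    new_script += template % (job_ident, str(_key), str(value))

print(new_script)
# j.setCPUTime(3600)
# j.setDestination("SOME.Site.example")
# (order may vary with dict iteration order)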
def master_prepare(self, app, appconfig):
    '''Prepare the master job'''

    from pandatools import Client
    from taskbuffer.JobSpec import JobSpec
    from taskbuffer.FileSpec import FileSpec

    job = app._getParent()
    logger.debug('ExecutablePandaRTHandler master_prepare called for %s', job.getFQID('.'))

    # set chirp variables
    if configPanda['chirpconfig'] or configPanda['chirpserver']:
        setChirpVariables()

    # Pack inputsandbox
    inputsandbox = 'sources.%s.tar' % commands.getoutput('uuidgen 2> /dev/null')
    inpw = job.getInputWorkspace()
    # add user script to inputsandbox
    if hasattr(job.application.exe, "name"):
        if not job.application.exe in job.inputsandbox:
            job.inputsandbox.append(job.application.exe)

    for fname in [f.name for f in job.inputsandbox]:
        fname = fname.rstrip(os.sep)  # strip any trailing path separator
        path = fname[:fname.rfind(os.sep)]
        f = fname[fname.rfind(os.sep) + 1:]
        rc, output = commands.getstatusoutput('tar rf %s -C %s %s' % (inpw.getPath(inputsandbox), path, f))
        if rc:
            logger.error('Packing inputsandbox failed with status %d', rc)
            logger.error(output)
            raise ApplicationConfigurationError(None, 'Packing inputsandbox failed.')

    if len(job.inputsandbox) > 0:
        rc, output = commands.getstatusoutput('gzip %s' % (inpw.getPath(inputsandbox)))
        if rc:
            logger.error('Packing inputsandbox failed with status %d', rc)
            logger.error(output)
            raise ApplicationConfigurationError(None, 'Packing inputsandbox failed.')
        inputsandbox += ".gz"
    else:
        inputsandbox = None

    # Upload Inputsandbox
    if inputsandbox:
        logger.debug('Uploading source tarball ...')
        uploadSources(inpw.getPath(), os.path.basename(inputsandbox))
        self.inputsandbox = inputsandbox
    else:
        self.inputsandbox = None

    # input dataset
    if job.inputdata:
        if job.inputdata._name != 'DQ2Dataset':
            raise ApplicationConfigurationError(None, 'PANDA application supports only DQ2Datasets')

    # run brokerage here if not splitting
    if not job.splitter:
        from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
        runPandaBrokerage(job)
    elif job.splitter._name not in ['DQ2JobSplitter', 'ArgSplitter', 'ArgSplitterTask']:
        raise ApplicationConfigurationError(None, 'Panda splitter must be DQ2JobSplitter or ArgSplitter')

    if job.backend.site == 'AUTO':
        raise ApplicationConfigurationError(None, 'site is still AUTO after brokerage!')

    # output dataset
    if job.outputdata:
        if job.outputdata._name != 'DQ2OutputDataset':
            raise ApplicationConfigurationError(None, 'Panda backend supports only DQ2OutputDataset')
    else:
        logger.info('Adding missing DQ2OutputDataset')
        job.outputdata = DQ2OutputDataset()

    job.outputdata.datasetname, outlfn = dq2outputdatasetname(job.outputdata.datasetname, job.id,
                                                              job.outputdata.isGroupDS, job.outputdata.groupname)

    self.outDsLocation = Client.PandaSites[job.backend.site]['ddm']

    try:
        Client.addDataset(job.outputdata.datasetname, False, location=self.outDsLocation)
        logger.info('Output dataset %s registered at %s' % (job.outputdata.datasetname, self.outDsLocation))
        dq2_set_dataset_lifetime(job.outputdata.datasetname, location=self.outDsLocation)
    except exceptions.SystemExit:
        raise BackendError('Panda', 'Exception in Client.addDataset %s: %s %s'
                           % (job.outputdata.datasetname, sys.exc_info()[0], sys.exc_info()[1]))

    # handle the libds
    if job.backend.libds:
        self.libDataset = job.backend.libds
        self.fileBO = getLibFileSpecFromLibDS(self.libDataset)
        self.library = self.fileBO.lfn
    elif job.backend.bexec:
        self.libDataset = job.outputdata.datasetname + '.lib'
        self.library = '%s.tgz' % self.libDataset
        try:
            Client.addDataset(self.libDataset, False, location=self.outDsLocation)
            dq2_set_dataset_lifetime(self.libDataset, location=self.outDsLocation)
            logger.info('Lib dataset %s registered at %s' % (self.libDataset, self.outDsLocation))
        except exceptions.SystemExit:
            raise BackendError('Panda', 'Exception in Client.addDataset %s: %s %s'
                               % (self.libDataset, sys.exc_info()[0], sys.exc_info()[1]))

    # collect extOutFiles
    self.extOutFile = []
    for tmpName in job.outputdata.outputdata:
        if tmpName != '':
            self.extOutFile.append(tmpName)

    for tmpName in job.outputsandbox:
        if tmpName != '':
            self.extOutFile.append(tmpName)

    for tmpName in job.backend.extOutFile:
        if tmpName != '':
            self.extOutFile.append(tmpName)

    # create build job
    if job.backend.bexec != '':
        jspec = JobSpec()
        jspec.jobDefinitionID = job.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.transformation = '%s/buildGen-00-00-01' % Client.baseURLSUB
        if Client.isDQ2free(job.backend.site):
            jspec.destinationDBlock = '%s/%s' % (job.outputdata.datasetname, self.libDataset)
            jspec.destinationSE = 'local'
        else:
            jspec.destinationDBlock = self.libDataset
            jspec.destinationSE = job.backend.site
        jspec.prodSourceLabel = configPanda['prodSourceLabelBuild']
        jspec.processingType = configPanda['processingType']
        jspec.assignedPriority = configPanda['assignedPriorityBuild']
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        jspec.jobParameters = '-o %s' % (self.library)
        if self.inputsandbox:
            jspec.jobParameters += ' -i %s' % (self.inputsandbox)
        else:
            raise ApplicationConfigurationError(None, 'Executable on Panda with build job defined, but inputsandbox is empty !')
        matchURL = re.search('(http.*://[^/]+)/', Client.baseURLCSRVSSL)
        if matchURL:
            jspec.jobParameters += ' --sourceURL %s ' % matchURL.group(1)
        if job.backend.bexec != '':
            jspec.jobParameters += ' --bexec "%s" ' % urllib.quote(job.backend.bexec)
            jspec.jobParameters += ' -r %s ' % '.'

        fout = FileSpec()
        fout.lfn = self.library
        fout.type = 'output'
        fout.dataset = self.libDataset
        fout.destinationDBlock = self.libDataset
        jspec.addFile(fout)

        flog = FileSpec()
        flog.lfn = '%s.log.tgz' % self.libDataset
        flog.type = 'log'
        flog.dataset = self.libDataset
        flog.destinationDBlock = self.libDataset
        jspec.addFile(flog)
        return jspec
    else:
        return None
def parseResults(self):
    job = self.getJobObject()

    server = CRABServer()
    try:
        server.status(job)
        server.getOutput(job)
    except:
        logger.error('Could not get the output of the job.')
        # Let's not raise this yet (in case of a double call).
        # raise CRABServerError('Impossible to get the output of the job')

    workdir = job.inputdata.ui_working_dir
    index = int(job.id) + 1
    doc_path = '%s/res/crab_fjr_%d.xml' % (workdir, index)

    if not os.path.exists(doc_path):
        logger.error('FJR %s not found.' % (doc_path))
        return

    try:
        doc = parse(doc_path)
    except:
        logger.error("Could not parse document. File not present?")
        return

    status = doc.firstChild.getAttribute("Status")

    if status in ["Failed"]:
        self.postMortem(job)
        job.updateStatus('failed')
    elif status in ["Success"]:
        if job.status == 'submitting':
            job.updateStatus('submitted')
        job.updateStatus('completed')
    else:
        logger.warning("UNKNOWN PARSE STATUS: " + str(status))

    config = Config.getConfig('Metrics')
    location = config['location']
    if not os.path.exists(location):
        raise BackendError(0, "Metrics file %s doesn't exist." % (location))

    config = ConfigParser()
    config.read(location)

    # Iterate over all the sections
    SECTIONS = config.sections()
    if 'report' in SECTIONS:
        SECTIONS.remove('report')

    # Only five sections work here...
    for section in SECTIONS:
        if not job.backend.fjr.has_key(section):
            job.backend.fjr[section] = {}

        performancereport = doc.getElementsByTagName("PerformanceReport")[0]
        performancesummary = performancereport.getElementsByTagName("PerformanceSummary")
        for pfs in performancesummary:
            if pfs.getAttribute("Metric") == section:
                metrics = pfs.getElementsByTagName("Metric")
                for metric in metrics:
                    name = metric.getAttribute("Name")
                    if config.has_option(section, name):
                        # Due to the names with minus instead of underscore,
                        # we have to do this workaround to send them to the DB.
                        name = config.get(section, name)
                        if name:
                            job.backend.fjr[section][name] = metric.getAttribute("Value")
def master_updateMonitoringInformation(jobs):
    '''Monitor jobs'''
    from pandatools import Client

    #active_status = [ None, 'defined', 'unknown', 'assigned', 'waiting', 'activated', 'sent', 'starting', 'running', 'holding', 'transferring' ]
    submitting_status = []
    active_status = [None, 'registered', 'waiting', 'defined', 'pending', 'assigning', 'ready',
                     'scouting', 'running', 'holding', 'merging', 'prepared', 'aborting', 'finishing']

    inactive_status = ['finished', 'aborted', 'broken', 'failed', 'done']

    # Find jobs to be monitored
    jobdict = {}
    for job in jobs:
        # add a delay as Panda can be a little slow in sorting out a new Task
        if job.backend.id and job.backend.status in active_status and \
           ((datetime.datetime.utcnow() - job.time.timestamps["submitted"]).seconds > 120):
            jobdict[job.backend.id] = job

    logger.debug("jobdict = %s" % jobdict)

    # Monitor active Jedi tasks
    allJobIDs = jobdict.keys()
    pandaJobIDs = {}
    for jID in allJobIDs:
        status, jediTaskDict = Client.getJediTaskDetails({'jediTaskID': jID}, False, True, verbose=False)
        if status != 0:
            logger.error("Failed to get task details for %s" % jID)
            #raise BackendError('Jedi', 'Return code %d retrieving job status information.' % status)
            continue

        # Retrieve job
        job = jobdict[jediTaskDict['jediTaskID']]

        # Store associated Panda jobs
        if job.backend.pandajobs:
            pandaJobIDs[job.backend.id] = [pj.id for pj in job.backend.pandajobs]
        else:
            pandaJobIDs[jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
        logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

        # Fill the output data dataset list
        if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
            for ds in jediTaskDict['outDS'].split(','):
                if not ds in job.outputdata.datasetList:
                    job.outputdata.datasetList.append(ds)

        # Jedi job status has changed
        if job.backend.status != jediTaskDict['status']:
            logger.debug('Job %s has changed status from %s to %s',
                         job.getFQID('.'), job.backend.status, jediTaskDict['status'])
            job.backend.status = jediTaskDict['status']
            job.backend.reason = jediTaskDict['statistics']

            # Now update Jedi job status
            if jediTaskDict['status'] in ['registered', 'waiting', 'defined', 'pending', 'assigning', 'ready']:
                job.updateStatus('submitted')
            elif jediTaskDict['status'] in ['scouting', 'running', 'holding', 'merging', 'prepared']:
                job.updateStatus('running')
            elif jediTaskDict['status'] in ['done']:
                job.updateStatus('completed')
            elif jediTaskDict['status'] in ['failed', 'finished']:
                job.updateStatus('failed')
            elif jediTaskDict['status'] in ['aborted', 'broken', 'cancelled'] and job.status not in ['completed', 'failed']:
                job.updateStatus('killed')
            else:
                logger.warning('Unexpected Jedi task status %s', jediTaskDict['status'])

        # Check if associated Panda jobs exist and monitor them
        if not job.backend.pandajobs:
            jdefids = pandaJobIDs[jID]
            # skip if there are no Panda jobs yet
            if not jdefids:
                continue

            tot_num_mjobs = 0
            do_master_update = True

            ick, status, num_mjobs = retrievePandaJobs(job, jdefids)
            logger.debug('retrievePandaJobs returns: %s %s' % (repr(ick), status))
            if not ick:
                logger.debug('Panda job retrieval failure for Jedi task %s with PandaIds %s' % (job.backend.id, jdefids))
                do_master_update = False

            tot_num_mjobs += num_mjobs
            logger.debug('Job %s retrieved %d Panda jobs' % (job.getFQID('.'), tot_num_mjobs))

        # Now monitor the already attached Panda jobs
        else:
            jdefids = [pj.id for pj in job.backend.pandajobs]
            rc, jobsStatus = Client.getFullJobStatus(jdefids, False)
            if rc:
                logger.error('Return code %d retrieving job status information.', rc)
                raise BackendError('Jedi', 'Return code %d retrieving job status information.' % rc)

            for status in jobsStatus:
                if not status:
                    continue

                for pjob in job.backend.pandajobs:
                    if pjob.id == status.PandaID:
                        # skip if no status change
                        if pjob.status == status.jobStatus:
                            continue
                        # Else update job record
                        pjob.jobSpec = dict(zip(status._attributes, status.values()))

                        for k in pjob.jobSpec.keys():
                            if type(pjob.jobSpec[k]) not in [type(''), type(1)]:
                                pjob.jobSpec[k] = str(pjob.jobSpec[k])

                        logger.debug('Job %s with Panda job %s has changed status from %s to %s',
                                     job.getFQID('.'), pjob.id, pjob.status, status.jobStatus)

                        pjob.status = status.jobStatus
                        pjob.exitcode = str(status.transExitCode)
                        pjob.piloterrorcode = str(status.pilotErrorCode)
                        pjob.reason = ''
                        for k in pjob.jobSpec.keys():
                            if k.endswith('ErrorDiag') and pjob.jobSpec[k] != 'NULL':
                                pjob.reason += '%s: %s, ' % (k, str(pjob.jobSpec[k]))
                        #if job.backend.jobSpec['transExitCode'] != 'NULL':
                        pjob.reason += 'transExitCode: %s' % pjob.jobSpec['transExitCode']

                        if status.jobStatus in ['defined', 'unknown', 'assigned', 'waiting', 'activated', 'sent']:
                            logger.debug('Panda job %s %s' % (pjob.id, status.jobStatus))
                        elif status.jobStatus in ['starting', 'running', 'holding', 'transferring', 'merging']:
                            logger.debug('Panda job %s %s ' % (pjob.id, status.jobStatus))
                        elif status.jobStatus in ['finished']:
                            logger.debug('Panda job %s %s ' % (pjob.id, status.jobStatus))
                        elif status.jobStatus == 'failed':
                            logger.debug('Panda job %s %s ' % (pjob.id, status.jobStatus))
                            # check for server side retry
                            if 'taskBufferErrorDiag' in pjob.jobSpec and pjob.jobSpec['taskBufferErrorDiag'].find("PandaID=") != -1:
                                # grab the new panda ID
                                newPandaID = long(pjob.jobSpec['taskBufferErrorDiag'].split("=")[1])
                                pjob.id = newPandaID
                                pjob.status = None
                                pjob.url = 'http://panda.cern.ch/?job=%d' % newPandaID
                        elif status.jobStatus == 'cancelled' and pjob.status not in ['completed', 'failed']:  # bug 67716
                            logger.debug('Panda job %s cancelled' % pjob.id)
                            if 'taskBufferErrorDiag' in pjob.jobSpec and "rebrokerage" in pjob.jobSpec['taskBufferErrorDiag']:
                                newPandaID = checkForRebrokerage(pjob.jobSpec['taskBufferErrorDiag'])
                                logger.warning("Subjob rebrokered by Panda server. Job %d moved to %d." % (pjob.id, newPandaID))
                                pjob.id = newPandaID
                                pjob.status = None
                        else:
                            logger.warning('Unexpected job status %s', status.jobStatus)
def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
    """Prepare the specific aspect of each subjob.
       Returns: subjobconfig list of objects understood by backends."""

    from pandatools import Client
    from pandatools import AthenaUtils
    from taskbuffer.JobSpec import JobSpec
    from taskbuffer.FileSpec import FileSpec
    from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime
    from GangaPanda.Lib.Panda.Panda import refreshPandaSpecs

    # make sure we have the correct siteType
    refreshPandaSpecs()

    job = app._getParent()
    masterjob = job._getRoot()

    logger.debug('ProdTransPandaRTHandler prepare called for %s', job.getFQID('.'))

    job.backend.actualCE = job.backend.site
    job.backend.requirements.cloud = Client.PandaSites[job.backend.site]['cloud']

    # check that the site is in a submit-able status
    if not job.splitter or job.splitter._name != 'DQ2JobSplitter':
        allowed_sites = job.backend.list_ddm_sites()

    try:
        outDsLocation = Client.PandaSites[job.backend.site]['ddm']
        tmpDsExist = False
        if (configPanda['processingType'].startswith('gangarobot') or
                configPanda['processingType'].startswith('hammercloud')):
            #if Client.getDatasets(job.outputdata.datasetname):
            if getDatasets(job.outputdata.datasetname):
                tmpDsExist = True
                logger.info('Re-using output dataset %s' % job.outputdata.datasetname)
        if (not configPanda['specialHandling'] == 'ddm:rucio' and
                not configPanda['processingType'].startswith('gangarobot') and
                not configPanda['processingType'].startswith('hammercloud') and
                not configPanda['processingType'].startswith('rucio_test')):
            Client.addDataset(job.outputdata.datasetname, False, location=outDsLocation,
                              allowProdDisk=True, dsExist=tmpDsExist)
        logger.info('Output dataset %s registered at %s' % (job.outputdata.datasetname, outDsLocation))
        dq2_set_dataset_lifetime(job.outputdata.datasetname, outDsLocation)
    except exceptions.SystemExit:
        raise BackendError('Panda', 'Exception in adding dataset %s: %s %s'
                           % (job.outputdata.datasetname, sys.exc_info()[0], sys.exc_info()[1]))

    # JobSpec.
    jspec = JobSpec()
    jspec.currentPriority = app.priority
    jspec.jobDefinitionID = masterjob.id
    jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
    jspec.coreCount = app.core_count
    jspec.AtlasRelease = 'Atlas-%s' % app.atlas_release
    jspec.homepackage = app.home_package
    jspec.transformation = app.transformation
    jspec.destinationDBlock = job.outputdata.datasetname
    if job.outputdata.location:
        jspec.destinationSE = job.outputdata.location
    else:
        jspec.destinationSE = job.backend.site
    if job.inputdata:
        jspec.prodDBlock = job.inputdata.dataset[0]
    else:
        jspec.prodDBlock = 'NULL'
    if app.prod_source_label:
        jspec.prodSourceLabel = app.prod_source_label
    else:
        jspec.prodSourceLabel = configPanda['prodSourceLabelRun']
    jspec.processingType = configPanda['processingType']
    jspec.specialHandling = configPanda['specialHandling']
    jspec.computingSite = job.backend.site
    jspec.cloud = job.backend.requirements.cloud
    jspec.cmtConfig = app.atlas_cmtconfig

    if app.dbrelease == 'LATEST':
        try:
            latest_dbrelease = getLatestDBReleaseCaching()
        except:
            from pandatools import Client
            latest_dbrelease = Client.getLatestDBRelease()
        m = re.search('(.*):DBRelease-(.*)\.tar\.gz', latest_dbrelease)
        if m:
            self.dbrelease_dataset = m.group(1)
            self.dbrelease = m.group(2)
        else:
            raise ApplicationConfigurationError(None, "Error retrieving LATEST DBRelease. Try setting application.dbrelease manually.")
    else:
        self.dbrelease_dataset = app.dbrelease_dataset
        self.dbrelease = app.dbrelease

    jspec.jobParameters = app.job_parameters

    if self.dbrelease:
        if self.dbrelease == 'current':
            jspec.jobParameters += ' --DBRelease=current'
        else:
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
            else:
                jspec.jobParameters += ' DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
            dbspec = FileSpec()
            dbspec.lfn = 'DBRelease-%s.tar.gz' % self.dbrelease
            dbspec.dataset = self.dbrelease_dataset
            dbspec.prodDBlock = jspec.prodDBlock
            dbspec.type = 'input'
            jspec.addFile(dbspec)

    if job.inputdata:
        m = re.search('(.*)\.(.*)\.(.*)\.(.*)\.(.*)\.(.*)', job.inputdata.dataset[0])
        if not m:
            logger.error("Error retrieving run number from dataset name")
            #raise ApplicationConfigurationError(None, "Error retrieving run number from dataset name")
            runnumber = 105200
        else:
            runnumber = int(m.group(2))
        if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
            jspec.jobParameters += ' --runNumber %d' % runnumber
        else:
            jspec.jobParameters += ' RunNumber=%d' % runnumber

    # Output files.
    randomized_lfns = []
    ilfn = 0
    for lfn, lfntype in zip(app.output_files, app.output_type):
        ofspec = FileSpec()
        if app.randomize_lfns:
            randomized_lfn = lfn + ('.%s.%d.%s' % (job.backend.site, int(time.time()),
                                                   commands.getoutput('uuidgen 2> /dev/null')[:4]))
        else:
            randomized_lfn = lfn
        ofspec.lfn = randomized_lfn
        randomized_lfns.append(randomized_lfn)
        ofspec.destinationDBlock = jspec.destinationDBlock
        ofspec.destinationSE = jspec.destinationSE
        ofspec.dataset = jspec.destinationDBlock
        ofspec.type = 'output'
        jspec.addFile(ofspec)
        if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
            jspec.jobParameters += ' --output%sFile %s' % (lfntype, randomized_lfns[ilfn])
        else:
            jspec.jobParameters += ' output%sFile=%s' % (lfntype, randomized_lfns[ilfn])
        ilfn = ilfn + 1

    # Input files.
    if job.inputdata:
        for guid, lfn, size, checksum, scope in zip(job.inputdata.guids, job.inputdata.names,
                                                    job.inputdata.sizes, job.inputdata.checksums,
                                                    job.inputdata.scopes):
            ifspec = FileSpec()
            ifspec.lfn = lfn
            ifspec.GUID = guid
            ifspec.fsize = size
            ifspec.md5sum = checksum
            ifspec.scope = scope
            ifspec.dataset = jspec.prodDBlock
            ifspec.prodDBlock = jspec.prodDBlock
            ifspec.type = 'input'
            jspec.addFile(ifspec)
        if app.input_type:
            itype = app.input_type
        else:
            itype = m.group(5)
        if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
            jspec.jobParameters += ' --input%sFile %s' % (itype, ','.join(job.inputdata.names))
        else:
            jspec.jobParameters += ' input%sFile=%s' % (itype, ','.join(job.inputdata.names))

    # Log files.
    lfspec = FileSpec()
    lfspec.lfn = '%s.job.log.tgz' % jspec.jobName
    lfspec.destinationDBlock = jspec.destinationDBlock
    lfspec.destinationSE = jspec.destinationSE
    lfspec.dataset = jspec.destinationDBlock
    lfspec.type = 'log'
    jspec.addFile(lfspec)

    return jspec
def master_resubmit(self, jobs):
    '''Resubmit failed Jedi job'''
    from pandatools import Client

    jobIDs = {}
    for job in jobs:
        jobIDs[job.backend.id] = job

    allJobIDs = jobIDs.keys()
    pandaJobIDs = {}

    for jID in allJobIDs:
        status, jediTaskDict = Client.getJediTaskDetails({'jediTaskID': jID}, False, True, verbose=False)
        if status != 0:
            logger.error("Failed to get task details for %s" % jID)
            raise BackendError('Jedi', 'Return code %d retrieving job status information.' % status)

        # Retrieve job
        job = jobIDs[jediTaskDict['jediTaskID']]

        newJobsetID = -1  # get jobset
        retryJobs = []  # jspecs
        resubmittedJobs = []  # ganga jobs

        if jediTaskDict['status'] in ['failed', 'killed', 'cancelled', 'aborted', 'broken', 'finished']:
            retryJobs.append(job)
            resubmittedJobs.append(jID)
        #elif jediTaskDict['status'] == 'finished':
        #    pass
        else:
            logger.warning("Cannot resubmit. Jedi task %s is status %s." % (jID, jediTaskDict['status']))
            return False

        # submit
        if len(retryJobs) == 0:
            logger.warning("No failed jobs to resubmit")
            return False

        status, out = Client.retryTask(jID, verbose=False)
        if status != 0:
            logger.error(status)
            logger.error(out)
            logger.error("Failed to retry JobID=%s" % jID)
            return False
        tmpStat, tmpDiag = out
        if not tmpStat:
            logger.error(tmpDiag)
            logger.error("Failed to retry JobID=%s" % jID)
            return False
        logger.info(tmpDiag)

        job.backend.status = None
        job.backend.jobSpec = {}
        job.updateStatus('submitted')

    logger.info('Resubmission successful')
    return True