def master_resubmit(self,jobs):
    '''Resubmit failed Jedi jobs.

    Every job in ``jobs`` is indexed by ``job.backend.id`` (used as the Jedi
    task ID). For each task the details are fetched from the Panda server
    and, if the task is in a resubmittable state, ``Client.retryTask`` is
    called for it. Server calls are made inside
    ``inject_proxy(self.credential_requirements)``.

    Args:
        jobs: iterable of job objects exposing ``backend.id``,
            ``backend.status``, ``backend.jobSpec`` and ``updateStatus()``.

    Returns:
        True if every task was retried (also when ``jobs`` is empty).
        False as soon as one task is not in a resubmittable state or its
        retry request fails; tasks processed before that one stay
        resubmitted.

    Raises:
        BackendError: if the task details cannot be retrieved.
    '''
    from pandatools import Client

    # Task states from which a retry is attempted.
    resubmittable = ['failed', 'killed', 'cancelled', 'aborted', 'broken', 'finished']

    # Index the jobs by task ID; jobs sharing an ID collapse to the last one.
    jobIDs = {}
    for job in jobs:
        jobIDs[job.backend.id] = job

    for jID in jobIDs.keys():
        with inject_proxy(self.credential_requirements):
            status, jediTaskDict = Client.getJediTaskDetails({'jediTaskID': jID},False,True,verbose=False)
        if status != 0:
            logger.error("Failed to get task details for %s" % jID)
            raise BackendError('Jedi','Return code %d retrieving job status information.' % status)

        # Retrieve the job through the task ID reported back by the server.
        job = jobIDs[jediTaskDict['jediTaskID']]

        if jediTaskDict['status'] not in resubmittable:
            logger.warning("Cannot resubmit. Jedi task %s is status %s." %(jID, jediTaskDict['status'] ))
            return False

        # submit
        with inject_proxy(self.credential_requirements):
            status,out = Client.retryTask(jID, verbose=False)
        if status != 0:
            logger.error(status)
            logger.error(out)
            logger.error("Failed to retry JobID=%s" % jID)
            return False

        # The payload is a (success flag, diagnostic message) pair.
        tmpStat,tmpDiag = out
        if not tmpStat:
            logger.error(tmpDiag)
            logger.error("Failed to retry JobID=%s" % jID)
            return False
        logger.info(tmpDiag)

        # Clear the cached backend state so the task is picked up as new.
        job.backend.status = None
        job.backend.jobSpec = {}
        job.updateStatus('submitted')

    logger.info('Resubmission successful')
    return True
def _ELG_hasJediSupport(release_version) :
    """Tell whether a panda-client release is at least version 0.4.

    NOTE(review): assumes release_version is a dotted "major.minor.patch"
    string -- confirm against PandaToolsPkgInfo. The previous check looked
    only at the third character, which mis-reads versions such as "0.10.1"
    or "1.0.0".
    """
    try :
        fields = str(release_version).split('.')
        return (int(fields[0]), int(fields[1])) >= (0, 4)
    except (IndexError, ValueError) :
        # Unrecognised layout: keep the legacy third-element comparison.
        return int(float(release_version[2])) >= 4


def ELG_jediState(sample) :
    """Return the JEDI task status string for a sample.

    The task ID is read from the sample's "nc_jediTaskID" meta field and
    the task details are requested through pandatools.Client.

    Args:
        sample: object providing getMetaDouble(name, default) and name().

    Returns:
        The 'status' entry of the task details, or '' (after printing a
        message) when the installed panda-client is too old, the sample has
        no task ID of at least 100, or the server call reports a non-zero
        return code.
    """
    from pandatools import PandaToolsPkgInfo
    if not _ELG_hasJediSupport(PandaToolsPkgInfo.release_version) :
        print("Need prun with JEDI support, try:")
        print(" localSetupPandaClient currentJedi --noAthenaCheck")
        return ''

    jediTaskID = int(sample.getMetaDouble("nc_jediTaskID", 0))
    # IDs below 100 are treated as "no task attached" (missing meta gives 0).
    if jediTaskID < 100 :
        print("Sample " + sample.name() + " does not have a jediTaskID")
        return ''

    from pandatools import Client

    ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID}, False, True)
    if ret[0] != 0 :
        print("Problem checking status of task %s with id %s" % (sample.name(), jediTaskID))
        return ''

    return ret[1]['status']
def status(self,JobID,forceUpdate=False):
    """Return the local job record for JobID, refreshed from the server.

    The record is read from the local repository. Unless its dbStatus is
    'frozen' (and forceUpdate is false) it is refreshed: non-JEDI jobs via
    their PandaIDs, JEDI jobs via the task details. The result is then
    written back with PdbUtils.updateJobDB.

    Args:
        JobID: ID of the job in the local repository.
        forceUpdate: refresh even when the record is 'frozen'.

    Returns:
        The (possibly updated) job record, or None when the job is not
        found, a server query fails, merge-job status setting fails, or
        the local repository cannot be updated.
    """
    # get logger
    tmpLog = PLogger.getPandaLogger()
    # check proxy
    self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
        self.gridPassPhrase,
        False,
        self.verbose,
        useCache=True)
    # get job info from local repository
    job = self.getJobInfo(JobID)
    if job is None:
        # not found
        return None
    # nothing to refresh for a frozen record unless explicitly requested
    if job.dbStatus == 'frozen' and not forceUpdate:
        return job
    if job.isJEDI():
        tmpLog.info("Getting status for TaskID=%s ..." % job.jediTaskID)
        # get JEDI task
        status,jediTaskDict = Client.getJediTaskDetails(
            {'jediTaskID':job.jediTaskID},
            False,
            True,
            verbose=self.verbose)
        if status != 0:
            tmpLog.error("Failed to get task details for %s" % JobID)
            return None
        # convert JEDI task
        job = PdbUtils.convertJTtoD(jediTaskDict,job)
    else:
        tmpLog.info("Getting status for JobID=%s ..." % JobID)
        # get status from Panda server
        status,pandaIDstatus = Client.getPandIDsWithJobID(JobID,verbose=self.verbose)
        if status != 0:
            tmpLog.error("Failed to get status for ID=%s" % JobID)
            return None
        # get one job to set computingSite which may have changed due to rebrokerage
        pandaJob = None
        if pandaIDstatus:
            # only the smallest PandaID is needed, so no full sort
            status,tmpPandaJobs = Client.getFullJobStatus(
                [min(pandaIDstatus)],
                verbose=self.verbose)
            if status != 0:
                tmpLog.error("Failed to get PandaJobs for %s" % JobID)
                return None
            if tmpPandaJobs:
                pandaJob = tmpPandaJobs[0]
        # convert to local job spec
        job = PdbUtils.convertPtoD([],pandaIDstatus,job,pandaJobForSiteID=pandaJob)
        # check merge job generation
        if not self.setMergeJobStatus(job,forceUpdate):
            return None
    # update DB
    try:
        PdbUtils.updateJobDB(job,self.verbose)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit propagate
        tmpLog.error("Failed to update local repository for JobID=%s" % JobID)
        return None
    if job.isJEDI():
        tmpLog.info("Updated TaskID=%s ..." % job.jediTaskID)
    else:
        tmpLog.info("Updated JobID=%s" % JobID)
    # return
    return job
def master_updateMonitoringInformation(jobs):
    '''Monitor Jedi tasks and the Panda jobs attached to them.

    Jobs are selected for polling when they have a backend ID, their
    backend status is one of the active Jedi states (or None), and more
    than 120 seconds have passed since their "submitted" timestamp. For
    each selected task the details are fetched from the Panda server, the
    output dataset list is extended, the job status is updated on a Jedi
    status change, and the associated Panda jobs are either retrieved
    (first time) or refreshed (when already attached).

    Args:
        jobs: iterable of job objects with a Jedi backend.

    Raises:
        BackendError: if the status of already attached Panda jobs cannot
            be retrieved. A failure to fetch task details is only logged
            and that task is skipped.
    '''
    from pandatools import Client

    # Jedi task states that still need polling; None matches jobs whose
    # backend status has not been set.
    active_status = [ None, 'registered', 'waiting', 'defined', 'pending', 'assigning', 'ready', 'scouting', 'running', 'holding', 'merging', 'prepared', 'aborting', 'finishing' ]

    # Find jobs to be monitored
    jobdict = {}
    now = datetime.datetime.utcnow()
    for job in jobs:
        if not (job.backend.id and job.backend.status in active_status):
            continue
        # add a delay as Panda can be a little slow in sorting out a new Task.
        # timedelta.seconds alone ignores whole days, so the day part is
        # folded in to get the real elapsed time.
        waited = now - job.time.timestamps["submitted"]
        if waited.days * 86400 + waited.seconds > 120:
            jobdict[job.backend.id] = job

    logger.debug("jobdict = %s" %jobdict)

    # Monitor active Jedi tasks
    pandaJobIDs = {}
    for jID in jobdict.keys():
        status, jediTaskDict = Client.getJediTaskDetails({'jediTaskID': jID},False,True,verbose=False)
        if status != 0:
            # Best effort: report and carry on with the remaining tasks.
            logger.error("Failed to get task details for %s" % jID)
            continue

        # Retrieve job
        job = jobdict[jediTaskDict['jediTaskID']]

        # Store associated Panda jobs
        if job.backend.pandajobs:
            pandaJobIDs[job.backend.id] = [pj.id for pj in job.backend.pandajobs]
        else:
            pandaJobIDs[jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
        logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

        # Fill the output data dataset list
        if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
            for ds in jediTaskDict['outDS'].split(','):
                if ds not in job.outputdata.datasetList:
                    job.outputdata.datasetList.append(ds)

        # Jedi job status has changed
        taskStatus = jediTaskDict['status']
        if job.backend.status != taskStatus:
            logger.debug('Job %s has changed status from %s to %s',job.getFQID('.'),job.backend.status, taskStatus)
            job.backend.status = taskStatus
            job.backend.reason = jediTaskDict['statistics']

            # Now update Jedi job status
            if taskStatus in ['registered', 'waiting', 'defined', 'pending', 'assigning', 'ready']:
                job.updateStatus('submitted')
            elif taskStatus in ['scouting', 'running', 'holding', 'merging', 'prepared' ]:
                job.updateStatus('running')
            elif taskStatus in ['done']:
                job.updateStatus('completed')
            elif taskStatus in ['failed', 'finished']:
                job.updateStatus('failed')
            elif taskStatus in [ 'aborted', 'broken', 'cancelled' ] and job.status not in ['completed','failed']:
                job.updateStatus('killed')
            else:
                logger.warning('Unexpected Jedi task status %s', taskStatus)

        # Check if associated Panda job exist and monitor them
        if not job.backend.pandajobs:
            jdefids = pandaJobIDs[jID]
            # skip if there are no Panda jobs yet
            if not jdefids:
                continue

            ick,status,num_mjobs = retrievePandaJobs(job, jdefids)
            logger.debug('retrievePandaJobs returns: %s %s' % (repr(ick),status))
            if not ick:
                logger.debug('Panda job retrival failure for Jedi task %s with PandaIds %s' % (job.backend.id, jdefids))

            logger.debug('Job %s retrieved %d Panda jobs' % (job.getFQID('.'),num_mjobs) )
            continue

        # Now monitor the already attached Panda jobs
        jdefids = [ pj.id for pj in job.backend.pandajobs ]
        rc, jobsStatus = Client.getFullJobStatus(jdefids,False)
        if rc:
            logger.error('Return code %d retrieving job status information.',rc)
            raise BackendError('Jedi','Return code %d retrieving job status information.' % rc)

        for pstatus in jobsStatus:
            if not pstatus:
                continue

            for pjob in job.backend.pandajobs:
                if pjob.id != pstatus.PandaID:
                    continue
                # skip if no status change
                if pjob.status == pstatus.jobStatus:
                    continue

                # Else update job record
                pjob.jobSpec = dict(zip(pstatus._attributes,pstatus.values()))

                # Anything that is not exactly a str or an int is stored as text.
                for k in pjob.jobSpec.keys():
                    if type(pjob.jobSpec[k]) not in [type(''),type(1)]:
                        pjob.jobSpec[k]=str(pjob.jobSpec[k])

                logger.debug('Job %s with Panda job %s has changed status from %s to %s',job.getFQID('.'),pjob.id, pjob.status,pstatus.jobStatus)
                pjob.status = pstatus.jobStatus
                pjob.exitcode = str(pstatus.transExitCode)
                pjob.piloterrorcode = str(pstatus.pilotErrorCode)

                # Collect every non-NULL "*ErrorDiag" field into the reason text.
                pjob.reason = ''
                for k in pjob.jobSpec.keys():
                    if k.endswith('ErrorDiag') and pjob.jobSpec[k]!='NULL':
                        pjob.reason += '%s: %s, '%(k,str(pjob.jobSpec[k]))
                pjob.reason += 'transExitCode: %s'%pjob.jobSpec['transExitCode']

                if pstatus.jobStatus in ['defined','unknown','assigned','waiting','activated','sent']:
                    logger.debug('Panda job %s %s' % (pjob.id, pstatus.jobStatus))
                elif pstatus.jobStatus in ['starting','running','holding','transferring', 'merging']:
                    logger.debug('Panda job %s %s '% (pjob.id, pstatus.jobStatus))
                elif pstatus.jobStatus in ['finished']:
                    logger.debug('Panda job %s %s '% (pjob.id, pstatus.jobStatus))
                elif pstatus.jobStatus == 'failed':
                    logger.debug('Panda job %s %s '% (pjob.id, pstatus.jobStatus))
                    # check for server side retry
                    if 'taskBufferErrorDiag' in pjob.jobSpec and pjob.jobSpec['taskBufferErrorDiag'].find("PandaID=") != -1:
                        # grab the new panda ID and follow the retried job
                        newPandaID = long(pjob.jobSpec['taskBufferErrorDiag'].split("=")[1])
                        pjob.id = newPandaID
                        pjob.status = None
                        pjob.url = 'http://panda.cern.ch/?job=%d'%newPandaID
                # pjob.status was set to the new Panda status just above, so
                # the second test below always holds at this point.
                elif pstatus.jobStatus == 'cancelled' and pjob.status not in ['completed','failed']: # bug 67716
                    logger.debug('Panda job %s cancelled'%pjob.id)
                    if 'taskBufferErrorDiag' in pjob.jobSpec and "rebrokerage" in pjob.jobSpec['taskBufferErrorDiag']:
                        newPandaID = checkForRebrokerage(pjob.jobSpec['taskBufferErrorDiag'])
                        logger.warning("Subjob rebrokered by Panda server. Job %d moved to %d."%(pjob.id, newPandaID))
                        pjob.id = newPandaID
                        pjob.status = None
                else:
                    logger.warning('Unexpected job status %s',pstatus.jobStatus)
def master_resubmit(self, jobs):
    '''Resubmit failed Jedi jobs.

    Every job in ``jobs`` is indexed by ``job.backend.id`` (used as the Jedi
    task ID). For each task the details are fetched from the Panda server
    and, if the task is in a resubmittable state, ``Client.retryTask`` is
    called for it.

    Args:
        jobs: iterable of job objects exposing ``backend.id``,
            ``backend.status``, ``backend.jobSpec`` and ``updateStatus()``.

    Returns:
        True if every task was retried (also when ``jobs`` is empty).
        False as soon as one task is not in a resubmittable state or its
        retry request fails; tasks processed before that one stay
        resubmitted.

    Raises:
        BackendError: if the task details cannot be retrieved.
    '''
    from pandatools import Client

    # Task states from which a retry is attempted.
    resubmittable = [
        'failed', 'killed', 'cancelled', 'aborted', 'broken', 'finished'
    ]

    # Index the jobs by task ID; jobs sharing an ID collapse to the last one.
    jobIDs = {}
    for job in jobs:
        jobIDs[job.backend.id] = job

    for jID in jobIDs.keys():
        status, jediTaskDict = Client.getJediTaskDetails(
            {'jediTaskID': jID}, False, True, verbose=False)
        if status != 0:
            logger.error("Failed to get task details for %s" % jID)
            raise BackendError(
                'Jedi',
                'Return code %d retrieving job status information.' % status)

        # Retrieve the job through the task ID reported back by the server.
        job = jobIDs[jediTaskDict['jediTaskID']]

        if jediTaskDict['status'] not in resubmittable:
            logger.warning("Cannot resubmit. Jedi task %s is status %s." %
                           (jID, jediTaskDict['status']))
            return False

        # submit
        status, out = Client.retryTask(jID, verbose=False)
        if status != 0:
            logger.error(status)
            logger.error(out)
            logger.error("Failed to retry JobID=%s" % jID)
            return False

        # The payload is a (success flag, diagnostic message) pair.
        tmpStat, tmpDiag = out
        if not tmpStat:
            logger.error(tmpDiag)
            logger.error("Failed to retry JobID=%s" % jID)
            return False
        logger.info(tmpDiag)

        # Clear the cached backend state so the task is picked up as new.
        job.backend.status = None
        job.backend.jobSpec = {}
        job.updateStatus('submitted')

    logger.info('Resubmission successful')
    return True
def master_updateMonitoringInformation(jobs):
    '''Monitor Jedi tasks and the Panda jobs attached to them.

    Jobs are selected for polling when they have a backend ID, their
    backend status is one of the active Jedi states (or None), and more
    than 120 seconds have passed since their "submitted" timestamp. For
    each selected task the details are fetched from the Panda server, the
    output dataset list is extended, the job status is updated on a Jedi
    status change, and the associated Panda jobs are either retrieved
    (first time) or refreshed (when already attached).

    Args:
        jobs: iterable of job objects with a Jedi backend.

    Raises:
        BackendError: if the status of already attached Panda jobs cannot
            be retrieved. A failure to fetch task details is only logged
            and that task is skipped.
    '''
    from pandatools import Client

    # Jedi task states that still need polling; None matches jobs whose
    # backend status has not been set.
    active_status = [
        None, 'registered', 'waiting', 'defined', 'pending', 'assigning',
        'ready', 'scouting', 'running', 'holding', 'merging', 'prepared',
        'aborting', 'finishing'
    ]

    # Find jobs to be monitored
    jobdict = {}
    now = datetime.datetime.utcnow()
    for job in jobs:
        if not (job.backend.id and job.backend.status in active_status):
            continue
        # add a delay as Panda can be a little slow in sorting out a new Task.
        # timedelta.seconds alone ignores whole days, so the day part is
        # folded in to get the real elapsed time.
        waited = now - job.time.timestamps["submitted"]
        if waited.days * 86400 + waited.seconds > 120:
            jobdict[job.backend.id] = job

    logger.debug("jobdict = %s" % jobdict)

    # Monitor active Jedi tasks
    pandaJobIDs = {}
    for jID in jobdict.keys():
        status, jediTaskDict = Client.getJediTaskDetails(
            {'jediTaskID': jID}, False, True, verbose=False)
        if status != 0:
            # Best effort: report and carry on with the remaining tasks.
            logger.error("Failed to get task details for %s" % jID)
            continue

        # Retrieve job
        job = jobdict[jediTaskDict['jediTaskID']]

        # Store associated Panda jobs
        if job.backend.pandajobs:
            pandaJobIDs[job.backend.id] = [
                pj.id for pj in job.backend.pandajobs
            ]
        else:
            pandaJobIDs[
                jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
        logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

        # Fill the output data dataset list
        if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
            for ds in jediTaskDict['outDS'].split(','):
                if ds not in job.outputdata.datasetList:
                    job.outputdata.datasetList.append(ds)

        # Jedi job status has changed
        taskStatus = jediTaskDict['status']
        if job.backend.status != taskStatus:
            logger.debug('Job %s has changed status from %s to %s',
                         job.getFQID('.'), job.backend.status, taskStatus)
            job.backend.status = taskStatus
            job.backend.reason = jediTaskDict['statistics']

            # Now update Jedi job status
            if taskStatus in [
                    'registered', 'waiting', 'defined', 'pending',
                    'assigning', 'ready'
            ]:
                job.updateStatus('submitted')
            elif taskStatus in [
                    'scouting', 'running', 'holding', 'merging', 'prepared'
            ]:
                job.updateStatus('running')
            elif taskStatus in ['done']:
                job.updateStatus('completed')
            elif taskStatus in ['failed', 'finished']:
                job.updateStatus('failed')
            elif taskStatus in [
                    'aborted', 'broken', 'cancelled'
            ] and job.status not in ['completed', 'failed']:
                job.updateStatus('killed')
            else:
                logger.warning('Unexpected Jedi task status %s', taskStatus)

        # Check if associated Panda job exist and monitor them
        if not job.backend.pandajobs:
            jdefids = pandaJobIDs[jID]
            # skip if there are no Panda jobs yet
            if not jdefids:
                continue

            ick, status, num_mjobs = retrievePandaJobs(job, jdefids)
            logger.debug('retrievePandaJobs returns: %s %s' %
                         (repr(ick), status))
            if not ick:
                logger.debug(
                    'Panda job retrival failure for Jedi task %s with PandaIds %s'
                    % (job.backend.id, jdefids))

            logger.debug('Job %s retrieved %d Panda jobs' %
                         (job.getFQID('.'), num_mjobs))
            continue

        # Now monitor the already attached Panda jobs
        jdefids = [pj.id for pj in job.backend.pandajobs]
        rc, jobsStatus = Client.getFullJobStatus(jdefids, False)
        if rc:
            logger.error(
                'Return code %d retrieving job status information.', rc)
            raise BackendError(
                'Jedi',
                'Return code %d retrieving job status information.' % rc)

        for pstatus in jobsStatus:
            if not pstatus:
                continue

            for pjob in job.backend.pandajobs:
                if pjob.id != pstatus.PandaID:
                    continue
                # skip if no status change
                if pjob.status == pstatus.jobStatus:
                    continue

                # Else update job record
                pjob.jobSpec = dict(
                    zip(pstatus._attributes, pstatus.values()))

                # Anything that is not exactly a str or an int is stored as text.
                for k in pjob.jobSpec.keys():
                    if type(pjob.jobSpec[k]) not in [type(''), type(1)]:
                        pjob.jobSpec[k] = str(pjob.jobSpec[k])

                logger.debug(
                    'Job %s with Panda job %s has changed status from %s to %s',
                    job.getFQID('.'), pjob.id, pjob.status,
                    pstatus.jobStatus)
                pjob.status = pstatus.jobStatus
                pjob.exitcode = str(pstatus.transExitCode)
                pjob.piloterrorcode = str(pstatus.pilotErrorCode)

                # Collect every non-NULL "*ErrorDiag" field into the reason text.
                pjob.reason = ''
                for k in pjob.jobSpec.keys():
                    if k.endswith('ErrorDiag') and pjob.jobSpec[k] != 'NULL':
                        pjob.reason += '%s: %s, ' % (k, str(pjob.jobSpec[k]))
                pjob.reason += 'transExitCode: %s' % pjob.jobSpec[
                    'transExitCode']

                if pstatus.jobStatus in [
                        'defined', 'unknown', 'assigned', 'waiting',
                        'activated', 'sent'
                ]:
                    logger.debug('Panda job %s %s' %
                                 (pjob.id, pstatus.jobStatus))
                elif pstatus.jobStatus in [
                        'starting', 'running', 'holding', 'transferring',
                        'merging'
                ]:
                    logger.debug('Panda job %s %s ' %
                                 (pjob.id, pstatus.jobStatus))
                elif pstatus.jobStatus in ['finished']:
                    logger.debug('Panda job %s %s ' %
                                 (pjob.id, pstatus.jobStatus))
                elif pstatus.jobStatus == 'failed':
                    logger.debug('Panda job %s %s ' %
                                 (pjob.id, pstatus.jobStatus))
                    # check for server side retry
                    if 'taskBufferErrorDiag' in pjob.jobSpec and pjob.jobSpec[
                            'taskBufferErrorDiag'].find("PandaID=") != -1:
                        # grab the new panda ID and follow the retried job
                        newPandaID = long(
                            pjob.jobSpec['taskBufferErrorDiag'].split("=")[1])
                        pjob.id = newPandaID
                        pjob.status = None
                        pjob.url = 'http://panda.cern.ch/?job=%d' % newPandaID
                # pjob.status was set to the new Panda status just above, so
                # the second test below always holds at this point.
                elif pstatus.jobStatus == 'cancelled' and pjob.status not in [
                        'completed', 'failed'
                ]:  # bug 67716
                    logger.debug('Panda job %s cancelled' % pjob.id)
                    if 'taskBufferErrorDiag' in pjob.jobSpec and "rebrokerage" in pjob.jobSpec[
                            'taskBufferErrorDiag']:
                        newPandaID = checkForRebrokerage(
                            pjob.jobSpec['taskBufferErrorDiag'])
                        logger.warning(
                            "Subjob rebrokered by Panda server. Job %d moved to %d."
                            % (pjob.id, newPandaID))
                        pjob.id = newPandaID
                        pjob.status = None
                else:
                    logger.warning('Unexpected job status %s',
                                   pstatus.jobStatus)
if job_info is not None: # if job_info.Files and len(job_info.Files) > 0: print(job_info) print(job_info.attemptNr) print(job_info.maxAttempt) print(job_info.Files) print(job_info.Files[0]) for f in job_info.Files: # print(dir(f)) print(f._attributes) print(f.values()) print(f.type) jediTaskID = 3885 ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID}, True, True, verbose=False) print(ret) ret = Client.getTaskStatus(jediTaskID, verbose=False) print(ret) """ sys.exit(0) jediTaskID = 998 ret = Client.getPandaIDsWithTaskID(jediTaskID, verbose=False) # print(ret) jobids = ret[1] # print(jobids) ret = Client.getJobStatus(ids=jobids, verbose=False)