# Submit the multi-part job, then poll until it reports finished, resubmitting
# failed jobs on every pass and doing one extra restart that also covers
# waiting jobs.
print("Submitting jobs...")
multiPartJob.submit()

# Holds the return value of the one-off restart below; the restart is retried
# on each pass for as long as this stays falsy.
restarted = False

# Now we wait for all jobs to finish.
# TODO: we should probably test whether the job was successful as well.
while not multiPartJob.isFinished(True):
    # print some stats
    print(multiPartJob.getProgress())

    # Restart failed jobs on every pass. To resubmit *only* failed jobs, the
    # resubmission of waiting jobs (enabled by default) has to be switched
    # off on this policy; previously it was left on, so waiting jobs were
    # restarted on every pass as well.
    failedpolicy = DefaultResubmitPolicy()
    failedpolicy.setProperty(DefaultResubmitPolicy.RESTART_WAITING_JOBS, False)
    multiPartJob.restart(failedpolicy, True)

    # Restart once after the job submission is finished, to optimize the
    # distribution of jobs to queues where the job actually runs.
    if not restarted:
        # NOTE(review): it would probably be a good idea to refresh the job
        # status here (multiPartJob.refresh()), because otherwise this restart
        # may just pick up failed jobs that were already resubmitted by the
        # restart above -- not verified.
        # This might not work the first few times, because the batch job is
        # still being submitted in the background.
        print("trying to restarting job...")
        policy = DefaultResubmitPolicy()
        # Redundant, since this is the default anyway; set explicitly just to
        # demonstrate the property.
        policy.setProperty(DefaultResubmitPolicy.RESTART_WAITING_JOBS, True)
        restarted = multiPartJob.restart(policy, True)
# Add jobs numbered start .. end-1 to the batch job, each one running `cat` on
# the two input files, then start the newly added jobs and exit.
start = 30
end = 40

pathToInputFiles = batchJob.pathToInputFiles()
# No trailing whitespace inside the file names: a stray space would end up in
# the path and put a double space into the command line built below.
inputFile1relPath = pathToInputFiles + 'inputFile1.txt'
inputFile2relPath = pathToInputFiles + 'inputFile2.txt'

for i in range(start, end):
    # create a unique jobname for every job
    jobname = batchJobName + "_" + str(i)
    print('Creating job: ' + jobname)

    # create the single job
    job = JobObject(si)
    job.setJobname(jobname)
    # Better to set the application explicitly, because in that case we don't
    # need to use mds (faster).
    job.setApplication('UnixCommands')
    job.setCommandline('cat ' + inputFile1relPath + ' ' + inputFile2relPath)
    job.setWalltimeInSeconds(60)

    # add the job to the batch job
    batchJob.addJob(job)

# only start the newly added jobs and wait for the restart to finish
batchJob.restart(False, False, True, True)

# Don't forget to exit properly; this cleans up possibly existing
# threads/executors.
sys.exit()
si = LoginManager.loginCommandline() # load (but not refresh yet) batchjob, this might take a while batchJob = BatchJobObject(si, batchJobname, False) while not batchJob.isFinished(True) and False: print batchJob.getProgress() print str(batchJob.getNumberOfFailedJobs()) if batchJob.getNumberOfFailedJobs() > 0: print str(batchJob.getNumberOfFailedJobs()) + ' failed jobs found. restarting...' failedpolicy = DefaultResubmitPolicy() batchJob.restart(failedpolicy, True) print 'Restart finished.' time.sleep(5) jobsToRestart = [] for job in batchJob.getJobs(): print "Job: "+job.getJobname()+", Status: "+job.getStatusString(False) try: output = job.getStdOutContent() index = output.find('error') if index != -1: # it doesn't actually make any sense to restart this job, since it would # obviously have the same result again. This is just to demonstrate how to parse