# Script prelude for working on an existing grisu batch job: logs in on the
# command line, loads the batch job named by the first command-line argument
# and contains a (currently disabled) loop that resubmits failed jobs.
import sys
import time

from grisu.control import DefaultResubmitPolicy
from grisu.frontend.control.login import LoginManager
from grisu.frontend.model.job import BatchJobObject
from grisu.frontend.model.job import JobException

# Fail with a readable message instead of an IndexError traceback when the
# batch job name is missing. Checked before the login so no session is
# opened needlessly.
if len(sys.argv) < 2:
    sys.exit("Usage: " + sys.argv[0] + " <batchjob-name>")

batchJobname = sys.argv[1]

si = LoginManager.loginCommandline()

# load (but not refresh yet) batchjob, this might take a while
batchJob = BatchJobObject(si, batchJobname, False)

# NOTE(review): the trailing "and False" means the body of this loop never
# runs; only batchJob.isFinished(True) is evaluated, once. This looks like a
# deliberate switch-off of the polling/restart loop -- confirm before
# removing it.
while not batchJob.isFinished(True) and False:
    print(batchJob.getProgress())

    # Read the failure count once so the value printed and the value tested
    # are the same.
    failedCount = batchJob.getNumberOfFailedJobs()
    print(str(failedCount))

    if failedCount > 0:
        print(str(failedCount) + ' failed jobs found. restarting...')
        failedpolicy = DefaultResubmitPolicy()
        batchJob.restart(failedpolicy, True)
        print('Restart finished.')

    # Pause between two status polls.
    time.sleep(5)

jobsToRestart = []
print "Job: " + job.getJobname() + ", Error: " + error.getFailures( ).get(job).getLocalizedMessage() sys.exit() print "Job distribution:" for subLoc in batch_job.getOptimizationResult().keySet(): print subLoc + " : " + batch_job.getOptimizationResult().get(subLoc) print "Submitting jobs..." batch_job.submit() restarted = False # now we wait for all jobs to finish. Actually, we probably should test whether the job was successful as well... while not batch_job.isFinished(True): # printing some stats print batch_job.getProgress() # restart failed jobs everytime failedpolicy = DefaultResubmitPolicy() # to only resubmit failed jobs, we have to remove the waiting jobs resubmission that is set by default batch_job.restart(failedpolicy, True) # restart once after the jobsubmission is finished to optimize job distributions to queues where the job actually runs if not restarted: # actually, it probably would be a good idea to refresh the job status here because otherwise the restart will just # restart failed jobs that were already submitted with the restart above... not really sure... #multiPartJob.refresh()
except (BackendException), error: print ("HALT: Exception from grisu backend " + backend + "!") print (error.getLocalizedMessage()) print ("========================") time.sleep(3) error.printStackTrace() sys.exit(1) time.sleep(3) print "INFO: Submitting jobs in batch " + batch_jobs.getJobname() batch_jobs.submit() restarted = False print "INFO: Waiting for batch " + batch_jobs.getJobname() + " to finish" while not batch_jobs.isFinished(True): print "\rWAITING: Running " + str(job_count) + " jobs:", print " Waiting [" + str(batch_jobs.getNumberOfWaitingJobs()) + "]", print " Active [" + str(batch_jobs.getNumberOfRunningJobs()) + "]", print " Successful [" + str(batch_jobs.getNumberOfSuccessfulJobs()) + "]", print " Failed [" + str(batch_jobs.getNumberOfFailedJobs()) + "]", time.sleep(3) # Refresh status one last time print "\rWAITING: Running " + str(job_count) + " jobs:", print " Waiting [" + str(batch_jobs.getNumberOfWaitingJobs()) + "]", print " Active [" + str(batch_jobs.getNumberOfRunningJobs()) + "]", print " Successful [" + str(batch_jobs.getNumberOfSuccessfulJobs()) + "]", print " Failed [" + str(batch_jobs.getNumberOfFailedJobs()) + "]" print "INFO: batch jobs in " + batch_jobs.getJobname() + " finished."