Example #1
0
from grisu.control import DefaultResubmitPolicy
from grisu.frontend.control.login import LoginManager
from grisu.frontend.model.job import BatchJobObject
from grisu.frontend.model.job import JobException
import sys
import time

# Name of the batch job to monitor, taken from the first command-line argument.
batchJobname  =  sys.argv[1]

# Session object returned by LoginManager.loginCommandline(); it is handed to
# BatchJobObject below.
si = LoginManager.loginCommandline()

# load (but not refresh yet) batchjob, this might take a while
batchJob = BatchJobObject(si, batchJobname, False)

# NOTE(review): the trailing "and False" makes this condition always false, so
# the loop body below never executes (isFinished(True) is still evaluated
# once). This looks like the monitor/restart loop was switched off on
# purpose -- confirm before removing the "and False".
while not batchJob.isFinished(True) and False:
    
    # Report the current progress of the batch job.
    print batchJob.getProgress()
    
    # Report how many jobs are currently counted as failed.
    print str(batchJob.getNumberOfFailedJobs())
    
    if batchJob.getNumberOfFailedJobs() > 0:
        
        # At least one job failed: restart using a DefaultResubmitPolicy.
        print str(batchJob.getNumberOfFailedJobs()) + ' failed jobs found. restarting...'
        failedpolicy = DefaultResubmitPolicy()
        batchJob.restart(failedpolicy, True)
        print 'Restart finished.'
        
    # Pause for five seconds before polling the batch job again.
    time.sleep(5)

jobsToRestart = []
Example #2
0
        print "Job: " + job.getJobname() + ", Error: " + error.getFailures(
        ).get(job).getLocalizedMessage()

    sys.exit()

print "Job distribution:"
for subLoc in batch_job.getOptimizationResult().keySet():
    print subLoc + " : " + batch_job.getOptimizationResult().get(subLoc)

print "Submitting jobs..."
batch_job.submit()

restarted = False

# now we wait for all jobs to finish. Actually, we probably should test whether the job was successful as well...
while not batch_job.isFinished(True):
    # printing some stats
    print batch_job.getProgress()

    # restart failed jobs every time
    failedpolicy = DefaultResubmitPolicy()
    # to only resubmit failed jobs, we have to remove the waiting jobs resubmission that is set by default
    batch_job.restart(failedpolicy, True)

    # restart once after the job submission is finished to optimize job distributions to queues where the job actually runs
    if not restarted:

        # actually, it probably would be a good idea to refresh the job status here because otherwise the restart will just
        # restart failed jobs that were already submitted with the restart above...  not really sure...
        #multiPartJob.refresh()
except (BackendException), error:
    print ("HALT: Exception from grisu backend " + backend + "!")
    print (error.getLocalizedMessage())
    print ("========================")
    time.sleep(3)
    error.printStackTrace()
    sys.exit(1)
time.sleep(3)

print "INFO: Submitting jobs in batch " + batch_jobs.getJobname()
batch_jobs.submit()

restarted = False

print "INFO: Waiting for batch " + batch_jobs.getJobname() + " to finish"
while not batch_jobs.isFinished(True):
    print "\rWAITING: Running " + str(job_count) + " jobs:",
    print " Waiting [" + str(batch_jobs.getNumberOfWaitingJobs()) + "]",
    print " Active [" + str(batch_jobs.getNumberOfRunningJobs()) + "]",
    print " Successful [" + str(batch_jobs.getNumberOfSuccessfulJobs()) + "]",
    print " Failed [" + str(batch_jobs.getNumberOfFailedJobs()) + "]",
    time.sleep(3)

# Refresh status one last time
print "\rWAITING: Running " + str(job_count) + " jobs:",
print " Waiting [" + str(batch_jobs.getNumberOfWaitingJobs()) + "]",
print " Active [" + str(batch_jobs.getNumberOfRunningJobs()) + "]",
print " Successful [" + str(batch_jobs.getNumberOfSuccessfulJobs()) + "]",
print " Failed [" + str(batch_jobs.getNumberOfFailedJobs()) + "]"

print "INFO: batch jobs in " + batch_jobs.getJobname() + " finished."