def blockResult(self, modelRun, jobMI):
    # TODO: check jobMI is of type MPI ...
    maxRunTime = modelRun.jobParams['maxRunTime']
    pollInterval = modelRun.jobParams['pollInterval']
    procHandle = jobMI.procHandle
    # Navigate to the model's base directory.
    startDir = os.getcwd()
    if modelRun.basePath != startDir:
        print "Changing to ModelRun's specified base path '%s'" % \
            (modelRun.basePath)
        os.chdir(modelRun.basePath)
    if maxRunTime is None or maxRunTime <= 0:
        timeOut = False
        retCode = procHandle.wait()
    else:
        if pollInterval > maxRunTime:
            pollInterval = maxRunTime
        totalTime = 0
        timeOut = True
        while totalTime <= maxRunTime:
            # Note: the strategy in this loop means 'totalTime' recorded
            # here is only as accurate as the size of pollInterval, so
            # this is a fall-back for recording the time taken.
            time.sleep(pollInterval)
            totalTime += pollInterval
            retCode = procHandle.poll()
            if retCode is not None:
                timeOut = False
                break
        if timeOut:
            # At this point, we know the process has run too long.
            # From Python 2.6, change this to procHandle.kill().
            print "Error: passed timeout of %s, sending quit signal." % \
                (str(timedelta(seconds=maxRunTime)))
            os.kill(procHandle.pid, signal.SIGQUIT)
    # TODO: set finishTime
    # Check status of the run (e.g. error status).
    stdOutFilename = modelRun.getStdOutFilename()
    stdErrFilename = modelRun.getStdErrFilename()
    if timeOut:
        raise ModelRunTimeoutError(modelRun.name, stdOutFilename,
            stdErrFilename, maxRunTime)
    if retCode != 0:
        raise ModelRunRegularError(modelRun.name, retCode,
            stdOutFilename, stdErrFilename)
    else:
        # Taking advantage of os.path.join's behaviour: if a later
        # component is an absolute path, it overrides earlier ones.
        absOutPath = os.path.join(modelRun.basePath, modelRun.outputPath)
        absLogPath = os.path.join(modelRun.basePath, modelRun.logPath)
        print "Model ran successfully (output saved to path %s" % \
            (absOutPath),
        if absLogPath != absOutPath:
            print ", std out & std error to %s" % (absLogPath),
        print ")."
    # Now tidy things up after the run.
    jobMI.stdOutFile.close()
    jobMI.stdErrFile.close()
    print "Doing post-run tidyup:"
    modelRun.postRunCleanup()
    # Construct a ModelResult.
    mResult = ModelResult(modelRun.name, absOutPath)
    mResult.jobMetaInfo = jobMI
    try:
        # TODO: the below should be a standard method of ModelResult.
        tSteps, simTime = getSimInfoFromFreqOutput(mResult.outputPath)
    except ValueError:
        # For now, allow runs that didn't create a freq output.
        tSteps, simTime = None, None
    # Now collect profiler performance info.
    for profiler in self.profilers:
        profiler.attachPerformanceInfo(jobMI, mResult)
    if modelRun.basePath != startDir:
        print "Restoring initial path '%s'" % (startDir)
        os.chdir(startDir)
    return mResult
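
# The TODO in the timeout branch above notes that from Python 2.6 onwards,
# os.kill(procHandle.pid, signal.SIGQUIT) can be replaced with Popen.kill().
# A minimal standalone sketch of that poll-and-kill pattern follows; the
# helper name 'waitWithTimeout' and its signature are illustrative
# assumptions, not part of this runner's API.

import subprocess
import time

def waitWithTimeout(procHandle, maxRunTime, pollInterval):
    """Poll procHandle until it exits or maxRunTime seconds have elapsed.
    Returns (retCode, timedOut); retCode is None if the run timed out."""
    totalTime = 0
    while totalTime <= maxRunTime:
        time.sleep(pollInterval)
        totalTime += pollInterval
        retCode = procHandle.poll()
        if retCode is not None:
            return retCode, False
    # Python 2.6+: kill() sends SIGKILL (terminate() sends SIGTERM),
    # avoiding the direct os.kill on procHandle.pid used above.
    procHandle.kill()
    procHandle.wait()
    return None, True

# Example usage (hypothetical command line):
#   proc = subprocess.Popen(["mpirun", "-np", "2", "./model"])
#   retCode, timedOut = waitWithTimeout(proc, maxRunTime=3600, pollInterval=10)
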
def blockResult(self, modelRun, jobMetaInfo):
    # TODO: check jobMetaInfo is of type PBS, via self.runType = "PBS".
    startDir = os.getcwd()
    if modelRun.basePath != startDir:
        print "Changing to ModelRun's specified base path '%s'" % \
            (modelRun.basePath)
        os.chdir(modelRun.basePath)
    jobID = jobMetaInfo.jobId
    pollInterval = modelRun.jobParams['pollInterval']
    # NB: unlike the MPI Job Runner, we don't check 'maxRunTime' here,
    # since that was already encoded in the PBS walltime used. Wait as
    # long as necessary for the job to be queued, run, and completed in
    # the PBS system.
    pbsWaitTime = 0
    gotResult = False
    pbsError = False
    while not gotResult:
        time.sleep(pollInterval)
        pbsWaitTime += pollInterval
        # Check the PBS job's status (e.g. using qstat on the job ID).
        qstat = os.popen("qstat " + jobID).readlines()
        qstatus = "%s" % (qstat)
        # When the job has been submitted and we query the job ID, we
        # should get something like the following.
        # If the job has ended:
        #   qstat: Unknown Job Id 3506.tweedle
        # or:
        #   3505.tweedle cratonic30t2c3d2 WendySharples 00:15:16 E batch
        # If the job has not commenced running, or is still running:
        #   3505.tweedle cratonic30t2c3d2 WendySharples 00:15:16 Q batch
        #   3505.tweedle cratonic30t2c3d2 WendySharples 00:15:16 R batch
        #   3505.tweedle cratonic30t2c3d2 WendySharples 00:15:16 S batch
        # If the job has not been able to be run:
        #   3505.tweedle cratonic30t2c3d2 WendySharples 00:15:16 C batch
        # So break the output up into an array of words separated by
        # spaces, and scan for the status field.
        # NB: jobName and modelName MUST be the same.
        qstatus = qstatus.split(" ")
        for ii in range(len(qstatus)):
            if qstatus[ii] == "Unknown":
                print "job has already run\n"
                gotResult = True
            elif qstatus[ii] == "R":
                print "job is still running\n"
            elif qstatus[ii] == "Q":
                print "job is queued\n"
            elif qstatus[ii] == "C":
                print "job is cancelled\n"
                gotResult = True
                pbsError = True
            elif qstatus[ii] == "E":
                print "job has ended\n"
                gotResult = True
    # Check status of the run (e.g. error status).
    # TODO: archive the PBS file in the modelRun output directory.
    # TODO: connect/copy PBS stdout/error files to standard expected names.
    stdOutFilename = modelRun.getStdOutFilename()
    stdErrFilename = modelRun.getStdErrFilename()
    #qdel = os.popen("qdel " + jobID).readlines()
    if pbsError:
        raise ModelRunRegularError(modelRun.name, -1, stdOutFilename,
            stdErrFilename)
    else:
        absOutPath = os.path.join(modelRun.basePath, modelRun.outputPath)
        absLogPath = os.path.join(modelRun.basePath, modelRun.logPath)
        # TODO: move and rename the output and error files created by PBS
        #  to stdOutFilename and stdErrFilename.
        # Check the PBS output file and make sure there's something in it.
        jobName = "%s" % (modelRun.name)
        jobNo = jobID.split(".")[0]
        fileName = jobName + ".o" + jobNo
        f = open(fileName, 'r')
        lines = f.read()
        f.close()
        if lines == "":
            print "error: no output obtained in PBS output file\n"
            raise ModelRunRegularError(modelRun.name, -1, stdOutFilename,
                stdErrFilename)
        else:
            print "Model ran successfully (output saved to %s, std out"\
                " & std error to %s)." % (absOutPath, absLogPath)
    print "Doing post-run tidyup:"
    modelRun.postRunCleanup()
    # Construct a ModelResult.
    mResult = ModelResult(modelRun.name, absOutPath)
    # Now attach appropriate job meta info.
    try:
        # TODO: the below should be a standard method of ModelResult.
        tSteps, simTime = getSimInfoFromFreqOutput(modelRun.outputPath)
    except ValueError:
        # For now, allow runs that didn't create a freq output.
        tSteps, simTime = None, None
    # Perhaps functions on jobMetaInfo?
    #  - get provenance info, attach provenance info
    #  - get performance info, attach performance info
    mResult.jobMetaInfo = jobMetaInfo
    # Navigate back to the initial directory.
    if modelRun.basePath != startDir:
        print "Restoring initial path '%s'" % (startDir)
        os.chdir(startDir)
    return mResult
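
# The token scan in the loop above can match a status letter anywhere in
# the flattened qstat output. A more targeted sketch is below: it reads
# the job-state column of the line for this job ID instead. The helper
# name 'getPBSJobState' is an illustrative assumption, and it presumes
# the default qstat column layout (Job id, Name, User, Time Use, S,
# Queue), which may differ between PBS implementations.

import subprocess

def getPBSJobState(jobID):
    """Return the PBS job state letter (e.g. 'Q', 'R', 'E', 'C'),
    or None if qstat no longer knows the job ID."""
    proc = subprocess.Popen(["qstat", jobID],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if "Unknown Job Id" in out or "Unknown Job Id" in err:
        return None
    jobNo = jobID.split(".")[0]
    for line in out.splitlines():
        if line.startswith(jobNo):
            # State is the second-to-last column in default qstat output.
            return line.split()[-2]
    return None
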