def howManyJobs(eM, tableId, jobInt): tableStr = str(tableId) tableInt = int(tableId) eM = jobDB.JobState(tableInt) tableStr = str(tableId) t0 = eM.queryState(tableStr + 'NumJobs') if t0 == None: t0 = 0 else: t0 = int(t0) print 'howManyJobs: Current num: ', t0 return t0
from lsst.sims.catalogs.generation.db import jobDB import sys eM = jobDB.JobState(sys.argv[1]) eM.updateState(sys.argv[1] + 'blah', 'blah') eM.showStates() t0 = eM.queryState(sys.argv[1] + 'NumJobs') print t0
possibleTestModeStr = ' testMode' print '>>> Using test mode.' obsList = [] f = open('inList.txt', 'r') for line in f: t0 = line.split() if len(t0) != 1: continue obsList.append(t0[0]) f.close() print 'obsList:', obsList numJobs = len(obsList) executionDBManager = jobDB.JobState() t0 = executionDBManager.getJobId() nFN = '%s_%s' % (t0.getOwner(), t0.getId()) print 'Using job ID: %s' % nFN for i in range(len(obsList)): jobId = '%s_%i' % (nFN, i) throttleUtils.throttle(executionDBManager, maxNumJobs, waitTime) # For now, call addJob() before actually starting the job, # because there could be a race condition if addJob() # and removeJob() are called simultaneously. t0 = int(time.time()) t1 = '%s_%i' % (obsList[i], t0) throttleUtils.addJob(executionDBManager, jobId, t1)
if __name__ == '__main__': print 'Registering signal handler.' signal.signal(signal.SIGTERM, signalHandler) print 'Started with args:' for i in range(1, len(sys.argv)): print sys.argv[i] testMode = False if sys.argv[-1].lower() == 'testmode': testMode = True t0 = sys.argv[1].split('_') jobId = jobDB.JobId(int(t0[1]), owner=t0[0]) d = jobDB.JobState(jobId) procId = sys.argv[2] obsId = sys.argv[3] rad = sys.argv[4] startTime = time.time() t0 = int(startTime) d.updateState(procId, 'JobRunning_%s_%i' % (obsId, t0)) print 'Update state: %s to JobRunning_%s_%i' % (procId, obsId, t0) #HACK this should be changed to just call the catalog generation classes #Rob says this may be an issue because he checks the error code on exit. if testMode == False: t0 = 'python $CATALOGS_GENERATION_DIR/bin/runFiles.py %s %s' t1 = t0 % (obsId, rad) else: t0 = None t1 = 'python $CATALOGS_GENERATION_DIR/bin/fakeRunFiles.py'
def getCopyDBM(iD):
    """Return a new jobDB.JobState constructed from iD."""
    return jobDB.JobState(iD)
print 'Waking to check again.' else: done = True if not len(sys.argv) == 4: print "usage: %python myJobMonitor.py tableId state jobId" quit() tableId = sys.argv[1] state = sys.argv[2] jobId = sys.argv[3] if state == 'qsubbed': tableId = int(tableId) eM = jobDB.JobState(tableId) qsubJob(eM, tableId, jobId) if state == 'running': tableId = int(tableId) eM = jobDB.JobState(tableId) jobRunning(eM, tableId, jobId) if state == 'finished': #jid = jobDB.JobId(id, owner) tableId = int(tableId) eM = jobDB.JobState(tableId) jobFinished(eM, tableId, jobId) if state == 'howmany': tableId = int(tableId)
# Two-character raft code -> "x,y" string.
raftmap = {
    "01": "0,1", "02": "0,2", "03": "0,3",
    "10": "1,0", "11": "1,1", "12": "1,2", "13": "1,3", "14": "1,4",
    "20": "2,0", "21": "2,1", "22": "2,2", "23": "2,3", "24": "2,4",
    "30": "3,0", "31": "3,1", "32": "3,2", "33": "3,3", "34": "3,4",
    "41": "4,1", "42": "4,2", "43": "4,3",
}

# Two-character sensor code -> "x,y" string.
sensormap = {
    "00": "0,0", "01": "0,1", "02": "0,2",
    "10": "1,0", "11": "1,1", "12": "1,2",
    "20": "2,0", "21": "2,1", "22": "2,2",
}

# constructed to have the form "R:rx,ry S:sx,sy:snap"
# which is how the fpaFig.map keys are constructed
sensorid = "".join(["R:", raftmap[rxry], " ", "S:", sensormap[sxsy], ":", ex])

# One JobState per (obshistid, username) job id.
jobid = jobDB.JobId(id=obshistid, owner=username)
jobStr = str(jobid)
eM = jobDB.JobState(jobid=jobid)
stateKey = jobStr + ('_%s' % sensorid) + '_JS'

# Hand off to the handler that matches the requested state; any other
# state value is ignored.
if state == 'qsubbed':
    qsubJob(eM, sensorid, jobStr)
elif state == 'running':
    jobRunning(eM, sensorid, jobStr)
elif state == 'finished':
    jobFinished(eM, sensorid, jobStr)
elif state == 'error':
    jobError(eM, sensorid, jobStr)
def doOneCatalogType(self, catalogType, queryTypes, obsHistID): #nFN = self.getNextGoodFileNum() fullTimeStart = time.time() self.executionDBManager = jobDB.JobState() t0 = self.executionDBManager.getJobId() nFN = '%s_%s' % (t0.getOwner(), t0.getId()) print 'Using job ID: %s' % nFN print 'queryTypes:', queryTypes jobNum = 0 jobTypes = []; jobNums = []; jobPickleFiles = []; useTypes = [] allOutputFiles = []; curMD = None self.metaDataManager.reset() os.system('free -m') for objectType in queryTypes: if objectType not in useTypes: useTypes.append(objectType) print 'Getting first %s instance catalog of size %i...' % ( objectType, self.chunkSize) t0 = time.time() myQDB = queryDB.queryDB( chunksize=self.chunkSize, objtype=objectType) print ' ...setting up QDB took %i sec.' % (time.time() - t0) t0 = time.time() instanceCat = myQDB.getInstanceCatalogById(obsHistID) print ' ...and getting catalog took %i sec.' % (time.time() - t0) numCats = 0 if instanceCat != None: # This code adds some needed fields to the metadata mUtils.trimGeneration.derivedTrimMetadata(instanceCat) os.system('free -m') # Deep copy so we can store this after instanceCat disappears if curMD == None: curMD = copy.deepcopy(instanceCat.metadata) else: curMD.mergeMetadata(instanceCat.metadata) while instanceCat != None: t0 = self.WorkDir + 'catData%s_%i.ja' % (nFN, jobNum) t1 = self.WorkDir + 'catData%s_%i.p' % (nFN, jobNum) print 'Now pickling query type: %s' % objectType # Store job data files in instance time0 = time.time() instanceCat.jobAllocatorDataFile = t0 allOutputFiles.append(t0) # Order is important instanceCat.jobAllocatorCatalogType = catalogType instanceCat.jobAllocatorObjectType = objectType cPickle.dump(instanceCat, open(t1, 'w')) print ' ...pickling took %i sec.' % (time.time() - time0) jobTypes.append(catalogType) jobNums.append(jobNum) jobPickleFiles.append(t1) jobNum += 1 if numCats > 0: curMD.mergeMetadata(instanceCat.metadata) # *** RRG: Free up memory somehow here for instanceCat... 
del(instanceCat); instanceCat = None os.system('free -m') if self.maxCats >= 0 and (numCats + 1) >= self.maxCats: instanceCat = None else: print 'Querying DB for next chunk.' t0 = time.time() instanceCat = myQDB.getNextChunk() print ' ...took %i sec.' % (time.time() - t0) if instanceCat != None: # This code adds some needed fields to the metadata mUtils.trimGeneration.derivedTrimMetadata(instanceCat) os.system('free -m') numCats += 1 # RRG: For now this must be disabled #curMD.validateMetadata(catalogType, myQDB.opsim) mFName = self.WorkDir + 'metaData%s_%s.ja' % (nFN, catalogType) curMD.writeMetadata(mFName, catalogType, myQDB.opsim, newfile=True) # Finished with queryDB; clean up nicely. myQDB.closeSession() # For debug mode, don't start the clients if self.QueryOnly == True: print 'Full time for this file: %i sec' % (time.time()-fullTimeStart) print 'DEBUG: Finished, no client processes started.' # Now fire off the jobs for i in range(len(jobNums)): jobId = '%s_%i' % (nFN, jobNums[i]) self.executionDBManager.updateState(jobId, 'JAAdded') print 'Added job to execution DB: %s' % jobId #t0 = '/astro/apps/pkg/python64/bin/python jobAllocatorRun.py %i %s %s&' % (nFN, jobId, jobPickleFiles[i]) #t0 = 'qsub ./runOneAthena.csh %i %s %s&' % (nFN, jobId, jobPickleFiles[i]) #t0 = 'ssh minerva0 "(cd $PBS_O_WORKDIR; qsub ./runOneAthena.csh %i %s %s)"' % (nFN, jobId, jobPickleFiles[i]) cwd0 = os.getcwd() f0 = open('tmpJA%s.csh' % jobId, 'w') f0.write('#!/bin/csh\n#PBS -N jA%s\n#PBS -l walltime=1:00:00\n#PBS -e jA%s.err\n#PBS -o jA%s.out\ncd %s\nsource setupAthena.csh\npython jobAllocatorRun.py %s %s %s\necho Finished.' 
% (jobId, jobId, jobId, cwd0, nFN, jobId, jobPickleFiles[i])) f0.close() t0 = 'ssh minerva0 "(cd %s; /opt/torque/bin/qsub tmpJA%s.csh)"' % (cwd0, jobId) print t0 os.system(t0) # Check that everything started within a certain time limit # On minerva, jobs may be queued indefinitely, so this won't work for i in range(len(jobNums)): jobId = '%s_%i' % (nFN, jobNums[i]) tryNum = 0 t0 = self.executionDBManager.queryState(jobId) while t0 != 'JAFinished': print 'Try %i: JA sees state for %s: %s' % (tryNum, jobId, t0) time.sleep(10) # Give it up to a day if tryNum > 60 * 60 * 24: raise RuntimeError, '*** Job not started: %s' % jobId tryNum += 1 t0 = self.executionDBManager.queryState(jobId) print 'Finished (Try %i): JA sees state for %s: %s' % (tryNum, jobId, t0) # Finally, merge the output trim file trimFile = self.WorkDir + 'trim%s_%s.ja' % (nFN, catalogType) t0 = 'cat %s > %s' % (mFName, trimFile) print t0 os.system(t0) for f in allOutputFiles: t0 = 'cat %s >> %s' % (f, trimFile) print t0 os.system(t0) print 'Full time for this file: %i sec' % (time.time()-fullTimeStart) print 'Finished catting trim file: ', trimFile