def execute(self):
    # Hold output jobs
    self.failedJobs = []
    self.successfulJobs = []

    # Run all commands to submit new jobsets to the grid.
    # Retrieve the JobIDs and stats of all newly created jobs.
    print "About to submit requested jobs"
    self.currentJobs = {}
    for item in self.commandList:
        newjobset = GridJobset(item)
        newjobset.submit()
        createdJobs = newjobset.JobIDs
        # Register newly existing JobIDs
        self.pbookCore.sync()
        PdbUtils.getListOfJobIDs(True, False)
        for jobID in createdJobs:
            self.addJobToList(jobID)
    print "Submitted jobs", self.currentJobs.keys()

    # If specified jobs to pick up, get those:
    if len(self.additionalJobs) != 0:
        print "Adding specified jobs to list."
        for item in self.additionalJobs:
            self.addJobToList(item)

    # If we are supposed to include all running jobs, add those
    if self.syncToRunningJobs:
        print "Adding currently running jobs to list."
        # Param 'True' means only jobs not 'frozen' are kept
        runningJobs = PdbUtils.getListOfJobIDs(True, False)
        for item in runningJobs:
            self.addJobToList(item)

    print "Total list of jobs to monitor is now:", self.currentJobs.keys(), "\n"

    # Run this until all jobs are complete.
    while len(self.currentJobs.keys()) > 0:
        # Synchronise pbook.
        self.pbookCore.sync()

        # Check each job and act accordingly.
        self.currentJobIDs = sorted(self.currentJobs.keys())
        for jobID in self.currentJobIDs:
            job = self.currentJobs[jobID]
            currentStatus = self.checkCurrentStatus(job)
            if currentStatus == 'stillRunning':
                continue
            elif currentStatus == 'stuck':
                self.unstick(job)
            elif currentStatus == 'failed':
                if job.prunAttemptCount < self.pandaRetryLimit:
                    self.retryFailed(job)
                else:
                    self.failedJobs.append(jobID)
                    del self.currentJobs[jobID]
            elif currentStatus == 'finished':
                ## If running a test code which does not produce an
                ## output dataset, the outDS will be blank.
                if job.outDS == "":
                    del self.currentJobs[jobID]
                    self.successfulJobs.append(jobID)
                ## dq2-get output.
                else:
                    self.getOutput(job)
            else:
                print "Error!"
                self.currentJobs = {}
                break

        # Wait required gap time, then go to next iteration.
        print "\n"
        time.sleep(self.downtime)

    print "All jobs finished."
    print "Successful jobs:", self.successfulJobs
    print "Failed jobs:", self.failedJobs
    sys.exit(0)
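# Illustrative sketch only, not part of the original monitor: execute() above boils
# down to a poll/dispatch loop with a retry cap. The names below (Job, poll_status,
# retry, retry_limit, poll_interval) are hypothetical stand-ins that assume each job
# exposes a status string and an attempt counter; they only demonstrate the control
# flow, decoupled from the grid/pbook libraries.
import time


class Job(object):
    def __init__(self, job_id):
        self.job_id = job_id
        self.attempts = 0


def monitor(jobs, poll_status, retry, retry_limit=3, poll_interval=60):
    """Poll until every job has either finished or exhausted its retries."""
    pending = dict((j.job_id, j) for j in jobs)
    succeeded, failed = [], []
    while pending:
        # iterate over a sorted snapshot so deleting from pending is safe
        for job_id in sorted(pending.keys()):
            job = pending[job_id]
            status = poll_status(job)  # e.g. 'running', 'failed', 'finished'
            if status == 'running':
                continue
            elif status == 'failed' and job.attempts < retry_limit:
                retry(job)             # resubmit and keep monitoring
                job.attempts += 1
            elif status == 'failed':
                failed.append(job_id)
                del pending[job_id]
            else:                      # 'finished'
                succeeded.append(job_id)
                del pending[job_id]
        time.sleep(poll_interval)
    return succeeded, failed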
def sync(self):
    # get logger
    tmpLog = PLogger.getPandaLogger()
    tmpLog.info("Synchronizing local repository ...")
    # check proxy
    self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
        self.gridPassPhrase, False, self.verbose, useCache=True)
    # get nickname
    nickName = PsubUtils.getNickname()
    # set Rucio accounting
    PsubUtils.setRucioAccount(nickName, 'pbook', True)
    # get JobIDs in local repository
    localJobIDs = PdbUtils.getListOfJobIDs()
    # get recent JobIDs from panda server
    syncTimeRaw = datetime.datetime.utcnow()
    syncTime = syncTimeRaw.strftime('%Y-%m-%d %H:%M:%S')
    # set sync time for the first attempt
    bookConf = BookConfig.getConfig()
    if self.restoreDB:
        # reset last_synctime to restore database
        bookConf.last_synctime = ''
        # disable
        self.restoreDB = False
        tmpLog.info("It may take several minutes to restore local repository ...")
    if bookConf.last_synctime == '':
        bookConf.last_synctime = datetime.datetime.utcnow() - datetime.timedelta(days=180)
        bookConf.last_synctime = bookConf.last_synctime.strftime('%Y-%m-%d %H:%M:%S')
    maxTaskID = None
    while True:
        status, jediTaskDicts = Client.getJobIDsJediTasksInTimeRange(
            bookConf.last_synctime, minTaskID=maxTaskID, verbose=self.verbose)
        if status != 0:
            tmpLog.error("Failed to get tasks from panda server")
            return
        if len(jediTaskDicts) == 0:
            break
        tmpLog.info("Got %s tasks to be updated" % len(jediTaskDicts))
        # insert if missing
        for remoteJobID in jediTaskDicts.keys():
            taskID = jediTaskDicts[remoteJobID]['jediTaskID']
            # get max
            if maxTaskID is None or taskID > maxTaskID:
                maxTaskID = taskID
            # check local status
            job = None
            if remoteJobID in localJobIDs:
                # get job info from local repository
                job = PdbUtils.readJobDB(remoteJobID, self.verbose)
                # skip if frozen
                if job.dbStatus == 'frozen':
                    continue
            tmpLog.info("Updating taskID=%s ..." % taskID)
            # convert JEDI task
            localJob = PdbUtils.convertJTtoD(jediTaskDicts[remoteJobID], job)
            # update database
            if not remoteJobID in localJobIDs:
                # insert to DB
                try:
                    PdbUtils.insertJobDB(localJob, self.verbose)
                except:
                    tmpLog.error("Failed to insert taskID=%s to local repository" % taskID)
                    return
            else:
                # update
                try:
                    PdbUtils.updateJobDB(localJob, self.verbose, syncTimeRaw)
                except:
                    tmpLog.error("Failed to update local repository for taskID=%s" % taskID)
                    return
    # update sync time
    bookConf = BookConfig.getConfig()
    bookConf.last_synctime = syncTime
    BookConfig.updateConfig(bookConf)
    self.updateTaskJobsetMap()
    tmpLog.info("Synchronization Completed")
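# Illustrative sketch only: the sync() above follows a checkpoint-plus-paging pattern --
# remember the last successful sync time, then pull updates in chunks keyed on the
# highest task ID seen so far. fetch_chunk and apply_update are hypothetical stand-ins
# (fetch_chunk plays the role of Client.getJobIDsJediTasksInTimeRange); only the control
# flow is meant to match.
def incremental_sync(fetch_chunk, apply_update, last_synctime):
    """Pull task updates newer than last_synctime until the server returns nothing."""
    max_task_id = None
    while True:
        tasks = fetch_chunk(last_synctime, min_task_id=max_task_id)
        if not tasks:
            break
        for task in tasks:
            # advance the paging cursor so the next request skips what we have seen
            if max_task_id is None or task['jediTaskID'] > max_task_id:
                max_task_id = task['jediTaskID']
            apply_update(task)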
def sync(self):
    # get logger
    tmpLog = PLogger.getPandaLogger()
    tmpLog.info("Synchronizing local repository ...")
    # check proxy
    self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
        self.gridPassPhrase, False, self.verbose)
    # get JobIDs in local repository
    localJobIDs = PdbUtils.getListOfJobIDs()
    # get recent JobIDs from panda server
    syncTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    # set sync time for the first attempt
    bookConf = BookConfig.getConfig()
    if self.restoreDB:
        # reset last_synctime to restore database
        bookConf.last_synctime = ''
        # disable
        self.restoreDB = False
        tmpLog.info("It may take several minutes to restore local repository ...")
    if bookConf.last_synctime == '':
        bookConf.last_synctime = datetime.datetime.utcnow() - datetime.timedelta(days=180)
        bookConf.last_synctime = bookConf.last_synctime.strftime('%Y-%m-%d %H:%M:%S')
    status, remoteJobIDs = Client.getJobIDsInTimeRange(
        bookConf.last_synctime, verbose=self.verbose)
    if status != 0:
        tmpLog.error("Failed to get JobIDs from panda server")
        return
    tmpLog.info("Got %s jobs to be updated" % len(remoteJobIDs))
    # insert if missing
    for remoteJobID in remoteJobIDs:
        # check local status
        job = None
        if remoteJobID in localJobIDs:
            # get job info from local repository
            job = PdbUtils.readJobDB(remoteJobID, self.verbose)
            # skip if frozen
            if job.dbStatus == 'frozen':
                continue
        tmpLog.info("Updating JobID=%s ..." % remoteJobID)
        # get PandaIDs
        status, pandaIDstatus = Client.getPandIDsWithJobID(remoteJobID, verbose=self.verbose)
        if status != 0:
            tmpLog.error("Failed to get PandaIDs for %s" % remoteJobID)
            return
        pandaIDs = pandaIDstatus.keys()
        pandaIDs.sort()
        # get full JobSpec
        pandaJobs = []
        pandaFileInfo = {}
        pandaJobForSiteID = None
        if job is None:
            tmpIDs = [pandaIDs[0], pandaIDs[-1]]
            status, pandaJobs = Client.getFullJobStatus(tmpIDs, verbose=self.verbose)
            if status != 0:
                tmpLog.error("Failed to get PandaJobs for %s" % remoteJobID)
                return
            # get slimmed file info
            status, pandaFileInfo = Client.getSlimmedFileInfoPandaIDs(
                pandaIDs, verbose=self.verbose)
            if status != 0:
                tmpLog.error("Failed to get file info for %s" % remoteJobID)
                return
        else:
            # get one job to set computingSite which may have changed due to rebrokerage
            status, tmpPandaJobs = Client.getFullJobStatus([pandaIDs[0]], verbose=self.verbose)
            if status != 0:
                tmpLog.error("Failed to get PandaJobs for %s" % remoteJobID)
                return
            pandaJobForSiteID = tmpPandaJobs[0]
        # convert to local job spec
        localJob = PdbUtils.convertPtoD(pandaJobs, pandaIDstatus, job,
                                        pandaFileInfo, pandaJobForSiteID)
        # update database
        if not remoteJobID in localJobIDs:
            # insert to DB
            try:
                PdbUtils.insertJobDB(localJob, self.verbose)
            except:
                tmpLog.error("Failed to insert JobID=%s to local repository" % remoteJobID)
                return
            # set retryID
            if not localJob.provenanceID in [0, '0']:
                try:
                    PdbUtils.setRetryID(localJob, self.verbose)
                except:
                    tmpLog.error("Failed to set retryID for JobID=%s in local repository" % remoteJobID)
                    return
        else:
            # update
            try:
                PdbUtils.updateJobDB(localJob, self.verbose)
            except:
                tmpLog.error("Failed to update local repository for JobID=%s" % remoteJobID)
                return
    # update sync time
    bookConf = BookConfig.getConfig()
    bookConf.last_synctime = syncTime
    BookConfig.updateConfig(bookConf)
    tmpLog.info("Synchronization Completed")