def __pollJobTree(self):
    """Poll the job tree on disk and refresh the activity-tracking state.

    Calls ``parseJobFiles`` on the job directory of ``self.jobTreePath`` and
    then updates:

    * ``self.curActiveJobs`` -- set of job file names (``getJobFileName()``)
      of the jobs ``parseJobFiles`` put into the updated-jobs set;
    * ``self.failedJobs`` -- the largest number seen so far of jobs whose
      ``remainingRetryCount`` is 0;
    * ``self.sameJobsTime`` -- incremented by ``self.pollTime`` when the
      non-empty active set equals the one from the previous poll, otherwise
      reset to 0;
    * ``self.prevActiveJobs`` -- a copy of the current active set.
    """
    childJobFileToParentJob, childCounts = {}, {}
    updatedJobFiles, shellJobs = set(), set()
    try:
        parseJobFiles(getJobFileDirName(self.jobTreePath), updatedJobFiles,
                      childJobFileToParentJob, childCounts, shellJobs)
        failedJobs = [job for job in updatedJobFiles | set(childCounts)
                      if job.remainingRetryCount == 0]
        self.curActiveJobs = set(job.getJobFileName()
                                 for job in updatedJobFiles)
        self.failedJobs = max(len(failedJobs), self.failedJobs)
    except Exception:
        # Best effort: a poll that fails for any ordinary reason is treated
        # as "no active jobs".  Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        self.curActiveJobs = set()

    # Only count time while a non-empty set of jobs stays exactly the same.
    if (self.prevActiveJobs and self.curActiveJobs
            and self.curActiveJobs == self.prevActiveJobs):
        self.sameJobsTime += self.pollTime
    else:
        self.sameJobsTime = 0
    self.prevActiveJobs = set(self.curActiveJobs)
def jobtree_is_finished(self, jobtree_path):
    """Report whether the jobTree at ``jobtree_path`` has no updated jobs left.

    Code extracted from the jobTree repo.  ``parseJobFiles`` fills the
    containers passed to it; the tree is considered finished when the
    updated-jobs set it fills is empty.

    Returns True if that set is empty, False otherwise.
    """
    pending_jobs = set()
    parent_of_child = {}
    child_counts = {}
    shell_jobs = set()
    job_dir = getJobFileDirName(jobtree_path)
    parseJobFiles(job_dir, pending_jobs, parent_of_child, child_counts,
                  shell_jobs)
    return not pending_jobs
def createFirstJob(command, config, memory=None, cpu=None, time=sys.maxint):
    """Add the first job to the job tree.

    Builds a ``Job`` for ``command`` and writes it out with ``job.write()``.

    command -- the command passed through to ``Job``.
    config  -- config element; ``default_memory``, ``default_cpu``,
               ``try_count`` and ``job_tree`` are read from ``config.attrib``.
    memory  -- memory for the job; when None or ``sys.maxint`` the config's
               ``default_memory`` is used instead.
    cpu     -- cpu count for the job; when None or ``sys.maxint`` the
               config's ``default_cpu`` is used instead.
    time    -- accepted for interface compatibility; not used in this
               function.
    """
    logger.info("Adding the first job")
    # None and sys.maxint both mean "not specified": fall back to the config.
    if memory is None or memory == sys.maxint:
        memory = float(config.attrib["default_memory"])
    if cpu is None or cpu == sys.maxint:
        cpu = float(config.attrib["default_cpu"])
    job = Job(command=command,
              memory=memory,
              cpu=cpu,
              tryCount=int(config.attrib["try_count"]),
              jobDir=getJobFileDirName(config.attrib["job_tree"]))
    job.write()
    logger.info("Added the first job")
def reloadJobTree(jobTree):
    """Reload an existing job tree from the directory ``jobTree``.

    Checks that the expected files and directories are present, re-reads the
    config, stamps it with the current log level, writes it back and loads
    the batch system.

    Returns the tuple ``(config, batchSystem)``.
    """
    logger.info("The job tree appears to already exist, so we'll reload it")

    configFileName = getConfigFileName(jobTree)
    #A valid job tree must contain the config file
    assert os.path.isfile(configFileName)
    #A valid job tree must contain a pickle file which encodes the path
    #environment of the job
    assert os.path.isfile(getEnvironmentFileName(jobTree))
    #A job tree must have a directory of jobs.
    assert os.path.isdir(getJobFileDirName(jobTree))

    configTree = ET.parse(configFileName)
    config = configTree.getroot()
    config.attrib["log_level"] = getLogLevelString()
    #This updates the on disk config file with the new logging setting
    writeConfig(config)

    batchSystem = loadTheBatchSystem(config)
    logger.info("Reloaded the jobtree")
    return config, batchSystem
def createJobTree(options):
    """Create the on-disk job tree and its config for the first time.

    Resolves ``options.jobTree`` with ``absSymPath`` (the resolved path is
    stored back on ``options``), creates the job tree directory and its job
    file directory, builds a ``config`` element from the option values,
    loads the batch system and writes the config with ``writeConfig``.

    Returns the tuple ``(config, batchSystem)``.
    """
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = absSymPath(options.jobTree)
    os.mkdir(options.jobTree)
    os.mkdir(getJobFileDirName(options.jobTree))

    config = ET.Element("config")
    config.attrib["log_level"] = getLogLevelString()
    config.attrib["job_tree"] = options.jobTree
    config.attrib["parasol_command"] = options.parasolCommand
    #The try count is the number of retries plus the initial attempt.
    config.attrib["try_count"] = str(int(options.retryCount) + 1)
    config.attrib["max_job_duration"] = str(float(options.maxJobDuration))
    config.attrib["batch_system"] = options.batchSystem
    config.attrib["job_time"] = str(float(options.jobTime))

    #Integer-valued options, stored as (config key, option attribute).
    integerOptions = (
        ("max_log_file_size", "maxLogFileSize"),
        ("default_memory", "defaultMemory"),
        ("default_cpu", "defaultCpu"),
        ("max_cpus", "maxCpus"),
        ("max_memory", "maxMemory"),
        ("max_threads", "maxThreads"),
    )
    for key, optionName in integerOptions:
        config.attrib[key] = str(int(getattr(options, optionName)))

    #The "big" batch system settings are only recorded when one is given.
    if options.bigBatchSystem is not None:
        config.attrib["big_batch_system"] = options.bigBatchSystem
        bigIntegerOptions = (
            ("big_memory_threshold", "bigMemoryThreshold"),
            ("big_cpu_threshold", "bigCpuThreshold"),
            ("big_max_cpus", "bigMaxCpus"),
            ("big_max_memory", "bigMaxMemory"),
        )
        for key, optionName in bigIntegerOptions:
            config.attrib[key] = str(int(getattr(options, optionName)))

    if options.stats:
        config.attrib["stats"] = ""

    #Load the batch system.
    batchSystem = loadTheBatchSystem(config, options)
    logger.info("Loaded the batch system %s" % batchSystem)

    #Set the parameters determining the polling frequency of the system.
    #The batch system's value is the default; an explicit option overrides it.
    config.attrib["rescue_jobs_frequency"] = str(
        float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency is not None:
        config.attrib["rescue_jobs_frequency"] = str(
            float(options.rescueJobsFrequency))

    writeConfig(config)
    logger.info("Finished the job tree setup")
    return config, batchSystem
def createJobTree(options):
    """Create the on-disk job tree and its config for the first time.

    Resolves ``options.jobTree`` with ``absSymPath`` (the resolved path is
    stored back on ``options``), creates the job tree directory and its job
    file directory, fills a ``config`` element from the option values, loads
    the batch system from that config and writes the config with
    ``writeConfig``.

    Returns the tuple ``(config, batchSystem)``.
    """
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = absSymPath(options.jobTree)
    os.mkdir(options.jobTree)
    os.mkdir(getJobFileDirName(options.jobTree))

    config = ET.Element("config")
    config.attrib["log_level"] = getLogLevelString()
    config.attrib["job_tree"] = options.jobTree
    config.attrib["parasol_command"] = options.parasolCommand
    #One more try than the number of retries, to count the first attempt.
    config.attrib["try_count"] = str(int(options.retryCount) + 1)
    config.attrib["max_job_duration"] = str(float(options.maxJobDuration))
    config.attrib["batch_system"] = options.batchSystem
    config.attrib["job_time"] = str(float(options.jobTime))

    #Options stored as integers: (config key, attribute name on options).
    for key, optionName in (("max_log_file_size", "maxLogFileSize"),
                            ("default_memory", "defaultMemory"),
                            ("default_cpu", "defaultCpu"),
                            ("max_cpus", "maxCpus"),
                            ("max_memory", "maxMemory"),
                            ("max_threads", "maxThreads")):
        config.attrib[key] = str(int(getattr(options, optionName)))

    #Settings for the "big" batch system are written only if one was chosen.
    if options.bigBatchSystem is not None:
        config.attrib["big_batch_system"] = options.bigBatchSystem
        for key, optionName in (("big_memory_threshold", "bigMemoryThreshold"),
                                ("big_cpu_threshold", "bigCpuThreshold"),
                                ("big_max_cpus", "bigMaxCpus"),
                                ("big_max_memory", "bigMaxMemory")):
            config.attrib[key] = str(int(getattr(options, optionName)))

    if options.stats:
        config.attrib["stats"] = ""

    #Load the batch system.
    batchSystem = loadTheBatchSystem(config)

    #Set the parameters determining the polling frequency of the system.
    #Start from the batch system's own value, then apply any explicit option.
    config.attrib["rescue_jobs_frequency"] = str(
        float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency is not None:
        config.attrib["rescue_jobs_frequency"] = str(
            float(options.rescueJobsFrequency))

    writeConfig(config)
    logger.info("Finished the job tree setup")
    return config, batchSystem
def main():
    """Reports the state of the job tree.

    Parses the command line, checks that the job tree directory looks valid,
    prints a summary of the jobs found by ``parseJobFiles`` and, with
    ``--verbose``, the log files of the jobs that have no retries left.
    Exits with status 1 when ``--failIfNotComplete`` is given and there are
    still active jobs or parent jobs with children.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser(
        "usage: %prog [--jobTree] JOB_TREE_DIR [options]", "%prog 0.1")

    parser.add_option(
        "--jobTree", dest="jobTree",
        help="Directory containing the job tree. The jobTree location can "
             "also be specified as the argument to the script. "
             "default=%default",
        default='./jobTree')

    parser.add_option(
        "--verbose", dest="verbose", action="store_true",
        help="Print loads of information, particularly all the log files of "
             "jobs that failed. default=%default",
        default=False)

    parser.add_option(
        "--failIfNotComplete", dest="failIfNotComplete", action="store_true",
        help="Return exit value of 1 if job tree jobs not all completed. "
             "default=%default",
        default=False)

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    #Running the script with no arguments at all just shows the help.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    assert len(args) <= 1  #Only jobtree may be specified as argument
    if len(args) == 1:  #Allow jobTree directory as arg
        options.jobTree = args[0]

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for job tree")
    assert options.jobTree is not None
    #The given job dir tree must exist.
    assert os.path.isdir(options.jobTree)
    #A valid job tree must contain the config file
    assert os.path.isfile(getConfigFileName(options.jobTree))
    #A job tree must have a directory of jobs.
    assert os.path.isdir(getJobFileDirName(options.jobTree))

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    childJobFileToParentJob, childCounts = {}, {}
    updatedJobFiles, shellJobs = set(), set()
    parseJobFiles(getJobFileDirName(options.jobTree), updatedJobFiles,
                  childJobFileToParentJob, childCounts, shellJobs)

    #A job is totally failed once it has no retries remaining.
    failedJobs = [job for job in updatedJobFiles | set(childCounts)
                  if job.remainingRetryCount == 0]

    print("There are %i active jobs, %i parent jobs with children, "
          "%i totally failed jobs and %i empty jobs (i.e. finished but not "
          "cleaned up) currently in job tree: %s" %
          (len(updatedJobFiles), len(childCounts), len(failedJobs),
           len(shellJobs), options.jobTree))

    if options.verbose:
        #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if os.path.isfile(job.getLogFileName()):
                print("Log file of failed job: %s" % job.getLogFileName())
                logFile(job.getLogFileName(), logger.critical)
            else:
                print("Log file for job %s is not present" %
                      job.getJobFileName())
        if not failedJobs:
            print("There are no failed jobs to report")

    #Anything still active or waiting on children means "not complete".
    if (updatedJobFiles or childCounts) and options.failIfNotComplete:
        sys.exit(1)
def main():
    """Reports the state of the job tree.

    Parses the command line (the job tree directory may be given either with
    ``--jobTree`` or as the single positional argument), checks that the
    directory looks like a job tree, prints a summary of the jobs found by
    ``parseJobFiles`` and, with ``--verbose``, the log files of the jobs that
    have no retries left.  Exits with status 1 when ``--failIfNotComplete``
    is given and active jobs or parent jobs with children remain.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser(
        "usage: %prog [--jobTree] JOB_TREE_DIR [options]", "%prog 0.1")

    parser.add_option("--jobTree", dest="jobTree",
                      help="Directory containing the job tree")

    parser.add_option("--verbose", dest="verbose", action="store_true",
                      help="Print loads of information, particularly all the "
                           "log files of errors. default=%default",
                      default=False)

    parser.add_option("--failIfNotComplete", dest="failIfNotComplete",
                      action="store_true",
                      help="Return exit value of 1 if job tree jobs not all "
                           "completed. default=%default",
                      default=False)

    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    #With no command-line arguments there is nothing to inspect: show help.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    assert len(args) <= 1  #Only jobtree may be specified as argument
    if len(args) == 1:  #Allow jobTree directory as arg
        options.jobTree = args[0]

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for job tree")
    #--jobTree has no default here, so it must come from a flag or argument.
    assert options.jobTree is not None
    #The given job dir tree must exist.
    assert os.path.isdir(options.jobTree)
    #A valid job tree must contain the config file
    assert os.path.isfile(getConfigFileName(options.jobTree))
    #A job tree must have a directory of jobs.
    assert os.path.isdir(getJobFileDirName(options.jobTree))

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    childJobFileToParentJob, childCounts = {}, {}
    updatedJobFiles, shellJobs = set(), set()
    parseJobFiles(getJobFileDirName(options.jobTree), updatedJobFiles,
                  childJobFileToParentJob, childCounts, shellJobs)

    #Jobs with no retries remaining count as totally failed.
    failedJobs = [job for job in updatedJobFiles | set(childCounts)
                  if job.remainingRetryCount == 0]

    print("There are %i active jobs, %i parent jobs with children, "
          "%i totally failed jobs and %i empty jobs (i.e. finished but not "
          "cleaned up) currently in job tree: %s" %
          (len(updatedJobFiles), len(childCounts), len(failedJobs),
           len(shellJobs), options.jobTree))

    if options.verbose:
        #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if os.path.isfile(job.getLogFileName()):
                print("Log file of failed job: %s" % job.getLogFileName())
                logFile(job.getLogFileName(), logger.critical)
            else:
                print("Log file for job %s is not present" %
                      job.getJobFileName())
        if not failedJobs:
            print("There are no failed jobs to report")

    #Remaining active jobs or parents with children mean "not complete".
    if (updatedJobFiles or childCounts) and options.failIfNotComplete:
        sys.exit(1)