def poll(self, nonBlocking):
    """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
    """
    text_line = '-' * 120

    # Set a delimiter, which distinguishes entries b/w different jobs
    delimiter = ','

    # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
    # 1) squeue -h -u {{user}} -o '%i %256k'
    #    Collects the list of running jobs
    #      a) -h omits header
    #      b) -u {{user}} looks only for jobs submitted by {{user}}
    #      c) -o '%i %256k' specifies the output format
    #         i)  %i    -- job ID (1st column)
    #         ii) %256k -- comment with width of 256 characters (2nd column)
    #             If the job has no comments, the entry simply reads (null)
    # 2) grep {{comment}}
    #    Filter the jobs by the comment which must be unique per sbatchManager instance at all times
    # 3) awk '{print $1}'
    #    Filter only the jobIds out
    # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
    #    Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
    command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                       "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
    command = jinja2.Template(command_template).render(
        user=self.user,
        pool_id_length=self.max_pool_id_length,
        comment=self.pool_id,
        delimiter=delimiter,
    )
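    # For illustration only (the user name and pool ID below are made-up values), the rendered
    # command looks roughly like:
    #   squeue -h -u someuser -o '%i %256k' | grep 3f2a1b | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'
    # and yields a single line of comma-separated job IDs, e.g. "10001,10002,10003"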
    # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
    # even if some of them have already finished
    jobIds_set = set([
        job_id for job_id in self.submittedJobs
        if self.submittedJobs[job_id]['status'] == Status.submitted
    ])
    nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
    while nofJobs_left > 0:
        # Get the list of jobs submitted to batch system and convert their jobIds to a set
        poll_result, poll_result_err = '', ''
        while True:
            poll_result, poll_result_err = run_cmd(command, do_not_log=False, return_stderr=True)
            if not poll_result and poll_result_err:
                logging.warning('squeue caught an error: {squeue_error}'.format(squeue_error=poll_result_err))
            else:
                break
            # sleep a minute and then try again
            # in principle we could limit the number of retries, but hopefully that's not necessary
            logging.debug("sleeping for %i seconds." % 60)
            time.sleep(60)
        polled_ids = set()
        if poll_result != '':
            polled_ids = set(poll_result.split(delimiter))

        # Check if the number of jobs submitted to the batch system is below maxSubmittedJobs;
        # if it is, take jobs from the queuedJobs list and submit them,
        # until a total of maxSubmittedJobs is submitted to the batch system
        nofJobs_toSubmit = min(len(self.queuedJobs), self.maxSubmittedJobs - len(polled_ids))
        if nofJobs_toSubmit > 0:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs.".format(
                    len(polled_ids), len(self.queuedJobs), nofJobs_toSubmit)
            )
        else:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing.".format(
                    len(polled_ids), len(self.queuedJobs))
            )

        for i in range(0, nofJobs_toSubmit):
            # randomly submit a job from the queue
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_idx = len(self.queuedJobs) - 1
            random_idx = random.randint(0, max_idx)
            job = self.queuedJobs.pop(random_idx)
            job['status'] = Status.submitted
            job_id = self.submit(job['sbatch_command'])
            self.submittedJobs[job_id] = job

        # Now check the status of jobs submitted to the batch system:
        # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
        # jobs that have finished already
        finished_ids = list(jobIds_set - polled_ids)

        # Do not poll anything if currently there are no finished jobs
        if finished_ids:
            # Based on the job's exit code, check whether the job has failed or completed successfully
            # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
            # Therefore, we want to restrict the output by grepping specific job IDs
            # There's another problem with that: the length of a bash command is limited by the ARG_MAX kernel variable,
            # which is of order 2e6
            # This means that we have to split the job IDs into chunks, each of which we have to check separately
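            # For illustration (arbitrary example values): with self.max_nof_greps = 3, the job IDs
            # ['101', '102', '103', '104', '105'] would be checked as two chunks,
            # ['101', '102', '103'] and ['104', '105']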
            finished_ids_chunks = [
                finished_ids[i:i + self.max_nof_greps]
                for i in range(0, len(finished_ids), self.max_nof_greps)
            ]
            for finished_ids_chunk in finished_ids_chunks:
                completion = self.check_job_completion(finished_ids_chunk)

                completed_jobs, running_jobs, failed_jobs = [], [], []
                for job_id, details in completion.iteritems():
                    if details.status == Status.completed:
                        completed_jobs.append(job_id)
                    elif details.status == Status.running:
                        running_jobs.append(job_id)
                    else:
                        failed_jobs.append(job_id)

                # If there are any failed jobs, resubmit them where possible; otherwise throw
                if failed_jobs:
                    failed_jobs_str = ','.join(failed_jobs)
                    errors = [completion[job_id].status for job_id in failed_jobs]
                    logging.error(
                        "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}".format(
                            jobIds=failed_jobs_str,
                            reasons=', '.join(map(Status.toString, errors)),
                        )
                    )

                    # Let's print a table where the first column corresponds to the job ID
                    # and the remaining columns list the exit code, the derived exit code, the state
                    # and the classification of the failed job
                    logging.error("Error table:")
                    for job_id in failed_jobs:
                        sys.stderr.write(
                            "{jobId} {exitCode} {derivedExitCode} {state} {status}\n".format(
                                jobId=job_id,
                                exitCode=completion[job_id].exit_code,
                                derivedExitCode=completion[job_id].derived_exit_code,
                                state=completion[job_id].state,
                                status=Status.toString(completion[job_id].status),
                            )
                        )

                    sys.stderr.write('%s\n' % text_line)
                    for failed_job in failed_jobs:
                        for log in zip(['wrapper', 'executable'], ['log_wrap', 'log_exec']):
                            logfile = self.submittedJobs[failed_job][log[1]]
                            if os.path.isfile(logfile):
                                logfile_contents = open(logfile, 'r').read()
                            else:
                                logfile_contents = '<file is missing>'
                            sys.stderr.write(
                                'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'.format(
                                    id=failed_job,
                                    description=log[0],
                                    path=logfile,
                                    log=logfile_contents,
                                    line=text_line,
                                )
                            )

                        if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                           completion[failed_job].status == Status.io_error:
                            # The job is eligible for resubmission if the job hasn't been resubmitted more
                            # than a preset limit of resubmissions AND if the job failed due to I/O errors
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                "-> resubmission attempt #{attempt}".format(
                                    id=failed_job,
                                    args=self.submittedJobs[failed_job]['args'],
                                    reason=Status.toString(completion[failed_job].status),
                                    attempt=self.submittedJobs[failed_job]['nof_submissions'],
                                )
                            )
                            self.submitJob(*self.submittedJobs[failed_job]['args'])
                            # The old ID must be deleted, b/c otherwise it would be used to compare against the
                            # squeue output and we would resubmit the failed job ad infinitum
                            del self.submittedJobs[failed_job]
                        else:
                            # We've exceeded the maximum number of resubmissions -> fail the workflow
                            raise Status.raiseError(completion[failed_job].status)
                else:
                    logging.debug(
                        "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}".format(
                            completedIds=','.join(completed_jobs),
                            runningInfo='(%s still running)' % ','.join(running_jobs) if running_jobs else '',
                        )
                    )

                # Mark successfully finished jobs as completed so that we won't request their status again
                # Otherwise they would remain in the 'submitted' state
                for job_id in completed_jobs:
                    if not all(map(
                        lambda outputFile: is_file_ok(
                            outputFile, validate_outputs=True, min_file_size=self.min_file_size),
                        self.submittedJobs[job_id]['outputFiles']
                    )):
                        if self.submittedJobs[job_id]['nof_submissions'] < self.max_resubmissions:
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                "-> resubmission attempt #{attempt}".format(
                                    id=job_id,
                                    args=self.submittedJobs[job_id]['args'],
                                    attempt=self.submittedJobs[job_id]['nof_submissions'],
                                )
                            )
                            self.submitJob(*self.submittedJobs[job_id]['args'])
                            del self.submittedJobs[job_id]
                        else:
                            raise ValueError(
                                "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                "file {output} yet the job still exits w/o any errors".format(
                                    id=job_id,
                                    output=', '.join(self.submittedJobs[job_id]['outputFiles']),
                                )
                            )
                    else:
                        # Job completed just fine
                        self.submittedJobs[job_id]['status'] = Status.completed

        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        logging.info("Waiting for sbatch to finish (%d job(s) still left) ..." % nofJobs_left)
        if nofJobs_left > 0:
            if nonBlocking:
                return False
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_delay = 300
            random_delay = random.randint(0, max_delay)
            logging.debug("sleeping for %i seconds." % random_delay)
            time.sleep(self.poll_interval + random_delay)
        else:
            break

    return True
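# Usage sketch (hypothetical caller code; 'manager' stands for an already configured
# sbatchManager instance and is an assumption, not something defined here):
#
#   while not manager.poll(nonBlocking = True):
#       ...  # jobs still pending -- do other work and poll again later
#
# With nonBlocking = False, poll() instead sleeps between iterations and only returns
# once every queued and submitted job has finished.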
def submitJob(
    self,
    inputFiles,
    executable,
    command_line_parameter,
    outputFilePath,
    outputFiles,
    scriptFile,
    logFile=None,
    skipIfOutputFileExists=False,
    job_template_file='sbatch-node.sh.template',
    copy_output_file=True,
    nof_submissions=0,
):
    """Creates the job script from the given template and adds the job to the queue of jobs
       that will be submitted to the batch system by poll()
    """
    logging.debug("<sbatchManager::submitJob>: job_template_file = '%s'" % job_template_file)

    job_template_file = os.path.join(jinja_template_dir, job_template_file)
    job_template = open(job_template_file, 'r').read()

    # raise if the logfile is missing
    if not logFile:
        if not self.logFileDir:
            raise ValueError("Please call 'setLogFileDir' before calling 'submitJob' !!")
        logFile = os.path.join(self.logFileDir, os.path.basename(scriptFile).replace(".sh", ".log"))

    # skip only if none of the output files are missing in the file system
    outputFiles_fullpath = map(lambda outputFile: os.path.join(outputFilePath, outputFile), outputFiles)
    if skipIfOutputFileExists:
        outputFiles_missing = [
            outputFile for outputFile in outputFiles_fullpath \
            if not is_file_ok(outputFile, validate_outputs = True, min_file_size = self.min_file_size)
        ]
        if not outputFiles_missing:
            logging.debug(
                "output file(s) = %s exist(s) --> skipping !!" % \
                '; '.join(map(lambda x: "'%s'" % x, outputFiles_fullpath))
            )
            return

    if not self.workingDir:
        raise ValueError("Please call 'setWorkingDir' before calling 'submitJob' !!")

    if not self.cmssw_base_dir:
        logging.warning("cmssw_base_dir not set, setting it to '%s'" % os.environ.get('CMSSW_BASE'))
        self.cmssw_base_dir = os.environ.get('CMSSW_BASE')

    job_dir = self.get_job_dir()

    # create script for executing jobs
    wrapper_log_file = logFile.replace('.log', '_wrapper.log')
    executable_log_file = logFile.replace('.log', '_executable.log')
    wrapper_log_file, executable_log_file = get_log_version((wrapper_log_file, executable_log_file))

    sbatch_command = "sbatch --partition={partition} --output={output} --comment='{comment}' " \
                     "{max_mem} {args} {cmd}".format(
        partition = self.queue,
        output    = wrapper_log_file,
        comment   = self.pool_id,
        args      = self.sbatchArgs,
        cmd       = scriptFile,
        max_mem   = '--mem={}'.format(self.max_mem) if self.max_mem else '',
    )

    two_pow_sixteen = 65536
    random.seed((abs(hash(command_line_parameter))) % two_pow_sixteen)
    max_delay = 60
    random_delay = random.randint(0, max_delay)

    script = jinja2.Template(job_template).render(
        working_dir=self.workingDir,
        cmssw_base_dir=self.cmssw_base_dir,
        job_dir=job_dir,
        job_template_file=job_template_file,
        exec_name=executable,
        command_line_parameter=command_line_parameter,
        inputFiles=" ".join(inputFiles),
        outputDir=outputFilePath,
        outputFiles=" ".join(outputFiles),
        wrapper_log_file=wrapper_log_file,
        executable_log_file=executable_log_file,
        script_file=scriptFile,
        RUNNING_COMMAND=sbatch_command,
        random_sleep=random_delay,
        copy_output_file=copy_output_file,
    )
    logging.debug("writing sbatch script file = '%s'" % scriptFile)
    with codecs.open(scriptFile, "w", "utf-8") as f:
        f.write(script)
        f.flush()
        os.fsync(f.fileno())

    if self.dry_run:
        return

    nof_submissions += 1
    job = {
        'sbatch_command': sbatch_command,
        'status': Status.in_queue,
        'log_wrap': wrapper_log_file,
        'log_exec': executable_log_file,
        # NB: the tuple below must mirror the positional order of submitJob()'s arguments,
        # because failed jobs are resubmitted via submitJob(*job['args']) in poll()
        'args': (
            inputFiles,
            executable,
            command_line_parameter,
            outputFilePath,
            outputFiles,
            scriptFile,
            logFile,
            skipIfOutputFileExists,
            job_template_file,
            copy_output_file,
            nof_submissions,
        ),
        'nof_submissions': nof_submissions,
        'outputFiles': outputFiles_fullpath,
    }
    self.queuedJobs.append(job)
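# Minimal usage sketch for submitJob() (illustrative paths and names only; as the checks above
# require, setWorkingDir() must have been called, and either a logFile is passed explicitly or
# setLogFileDir() has been called):
#
#   manager.setWorkingDir(os.getcwd())
#   manager.setLogFileDir('/home/user/logs')
#   manager.submitJob(
#       inputFiles             = ['/store/user/sample/tree_1.root'],
#       executable             = 'analyze_sample',
#       command_line_parameter = 'analyze_cfg.py',
#       outputFilePath         = '/home/user/output',
#       outputFiles            = ['histograms_1.root'],
#       scriptFile             = '/home/user/scripts/job_1.sh',
#   )
#   manager.poll(nonBlocking = False)  # submits the queued job via sbatch and waits for it to finish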