def _start_job(self, cmd, cluster_queue=None, verbose=1):
    '''Start test running.  Requires directory lock.

    IMPORTANT: use self.start_job rather than self._start_job if using
    multiple threads.

    Decorated to start_job, which acquires directory lock and enters
    self.path first, during initialisation.'''
    if not cluster_queue:
        # No queueing system requested: run locally via subprocess.
        if verbose > 2:
            print('Running test using %s in %s\n' % (cmd, self.path))
        try:
            job = subprocess.Popen(cmd, shell=True)
        except OSError:
            # slightly odd syntax in order to be compatible with python 2.5
            # and python 2.6/3
            err = 'Execution of test failed: %s' % (sys.exc_info()[1],)
            raise exceptions.RunError(err)
    else:
        # Submit to the cluster queueing system instead.
        program = self.test_program
        submit_name = '%s.%s' % (os.path.basename(self.submit_template),
                                 program.test_id)
        job = queues.ClusterQueueJob(submit_name, system=cluster_queue)
        job.create_submit_file(program.submit_pattern, cmd,
                               self.submit_template)
        if verbose > 2:
            print('Submitting tests using %s (template submit file) in %s'
                  % (self.submit_template, self.path))
        job.start_job()
    # Both Popen and ClusterQueueJob objects expose a wait method which
    # returns only once the job has finished.
    return job
def wait(self):
    '''Return once the job has finished running on the cluster.

    Don't ask the queueing system for the job itself but rather parse the
    output from all current jobs and look for the job in question.  This
    works around the problem where the job_id is not a sufficient handle
    to query the system directly (e.g. on the CMTH cluster).

    :raises exceptions.RunError: if the queueing system cannot be
        inspected.
    '''
    qstat_cmd = [self.queue_cmd]
    running = True
    while running:
        # Poll at 15 second intervals to avoid hammering the queueing
        # system.
        time.sleep(15)
        qstat_popen = subprocess.Popen(qstat_cmd, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
        # Use communicate() (not wait() followed by communicate()): it
        # waits for the process without the risk of deadlock when the
        # output fills the pipe buffer, and it can only be called once
        # per process.
        (qstat_out, qstat_err) = qstat_popen.communicate()
        if qstat_popen.returncode != 0:
            err = ('Error inspecting queue system: %s'
                   % ((qstat_out, qstat_err),))
            raise exceptions.RunError(err)
        if not isinstance(qstat_out, str):
            # Python 3: subprocess output is bytes but self.job_id is
            # stored as str (decoded in start_job); decode before
            # comparing.
            qstat_out = qstat_out.decode('utf-8')
        # Assume job has finished unless it appears in the qstat output.
        running = False
        for line in qstat_out.splitlines():
            words = line.split()
            # Guard against blank/short lines (e.g. qstat headers), which
            # would otherwise raise IndexError.
            if (len(words) > max(self.job_id_column, self.status_column)
                    and words[self.job_id_column] == self.job_id):
                running = words[self.status_column] != self.finished_status
                break
def start_job(self):
    '''Submit job to cluster queue.

    Stores the (stripped) output of the submission command as
    self.job_id; queueing systems such as PBS print the id of the newly
    submitted job.

    :raises exceptions.RunError: if the submission command cannot be
        executed or exits with a non-zero status.
    '''
    submit_cmd = [self.submit_cmd, self.submit_file]
    try:
        submit_popen = subprocess.Popen(submit_cmd, stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)
        # communicate() waits for the process to finish; using it instead
        # of wait() followed by communicate() avoids pipe-buffer deadlocks
        # and a second read returning no data.
        submit_out = submit_popen.communicate()[0]
        if submit_popen.returncode != 0:
            # Submission ran but failed; stderr is merged into stdout so
            # submit_out holds the error text.  Don't store it as job_id.
            err = 'Error submitting job: %s' % (submit_out,)
            raise exceptions.RunError(err)
        self.job_id = submit_out.strip().decode('utf-8')
    except OSError:
        # 'odd' syntax so exceptions work with python 2.5 and python 2.6/3.
        err = 'Error submitting job: %s' % (sys.exc_info()[1], )
        raise exceptions.RunError(err)
def __init__(self, submit_file, system='PBS'):
    '''Initialise a job to be run via a cluster queueing system.

    :param string submit_file: filename of the submit script.
    :param string system: name of the queueing system.  Only 'PBS' is
        currently implemented.
    '''
    self.job_id = None
    self.submit_file = submit_file
    self.system = system
    # Guard clause: bail out early on unsupported queueing systems.
    if self.system != 'PBS':
        err = 'Queueing system not implemented: %s' % self.system
        raise exceptions.RunError(err)
    # PBS-specific settings: submission/inspection commands and how to
    # locate a job and its state in the qstat output.
    self.submit_cmd = 'qsub'
    self.queue_cmd = 'qstat'
    self.job_id_column = 0
    self.status_column = 4
    self.finished_status = 'C'
def create_submit_file(self, pattern, string, template):
    '''Create a submit file.

    Replace pattern in the template file with string and place the result
    in self.submit_file.

    :param string pattern: string in template to be replaced.

    :param string string: string to replace pattern in template.

    :param string template: filename of file containing the template
        submit script.

    :raises exceptions.RunError: if the template file does not exist.
    '''
    # get template
    if not os.path.exists(template):
        err = 'Submit file template does not exist: %s.' % (template, )
        raise exceptions.RunError(err)
    # Context managers guarantee the file handles are closed even if
    # reading or writing raises.
    with open(template) as ftemplate:
        submit = ftemplate.read()
    # replace marker with our commands
    submit = submit.replace(pattern, string)
    # write to submit script
    with open(self.submit_file, 'w') as fsubmit:
        fsubmit.write(submit)
def _move_output_to_test_output(self, test_files_out):
    '''Move output to the testcode output file.  Requires directory lock.

    This is used when a program writes to standard output rather than to
    STDOUT.

    IMPORTANT: use self.move_output_to_test_output rather than
    self._move_output_to_test_output if using multiple threads.

    Decorated to move_output_to_test_output, which acquires the directory
    lock and enters self.path.
    '''
    # self.output might be a glob which works with e.g.
    #     mv self.output test_files[ind]
    # if self.output matches only one file.  Reproduce that behaviour here
    # so that running tests through the queueing system and running tests
    # locally are indistinguishable.
    matches = glob.glob(self.output)
    if len(matches) != 1:
        # Ambiguous (or empty) match: refuse rather than guess.
        err = ('Output pattern (%s) matches %s files (%s).'
               % (self.output, len(matches), matches))
        raise exceptions.RunError(err)
    shutil.move(matches[0], test_files_out)
def run_test(self, verbose=1, cluster_queue=None, rundir=None):
    '''Run all jobs in test.

    :param integer verbose: verbosity level (0: silent; 1-2: one-line
        status per test; >2: full commands and error text).
    :param string cluster_queue: name of the cluster queueing system to
        submit jobs to; if None the jobs are run locally one at a time.
    :param string rundir: directory name passed through to
        util.info_line for status output.
    '''
    try:
        # Construct tests.
        test_cmds = []
        test_files = []
        for (test_input, test_arg) in self.inputs_args:
            if (test_input and
                    not os.path.exists(os.path.join(self.path, test_input))):
                err = 'Input file does not exist: %s' % (test_input,)
                raise exceptions.RunError(err)
            test_cmds.append(self.test_program.run_cmd(test_input, test_arg,
                                                       self.nprocs))
            test_files.append(util.testcode_filename(FILESTEM['test'],
                                                     self.test_program.test_id,
                                                     test_input, test_arg))

        # Move files matching output pattern out of the way.
        self.move_old_output_files(verbose)

        # Run tests one-at-a-time locally or submit job in single submit
        # file to a queueing system.
        if cluster_queue:
            if self.output:
                for (ind, test) in enumerate(test_cmds):
                    # Don't quote self.output if it contains any wildcards
                    # (assume the user set it up correctly!)
                    out = self.output
                    if not compat.compat_any(wild in self.output
                                             for wild in ['*', '?', '[', '{']):
                        out = pipes.quote(self.output)
                    # Append a rename step so each job's output ends up in
                    # its own testcode output file.
                    test_cmds[ind] = '%s; mv %s %s' % (test_cmds[ind], out,
                                                  pipes.quote(test_files[ind]))
            # A single submit script runs every test in sequence.
            test_cmds = ['\n'.join(test_cmds)]
        for (ind, test) in enumerate(test_cmds):
            job = self.start_job(test, cluster_queue, verbose)
            # wait() blocks until the job (local or queued) has finished.
            job.wait()
            # Analyse tests as they finish.
            if cluster_queue:
                # Did all of them at once.
                for (test_input, test_arg) in self.inputs_args:
                    self.verify_job(test_input, test_arg, verbose, rundir)
            else:
                # Did one job at a time.
                (test_input, test_arg) = self.inputs_args[ind]
                err = []
                if self.output:
                    try:
                        self.move_output_to_test_output(test_files[ind])
                    except exceptions.RunError:
                        # Defer the error: a skipped job may legitimately
                        # produce no output.
                        err.append(sys.exc_info()[1])
                # NOTE(review): this Status() is immediately overwritten by
                # skip_job below; it appears to be a dead assignment.
                status = validation.Status()
                if job.returncode != 0:
                    err.insert(0, 'Error running job. \nReturn code: %i'
                               % job.returncode)
                (status, msg) = self.skip_job(test_input, test_arg, verbose)
                if status.skipped():
                    self._update_status(status, (test_input, test_arg))
                    if verbose > 0 and verbose < 3:
                        sys.stdout.write(util.info_line(self.path, test_input,
                                                        test_arg, rundir))
                    status.print_status(msg, verbose)
                elif err:
                    # re-raise first error we hit.
                    raise exceptions.RunError(err[0])
                else:
                    self.verify_job(test_input, test_arg, verbose, rundir)
    except exceptions.RunError:
        # A test failed catastrophically: record the failure for the
        # (test_input, test_arg) being processed when the error occurred.
        # NOTE(review): if the error is raised before the first loop
        # iteration binds test_input/test_arg, the names below would be
        # unbound -- TODO confirm against callers.
        err = sys.exc_info()[1]
        if verbose > 2:
            err = 'Test(s) in %s failed.\n%s' % (self.path, err)
        status = validation.Status([False])
        self._update_status(status, (test_input, test_arg))
        if verbose > 0 and verbose < 3:
            info_line = util.info_line(self.path, test_input, test_arg, rundir)
            sys.stdout.write(info_line)
        status.print_status(err, verbose)
        # Shouldn't run remaining tests after such a catastrophic failure.
        # Mark all remaining tests as skipped so the user knows that they
        # weren't run.
        err = 'Previous test in %s caused a system failure.' % (self.path)
        status = validation.Status(name='skipped')
        for ((test_input, test_arg), stat) in self.status.items():
            if not self.status[(test_input, test_arg)]:
                self._update_status(status, (test_input, test_arg))
                if verbose > 2:
                    cmd = self.test_program.run_cmd(test_input, test_arg,
                                                    self.nprocs)
                    print('Test using %s in %s' % (cmd, self.path))
                elif verbose > 0:
                    info_line = util.info_line(self.path, test_input,
                                               test_arg, rundir)
                    sys.stdout.write(info_line)
                status.print_status(err, verbose)