def get_output(cmd):
    """Run *cmd* and return its output, with stderr merged into stdout."""
    job_proc = py3_compat.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    logging.debug('Communicating with job process.')
    out, _err = job_proc.communicate()
    return out
def num_cpus(self):
    """Returns the number of cpus that qsub should reserve. PBSPro requires
    the cpu reservation be given to both qsub, and aprun.

    The CHPL_LAUNCHCMD_NUM_CPUS environment variable, when set, overrides
    the value queried from ``cnselect -Lnumcores``.

    :rtype: int
    :returns: Number of cpus to reserve
    :raises RuntimeError: if cnselect is not callable
    :raises ValueError: if cnselect -Lnumcores had no output
    """
    # Environment override takes precedence. Cast to int so both code
    # paths honor the documented return type (os.environ values are str).
    n_cpus = os.environ.get('CHPL_LAUNCHCMD_NUM_CPUS')
    if n_cpus is not None:
        return int(n_cpus)

    try:
        logging.debug('Checking for number of cpus to reserve.')
        cnselect_proc = py3_compat.Popen(
            ['cnselect', '-Lnumcores'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        logging.debug('Communicating with cnselect process.')
        stdout, stderr = cnselect_proc.communicate()
    except OSError as ex:
        # cnselect is not installed/executable on this system.
        raise RuntimeError(ex)

    first_line = stdout.split('\n')[0]
    if first_line:
        return int(first_line)
    else:
        msg = 'cnselect -Lnumcores had no output.'
        logging.error(msg)
        raise ValueError(msg)
def _launch_qsub(self, testing_dir, output_file, error_file):
    """Launch job using qsub and return job id. Raises RuntimeError if
    self.submit_bin is anything but qsub.

    :type testing_dir: str
    :arg testing_dir: working directory for running test

    :type output_file: str
    :arg output_file: stdout log filename

    :type error_file: str
    :arg error_file: stderr log filename

    :rtype: str
    :returns: job id
    """
    # Guard: this path is only valid for pbs-style qsub submission.
    if self.submit_bin != 'qsub':
        raise RuntimeError('_launch_qsub called for non-pbs job type!')

    # Quiet information from LMOD about module changes that would show up
    # in our test output
    logging.info('Setting LMOD_QUIET=1')
    os.environ["LMOD_QUIET"] = "1"

    logging.info(
        'Starting {0} job "{1}" on {2} nodes with walltime {3} and output file: {4}'.format(
            self.submit_bin, self.job_name, self.num_locales,
            self.walltime, output_file))

    logging.debug('Opening {0} subprocess.'.format(self.submit_bin))
    proc = py3_compat.Popen(
        self._qsub_command(output_file, error_file),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=testing_dir,
        env=os.environ.copy())

    # The actual test command is fed to qsub on its stdin.
    test_command_str = ' '.join(
        self.full_test_command(output_file, error_file))
    logging.debug(
        'Communicating with {0} subprocess. Sending test command on stdin: {1}'
        .format(self.submit_bin, test_command_str))
    out, err = proc.communicate(input=test_command_str)
    logging.debug(
        '{0} process returned with status {1}, stdout: {2} stderr: {3}'.
        format(self.submit_bin, proc.returncode, out, err))

    if proc.returncode != 0:
        msg = '{0} failed with exit code {1} and output: {2}'.format(
            self.submit_bin, proc.returncode, out)
        logging.error(msg)
        raise ValueError(msg)

    # qsub prints the new job id on stdout.
    return out.strip()
def _qstat(cls, job_id, args=None):
    """Call qstat and return output from stdout. Raises ValueError if exit
    code is non-zero.

    :type job_id: str
    :arg job_id: pbs job id

    :type args: list
    :arg args: additional arguments to pass qstat

    :rtype: str
    :returns: qsub job status
    """
    # None (not []) is the default to avoid a shared mutable default arg.
    if args is None:
        args = []
    command = ['qstat'] + args + [job_id]
    logging.debug('qstat command to run: {0}'.format(command))

    logging.debug('Opening qstat subprocess.')
    proc = py3_compat.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        env=os.environ.copy())
    logging.debug('Communicating with qstat subprocess.')
    out, err = proc.communicate()
    logging.debug(
        'qstat process returned with status {0}, stdout: {1}, and stderr: {2}'
        .format(proc.returncode, out, err))

    if proc.returncode == 0:
        return out
    raise ValueError('Non-zero exit code {0} from qstat: "{1}"'.format(
        proc.returncode, out))
def status(cls, job_id):
    """Query job status using squeue.

    :type job_id: str
    :arg job_id: squeue job id

    :rtype: str
    :returns: squeue job status
    """
    # --noheader/--format give exactly one "<job_id> <status>" line;
    # --states all includes jobs that already finished.
    command = [
        'squeue',
        '--noheader',
        '--format', '%A %T',  # "<job_id> <status>"
        '--states', 'all',
        '--job', job_id,
    ]
    logging.debug('squeue command to run: {0}'.format(command))

    logging.debug('Opening squeue subprocess.')
    proc = py3_compat.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        env=os.environ.copy())
    logging.debug('Communicating with squeue subprocess.')
    out, err = proc.communicate()
    logging.debug(
        'squeue process returned with status {0}, stdout: {1}, stderr: {2}'
        .format(proc.returncode, out, err))

    if proc.returncode != 0:
        raise ValueError(
            'Non-zero exit code {0} from squeue: "{1}"'.format(
                proc.returncode, out))

    parts = out.split(' ')
    if len(parts) != 2:
        raise ValueError(
            'Could not parse output from squeue: {0}'.format(out))

    # Both a clean COMPLETED and any failure state count as "complete"
    # ('C'); queued-like states map to 'Q'; anything else is running.
    finished_statuses = [
        'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'BOOT_FAIL',
        'NODE_FAIL', 'PREEMPTED'
    ]
    queued_statuses = ['CONFIGURING', 'PENDING']

    job_status = parts[1].strip()
    logging.info('Status for job {0} is: {1}'.format(job_id, job_status))
    if job_status in finished_statuses:
        logging.info('Job finished with status: {0}'.format(job_status))
        return 'C'
    if job_status in queued_statuses:
        return 'Q'
    return 'R'  # running
def submit_job(self, testing_dir, output_file, error_file, input_file):
    """Launch job using executable. Set CHPL_LAUNCHER_USE_SBATCH=true in
    environment to avoid using expect script. The executable will create a
    sbatch script and submit it. Parse and return the job id after job is
    submitted.

    :type testing_dir: str
    :arg testing_dir: working directory for running test

    :type output_file: str
    :arg output_file: stdout log filename

    :type error_file: str
    :arg error_file: stderr log filename

    :type input_file: str
    :arg input_file: filename used to capture stdin piped to this process

    :rtype: str
    :returns: job id
    """
    env = os.environ.copy()
    env['CHPL_LAUNCHER_USE_SBATCH'] = 'true'
    env['CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME'] = output_file
    env['CHPL_LAUNCHER_SLURM_ERROR_FILENAME'] = error_file

    # Non-blocking poll (0.0 timeout): if something was piped on our
    # stdin, save it to input_file so the launcher can forward it.
    if select.select([
            sys.stdin,
    ], [], [], 0.0)[0]:
        with open(input_file, 'w') as fp:
            fp.write(sys.stdin.read())
        env['SLURM_STDINMODE'] = input_file

    # We could use stdout buffering for other configurations too, but I
    # don't think there's any need. Currently, single locale perf testing
    # is the only config that has any tests that produce a lot of output
    if os.getenv('CHPL_TEST_PERF') is not None and self.num_locales <= 1:
        env['CHPL_LAUNCHER_SLURM_BUFFER_STDOUT'] = 'true'

    cmd = self.test_command[:]

    # Add --nodelist into the command line
    if self.hostlist is not None:
        cmd.append('--{0}={1}'.format(self.hostlist_resource,
                                      self.hostlist))

    # Add --walltime back into the command line.
    if self.walltime is not None:
        cmd.append('--walltime')
        cmd.append(self.walltime)

    logging.debug('Command to submit job: {0}'.format(cmd))
    logging.debug('Opening job subprocess')
    submit_proc = py3_compat.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   cwd=testing_dir,
                                   env=env)
    logging.debug('Communicating with job subprocess')
    stdout, stderr = submit_proc.communicate()
    logging.debug(
        'Job process returned with status {0}, stdout: {1}, stderr: {2}'.
        format(submit_proc.returncode, stdout, stderr))

    if submit_proc.returncode != 0:
        msg = 'Job submission ({0}) failed with exit code {1} and output: {2}'.format(
            cmd, submit_proc.returncode, stdout)
        logging.error(msg)
        raise ValueError(msg)

    # Output is: Submitted batch job 106001
    id_parts = stdout.split(' ')
    if len(id_parts) < 4:
        raise ValueError(
            'Could not parse output from sbatch submission: {0}'.format(
                stdout))
    else:
        # Fourth whitespace-separated token is the numeric job id.
        job_id = id_parts[3].strip()
        return job_id