def runtime(self, runtime, *args, **kwargs):
    """Searches for and instantiates the given runtime with the arguments.

    Looks in the loaded config module for all classes derived from
    `perftest.runtime.Runtime` and checks if there is one with a name
    matching the given argument `runtime`. For comparison, the class name
    is converted to lowercase and the suffix "runtime" is removed. E.g. the
    argument "foo" will match a class FooRuntime.

    Args:
        runtime: Lower case name of the runtime to load.
        *args: Arguments passed on to the constructor of the runtime class.
        **kwargs: Keyword arguments, passed on to the runtime class.

    Returns:
        The instantiated runtime object.
    """
    logger.debug(f'Trying to get runtime "{runtime}"')
    if runtime == 'gridtools':
        return GridtoolsRuntime(self)
    for k, v in self._config.__dict__.items():
        if isinstance(v, type) and issubclass(v, Runtime) and v is not Runtime:
            # str.rstrip('runtime') would strip any trailing characters from
            # the set {r, u, n, t, i, m, e} (mangling e.g. a name ending in
            # "er"), so remove the exact suffix instead
            name = v.__name__.lower()
            if name.endswith('runtime'):
                name = name[:-len('runtime')]
            if name == runtime:
                return v(self, *args, **kwargs)
    raise ConfigError(f'Runtime "{runtime}" not available')

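# A minimal usage sketch of the name-matching convention above. The config
# name "mine" and the class MyRuntime are hypothetical; any Runtime subclass
# defined in the loaded config module is eligible:
#
#     # in perftest/config/mine.py (hypothetical)
#     class MyRuntime(Runtime):
#         ...
#
#     # elsewhere, assuming `conf` is the loaded Config for "mine":
#     rt = conf.runtime('my')   # instantiates MyRuntime(conf)
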
def configure(build_dir, source_dir, cmake_args, conf=None):
    """Runs CMake to configure the given source directory.

    Args:
        build_dir: Build directory, created if it does not exist.
        source_dir: Directory containing the CMakeLists.txt of the project.
        cmake_args: A dict of CMake cache variables, passed as -D definitions.
        conf: (Default value = None) The config to use or None for default.
    """
    conf = config.get(conf)

    build_dir = os.path.abspath(build_dir)
    source_dir = os.path.abspath(source_dir)
    if not os.path.isdir(source_dir):
        raise NotFoundError(f'Source directory {source_dir} does not exist')
    os.makedirs(build_dir, exist_ok=True)

    def argstr(k, v):
        if isinstance(v, bool):
            v = 'ON' if v else 'OFF'
        return '-D' + k + '=' + v

    command = conf.cmake_command + [argstr(k, v) for k, v
                                    in cmake_args.items()] + [source_dir]

    logger.debug('Invoking CMake: ' + ' '.join(command))
    try:
        output = subprocess.check_output(command, env=conf.env, cwd=build_dir,
                                         stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as e:
        logger.error('CMake failed with output:', e.output.decode())
        raise e
    logger.info('CMake finished')
    logger.debug('CMake output:', output)

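# Hedged usage sketch for configure(); the paths and CMake cache variables
# below are made up for illustration. Booleans are translated to ON/OFF by
# argstr() before being passed as -D definitions:
#
#     configure('build', 'gridtools',
#               {'CMAKE_BUILD_TYPE': 'Release',    # string passed verbatim
#                'ENABLE_CUDA': True},             # becomes -DENABLE_CUDA=ON
#               conf='mycluster')                  # hypothetical config name
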
def hostname():
    """Host name of the current machine.

    Example:
        >>> hostname()
        'keschln-0002'
    """
    hostname = platform.node()
    logger.debug(f'Host name is {hostname}')
    return hostname

def __init__(self, name):
    self.name = name
    self.hostname = hostname()
    self.clustername = clustername()

    logger.debug(f'Trying to load config "{self.name}"')
    self._config = importlib.import_module('perftest.config.' + self.name)
    logger.info(f'Successfully loaded config "{self.name}"')

    required_attrs = ['modules', 'env', 'cmake_command', 'make_command',
                      'sbatch']
    for attr in required_attrs:
        if not hasattr(self._config, attr):
            raise ConfigError(f'Incomplete config "{self.name}", missing '
                              f'attribute "{attr}"')

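# A sketch of the smallest config module the check above accepts. All values
# are placeholders, not a real machine configuration:
#
#     # perftest/config/example.py (hypothetical)
#     modules = []                         # environment modules to load
#     env = {'OMP_NUM_THREADS': '8'}       # extra environment variables
#     cmake_command = ['cmake']
#     make_command = ['make', '-j8']
#
#     def sbatch(command):                 # returns sbatch script contents
#         return f'#!/bin/sh\n{command}'
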
@property
def env(self):
    if not hasattr(self, '_env'):
        self._env = os.environ.copy()
        if self._config.modules:
            from perftest import modules
            logger.debug(f'Trying to load modules for config '
                         f'"{self.name}"')
            for module in self._config.modules:
                self._env = modules.load(self._env, module)
            logger.info(f'Successfully loaded modules for config '
                        f'"{self.name}"')
        self._env.update({str(k): str(v) for k, v
                          in self._config.env.items()})
        # Use self._env directly; going through the property again here
        # would be a pointless re-entrant call
        envstr = '\n'.join(f'{k}={v}' for k, v in self._env.items())
        logger.debug(f'Environment for config "{self.name}":', envstr)
    return self._env

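# Note the resulting precedence: the current process environment is the base,
# variables set by the configured modules are applied on top, and explicit
# entries from the config's `env` dict override both. For example, with
# os.environ['CC'] == 'gcc' and a config containing env = {'CC': 'icc'},
# conf.env['CC'] evaluates to 'icc'.
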
def make(build_dir, targets=None, conf=None):
    """Runs make inside the given build directory.

    Args:
        build_dir: A previously configured build directory.
        targets: (Default value = None) Optional list of make targets.
        conf: (Default value = None) The config to use or None for default.
    """
    conf = config.get(conf)

    build_dir = os.path.abspath(build_dir)

    # Copy the configured command: += on the list itself would append the
    # targets to conf.make_command in place, accumulating across calls
    command = list(conf.make_command)
    if targets is not None:
        command += targets

    logger.debug('Invoking make: ' + ' '.join(command))
    try:
        output = subprocess.check_output(command, env=conf.env, cwd=build_dir,
                                         stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as e:
        logger.error('make failed with output:', e.output.decode())
        raise e
    logger.info('make finished')
    logger.debug('make output:', output)

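# Hedged usage sketch; the target name is made up:
#
#     make('build')                         # runs the configured make command
#     make('build', targets=['perftests'])  # appends explicit targets
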
def load(grid):
    """Stencil loading function.

    Loads all stencils for the given grid from the respective module.

    Args:
        grid: Name of the grid for which the stencils should be loaded.

    Returns:
        A list of all stencils provided for the given grid.
    """
    logger.debug(f'Trying to import stencils for grid "{grid}"')
    mod = importlib.import_module('perftest.stencils.' + grid)
    stencils = []
    for k, v in mod.__dict__.items():
        if isinstance(v, type) and issubclass(v, Stencil) and v is not Stencil:
            stencils.append(v())
    sstr = ', '.join(f'"{s.name}"' for s in stencils)
    logger.info(f'Successfully imported stencils {sstr} for grid "{grid}"')
    return stencils

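# Usage sketch, assuming a module perftest/stencils/strgrid.py (hypothetical
# name) that defines Stencil subclasses:
#
#     stencils = load('strgrid')
#     for s in stencils:
#         print(s.name)
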
def load(env, module, resolve_conflicts=True):
    """Loads an environment module into the given environment.

    If the module conflicts with currently loaded modules, the conflicts
    are resolved by swapping or unloading those modules, unless
    `resolve_conflicts` is False, in which case a ConfigError is raised.

    Args:
        env: A dict of environment variables.
        module: Name of the module to load.
        resolve_conflicts: (Default value = True) Whether to resolve
            module conflicts automatically.

    Returns:
        The updated environment.
    """
    output = _call_modulecmd(env, 'load', module)
    if 'conflict' in output:
        conflicting = []
        p = re.compile(r'.*Tcl command execution failed: conflict (.*)')
        for line in output.splitlines():
            m = p.match(line)
            if m:
                conflicting.append(m.group(1))
        confstr = ', '.join(f'"{c}"' for c in conflicting)
        logger.debug(f'Module "{module}" conflicts with {confstr}')
        if not resolve_conflicts:
            raise ConfigError(f'Unresolved conflicts between module "{module}"'
                              f' and modules {confstr}')
        else:
            logger.debug('Resolving module conflicts…')
            if len(conflicting) == 1:
                return swap(env, conflicting[0], module)
            else:
                for c in conflicting:
                    env = unload(env, c)
                return load(env, module, resolve_conflicts=False)
    else:
        return _apply_modulecmd_output(env, output)

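# Resolution walk-through (module names hypothetical): loading "gcc/7.1.0"
# while "gcc/6.3.0" is loaded makes modulecmd report a Tcl conflict line that
# the regex above captures. With a single conflict the function swaps the two
# modules; with several it unloads each conflicting module and retries the
# load once, this time raising ConfigError on any remaining conflict.
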
def _submit(command, conf):
    """Submits a command to SLURM using sbatch.

    Args:
        command: A string, the command that should be submitted to SLURM.
        conf: The config to use for the submission.

    Returns:
        A tuple of the SLURM job ID and a temporary file name to which the
        job output will be written.
    """
    with tempfile.NamedTemporaryFile(suffix='.sh', mode='w') as sbatch:
        # Generate SLURM sbatch file contents to submit job
        sbatchstr = conf.sbatch(command)
        logger.debug('Generated sbatch file:', sbatchstr)

        # Write sbatch to file
        sbatch.write(sbatchstr)
        sbatch.flush()

        # Wait a bit to make sure that we don't overload SLURM
        time.sleep(0.1)

        # Create a file to store the job output
        # It is created in the working dir (normal /tmp does not seem to work)
        out = tempfile.NamedTemporaryFile(suffix='.out', dir='.', delete=False)
        logger.debug(f'Created temporary output file {out.name} for command '
                     f'"{command}"')
        out.close()

        # Run sbatch to start the job and specify job output file
        sbatch_command = ['sbatch', '-o', out.name, '--requeue', sbatch.name]
        try:
            sbatch_out = subprocess.check_output(sbatch_command, env=conf.env)
        except subprocess.CalledProcessError as e:
            raise JobSchedulingError(f'Submitting job "{command}" failed '
                                     f'with output: {e.output}')

    # Parse the task ID from the sbatch stdout
    task_id = re.match(r'Submitted batch job (\d+)',
                       sbatch_out.decode()).group(1)
    logger.debug(f'Submitted job {task_id}: "{command}"')
    return task_id, out.name

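# One possible shape of the script returned by conf.sbatch(command); the
# resource directives are illustrative, each config defines its own:
#
#     #!/bin/sh
#     #SBATCH --nodes=1
#     #SBATCH --time=00:10:00
#     srun <command>
#
# Note that the job output file is not set in the script itself but via the
# -o flag on the sbatch command line above.
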
def swap(env, swapout_module, swapin_module):
    logger.debug(f'Swapping out module "{swapout_module}" in favor of '
                 f'module "{swapin_module}"')
    output = _call_modulecmd(env, 'swap', swapout_module, swapin_module)
    return _apply_modulecmd_output(env, output)

def unload(env, module):
    logger.debug(f'Unloading module "{module}"')
    output = _call_modulecmd(env, 'unload', module)
    return _apply_modulecmd_output(env, output)

def _poll(task_ids):
    """Polls SLURM job states using sacct.

    Args:
        task_ids: A list or set of SLURM job IDs.

    Returns:
        A subset of `task_ids` with all job IDs of jobs that have finished
        running.
    """
    # Early exit in case of empty job list
    if not task_ids:
        return set()

    # Run sacct to get job status
    jobstr = ','.join(task_ids)
    sacct_command = ['sacct', '--format=jobid,jobname,state,exitcode',
                     '--parsable2', '--noheader', '--jobs=' + jobstr]
    logger.debug('Running "{}"'.format(' '.join(sacct_command)))
    info = subprocess.check_output(sacct_command).decode().strip()

    finished = set()
    if not info:
        # We do nothing (apart from logging) if sacct gives no output;
        # this normally happens for a very short time after job start
        logger.debug('sacct gave no output while waiting')
        return finished
    logger.debug('sacct output while waiting:', info)

    # SLURM job states of unfinished jobs
    wait_states = {'PENDING', 'CONFIGURING', 'RUNNING', 'COMPLETING'}

    # Parse sacct output
    for line in info.splitlines():
        jobid, jobname, state, exitcode = line.split('|')
        if state not in wait_states:
            if state == 'CANCELLED':
                raise JobError(f'Job {jobid} ({jobname}) was cancelled')
            elif state == 'COMPLETED':
                # There might be additional internal subtasks in the output
                # that we do not want to add to the set of finished jobs,
                # so we have to check the ID here
                if jobid in task_ids:
                    finished.add(jobid)
            elif state == 'FAILED':
                exitcode = int(exitcode.split(':')[0])
                raise JobError(f'Job {jobid} ({jobname}) failed with exitcode '
                               f'{exitcode}')
            elif state == 'NODE_FAIL':
                # Ignore node failures: the job should be automatically
                # rescheduled by SLURM as we use --requeue on submission
                pass
            elif state == 'TIMEOUT':
                raise JobError(f'Job {jobid} ({jobname}) timed out, consider '
                               f'increasing the time limit')
            else:
                raise JobError(f'Job {jobid} ({jobname}) failed with state '
                               f'{state}')
    return finished

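# Example of the --parsable2 output being parsed above (job ID and name are
# illustrative). The ".batch" line is one of the internal subtasks that the
# `jobid in task_ids` check filters out:
#
#     1234567|perftest|COMPLETED|0:0
#     1234567.batch|batch|COMPLETED|0:0
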
def run(commands, conf=None, job_limit=None, retry=5):
    """Runs the given command(s) using SLURM and the given configuration.

    `conf` must be a valid argument for `perftest.config.get`.

    Args:
        commands: A string or a list of strings, console command(s) to run.
        conf: (Default value = None) The config to use or None for default.
        job_limit: An integer defining the max number of jobs submitted to
            SLURM in parallel.
        retry: (Default value = 5) Number of submission attempts per command
            before giving up.

    Returns:
        A list of collected console outputs of all commands.
    """
    if isinstance(commands, str):
        commands = [commands]
    conf = config.get(conf)
    if job_limit is None:
        job_limit = len(commands)

    # Initialize list of outputs
    outputs = [None] * len(commands)

    # Pair all commands with their index to keep the output order stable
    commands = list(enumerate(commands))

    # Set of running jobs
    running = set()

    while any(output is None for output in outputs):
        # Submit jobs while less than `job_limit` are running
        while len(running) < job_limit and commands:
            index, command = commands.pop()
            # Try to submit the job up to `retry` times
            for _ in range(retry):
                try:
                    task_id, outfile = _submit(command, conf)
                    break
                except JobSchedulingError:
                    time.sleep(1)
            else:
                # Raise error if all attempts failed
                raise JobSchedulingError(f'Failed to run command "{command}", '
                                         f'all {retry} attempts failed')
            running.add((index, task_id, outfile))

            # Wait a bit to avoid overloading SLURM
            time.sleep(0.1)

        # Poll SLURM to get finished jobs
        try:
            finished_ids = _poll([task_id for _, task_id, _ in running])
        except JobError as e:
            # Cancel all (possibly still running) jobs
            for _, task_id, _ in running:
                subprocess.call(['scancel', '--full', str(task_id)],
                                env=conf.env)

            # Wait a few more seconds for buffered output
            time.sleep(10)

            # Print all current job outputs on job failure
            for _, task_id, outfile in running:
                with open(outfile, 'r') as f:
                    output = f.read()
                logger.debug(f'Current output of job {task_id}:', output)
                os.remove(outfile)

            # Re-raise job error
            raise e

        logger.debug('Finished jobs: ' + ', '.join(finished_ids))
        time.sleep(10)

        # Get output of finished jobs, build set of still running jobs
        still_running = set()
        for index, task_id, outfile in running:
            if task_id in finished_ids:
                logger.debug(f'Reading output of job {task_id}')
                with open(outfile, 'r') as f:
                    output = f.read()
                logger.debug(f'Job {task_id} generated output:', output)
                os.remove(outfile)
                outputs[index] = output
            else:
                still_running.add((index, task_id, outfile))
        running = still_running
        logger.debug('Running jobs: ' + ', '.join(task_id for _, task_id, _
                                                  in running))

    return outputs

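# Hedged usage sketch; the commands and config name are placeholders:
#
#     outputs = run(['./perftest --size 128', './perftest --size 256'],
#                   conf='mycluster', job_limit=2)
#     # outputs[0] corresponds to the first command, regardless of which
#     # job finished first
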