def runtime(self, runtime, *args, **kwargs):
        """Searches for and instantiates the given runtime with the arguments.

        Looks in the loaded config module for all classes derived from
        `perftest.runtime.Runtime` and checks if there is one with a name
        matching the given argument `runtime`. For comparison, the class name
        is converted to lowercase and the suffix "runtime" is removed. E.g.
        the argument "foo" will match a class FooRuntime.

        Args:
            runtime: Lower case name of the runtime to load.
            *args: Arguments passed on to the constructor of the runtime class.
            **kwargs: Keyword arguments, passed on to the runtime class.

        Returns:
            The instantiated runtime object.
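
        Example (illustrative; assumes the loaded config defines a class
        FooRuntime):
            >>> conf.runtime('foo')  # returns a FooRuntime instance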
        """
        logger.debug(f'Trying to get runtime "{runtime}"')
        if runtime == 'gridtools':
            return GridtoolsRuntime(self)

        for v in self._config.__dict__.values():
            if isinstance(v, type) and issubclass(v, Runtime) \
                    and v is not Runtime:
                # Strip the "runtime" suffix from the lowercased class name
                # (str.rstrip would strip characters, not the suffix, and
                # over-truncate names like TimerRuntime)
                name = v.__name__.lower()
                if name.endswith('runtime'):
                    name = name[:-len('runtime')]
                if name == runtime:
                    return v(self, *args, **kwargs)
        raise ConfigError(f'Runtime "{runtime}" not available')
def configure(build_dir, source_dir, cmake_args, conf=None):
    """Configures a CMake build of `source_dir` inside `build_dir`.

    Args:
        build_dir: Path to the build directory, created if necessary.
        source_dir: Path to the CMake source directory.
        cmake_args: A dict of CMake variables, passed as -D definitions.
        conf: (Default value = None) The config to use or None for default.
    """
    conf = config.get(conf)

    build_dir = os.path.abspath(build_dir)
    source_dir = os.path.abspath(source_dir)

    if not os.path.isdir(source_dir):
        raise NotFoundError(f'Source directory {source_dir} does not exist')

    os.makedirs(build_dir, exist_ok=True)

    def argstr(k, v):
        # Translate Python bools to CMake's ON/OFF; other values are
        # stringified by the f-string
        if isinstance(v, bool):
            v = 'ON' if v else 'OFF'
        return f'-D{k}={v}'

    command = conf.cmake_command + [argstr(k, v) for k, v
                                    in cmake_args.items()] + [source_dir]

    logger.debug('Invoking CMake: ' + ' '.join(command))
    try:
        output = subprocess.check_output(command,
                                         env=conf.env,
                                         cwd=build_dir,
                                         stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as e:
        logger.error('CMake failed with output:', e.output.decode())
        raise

    logger.info('CMake finished')
    logger.debug('CMake output:', output)
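
# A minimal usage sketch for configure(); the paths and CMake variables below
# are illustrative placeholders, not taken from the original code:
#
#     configure(build_dir='build', source_dir='gridtools',
#               cmake_args={'CMAKE_BUILD_TYPE': 'Release',
#                           'ENABLE_PYTHON': True})
#
# With the bool translation in argstr(), this appends
# -DCMAKE_BUILD_TYPE=Release and -DENABLE_PYTHON=ON to conf.cmake_command.
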
def hostname():
    """Host name of the current machine.

    Example:
        >>> hostname()
        'keschln-0002'
    """
    hostname = platform.node()
    logger.debug(f'Host name is {hostname}')
    return hostname
    def __init__(self, name):
        self.name = name
        self.hostname = hostname()
        self.clustername = clustername()

        logger.debug(f'Trying to load config "{self.name}"')
        self._config = importlib.import_module('perftest.config.' + self.name)
        logger.info(f'Successfully loaded config "{self.name}"')

        required_attrs = ['modules', 'env', 'cmake_command', 'make_command',
                          'sbatch']
        for attr in required_attrs:
            if not hasattr(self._config, attr):
                raise ConfigError(f'Incomplete config "{self.name}", missing '
                                  f'attribute "{attr}"')
    @property
    def env(self):
        """Environment for this config: os.environ extended by the config's
        modules and env entries, built lazily and cached in _env."""
        if not hasattr(self, '_env'):
            self._env = os.environ.copy()
            if self._config.modules:
                from perftest import modules
                logger.debug(f'Trying to load modules for config '
                             f'"{self.name}"')
                for module in self._config.modules:
                    self._env = modules.load(self._env, module)
                logger.info(f'Successfully loaded modules for config '
                            f'"{self.name}"')
            self._env.update({str(k): str(v) for k, v in
                              self._config.env.items()})

            envstr = '\n'.join(f'{k}={v}' for k, v in self._env.items())
            logger.debug(f'Environment for config "{self.name}":', envstr)
        return self._env
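
# Usage sketch for the env property (hypothetical; `conf` stands for an
# instance of the config class above). The environment behaves like a regular
# mapping, so it can be passed straight to subprocess calls:
#
#     subprocess.check_output(['cmake', '--version'], env=conf.env)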
def make(build_dir, targets=None, conf=None):
    """Runs make in `build_dir`, optionally building only the given targets."""
    conf = config.get(conf)

    build_dir = os.path.abspath(build_dir)

    # Copy the command list so that appending targets does not mutate
    # conf.make_command in place
    command = list(conf.make_command)
    if targets is not None:
        command += targets

    logger.debug('Invoking make: ' + ' '.join(command))
    try:
        output = subprocess.check_output(command, env=conf.env,
                                         cwd=build_dir,
                                         stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as e:
        logger.error('make failed with output:', e.output.decode())
        raise

    logger.info('make finished')
    logger.debug('make output:', output)
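
# Usage sketch for make(); the target name is an illustrative placeholder:
#
#     make('build', targets=['perftests'])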
def load(grid):
    """Loads all stencils for the given grid.

    Imports the respective stencil module and instantiates every class
    derived from `Stencil` defined there.

    Args:
        grid: Name of the grid for which the stencils should be loaded.

    Returns:
        A list of all stencils provided for the given grid.
    """

    logger.debug(f'Trying to import stencils for grid "{grid}"')
    mod = importlib.import_module('perftest.stencils.' + grid)

    stencils = []
    for v in mod.__dict__.values():
        if isinstance(v, type) and issubclass(v, Stencil) and v is not Stencil:
            stencils.append(v())

    sstr = ', '.join(f'"{s.name}"' for s in stencils)
    logger.info(f'Successfully imported stencils {sstr} for grid "{grid}"')
    return stencils
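
# Usage sketch for load(); the grid name is an illustrative placeholder and
# must match a module under perftest.stencils:
#
#     for stencil in load('strgrid'):
#         print(stencil.name)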
def load(env, module, resolve_conflicts=True):
    """Loads an environment module and returns the updated environment.

    If the module conflicts with already loaded modules, the conflicting
    modules are swapped out or unloaded first; if `resolve_conflicts` is
    False, a ConfigError is raised on conflict instead.
    """
    output = _call_modulecmd(env, 'load', module)
    if 'conflict' in output:
        conflicting = []
        p = re.compile(r'.*Tcl command execution failed: conflict (.*)')
        for line in output.splitlines():
            m = p.match(line)
            if m:
                conflicting.append(m.group(1))
        confstr = ', '.join(f'"{c}"' for c in conflicting)
        logger.debug(f'Module "{module}" conflicts with {confstr}')
        if not resolve_conflicts:
            raise ConfigError(f'Unresolved conflicts between module "{module}"'
                              f' and modules {confstr}')
        else:
            logger.debug('Resolving module conflicts…')
            if len(conflicting) == 1:
                return swap(env, conflicting[0], module)
            else:
                for c in conflicting:
                    env = unload(env, c)
                return load(env, module, resolve_conflicts=False)
    else:
        return _apply_modulecmd_output(env, output)
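
# Usage sketch for load(); the module names are illustrative placeholders.
# Each call returns the updated environment dict, so calls can be chained:
#
#     env = os.environ.copy()
#     env = load(env, 'gcc/7.3.0')
#     env = load(env, 'cuda/9.1')  # conflicting modules are swapped out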
def _submit(command, conf):
    """Submits a command to SLURM using sbatch.

    Args:
        command: A string, the command that should be submitted to SLURM.
        conf: The config to use for the submission.

    Returns:
        A tuple of the SLURM job ID and a temporary file name to which the
        job output will be written.
    """

    with tempfile.NamedTemporaryFile(suffix='.sh', mode='w') as sbatch:
        # Generate SLURM sbatch file contents to submit job
        sbatchstr = conf.sbatch(command)
        logger.debug('Generated sbatch file:', sbatchstr)
        # Write sbatch to file
        sbatch.write(sbatchstr)
        sbatch.flush()

        # Wait a bit to make sure that we don’t overload SLURM
        time.sleep(0.1)

        # Create a file to store the job output
        # It is created in the working dir (normal /tmp does not seem to work)
        out = tempfile.NamedTemporaryFile(suffix='.out', dir='.', delete=False)
        logger.debug(f'Created temporary output file {out.name} for command '
                     f'"{command}"')
        out.close()

        # Run sbatch to start the job and specify job output file
        sbatch_command = ['sbatch', '-o', out.name, '--requeue', sbatch.name]
        try:
            sbatch_out = subprocess.check_output(sbatch_command, env=conf.env)
        except subprocess.CalledProcessError as e:
            raise JobSchedulingError(f'Submitting job "{command}" failed '
                                     f'with output: {e.output}')

        # Parse the task ID from the sbatch stdout
        task_id = re.match(r'Submitted batch job (\d+)',
                           sbatch_out.decode()).group(1)

    logger.debug(f'Submitted job {task_id}: "{command}"')

    return task_id, out.name
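
# Usage sketch for _submit() (hypothetical command): submit a single shell
# command and keep the job ID and output file around for later polling:
#
#     task_id, outfile = _submit('hostname', config.get(None))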
def swap(env, swapout_module, swapin_module):
    """Swaps one loaded environment module for another, returning the
    updated environment."""
    logger.debug(f'Swapping out module "{swapout_module}" in favor of '
                 f'module "{swapin_module}"')
    output = _call_modulecmd(env, 'swap', swapout_module, swapin_module)
    return _apply_modulecmd_output(env, output)
def unload(env, module):
    """Unloads an environment module, returning the updated environment."""
    logger.debug(f'Unloading module "{module}"')
    output = _call_modulecmd(env, 'unload', module)
    return _apply_modulecmd_output(env, output)
def _poll(task_ids):
    """Polls SLURM using sacct.

    Args:
        task_ids: A list or set of SLURM job IDs.

    Returns:
        A subset of `task_ids` with all job IDs of jobs that have finished
        running.
    """

    # Early exit in case of empty job list
    if not task_ids:
        return set()

    # Run sacct to get job status
    jobstr = ','.join(task_ids)
    sacct_command = ['sacct', '--format=jobid,jobname,state,exitcode',
                     '--parsable2', '--noheader', '--jobs=' + jobstr]
    logger.debug('Running "{}"'.format(' '.join(sacct_command)))
    info = subprocess.check_output(sacct_command).decode().strip()

    finished = set()

    if not info:
        # We do nothing (apart from logging) if sacct gives no output
        # Happens normally a very short time after job start
        logger.debug('Sacct gave no output while waiting')
        return finished

    logger.debug('Sacct output while waiting:', info)

    # SLURM job states for unfinished jobs
    wait_states = {'PENDING', 'CONFIGURING', 'RUNNING', 'COMPLETING'}

    # Parse sacct output
    for line in info.splitlines():
        jobid, jobname, state, exitcode = line.split('|')
        if state not in wait_states:
            if state == 'CANCELLED':
                raise JobError(f'Job {jobid} ({jobname}) was cancelled')
            elif state == 'COMPLETED':
                # The output might contain additional internal subtasks that
                # we do not want to add to the set of finished jobs, so we
                # have to check the ID here
                if jobid in task_ids:
                    finished.add(jobid)
            elif state == 'FAILED':
                exitcode = int(exitcode.split(':')[0])
                raise JobError(f'Job {jobid} ({jobname}) failed with exitcode '
                               f'{exitcode}')
            elif state == 'NODE_FAIL':
                # Ignore node failures, job should be automatically rescheduled
                # by SLURM here as we use the --requeue flag on submission
                pass
            elif state == 'TIMEOUT':
                raise JobError(f'Job {jobid} ({jobname}) timed out, consider '
                               f'increasing the time limit')
            else:
                raise JobError(f'Job {jobid} ({jobname}) failed with state '
                               f'{state}')

    return finished
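
# Usage sketch for _poll() (hypothetical job IDs): poll repeatedly until all
# submitted jobs have finished:
#
#     pending = {'123456', '123457'}
#     while pending:
#         pending -= _poll(pending)
#         time.sleep(10)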
def run(commands, conf=None, job_limit=None, retry=5):
    """Runs the given command(s) using SLURM and the given configuration.

    `conf` must be a valid argument for `perftest.config.get`.

    Args:
        commands: A string or a list of strings, console command(s) to run.
        conf: (Default value = None) The config to use or None for default.
        job_limit: An integer defining the max number of jobs submitted to
                   SLURM in parallel.
        retry: (Default value = 5) Number of submission attempts per command
               before giving up.

    Returns:
        A list of collected console outputs of all commands.
    """
    if isinstance(commands, str):
        commands = [commands]

    conf = config.get(conf)

    if job_limit is None:
        job_limit = len(commands)

    # Initialize list of outputs
    outputs = [None] * len(commands)

    # Put all commands into a list with indices
    commands = list(enumerate(commands))

    # Set of running jobs
    running = set()

    while any(output is None for output in outputs):

        # Submit jobs if less than `job_limit` are running
        while len(running) < job_limit and commands:
            index, command = commands.pop()
            # Try to submit job `retry` times
            for _ in range(retry):
                try:
                    task_id, outfile = _submit(command, conf)
                    break
                except JobSchedulingError:
                    time.sleep(1)
            else:
                # Raise error if all attempts failed
                raise JobSchedulingError(f'Failed to run command "{command}", '
                                         f'all {retry} attempts failed')
            running.add((index, task_id, outfile))
            # Wait a bit to avoid overloading SLURM
            time.sleep(0.1)

        # Poll SLURM to get finished jobs
        try:
            finished_ids = _poll([task_id for _, task_id, _ in running])
        except JobError as e:
            # Cancel all (possibly still running) jobs
            for _, task_id, _ in running:
                subprocess.call(['scancel', '--full', str(task_id)],
                                env=conf.env)
            # Wait a few more seconds for buffered output
            time.sleep(10)
            # Print all current job outputs on job failure
            for _, task_id, outfile in running:
                with open(outfile, 'r') as f:
                    output = f.read()
                logger.debug(f'Current output of job {task_id}:', output)
                os.remove(outfile)
            # Re-raise job error
            raise e

        logger.debug('Finished jobs: ' + ', '.join(finished_ids))
        time.sleep(10)

        # Get output of finished jobs, build set of still running jobs
        still_running = set()
        for index, task_id, outfile in running:
            if task_id in finished_ids:
                logger.debug(f'Reading output of job {task_id}')
                with open(outfile, 'r') as f:
                    output = f.read()
                logger.debug(f'Job {task_id} generated output:', output)
                os.remove(outfile)
                outputs[index] = output
            else:
                still_running.add((index, task_id, outfile))
        running = still_running
        logger.debug('Running jobs: ' + ', '.join(task_id for _, task_id, _
                                                  in running))

    return outputs
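
# Usage sketch for run(); the commands are illustrative placeholders:
#
#     outputs = run(['./perftest --size 128', './perftest --size 256'],
#                   job_limit=2)
#
# The returned outputs match the order of the input commands, even though the
# jobs may finish in any order.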