Example #1
0
    def __init__(self):
        '''Detect the Lmod installation and check its Python bindings.

        The Lmod executable is read from the ``LMOD_CMD`` environment
        variable. The version is parsed from the standard error of
        ``$LMOD_CMD --version`` and stored in ``self._version``; the command
        prefix used for the Python bindings is stored in ``self._command``.

        :raises ConfigError: if ``LMOD_CMD`` is not defined, if Lmod cannot
            be executed, if its version cannot be retrieved or if it reports
            the Python shell type as unknown.
        '''
        # Try to figure out if we are indeed using LMOD
        lmod_cmd = os.getenv('LMOD_CMD')
        if lmod_cmd is None:
            raise ConfigError('could not find a sane Lmod installation: '
                              'environment variable LMOD_CMD is not defined')

        try:
            completed = os_ext.run_command('%s --version' % lmod_cmd)
        except OSError as e:
            raise ConfigError(
                'could not find a sane Lmod installation: %s' % e) from e

        version_match = re.search(r'.*Version\s*(\S+)', completed.stderr,
                                  re.MULTILINE)
        if version_match is None:
            raise ConfigError('could not retrieve Lmod version')

        self._version = version_match.group(1)
        self._command = '%s python ' % lmod_cmd
        try:
            # Try the Python bindings now
            completed = os_ext.run_command(self._command)
        except OSError as e:
            # The message must carry a '%s' placeholder; without it the '%'
            # operator raises a TypeError that hides the original error.
            raise ConfigError(
                'could not get the Python bindings for Lmod: %s' % e) from e

        if re.search(r'Unknown shell type', completed.stderr):
            raise ConfigError('Python is not supported by '
                              'this Lmod installation')
Example #2
0
    def __init__(self):
        '''Detect the Tmod installation and check its Python bindings.

        Runs ``modulecmd -V`` and looks for ``VERSION=`` and ``TCL_VERSION=``
        lines in its standard output. The version is stored in
        ``self._version`` and the command used for the Python bindings in
        ``self._command``.

        :raises ConfigError: if ``modulecmd`` cannot be executed, if either
            version line is missing or if ``modulecmd python`` reports the
            shell type as unknown.
        '''
        # Try to figure out if we are indeed using the TCL version
        try:
            completed = os_ext.run_command('modulecmd -V')
        except OSError as e:
            raise ConfigError('could not find a sane Tmod installation: %s' %
                              e) from e

        version_match = re.search(r'^VERSION=(\S+)', completed.stdout,
                                  re.MULTILINE)
        tcl_version_match = re.search(r'^TCL_VERSION=(\S+)', completed.stdout,
                                      re.MULTILINE)

        if version_match is None or tcl_version_match is None:
            raise ConfigError('could not find a sane Tmod installation')

        self._version = version_match.group(1)
        self._command = 'modulecmd python'
        try:
            # Try the Python bindings now
            completed = os_ext.run_command(self._command)
        except OSError as e:
            # The message must carry a '%s' placeholder; without it the '%'
            # operator raises a TypeError that hides the original error.
            raise ConfigError(
                'could not get the Python bindings for Tmod: %s' % e) from e

        if re.search(r'Unknown shell type', completed.stderr):
            raise ConfigError(
                'Python is not supported by this Tmod installation')
Example #3
0
 def test_command_timeout(self):
     '''Check that a command exceeding its timeout raises the right error.

     ``sleep 3`` is run with a 2-second timeout; the test fails unless a
     ``SpawnedProcessTimeout`` carrying that timeout value is raised.
     '''
     timeout_error = None
     try:
         os_ext.run_command('sleep 3', timeout=2)
     except SpawnedProcessTimeout as err:
         timeout_error = err

     if timeout_error is None:
         pytest.fail('expected timeout')

     assert timeout_error.timeout == 2

     # Try to get the string repr. of the exception: see bug #658
     str(timeout_error)
Example #4
0
    def test_command_timeout(self):
        '''Check the exception raised when a command exceeds its timeout.

        ``sleep 3`` is run with a 2-second timeout; both the message and the
        ``timeout`` attribute of the raised ``SpawnedProcessTimeout`` are
        verified.
        '''
        expected_msg = r"command 'sleep 3' timed out after 2s"
        with pytest.raises(SpawnedProcessTimeout,
                           match=expected_msg) as raised:
            os_ext.run_command('sleep 3', timeout=2)

        error = raised.value
        assert error.timeout == 2

        # Try to get the string repr. of the exception: see bug #658
        str(error)
Example #5
0
def test_trap_error(script_file):
    '''Check that a script generated with ``trap_errors=True`` stops on error.

    The script runs ``false`` followed by ``echo hello``. Running it must
    fail with exit code 1, must not print ``hello`` and must report the
    failed command in its standard output.
    '''
    with shell.generate_script(script_file, trap_errors=True) as script:
        for line in ('false', 'echo hello'):
            script.write(line)

    with pytest.raises(SpawnedProcessError) as raised:
        os_ext.run_command(str(script_file), check=True)

    error = raised.value
    trap_msg = "-reframe: command `false' failed (exit code: 1)"
    assert 'hello' not in error.stdout
    assert error.exitcode == 1
    assert trap_msg in error.stdout
Example #6
0
    def test_trap_error(self):
        '''Check that a script generated with ``trap_errors=True`` stops on
        error.

        The script runs ``false`` followed by ``echo hello``. Running it must
        fail with exit code 1, must not print ``hello`` and must report the
        failed command in its standard output.
        '''
        script_path = self.script_file.name
        with shell.generate_script(script_path, trap_errors=True) as script:
            for line in ('false', 'echo hello'):
                script.write(line)

        with self.assertRaises(SpawnedProcessError) as raised:
            os_ext.run_command(self.script_file.name, check=True)

        error = raised.exception
        trap_msg = "-reframe: command `false' failed (exit code: 1)"
        self.assertNotIn('hello', error.stdout)
        self.assertEqual(1, error.exitcode)
        self.assertIn(trap_msg, error.stdout)
Example #7
0
    def _get_reservation_nodes(self, reservation):
        '''Return the nodes that belong to a Slurm reservation.

        The ``Nodes=...`` entry is extracted from the output of
        ``scontrol -a show res`` and passed to ``scontrol -a show -o``; each
        line of that output is turned into a ``SlurmNode``.

        :arg reservation: the name of the reservation to query.
        :returns: a set of ``SlurmNode`` objects.
        :raises JobError: if no ``Nodes=`` entry is found in the reservation
            description.
        '''
        completed = os_ext.run_command('scontrol -a show res %s' % reservation,
                                       check=True)
        node_match = re.search(r'(Nodes=\S+)', completed.stdout)
        if not node_match:
            # Report the reservation that was actually queried; the name
            # used here previously was undefined and raised a NameError.
            raise JobError("could not extract the nodes names for "
                           "reservation '%s'" % reservation)

        reservation_nodes = node_match[1]
        completed = os_ext.run_command(
            'scontrol -a show -o %s' % reservation_nodes, check=True)
        node_descriptions = completed.stdout.splitlines()
        return {SlurmNode(descr) for descr in node_descriptions}
Example #8
0
    def _update_state(self, job):
        '''Refresh the state and node list of ``job`` from ``squeue``.

        If fewer than ``self._squeue_delay`` seconds have passed since
        ``self._submit_time``, the call sleeps for the remainder first. When
        ``squeue`` prints no matching line, the job is assumed finished: its
        state becomes ``CANCELLED`` or ``COMPLETED`` depending on
        ``self._cancelled`` and a missing exit code is set to 0.

        :arg job: the job to update; its ``jobid`` is queried and its
            ``state`` (and possibly ``exitcode``) is written.
        '''
        time_from_submit = datetime.now() - self._submit_time
        rem_wait = self._squeue_delay - time_from_submit.total_seconds()
        if rem_wait > 0:
            time.sleep(rem_wait)

        # We don't run the command with check=True, because if the job has
        # finished already, squeue might return an error about an invalid
        # job id.
        completed = os_ext.run_command('squeue -h -j %s -o "%%T|%%N|%%r"' %
                                       job.jobid)

        # re.MULTILINE lets '^' match at the start of every output line;
        # without it only the first line can match and the per-element
        # states of a job array are never collected.
        state_match = list(re.finditer(r'^(?P<state>\S+)\|(?P<nodespec>\S*)\|'
                                       r'(?P<reason>.+)', completed.stdout,
                                       re.MULTILINE))
        if not state_match:
            # Assume that job has finished
            job.state = 'CANCELLED' if self._cancelled else 'COMPLETED'

            # Set exit code manually, if not set already by the polling
            if job.exitcode is None:
                job.exitcode = 0

            return

        # Join the states with ',' in case of job arrays
        job.state = ','.join(s.group('state') for s in state_match)

        # Use ',' to join nodes to be consistent with Slurm syntax
        self._set_nodelist(
            job, ','.join(s.group('nodespec') for s in state_match)
        )

        # NOTE(review): the reasons are inspected only when the job is NOT
        # pending; this looks inverted, since a blocking reason is normally
        # reported for pending jobs -- confirm against
        # slurm_state_pending() and _check_and_cancel().
        if not self._is_cancelling and not slurm_state_pending(job.state):
            for s in state_match:
                self._check_and_cancel(job, s.group('reason'))
0
    def completion_time(self, job):
        '''Return the completion time of ``job`` as reported by ``sacct``.

        The value is cached in ``self._completion_time``. ``sacct`` is only
        queried when nothing is cached yet and the job state is a completed
        one; it is run with ``SLURM_TIME_FORMAT`` set to ``%s`` and every
        ``end`` field that parses as a float is considered.

        :arg job: the job whose ``jobid`` and ``state`` are inspected.
        :returns: the largest parsed end time, the cached value, or
            ``None`` if ``sacct`` printed no matching line.
        '''
        if self._completion_time or not slurm_state_completed(job.state):
            return self._completion_time

        with rt.temp_environment(variables={'SLURM_TIME_FORMAT': '%s'}):
            sacct_cmd = 'sacct -S %s -P -j %s -o jobid,end' % (
                self._submit_time.strftime('%F'), job.jobid
            )
            completed = os_ext.run_command(sacct_cmd, log=False)

        end_patt = r'^(?P<jobid>%s)\|(?P<end>\S+)' % self._state_patt
        records = list(re.finditer(end_patt, completed.stdout, re.MULTILINE))
        if not records:
            return None

        end_times = []
        for rec in records:
            # Entries whose end field is not a number are ignored
            with suppress(ValueError):
                end_times.append(float(rec.group('end')))

        if end_times:
            self._completion_time = max(end_times)

        return self._completion_time
Example #10
0
    def _update_state(self):
        '''Refresh this job's state and node list from ``squeue``.

        If fewer than ``self.squeue_delay`` seconds have passed since
        ``self.submit_time``, the call sleeps for the remainder first. When
        ``squeue`` prints no matching line, the job is assumed finished: its
        state becomes cancelled or completed depending on
        ``self._cancelled`` and a missing exit code is set to 0.
        '''
        elapsed = (datetime.now() - self.submit_time).total_seconds()
        delay = self.squeue_delay - elapsed
        if delay > 0:
            time.sleep(delay)

        # We don't run the command with check=True, because if the job has
        # finished already, squeue might return an error about an invalid
        # job id.
        squeue_cmd = 'squeue -h -j %s -o "%%T|%%N|%%r"' % self._jobid
        completed = os_ext.run_command(squeue_cmd)
        squeue_patt = (r'^(?P<state>\S+)\|(?P<nodespec>\S*)\|'
                       r'(?P<reason>.+)')
        job_info = re.search(squeue_patt, completed.stdout)
        if job_info is None:
            # Assume that job has finished
            if self._cancelled:
                self._state = SLURM_JOB_CANCELLED
            else:
                self._state = SLURM_JOB_COMPLETED

            # Set exit code manually, if not set already by the polling
            if self._exitcode is None:
                self._exitcode = 0

            return

        self._state = SlurmJobState(job_info.group('state'))
        self._set_nodelist(job_info.group('nodespec'))
        if self._is_cancelling or self._state not in self._pending_states:
            return

        self._check_and_cancel(job_info.group('reason'))
Example #11
0
    def __init__(self):
        '''Detect the TMod4 installation and validate its version.

        Runs ``modulecmd python -V`` and parses the release number from the
        start of its standard error. The version is stored in
        ``self._version`` and the bindings command in ``self._command``.

        :raises ConfigError: if the command cannot be executed or fails, if
            the version cannot be found or parsed, or if it is older than
            ``self.MIN_VERSION``.
        '''
        self._command = 'modulecmd python'
        version_cmd = self._command + ' -V'
        try:
            completed = os_ext.run_command(version_cmd, check=True)
        except OSError as e:
            raise ConfigError(
                'could not find a sane TMod4 installation') from e
        except SpawnedProcessError as e:
            raise ConfigError(
                'could not get the Python bindings for TMod4') from e

        release = re.match(r'^Modules Release (\S+)\s+', completed.stderr)
        if release is None:
            raise ConfigError('could not retrieve the TMod4 version')

        version = release.group(1)
        try:
            # At least major and minor must be present and numeric
            ver_major, ver_minor, *_ = map(int, version.split('.'))
        except ValueError:
            raise ConfigError(
                'could not parse TMod4 version string: ' + version) from None

        if (ver_major, ver_minor) < self.MIN_VERSION:
            raise ConfigError(
                'unsupported TMod4 version: %s (required >= %s)' %
                (version, self.MIN_VERSION))

        self._version = version
Example #12
0
def _cray_cle_version():
    '''Return the Cray CLE release string, or ``None`` if not available.

    The value is taken from a leading ``RELEASE=`` entry in the output of
    ``cat /etc/opt/cray/release/cle-release``.
    '''
    completed = os_ext.run_command('cat /etc/opt/cray/release/cle-release')
    release = re.match(r'^RELEASE=(\S+)', completed.stdout)
    return release.group(1) if release is not None else None
Example #13
0
def test_trap_exit(script_file):
    '''Check that a script generated with ``trap_exit=True`` reports its
    exit code.

    The script only echoes ``hello``; it must succeed and print both the
    greeting and the exit-code message on its standard output.
    '''
    with shell.generate_script(script_file, trap_exit=True) as script:
        script.write('echo hello')

    completed = os_ext.run_command(str(script_file), check=True)
    output = completed.stdout
    assert 'hello' in output
    assert completed.returncode == 0
    assert '-reframe: script exiting with exit code: 0' in output
Example #14
0
    def _get_all_nodes(self):
        '''Return all nodes reported by ``scontrol -a show -o nodes``.

        :returns: a set with one ``SlurmNode`` per output line.
        :raises JobError: if the ``scontrol`` command fails.
        '''
        scontrol_cmd = 'scontrol -a show -o nodes'
        try:
            completed = os_ext.run_command(scontrol_cmd, check=True)
        except SpawnedProcessError as e:
            raise JobError('could not retrieve node information') from e

        return set(map(SlurmNode, completed.stdout.splitlines()))
Example #15
0
    def test_trap_exit(self):
        '''Check that a script generated with ``trap_exit=True`` reports its
        exit code.

        The script only echoes ``hello``; it must succeed and print both the
        greeting and the exit-code message on its standard output.
        '''
        script_path = self.script_file.name
        with shell.generate_script(script_path, trap_exit=True) as script:
            script.write('echo hello')

        completed = os_ext.run_command(self.script_file.name, check=True)
        exit_msg = "-reframe: script exiting with exit code: 0"
        self.assertIn('hello', completed.stdout)
        self.assertEqual(0, completed.returncode)
        self.assertIn(exit_msg, completed.stdout)
Example #16
0
    def _get_nodes_by_name(self, nodespec):
        '''Return the nodes described by ``nodespec``.

        :arg nodespec: the node specification passed to
            ``scontrol -a show -o node``.
        :returns: a set with one ``SlurmNode`` per output line.
        :raises JobError: if the ``scontrol`` command fails.
        '''
        scontrol_cmd = 'scontrol -a show -o node %s' % nodespec
        try:
            completed = os_ext.run_command(scontrol_cmd, check=True)
        except SpawnedProcessError as e:
            raise JobError('could not retrieve the node description '
                           'of nodes: %s' % nodespec) from e

        nodes = set()
        for descr in completed.stdout.splitlines():
            nodes.add(SlurmNode(descr))

        return nodes
Example #17
0
    def __init__(self):
        '''Detect the TMod 3.1 installation and check its Python bindings.

        The ``modulecmd.tcl`` script is looked up under the directory named
        by the ``MODULESHOME`` environment variable. The version is parsed
        from the ``Release Tcl`` entry in its standard error and stored in
        ``self._version``; the bindings command goes to ``self._command``.

        :raises ConfigError: if ``MODULESHOME`` is not defined, if the
            script cannot be executed, if the version cannot be found or
            parsed, if it is older than ``self.MIN_VERSION`` or if the
            Python shell type is reported as unknown.
        '''
        # Try to figure out if we are indeed using the TCL version
        modules_home = os.getenv('MODULESHOME')
        if modules_home is None:
            # os.path.join(None, ...) would raise a TypeError, which is not
            # the error type this constructor reports on a bad setup
            raise ConfigError(
                'could not find a sane TMod31 installation: '
                'environment variable MODULESHOME is not defined')

        modulecmd = os.path.join(modules_home, 'modulecmd.tcl')
        try:
            completed = os_ext.run_command(modulecmd)
        except OSError as e:
            raise ConfigError(
                'could not find a sane TMod31 installation: %s' % e) from e

        version_match = re.search(r'Release Tcl (\S+)', completed.stderr,
                                  re.MULTILINE)
        if version_match is None:
            raise ConfigError('could not find a sane TMod31 installation')

        version = version_match.group(1)
        try:
            ver_major, ver_minor, *_ = [int(v) for v in version.split('.')]
        except ValueError:
            raise ConfigError(
                'could not parse TMod31 version string: ' + version) from None

        if (ver_major, ver_minor) < self.MIN_VERSION:
            raise ConfigError(
                'unsupported TMod version: %s (required >= %s)' %
                (version, self.MIN_VERSION))

        self._version = version
        self._command = '%s python' % modulecmd

        try:
            # Try the Python bindings now
            completed = os_ext.run_command(self._command)
        except OSError as e:
            # The message must carry a '%s' placeholder; without it the '%'
            # operator raises a TypeError that hides the original error.
            raise ConfigError(
                'could not get the Python bindings for TMod31: %s' % e) from e

        if re.search(r'Unknown shell type', completed.stderr):
            raise ConfigError(
                'Python is not supported by this TMod installation')
Example #18
0
    def _get_nodes_by_name(self, nodespec):
        '''Return the nodes described by ``nodespec`` on a best-effort basis.

        The ``scontrol`` command is run without error checking and output
        lines for which ``SlurmNode`` raises ``JobError`` are skipped.

        :arg nodespec: the node specification passed to
            ``scontrol -a show -o node``.
        :returns: a set of ``SlurmNode`` objects, possibly empty.
        '''
        scontrol_cmd = 'scontrol -a show -o node %s' % nodespec
        completed = os_ext.run_command(scontrol_cmd)
        nodes_avail = set()
        for line in completed.stdout.splitlines():
            try:
                nodes_avail.add(SlurmNode(line))
            except JobError:
                # Deliberately ignore lines that do not describe a node
                continue

        return nodes_avail
Example #19
0
    def _run_module_command(self, *args, msg=None):
        '''Run a modules system command and return its completed process.

        :arg args: the arguments appended to ``self._command``.
        :arg msg: the message of the ``EnvironError`` raised on failure.
        :returns: the object returned by ``os_ext.run_command``.
        :raises EnvironError: if the command fails to run, or if
            ``self._module_command_failed`` flags its result as a failure.
        '''
        command = ' '.join((self._command,) + args)
        try:
            completed = os_ext.run_command(command, check=True)
        except SpawnedProcessError as e:
            raise EnvironError(msg) from e

        if not self._module_command_failed(completed):
            return completed

        # The process itself succeeded, so build the cause by hand
        cause = SpawnedProcessError(command, completed.stdout,
                                    completed.stderr, completed.returncode)
        raise EnvironError(msg) from cause
Example #20
0
    def __init__(self):
        '''Detect the TMod installation and check its Python bindings.

        Runs ``modulecmd -V`` and looks for ``VERSION=`` and ``TCL_VERSION=``
        lines in its standard output. The version is stored in
        ``self._version`` and the bindings command in ``self._command``.

        :raises ConfigError: if ``modulecmd`` cannot be executed, if either
            version line is missing, if the version cannot be parsed, if it
            is older than ``self.MIN_VERSION`` or if the Python shell type
            is reported as unknown.
        '''
        # Try to figure out if we are indeed using the TCL version
        try:
            completed = os_ext.run_command('modulecmd -V')
        except OSError as e:
            raise ConfigError('could not find a sane TMod installation') from e

        version_match = re.search(r'^VERSION=(\S+)', completed.stdout,
                                  re.MULTILINE)
        tcl_version_match = re.search(r'^TCL_VERSION=(\S+)', completed.stdout,
                                      re.MULTILINE)

        if version_match is None or tcl_version_match is None:
            raise ConfigError('could not find a sane TMod installation')

        version = version_match.group(1)
        try:
            ver_major, ver_minor = [int(v) for v in version.split('.')[:2]]
        except ValueError:
            raise ConfigError('could not parse TMod version string: ' +
                              version) from None

        if (ver_major, ver_minor) < self.MIN_VERSION:
            raise ConfigError('unsupported TMod version: %s (required >= %s)' %
                              (version, self.MIN_VERSION))

        self._version = version
        self._command = 'modulecmd python'
        try:
            # Try the Python bindings now
            completed = os_ext.run_command(self._command)
        except OSError as e:
            # The message must carry a '%s' placeholder; without it the '%'
            # operator raises a TypeError that hides the original error.
            raise ConfigError(
                'could not get the Python bindings for TMod: %s' % e) from e

        if re.search(r'Unknown shell type', completed.stderr):
            raise ConfigError(
                'Python is not supported by this TMod installation')
Example #21
0
    def _get_excluded_node_names(self):
        '''Return the names of the nodes in ``self.sched_exclude_nodelist``.

        :returns: a set of node names; empty if no exclude list is set.
        :raises JobError: if the ``scontrol`` command fails.
        '''
        exclude_list = self.sched_exclude_nodelist
        if not exclude_list:
            return set()

        command = 'scontrol show -o node %s' % self.sched_exclude_nodelist
        try:
            completed = os_ext.run_command(command, check=True)
        except SpawnedProcessError as e:
            raise JobError('could not retrieve the node description '
                           'of nodes: %s' % self.sched_exclude_nodelist) from e

        # One node description per output line
        names = set()
        for descr in completed.stdout.splitlines():
            names.add(SlurmNode(descr).name)

        return names
Example #22
0
    def _exec_module_command(self, *args, msg=None):
        '''Run a modules system command and execute the Python code it emits.

        The standard output of the command is passed to ``exec()`` and the
        ``_mlstatus`` variable it defines decides success or failure.

        :arg args: the arguments appended to ``self._command``.
        :arg msg: the error message to use on failure; if ``None``, one is
            built from the executed command.
        :raises EnvironError: if ``_mlstatus`` evaluates to false.
        '''
        command = ' '.join((self._command,) + args)
        completed = os_ext.run_command(command, check=True)
        namespace = {}
        exec(completed.stdout, {}, namespace)

        # _mlstatus is set by the TMod4 Python bindings
        if namespace['_mlstatus']:
            return

        if msg is None:
            cmd_args = completed.args
            if not isinstance(cmd_args, str):
                cmd_args = ' '.join(cmd_args)

            msg = 'modules system command failed: ' + cmd_args

        raise EnvironError(msg)
Example #23
0
    def poll(self, *jobs):
        '''Update the state of ``jobs`` with a single ``squeue`` query.

        If fewer than ``self.SQUEUE_DELAY`` seconds have passed since the
        most recent submission, the call sleeps for the remainder first. A
        job for which ``squeue`` prints no line is assumed finished: its
        state becomes ``CANCELLED`` or ``COMPLETED`` depending on
        ``job.is_cancelling`` and a missing exit code is set to 0.

        :arg jobs: the jobs to poll; nothing is done if none are given.
        '''
        if not jobs:
            return

        m = max(job.submit_time for job in jobs)
        time_from_last_submit = time.time() - m
        rem_wait = self.SQUEUE_DELAY - time_from_last_submit
        if rem_wait > 0:
            time.sleep(rem_wait)

        # We don't run the command with check=True, because if the job has
        # finished already, squeue might return an error about an invalid
        # job id.
        #
        # '%' needs no escaping inside an f-string; doubling it would pass a
        # literal '%%' to squeue.
        completed = os_ext.run_command(
            f'squeue -h -j {",".join(job.jobid for job in jobs)} '
            f'-o "%i|%T|%N|%r"'
        )

        # We need the match objects, so we have to use finditer()
        state_match = list(re.finditer(
            fr'^(?P<jobid>{self._state_patt})\|(?P<state>\S+)\|'
            fr'(?P<nodespec>\S*)\|(?P<reason>.+)',
            completed.stdout, re.MULTILINE)
        )

        # Group the output lines by base job id; the part after '_' in the
        # id printed by squeue is dropped
        jobinfo = {}
        for s in state_match:
            jobid = s.group('jobid').split('_')[0]
            jobinfo.setdefault(jobid, []).append(s)

        for job in jobs:
            if job is None:
                continue

            try:
                job_match = jobinfo[job.jobid]
            except KeyError:
                job._state = 'CANCELLED' if job.is_cancelling else 'COMPLETED'
                if job.exitcode is None:
                    job._exitcode = 0

                continue

            # Join the states with ',' in case of job arrays
            job._state = ','.join(s.group('state') for s in job_match)

            # Only this job's own reasons are relevant; using the matches of
            # all polled jobs would let another job's reason cancel this one
            self._cancel_if_blocked(
                job, [s.group('reason') for s in job_match]
            )
            self._cancel_if_pending_too_long(job)
Example #24
0
    def _compile_file(self, source_file, executable, lang, options):
        '''Compile a single source file.

        :arg source_file: the path of the source file.
        :arg executable: the path of the output file; if empty or ``None``,
            ``source_file`` without its extension is used.
        :arg lang: one of ``'C'``, ``'C++'``, ``'Fortran'`` or ``'CUDA'``;
            if empty or ``None``, ``self.guess_language()`` decides.
        :arg options: extra text appended to the compilation command.
        :returns: the object returned by ``os_ext.run_command``.
        :raises EnvironError: if the language is not one of the above.
        :raises CompilationError: if the compilation command fails.
        '''
        if not executable:
            # default executable, same as source_file without the extension;
            # splitext() keeps the directory part intact and only strips the
            # final extension, also for relative paths and names with
            # several dots
            executable = os.path.splitext(source_file)[0]

        if not lang:
            lang = self.guess_language(source_file)

        # Replace None's with empty strings
        cppflags = self.cppflags or ''
        cflags = self.cflags or ''
        cxxflags = self.cxxflags or ''
        fflags = self.fflags or ''
        ldflags = self.ldflags or ''

        flags = [cppflags]
        if lang == 'C':
            compiler = self.cc
            flags.append(cflags)
        elif lang == 'C++':
            compiler = self.cxx
            flags.append(cxxflags)
        elif lang == 'Fortran':
            compiler = self.ftn
            flags.append(fflags)
        elif lang == 'CUDA':
            compiler = 'nvcc'
            flags.append(cxxflags)
        else:
            raise EnvironError('Unknown language: %s' % lang)

        # Append include search path
        flags += ['-I' + d for d in self.include_search_path]
        cmd = ('%s %s %s -o %s %s %s' %
               (compiler, ' '.join(flags), source_file, executable, ldflags,
                options))
        try:
            return os_ext.run_command(cmd, check=True)
        except SpawnedProcessError as e:
            # Re-raise as compilation error
            raise CompilationError(command=e.command,
                                   stdout=e.stdout,
                                   stderr=e.stderr,
                                   exitcode=e.exitcode) from None
Example #25
0
    def _update_state(self, job):
        '''Check the status of the job.

        The state, node list and exit code of ``job`` are updated from the
        output of ``qstat -f``.

        :arg job: the job to update.
        :raises JobError: if ``qstat`` fails with a return code other than
            153.
        '''
        completed = os_ext.run_command('qstat -f %s' % job.jobid)

        # Depending on the configuration, completed jobs will remain on the job
        # list for a limited time, or be removed upon completion.
        # If qstat cannot find the jobid, it returns code 153.
        if completed.returncode == 153:
            getlogger().debug(
                'jobid not known by scheduler, assuming job completed')
            job.state = 'COMPLETED'
            return

        if completed.returncode != 0:
            raise JobError('qstat failed: %s' % completed.stderr, job.jobid)

        output = completed.stdout
        hosts = re.search(r'exec_host = (?P<nodespec>[\S\t\n]+)',
                          output, re.MULTILINE)
        if hosts:
            # Remove the newlines and tabs captured along with the value
            nodespec = re.sub(r'[\n\t]*', '', hosts.group('nodespec'))
            self._set_nodelist(job, nodespec)

        state_match = re.search(r'^\s*job_state = (?P<state>[A-Z])',
                                output, re.MULTILINE)
        if not state_match:
            getlogger().debug('job state not found (stdout follows)\n%s' %
                              output)
            return

        job.state = JOB_STATES[state_match.group('state')]
        if job.state != 'COMPLETED':
            return

        exit_status = re.search(r'^\s*exit_status = (?P<code>\d+)',
                                output, re.MULTILINE)
        if exit_status:
            job.exitcode = int(exit_status.group('code'))
Example #26
0
    def _autodetect_system(self):
        """Auto-detect system.

        Returns the first system in ``self._site_config.systems`` with a
        hostname pattern matching the current host name, and raises
        ``SystemAutodetectionError`` if there is none.
        """

        # Try to detect directly the cluster name from /etc/xthostname (Cray
        # specific)
        try:
            completed = os_ext.run_command('cat /etc/xthostname', check=True)
            hostname = completed.stdout
        except SpawnedProcessError:
            # Try to figure it out with the standard method
            hostname = socket.gethostname()

        # Go through the supported systems and try to match the hostname
        for system in self._site_config.systems.values():
            if any(re.match(patt, hostname) for patt in system.hostnames):
                return system

        raise SystemAutodetectionError
Example #27
0
def autodetect_system(site_config):
    """Auto-detect system

    Returns the first system in ``site_config.systems`` with a hostname
    pattern matching the current host name, or ``None`` if there is none.
    """
    import re
    import socket

    # Try to detect directly the cluster name from /etc/xthostname (Cray
    # specific)
    try:
        completed = os_ext.run_command('cat /etc/xthostname', check=True)
        hostname = completed.stdout
    except ReframeError:
        # Try to figure it out with the standard method
        hostname = socket.gethostname()

    # Go through the supported systems and try to match the hostname
    matching_systems = (
        system
        for system in site_config.systems.values()
        for patt in system.hostnames
        if re.match(patt, hostname)
    )
    return next(matching_systems, None)
Example #28
0
    def _compile_dir(self, source_dir, makefile, options):
        '''Compile a source directory by invoking ``make``.

        :arg source_dir: the directory passed to ``make -C``.
        :arg makefile: the file passed to ``make -f``; the option is left
            out if this is empty or ``None``.
        :arg options: extra text appended to the ``make`` invocation.
        :returns: the object returned by ``os_ext.run_command``.
        :raises CompilationError: if the ``make`` command fails.
        '''
        if not makefile:
            cmd = 'make -C %s %s ' % (source_dir, options)
        else:
            cmd = 'make -C %s -f %s %s ' % (source_dir, makefile, options)

        # Pass a set of predefined options to the Makefile
        if self.propagate:
            flags = [
                "CC='%s'" % self.cc,
                "CXX='%s'" % self.cxx,
                "FC='%s'" % self.ftn
            ]

            # Explicitly check against None here; the user may explicitly want
            # to clear the flags
            optional_flags = (
                ("CPPFLAGS='%s'", self.cppflags),
                ("CFLAGS='%s'", self.cflags),
                ("CXXFLAGS='%s'", self.cxxflags),
                ("FFLAGS='%s'", self.fflags),
                ("LDFLAGS='%s'", self.ldflags),
            )
            for template, value in optional_flags:
                if value is not None:
                    flags.append(template % value)

            cmd += ' '.join(flags)

        try:
            completed = os_ext.run_command(cmd, check=True)
        except SpawnedProcessError as e:
            # Re-raise as compilation error
            raise CompilationError(command=e.command,
                                   stdout=e.stdout,
                                   stderr=e.stderr,
                                   exitcode=e.exitcode) from None
        else:
            return completed
Example #29
0
    def completion_time(self, job):
        '''Return the completion time of ``job`` as reported by ``sacct``.

        The value is cached in ``self._completion_time``. ``sacct`` is only
        queried when nothing is cached yet and the job state is a completed
        one; it is run with ``SLURM_TIME_FORMAT`` set to ``standard`` and
        the ``end`` fields are parsed as ``%Y-%m-%dT%H:%M:%S`` timestamps.

        :arg job: the job whose ``jobid`` and ``state`` are inspected.
        :returns: the latest parsed end time as a ``datetime``, the cached
            value, or ``None`` if no end time could be retrieved.
        '''
        if (self._completion_time or not slurm_state_completed(job.state)):
            return self._completion_time

        with env.temp_environment(variables={'SLURM_TIME_FORMAT': 'standard'}):
            completed = os_ext.run_command(
                'sacct -S %s -P -j %s -o jobid,end' %
                (datetime.now().strftime('%F'), job.jobid),
                log=False)

        state_match = list(
            re.finditer(r'^(?P<jobid>%s)\|(?P<end>\S+)' % self._state_patt,
                        completed.stdout, re.MULTILINE))
        if not state_match:
            return None

        completion_times = []
        for s in state_match:
            try:
                completion_times.append(
                    datetime.strptime(s.group('end'), '%Y-%m-%dT%H:%M:%S')
                )
            except ValueError:
                # Skip end fields that are not timestamps instead of
                # failing the whole query
                continue

        if completion_times:
            self._completion_time = max(completion_times)

        return self._completion_time
Example #30
0
 def _run_command(self, cmd, timeout=None):
     """Run ``cmd`` and return its completed process.

     A ``SpawnedProcessError`` raised by the command is re-raised as a
     ``JobError`` carrying this job's id.
     """
     try:
         completed = os_ext.run_command(cmd, check=True, timeout=timeout)
     except SpawnedProcessError as err:
         raise JobError(jobid=self._jobid) from err
     else:
         return completed