Example #1
0
    def _schedule(self, test, kickoff_path):
        """Submit the kick off script using sbatch.

        :param TestRun test: The TestRun we're kicking off.
        :param Path kickoff_path: The kickoff script path.
        :returns: The sbatch job id (the last token of sbatch's output,
            which looks like 'Submitted batch job <id>').
        :raises SchedulerPluginError: If the kickoff script is missing or
            sbatch exits non-zero.
        """

        if not kickoff_path.is_file():
            raise SchedulerPluginError(
                'Submission script {} not found'.format(kickoff_path))

        # Send the job's output to a log file alongside the test.
        slurm_out = test.path / 'slurm.log'

        # subprocess.run replaces the manual Popen/communicate/poll dance
        # and exposes the exit status directly as .returncode.
        proc = subprocess.run(
            ['sbatch', '--output={}'.format(slurm_out),
             kickoff_path.as_posix()],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        if proc.returncode != 0:
            raise SchedulerPluginError(
                "Sbatch failed for kickoff script '{}': {}".format(
                    kickoff_path, proc.stderr.decode('utf8')))

        return proc.stdout.decode('UTF-8').strip().split()[-1]
Example #2
0
    def _filter_nodes(self, min_nodes, config, nodes):
        """Filter the system nodes down to just those we can use. For each step,
        we check to make sure we still have the minimum nodes needed in order
        to give more relevant errors.

        :param int min_nodes: The minimum number of nodes desired. This will
        :param dict config: The scheduler config for a test.
        :param [list] nodes: Nodes (as defined by collect node data)
        :returns: A list of node names that are compatible with the given
            config.
        :rtype: list
        """

        # Remove any nodes that aren't compute nodes.
        nodes = list(
            filter(lambda n: 'Partitions' in n and 'State' in n, nodes))

        # Remove nodes that aren't up.
        up_states = config['up_states']
        nodes = list(filter(lambda n: n['State'] in up_states, nodes))
        if min_nodes > len(nodes):
            raise SchedulerPluginError(
                "Insufficient nodes in up states: {}".format(up_states))

        # Check for compute nodes that are part of the right partition.
        partition = config['partition']
        nodes = list(filter(lambda n: partition in n['Partitions'], nodes))

        if min_nodes > len(nodes):
            raise SchedulerPluginError('Insufficient nodes in partition '
                                       '{}.'.format(partition))

        if config['immediate'].lower() == 'true':
            states = config['avail_states']
            # Check for compute nodes in this partition in the right state.
            nodes = list(filter(lambda n: n['State'] in states, nodes))

            if min_nodes > len(nodes):
                raise SchedulerPluginError('Insufficient nodes in partition'
                                           ' {} and states {}.'.format(
                                               partition, states))

        tasks_per_node = config.get('tasks_per_node')
        # When we want all the CPUs, it doesn't matter how many are on a node.
        tasks_per_node = 0 if tasks_per_node == 'all' else int(tasks_per_node)
        nodes = list(filter(lambda n: tasks_per_node <= n['CPUTot'], nodes))

        if min_nodes > len(nodes):
            raise SchedulerPluginError(
                'Insufficient nodes with more than {} '
                'procs per node available.'.format(tasks_per_node))

        return nodes
Example #3
0
    def check_job(self, id):
        """Check the status of the given job via 'scontrol show job'.

        :param str id: The slurm job id to check.
        :returns: One of 'running', 'pending', 'finished', or 'failed'.
        :raises SchedulerPluginError: If the job can't be found, scontrol's
            output has no JobState field, or the state isn't recognized.
        """

        job_dict = {}
        try:
            job_output = subprocess.check_output(
                ['scontrol', 'show', 'job', id])
            job_output = job_output.decode('UTF-8').split()
            for item in job_output:
                item = item.strip()
                # Skip blanks and tokens that aren't 'key=value' pairs
                # (whitespace-split multi-word values produce such tokens).
                if not item or '=' not in item:
                    continue
                key, value = item.split('=', 1)
                job_dict[key] = value
        except subprocess.CalledProcessError:
            raise SchedulerPluginError('Job {} not found.'.format(id))

        # Look up the job state explicitly. (Previously this reused the
        # loop's leftover 'key', which only worked if JobState happened to
        # be the last field scontrol printed.)
        key = 'JobState'
        try:
            value = job_dict[key]
        except KeyError:
            raise SchedulerPluginError('Key {} not found in '.format(key) +
                                       'scontrol output.')

        run_list = ['RUNNING', 'COMPLETING', 'CONFIGURING']
        pend_list = ['PENDING']
        finish_list = ['COMPLETED']
        fail_list = [
            'BOOT_FAIL', 'FAILED', 'DEADLINE', 'NODE_FAIL', 'PREEMPTED',
            'OUT_OF_MEMORY', 'TIMEOUT'
        ]

        if value in run_list:
            ret_val = 'running'
        elif value in pend_list:
            ret_val = 'pending'
        elif value in finish_list:
            ret_val = 'finished'
        elif value in fail_list:
            ret_val = 'failed'
        else:
            # Report the unrecognized state value (not the field name).
            raise SchedulerPluginError(
                'Job status {} not recognized.'.format(value))

        return ret_val
Example #4
0
    def _get_node_range(self, sched_config, nodes):
        """Translate user requests for a number of nodes into a numerical
        range based on the number of nodes on the actual system.

        :param dict sched_config: The scheduler config for a particular test.
        :param list nodes: A list of nodes.
        :rtype: str
        :returns: A range suitable for the num_nodes argument of slurm.
        :raises SchedulerPluginError: If the min/max parts of 'num_nodes'
            aren't integers or 'all'.
        """

        # Figure out the requested number of nodes.
        num_nodes = sched_config.get('num_nodes')
        min_all = False
        if '-' in num_nodes:
            min_nodes, max_nodes = num_nodes.split('-')
        else:
            min_nodes = max_nodes = num_nodes

        if min_nodes == 'all':
            # We'll translate this to something else in a bit.
            min_nodes = '1'
            min_all = True

        # Validate the minimum *before* filtering. (Previously int() was
        # called on the raw string first, so a bad value raised an uncaught
        # ValueError instead of a SchedulerPluginError.)
        try:
            min_nodes = int(min_nodes)
        except ValueError:
            raise SchedulerPluginError(
                "Invalid num_nodes minimum value: {}".format(min_nodes))

        nodes = self._filter_nodes(min_nodes, sched_config, nodes)

        if min_all:
            min_nodes = len(nodes)

        if max_nodes == 'all':
            max_nodes = len(nodes)
        else:
            try:
                max_nodes = int(max_nodes)
            except ValueError:
                raise SchedulerPluginError(
                    "Invalid num_nodes maximum value: {}".format(max_nodes))

        return '{}-{}'.format(min_nodes, max_nodes)
Example #5
0
    def submit_job(self, path):
        """Submit the kick off script using sbatch.

        :param path: Path to the submission script.
        :returns: The sbatch job id (the last token of sbatch's output).
        :raises SchedulerPluginError: If the script doesn't exist.
        """

        # Bail out early if there's nothing to submit.
        if not os.path.isfile(path):
            raise SchedulerPluginError('Submission script {}'.format(path) +
                                       ' not found.')

        output = subprocess.check_output(['sbatch', path])
        return output.decode('UTF-8').strip().split()[-1]
Example #6
0
    def _get_node_range(self, sched_config, nodes):
        """Translate user requests for a number of nodes into a numerical
        range based on the number of nodes on the actual system.

        :param dict sched_config: The scheduler config for a particular test.
        :param list nodes: A list of nodes.
        :rtype: str
        :returns: A range suitable for the num_nodes argument of slurm.
        """

        num_nodes = sched_config.get('num_nodes')

        if self.NUM_NODES_REGEX.match(num_nodes) is None:
            raise SchedulerPluginError(
                "Invalid value for 'num_nodes'. Got '{}', expected something "
                "like '3', 'all', or '1-all'.".format(num_nodes))

        # A bare value serves as both the minimum and the maximum.
        if '-' in num_nodes:
            low, high = num_nodes.split('-')
        else:
            low = high = num_nodes

        # 'all' as the minimum means "however many survive filtering";
        # filter with a floor of 1 and substitute the real count below.
        min_is_all = low == 'all'
        if min_is_all:
            low = '1'

        nodes = self._filter_nodes(int(low), sched_config, nodes)

        include_nodes = self.parse_node_list(sched_config['include_nodes'])
        if min_is_all:
            low = len(nodes)
        else:
            low = int(low)
            # Explicitly included nodes can push the minimum up.
            if include_nodes:
                low = max(len(include_nodes), low)

        high = len(nodes) if high == 'all' else int(high)

        return '{}-{}'.format(low, high)
Example #7
0
    def _filter_nodes(self, min_nodes, config, nodes):
        """Filter the system nodes down to just those we can use. After each
        filtering step we verify that at least the minimum number of nodes
        remain, so that errors point at the step that ran us dry.

        :param int min_nodes: The minimum number of nodes desired.
        :param dict config: The scheduler config for a test.
        :param [list] nodes: Nodes (as defined by collect node data)
        :returns: A list of node names that are compatible with the given
            config.
        :rtype: list
        """

        # Only compute nodes report both a partition and a state.
        nodes = [node for node in nodes
                 if 'Partitions' in node and 'State' in node]

        up_states = config['up_states']

        include_nodes = self.parse_node_list(config['include_nodes'])
        exclude_nodes = self.parse_node_list(config['exclude_nodes'])

        # Nodes can be in multiple simultaneous states. Only include nodes
        # for which all of their states are in the 'up_states'.
        nodes = [node for node in nodes
                 if all(state in config['up_states']
                        for state in node['State'])]
        if min_nodes > len(nodes):
            raise SchedulerPluginError(
                "Insufficient nodes in up states: {}. Needed {}, found {}.".
                format(up_states, min_nodes,
                       [node['NodeName'] for node in nodes]))

        # Keep only nodes belonging to the requested partition.
        partition = config['partition']
        nodes = [node for node in nodes if partition in node['Partitions']]

        if min_nodes > len(nodes):
            raise SchedulerPluginError('Insufficient nodes in partition '
                                       '{}.'.format(partition))

        if config['immediate'].lower() == 'true':
            # For immediate scheduling, every one of a node's states must
            # also be in the 'avail_states'.
            nodes = [node for node in nodes
                     if all(state in config['avail_states']
                            for state in node['State'])]

            if min_nodes > len(nodes):
                raise SchedulerPluginError(
                    'Insufficient nodes in partition {} and states {}.'.format(
                        partition, config['avail_states']))

        tasks_per_node = config.get('tasks_per_node')
        # When we want all the CPUs, it doesn't matter how many are on a node.
        tasks_per_node = 0 if tasks_per_node == 'all' else int(tasks_per_node)
        nodes = [node for node in nodes if node['CPUTot'] >= tasks_per_node]

        # Drop specifically excluded nodes, then make sure every node the
        # user explicitly asked for survived the filters above.
        nodes = [node for node in nodes
                 if node['NodeName'] not in exclude_nodes]
        node_names = [node['NodeName'] for node in nodes]
        for name in include_nodes:
            if name not in node_names:
                raise SchedulerPluginError(
                    "Specifically requested node '{}', but it was determined "
                    "to be unavailable.".format(name))

        if min_nodes > len(nodes):
            raise SchedulerPluginError(
                'Insufficient nodes with more than {} procs per node available.'
                .format(tasks_per_node))

        return nodes