Example #1
        def send_command(self, command, **kwargs):
            # Mocked SshClient.send_command: assert that the exact qstat
            # pipeline is issued, then return a canned (output, exit_code).
            expected_cmd = 'qstat -i'\
                           ' `echo {} '\
                           '| xargs -n 1 qselect -N` '\
                           '| tail -n+6 '\
                           '| awk \'{{ print $4 "|" $10 }}\''.format(
                               shlex_quote(' '.join(
                                   map(shlex_quote, job_names))))
            self._test_case.assertEqual(command, expected_cmd)
            return """   test_1 | S
test 2   | R\n""", 0
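
For context, a minimal sketch of how a mock like this could drive a unit test of `_get_states_tabular` (Example #4). The `MockSshClient` scaffolding, the availability of the `Torque` class, and the expectation that the parser returns the raw Torque state letters are assumptions for illustration, not the plugin's actual test code:

    import unittest

    class MockSshClient(object):
        # hypothetical stand-in for SshClient with canned qstat output
        def send_command(self, command, wait_result=False):
            return "   test_1 | S\ntest 2   | R\n", 0

    class TabularStatesTest(unittest.TestCase):
        def test_get_states_tabular(self):
            # Torque here is the workload-manager class from Example #4
            states = Torque._get_states_tabular(
                MockSshClient(), ('test_1', 'test 2'), logger=None)
            self.assertEqual(states, {'test_1': 'S', 'test 2': 'R'})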
Example #2
    def _get_states_detailed(self, job_names):
        """
        Get job states by job names

        This function uses the `qstat` command to query PBSPro.
        Please don't launch this call too frequently: polling it
        often, especially across all users on the cluster,
        slows down response times and may bring
        scheduling to a crawl.

        It allows a precise mapping of Torque states to
        Slurm states by taking `exit_code` into account.
        Unlike `_get_states_tabular` it parses the output on this
        host and issues several SSH commands.
        """
        # identify job ids
        # Read the environment, required by some HPC systems (e.g. HLRS Hawk)
        read_environment = "source /etc/profile > /dev/null 2>&1; "
        call = read_environment + "echo {} | xargs -n 1 qselect -x -N".format(
            shlex_quote(' '.join(map(shlex_quote, job_names))))

        client = SshClient(self.credentials)

        output, exit_code = client.execute_shell_command(call,
                                                         workdir=self.workdir,
                                                         wait_result=True)
        job_ids = Pbspro._parse_qselect(output)
        if not job_ids:
            return {}

        # get detailed information about jobs
        call = read_environment + "qstat -x -f {}".format(' '.join(
            map(str, job_ids)))

        output, exit_code = client.execute_shell_command(call,
                                                         workdir=self.workdir,
                                                         wait_result=True)
        client.close_connection()
        try:
            job_states, audits = Pbspro._parse_qstat_detailed(output)
        except SyntaxError as e:
            self.logger.warning(
                "cannot parse state response for job ids=[{}]".format(','.join(
                    map(str, job_ids))))
            self.logger.warning(
                "{err}\n`qstat -x -f` output to parse:\n\\[\n{text}\n\\]".
                format(err=str(e), text=output))
            # TODO: consider whether ignoring the error would be better
            #       for the correct lifecycle
            raise

        return job_states, audits
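
The `_parse_qselect` helper referenced above is not shown in this example; here is a plausible minimal sketch, assuming `qselect -x` prints one job id per line in the form `<number>.<server>` (the `map(str, job_ids)` call above suggests the ids are kept as plain integers):

    @staticmethod
    def _parse_qselect(qselect_output):
        # keep only the numeric prefix of ids such as `12345.pbsserver`
        jobs = qselect_output.splitlines()
        return [int(job.split('.')[0]) for job in jobs if job.strip()]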
Example #3
    def test_identifying_job_ids_call(self):
        """ Call for revealing job ids by job names. """
        job_names = ('test_1', 'test 2')

        # @TODO: replace with _get_jobids_by_name() as soon as the dependency
        #        on the SSH client is removed.
        from croupier_plugin.utilities import shlex_quote
        response = "qstat -i `echo {} | xargs -n 1 qselect -N` |"\
                   " tail -n+6 | awk '{{ print $4 \" \" $1 }}'".format(
                       shlex_quote(' '.join(map(shlex_quote, job_names))))

        self.assertEqual(
            response, 'qstat -i '
            '`echo \'test_1 \'"\'"\'test 2\'"\'"\'\' |'
            ' xargs -n 1 qselect -N` |'
            ' tail -n+6 | awk \'{ print $4 " " $1 }\'')
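
The nested escaping in the expected string comes from applying `shlex_quote` twice: once per job name and once over the joined list. A quick sketch of the two passes (assuming POSIX `shlex.quote` semantics, where a single quote is re-encoded as '"'"'):

    from croupier_plugin.utilities import shlex_quote

    names = ('test_1', 'test 2')
    inner = ' '.join(map(shlex_quote, names))
    print(inner)   # test_1 'test 2'
    outer = shlex_quote(inner)
    print(outer)   # 'test_1 '"'"'test 2'"'"''
    # each single quote inside `inner` becomes '"'"' in `outer`,
    # which is exactly the escaped literal asserted above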
Example #4
    def _get_states_tabular(ssh_client, job_names, logger):
        """
        Get job states by job names

        This function uses the `qstat` command to query Torque.
        Please don't launch this call too frequently: polling it
        often, especially across all users on the cluster,
        slows down response times and may bring
        scheduling to a crawl.

        It invokes `tail`/`awk` to do the simple parsing on the remote HPC.
        """
        # TODO:(emepetres) set start day of consulting
        # @caution This code fails to manage the situation
        #          if several jobs have the same name
        call = "qstat -i `echo {} | xargs -n 1 qselect -N` "\
            "| tail -n+6 | awk '{{ print $4 \"|\" $10 }}'".format(
                shlex_quote(' '.join(map(shlex_quote, job_names))))
        output, exit_code = ssh_client.send_command(call, wait_result=True)

        return Torque._parse_qstat_tabular(output) if exit_code == 0 else {}
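
`_parse_qstat_tabular` itself is not shown; a plausible minimal sketch that splits each `name|state` line and strips the column padding produced by `qstat -i` (the real plugin may additionally map the Torque state letters to Slurm state names):

    @staticmethod
    def _parse_qstat_tabular(qstat_output):
        # sketch only: keep the raw single-letter Torque states
        states = {}
        for line in qstat_output.splitlines():
            if '|' not in line:
                continue
            name, state = (field.strip() for field in line.split('|', 1))
            states[name] = state
        return states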
Example #5
    def send_command(self,
                     command,
                     exec_timeout=3000,
                     read_chunk_timeout=500,
                     wait_result=False):
        """Sends a command and returns stdout, stderr and exitcode"""

        # proceed only if a connection was previously established
        if self._client is not None:

            if self._login_shell:
                cmd = "bash -l -c {}".format(shlex_quote(command))
            else:
                cmd = command
            # there is one channel per command
            stdin, stdout, stderr = self._client.exec_command(
                cmd,
                timeout=exec_timeout)

            if wait_result:
                # get the shared channel for stdout/stderr/stdin
                channel = stdout.channel

                # we do not need stdin
                stdin.close()
                # indicate that we're not going to write to that channel
                channel.shutdown_write()

                # read stdout/stderr in order to prevent read block hangs
                stdout_chunks = []
                stdout_chunks.append(stdout.channel.recv(
                    len(stdout.channel.in_buffer)))
                # chunked read to prevent stalls
                while (not channel.closed
                       or channel.recv_ready()
                       or channel.recv_stderr_ready()):
                    # Stop if channel was closed prematurely,
                    # and there is no data in the buffers.
                    got_chunk = False
                    readq, _, _ = select.select([stdout.channel],
                                                [],
                                                [],
                                                read_chunk_timeout)
                    for c in readq:
                        if c.recv_ready():
                            stdout_chunks.append(stdout.channel.recv(
                                len(c.in_buffer)))
                            got_chunk = True
                        if c.recv_stderr_ready():
                            # make sure to read stderr to prevent stall
                            stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                            got_chunk = True
                    # 1) make sure that there are at least 2 cycles with no
                    #    data in the input buffers in order not to exit too
                    #    early (i.e. cat on a >200k file);
                    # 2) if no data arrived in the last loop, check if we
                    #    already received the exit code;
                    # 3) check if the input buffers are empty;
                    # 4) then exit the loop.
                    if (not got_chunk
                            and stdout.channel.exit_status_ready()
                            and not stderr.channel.recv_stderr_ready()
                            and not stdout.channel.recv_ready()):
                        # Indicate that we're not going to read from
                        # this channel anymore
                        stdout.channel.shutdown_read()
                        # close the channel
                        stdout.channel.close()
                        # Remote side is finished & our buffers are empty
                        break

            # close all the pseudofiles
            stdout.close()
            stderr.close()

            if wait_result:
                # the exit code is always ready at this point
                exit_code = stdout.channel.recv_exit_status()
                output = ''.join(stdout_chunks)  # TODO: also surface stderr
                return (output, exit_code)
            else:
                return True
        else:
            if wait_result:
                return (None, None)
            else:
                return False
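
For illustration, typical usage of the two modes; the `credentials` object is assumed to be whatever `SshClient` accepts (as in Example #2):

    client = SshClient(credentials)

    # fire-and-forget: True means the command was dispatched
    dispatched = client.send_command('touch /tmp/marker')

    # blocking: drains stdout through the chunked select loop above and
    # yields (output, exit_code) once the remote side closes the channel
    output, exit_code = client.send_command('qstat -Q', wait_result=True)
    client.close_connection()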
Example #6
    def _build_job_submission_call(self, name, job_settings, logger):
        # check input information correctness
        if not isinstance(job_settings, dict) or \
                not isinstance(name, basestring):
            return {'error': "Incorrect inputs"}

        if 'type' not in job_settings or 'command' not in job_settings:
            return {
                'error': "'type' and 'command' must be defined in job settings"
            }

        if job_settings['type'] != 'SBATCH':
            return {
                'error': "Job type '" + job_settings['type'] +
                         "' not supported. Torque supports only batched jobs."
            }

        # Build single line command
        torque_call = ''

        # NOTE an uploaded script could also be interesting to execute
        if 'pre' in job_settings:
            for entry in job_settings['pre']:
                torque_call += entry + '; '

        # ################### Torque settings ###################
        # qsub command plus job name
        torque_call += "qsub -V -N {}".format(shlex_quote(name))

        resources_request = ""
        if 'nodes' in job_settings:
            resources_request = "nodes={}".format(job_settings['nodes'])

            # number of cores requested per node
            if 'tasks_per_node' in job_settings:
                resources_request += ':ppn={}'.format(
                    job_settings['tasks_per_node'])
        else:
            if 'tasks_per_node' in job_settings:
                logger.error(
                    "'tasks_per_node' is specified while 'nodes' is not")

        if 'max_time' in job_settings:
            if len(resources_request) > 0:
                resources_request += ','
            resources_request += 'walltime={}'.format(job_settings['max_time'])

        if len(resources_request) > 0:
            torque_call += ' -l {}'.format(resources_request)

        # more precisely is it a destination [queue][@server]
        if 'queue' in job_settings:
            torque_call += " -q {}".format(shlex_quote(job_settings['queue']))

        if 'rerunnable' in job_settings:  # same to requeue in SLURM
            torque_call += " -r {}".format(
                'y' if job_settings['rerunnable'] else 'n')

        if 'work_dir' in job_settings:
            torque_call += " -w {}".format(
                shlex_quote(job_settings['work_dir']))

        additional_attributes = {}
        if 'group_name' in job_settings:
            additional_attributes["group_list"] = shlex_quote(
                job_settings['group_name'])

        if len(additional_attributes) > 0:
            torque_call += " -W {}".format(','.join(
                "{0}={1}".format(k, v)
                for k, v in additional_attributes.iteritems()))

        # if 'tasks' in job_settings:
        #     torque_call += ' -n ' + str(job_settings['tasks'])
        # #######################################################

        response = {}
        if 'scale' in job_settings and \
                int(job_settings['scale']) > 1:
            # set the max number of parallel jobs
            scale_max = int(job_settings['scale'])
            # set the job array; Torque uses -t (matching $PBS_ARRAYID below)
            torque_call += ' -t 0-{}'.format(scale_max - 1)
            if 'scale_max_in_parallel' in job_settings and \
                    int(job_settings['scale_max_in_parallel']) > 0:
                torque_call += '%{}'.format(
                    job_settings['scale_max_in_parallel'])
                scale_max = job_settings['scale_max_in_parallel']
            # map the orchestrator scaling variables into the job script
            scale_env_mapping_call = \
                "sed -i '/# DYNAMIC VARIABLES/a\\" \
                "SCALE_INDEX=$PBS_ARRAYID\\n" \
                "SCALE_COUNT={scale_count}\\n" \
                "SCALE_MAX={scale_max}' {command}".format(
                    scale_count=job_settings['scale'],
                    scale_max=scale_max,
                    command=job_settings['command'].split()[0])  # file only
            response['scale_env_mapping_call'] = scale_env_mapping_call

        # add executable and arguments
        torque_call += ' {}'.format(job_settings['command'])

        # NOTE an uploaded script could also be interesting to execute
        if 'post' in job_settings:
            torque_call += '; '
            for entry in job_settings['post']:
                torque_call += entry + '; '

        response['call'] = torque_call
        return response
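
For illustration, a settings dict like the one below would yield the call shown in the comment; the `torque` instance and `logger` are assumed to exist:

    job_settings = {
        'type': 'SBATCH',
        'command': 'run_sim.sh input.dat',
        'nodes': 2,
        'tasks_per_node': 4,
        'max_time': '00:30:00',
        'queue': 'batch',
    }
    response = torque._build_job_submission_call('sim', job_settings, logger)
    # response['call'] ==
    #   'qsub -V -N sim -l nodes=2:ppn=4,walltime=00:30:00 -q batch'
    #   ' run_sim.sh input.dat'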
Example #7
    def _build_job_cancellation_call(self, name, job_settings, logger):
        return "qselect -N {} | xargs qdel".format(shlex_quote(name))
Example #8
    def _parse_job_settings(self, job_id, job_settings, script=False):
        _settings = {'data': ''}
        if script:
            _prefix = '#PBS'
            _suffix = '\n'
        else:
            _prefix = ''
            _suffix = ''

        # TODO: write for script (prefix, suffix ??)

        if not script:
            # qsub command plus job name
            _settings['data'] += "qsub -V -N {}".format(shlex_quote(job_id))

        # Check if exists and has content
        def _check_job_settings_key(key):
            return key in job_settings and str(job_settings[key]).strip()

        def _add_setting(option, value, op_separator=' '):
            _settings['data'] += '{} {}{}{}{}'.format(_prefix, option,
                                                      op_separator, value,
                                                      _suffix)

        if not _check_job_settings_key('nodes') and \
                _check_job_settings_key('tasks_per_node'):
            return {
                'error': "Specified 'tasks_per_node' while "
                         "'nodes' is not specified"
            }

        if _check_job_settings_key('nodes'):
            node_request = "nodes={}".format(job_settings['nodes'])

            # number of cores requested per node
            # TODO If tasks and no tasks_per_node, then
            # tasks_per_node = tasks/nodes
            if _check_job_settings_key('tasks_per_node'):
                node_request += ':ppn={}'.format(
                    job_settings['tasks_per_node'])

            _add_setting('-l', node_request)

        if _check_job_settings_key('max_time'):
            _add_setting('-l', 'walltime={}'.format(job_settings['max_time']))

        if _check_job_settings_key('queue') or \
                _check_job_settings_key('partition'):
            if _check_job_settings_key('queue'):
                queue = job_settings['queue']
            else:
                queue = job_settings['partition']
            _add_setting('-q', shlex_quote(queue))

        if _check_job_settings_key('memory'):
            _add_setting('-l', 'mem={}'.format(job_settings['memory']))

        if _check_job_settings_key('mail_user'):
            _add_setting('-M', job_settings['mail_user'])

        # FIXME make slurm and torque compatible
        # a (aborted)
        # b (when it begins)
        # e (when it ends)
        # f (when it terminates with a non-zero exit code)
        if _check_job_settings_key('mail_type'):
            _add_setting('-m', job_settings['mail_type'])

        if _check_job_settings_key('account'):
            _add_setting('-A', job_settings['account'])

        if _check_job_settings_key('stderr_file'):
            _add_setting('-e', job_settings['stderr_file'])
        else:
            _add_setting('-e', job_id + '.err')

        if _check_job_settings_key('stdout_file'):
            _add_setting('-o', job_settings['stdout_file'])
        else:
            _add_setting('-o', job_id + '.out')

        additional_attributes = {}
        if 'group_name' in job_settings:
            additional_attributes["group_list"] = shlex_quote(
                job_settings['group_name'])
        # TODO: additional_attributes is collected but never appended to the
        #       call (cf. the '-W' handling in Example #6)

        # add scale, executable and arguments
        if not script:
            if 'scale' in job_settings and \
                    int(job_settings['scale']) > 1:
                # set the job array
                _settings['data'] += ' -t 0-{}'.format(
                    int(job_settings['scale']) - 1)
                if 'scale_max_in_parallel' in job_settings and \
                        int(job_settings['scale_max_in_parallel']) > 0:
                    _settings['data'] += '%{}'.format(
                        job_settings['scale_max_in_parallel'])

            _settings['data'] += ' ' + job_settings['script']
            if _check_job_settings_key('arguments'):
                args = ' '.join(job_settings['arguments'])
                _settings['data'] += ' -F "{}"'.format(args)
            _settings['data'] += '; '

        return _settings
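
With `script=True` every setting is emitted as a `#PBS` directive instead of a command-line flag; a hypothetical settings dict would render roughly as follows (illustrative, assuming the key fixes above):

    job_settings = {
        'nodes': 2,
        'tasks_per_node': 4,
        'max_time': '01:00:00',
        'queue': 'batch',
    }
    result = torque._parse_job_settings('job42', job_settings, script=True)
    # result['data'] ==
    #   '#PBS -l nodes=2:ppn=4\n'
    #   '#PBS -l walltime=01:00:00\n'
    #   '#PBS -q batch\n'
    #   '#PBS -e job42.err\n'
    #   '#PBS -o job42.out\n'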