Example #1
0
    def _parse_submit_output(self, retval, stdout, stderr):
        """
        Parse the output of the submit command, as returned by executing the
        command returned by _get_submit_command command.

        To be implemented by the plugin.

        Return a string with the JobID.
        """
        if retval != 0:
            self.logger.error('Error in _parse_submit_output: retval={}; '
                              'stdout={}; stderr={}'.format(
                                  retval, stdout, stderr))
            raise SchedulerError('Error during submission, retval={}\n'
                                 'stdout={}\nstderr={}'.format(
                                     retval, stdout, stderr))

        try:
            transport_string = ' for {}'.format(self.transport)
        except SchedulerError:
            transport_string = ''

        if stderr.strip():
            self.logger.warning('in _parse_submit_output{}: '
                                'there was some text in stderr: {}'.format(
                                    transport_string, stderr))

        try:
            return stdout.strip().split('Job <')[1].split('>')[0]
        except IndexError:
            raise SchedulerParsingError(
                'Cannot parse submission output: {}'.format(stdout))
Example #2
0
    def _parse_submit_output(self, retval, stdout, stderr):
        """
        Parse the output of the submit command, as returned by executing the
        command returned by _get_submit_command command.

        To be implemented by the plugin.

        Return a string with the JobID.
        """
        if retval != 0:
            self.logger.error(f'Error in _parse_submit_output: retval={retval}; stdout={stdout}; stderr={stderr}')
            raise SchedulerError(f'Error during submission, retval={retval}\nstdout={stdout}\nstderr={stderr}')

        try:
            transport_string = f' for {self.transport}'
        except SchedulerError:
            transport_string = ''

        if stderr.strip():
            self.logger.warning(f'in _parse_submit_output{transport_string}: there was some text in stderr: {stderr}')

        try:
            return stdout.strip().split('Job <')[1].split('>')[0]
        except IndexError as exc:
            raise SchedulerParsingError(f'Cannot parse submission output: `{stdout}`') from exc
def parse_sge_script(local_script_path):
    """
    Parse an SGE job script and collect its scheduler directives.

    :param local_script_path: path to the SGE script on the local filesystem.
    :returns: A dictionary of the options for constructing AiiDAJobFirework
    :raises SchedulerParsingError: if 'job_name', 'mpinp' or 'walltime'
        cannot be found in the script.
    """

    options = {
        'stdout_fname': '_scheduler-stdout.txt',
        'stderr_fname': '_scheduler-stderr.txt',
        # Base priority of AiiDA jobs in the FW system, hard coded to 100 for now
        'priority': 100,
    }

    with open(local_script_path) as handle:
        script_lines = handle.readlines()

    # Scan every line; a single line may match more than one directive.
    for current in script_lines:
        if '#$ -N' in current:
            # Name of the job
            options['job_name'] = current.split()[-1]
        if '#$ -o' in current:
            options['stdout_fname'] = current.replace("#$ -o", "").strip()
        if '#$ -e' in current:
            options['stderr_fname'] = current.replace("#$ -e", "").strip()
        if '#$ -pe' in current:
            options['mpinp'] = int(current.split()[-1])
        if 'h_rt' in current:
            # Wallclock limit given as HH:MM:SS; convert to seconds.
            hours, minutes, seconds = current.split('=')[1].strip().split(':')
            options['walltime'] = int(seconds) + int(minutes) * 60 + int(hours) * 3600

        if '#$ -p ' in current:
            # Priority directives are added on top of the base priority.
            options['priority'] += int(current.split()[-1])

    missing = [
        field for field in ('job_name', 'mpinp', 'walltime')
        if field not in options
    ]
    if missing:
        raise SchedulerParsingError(
            f"Missing fields: {missing} while parsing the job script")

    return options
Example #4
0
    def _parse_joblist_output(self, retval, stdout, stderr):
        # pylint: disable=too-many-statements,too-many-branches
        """
        Parse the output of the joblist command (SGE ``qstat`` XML output)
        and return a list of ``JobInfo`` objects, one per ``job_list``
        element found in the XML.

        :param retval: exit code of the joblist command.
        :param stdout: standard output of the command, expected to be XML.
        :param stderr: standard error of the command.
        :return: list of ``JobInfo`` instances.
        :raises SchedulerError: if retval is non-zero, if stdout is empty,
            or if the XML lacks the expected 'queue_info'/'job_info'
            elements.
        :raises SchedulerParsingError: if stdout is not well-formed XML.
        """
        if retval != 0:
            self.logger.error(
                f'Error in _parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}'
            )
            raise SchedulerError(
                f'Error during joblist retrieval, retval={retval}')

        if stderr.strip():
            self.logger.warning(
                f'in _parse_joblist_output for {str(self.transport)}: there was some text in stderr: {stderr}'
            )

        if stdout:
            try:
                xmldata = xml.dom.minidom.parseString(stdout)
            except xml.parsers.expat.ExpatError:
                self.logger.error(
                    f'in sge._parse_joblist_output: xml parsing of stdout failed: {stdout}'
                )
                raise SchedulerParsingError(
                    'Error during joblist retrieval, xml parsing of stdout failed'
                )
        else:
            self.logger.error(
                f'Error in sge._parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}'
            )
            raise SchedulerError(
                'Error during joblist retrieval, no stdout produced')

        # Sanity check on the top-level structure: the root element must have
        # both a 'queue_info' and a 'job_info' child element.
        try:
            first_child = xmldata.firstChild
            second_childs = first_child.childNodes
            # nodeType == 1 keeps ELEMENT_NODE children only (skips text nodes)
            tag_names_sec = [elem.tagName for elem in second_childs \
                             if elem.nodeType == 1]
            if 'queue_info' not in tag_names_sec:
                self.logger.error(
                    f'Error in sge._parse_joblist_output: no queue_info: {stdout}'
                )
                raise SchedulerError
            if 'job_info' not in tag_names_sec:
                self.logger.error(
                    f'Error in sge._parse_joblist_output: no job_info: {stdout}'
                )
                raise SchedulerError
        except SchedulerError:
            self.logger.error(
                f'Error in sge._parse_joblist_output: stdout={stdout}')
            raise SchedulerError('Error during xml processing, of stdout:'
                                 "There is no 'job_info' or no 'queue_info'"
                                 'element or there are no jobs!')
        # If something weird happens while firstChild, pop, etc:
        except Exception:
            self.logger.error(
                f'Error in sge._parse_joblist_output: stdout={stdout}')
            raise SchedulerError('Error during xml processing, of stdout')

        jobs = list(first_child.getElementsByTagName('job_list'))
        # jobs = [i for i in jobinfo.getElementsByTagName('job_list')]
        # print [i[0].childNodes[0].data for i in job_numbers if i]
        joblist = []
        for job in jobs:
            this_job = JobInfo()

            # In case the user needs more information the xml-data for
            # each job is stored:
            this_job.raw_data = job.toxml()

            # The job id is mandatory: fail hard if it is missing or empty.
            try:
                job_element = job.getElementsByTagName('JB_job_number').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_id = str(element_child.data).strip()
                if not this_job.job_id:
                    raise SchedulerError
            except SchedulerError:
                self.logger.error(
                    f'Error in sge._parse_joblist_output:no job id is given, stdout={stdout}'
                )
                raise SchedulerError(
                    'Error in sge._parse_joblist_output: no job id is given')
            except IndexError:
                self.logger.error("No 'job_number' given for job index {} in "
                                  'job list, stdout={}'.format(jobs.index(job) \
                                                               , stdout))
                raise IndexError(
                    'Error in sge._parse_joblist_output: no job id is given')

            # Map the scheduler state string onto an AiiDA JobState; unknown
            # or missing states fall back to UNDETERMINED.
            try:
                job_element = job.getElementsByTagName('state').pop(0)
                element_child = job_element.childNodes.pop(0)
                job_state_string = str(element_child.data).strip()
                try:
                    this_job.job_state = _MAP_STATUS_SGE[job_state_string]
                except KeyError:
                    self.logger.warning("Unrecognized job_state '{}' for job "
                                        'id {}'.format(job_state_string,
                                                       this_job.job_id))
                    this_job.job_state = JobState.UNDETERMINED
            except IndexError:
                self.logger.warning("No 'job_state' field for job id {} in"
                                    'stdout={}'.format(this_job.job_id,
                                                       stdout))
                this_job.job_state = JobState.UNDETERMINED

            # The remaining fields are optional: a missing element only
            # produces a warning (or nothing at all).
            try:
                job_element = job.getElementsByTagName('JB_owner').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_owner = str(element_child.data).strip()
            except IndexError:
                self.logger.warning(
                    f"No 'job_owner' field for job id {this_job.job_id}")

            try:
                job_element = job.getElementsByTagName('JB_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.title = str(element_child.data).strip()
            except IndexError:
                self.logger.warning(
                    f"No 'title' field for job id {this_job.job_id}")

            # Only warn about a missing queue name when the job is running.
            try:
                job_element = job.getElementsByTagName('queue_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.queue_name = str(element_child.data).strip()
            except IndexError:
                if this_job.job_state == JobState.RUNNING:
                    self.logger.warning(
                        f"No 'queue_name' field for job id {this_job.job_id}")

            # 'JB_submission_time' may be absent; fall back to
            # 'JAT_start_time' (presumably reported for running jobs --
            # confirm against qstat XML schema).
            try:
                job_element = job.getElementsByTagName(
                    'JB_submission_time').pop(0)
                element_child = job_element.childNodes.pop(0)
                time_string = str(element_child.data).strip()
                try:
                    this_job.submission_time = self._parse_time_string(
                        time_string)
                except ValueError:
                    self.logger.warning(
                        f"Error parsing 'JB_submission_time' for job id {this_job.job_id} ('{time_string}')"
                    )
            except IndexError:
                try:
                    job_element = job.getElementsByTagName(
                        'JAT_start_time').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    time_string = str(element_child.data).strip()
                    try:
                        this_job.dispatch_time = self._parse_time_string(
                            time_string)
                    except ValueError:
                        self.logger.warning(
                            f"Error parsing 'JAT_start_time'for job id {this_job.job_id} ('{time_string}')"
                        )
                except IndexError:
                    self.logger.warning("No 'JB_submission_time' and no "
                                        "'JAT_start_time' field for job "
                                        'id {}'.format(this_job.job_id))

            # There is also cpu_usage, mem_usage, io_usage information available:
            if this_job.job_state == JobState.RUNNING:
                try:
                    job_element = job.getElementsByTagName('slots').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    this_job.num_mpiprocs = str(element_child.data).strip()
                except IndexError:
                    self.logger.warning(
                        f"No 'slots' field for job id {this_job.job_id}")

            joblist.append(this_job)
        # self.logger.debug("joblist final: {}".format(joblist))
        return joblist
Example #5
0
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command (qstat -f).

        Return a list of JobInfo objects, one of each job,
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may
            either appear here, or not.
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.

        :param retval: exit code of the qstat command (deliberately not
            checked on its own, see comment below).
        :param stdout: standard output of ``qstat -f``.
        :param stderr: standard error of the command.
        :return: list of ``JobInfo`` objects.
        :raises SchedulerError: if the filtered stderr is non-empty and
            retval is non-zero.
        :raises SchedulerParsingError: if the stdout stanza format is not
            understood.
        """

        # I don't raise because if I pass a list of jobs, I get a non-zero status
        # if one of the job is not in the list anymore

        # retval should be zero
        # if retval != 0:
        # _LOGGER.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output
        # but I strip lines containing "Unknown Job Id", that happens
        # also when I ask for a calculation that has finished
        #
        # I also strip for "Job has finished" because this happens for
        # those schedulers configured to leave the job in the output
        # of qstat for some time after job completion.
        filtered_stderr = '\n'.join(
            l for l in stderr.split('\n')
            if 'Unknown Job Id' not in l and 'Job has finished' not in l)
        if filtered_stderr.strip():
            _LOGGER.warning('Warning in _parse_joblist_output, non-empty '
                            "(filtered) stderr='{}'".format(filtered_stderr))
            if retval != 0:
                raise SchedulerError('Error during qstat parsing, retval={}\n'
                                     'stdout={}\nstderr={}'.format(
                                         retval, stdout, stderr))

        jobdata_raw = []  # will contain raw data parsed from qstat output
        # Get raw data and split in lines
        for line_num, line in enumerate(stdout.split('\n'), start=1):
            # Each new job stanza starts with the string 'Job Id:': I
            # create a new item in the jobdata_raw list
            if line.startswith('Job Id:'):
                jobdata_raw.append({
                    'id': line.split(':', 1)[1].strip(),
                    'lines': [],
                    'warning_lines_idx': []
                })
                # warning_lines_idx: lines that do not start either with
                # tab or space
            else:
                if line.strip():
                    # This is a non-empty line, therefore it is an attribute
                    # of the last job found
                    if not jobdata_raw:
                        # The list is still empty! (This means that I found a
                        # non-empty line, before finding the first 'Job Id:'
                        # string: it is an error. However this may happen
                        # only before the first job.
                        raise SchedulerParsingError(
                            'I did not find the header for the first job')
                        # _LOGGER.warning("I found some text before the "
                        # "first job: {}".format(l))
                    else:
                        if line.startswith(' '):
                            # If it starts with a space, it is a new field
                            jobdata_raw[-1]['lines'].append(line)
                        elif line.startswith('\t'):
                            # If a line starts with a TAB,
                            # I append to the previous string
                            # stripping the TAB
                            if not jobdata_raw[-1]['lines']:
                                raise SchedulerParsingError(
                                    'Line {} is the first line of the job, but it '
                                    'starts with a TAB! ({})'.format(
                                        line_num, line))
                            jobdata_raw[-1]['lines'][-1] += line[1:]
                        else:
                            # raise SchedulerParsingError(
                            #    "Wrong starting character at line {}! ({})"
                            #    "".format(line_num, l))
                            ## For some reasons, the output of 'comment' and
                            ## 'Variable_List', for instance, can have
                            ## newlines if they are included... # I do a
                            ## workaround
                            jobdata_raw[-1]['lines'][-1] += '\n{}'.format(line)
                            jobdata_raw[-1]['warning_lines_idx'].append(
                                len(jobdata_raw[-1]['lines']) - 1)

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:
            this_job = JobInfo()
            this_job.job_id = job['id']

            lines_without_equals_sign = [
                i for i in job['lines'] if '=' not in i
            ]

            # There are lines without equals sign: this is bad
            if lines_without_equals_sign:
                # Should I only warn?
                _LOGGER.error('There are lines without equals sign! {}'.format(
                    lines_without_equals_sign))
                raise SchedulerParsingError(
                    'There are lines without equals sign.')

            # Build a key=value mapping: keys are lowercased and stripped,
            # values keep their internal content (only left-stripped).
            raw_data = {
                i.split('=', 1)[0].strip().lower(): i.split('=',
                                                            1)[1].lstrip()
                for i in job['lines'] if '=' in i
            }

            ## I ignore the errors for the time being - this seems to be
            ## a problem if there are \n in the content of some variables?
            ## I consider this a workaround...
            # for line_with_warning in set(job['warning_lines_idx']):
            #    if job['lines'][line_with_warning].split(
            #        '=',1)[0].strip().lower() != "comment":
            #        raise SchedulerParsingError(
            #            "Wrong starting character in one of the lines "
            #            "of job {}, and it's not a comment! ({})"
            #            "".format(this_job.job_id,
            #                      job['lines'][line_with_warning]))

            problematic_fields = []
            for line_with_warning in set(job['warning_lines_idx']):
                problematic_fields.append(
                    job['lines'][line_with_warning].split(
                        '=', 1)[0].strip().lower())
            if problematic_fields:
                # These are the fields that contain unexpected newlines
                raw_data['warning_fields_with_newlines'] = problematic_fields

            # I believe that exit_status and terminating_signal cannot be
            # retrieved from the qstat -f output.

            # I wrap calls in try-except clauses to avoid errors if a field
            # is missing
            try:
                this_job.title = raw_data['job_name']
            except KeyError:
                _LOGGER.debug("No 'job_name' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.annotation = raw_data['comment']
            except KeyError:
                # Many jobs do not have a comment; I do not complain about it.
                pass
                # _LOGGER.debug("No 'comment' field for job id {}".format(
                #    this_job.job_id))

            # Map the scheduler state onto an AiiDA JobState; unknown or
            # missing states fall back to UNDETERMINED.
            try:
                job_state_string = raw_data['job_state']
                try:
                    this_job.job_state = self._map_status[job_state_string]
                except KeyError:
                    _LOGGER.warning("Unrecognized job_state '{}' for job "
                                    'id {}'.format(job_state_string,
                                                   this_job.job_id))
                    this_job.job_state = JobState.UNDETERMINED
            except KeyError:
                _LOGGER.debug("No 'job_state' field for job id {}".format(
                    this_job.job_id))
                this_job.job_state = JobState.UNDETERMINED

            try:
                this_job.job_substate = raw_data['substate']
            except KeyError:
                _LOGGER.debug("No 'substate' field for job id {}".format(
                    this_job.job_id))

            try:
                exec_hosts = raw_data['exec_host'].split('+')
            except KeyError:
                # No exec_host information found (it may be ok, if the job
                # is not running)
                pass
            else:
                # parse each host; syntax, from the man page:
                # hosta/J1+hostb/J2*P+...
                # where  J1 and J2 are an index of the job
                # on the named host and P is the number of
                # processors allocated from that host to this job.
                # P does not appear if it is 1.
                try:

                    exec_host_list = []
                    for exec_host in exec_hosts:
                        node = MachineInfo()
                        node.name, data = exec_host.split('/')
                        data = data.split('*')
                        if len(data) == 1:
                            node.jobIndex = int(data[0])
                            node.num_cpus = 1
                        elif len(data) == 2:
                            node.jobIndex = int(data[0])
                            node.num_cpus = int(data[1])
                        else:
                            raise ValueError(
                                'Wrong number of pieces: {} '
                                'instead of 1 or 2 in exec_hosts: '
                                '{}'.format(len(data), exec_hosts))
                        exec_host_list.append(node)
                    this_job.allocated_machines = exec_host_list
                except Exception as exc:
                    # Best-effort: a malformed exec_host list is only logged,
                    # the job entry is still returned.
                    _LOGGER.debug('Problem parsing the node names, I '
                                  'got Exception {} with message {}; '
                                  'exec_hosts was {}'.format(
                                      str(type(exc)), exc, exec_hosts))

            try:
                # I strip the part after the @: is this always ok?
                this_job.job_owner = raw_data['job_owner'].split('@')[0]
            except KeyError:
                _LOGGER.debug("No 'job_owner' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.num_cpus = int(raw_data['resource_list.ncpus'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.ncpus' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning("'resource_list.ncpus' is not an integer "
                                '({}) for job id {}!'.format(
                                    raw_data['resource_list.ncpus'],
                                    this_job.job_id))

            try:
                this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.mpiprocs' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning("'resource_list.mpiprocs' is not an integer "
                                '({}) for job id {}!'.format(
                                    raw_data['resource_list.mpiprocs'],
                                    this_job.job_id))

            try:
                this_job.num_machines = int(raw_data['resource_list.nodect'])
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.nodect' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning("'resource_list.nodect' is not an integer "
                                '({}) for job id {}!'.format(
                                    raw_data['resource_list.nodect'],
                                    this_job.job_id))

            # Double check of redundant info
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(
                        set(machine.name for machine in this_job.
                            allocated_machines)) != this_job.num_machines:
                    _LOGGER.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

            try:
                this_job.queue_name = raw_data['queue']
            except KeyError:
                _LOGGER.debug("No 'queue' field for job id {}".format(
                    this_job.job_id))

            try:
                # NOTE(review): camelCase attribute name is inconsistent with
                # the other fields -- presumably what JobInfo expects here;
                # confirm before renaming.
                this_job.RequestedWallclockTime = (self._convert_time(
                    raw_data['resource_list.walltime']))
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.walltime' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning(
                    "Error parsing 'resource_list.walltime' for job id {}".
                    format(this_job.job_id))

            try:
                this_job.wallclock_time_seconds = (self._convert_time(
                    raw_data['resources_used.walltime']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                _LOGGER.warning(
                    "Error parsing 'resources_used.walltime' for job id {}".
                    format(this_job.job_id))

            try:
                this_job.cpu_time = (self._convert_time(
                    raw_data['resources_used.cput']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                _LOGGER.warning(
                    "Error parsing 'resources_used.cput' for job id {}".format(
                        this_job.job_id))

            #
            # ctime: The time that the job was created
            # mtime: The time that the job was last modified, changed state,
            #        or changed locations.
            # qtime: The time that the job entered the current queue
            # stime: The time when the job started execution.
            # etime: The time that the job became eligible to run, i.e. in a
            #        queued state while residing in an execution queue.

            try:
                this_job.submission_time = self._parse_time_string(
                    raw_data['ctime'])
            except KeyError:
                _LOGGER.debug("No 'ctime' field for job id {}".format(
                    this_job.job_id))
            except ValueError:
                _LOGGER.warning("Error parsing 'ctime' for job id {}".format(
                    this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(
                    raw_data['stime'])
            except KeyError:
                # The job may not have been started yet
                pass
            except ValueError:
                _LOGGER.warning("Error parsing 'stime' for job id {}".format(
                    this_job.job_id))

            # TODO: see if we want to set also finish_time for finished jobs,
            # if there are any

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = raw_data

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list