コード例 #1
0
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command (qstat -f).

        Return a list of JobInfo objects, one of each job,
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may
            either appear here, or not.
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        # I don't raise because if I pass a list of jobs, I get a non-zero status
        # if one of the job is not in the list anymore

        # retval should be zero
        #if retval != 0:
        #self.logger.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output
        # but I strip lines containing "Unknown Job Id", that happens
        # also when I ask for a calculation that has finished
        #
        # I also strip for "Job has finished" because this happens for
        # those schedulers configured to leave the job in the output
        # of qstat for some time after job completion.
        filtered_stderr = '\n'.join(
            l for l in stderr.split('\n') if "Unknown Job Id" not in l and "Job has finished" not in l)
        if filtered_stderr.strip():
            self.logger.warning("Warning in _parse_joblist_output, non-empty "
                                "(filtered) stderr='{}'".format(filtered_stderr))
            if retval != 0:
                raise SchedulerError(
                    "Error during qstat parsing (_parse_joblist_output function)")

        jobdata_raw = []  # will contain raw data parsed from qstat output
        # Get raw data and split in lines
        for line_num, l in enumerate(stdout.split('\n'), start=1):
            # Each new job stanza starts with the string 'Job Id:': I
            # create a new item in the jobdata_raw list
            if l.startswith('Job Id:'):
                jobdata_raw.append(
                    {'id': l.split(':', 1)[1].strip(),
                     'lines': [], 'warning_lines_idx': []})
                # warning_lines_idx: lines that do not start either with
                # tab or space
            else:
                if l.strip():
                    # This is a non-empty line, therefore it is an attribute
                    # of the last job found
                    if not jobdata_raw:
                        # The list is still empty! (This means that I found a
                        # non-empty line, before finding the first 'Job Id:'
                        # string: it is an error. However this may happen
                        # only before the first job.
                        raise SchedulerParsingError("I did not find the header for the first job")
                        #self.logger.warning("I found some text before the "
                        #"first job: {}".format(l))
                    else:
                        if l.startswith(' '):
                            # If it starts with a space, it is a new field
                            jobdata_raw[-1]['lines'].append(l)
                        elif l.startswith('\t'):
                            # If a line starts with a TAB,
                            # I append to the previous string
                            # stripping the TAB
                            if not jobdata_raw[-1]['lines']:
                                raise SchedulerParsingError(
                                    "Line {} is the first line of the job, but it "
                                    "starts with a TAB! ({})".format(line_num, l))
                            jobdata_raw[-1]['lines'][-1] += l[1:]
                        else:
                            #raise SchedulerParsingError(
                            #    "Wrong starting character at line {}! ({})"
                            #    "".format(line_num, l))
                            ## For some reasons, the output of 'comment' and
                            ## 'Variable_List', for instance, can have
                            ## newlines if they are included... # I do a
                            ## workaround
                            jobdata_raw[-1]['lines'][-1] += "\n{}".format(l)
                            jobdata_raw[-1]['warning_lines_idx'].append(
                                len(jobdata_raw[-1]['lines']) - 1)

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:
            this_job = JobInfo()
            this_job.job_id = job['id']

            lines_without_equals_sign = [i for i in job['lines']
                                         if '=' not in i]

            # There are lines without equals sign: this is bad
            if lines_without_equals_sign:
                # Should I only warn?
                self.logger.error("There are lines without equals sign! {}"
                                  "".format(lines_without_equals_sign))
                raise (SchedulerParsingError("There are lines without equals "
                                             "sign."))

            raw_data = {i.split('=', 1)[0].strip().lower():
                            i.split('=', 1)[1].lstrip()
                        for i in job['lines'] if '=' in i}

            ## I ignore the errors for the time being - this seems to be
            ## a problem if there are \n in the content of some variables?
            ## I consider this a workaround...
            #for line_with_warning in set(job['warning_lines_idx']):
            #    if job['lines'][line_with_warning].split(
            #        '=',1)[0].strip().lower() != "comment":
            #        raise SchedulerParsingError(
            #            "Wrong starting character in one of the lines "
            #            "of job {}, and it's not a comment! ({})"
            #            "".format(this_job.job_id,
            #                      job['lines'][line_with_warning]))

            problematic_fields = []
            for line_with_warning in set(job['warning_lines_idx']):
                problematic_fields.append(job['lines'][line_with_warning].split(
                    '=', 1)[0].strip().lower())
            if problematic_fields:
                # These are the fields that contain unexpected newlines
                raw_data['warning_fields_with_newlines'] = problematic_fields

            # I believe that exit_status and terminating_signal cannot be
            # retrieved from the qstat -f output.

            # I wrap calls in try-except clauses to avoid errors if a field
            # is missing
            try:
                this_job.title = raw_data['job_name']
            except KeyError:
                self.logger.debug("No 'job_name' field for job id "
                                  "{}".format(this_job.job_id))

            try:
                this_job.annotation = raw_data['comment']
            except KeyError:
                # Many jobs do not have a comment; I do not complain about it.
                pass
                #self.logger.debug("No 'comment' field for job id {}".format(
                #    this_job.job_id))

            try:
                job_state_string = raw_data['job_state']
                try:
                    this_job.job_state = self._map_status[job_state_string]
                except KeyError:
                    self.logger.warning("Unrecognized job_state '{}' for job "
                                        "id {}".format(job_state_string,
                                                       this_job.job_id))
                    this_job.job_state = job_states.UNDETERMINED
            except KeyError:
                self.logger.debug("No 'job_state' field for job id {}".format(
                    this_job.job_id))
                this_job.job_state = job_states.UNDETERMINED

            try:
                this_job.job_substate = raw_data['substate']
            except KeyError:
                self.logger.debug("No 'substate' field for job id {}".format(
                    this_job.job_id))

            try:
                exec_hosts = raw_data['exec_host'].split('+')
            except KeyError:
                # No exec_host information found (it may be ok, if the job
                # is not running)
                pass
            else:
                # parse each host; syntax, from the man page:
                # hosta/J1+hostb/J2*P+...
                # where  J1 and J2 are an index of the job
                # on the named host and P is the number of
                # processors allocated from that host to this job.
                # P does not appear if it is 1.
                try:
                    exec_host_list = []
                    for exec_host in exec_hosts:
                        node = MachineInfo()
                        node.name, data = exec_host.split('/')
                        data = data.split('*')
                        if len(data) == 1:
                            node.jobIndex = int(data[0])
                            node.num_cpus = 1
                        elif len(data) == 2:
                            node.jobIndex = int(data[0])
                            node.num_cpus = int(data[1])
                        else:
                            raise ValueError("Wrong number of pieces: {} "
                                             "instead of 1 or 2 in exec_hosts: "
                                             "{}".format(len(data), exec_hosts))
                        exec_host_list.append(node)
                    this_job.allocated_machines = exec_host_list
                except Exception as e:
                    self.logger.debug("Problem parsing the node names, I "
                                      "got Exception {} with message {}; "
                                      "exec_hosts was {}".format(
                        str(type(e)), e.message, exec_hosts))

            try:
                # I strip the part after the @: is this always ok?
                this_job.job_owner = raw_data['job_owner'].split('@')[0]
            except KeyError:
                self.logger.debug("No 'job_owner' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.num_cpus = int(raw_data['resource_list.ncpus'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                self.logger.debug("No 'resource_list.ncpus' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("'resource_list.ncpus' is not an integer "
                                    "({}) for job id {}!".format(
                    raw_data['resource_list.ncpus'],
                    this_job.job_id))

            try:
                this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                self.logger.debug("No 'resource_list.mpiprocs' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("'resource_list.mpiprocs' is not an integer "
                                    "({}) for job id {}!".format(
                    raw_data['resource_list.mpiprocs'],
                    this_job.job_id))

            try:
                this_job.num_machines = int(raw_data['resource_list.nodect'])
            except KeyError:
                self.logger.debug("No 'resource_list.nodect' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("'resource_list.nodect' is not an integer "
                                    "({}) for job id {}!".format(
                    raw_data['resource_list.nodect'],
                    this_job.job_id))

            # Double check of redundant info
            if (this_job.allocated_machines is not None and
                        this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                        len(this_job.allocated_machines), this_job.num_machines))

            try:
                this_job.queue_name = raw_data['queue']
            except KeyError:
                self.logger.debug("No 'queue' field for job id "
                                  "{}".format(this_job.job_id))

            try:
                this_job.RequestedWallclockTime = (self._convert_time(
                    raw_data['resource_list.walltime']))
            except KeyError:
                self.logger.debug("No 'resource_list.walltime' field for "
                                  "job id {}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("Error parsing 'resource_list.walltime' "
                                    "for job id {}".format(this_job.job_id))

            try:
                this_job.wallclock_time_seconds = (self._convert_time(
                    raw_data['resources_used.walltime']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing 'resources_used.walltime' "
                                    "for job id {}".format(this_job.job_id))

            try:
                this_job.cpu_time = (self._convert_time(
                    raw_data['resources_used.cput']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing 'resources_used.cput' "
                                    "for job id {}".format(this_job.job_id))

            #
            # ctime: The time that the job was created
            # mtime: The time that the job was last modified, changed state,
            #        or changed locations.
            # qtime: The time that the job entered the current queue
            # stime: The time when the job started execution.
            # etime: The time that the job became eligible to run, i.e. in a
            #        queued state while residing in an execution queue.

            try:
                this_job.submission_time = self._parse_time_string(
                    raw_data['ctime'])
            except KeyError:
                self.logger.debug("No 'ctime' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("Error parsing 'ctime' for job id "
                                    "{}".format(this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(
                    raw_data['stime'])
            except KeyError:
                # The job may not have been started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing 'stime' for job id "
                                    "{}".format(this_job.job_id))

            # TODO: see if we want to set also finish_time for finished jobs,
            # if there are any

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = raw_data

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
コード例 #2
0
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command,
        that is here implemented as a list of lines, one for each
        job, with _field_separator as separator. The order is described
        in the _get_joblist_command function.
        
        Return a list of JobInfo objects, one of each job, 
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may 
            either appear here, or not. 
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        num_fields = len(self.fields)

        # I don't raise because if I pass a list of jobs,
        # I get a non-zero status
        # if one of the job is not in the list anymore
        # retval should be zero
        #if retval != 0:
        #self.logger.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output and
        # there is no line containing "Invalid job id specified", that happens
        # when I ask for specific calculations, and they are all finished
        if stderr.strip() and "Invalid job id specified" not in stderr:
            self.logger.warning("Warning in _parse_joblist_output, non-empty "
                                "stderr='{}'".format(stderr.strip()))
            if retval != 0:
                raise SchedulerError(
                    "Error during squeue parsing (_parse_joblist_output function)"
                )

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I put num_fields, because in this way
        # if the symbol _field_separator appears in the title (that is
        # the last field), I don't split the title.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [
            l.split(_field_separator, num_fields) for l in stdout.splitlines()
            if _field_separator in l
        ]

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)}

            this_job = JobInfo()
            try:
                this_job.job_id = thisjob_dict['job_id']

                this_job.annotation = thisjob_dict['annotation']
                job_state_raw = thisjob_dict['state_raw']
            except KeyError:
                # I skip this calculation if I couldn't find this basic info
                # (I don't append anything to job_list before continuing)
                self.logger.error("Wrong line length in squeue output! '{}'"
                                  "".format(job))
                continue

            try:
                job_state_string = _map_status_slurm[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_raw,
                                                   this_job.job_id))
                job_state_string = job_states.UNDETERMINED
            # QUEUED_HELD states are not specific states in SLURM;
            # they are instead set with state QUEUED, and then the
            # annotation tells if the job is held.
            # I check for 'Dependency', 'JobHeldUser',
            # 'JobHeldAdmin', 'BeginTime'.
            # Other states should not bring the job in QUEUED_HELD, I believe
            # (the man page of slurm seems to be incomplete, for instance
            # JobHeld* are not reported there; I also checked at the source code
            # of slurm 2.6 on github (https://github.com/SchedMD/slurm),
            # file slurm/src/common/slurm_protocol_defs.c,
            # and these seem all the states to be taken into account for the
            # QUEUED_HELD status).
            # There are actually a few others, like possible
            # failures, or partition-related reasons, but for the moment I
            # leave them in the QUEUED state.
            if (job_state_string == job_states.QUEUED
                    and this_job.annotation in [
                        'Dependency', 'JobHeldUser', 'JobHeldAdmin',
                        'BeginTime'
                    ]):
                job_state_string = job_states.QUEUED_HELD

            this_job.job_state = job_state_string

            ####
            # Up to here, I just made sure that there were at least three
            # fields, to set the most important fields for a job.
            # I now check if the length is equal to the number of fields
            if len(job) < num_fields:
                # I store this job only with the information
                # gathered up to now, and continue to the next job
                # Also print a warning
                self.logger.warning("Wrong line length in squeue output!"
                                    "Skipping optional fields. Line: '{}'"
                                    "".format(jobdata_raw))
                # I append this job before continuing
                job_list.append(this_job)
                continue

            # TODO: store executing_host?

            this_job.job_owner = thisjob_dict['username']

            try:
                this_job.num_machines = int(thisjob_dict['number_nodes'])
            except ValueError:
                self.logger.warning("The number of allocated nodes is not "
                                    "an integer ({}) for job id {}!".format(
                                        thisjob_dict['number_nodes'],
                                        this_job.job_id))

            try:
                this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
            except ValueError:
                self.logger.warning("The number of allocated cores is not "
                                    "an integer ({}) for job id {}!".format(
                                        thisjob_dict['number_cpus'],
                                        this_job.job_id))

            # ALLOCATED NODES HERE
            # string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # therefore it requires some parsing, that is unnecessary now.
            # I just store is as a raw string for the moment, and I leave
            # this_job.allocated_machines undefined
            if this_job.job_state == job_states.RUNNING:
                this_job.allocated_machines_raw = thisjob_dict[
                    'allocated_machines']

            this_job.queue_name = thisjob_dict['partition']

            try:
                this_job.requested_wallclock_time_seconds = (
                    self._convert_time(thisjob_dict['time_limit']))
            except ValueError:
                self.logger.warning("Error parsing the time limit "
                                    "for job id {}".format(this_job.job_id))

            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may be not set (in my test, it is set to zero)
            if this_job.job_state == job_states.RUNNING:
                try:
                    this_job.wallclock_time_seconds = (self._convert_time(
                        thisjob_dict['time_used']))
                except ValueError:
                    self.logger.warning("Error parsing time_used "
                                        "for job id {}".format(
                                            this_job.job_id))

                try:
                    this_job.dispatch_time = self._parse_time_string(
                        thisjob_dict['dispatch_time'])
                except ValueError:
                    self.logger.warning("Error parsing dispatch_time for job "
                                        "id {}".format(this_job.job_id))

            try:
                this_job.submission_time = self._parse_time_string(
                    thisjob_dict['submission_time'])
            except ValueError:
                self.logger.warning("Error parsing submission_time for job "
                                    "id {}".format(this_job.job_id))

            this_job.title = thisjob_dict['job_name']

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful now, allocated_machines in this
            # version of the plugin is never set
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                                          len(this_job.allocated_machines),
                                          this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
コード例 #3
0
ファイル: sge.py プロジェクト: kamatani0164/aiida-core
    def _parse_joblist_output(self, retval, stdout, stderr):
        import xml.dom.minidom

        if retval != 0:
            self.logger.error("Error in _parse_joblist_output: retval={}; "
                              "stdout={}; stderr={}".format(
                                  retval, stdout, stderr))
            raise SchedulerError("Error during joblist retrieval, retval={}".\
                                 format(retval))

        if stderr.strip():
            self.logger.warning("in _parse_joblist_output for {}: "
                                "there was some text in stderr: {}".format(
                                    str(self.transport), stderr))

        if stdout:
            try:
                xmldata = xml.dom.minidom.parseString(stdout)
            except xml.parsers.expat.ExpatError:
                self.logger.error("in sge._parse_joblist_output: "
                                  "xml parsing of stdout failed:"
                                  "{}".format(stdout))
                raise SchedulerParsingError("Error during joblist retrieval,"
                                            "xml parsing of stdout failed")
        else:
            self.logger.error("Error in sge._parse_joblist_output: retval={}; "
                              "stdout={}; stderr={}".format(
                                  retval, stdout, stderr))
            raise SchedulerError("Error during joblist retrieval,"
                                 "no stdout produced")

        try:
            first_child = xmldata.firstChild
            second_childs = first_child.childNodes
            tag_names_sec = [elem.tagName for elem in second_childs \
                             if elem.nodeType == 1]
            if not 'queue_info' in tag_names_sec:
                self.logger.error("Error in sge._parse_joblist_output: "
                                  "no queue_info: {}".\
                                  format(stdout))
                raise SchedulerError
            if not 'job_info' in tag_names_sec:
                self.logger.error("Error in sge._parse_joblist_output: "
                                  "no job_info: {}".\
                                  format(stdout))
                raise SchedulerError
        except SchedulerError:
            self.logger.error("Error in sge._parse_joblist_output: stdout={}"\
                              .format(stdout))
            raise SchedulerError("Error during xml processing, of stdout:"
                                 "There is no 'job_info' or no 'queue_info'"
                                 "element or there are no jobs!")
        #If something weird happens while firstChild, pop, etc:
        except Exception:
            self.logger.error("Error in sge._parse_joblist_output: stdout={}"\
                              .format(stdout))
            raise SchedulerError("Error during xml processing, of stdout")

        jobs = [i for i in first_child.getElementsByTagName('job_list')]
        #jobs = [i for i in jobinfo.getElementsByTagName('job_list')]
        #print [i[0].childNodes[0].data for i in job_numbers if i]
        joblist = []
        for job in jobs:
            this_job = JobInfo()

            #In case the user needs more information the xml-data for
            #each job is stored:
            this_job.raw_data = job.toxml()

            try:
                job_element = job.getElementsByTagName('JB_job_number').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_id = str(element_child.data).strip()
                if not this_job.job_id:
                    raise SchedulerError
            except SchedulerError:
                self.logger.error("Error in sge._parse_joblist_output:"
                                  "no job id is given, stdout={}"\
                                  .format(stdout))
                raise SchedulerError("Error in sge._parse_joblist_output:"
                                     "no job id is given")
            except IndexError:
                self.logger.error("No 'job_number' given for job index {} in "
                                  "job list, stdout={}".format(jobs.index(job)\
                                  ,stdout))
                raise IndexError("Error in sge._parse_joblist_output:"
                                 "no job id is given")

            try:
                job_element = job.getElementsByTagName('state').pop(0)
                element_child = job_element.childNodes.pop(0)
                job_state_string = str(element_child.data).strip()
                try:
                    this_job.job_state = _map_status_sge[job_state_string]
                except KeyError:
                    self.logger.warning("Unrecognized job_state '{}' for job "
                                        "id {}".format(job_state_string,
                                                       this_job.job_id))
                    this_job.job_state = job_states.UNDETERMINED
            except IndexError:
                self.logger.warning("No 'job_state' field for job id {} in"
                                    "stdout={}".format(this_job.job_id,
                                                       stdout))
                this_job.job_state = job_states.UNDETERMINED

            try:
                job_element = job.getElementsByTagName('JB_owner').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_owner = str(element_child.data).strip()
            except IndexError:
                self.logger.warning("No 'job_owner' field for job "
                                    "id {}".format(this_job.job_id))

            try:
                job_element = job.getElementsByTagName('JB_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.title = str(element_child.data).strip()
            except IndexError:
                self.logger.warning("No 'title' field for job "
                                    "id {}".format(this_job.job_id))

            try:
                job_element = job.getElementsByTagName('queue_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.queue_name = str(element_child.data).strip()
            except IndexError:
                if this_job.job_state == job_states.RUNNING:
                    self.logger.warning("No 'queue_name' field for job "
                                        "id {}".format(this_job.job_id))

            try:
                job_element = job.getElementsByTagName(
                    'JB_submission_time').pop(0)
                element_child = job_element.childNodes.pop(0)
                time_string = str(element_child.data).strip()
                try:
                    this_job.submission_time = self._parse_time_string(
                        time_string)
                except ValueError:
                    self.logger.warning("Error parsing 'JB_submission_time' "
                                        "for job id {} ('{}')".format(
                                            this_job.job_id, time_string))
            except IndexError:
                try:
                    job_element = job.getElementsByTagName(
                        'JAT_start_time').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    time_string = str(element_child.data).strip()
                    try:
                        this_job.dispatch_time = self._parse_time_string(
                            time_string)
                    except ValueError:
                        self.logger.warning("Error parsing 'JAT_start_time'"
                                            "for job id {} ('{}')".format(
                                                this_job.job_id, time_string))
                except IndexError:
                    self.logger.warning("No 'JB_submission_time' and no "
                                        "'JAT_start_time' field for job "
                                        "id {}".format(this_job.job_id))

            #There is also cpu_usage, mem_usage, io_usage information available:
            if this_job.job_state == job_states.RUNNING:
                try:
                    job_element = job.getElementsByTagName('slots').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    this_job.num_mpiprocs = str(element_child.data).strip()
                except IndexError:
                    self.logger.warning("No 'slots' field for job "
                                        "id {}".format(this_job.job_id))

            joblist.append(this_job)
        #self.logger.debug("joblist final: {}".format(joblist))
        return joblist
コード例 #4
0
ファイル: lsf.py プロジェクト: kamatani0164/aiida-core
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command,
        that is here implemented as a list of lines, one for each
        job, with _field_separator as separator. The order is described
        in the _get_joblist_command function.
        
        Return a list of JobInfo objects, one of each job, 
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may 
            either appear here, or not. 
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        num_fields = len(self._joblist_fields)

        if retval != 0:
            self.logger.warning("Error in _parse_joblist_output: retval={}; "
                                "stdout={}; stderr={}".format(
                                    retval, stdout, stderr))
            raise SchedulerError("Error during parsing joblist output, "
                                 "retval={}\n"
                                 "stdout={}\nstderr={}".format(
                                     retval, stdout, stderr))

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I put num_fields, because in this way
        # if the symbol _field_separator appears in the title (that is
        # the last field), I don't split the title.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [
            l.split(_field_separator, num_fields) for l in stdout.splitlines()
            if _field_separator in l
        ]

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            # Each job should have all fields.
            if len(job) != num_fields:
                # I skip this calculation
                # (I don't append anything to job_list before continuing)
                self.logger.error("Wrong line length in squeue output! '{}'"
                                  "".format(job))
                continue

            this_job = JobInfo()
            this_job.job_id = job[0]
            this_job.annotation = job[2]
            job_state_raw = job[1]

            try:
                job_state_string = _map_status_lsf[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_raw,
                                                   this_job.job_id))
                job_state_string = job_states.UNDETERMINED

            this_job.job_state = job_state_string

            # I get the remaining fields
            # The first three were already obtained
            # I know that the length is exactly num_fields because
            # I used split(_field_separator, num_fields) before
            # when creting 'job'
            #            (_, _, _, executing_host, username, number_nodes,
            #             number_cpus, allocated_machines, partition,
            #             time_limit, time_used, dispatch_time, job_name) = job
            (_, _, _, executing_host, username, number_nodes, number_cpus,
             allocated_machines, partition, finish_time, start_time,
             percent_complete, submission_time, job_name) = job

            this_job.job_owner = username
            try:
                this_job.num_machines = int(number_nodes)
            except ValueError:
                self.logger.warning("The number of allocated nodes is not "
                                    "an integer ({}) for job id {}!".format(
                                        number_nodes, this_job.job_id))

            try:
                this_job.num_mpiprocs = int(number_cpus)
            except ValueError:
                self.logger.warning("The number of allocated cores is not "
                                    "an integer ({}) for job id {}!".format(
                                        number_cpus, this_job.job_id))

            # ALLOCATED NODES HERE
            # string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # therefore it requires some parsing, that is unnecessary now.
            # I just store is as a raw string for the moment, and I leave
            # this_job.allocated_machines undefined
            if this_job.job_state == job_states.RUNNING:
                this_job.allocated_machines_raw = allocated_machines

            this_job.queue_name = partition

            psd_finish_time = self._parse_time_string(finish_time,
                                                      fmt='%b %d %H:%M')
            psd_start_time = self._parse_time_string(start_time,
                                                     fmt='%b %d %H:%M')
            psd_submission_time = self._parse_time_string(submission_time,
                                                          fmt='%b %d %H:%M')

            # Now get the time in seconds which has been used
            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may be not set (in my test, it is set to zero)
            if this_job.job_state == job_states.RUNNING:
                try:
                    requested_walltime = psd_finish_time - psd_start_time
                    # fix of a weird bug. Since the year is not parsed, it is assumed
                    # to always be 1900. Therefore, job submitted
                    # in december and finishing in january would produce negative time differences
                    if requested_walltime.total_seconds() < 0:
                        import datetime
                        old_month = psd_finish_time.month
                        old_day = psd_finish_time.day
                        old_hour = psd_finish_time.hour
                        old_minute = psd_finish_time.minute
                        new_year = psd_start_time.year + 1
                        # note: we assume that no job will last more than 1 year...
                        psd_finish_time = datetime.datetime(year=new_year,
                                                            month=old_month,
                                                            day=old_day,
                                                            hour=old_hour,
                                                            minute=old_minute)
                        requested_walltime = psd_finish_time - psd_start_time

                    this_job.requested_wallclock_time_seconds = requested_walltime.total_seconds(
                    )
                except (TypeError, ValueError):
                    self.logger.warning("Error parsing the time limit "
                                        "for job id {}".format(
                                            this_job.job_id))

                try:
                    psd_percent_complete = float(
                        percent_complete.strip(' L').strip("%"))
                    this_job.wallclock_time_seconds = requested_walltime.total_seconds(
                    ) * psd_percent_complete / 100.
                except ValueError:
                    self.logger.warning("Error parsing the time used "
                                        "for job id {}".format(
                                            this_job.job_id))

            try:
                this_job.submission_time = psd_submission_time
            except ValueError:
                self.logger.warning("Error parsing submission time for job "
                                    "id {}".format(this_job.job_id))

            this_job.title = job_name

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful now, allocated_machines in this
            # version of the plugin is never set
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                                          len(this_job.allocated_machines),
                                          this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list