Example #1
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command,
        that is here implemented as a list of lines, one for each
        job, with _field_separator as separator. The order is described
        in the _get_joblist_command function.
        
        Return a list of JobInfo objects, one of each job, 
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may 
            either appear here, or not. 
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        num_fields = len(self.fields)

        # I don't raise on retval != 0 here: if I pass a list of jobs
        # and one of them is no longer in the queue, I get a non-zero
        # status, so a non-zero retval is not necessarily an error
        #if retval != 0:
        #self.logger.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output and
        # no line contains "Invalid job id specified", which happens
        # when I ask for specific calculations and they are all finished
        if stderr.strip() and "Invalid job id specified" not in stderr:
            self.logger.warning("Warning in _parse_joblist_output, non-empty "
                                "stderr='{}'".format(stderr.strip()))
            if retval != 0:
                raise SchedulerError(
                    "Error during squeue parsing (_parse_joblist_output function)"
                )

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I pass num_fields - 1 as maxsplit, so that if the symbol
        # _field_separator appears in the title (which is the last
        # field), the title is not split further.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [
            l.split(_field_separator, num_fields - 1)
            for l in stdout.splitlines() if _field_separator in l
        ]
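        # For instance (hypothetical line with three fields, title
        # last): "12|R|my|title".split("|", 2) yields
        # ['12', 'R', 'my|title'], keeping the title in one piece.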

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)}
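            # (each entry of self.fields is assumed to be a
            # (selector, attribute_name) pair, so k[1] above is the
            # attribute name used as the dictionary key)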

            this_job = JobInfo()
            try:
                this_job.job_id = thisjob_dict['job_id']

                this_job.annotation = thisjob_dict['annotation']
                job_state_raw = thisjob_dict['state_raw']
            except KeyError:
                # I skip this calculation if I couldn't find this basic info
                # (I don't append anything to job_list before continuing)
                self.logger.error("Wrong line length in squeue output! '{}'"
                                  "".format(job))
                continue

            try:
                job_state_string = _map_status_slurm[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_raw,
                                                   this_job.job_id))
                job_state_string = job_states.UNDETERMINED
            # QUEUED_HELD states are not specific states in SLURM;
            # they are instead set with state QUEUED, and then the
            # annotation tells if the job is held.
            # I check for 'Dependency', 'JobHeldUser',
            # 'JobHeldAdmin', 'BeginTime'.
            # Other states should not bring the job into QUEUED_HELD,
            # I believe (the man page of slurm seems to be incomplete;
            # for instance, JobHeld* are not reported there. I also
            # checked the source code of slurm 2.6 on github
            # (https://github.com/SchedMD/slurm),
            # file slurm/src/common/slurm_protocol_defs.c,
            # and these seem to be all the states to take into account
            # for the QUEUED_HELD status).
            # There are actually a few others, like possible
            # failures, or partition-related reasons, but for the moment I
            # leave them in the QUEUED state.
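            # For example, a job held by the user appears in squeue
            # with state 'PD' (pending) and reason 'JobHeldUser', so
            # the check below remaps it to QUEUED_HELD.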
            if (job_state_string == job_states.QUEUED
                    and this_job.annotation in [
                        'Dependency', 'JobHeldUser', 'JobHeldAdmin',
                        'BeginTime'
                    ]):
                job_state_string = job_states.QUEUED_HELD

            this_job.job_state = job_state_string

            ####
            # Up to here, I have only made sure that the three most
            # important fields for a job were present.
            # I now check whether the line has all the expected fields
            if len(job) < num_fields:
                # I store this job only with the information
                # gathered up to now, and continue to the next job
                # Also print a warning
                self.logger.warning("Wrong line length in squeue output!"
                                    "Skipping optional fields. Line: '{}'"
                                    "".format(jobdata_raw))
                # I append this job before continuing
                job_list.append(this_job)
                continue

            # TODO: store executing_host?

            this_job.job_owner = thisjob_dict['username']

            try:
                this_job.num_machines = int(thisjob_dict['number_nodes'])
            except ValueError:
                self.logger.warning("The number of allocated nodes is not "
                                    "an integer ({}) for job id {}!".format(
                                        thisjob_dict['number_nodes'],
                                        this_job.job_id))

            try:
                this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
            except ValueError:
                self.logger.warning("The number of allocated cores is not "
                                    "an integer ({}) for job id {}!".format(
                                        thisjob_dict['number_cpus'],
                                        this_job.job_id))

            # ALLOCATED NODES HERE
            # the string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # and therefore requires some parsing, which is unnecessary now.
            # I just store it as a raw string for the moment, and leave
            # this_job.allocated_machines undefined
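            # (such parsing would expand, e.g., nid00[684-685] into
            # ['nid00684', 'nid00685']; illustrative only, not done here)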
            if this_job.job_state == job_states.RUNNING:
                this_job.allocated_machines_raw = thisjob_dict[
                    'allocated_machines']

            this_job.queue_name = thisjob_dict['partition']

            try:
                this_job.requested_wallclock_time_seconds = (
                    self._convert_time(thisjob_dict['time_limit']))
            except ValueError:
                self.logger.warning("Error parsing the time limit "
                                    "for job id {}".format(this_job.job_id))

            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may not be set (in my tests, it is set to zero)
            if this_job.job_state == job_states.RUNNING:
                try:
                    this_job.wallclock_time_seconds = (self._convert_time(
                        thisjob_dict['time_used']))
                except ValueError:
                    self.logger.warning("Error parsing time_used "
                                        "for job id {}".format(
                                            this_job.job_id))

                try:
                    this_job.dispatch_time = self._parse_time_string(
                        thisjob_dict['dispatch_time'])
                except ValueError:
                    self.logger.warning("Error parsing dispatch_time for job "
                                        "id {}".format(this_job.job_id))

            try:
                this_job.submission_time = self._parse_time_string(
                    thisjob_dict['submission_time'])
            except ValueError:
                self.logger.warning("Error parsing submission_time for job "
                                    "id {}".format(this_job.job_id))

            this_job.title = thisjob_dict['job_name']

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful for now, since allocated_machines is
            # never set in this version of the plugin
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                                          len(this_job.allocated_machines),
                                          this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
Example #2
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command,
        that is here implemented as a list of lines, one for each
        job, with _field_separator as separator. The order is described
        in the _get_joblist_command function.
        
        Return a list of JobInfo objects, one of each job, 
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may 
            either appear here, or not. 
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        num_fields = len(self._joblist_fields)

        if retval != 0:
            self.logger.warning("Error in _parse_joblist_output: retval={}; "
                                "stdout={}; stderr={}".format(
                                    retval, stdout, stderr))
            raise SchedulerError("Error during parsing joblist output, "
                                 "retval={}\n"
                                 "stdout={}\nstderr={}".format(
                                     retval, stdout, stderr))

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I pass num_fields - 1 as maxsplit, so that if the symbol
        # _field_separator appears in the title (which is the last
        # field), the title is not split further.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [
            l.split(_field_separator, num_fields - 1)
            for l in stdout.splitlines() if _field_separator in l
        ]

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            # Each job should have all fields.
            if len(job) != num_fields:
                # I skip this calculation
                # (I don't append anything to job_list before continuing)
                self.logger.error("Wrong line length in bjobs output! '{}'"
                                  "".format(job))
                continue

            this_job = JobInfo()
            this_job.job_id = job[0]
            this_job.annotation = job[2]
            job_state_raw = job[1]

            try:
                job_state_string = _map_status_lsf[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_raw,
                                                   this_job.job_id))
                job_state_string = job_states.UNDETERMINED

            this_job.job_state = job_state_string

            # I get the remaining fields
            # (the first three were already obtained).
            # I know that the length is exactly num_fields because of
            # the length check performed above when validating 'job'
            #            (_, _, _, executing_host, username, number_nodes,
            #             number_cpus, allocated_machines, partition,
            #             time_limit, time_used, dispatch_time, job_name) = job
            (_, _, _, executing_host, username, number_nodes, number_cpus,
             allocated_machines, partition, finish_time, start_time,
             percent_complete, submission_time, job_name) = job
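            # (the field order above is assumed to match the output
            # format requested from bjobs in _get_joblist_command)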

            this_job.job_owner = username
            try:
                this_job.num_machines = int(number_nodes)
            except ValueError:
                self.logger.warning("The number of allocated nodes is not "
                                    "an integer ({}) for job id {}!".format(
                                        number_nodes, this_job.job_id))

            try:
                this_job.num_mpiprocs = int(number_cpus)
            except ValueError:
                self.logger.warning("The number of allocated cores is not "
                                    "an integer ({}) for job id {}!".format(
                                        number_cpus, this_job.job_id))

            # ALLOCATED NODES HERE
            # the string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # and therefore requires some parsing, which is unnecessary now.
            # I just store it as a raw string for the moment, and leave
            # this_job.allocated_machines undefined
            if this_job.job_state == job_states.RUNNING:
                this_job.allocated_machines_raw = allocated_machines

            this_job.queue_name = partition

            # Parse the three time strings; on failure fall back to None
            try:
                psd_finish_time = self._parse_time_string(
                    finish_time, fmt='%b %d %H:%M')
                psd_start_time = self._parse_time_string(
                    start_time, fmt='%b %d %H:%M')
                psd_submission_time = self._parse_time_string(
                    submission_time, fmt='%b %d %H:%M')
            except ValueError:
                self.logger.warning("Error parsing time strings for "
                                    "job id {}".format(this_job.job_id))
                psd_finish_time = psd_start_time = psd_submission_time = None

            # Now get the time in seconds which has been used.
            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may not be set (in my tests, it is set to zero)
            if this_job.job_state == job_states.RUNNING:
                requested_walltime = None
                try:
                    requested_walltime = psd_finish_time - psd_start_time
                    # fix for a weird bug: since the year is not parsed,
                    # it is assumed to always be 1900. Therefore, a job
                    # starting in December and finishing in January
                    # would produce a negative time difference
                    if requested_walltime.total_seconds() < 0:
                        import datetime
                        old_month = psd_finish_time.month
                        old_day = psd_finish_time.day
                        old_hour = psd_finish_time.hour
                        old_minute = psd_finish_time.minute
                        new_year = psd_start_time.year + 1
                        # note: we assume that no job will last more than 1 year...
                        psd_finish_time = datetime.datetime(year=new_year,
                                                            month=old_month,
                                                            day=old_day,
                                                            hour=old_hour,
                                                            minute=old_minute)
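                        # e.g. a start time 'Dec 31 23:50' parses as
                        # 1900-12-31 23:50 and a finish time 'Jan  1 00:10'
                        # as 1900-01-01 00:10; moving the finish to 1901
                        # restores the true ~20-minute walltime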
                        requested_walltime = psd_finish_time - psd_start_time

                    this_job.requested_wallclock_time_seconds = (
                        requested_walltime.total_seconds())
                except (TypeError, ValueError):
                    self.logger.warning("Error parsing the time limit "
                                        "for job id {}".format(
                                            this_job.job_id))

                # requested_walltime is None if the computation above failed
                if requested_walltime is not None:
                    try:
                        psd_percent_complete = float(
                            percent_complete.strip(' L').strip("%"))
                        this_job.wallclock_time_seconds = (
                            requested_walltime.total_seconds() *
                            psd_percent_complete / 100.)
                    except ValueError:
                        self.logger.warning("Error parsing the time used "
                                            "for job id {}".format(
                                                this_job.job_id))
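                # (for reference, a %complete field such as '12.50% L'
                # (hypothetical sample value) yields 12.5 after stripping)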

            # psd_submission_time is None if the parsing above failed
            # (a warning was already issued in that case)
            if psd_submission_time is not None:
                this_job.submission_time = psd_submission_time

            this_job.title = job_name

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful for now, since allocated_machines is
            # never set in this version of the plugin
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                                          len(this_job.allocated_machines),
                                          this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list