Example #1
    def get_jobs(self, jobs=None, user=None, as_dict=False):
        """
        Overrides original method from DirectScheduler in order to list
        missing processes as DONE.
        """
        job_stats = super().get_jobs(jobs=jobs, user=user, as_dict=as_dict)

        found_jobs = []
        # Get the list of known jobs
        if as_dict:
            found_jobs = job_stats.keys()
        else:
            found_jobs = [j.job_id for j in job_stats]
        # Now check whether there are any jobs the user requested that were not found
        not_found_jobs = list(set(jobs) - set(found_jobs)) if jobs else []

        for job_id in not_found_jobs:
            job = JobInfo()
            job.job_id = job_id
            job.job_state = JobState.DONE
            # Owner and wallclock time are unknown
            if as_dict:
                job_stats[job_id] = job
            else:
                job_stats.append(job)

        return job_stats
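The pattern above can be exercised in isolation. A minimal sketch, assuming only the JobInfo/JobState API the example already uses; the helper name mark_missing_as_done is hypothetical:

from aiida.schedulers.datastructures import JobInfo, JobState

def mark_missing_as_done(requested_ids, found_jobs):
    """Synthesize DONE entries for requested job IDs the scheduler no longer reports."""
    found_ids = {job.job_id for job in found_jobs}
    missing = []
    for job_id in set(requested_ids) - found_ids:
        job = JobInfo()
        job.job_id = job_id
        job.job_state = JobState.DONE  # owner and wallclock time stay unknown
        missing.append(job)
    return missing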
Example #2
    def get_last_job_info(self):
        """Return the last information asked to the scheduler about the status of the job.

        :return: a `JobInfo` object (that closely resembles a dictionary) or None.
        """
        from aiida.schedulers.datastructures import JobInfo

        last_job_info_serialized = self.get_attribute(
            self.SCHEDULER_LAST_JOB_INFO_KEY, None)

        if last_job_info_serialized is not None:
            job_info = JobInfo()
            job_info.load_from_serialized(last_job_info_serialized)
        else:
            job_info = None

        return job_info
Example #3
    def test_serialization(self):
        """Test the serialization/deserialization of JobInfo classes."""
        from aiida.schedulers.datastructures import JobInfo, JobState
        from datetime import datetime

        dict_serialized_content = {
            'job_id': '12723',
            'title': 'some title',
            'queue_name': 'some_queue',
            'account': 'my_account'
        }

        to_serialize = {'job_state': (JobState.QUEUED, 'job_state'), 'submission_time': (datetime.now(), 'date')}

        job_info = JobInfo()
        for key, val in dict_serialized_content.items():
            setattr(job_info, key, val)

        for key, (val, field_type) in to_serialize.items():
            setattr(job_info, key, val)
            # Also append to the dictionary for easier comparison later
            dict_serialized_content[key] = JobInfo.serialize_field(value=val, field_type=field_type)

        self.assertEqual(job_info.get_dict(), dict_serialized_content)
        # Full loop via JSON, moving data from job_info to job_info2;
        # we check that the content is fully preserved
        job_info2 = JobInfo.load_from_serialized(job_info.serialize())
        self.assertEqual(job_info2.get_dict(), dict_serialized_content)

        # Check that the enum and datetime fields are properly re-serialized with the correct types
        self.assertEqual(job_info2.job_state, to_serialize['job_state'][0])
        self.assertEqual(job_info2.submission_time, to_serialize['submission_time'][0])
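The same round trip outside the test harness: serialize() produces a JSON string and the load_from_serialized classmethod rebuilds an equivalent JobInfo, preserving typed fields such as the JobState enum and the submission datetime, exactly what the assertions above check. A minimal sketch assuming an aiida installation:

from datetime import datetime

from aiida.schedulers.datastructures import JobInfo, JobState

info = JobInfo()
info.job_id = '12723'
info.job_state = JobState.QUEUED
info.submission_time = datetime.now()

restored = JobInfo.load_from_serialized(info.serialize())
assert restored.job_id == info.job_id
assert restored.job_state == JobState.QUEUED
assert restored.submission_time == info.submission_time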
Example #4
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command,
        that is here implemented as a list of lines, one for each
        job, with _field_separator as separator. The order is described
        in the _get_joblist_command function.

        Return a list of JobInfo objects, one of each job,
        each relevant parameters implemented.
        """
        if stderr.strip():
            self.logger.warning("Stderr when parsing joblist: {}".format(
                stderr.strip()))
        job_list = [job.split() for job in stdout.split('\n') if job]
        job_infos = []
        for job_id, status in job_list:
            job = JobInfo()
            job.job_id = job_id
            job.job_state = _MAP_STATUS_YASCHEDULER[status]
            job_infos.append(job)
        return job_infos
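The same two-column parsing as a standalone helper. A sketch where the status map stands in for _MAP_STATUS_YASCHEDULER and its keys are purely illustrative:

from aiida.schedulers.datastructures import JobInfo, JobState

_STATUS_MAP = {'RUNNING': JobState.RUNNING, 'QUEUED': JobState.QUEUED}  # illustrative keys

def parse_two_column_joblist(stdout):
    """Parse lines of '<job_id> <status>' into JobInfo objects."""
    jobs = []
    for line in stdout.split('\n'):
        if not line:
            continue
        job_id, status = line.split()
        job = JobInfo()
        job.job_id = job_id
        # Fall back to UNDETERMINED instead of raising KeyError on unknown states
        job.job_state = _STATUS_MAP.get(status, JobState.UNDETERMINED)
        jobs.append(job)
    return jobs

jobs = parse_two_column_joblist('17 RUNNING\n18 QUEUED\n')
assert [j.job_id for j in jobs] == ['17', '18']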
Example #5
    def get_last_job_info(self) -> Optional['JobInfo']:
        """Return the last information asked to the scheduler about the status of the job.

        The last job info is updated on every poll of the scheduler, except for the final poll when the job drops from
        the scheduler's job queue.
        For completed jobs, the last job info therefore contains the "second-to-last" job info that still shows the job
        as running. Please use :meth:`~aiida.orm.nodes.process.calculation.calcjob.CalcJobNode.get_detailed_job_info`
        instead.

        :return: a `JobInfo` object (that closely resembles a dictionary) or None.
        """
        from aiida.schedulers.datastructures import JobInfo

        last_job_info_dictserialized = self.get_attribute(self.SCHEDULER_LAST_JOB_INFO_KEY, None)

        if last_job_info_dictserialized is not None:
            job_info = JobInfo.load_from_dict(last_job_info_dictserialized)
        else:
            job_info = None

        return job_info
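A round trip through the dict form used above. A small sketch assuming get_dict() and load_from_dict() are inverses, as the serialization test in Example #3 suggests:

from aiida.schedulers.datastructures import JobInfo, JobState

info = JobInfo()
info.job_id = '7'
info.job_state = JobState.DONE

restored = JobInfo.load_from_dict(info.get_dict())
assert restored.job_id == '7'
assert restored.job_state == JobState.DONE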
Example #6
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command,
        that is here implemented as a list of lines, one for each
        job, with _field_separator as separator. The order is described
        in the _get_joblist_command function.

        Return a list of JobInfo objects, one of each job,
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may
            either appear here, or not.
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        num_fields = len(self.fields)

        # I don't raise because if I pass a list of jobs,
        # I get a non-zero status
        # if one of the jobs is not in the list anymore
        # retval should be zero
        # if retval != 0:
        # self.logger.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output and
        # there is no line containing "Invalid job id specified", that happens
        # when I ask for specific calculations, and they are all finished
        if stderr.strip() and 'Invalid job id specified' not in stderr:
            self.logger.warning("Warning in _parse_joblist_output, non-empty stderr='{}'".format(stderr.strip()))
            if retval != 0:
                raise SchedulerError('Error during squeue parsing (_parse_joblist_output function)')

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I put num_fields, because in this way
        # if the symbol _field_separator appears in the title (that is
        # the last field), I don't split the title.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [l.split(_FIELD_SEPARATOR, num_fields) for l in stdout.splitlines() if _FIELD_SEPARATOR in l]

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)}

            this_job = JobInfo()
            try:
                this_job.job_id = thisjob_dict['job_id']

                this_job.annotation = thisjob_dict['annotation']
                job_state_raw = thisjob_dict['state_raw']
            except KeyError:
                # I skip this calculation if I couldn't find this basic info
                # (I don't append anything to job_list before continuing)
                self.logger.error("Wrong line length in squeue output! '{}'".format(job))
                continue

            try:
                job_state_string = _MAP_STATUS_SLURM[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    'id {}'.format(job_state_raw, this_job.job_id))
                job_state_string = JobState.UNDETERMINED
            # QUEUED_HELD states are not specific states in SLURM;
            # they are instead set with state QUEUED, and then the
            # annotation tells if the job is held.
            # I check for 'Dependency', 'JobHeldUser',
            # 'JobHeldAdmin', 'BeginTime'.
            # Other states should not bring the job in QUEUED_HELD, I believe
            # (the man page of slurm seems to be incomplete, for instance
            # JobHeld* are not reported there; I also checked at the source code
            # of slurm 2.6 on github (https://github.com/SchedMD/slurm),
            # file slurm/src/common/slurm_protocol_defs.c,
            # and these seem all the states to be taken into account for the
            # QUEUED_HELD status).
            # There are actually a few others, like possible
            # failures, or partition-related reasons, but for the moment I
            # leave them in the QUEUED state.
            if (job_state_string == JobState.QUEUED and
                    this_job.annotation in ['Dependency', 'JobHeldUser', 'JobHeldAdmin', 'BeginTime']):
                job_state_string = JobState.QUEUED_HELD

            this_job.job_state = job_state_string

            ####
            # Up to here, I just made sure that there were at least three
            # fields, to set the most important fields for a job.
            # I now check if the length is equal to the number of fields
            if len(job) < num_fields:
                # I store this job only with the information
                # gathered up to now, and continue to the next job
                # Also print a warning
                self.logger.warning('Wrong line length in squeue output! '
                                    "Skipping optional fields. Line: '{}'"
                                    ''.format(job))
                # I append this job before continuing
                job_list.append(this_job)
                continue

            # TODO: store executing_host?

            this_job.job_owner = thisjob_dict['username']

            try:
                this_job.num_machines = int(thisjob_dict['number_nodes'])
            except ValueError:
                self.logger.warning('The number of allocated nodes is not '
                                    'an integer ({}) for job id {}!'.format(thisjob_dict['number_nodes'],
                                                                            this_job.job_id))

            try:
                this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
            except ValueError:
                self.logger.warning('The number of allocated cores is not '
                                    'an integer ({}) for job id {}!'.format(thisjob_dict['number_cpus'],
                                                                            this_job.job_id))

            # ALLOCATED NODES HERE
            # string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # therefore it requires some parsing, which is unnecessary now.
            # I just store it as a raw string for the moment, and I leave
            # this_job.allocated_machines undefined
            if this_job.job_state == JobState.RUNNING:
                this_job.allocated_machines_raw = thisjob_dict['allocated_machines']

            this_job.queue_name = thisjob_dict['partition']

            try:
                this_job.requested_wallclock_time_seconds = (self._convert_time(thisjob_dict['time_limit']))
            except ValueError:
                self.logger.warning('Error parsing the time limit for job id {}'.format(this_job.job_id))

            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may not be set (in my test, it is set to zero)
            if this_job.job_state == JobState.RUNNING:
                try:
                    this_job.wallclock_time_seconds = (self._convert_time(thisjob_dict['time_used']))
                except ValueError:
                    self.logger.warning('Error parsing time_used for job id {}'.format(this_job.job_id))

                try:
                    this_job.dispatch_time = self._parse_time_string(thisjob_dict['dispatch_time'])
                except ValueError:
                    self.logger.warning('Error parsing dispatch_time for job id {}'.format(this_job.job_id))

            try:
                this_job.submission_time = self._parse_time_string(thisjob_dict['submission_time'])
            except ValueError:
                self.logger.warning('Error parsing submission_time for job id {}'.format(this_job.job_id))

            this_job.title = thisjob_dict['job_name']

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful now, allocated_machines in this
            # version of the plugin is never set
            if (this_job.allocated_machines is not None and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error('The length of the list of allocated '
                                      'nodes ({}) is different from the '
                                      'expected number of nodes ({})!'.format(
                                          len(this_job.allocated_machines), this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
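The bounded-split trick from the comments above, in isolation: capping maxsplit keeps separator occurrences inside the last field (the title) from splitting it further. Separator and field names here are illustrative:

_SEP = '^^^'
fields = ['job_id', 'state_raw', 'annotation', 'job_name']

line = _SEP.join(['123', 'R', 'Dependency', 'title^^^with^^^separators'])
# maxsplit = len(fields) - 1 yields exactly len(fields) pieces
record = dict(zip(fields, line.split(_SEP, len(fields) - 1)))
assert record['job_name'] == 'title^^^with^^^separators'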
Example #7
    def _parse_joblist_output(self, retval, stdout, stderr):
        # pylint: disable=too-many-statements,too-many-branches
        if retval != 0:
            self.logger.error(
                f'Error in _parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}'
            )
            raise SchedulerError(
                f'Error during joblist retrieval, retval={retval}')

        if stderr.strip():
            self.logger.warning(
                f'in _parse_joblist_output for {str(self.transport)}: there was some text in stderr: {stderr}'
            )

        if stdout:
            try:
                xmldata = xml.dom.minidom.parseString(stdout)
            except xml.parsers.expat.ExpatError:
                self.logger.error(
                    f'in sge._parse_joblist_output: xml parsing of stdout failed: {stdout}'
                )
                raise SchedulerParsingError(
                    'Error during joblist retrieval, xml parsing of stdout failed'
                )
        else:
            self.logger.error(
                f'Error in sge._parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}'
            )
            raise SchedulerError(
                'Error during joblist retrieval, no stdout produced')

        try:
            first_child = xmldata.firstChild
            second_childs = first_child.childNodes
            tag_names_sec = [elem.tagName for elem in second_childs \
                             if elem.nodeType == 1]
            if 'queue_info' not in tag_names_sec:
                self.logger.error(
                    f'Error in sge._parse_joblist_output: no queue_info: {stdout}'
                )
                raise SchedulerError
            if 'job_info' not in tag_names_sec:
                self.logger.error(
                    f'Error in sge._parse_joblist_output: no job_info: {stdout}'
                )
                raise SchedulerError
        except SchedulerError:
            self.logger.error(
                f'Error in sge._parse_joblist_output: stdout={stdout}')
            raise SchedulerError('Error during xml processing of stdout: '
                                 "there is no 'job_info' or no 'queue_info' "
                                 'element, or there are no jobs!')
        # If something weird happens while firstChild, pop, etc:
        except Exception:
            self.logger.error(
                f'Error in sge._parse_joblist_output: stdout={stdout}')
            raise SchedulerError('Error during xml processing of stdout')

        jobs = list(first_child.getElementsByTagName('job_list'))
        # jobs = [i for i in jobinfo.getElementsByTagName('job_list')]
        # print [i[0].childNodes[0].data for i in job_numbers if i]
        joblist = []
        for job in jobs:
            this_job = JobInfo()

            # In case the user needs more information the xml-data for
            # each job is stored:
            this_job.raw_data = job.toxml()

            try:
                job_element = job.getElementsByTagName('JB_job_number').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_id = str(element_child.data).strip()
                if not this_job.job_id:
                    raise SchedulerError
            except SchedulerError:
                self.logger.error(
                    f'Error in sge._parse_joblist_output:no job id is given, stdout={stdout}'
                )
                raise SchedulerError(
                    'Error in sge._parse_joblist_output: no job id is given')
            except IndexError:
                self.logger.error("No 'job_number' given for job index {} in "
                                  'job list, stdout={}'.format(jobs.index(job) \
                                                               , stdout))
                raise IndexError(
                    'Error in sge._parse_joblist_output: no job id is given')

            try:
                job_element = job.getElementsByTagName('state').pop(0)
                element_child = job_element.childNodes.pop(0)
                job_state_string = str(element_child.data).strip()
                try:
                    this_job.job_state = _MAP_STATUS_SGE[job_state_string]
                except KeyError:
                    self.logger.warning("Unrecognized job_state '{}' for job "
                                        'id {}'.format(job_state_string,
                                                       this_job.job_id))
                    this_job.job_state = JobState.UNDETERMINED
            except IndexError:
                self.logger.warning("No 'job_state' field for job id {} in"
                                    'stdout={}'.format(this_job.job_id,
                                                       stdout))
                this_job.job_state = JobState.UNDETERMINED

            try:
                job_element = job.getElementsByTagName('JB_owner').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_owner = str(element_child.data).strip()
            except IndexError:
                self.logger.warning(
                    f"No 'job_owner' field for job id {this_job.job_id}")

            try:
                job_element = job.getElementsByTagName('JB_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.title = str(element_child.data).strip()
            except IndexError:
                self.logger.warning(
                    f"No 'title' field for job id {this_job.job_id}")

            try:
                job_element = job.getElementsByTagName('queue_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.queue_name = str(element_child.data).strip()
            except IndexError:
                if this_job.job_state == JobState.RUNNING:
                    self.logger.warning(
                        f"No 'queue_name' field for job id {this_job.job_id}")

            try:
                job_element = job.getElementsByTagName(
                    'JB_submission_time').pop(0)
                element_child = job_element.childNodes.pop(0)
                time_string = str(element_child.data).strip()
                try:
                    this_job.submission_time = self._parse_time_string(
                        time_string)
                except ValueError:
                    self.logger.warning(
                        f"Error parsing 'JB_submission_time' for job id {this_job.job_id} ('{time_string}')"
                    )
            except IndexError:
                try:
                    job_element = job.getElementsByTagName(
                        'JAT_start_time').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    time_string = str(element_child.data).strip()
                    try:
                        this_job.dispatch_time = self._parse_time_string(
                            time_string)
                    except ValueError:
                        self.logger.warning(
                            f"Error parsing 'JAT_start_time' for job id {this_job.job_id} ('{time_string}')"
                        )
                except IndexError:
                    self.logger.warning("No 'JB_submission_time' and no "
                                        "'JAT_start_time' field for job "
                                        'id {}'.format(this_job.job_id))

            # There is also cpu_usage, mem_usage, io_usage information available:
            if this_job.job_state == JobState.RUNNING:
                try:
                    job_element = job.getElementsByTagName('slots').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    this_job.num_mpiprocs = str(element_child.data).strip()
                except IndexError:
                    self.logger.warning(
                        f"No 'slots' field for job id {this_job.job_id}")

            joblist.append(this_job)
        # self.logger.debug("joblist final: {}".format(joblist))
        return joblist
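The per-field extraction pattern above, factored into a helper: pop the first matching element and read its text node, with IndexError signalling a missing tag exactly as in the except clauses above. The XML snippet and helper name are illustrative:

import xml.dom.minidom

def text_of(element, tag):
    """Return the stripped text of the first <tag> child, or None if absent."""
    try:
        child = element.getElementsByTagName(tag).pop(0)
        return str(child.childNodes.pop(0).data).strip()
    except IndexError:
        return None

doc = xml.dom.minidom.parseString(
    '<job_list><JB_job_number> 4711 </JB_job_number></job_list>')
root = doc.firstChild
assert text_of(root, 'JB_job_number') == '4711'
assert text_of(root, 'state') is None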
Example #8
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command,
        that is here implemented as a list of lines, one for each
        job, with _field_separator as separator. The order is described
        in the _get_joblist_command function.

        Return a list of JobInfo objects, one of each job,
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may
            either appear here, or not.
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-branches
        num_fields = len(self._joblist_fields)

        if retval != 0:
            self.logger.warning('Error in _parse_joblist_output: retval={}; '
                                'stdout={}; stderr={}'.format(
                                    retval, stdout, stderr))
            raise SchedulerError('Error during parsing joblist output, '
                                 'retval={}\n'
                                 'stdout={}\nstderr={}'.format(
                                     retval, stdout, stderr))

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I put num_fields, because in this way
        # if the symbol _field_separator appears in the title (that is
        # the last field), I don't split the title.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [
            l.split(_FIELD_SEPARATOR, num_fields) for l in stdout.splitlines()
            if _FIELD_SEPARATOR in l
        ]

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            # Each job should have all fields.
            if len(job) != num_fields:
                # I skip this calculation
                # (I don't append anything to job_list before continuing)
                self.logger.error(
                    "Wrong line length in bjobs output! '{}'".format(job))
                continue

            this_job = JobInfo()
            this_job.job_id = job[0]
            this_job.annotation = job[2]
            job_state_raw = job[1]

            try:
                job_state_string = _MAP_STATUS_LSF[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    'id {}'.format(job_state_raw,
                                                   this_job.job_id))
                job_state_string = JobState.UNDETERMINED

            this_job.job_state = job_state_string

            # I get the remaining fields
            # The first three were already obtained
            # I know that the length is exactly num_fields because
            # I used split(_field_separator, num_fields) before
            # when creating 'job'
            #            (_, _, _, executing_host, username, number_nodes,
            #             number_cpus, allocated_machines, partition,
            #             time_limit, time_used, dispatch_time, job_name) = job
            (_, _, _, _, username, number_nodes, number_cpus,
             allocated_machines, partition, finish_time, start_time,
             percent_complete, submission_time, job_name) = job

            this_job.job_owner = username
            try:
                this_job.num_machines = int(number_nodes)
            except ValueError:
                self.logger.warning('The number of allocated nodes is not '
                                    'an integer ({}) for job id {}!'.format(
                                        number_nodes, this_job.job_id))

            try:
                this_job.num_mpiprocs = int(number_cpus)
            except ValueError:
                self.logger.warning('The number of allocated cores is not '
                                    'an integer ({}) for job id {}!'.format(
                                        number_cpus, this_job.job_id))

            # ALLOCATED NODES HERE
            # string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # therefore it requires some parsing, which is unnecessary now.
            # I just store it as a raw string for the moment, and I leave
            # this_job.allocated_machines undefined
            if this_job.job_state == JobState.RUNNING:
                this_job.allocated_machines_raw = allocated_machines

            this_job.queue_name = partition

            psd_finish_time = self._parse_time_string(finish_time,
                                                      fmt='%b %d %H:%M')
            psd_start_time = self._parse_time_string(start_time,
                                                     fmt='%b %d %H:%M')
            psd_submission_time = self._parse_time_string(submission_time,
                                                          fmt='%b %d %H:%M')

            # Now get the time in seconds which has been used
            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may not be set (in my test, it is set to zero)
            if this_job.job_state == JobState.RUNNING:
                try:
                    requested_walltime = psd_finish_time - psd_start_time
                    # fix of a weird bug: since the year is not parsed, it is assumed
                    # to always be 1900. Therefore, a job submitted
                    # in December and finishing in January would produce negative time differences
                    if requested_walltime.total_seconds() < 0:
                        import datetime
                        old_month = psd_finish_time.month
                        old_day = psd_finish_time.day
                        old_hour = psd_finish_time.hour
                        old_minute = psd_finish_time.minute
                        new_year = psd_start_time.year + 1
                        # note: we assume that no job will last more than 1 year...
                        psd_finish_time = datetime.datetime(year=new_year,
                                                            month=old_month,
                                                            day=old_day,
                                                            hour=old_hour,
                                                            minute=old_minute)
                        requested_walltime = psd_finish_time - psd_start_time

                    this_job.requested_wallclock_time_seconds = requested_walltime.total_seconds()  # pylint: disable=invalid-name
                except (TypeError, ValueError):
                    self.logger.warning(
                        'Error parsing the time limit for job id {}'.format(
                            this_job.job_id))

                try:
                    psd_percent_complete = float(
                        percent_complete.strip(' L').strip('%'))
                    this_job.wallclock_time_seconds = requested_walltime.total_seconds() * psd_percent_complete / 100.
                except ValueError:
                    self.logger.warning(
                        'Error parsing the time used for job id {}'.format(
                            this_job.job_id))

            try:
                this_job.submission_time = psd_submission_time
            except ValueError:
                self.logger.warning(
                    'Error parsing submission time for job id {}'.format(
                        this_job.job_id))

            this_job.title = job_name

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful now, allocated_machines in this
            # version of the plugin is never set
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error('The length of the list of allocated '
                                      'nodes ({}) is different from the '
                                      'expected number of nodes ({})!'.format(
                                          len(this_job.allocated_machines),
                                          this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
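The year-rollover correction in isolation: '%b %d %H:%M' carries no year, so strptime defaults to 1900 and a job crossing New Year yields a negative delta. Bumping the finish time by one year (assuming, as the comment above does, that no job lasts more than a year) restores a sensible walltime. A self-contained sketch:

from datetime import datetime

start = datetime.strptime('Dec 31 23:50', '%b %d %H:%M')
finish = datetime.strptime('Jan 1 00:20', '%b %d %H:%M')

walltime = finish - start
if walltime.total_seconds() < 0:
    # datetime.replace is a compact equivalent of rebuilding the datetime field by field
    finish = finish.replace(year=start.year + 1)
    walltime = finish - start

assert walltime.total_seconds() == 30 * 60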
Example #9
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command (qstat -f).

        Return a list of JobInfo objects, one of each job,
        each relevant parameters implemented.

        Note: depending on the scheduler configuration, finished jobs may
            either appear here, or not.
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """

        # I don't raise because if I pass a list of jobs, I get a non-zero status
        # if one of the jobs is not in the list anymore

        # retval should be zero
        # if retval != 0:
        # _LOGGER.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output
        # but I strip lines containing "Unknown Job Id", that happens
        # also when I ask for a calculation that has finished
        #
        # I also strip for "Job has finished" because this happens for
        # those schedulers configured to leave the job in the output
        # of qstat for some time after job completion.
        filtered_stderr = '\n'.join(
            l for l in stderr.split('\n')
            if 'Unknown Job Id' not in l and 'Job has finished' not in l)
        if filtered_stderr.strip():
            _LOGGER.warning('Warning in _parse_joblist_output, non-empty '
                            "(filtered) stderr='{}'".format(filtered_stderr))
            if retval != 0:
                raise SchedulerError('Error during qstat parsing, retval={}\n'
                                     'stdout={}\nstderr={}'.format(
                                         retval, stdout, stderr))

        jobdata_raw = []  # will contain raw data parsed from qstat output
        # Get raw data and split in lines
        for line_num, line in enumerate(stdout.split('\n'), start=1):
            # Each new job stanza starts with the string 'Job Id:': I
            # create a new item in the jobdata_raw list
            if line.startswith('Job Id:'):
                jobdata_raw.append({
                    'id': line.split(':', 1)[1].strip(),
                    'lines': [],
                    'warning_lines_idx': []
                })
                # warning_lines_idx: lines that do not start either with
                # tab or space
            else:
                if line.strip():
                    # This is a non-empty line, therefore it is an attribute
                    # of the last job found
                    if not jobdata_raw:
                        # The list is still empty! (This means that I found a
                        # non-empty line, before finding the first 'Job Id:'
                        # string: it is an error. However this may happen
                        # only before the first job.
                        raise SchedulerParsingError(
                            'I did not find the header for the first job')
                        # _LOGGER.warning("I found some text before the "
                        # "first job: {}".format(l))
                    else:
                        if line.startswith(' '):
                            # If it starts with a space, it is a new field
                            jobdata_raw[-1]['lines'].append(line)
                        elif line.startswith('\t'):
                            # If a line starts with a TAB,
                            # I append to the previous string
                            # stripping the TAB
                            if not jobdata_raw[-1]['lines']:
                                raise SchedulerParsingError(
                                    'Line {} is the first line of the job, but it '
                                    'starts with a TAB! ({})'.format(
                                        line_num, line))
                            jobdata_raw[-1]['lines'][-1] += line[1:]
                        else:
                            # raise SchedulerParsingError(
                            #    "Wrong starting character at line {}! ({})"
                            #    "".format(line_num, l))
                            ## For some reason, the output of 'comment' and
                            ## 'Variable_List', for instance, can have
                            ## newlines if they are included... I do a
                            ## workaround
                            jobdata_raw[-1]['lines'][-1] += '\n{}'.format(line)
                            jobdata_raw[-1]['warning_lines_idx'].append(
                                len(jobdata_raw[-1]['lines']) - 1)

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:
            this_job = JobInfo()
            this_job.job_id = job['id']

            lines_without_equals_sign = [
                i for i in job['lines'] if '=' not in i
            ]

            # There are lines without equals sign: this is bad
            if lines_without_equals_sign:
                # Should I only warn?
                _LOGGER.error('There are lines without equals sign! {}'.format(
                    lines_without_equals_sign))
                raise SchedulerParsingError(
                    'There are lines without equals sign.')

            raw_data = {
                i.split('=', 1)[0].strip().lower(): i.split('=',
                                                            1)[1].lstrip()
                for i in job['lines'] if '=' in i
            }

            ## I ignore the errors for the time being - this seems to be
            ## a problem if there are \n in the content of some variables?
            ## I consider this a workaround...
            # for line_with_warning in set(job['warning_lines_idx']):
            #    if job['lines'][line_with_warning].split(
            #        '=',1)[0].strip().lower() != "comment":
            #        raise SchedulerParsingError(
            #            "Wrong starting character in one of the lines "
            #            "of job {}, and it's not a comment! ({})"
            #            "".format(this_job.job_id,
            #                      job['lines'][line_with_warning]))

            problematic_fields = []
            for line_with_warning in set(job['warning_lines_idx']):
                problematic_fields.append(
                    job['lines'][line_with_warning].split(
                        '=', 1)[0].strip().lower())
            if problematic_fields:
                # These are the fields that contain unexpected newlines
                raw_data['warning_fields_with_newlines'] = problematic_fields

            # I believe that exit_status and terminating_signal cannot be
            # retrieved from the qstat -f output.

            # I wrap calls in try-except clauses to avoid errors if a field
            # is missing
            try:
                this_job.title = raw_data['job_name']
            except KeyError:
                _LOGGER.debug("No 'job_name' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.annotation = raw_data['comment']
            except KeyError:
                # Many jobs do not have a comment; I do not complain about it.
                pass
                # _LOGGER.debug("No 'comment' field for job id {}".format(
                #    this_job.job_id))

            try:
                job_state_string = raw_data['job_state']
                try:
                    this_job.job_state = self._map_status[job_state_string]
                except KeyError:
                    _LOGGER.warning("Unrecognized job_state '{}' for job "
                                    'id {}'.format(job_state_string,
                                                   this_job.job_id))
                    this_job.job_state = JobState.UNDETERMINED
            except KeyError:
                _LOGGER.debug("No 'job_state' field for job id {}".format(
                    this_job.job_id))
                this_job.job_state = JobState.UNDETERMINED

            try:
                this_job.job_substate = raw_data['substate']
            except KeyError:
                _LOGGER.debug("No 'substate' field for job id {}".format(
                    this_job.job_id))

            try:
                exec_hosts = raw_data['exec_host'].split('+')
            except KeyError:
                # No exec_host information found (it may be ok, if the job
                # is not running)
                pass
            else:
                # parse each host; syntax, from the man page:
                # hosta/J1+hostb/J2*P+...
                # where  J1 and J2 are an index of the job
                # on the named host and P is the number of
                # processors allocated from that host to this job.
                # P does not appear if it is 1.
                try:
                    exec_host_list = []
                    for exec_host in exec_hosts:
                        node = MachineInfo()
                        node.name, data = exec_host.split('/')
                        data = data.split('*')
                        if len(data) == 1:
                            node.jobIndex = int(data[0])
                            node.num_cpus = 1
                        elif len(data) == 2:
                            node.jobIndex = int(data[0])
                            node.num_cpus = int(data[1])
                        else:
                            raise ValueError(
                                'Wrong number of pieces: {} '
                                'instead of 1 or 2 in exec_hosts: '
                                '{}'.format(len(data), exec_hosts))
                        exec_host_list.append(node)
                    this_job.allocated_machines = exec_host_list
                except Exception as exc:
                    _LOGGER.debug('Problem parsing the node names, I '
                                  'got Exception {} with message {}; '
                                  'exec_hosts was {}'.format(
                                      str(type(exc)), exc, exec_hosts))

            try:
                # I strip the part after the @: is this always ok?
                this_job.job_owner = raw_data['job_owner'].split('@')[0]
            except KeyError:
                _LOGGER.debug("No 'job_owner' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.num_cpus = int(raw_data['resource_list.ncpus'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.ncpus' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning("'resource_list.ncpus' is not an integer "
                                '({}) for job id {}!'.format(
                                    raw_data['resource_list.ncpus'],
                                    this_job.job_id))

            try:
                this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.mpiprocs' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning("'resource_list.mpiprocs' is not an integer "
                                '({}) for job id {}!'.format(
                                    raw_data['resource_list.mpiprocs'],
                                    this_job.job_id))

            try:
                this_job.num_machines = int(raw_data['resource_list.nodect'])
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.nodect' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning("'resource_list.nodect' is not an integer "
                                '({}) for job id {}!'.format(
                                    raw_data['resource_list.nodect'],
                                    this_job.job_id))

            # Double check of redundant info
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(
                        set(machine.name for machine in this_job.
                            allocated_machines)) != this_job.num_machines:
                    _LOGGER.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

            try:
                this_job.queue_name = raw_data['queue']
            except KeyError:
                _LOGGER.debug("No 'queue' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.requested_wallclock_time_seconds = (self._convert_time(
                    raw_data['resource_list.walltime']))
            except KeyError:
                _LOGGER.debug(
                    "No 'resource_list.walltime' field for job id {}".format(
                        this_job.job_id))
            except ValueError:
                _LOGGER.warning(
                    "Error parsing 'resource_list.walltime' for job id {}".
                    format(this_job.job_id))

            try:
                this_job.wallclock_time_seconds = (self._convert_time(
                    raw_data['resources_used.walltime']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                _LOGGER.warning(
                    "Error parsing 'resources_used.walltime' for job id {}".
                    format(this_job.job_id))

            try:
                this_job.cpu_time = (self._convert_time(
                    raw_data['resources_used.cput']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                _LOGGER.warning(
                    "Error parsing 'resources_used.cput' for job id {}".format(
                        this_job.job_id))

            #
            # ctime: The time that the job was created
            # mtime: The time that the job was last modified, changed state,
            #        or changed locations.
            # qtime: The time that the job entered the current queue
            # stime: The time when the job started execution.
            # etime: The time that the job became eligible to run, i.e. in a
            #        queued state while residing in an execution queue.

            try:
                this_job.submission_time = self._parse_time_string(
                    raw_data['ctime'])
            except KeyError:
                _LOGGER.debug("No 'ctime' field for job id {}".format(
                    this_job.job_id))
            except ValueError:
                _LOGGER.warning("Error parsing 'ctime' for job id {}".format(
                    this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(
                    raw_data['stime'])
            except KeyError:
                # The job may not have been started yet
                pass
            except ValueError:
                _LOGGER.warning("Error parsing 'stime' for job id {}".format(
                    this_job.job_id))

            # TODO: see if we want to set also finish_time for finished jobs,
            # if there are any

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = raw_data

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
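The exec_host grammar described in the man-page comment above ('hosta/J1+hostb/J2*P', with P defaulting to 1 when absent), parsed into plain dicts; dicts stand in for MachineInfo to keep the sketch self-contained:

def parse_exec_host(exec_host):
    """Parse 'hosta/J1+hostb/J2*P' into per-host dicts."""
    machines = []
    for chunk in exec_host.split('+'):
        name, data = chunk.split('/')
        parts = data.split('*')
        if len(parts) not in (1, 2):
            raise ValueError('Wrong number of pieces: {} in {}'.format(len(parts), chunk))
        machines.append({
            'name': name,
            'job_index': int(parts[0]),
            'num_cpus': int(parts[1]) if len(parts) == 2 else 1,  # P omitted means 1
        })
    return machines

assert parse_exec_host('node1/0*16+node2/1') == [
    {'name': 'node1', 'job_index': 0, 'num_cpus': 16},
    {'name': 'node2', 'job_index': 1, 'num_cpus': 1},
]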
Example #10
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by _get_joblist_command command (qstat -f).

        Return a list of JobInfo objects, one of each job,
        each relevant parameters implemented.

        .. note:: depending on the scheduler configuration, finished jobs
            may either appear here, or not.
            This function will only return one element for each job find
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        import re

        filtered_stderr = '\n'.join(l for l in stderr.split('\n'))
        if filtered_stderr.strip():
            self.logger.warning(
                'Warning in _parse_joblist_output, non-empty '
                "(filtered) stderr='{}'".format(filtered_stderr)
            )
            if retval != 0:
                raise SchedulerError('Error during direct execution parsing (_parse_joblist_output function)')

        # Create dictionary and parse specific fields
        job_list = []
        for line in stdout.split('\n'):
            if re.search(r'^\s*PID', line) or line == '':
                # Skip the header if present
                continue
            line = re.sub(r'^\s+', '', line)
            job = re.split(r'\s+', line)
            this_job = JobInfo()
            this_job.job_id = job[0]

            if len(job) < 3:
                raise SchedulerError(
                    'Unexpected output from the scheduler, '
                    "not enough fields in line '{}'".format(line)
                )

            try:
                job_state_string = job[1][0]  # I just check the first character
            except IndexError:
                self.logger.debug("No 'job_state' field for job id {}".format(this_job.job_id))
                this_job.job_state = JobState.UNDETERMINED
            else:
                try:
                    this_job.job_state = \
                        _MAP_STATUS_PS[job_state_string]
                except KeyError:
                    self.logger.warning(
                        "Unrecognized job_state '{}' for job "
                        'id {}'.format(job_state_string, this_job.job_id)
                    )
                    this_job.job_state = JobState.UNDETERMINED

            try:
                this_job.job_owner = job[2]
            except IndexError:
                self.logger.debug("No 'job_owner' field for job id {}".format(this_job.job_id))

            try:
                this_job.wallclock_time_seconds = self._convert_time(job[3])
            except IndexError:
                # May not have started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing the wallclock time for job id {}".format(this_job.job_id))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
    def get_jobs(self, jobs=None, user=None, as_dict=False):
        """
        Return the list of currently active jobs
        """
        computer_id = self.transport._machine  # Host name is used as the identifier
        lpad = self.lpad

        query = {
            # Limit to this machine
            "spec._aiida_job_info.computer_id": computer_id,
            # Ignore completed and archived jobs
            "state": {"$not": {"$in": ["COMPLETED", "ARCHIVED"]}},
        }

        # Limit to the specific fw_ids
        if jobs:
            # Convert to integer keys
            jobs = [int(job_id) for job_id in jobs]
            query['fw_id'] = {'$in': jobs}

        fw_ids = lpad.get_fw_ids(query)
        joblist = []
        for fid in fw_ids:
            # Get the information of the fireworks in the dict format
            # this is more robust than getting Fireworks objects
            try:
                fw_dict = lpad.get_fw_dict_by_id(fid)
            except ValueError:
                raise SchedulerError(f"No FW found for id: {fid}")

            spec = fw_dict.get("spec", {})

            this_job = JobInfo()
            this_job.job_id = str(fid)
            try:
                this_job.job_state = _MAP_STATUS_FW[fw_dict['state']]
            except KeyError:
                this_job.job_state = JobState.UNDETERMINED

            this_job.title = fw_dict.get('name')

            # Category or categories are mapped to queue_name attribute
            category = spec.get('category')
            if isinstance(category, str):
                this_job.queue_name = category
            elif isinstance(category, (tuple, list)):
                this_job.queue_name = ":".join(category)

            # The created_on is mapped to the submission time
            try:
                this_job.submission_time = datetime.strptime(
                    fw_dict['created_on'], "%Y-%m-%dT%H:%M:%S.%f")
            except ValueError:
                pass
            # NOTE: add information about the dispatch time by looking into the launches

            joblist.append(this_job)

        if as_dict:
            jobdict = {job.job_id: job for job in joblist}
            if None in jobdict:
                raise SchedulerError('Found at least one job without jobid')
            return jobdict

        return joblist
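A more compact equivalent of the state lookup above, using dict.get with an explicit fallback; the _MAP_STATUS_FW contents shown here are an illustrative subset, not the real mapping:

from aiida.schedulers.datastructures import JobState

_MAP_STATUS_FW = {
    'READY': JobState.QUEUED,      # illustrative
    'RUNNING': JobState.RUNNING,   # illustrative
    'COMPLETED': JobState.DONE,    # illustrative
}

assert _MAP_STATUS_FW.get('DEFUSED', JobState.UNDETERMINED) == JobState.UNDETERMINED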