Code example #1
File: direct.py    Project: santiama/aiida_core
    def getJobs(self, jobs=None, user=None, as_dict=False):
        """
        Overrides original method from DirectScheduler in order to list
        missing processes as DONE.
        """
        job_stats = super(DirectScheduler, self).getJobs(jobs=jobs,
                                                         user=user,
                                                         as_dict=as_dict)

        found_jobs = []
        # Get the list of known jobs
        if as_dict:
            found_jobs = job_stats.keys()
        else:
            found_jobs = [j.job_id for j in job_stats]
        # Now check whether any jobs the user requested were not found
        not_found_jobs = list(set(jobs) - set(found_jobs)) if jobs else []

        for job_id in not_found_jobs:
            job = JobInfo()
            job.job_id = job_id
            job.job_state = job_states.DONE
            # Owner and wallclock time are unknown
            if as_dict:
                job_stats[job_id] = job
            else:
                job_stats.append(job)

        return job_stats
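
As a stand-alone illustration of the bookkeeping above (the job ids are made up), this is the set difference that decides which jobs get reported as DONE:

# Sketch of the set difference used above: jobs that were requested but are
# absent from the scheduler answer get reported as DONE.
requested = ['100', '101', '102']
found_jobs = ['100', '102']              # what the parent class returned
not_found_jobs = list(set(requested) - set(found_jobs))
print(not_found_jobs)                    # ['101'] -> would be marked DONE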
Code example #2
def update_job_calc_from_detailed_job_info(calc, detailed_job_info):
    """
    Updates the detailed job info for a JobCalculation as obtained from
    the scheduler

    :param calc: The job calculation
    :param detailed_job_info: the detailed information as returned by the
        scheduler for this job
    """
    from aiida.scheduler.datastructures import JobInfo

    last_jobinfo = calc._get_last_jobinfo()
    if last_jobinfo is None:
        last_jobinfo = JobInfo()
        last_jobinfo.job_id = calc.get_job_id()
        last_jobinfo.job_state = JOB_STATES.DONE

    last_jobinfo.detailedJobinfo = detailed_job_info
    calc._set_last_jobinfo(last_jobinfo)
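
A rough sketch of how this helper might be called once a job has finished; `scheduler` and `calc` are placeholders for a scheduler plugin instance and a JobCalculation node, not objects defined in the snippet above:

# Hypothetical call site: fetch the detailed job info from the scheduler
# (see get_detailed_jobinfo in code example #7) and attach it to the node.
detailed = scheduler.get_detailed_jobinfo(jobid=calc.get_job_id())
update_job_calc_from_detailed_job_info(calc, detailed)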
Code example #3
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by the _get_joblist_command method (qstat -f).

        Return a list of JobInfo objects, one for each job, with the
        relevant parameters filled in.

        Note: depending on the scheduler configuration, finished jobs may
            or may not appear here.
            This function will only return one element for each job found
            in the qstat output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        # I don't raise because if I pass a list of jobs, I get a non-zero status
        # if one of the jobs is not in the list anymore

        # retval should be zero
        #if retval != 0:
        #self.logger.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output
        # but I strip lines containing "Unknown Job Id", that happens
        # also when I ask for a calculation that has finished
        #
        # I also strip for "Job has finished" because this happens for
        # those schedulers configured to leave the job in the output
        # of qstat for some time after job completion.
        filtered_stderr = '\n'.join(
            l for l in stderr.split('\n') if "Unknown Job Id" not in l and "Job has finished" not in l)
        if filtered_stderr.strip():
            self.logger.warning("Warning in _parse_joblist_output, non-empty "
                                "(filtered) stderr='{}'".format(filtered_stderr))
            if retval != 0:
                raise SchedulerError(
                    "Error during qstat parsing (_parse_joblist_output function)")

        jobdata_raw = []  # will contain raw data parsed from qstat output
        # Get raw data and split in lines
        for line_num, l in enumerate(stdout.split('\n'), start=1):
            # Each new job stanza starts with the string 'Job Id:': I
            # create a new item in the jobdata_raw list
            if l.startswith('Job Id:'):
                jobdata_raw.append(
                    {'id': l.split(':', 1)[1].strip(),
                     'lines': [], 'warning_lines_idx': []})
                # warning_lines_idx: indices of lines that start with
                # neither a tab nor a space
            else:
                if l.strip():
                    # This is a non-empty line, therefore it is an attribute
                    # of the last job found
                    if not jobdata_raw:
                        # The list is still empty! (This means that I found a
                        # non-empty line before finding the first 'Job Id:'
                        # string: it is an error. However, this may happen
                        # only before the first job.)
                        raise SchedulerParsingError("I did not find the header for the first job")
                        #self.logger.warning("I found some text before the "
                        #"first job: {}".format(l))
                    else:
                        if l.startswith(' '):
                            # If it starts with a space, it is a new field
                            jobdata_raw[-1]['lines'].append(l)
                        elif l.startswith('\t'):
                            # If a line starts with a TAB,
                            # I append to the previous string
                            # stripping the TAB
                            if not jobdata_raw[-1]['lines']:
                                raise SchedulerParsingError(
                                    "Line {} is the first line of the job, but it "
                                    "starts with a TAB! ({})".format(line_num, l))
                            jobdata_raw[-1]['lines'][-1] += l[1:]
                        else:
                            #raise SchedulerParsingError(
                            #    "Wrong starting character at line {}! ({})"
                            #    "".format(line_num, l))
                            ## For some reason, the output of 'comment' and
                            ## 'Variable_List', for instance, can contain
                            ## newlines if they are included... I apply a
                            ## workaround
                            jobdata_raw[-1]['lines'][-1] += "\n{}".format(l)
                            jobdata_raw[-1]['warning_lines_idx'].append(
                                len(jobdata_raw[-1]['lines']) - 1)

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:
            this_job = JobInfo()
            this_job.job_id = job['id']

            lines_without_equals_sign = [i for i in job['lines']
                                         if '=' not in i]

            # There are lines without an equals sign: this is bad
            if lines_without_equals_sign:
                # Should I only warn?
                self.logger.error("There are lines without an equals sign! {}"
                                  "".format(lines_without_equals_sign))
                raise SchedulerParsingError("There are lines without an "
                                            "equals sign.")

            raw_data = {i.split('=', 1)[0].strip().lower():
                            i.split('=', 1)[1].lstrip()
                        for i in job['lines'] if '=' in i}

            ## I ignore the errors for the time being - this seems to be
            ## a problem if there are \n in the content of some variables?
            ## I consider this a workaround...
            #for line_with_warning in set(job['warning_lines_idx']):
            #    if job['lines'][line_with_warning].split(
            #        '=',1)[0].strip().lower() != "comment":
            #        raise SchedulerParsingError(
            #            "Wrong starting character in one of the lines "
            #            "of job {}, and it's not a comment! ({})"
            #            "".format(this_job.job_id,
            #                      job['lines'][line_with_warning]))

            problematic_fields = []
            for line_with_warning in set(job['warning_lines_idx']):
                problematic_fields.append(job['lines'][line_with_warning].split(
                    '=', 1)[0].strip().lower())
            if problematic_fields:
                # These are the fields that contain unexpected newlines
                raw_data['warning_fields_with_newlines'] = problematic_fields

            # I believe that exit_status and terminating_signal cannot be
            # retrieved from the qstat -f output.

            # I wrap calls in try-except clauses to avoid errors if a field
            # is missing
            try:
                this_job.title = raw_data['job_name']
            except KeyError:
                self.logger.debug("No 'job_name' field for job id "
                                  "{}".format(this_job.job_id))

            try:
                this_job.annotation = raw_data['comment']
            except KeyError:
                # Many jobs do not have a comment; I do not complain about it.
                pass
                #self.logger.debug("No 'comment' field for job id {}".format(
                #    this_job.job_id))

            try:
                job_state_string = raw_data['job_state']
                try:
                    this_job.job_state = self._map_status[job_state_string]
                except KeyError:
                    self.logger.warning("Unrecognized job_state '{}' for job "
                                        "id {}".format(job_state_string,
                                                       this_job.job_id))
                    this_job.job_state = job_states.UNDETERMINED
            except KeyError:
                self.logger.debug("No 'job_state' field for job id {}".format(
                    this_job.job_id))
                this_job.job_state = job_states.UNDETERMINED

            try:
                this_job.job_substate = raw_data['substate']
            except KeyError:
                self.logger.debug("No 'substate' field for job id {}".format(
                    this_job.job_id))

            try:
                exec_hosts = raw_data['exec_host'].split('+')
            except KeyError:
                # No exec_host information found (it may be ok, if the job
                # is not running)
                pass
            else:
                # parse each host; syntax, from the man page:
                # hosta/J1+hostb/J2*P+...
                # where  J1 and J2 are an index of the job
                # on the named host and P is the number of
                # processors allocated from that host to this job.
                # P does not appear if it is 1.
                try:
                    exec_host_list = []
                    for exec_host in exec_hosts:
                        node = MachineInfo()
                        node.name, data = exec_host.split('/')
                        data = data.split('*')
                        if len(data) == 1:
                            node.jobIndex = int(data[0])
                            node.num_cpus = 1
                        elif len(data) == 2:
                            node.jobIndex = int(data[0])
                            node.num_cpus = int(data[1])
                        else:
                            raise ValueError("Wrong number of pieces: {} "
                                             "instead of 1 or 2 in exec_hosts: "
                                             "{}".format(len(data), exec_hosts))
                        exec_host_list.append(node)
                    this_job.allocated_machines = exec_host_list
                except Exception as e:
                    self.logger.debug("Problem parsing the node names, I "
                                      "got Exception {} with message {}; "
                                      "exec_hosts was {}".format(
                        str(type(e)), e.message, exec_hosts))

            try:
                # I strip the part after the @: is this always ok?
                this_job.job_owner = raw_data['job_owner'].split('@')[0]
            except KeyError:
                self.logger.debug("No 'job_owner' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.num_cpus = int(raw_data['resource_list.ncpus'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                self.logger.debug("No 'resource_list.ncpus' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("'resource_list.ncpus' is not an integer "
                                    "({}) for job id {}!".format(
                    raw_data['resource_list.ncpus'],
                    this_job.job_id))

            try:
                this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs'])
                # TODO: understand if this is the correct field also for
                #       multithreaded (OpenMP) jobs.
            except KeyError:
                self.logger.debug("No 'resource_list.mpiprocs' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("'resource_list.mpiprocs' is not an integer "
                                    "({}) for job id {}!".format(
                    raw_data['resource_list.mpiprocs'],
                    this_job.job_id))

            try:
                this_job.num_machines = int(raw_data['resource_list.nodect'])
            except KeyError:
                self.logger.debug("No 'resource_list.nodect' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("'resource_list.nodect' is not an integer "
                                    "({}) for job id {}!".format(
                    raw_data['resource_list.nodect'],
                    this_job.job_id))

            # Double check of redundant info
            if (this_job.allocated_machines is not None and
                        this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                        len(this_job.allocated_machines), this_job.num_machines))

            try:
                this_job.queue_name = raw_data['queue']
            except KeyError:
                self.logger.debug("No 'queue' field for job id "
                                  "{}".format(this_job.job_id))

            try:
                this_job.RequestedWallclockTime = (self._convert_time(
                    raw_data['resource_list.walltime']))
            except KeyError:
                self.logger.debug("No 'resource_list.walltime' field for "
                                  "job id {}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("Error parsing 'resource_list.walltime' "
                                    "for job id {}".format(this_job.job_id))

            try:
                this_job.wallclock_time_seconds = (self._convert_time(
                    raw_data['resources_used.walltime']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing 'resources_used.walltime' "
                                    "for job id {}".format(this_job.job_id))

            try:
                this_job.cpu_time = (self._convert_time(
                    raw_data['resources_used.cput']))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing 'resources_used.cput' "
                                    "for job id {}".format(this_job.job_id))

            #
            # ctime: The time that the job was created
            # mtime: The time that the job was last modified, changed state,
            #        or changed locations.
            # qtime: The time that the job entered the current queue
            # stime: The time when the job started execution.
            # etime: The time that the job became eligible to run, i.e. in a
            #        queued state while residing in an execution queue.

            try:
                this_job.submission_time = self._parse_time_string(
                    raw_data['ctime'])
            except KeyError:
                self.logger.debug("No 'ctime' field for job id "
                                  "{}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("Error parsing 'ctime' for job id "
                                    "{}".format(this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(
                    raw_data['stime'])
            except KeyError:
                # The job may not have been started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing 'stime' for job id "
                                    "{}".format(this_job.job_id))

            # TODO: see if we want to set also finish_time for finished jobs,
            # if there are any

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = raw_data

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
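
The exec_host syntax documented in the comments above (hosta/J1+hostb/J2*P+...) can be exercised in isolation. A minimal, self-contained sketch of the same parsing logic, with plain tuples standing in for the MachineInfo objects:

# Sample exec_host string: host name, job index and, optionally, the
# processor count P allocated on that host.
exec_hosts = "nodea/0+nodeb/1*4".split('+')

machines = []
for exec_host in exec_hosts:
    name, data = exec_host.split('/')
    pieces = data.split('*')
    job_index = int(pieces[0])
    num_cpus = int(pieces[1]) if len(pieces) == 2 else 1   # P omitted means 1
    machines.append((name, job_index, num_cpus))

print(machines)   # [('nodea', 0, 1), ('nodeb', 1, 4)]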
Code example #4
File: direct.py    Project: santiama/aiida_core
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by the _get_joblist_command method (ps).

        Return a list of JobInfo objects, one for each job, with the
        relevant parameters filled in.

        .. note:: depending on the scheduler configuration, finished jobs
            may or may not appear here.
            This function will only return one element for each job found
            in the output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        import re

        filtered_stderr = '\n'.join(l for l in stderr.split('\n'))
        if filtered_stderr.strip():
            self.logger.warning(
                "Warning in _parse_joblist_output, non-empty "
                "(filtered) stderr='{}'".format(filtered_stderr))
            if retval != 0:
                raise SchedulerError(
                    "Error during direct execution parsing (_parse_joblist_output function)"
                )

        # Create dictionary and parse specific fields
        job_list = []
        for line in stdout.split('\n'):
            if re.search(r'^\s*PID', line) or line == '':
                # Skip the header if present
                continue
            line = re.sub(r'^\s+', '', line)
            job = re.split(r'\s+', line)
            this_job = JobInfo()
            this_job.job_id = job[0]

            try:
                job_state_string = job[1]
                try:
                    if job_state_string[0] == 'S':
                        this_job.job_state = job_states.SUSPENDED
                    else:
                        this_job.job_state = \
                            _map_status_ps[job_state_string]
                except KeyError:
                    self.logger.warning("Unrecognized job_state '{}' for job "
                                        "id {}".format(job_state_string,
                                                       this_job.job_id))
                    this_job.job_state = job_states.UNDETERMINED
            except IndexError:
                self.logger.debug("No 'job_state' field for job id {}".format(
                    this_job.job_id))
                this_job.job_state = job_states.UNDETERMINED

            try:
                # The username of the job owner, as listed in the output
                this_job.job_owner = job[2]
            except IndexError:
                self.logger.debug("No 'job_owner' field for job id {}".format(
                    this_job.job_id))

            try:
                this_job.wallclock_time_seconds = self._convert_time(job[3])
            except IndexError:
                # May not have started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing the wallclock time "
                                    "for job id {}".format(this_job.job_id))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
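
To see what the splitting above operates on, here is a self-contained sketch with a made-up ps-style output; the exact columns (PID, state, user, elapsed time) are an assumption about what _get_joblist_command prints:

import re

# Illustrative ps-style output; treat the column layout as an assumption.
stdout = """  PID STAT USER     TIME
 1234 R    alice    01:23:45
 5678 S    bob      00:00:10"""

for line in stdout.split('\n'):
    if re.search(r'^\s*PID', line) or line == '':
        continue                                  # skip the header line
    fields = re.split(r'\s+', re.sub(r'^\s+', '', line))
    print(fields[0], fields[1], fields[2], fields[3])
# 1234 R alice 01:23:45
# 5678 S bob 00:00:10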
Code example #5
File: sge.py    Project: kamatani0164/aiida-core
    def _parse_joblist_output(self, retval, stdout, stderr):
        import xml.dom.minidom

        if retval != 0:
            self.logger.error("Error in _parse_joblist_output: retval={}; "
                              "stdout={}; stderr={}".format(
                                  retval, stdout, stderr))
            raise SchedulerError("Error during joblist retrieval, retval={}".\
                                 format(retval))

        if stderr.strip():
            self.logger.warning("in _parse_joblist_output for {}: "
                                "there was some text in stderr: {}".format(
                                    str(self.transport), stderr))

        if stdout:
            try:
                xmldata = xml.dom.minidom.parseString(stdout)
            except xml.parsers.expat.ExpatError:
                self.logger.error("in sge._parse_joblist_output: "
                                  "xml parsing of stdout failed:"
                                  "{}".format(stdout))
                raise SchedulerParsingError("Error during joblist retrieval,"
                                            "xml parsing of stdout failed")
        else:
            self.logger.error("Error in sge._parse_joblist_output: retval={}; "
                              "stdout={}; stderr={}".format(
                                  retval, stdout, stderr))
            raise SchedulerError("Error during joblist retrieval,"
                                 "no stdout produced")

        try:
            first_child = xmldata.firstChild
            second_childs = first_child.childNodes
            tag_names_sec = [elem.tagName for elem in second_childs \
                             if elem.nodeType == 1]
            if 'queue_info' not in tag_names_sec:
                self.logger.error("Error in sge._parse_joblist_output: "
                                  "no queue_info: {}".\
                                  format(stdout))
                raise SchedulerError
            if 'job_info' not in tag_names_sec:
                self.logger.error("Error in sge._parse_joblist_output: "
                                  "no job_info: {}".\
                                  format(stdout))
                raise SchedulerError
        except SchedulerError:
            self.logger.error("Error in sge._parse_joblist_output: stdout={}"\
                              .format(stdout))
            raise SchedulerError("Error during xml processing, of stdout:"
                                 "There is no 'job_info' or no 'queue_info'"
                                 "element or there are no jobs!")
        #If something weird happens while firstChild, pop, etc:
        except Exception:
            self.logger.error("Error in sge._parse_joblist_output: stdout={}"\
                              .format(stdout))
            raise SchedulerError("Error during xml processing, of stdout")

        jobs = [i for i in first_child.getElementsByTagName('job_list')]
        #jobs = [i for i in jobinfo.getElementsByTagName('job_list')]
        #print [i[0].childNodes[0].data for i in job_numbers if i]
        joblist = []
        for job in jobs:
            this_job = JobInfo()

            #In case the user needs more information the xml-data for
            #each job is stored:
            this_job.raw_data = job.toxml()

            try:
                job_element = job.getElementsByTagName('JB_job_number').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_id = str(element_child.data).strip()
                if not this_job.job_id:
                    raise SchedulerError
            except SchedulerError:
                self.logger.error("Error in sge._parse_joblist_output:"
                                  "no job id is given, stdout={}"\
                                  .format(stdout))
                raise SchedulerError("Error in sge._parse_joblist_output:"
                                     "no job id is given")
            except IndexError:
                self.logger.error("No 'job_number' given for job index {} in "
                                  "job list, stdout={}".format(jobs.index(job)\
                                  ,stdout))
                raise IndexError("Error in sge._parse_joblist_output: "
                                 "no job id is given")

            try:
                job_element = job.getElementsByTagName('state').pop(0)
                element_child = job_element.childNodes.pop(0)
                job_state_string = str(element_child.data).strip()
                try:
                    this_job.job_state = _map_status_sge[job_state_string]
                except KeyError:
                    self.logger.warning("Unrecognized job_state '{}' for job "
                                        "id {}".format(job_state_string,
                                                       this_job.job_id))
                    this_job.job_state = job_states.UNDETERMINED
            except IndexError:
                self.logger.warning("No 'job_state' field for job id {} in"
                                    "stdout={}".format(this_job.job_id,
                                                       stdout))
                this_job.job_state = job_states.UNDETERMINED

            try:
                job_element = job.getElementsByTagName('JB_owner').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.job_owner = str(element_child.data).strip()
            except IndexError:
                self.logger.warning("No 'job_owner' field for job "
                                    "id {}".format(this_job.job_id))

            try:
                job_element = job.getElementsByTagName('JB_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.title = str(element_child.data).strip()
            except IndexError:
                self.logger.warning("No 'title' field for job "
                                    "id {}".format(this_job.job_id))

            try:
                job_element = job.getElementsByTagName('queue_name').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.queue_name = str(element_child.data).strip()
            except IndexError:
                if this_job.job_state == job_states.RUNNING:
                    self.logger.warning("No 'queue_name' field for job "
                                        "id {}".format(this_job.job_id))

            try:
                job_element = job.getElementsByTagName(
                    'JB_submission_time').pop(0)
                element_child = job_element.childNodes.pop(0)
                time_string = str(element_child.data).strip()
                try:
                    this_job.submission_time = self._parse_time_string(
                        time_string)
                except ValueError:
                    self.logger.warning("Error parsing 'JB_submission_time' "
                                        "for job id {} ('{}')".format(
                                            this_job.job_id, time_string))
            except IndexError:
                try:
                    job_element = job.getElementsByTagName(
                        'JAT_start_time').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    time_string = str(element_child.data).strip()
                    try:
                        this_job.dispatch_time = self._parse_time_string(
                            time_string)
                    except ValueError:
                        self.logger.warning("Error parsing 'JAT_start_time'"
                                            "for job id {} ('{}')".format(
                                                this_job.job_id, time_string))
                except IndexError:
                    self.logger.warning("No 'JB_submission_time' and no "
                                        "'JAT_start_time' field for job "
                                        "id {}".format(this_job.job_id))

            #There is also cpu_usage, mem_usage, io_usage information available:
            if this_job.job_state == job_states.RUNNING:
                try:
                    job_element = job.getElementsByTagName('slots').pop(0)
                    element_child = job_element.childNodes.pop(0)
                    this_job.num_mpiprocs = str(element_child.data).strip()
                except IndexError:
                    self.logger.warning("No 'slots' field for job "
                                        "id {}".format(this_job.job_id))

            joblist.append(this_job)
        #self.logger.debug("joblist final: {}".format(joblist))
        return joblist
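
To make the minidom navigation above concrete, here is a self-contained sketch on a hand-written, simplified qstat -xml fragment (the XML below is an assumption, not captured from a real SGE installation):

import xml.dom.minidom

# Simplified, hand-written example of the qstat -xml structure assumed above.
sample = """<job_info>
  <queue_info>
    <job_list state="running">
      <JB_job_number>42</JB_job_number>
      <JB_owner>alice</JB_owner>
      <state>r</state>
    </job_list>
  </queue_info>
  <job_info></job_info>
</job_info>"""

xmldata = xml.dom.minidom.parseString(sample)
first_child = xmldata.firstChild
tag_names_sec = [e.tagName for e in first_child.childNodes if e.nodeType == 1]
print(tag_names_sec)                        # ['queue_info', 'job_info']

job = first_child.getElementsByTagName('job_list')[0]
job_id = job.getElementsByTagName('JB_job_number')[0].childNodes[0].data
print(str(job_id).strip())                  # 42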
Code example #6
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by the _get_joblist_command method. The output
        is implemented here as a list of lines, one for each job, with
        _field_separator as separator. The field order is described in
        the _get_joblist_command function.

        Return a list of JobInfo objects, one for each job, with the
        relevant parameters filled in.

        Note: depending on the scheduler configuration, finished jobs may
            or may not appear here.
            This function will only return one element for each job found
            in the squeue output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        num_fields = len(self.fields)

        # I don't raise because if I pass a list of jobs,
        # I get a non-zero status
        # if one of the jobs is not in the list anymore
        # retval should be zero
        #if retval != 0:
        #self.logger.warning("Error in _parse_joblist_output: retval={}; "
        #    "stdout={}; stderr={}".format(retval, stdout, stderr))

        # issue a warning if there is any stderr output and
        # there is no line containing "Invalid job id specified", that happens
        # when I ask for specific calculations, and they are all finished
        if stderr.strip() and "Invalid job id specified" not in stderr:
            self.logger.warning("Warning in _parse_joblist_output, non-empty "
                                "stderr='{}'".format(stderr.strip()))
            if retval != 0:
                raise SchedulerError(
                    "Error during squeue parsing (_parse_joblist_output function)"
                )

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I put num_fields, because in this way
        # if the symbol _field_separator appears in the title (that is
        # the last field), I don't split the title.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [
            l.split(_field_separator, num_fields) for l in stdout.splitlines()
            if _field_separator in l
        ]

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)}

            this_job = JobInfo()
            try:
                this_job.job_id = thisjob_dict['job_id']

                this_job.annotation = thisjob_dict['annotation']
                job_state_raw = thisjob_dict['state_raw']
            except KeyError:
                # I skip this calculation if I couldn't find this basic info
                # (I don't append anything to job_list before continuing)
                self.logger.error("Wrong line length in squeue output! '{}'"
                                  "".format(job))
                continue

            try:
                job_state_string = _map_status_slurm[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_raw,
                                                   this_job.job_id))
                job_state_string = job_states.UNDETERMINED
            # QUEUED_HELD states are not specific states in SLURM;
            # they are instead set with state QUEUED, and then the
            # annotation tells if the job is held.
            # I check for 'Dependency', 'JobHeldUser',
            # 'JobHeldAdmin', 'BeginTime'.
            # Other states should not bring the job in QUEUED_HELD, I believe
            # (the man page of slurm seems to be incomplete, for instance
            # JobHeld* are not reported there; I also checked at the source code
            # of slurm 2.6 on github (https://github.com/SchedMD/slurm),
            # file slurm/src/common/slurm_protocol_defs.c,
            # and these seem all the states to be taken into account for the
            # QUEUED_HELD status).
            # There are actually a few others, like possible
            # failures, or partition-related reasons, but for the moment I
            # leave them in the QUEUED state.
            if (job_state_string == job_states.QUEUED
                    and this_job.annotation in [
                        'Dependency', 'JobHeldUser', 'JobHeldAdmin',
                        'BeginTime'
                    ]):
                job_state_string = job_states.QUEUED_HELD

            this_job.job_state = job_state_string

            ####
            # Up to here, I just made sure that there were at least three
            # fields, to set the most important fields for a job.
            # I now check if the length is equal to the number of fields
            if len(job) < num_fields:
                # I store this job only with the information
                # gathered up to now, and continue to the next job
                # Also print a warning
                self.logger.warning("Wrong line length in squeue output!"
                                    "Skipping optional fields. Line: '{}'"
                                    "".format(jobdata_raw))
                # I append this job before continuing
                job_list.append(this_job)
                continue

            # TODO: store executing_host?

            this_job.job_owner = thisjob_dict['username']

            try:
                this_job.num_machines = int(thisjob_dict['number_nodes'])
            except ValueError:
                self.logger.warning("The number of allocated nodes is not "
                                    "an integer ({}) for job id {}!".format(
                                        thisjob_dict['number_nodes'],
                                        this_job.job_id))

            try:
                this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
            except ValueError:
                self.logger.warning("The number of allocated cores is not "
                                    "an integer ({}) for job id {}!".format(
                                        thisjob_dict['number_cpus'],
                                        this_job.job_id))

            # ALLOCATED NODES HERE
            # string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # therefore it requires some parsing, that is unnecessary now.
            # I just store it as a raw string for the moment, and I leave
            # this_job.allocated_machines undefined
            if this_job.job_state == job_states.RUNNING:
                this_job.allocated_machines_raw = thisjob_dict[
                    'allocated_machines']

            this_job.queue_name = thisjob_dict['partition']

            try:
                this_job.requested_wallclock_time_seconds = (
                    self._convert_time(thisjob_dict['time_limit']))
            except ValueError:
                self.logger.warning("Error parsing the time limit "
                                    "for job id {}".format(this_job.job_id))

            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may be not set (in my test, it is set to zero)
            if this_job.job_state == job_states.RUNNING:
                try:
                    this_job.wallclock_time_seconds = (self._convert_time(
                        thisjob_dict['time_used']))
                except ValueError:
                    self.logger.warning("Error parsing time_used "
                                        "for job id {}".format(
                                            this_job.job_id))

                try:
                    this_job.dispatch_time = self._parse_time_string(
                        thisjob_dict['dispatch_time'])
                except ValueError:
                    self.logger.warning("Error parsing dispatch_time for job "
                                        "id {}".format(this_job.job_id))

            try:
                this_job.submission_time = self._parse_time_string(
                    thisjob_dict['submission_time'])
            except ValueError:
                self.logger.warning("Error parsing submission_time for job "
                                    "id {}".format(this_job.job_id))

            this_job.title = thisjob_dict['job_name']

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful now, allocated_machines in this
            # version of the plugin is never set
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                                          len(this_job.allocated_machines),
                                          this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
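
A self-contained sketch of the field splitting described in the comments above; the separator string and the (format, name) field tuples are assumptions chosen for illustration, mirroring how self.fields is used:

# Hypothetical separator and field layout; only the split/zip logic matters.
_field_separator = '^^^'
fields = [('%i', 'job_id'), ('%t', 'state_raw'),
          ('%r', 'annotation'), ('%j', 'job_name')]
num_fields = len(fields)

line = '12345^^^R^^^Dependency^^^relax structure 3'
job = line.split(_field_separator, num_fields)
thisjob_dict = {k[1]: v for k, v in zip(fields, job)}
print(thisjob_dict['job_id'], thisjob_dict['state_raw'])   # 12345 R
print(thisjob_dict['job_name'])                            # relax structure 3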
Code example #7
File: execmanager.py    Project: santiama/aiida_core
def update_running_calcs_status(authinfo):
    """
    Update the states of calculations in WITHSCHEDULER status belonging
    to user and machine as defined in the 'dbauthinfo' table.
    """
    from aiida.orm import JobCalculation, Computer
    from aiida.scheduler.datastructures import JobInfo
    from aiida.utils.logger import get_dblogger_extra

    if not authinfo.enabled:
        return

    execlogger.debug("Updating running calc status for user {} "
                     "and machine {}".format(authinfo.aiidauser.email,
                                             authinfo.dbcomputer.name))

    # This returns an iterator over aiida JobCalculation objects
    calcs_to_inquire = list(
        JobCalculation._get_all_with_state(state=calc_states.WITHSCHEDULER,
                                           computer=authinfo.dbcomputer,
                                           user=authinfo.aiidauser))

    # NOTE: no further check is done that machine and
    # aiidauser are correct for each calc in calcs
    s = Computer(dbcomputer=authinfo.dbcomputer).get_scheduler()
    t = authinfo.get_transport()

    computed = []

    # I avoid opening an ssh connection if there are
    # no calcs with state WITHSCHEDULER
    if len(calcs_to_inquire):
        jobids_to_inquire = [str(c.get_job_id()) for c in calcs_to_inquire]

        # Open connection
        with t:
            s.set_transport(t)
            # TODO: Check if we are ok with filtering by job (to make this work,
            # I had to remove the check on the retval for getJobs,
            # because if the job has computed and is not in the output of
            # qstat, it gives a nonzero retval)

            # TODO: catch SchedulerError exception and do something
            # sensible (at least, skip this computer but continue with
            # following ones, and set a counter; set calculations to
            # UNKNOWN after a while?)
            if s.get_feature('can_query_by_user'):
                found_jobs = s.getJobs(user="******", as_dict=True)
            else:
                found_jobs = s.getJobs(jobs=jobids_to_inquire, as_dict=True)

            # I update the status of jobs

            for c in calcs_to_inquire:
                try:
                    logger_extra = get_dblogger_extra(c)
                    t._set_logger_extra(logger_extra)

                    jobid = c.get_job_id()
                    if jobid is None:
                        execlogger.error("JobCalculation {} is WITHSCHEDULER "
                                         "but no job id was found!".format(
                                             c.pk),
                                         extra=logger_extra)
                        continue

                    # I check if the calculation to be checked (c)
                    # is in the output of qstat
                    if jobid in found_jobs:
                        # jobinfo: the information returned by
                        # qstat for this job
                        jobinfo = found_jobs[jobid]
                        execlogger.debug("Inquirying calculation {} (jobid "
                                         "{}): it has job_state={}".format(
                                             c.pk, jobid, jobinfo.job_state),
                                         extra=logger_extra)
                        # For the moment, FAILED is not defined
                        if jobinfo.job_state in [job_states.DONE
                                                 ]:  # , job_states.FAILED]:
                            computed.append(c)
                            try:
                                c._set_state(calc_states.COMPUTED)
                            except ModificationNotAllowed:
                                # Someone already set it, just skip
                                pass

                        ## Do not set the WITHSCHEDULER state multiple times,
                        ## this would raise a ModificationNotAllowed
                        # else:
                        # c._set_state(calc_states.WITHSCHEDULER)

                        c._set_scheduler_state(jobinfo.job_state)

                        c._set_last_jobinfo(jobinfo)
                    else:
                        execlogger.debug("Inquirying calculation {} (jobid "
                                         "{}): not found, assuming "
                                         "job_state={}".format(
                                             c.pk, jobid, job_states.DONE),
                                         extra=logger_extra)

                        # calculation c is not found in the output of qstat
                        computed.append(c)
                        c._set_scheduler_state(job_states.DONE)
                except Exception as e:
                    # TODO: implement a counter, after N retrials
                    # set it to a status that
                    # requires the user intervention
                    execlogger.warning("There was an exception for "
                                       "calculation {} ({}): {}".format(
                                           c.pk, e.__class__.__name__,
                                           e.message),
                                       extra=logger_extra)
                    continue

            for c in computed:
                try:
                    logger_extra = get_dblogger_extra(c)
                    try:
                        detailed_jobinfo = s.get_detailed_jobinfo(
                            jobid=c.get_job_id())
                    except NotImplementedError:
                        detailed_jobinfo = (
                            u"AiiDA MESSAGE: This scheduler does not implement "
                            u"the routine get_detailed_jobinfo to retrieve "
                            u"the information on "
                            u"a job after it has finished.")
                    last_jobinfo = c._get_last_jobinfo()
                    if last_jobinfo is None:
                        last_jobinfo = JobInfo()
                        last_jobinfo.job_id = c.get_job_id()
                        last_jobinfo.job_state = job_states.DONE
                    last_jobinfo.detailedJobinfo = detailed_jobinfo
                    c._set_last_jobinfo(last_jobinfo)
                except Exception as e:
                    execlogger.warning("There was an exception while "
                                       "retrieving the detailed jobinfo "
                                       "for calculation {} ({}): {}".format(
                                           c.pk, e.__class__.__name__,
                                           e.message),
                                       extra=logger_extra)
                    continue
                finally:
                    # Set the state to COMPUTED as the very last thing
                    # of this routine; no further change should be done after
                    # this, so that in general the retriever can just
                    # poll for this state, if we want to.
                    try:
                        c._set_state(calc_states.COMPUTED)
                    except ModificationNotAllowed:
                        # Someone already set it, just skip
                        pass

    return computed
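
Stripped of the AiiDA-specific bookkeeping, the state update in the loop above boils down to the following self-contained sketch (plain strings and dicts stand in for JobInfo objects and calculations):

# Simplified sketch: a job that is reported as DONE, or that no longer
# appears in the scheduler output at all, goes into the 'computed' list.
found_jobs = {'100': 'RUNNING', '102': 'DONE'}   # job id -> state (simplified)
calc_job_ids = ['100', '101', '102']

computed = [jid for jid in calc_job_ids
            if found_jobs.get(jid, 'DONE') == 'DONE']
print(computed)   # ['101', '102']: these get the detailed job info next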
Code example #8
File: lsf.py    Project: kamatani0164/aiida-core
    def _parse_joblist_output(self, retval, stdout, stderr):
        """
        Parse the queue output string, as returned by executing the
        command returned by the _get_joblist_command method. The output
        is implemented here as a list of lines, one for each job, with
        _field_separator as separator. The field order is described in
        the _get_joblist_command function.

        Return a list of JobInfo objects, one for each job, with the
        relevant parameters filled in.

        Note: depending on the scheduler configuration, finished jobs may
            or may not appear here.
            This function will only return one element for each job found
            in the output; missing jobs (for whatever reason) simply
            will not appear here.
        """
        num_fields = len(self._joblist_fields)

        if retval != 0:
            self.logger.warning("Error in _parse_joblist_output: retval={}; "
                                "stdout={}; stderr={}".format(
                                    retval, stdout, stderr))
            raise SchedulerError("Error during parsing joblist output, "
                                 "retval={}\n"
                                 "stdout={}\nstderr={}".format(
                                     retval, stdout, stderr))

        # will contain raw data parsed from output: only lines with the
        # separator, and already split in fields
        # I put num_fields, because in this way
        # if the symbol _field_separator appears in the title (that is
        # the last field), I don't split the title.
        # This assumes that _field_separator never
        # appears in any previous field.
        jobdata_raw = [
            l.split(_field_separator, num_fields) for l in stdout.splitlines()
            if _field_separator in l
        ]

        # Create dictionary and parse specific fields
        job_list = []
        for job in jobdata_raw:

            # Each job should have all fields.
            if len(job) != num_fields:
                # I skip this calculation
                # (I don't append anything to job_list before continuing)
                self.logger.error("Wrong line length in squeue output! '{}'"
                                  "".format(job))
                continue

            this_job = JobInfo()
            this_job.job_id = job[0]
            this_job.annotation = job[2]
            job_state_raw = job[1]

            try:
                job_state_string = _map_status_lsf[job_state_raw]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_raw,
                                                   this_job.job_id))
                job_state_string = job_states.UNDETERMINED

            this_job.job_state = job_state_string

            # I get the remaining fields
            # The first three were already obtained
            # I know that the length is exactly num_fields because
            # I used split(_field_separator, num_fields) before
            # when creating 'job'
            #            (_, _, _, executing_host, username, number_nodes,
            #             number_cpus, allocated_machines, partition,
            #             time_limit, time_used, dispatch_time, job_name) = job
            (_, _, _, executing_host, username, number_nodes, number_cpus,
             allocated_machines, partition, finish_time, start_time,
             percent_complete, submission_time, job_name) = job

            this_job.job_owner = username
            try:
                this_job.num_machines = int(number_nodes)
            except ValueError:
                self.logger.warning("The number of allocated nodes is not "
                                    "an integer ({}) for job id {}!".format(
                                        number_nodes, this_job.job_id))

            try:
                this_job.num_mpiprocs = int(number_cpus)
            except ValueError:
                self.logger.warning("The number of allocated cores is not "
                                    "an integer ({}) for job id {}!".format(
                                        number_cpus, this_job.job_id))

            # ALLOCATED NODES HERE
            # string may be in the format
            # nid00[684-685,722-723,748-749,958-959]
            # therefore it requires some parsing, that is unnecessary now.
            # I just store it as a raw string for the moment, and I leave
            # this_job.allocated_machines undefined
            if this_job.job_state == job_states.RUNNING:
                this_job.allocated_machines_raw = allocated_machines

            this_job.queue_name = partition

            psd_finish_time = self._parse_time_string(finish_time,
                                                      fmt='%b %d %H:%M')
            psd_start_time = self._parse_time_string(start_time,
                                                     fmt='%b %d %H:%M')
            psd_submission_time = self._parse_time_string(submission_time,
                                                          fmt='%b %d %H:%M')

            # Now get the time in seconds which has been used
            # Only if it is RUNNING; otherwise it is not meaningful,
            # and may be not set (in my test, it is set to zero)
            if this_job.job_state == job_states.RUNNING:
                try:
                    requested_walltime = psd_finish_time - psd_start_time
                    # Fix for a corner case: since the year is not parsed, it
                    # is assumed to always be 1900. Therefore, a job submitted
                    # in December and finishing in January would produce a negative time difference.
                    if requested_walltime.total_seconds() < 0:
                        import datetime
                        old_month = psd_finish_time.month
                        old_day = psd_finish_time.day
                        old_hour = psd_finish_time.hour
                        old_minute = psd_finish_time.minute
                        new_year = psd_start_time.year + 1
                        # note: we assume that no job will last more than 1 year...
                        psd_finish_time = datetime.datetime(year=new_year,
                                                            month=old_month,
                                                            day=old_day,
                                                            hour=old_hour,
                                                            minute=old_minute)
                        requested_walltime = psd_finish_time - psd_start_time

                    this_job.requested_wallclock_time_seconds = requested_walltime.total_seconds(
                    )
                except (TypeError, ValueError):
                    self.logger.warning("Error parsing the time limit "
                                        "for job id {}".format(
                                            this_job.job_id))

                try:
                    psd_percent_complete = float(
                        percent_complete.strip(' L').strip("%"))
                    this_job.wallclock_time_seconds = requested_walltime.total_seconds(
                    ) * psd_percent_complete / 100.
                except ValueError:
                    self.logger.warning("Error parsing the time used "
                                        "for job id {}".format(
                                            this_job.job_id))

            try:
                this_job.submission_time = psd_submission_time
            except ValueError:
                self.logger.warning("Error parsing submission time for job "
                                    "id {}".format(this_job.job_id))

            this_job.title = job_name

            # Everything goes here anyway for debugging purposes
            this_job.raw_data = job

            # Double check of redundant info
            # Not really useful now, allocated_machines in this
            # version of the plugin is never set
            if (this_job.allocated_machines is not None
                    and this_job.num_machines is not None):
                if len(this_job.allocated_machines) != this_job.num_machines:
                    self.logger.error("The length of the list of allocated "
                                      "nodes ({}) is different from the "
                                      "expected number of nodes ({})!".format(
                                          len(this_job.allocated_machines),
                                          this_job.num_machines))

            # I append to the list of jobs to return
            job_list.append(this_job)

        return job_list
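
The December-to-January corner case handled above can be reproduced in isolation. A self-contained sketch of the same year-rollover correction, with made-up timestamps parsed in the same '%b %d %H:%M' format:

import datetime

# Times parsed without a year default to 1900, so a job starting in December
# and finishing in January initially yields a negative difference.
psd_start_time = datetime.datetime.strptime('Dec 31 23:00', '%b %d %H:%M')
psd_finish_time = datetime.datetime.strptime('Jan 1 01:00', '%b %d %H:%M')

requested_walltime = psd_finish_time - psd_start_time
if requested_walltime.total_seconds() < 0:
    # Move the finish time into the next year, mirroring the fix above
    # (assuming no job lasts more than one year).
    psd_finish_time = psd_finish_time.replace(year=psd_start_time.year + 1)
    requested_walltime = psd_finish_time - psd_start_time

print(requested_walltime.total_seconds())   # 7200.0, i.e. two hours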