def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command built by the _get_joblist_command function.

    The output is expected to be a list of lines, one for each job, with
    _FIELD_SEPARATOR as separator between the fields. The order of the
    fields is described in the _get_joblist_command function.

    Note: depending on the scheduler configuration, finished jobs may
        either appear here, or not.
        This function will only return one element for each job found
        in the scheduler output; missing jobs (for whatever reason) simply
        will not appear here.

    :param retval: the return value of the joblist command
    :param stdout: the standard output of the joblist command
    :param stderr: the standard error of the joblist command
    :return: a list of JobInfo objects, one for each parsed line
    :raises SchedulerError: if stderr is non-empty, does not contain
        'Invalid job id specified', and retval is non-zero
    """
    num_fields = len(self.fields)

    # A non-zero retval alone is not an error: when a list of jobs is
    # passed and one of them is not in the queue anymore, the command
    # returns a non-zero status. The stderr then contains
    # "Invalid job id specified" (e.g. when all requested calculations
    # are finished), and that case is silently accepted.
    # NOTE(review): the raise is kept inside the stderr check, consistent
    # with the remark above; confirm this is the intended nesting.
    if stderr.strip() and 'Invalid job id specified' not in stderr:
        self.logger.warning("Warning in _parse_joblist_output, non-empty stderr='{}'".format(stderr.strip()))
        if retval != 0:
            raise SchedulerError('Error during squeue parsing (_parse_joblist_output function)')

    # Raw data parsed from the output: only lines containing the
    # separator, already split in fields.
    # NOTE(review): str.split performs at most `num_fields` splits, so a
    # line may yield up to num_fields + 1 items; the extra item is dropped
    # by the zip() below. This assumes the separator never appears inside
    # a field -- confirm against the field order in _get_joblist_command.
    jobdata_raw = [
        line.split(_FIELD_SEPARATOR, num_fields) for line in stdout.splitlines() if _FIELD_SEPARATOR in line
    ]

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        # Map the field name (second element of each entry of self.fields)
        # to the value found in this line; zip() stops at the shortest.
        thisjob_dict = {field[1]: value for field, value in zip(self.fields, job)}

        this_job = JobInfo()
        try:
            this_job.job_id = thisjob_dict['job_id']
            this_job.annotation = thisjob_dict['annotation']
            job_state_raw = thisjob_dict['state_raw']
        except KeyError:
            # The line is too short to contain this basic info: skip the
            # job altogether (nothing is appended to job_list).
            self.logger.error("Wrong line length in squeue output! '{}'".format(job))
            continue

        try:
            job_state_string = _MAP_STATUS_SLURM[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job "
                                'id {}'.format(job_state_raw, this_job.job_id))
            job_state_string = JobState.UNDETERMINED

        # QUEUED_HELD states are not specific states in SLURM;
        # they are instead set with state QUEUED, and then the
        # annotation tells if the job is held.
        # Only 'Dependency', 'JobHeldUser', 'JobHeldAdmin' and 'BeginTime'
        # are mapped to QUEUED_HELD. Other reasons (possible failures,
        # partition-related reasons, ...) are left in the QUEUED state
        # for the moment.
        if (job_state_string == JobState.QUEUED and
                this_job.annotation in ['Dependency', 'JobHeldUser', 'JobHeldAdmin', 'BeginTime']):
            job_state_string = JobState.QUEUED_HELD

        this_job.job_state = job_state_string

        # Up to here, only the most important fields of the job were set.
        # If the line does not contain all the fields, store the job with
        # the information gathered so far and move on to the next one.
        if len(job) < num_fields:
            # Log the offending line only (not the whole list of lines).
            self.logger.warning('Wrong line length in squeue output! '
                                "Skipping optional fields. Line: '{}'".format(job))
            job_list.append(this_job)
            continue

        # TODO: store executing_host?

        this_job.job_owner = thisjob_dict['username']

        try:
            this_job.num_machines = int(thisjob_dict['number_nodes'])
        except ValueError:
            self.logger.warning('The number of allocated nodes is not '
                                'an integer ({}) for job id {}!'.format(thisjob_dict['number_nodes'],
                                                                        this_job.job_id))

        try:
            this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
        except ValueError:
            self.logger.warning('The number of allocated cores is not '
                                'an integer ({}) for job id {}!'.format(thisjob_dict['number_cpus'],
                                                                        this_job.job_id))

        # ALLOCATED NODES HERE
        # The string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # It is just stored as a raw string for the moment, and
        # this_job.allocated_machines is left undefined.
        if this_job.job_state == JobState.RUNNING:
            this_job.allocated_machines_raw = thisjob_dict['allocated_machines']

        this_job.queue_name = thisjob_dict['partition']

        try:
            this_job.requested_wallclock_time_seconds = (self._convert_time(thisjob_dict['time_limit']))
        except ValueError:
            self.logger.warning('Error parsing the time limit for job id {}'.format(this_job.job_id))

        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in my test, it is set to zero)
        if this_job.job_state == JobState.RUNNING:
            try:
                this_job.wallclock_time_seconds = (self._convert_time(thisjob_dict['time_used']))
            except ValueError:
                self.logger.warning('Error parsing time_used for job id {}'.format(this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(thisjob_dict['dispatch_time'])
            except ValueError:
                self.logger.warning('Error parsing dispatch_time for job id {}'.format(this_job.job_id))

        try:
            this_job.submission_time = self._parse_time_string(thisjob_dict['submission_time'])
        except ValueError:
            self.logger.warning('Error parsing submission_time for job id {}'.format(this_job.job_id))

        this_job.title = thisjob_dict['job_name']

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info.
        # Not really useful now: allocated_machines is never set in this
        # function.
        if (this_job.allocated_machines is not None and this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines), this_job.num_machines))

        job_list.append(this_job)

    return job_list
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command built by the _get_joblist_command function.

    The output is expected to be a list of lines, one for each job, with
    _FIELD_SEPARATOR as separator between the fields. The order of the
    fields is described in the _get_joblist_command function.

    Note: depending on the scheduler configuration, finished jobs may
        either appear here, or not.
        This function will only return one element for each job found
        in the scheduler output; missing jobs (for whatever reason) simply
        will not appear here.

    :param retval: the return value of the joblist command
    :param stdout: the standard output of the joblist command
    :param stderr: the standard error of the joblist command
    :return: a list of JobInfo objects, one for each line that has
        exactly the expected number of fields
    :raises SchedulerError: if retval is non-zero
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    import datetime

    num_fields = len(self._joblist_fields)

    if retval != 0:
        self.logger.warning('Error in _parse_joblist_output: retval={}; '
                            'stdout={}; stderr={}'.format(
                                retval, stdout, stderr))
        raise SchedulerError('Error during parsing joblist output, '
                             'retval={}\n'
                             'stdout={}\nstderr={}'.format(
                                 retval, stdout, stderr))

    # Raw data parsed from the output: only lines containing the
    # separator, already split in fields.
    # At most `num_fields - 1` splits are performed, so that a line yields
    # at most `num_fields` items: if the separator appears in the title
    # (that is the last field), the title is not split.
    # This assumes that the separator never appears in any previous field.
    jobdata_raw = [
        line.split(_FIELD_SEPARATOR, num_fields - 1)
        for line in stdout.splitlines() if _FIELD_SEPARATOR in line
    ]

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:

        # Each job should have all fields.
        if len(job) != num_fields:
            # Skip this job (nothing is appended to job_list)
            self.logger.error(
                "Wrong line length in squeue output! '{}'".format(job))
            continue

        # The length is exactly num_fields here, so the line can be
        # unpacked directly; the fourth field is not used.
        (job_id, job_state_raw, annotation, _, username, number_nodes,
         number_cpus, allocated_machines, partition, finish_time,
         start_time, percent_complete, submission_time, job_name) = job

        this_job = JobInfo()
        this_job.job_id = job_id
        this_job.annotation = annotation

        try:
            job_state_string = _MAP_STATUS_LSF[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job "
                                'id {}'.format(job_state_raw,
                                               this_job.job_id))
            job_state_string = JobState.UNDETERMINED

        this_job.job_state = job_state_string

        this_job.job_owner = username
        try:
            this_job.num_machines = int(number_nodes)
        except ValueError:
            self.logger.warning('The number of allocated nodes is not '
                                'an integer ({}) for job id {}!'.format(
                                    number_nodes, this_job.job_id))

        try:
            this_job.num_mpiprocs = int(number_cpus)
        except ValueError:
            self.logger.warning('The number of allocated cores is not '
                                'an integer ({}) for job id {}!'.format(
                                    number_cpus, this_job.job_id))

        # ALLOCATED NODES HERE
        # The string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # It is just stored as a raw string for the moment, and
        # this_job.allocated_machines is left undefined.
        if this_job.job_state == JobState.RUNNING:
            this_job.allocated_machines_raw = allocated_machines

        this_job.queue_name = partition

        # NOTE(review): TypeError is caught further down, presumably
        # because _parse_time_string may return None for a missing time;
        # confirm against that helper.
        psd_finish_time = self._parse_time_string(finish_time,
                                                  fmt='%b %d %H:%M')
        psd_start_time = self._parse_time_string(start_time,
                                                 fmt='%b %d %H:%M')
        psd_submission_time = self._parse_time_string(submission_time,
                                                      fmt='%b %d %H:%M')

        # Now get the time in seconds which has been used.
        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in my test, it is set to zero)
        if this_job.job_state == JobState.RUNNING:
            # Reset for every job: a value left over from a previous job
            # (or no value at all) must never be used for the time-used
            # estimate below.
            requested_walltime = None
            try:
                walltime = psd_finish_time - psd_start_time
                # The year is not part of the parsed strings, so a job
                # submitted in December and finishing in January would
                # produce a negative time difference.
                if walltime.total_seconds() < 0:
                    # note: we assume that no job will last more than
                    # 1 year...
                    psd_finish_time = datetime.datetime(
                        year=psd_start_time.year + 1,
                        month=psd_finish_time.month,
                        day=psd_finish_time.day,
                        hour=psd_finish_time.hour,
                        minute=psd_finish_time.minute)
                    walltime = psd_finish_time - psd_start_time

                this_job.requested_wallclock_time_seconds = walltime.total_seconds()  # pylint: disable=invalid-name
                requested_walltime = walltime
            except (TypeError, ValueError):
                self.logger.warning(
                    'Error parsing the time limit for job id {}'.format(
                        this_job.job_id))

            # The time used is estimated from the requested walltime and
            # the completion percentage, so it can only be computed when
            # the requested walltime was obtained above.
            if requested_walltime is None:
                self.logger.warning(
                    'Error parsing the time used for job id {}'.format(
                        this_job.job_id))
            else:
                try:
                    psd_percent_complete = float(
                        percent_complete.strip(' L').strip('%'))
                except ValueError:
                    self.logger.warning(
                        'Error parsing the time used for job id {}'.format(
                            this_job.job_id))
                else:
                    this_job.wallclock_time_seconds = (
                        requested_walltime.total_seconds() *
                        psd_percent_complete / 100.)

        try:
            this_job.submission_time = psd_submission_time
        except ValueError:
            self.logger.warning(
                'Error parsing submission time for job id {}'.format(
                    this_job.job_id))

        this_job.title = job_name

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info.
        # Not really useful now: allocated_machines is never set in this
        # function.
        if (this_job.allocated_machines is not None and
                this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

        job_list.append(this_job)

    return job_list