def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command (qstat -f).

    The output is read as a sequence of stanzas, each one introduced by a
    line starting with ``Job Id:``. Within a stanza, a line starting with
    a space is a new ``key = value`` attribute, a line starting with a TAB
    is appended (without the TAB) to the previous attribute, and any other
    non-empty line is appended to the previous attribute after a newline.

    Note: depending on the scheduler configuration, finished jobs may
        either appear here, or not.
        This function will only return one element for each job found
        in the qstat output; missing jobs (for whatever reason) simply
        will not appear here.

    :param retval: exit status of the qstat command.
    :param stdout: standard output of the qstat command.
    :param stderr: standard error of the qstat command.
    :return: a list of JobInfo objects, one for each job stanza found.
    :raises SchedulerError: if, after filtering the expected messages,
        stderr is not empty and retval is non-zero.
    :raises SchedulerParsingError: if non-empty text precedes the first
        ``Job Id:`` line, if a continuation line shows up before any
        attribute of its job, or if an attribute line has no ``=`` sign.
    """
    # A non-zero retval alone is tolerated: when a list of jobs is passed,
    # qstat exits with a non-zero status if one of them is not in the list
    # anymore. Lines containing "Unknown Job Id" (job already finished) or
    # "Job has finished" (schedulers configured to keep completed jobs in
    # the qstat output for some time) are therefore dropped before deciding
    # whether stderr carries a real problem.
    filtered_stderr = '\n'.join(
        line for line in stderr.split('\n')
        if "Unknown Job Id" not in line and "Job has finished" not in line)
    if filtered_stderr.strip():
        self.logger.warning("Warning in _parse_joblist_output, non-empty "
                            "(filtered) stderr='{}'".format(filtered_stderr))
        if retval != 0:
            raise SchedulerError(
                "Error during qstat parsing (_parse_joblist_output function)")

    # Raw data parsed from the qstat output: one dictionary per stanza.
    # 'warning_lines_idx' collects the indices (in 'lines') of the attributes
    # that were extended by a line starting with neither a space nor a TAB.
    jobdata_raw = []
    for line_num, line in enumerate(stdout.split('\n'), start=1):
        if line.startswith('Job Id:'):
            jobdata_raw.append({
                'id': line.split(':', 1)[1].strip(),
                'lines': [],
                'warning_lines_idx': [],
            })
            continue

        if not line.strip():
            continue

        # A non-empty line is an attribute of the last job found, so it
        # cannot legitimately appear before the first 'Job Id:' line.
        if not jobdata_raw:
            raise SchedulerParsingError(
                "I did not find the header for the first job")

        current_lines = jobdata_raw[-1]['lines']
        if line.startswith(' '):
            current_lines.append(line)
        elif line.startswith('\t'):
            if not current_lines:
                raise SchedulerParsingError(
                    "Line {} is the first line of the job, but it "
                    "starts with a TAB! ({})".format(line_num, line))
            current_lines[-1] += line[1:]
        else:
            # The values of e.g. 'comment' and 'Variable_List' can contain
            # newlines: as a workaround, the line is glued to the previous
            # attribute and the attribute is flagged.
            if not current_lines:
                raise SchedulerParsingError(
                    "Line {} is the first line of the job, but it does "
                    "not start with a space! ({})".format(line_num, line))
            current_lines[-1] += "\n{}".format(line)
            jobdata_raw[-1]['warning_lines_idx'].append(
                len(current_lines) - 1)

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        this_job = JobInfo()
        this_job.job_id = job['id']

        lines_without_equals_sign = [
            i for i in job['lines'] if '=' not in i]
        if lines_without_equals_sign:
            self.logger.error("There are lines without equals sign! {}"
                              "".format(lines_without_equals_sign))
            raise SchedulerParsingError("There are lines without equals "
                                        "sign.")

        # Keys are lower-cased and stripped; values keep everything after
        # the first '=' (minus leading whitespace).
        raw_data = {}
        for attribute in job['lines']:
            key, _, value = attribute.partition('=')
            raw_data[key.strip().lower()] = value.lstrip()

        # Fields whose value contains unexpected newlines are not treated
        # as errors: they are only reported inside raw_data.
        problematic_fields = [
            job['lines'][idx].split('=', 1)[0].strip().lower()
            for idx in sorted(set(job['warning_lines_idx']))]
        if problematic_fields:
            raw_data['warning_fields_with_newlines'] = problematic_fields

        # exit_status and terminating_signal are believed not to be
        # retrievable from the qstat -f output.
        # Every lookup below tolerates a missing field.

        try:
            this_job.title = raw_data['job_name']
        except KeyError:
            self.logger.debug("No 'job_name' field for job id "
                              "{}".format(this_job.job_id))

        try:
            this_job.annotation = raw_data['comment']
        except KeyError:
            # Many jobs do not have a comment; no need to complain.
            pass

        try:
            job_state_string = raw_data['job_state']
        except KeyError:
            self.logger.debug("No 'job_state' field for job id {}".format(
                this_job.job_id))
            this_job.job_state = job_states.UNDETERMINED
        else:
            try:
                this_job.job_state = self._map_status[job_state_string]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_string,
                                                   this_job.job_id))
                this_job.job_state = job_states.UNDETERMINED

        try:
            this_job.job_substate = raw_data['substate']
        except KeyError:
            self.logger.debug("No 'substate' field for job id {}".format(
                this_job.job_id))

        try:
            exec_hosts = raw_data['exec_host'].split('+')
        except KeyError:
            # No exec_host information found (it may be ok, if the job
            # is not running)
            pass
        else:
            # parse each host; syntax, from the man page:
            # hosta/J1+hostb/J2*P+...
            # where  J1 and J2 are an index of the job
            # on the named host and P is the number of
            # processors allocated from that host to this job.
            # P does not appear if it is 1.
            try:
                exec_host_list = []
                for exec_host in exec_hosts:
                    node = MachineInfo()
                    node.name, data = exec_host.split('/')
                    data = data.split('*')
                    if len(data) == 1:
                        node.jobIndex = int(data[0])
                        node.num_cpus = 1
                    elif len(data) == 2:
                        node.jobIndex = int(data[0])
                        node.num_cpus = int(data[1])
                    else:
                        raise ValueError("Wrong number of pieces: {} "
                                         "instead of 1 or 2 in exec_hosts: "
                                         "{}".format(len(data), exec_hosts))
                    exec_host_list.append(node)
                this_job.allocated_machines = exec_host_list
            except Exception as e:
                # Deliberately broad: the host list is best-effort
                # information and must never abort the parsing.
                # Exceptions have no '.message' attribute on Python 3, so
                # the exception itself is formatted.
                self.logger.debug("Problem parsing the node names, I "
                                  "got Exception {} with message {}; "
                                  "exec_hosts was {}".format(
                                      str(type(e)), e, exec_hosts))

        try:
            # Only the part before the '@' is kept.
            this_job.job_owner = raw_data['job_owner'].split('@')[0]
        except KeyError:
            self.logger.debug("No 'job_owner' field for job id {}".format(
                this_job.job_id))

        # Integer resources.
        # TODO: understand if ncpus/mpiprocs are the correct fields also
        #       for multithreaded (OpenMP) jobs.
        for attribute_name, field in (
                ('num_cpus', 'resource_list.ncpus'),
                ('num_mpiprocs', 'resource_list.mpiprocs'),
                ('num_machines', 'resource_list.nodect')):
            try:
                setattr(this_job, attribute_name, int(raw_data[field]))
            except KeyError:
                self.logger.debug("No '{}' field for job id "
                                  "{}".format(field, this_job.job_id))
            except ValueError:
                self.logger.warning("'{}' is not an integer "
                                    "({}) for job id {}!".format(
                                        field, raw_data[field],
                                        this_job.job_id))

        # Double check of redundant info
        if (this_job.allocated_machines is not None and
                this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error("The length of the list of allocated "
                                  "nodes ({}) is different from the "
                                  "expected number of nodes ({})!".format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

        try:
            this_job.queue_name = raw_data['queue']
        except KeyError:
            self.logger.debug("No 'queue' field for job id "
                              "{}".format(this_job.job_id))

        try:
            requested_walltime = self._convert_time(
                raw_data['resource_list.walltime'])
        except KeyError:
            self.logger.debug("No 'resource_list.walltime' field for "
                              "job id {}".format(this_job.job_id))
        except ValueError:
            self.logger.warning("Error parsing 'resource_list.walltime' "
                                "for job id {}".format(this_job.job_id))
        else:
            # Stored under the snake_case name, in line with
            # wallclock_time_seconds below; the CamelCase attribute is the
            # historical name and is kept so that existing readers of it
            # keep working.
            this_job.requested_wallclock_time_seconds = requested_walltime
            this_job.RequestedWallclockTime = requested_walltime

        # Used resources: absent if the job has not started yet.
        for attribute_name, field in (
                ('wallclock_time_seconds', 'resources_used.walltime'),
                ('cpu_time', 'resources_used.cput')):
            try:
                setattr(this_job, attribute_name,
                        self._convert_time(raw_data[field]))
            except KeyError:
                # May not have started yet
                pass
            except ValueError:
                self.logger.warning("Error parsing '{}' "
                                    "for job id {}".format(
                                        field, this_job.job_id))

        # ctime: The time that the job was created
        # mtime: The time that the job was last modified, changed state,
        #        or changed locations.
        # qtime: The time that the job entered the current queue
        # stime: The time when the job started execution.
        # etime: The time that the job became eligible to run, i.e. in a
        #        queued state while residing in an execution queue.

        try:
            this_job.submission_time = self._parse_time_string(
                raw_data['ctime'])
        except KeyError:
            self.logger.debug("No 'ctime' field for job id "
                              "{}".format(this_job.job_id))
        except ValueError:
            self.logger.warning("Error parsing 'ctime' for job id "
                                "{}".format(this_job.job_id))

        try:
            this_job.dispatch_time = self._parse_time_string(
                raw_data['stime'])
        except KeyError:
            # The job may not have been started yet
            pass
        except ValueError:
            self.logger.warning("Error parsing 'stime' for job id "
                                "{}".format(this_job.job_id))

        # TODO: see if we want to set also finish_time for finished jobs,
        # if there are any

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = raw_data

        job_list.append(this_job)

    return job_list
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Only the lines of stdout that contain _field_separator are considered.
    A line is split in at most ``len(self.fields)`` pieces, which are
    associated, in order, to the names found in the second element of each
    entry of ``self.fields``.

    Note: depending on the scheduler configuration, finished jobs may
        either appear here, or not.
        This function will only return one element for each job found
        in the squeue output; missing jobs (for whatever reason) simply
        will not appear here.

    :param retval: exit status of the squeue command.
    :param stdout: standard output of the squeue command.
    :param stderr: standard error of the squeue command.
    :return: a list of JobInfo objects, one for each parsed line. Lines
        lacking the 'job_id', 'annotation' or 'state_raw' fields are
        skipped; lines with fewer fields than expected yield a JobInfo
        carrying only the id, annotation and state.
    :raises SchedulerError: if stderr is not empty, does not contain
        "Invalid job id specified", and retval is non-zero.
    """
    num_fields = len(self.fields)

    # A non-zero retval alone is tolerated: when a list of jobs is passed,
    # the status is non-zero if one of them is not in the list anymore.
    # "Invalid job id specified" shows up when specific calculations are
    # requested and they are all finished, so it is not worth a warning.
    if stderr.strip() and "Invalid job id specified" not in stderr:
        self.logger.warning("Warning in _parse_joblist_output, non-empty "
                            "stderr='{}'".format(stderr.strip()))
        if retval != 0:
            raise SchedulerError(
                "Error during squeue parsing (_parse_joblist_output function)"
            )

    # Raw data parsed from the output: only lines with the separator,
    # already split in fields. At most num_fields - 1 splits are done, i.e.
    # at most num_fields pieces are produced: in this way, if
    # _field_separator appears in the title (that is the last field), the
    # title is not split.
    # This assumes that _field_separator never appears in any previous
    # field.
    jobdata_raw = [
        line.split(_field_separator, num_fields - 1)
        for line in stdout.splitlines()
        if _field_separator in line
    ]

    # Reasons for which a QUEUED job is reported as QUEUED_HELD.
    # QUEUED_HELD states are not specific states in SLURM; they are instead
    # set with state QUEUED, and then the annotation tells if the job is
    # held. Other reasons should not bring the job in QUEUED_HELD (the man
    # page of slurm seems to be incomplete, for instance JobHeld* are not
    # reported there; the list was also checked against the source code of
    # slurm 2.6 on github (https://github.com/SchedMD/slurm), file
    # slurm/src/common/slurm_protocol_defs.c).
    # There are actually a few others, like possible failures, or
    # partition-related reasons, but for the moment they are left in the
    # QUEUED state.
    held_reasons = ('Dependency', 'JobHeldUser', 'JobHeldAdmin', 'BeginTime')

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)}

        this_job = JobInfo()
        try:
            this_job.job_id = thisjob_dict['job_id']
            this_job.annotation = thisjob_dict['annotation']
            job_state_raw = thisjob_dict['state_raw']
        except KeyError:
            # Without this basic info the line is skipped altogether
            # (nothing is appended to job_list).
            self.logger.error("Wrong line length in squeue output! '{}'"
                              "".format(job))
            continue

        try:
            job_state_string = _map_status_slurm[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job "
                                "id {}".format(job_state_raw,
                                               this_job.job_id))
            job_state_string = job_states.UNDETERMINED

        if (job_state_string == job_states.QUEUED and
                this_job.annotation in held_reasons):
            job_state_string = job_states.QUEUED_HELD

        this_job.job_state = job_state_string

        # Up to here, only the most important fields of a job were set.
        # If the line is too short, the job is stored with the information
        # gathered so far, and the optional fields are skipped.
        if len(job) < num_fields:
            self.logger.warning("Wrong line length in squeue output! "
                                "Skipping optional fields. Line: '{}'"
                                "".format(job))
            job_list.append(this_job)
            continue

        # TODO: store executing_host?

        this_job.job_owner = thisjob_dict['username']

        try:
            this_job.num_machines = int(thisjob_dict['number_nodes'])
        except ValueError:
            self.logger.warning("The number of allocated nodes is not "
                                "an integer ({}) for job id {}!".format(
                                    thisjob_dict['number_nodes'],
                                    this_job.job_id))

        try:
            this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
        except ValueError:
            self.logger.warning("The number of allocated cores is not "
                                "an integer ({}) for job id {}!".format(
                                    thisjob_dict['number_cpus'],
                                    this_job.job_id))

        # ALLOCATED NODES HERE
        # string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # It is just stored as a raw string for the moment, and
        # this_job.allocated_machines is left undefined
        if this_job.job_state == job_states.RUNNING:
            this_job.allocated_machines_raw = thisjob_dict[
                'allocated_machines']

        this_job.queue_name = thisjob_dict['partition']

        try:
            this_job.requested_wallclock_time_seconds = (
                self._convert_time(thisjob_dict['time_limit']))
        except ValueError:
            self.logger.warning("Error parsing the time limit "
                                "for job id {}".format(this_job.job_id))

        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in the author's test, it is set to zero)
        if this_job.job_state == job_states.RUNNING:
            try:
                this_job.wallclock_time_seconds = (self._convert_time(
                    thisjob_dict['time_used']))
            except ValueError:
                self.logger.warning("Error parsing time_used "
                                    "for job id {}".format(
                                        this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(
                    thisjob_dict['dispatch_time'])
            except ValueError:
                self.logger.warning("Error parsing dispatch_time for job "
                                    "id {}".format(this_job.job_id))

        try:
            this_job.submission_time = self._parse_time_string(
                thisjob_dict['submission_time'])
        except ValueError:
            self.logger.warning("Error parsing submission_time for job "
                                "id {}".format(this_job.job_id))

        this_job.title = thisjob_dict['job_name']

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info
        # Not really useful now, allocated_machines in this
        # version of the plugin is never set
        if (this_job.allocated_machines is not None and
                this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error("The length of the list of allocated "
                                  "nodes ({}) is different from the "
                                  "expected number of nodes ({})!".format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

        job_list.append(this_job)

    return job_list
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command.

    Every non-blank line of stdout, except a header line starting with
    ``PID``, is split on whitespace; the columns are read, in order, as
    job id, state, owner and used time.

    .. note:: depending on the scheduler configuration, finished jobs
        may either appear here, or not.
        This function will only return one element for each job found
        in the output; missing jobs (for whatever reason) simply
        will not appear here.

    :param retval: exit status of the command.
    :param stdout: standard output of the command.
    :param stderr: standard error of the command.
    :return: a list of JobInfo objects, one for each parsed line.
    :raises SchedulerError: if stderr is not empty and retval is non-zero.
    """
    # No message is filtered out of stderr for this scheduler.
    if stderr.strip():
        self.logger.warning(
            "Warning in _parse_joblist_output, non-empty "
            "(filtered) stderr='{}'".format(stderr))
        if retval != 0:
            raise SchedulerError(
                "Error during direct execution parsing (_parse_joblist_output function)"
            )

    # Create dictionary and parse specific fields
    job_list = []
    for line in stdout.split('\n'):
        # Skip the header if present, as well as empty or blank lines
        # (a blank line has no column at all, not even the job id).
        if not line.strip() or line.lstrip().startswith('PID'):
            continue

        job = line.split()

        this_job = JobInfo()
        this_job.job_id = job[0]

        # 'job' is a list: a missing column raises IndexError.
        try:
            job_state_string = job[1]
        except IndexError:
            self.logger.debug("No 'job_state' field for job id {}".format(
                this_job.job_id))
            this_job.job_state = job_states.UNDETERMINED
        else:
            if job_state_string[0] == 'S':
                this_job.job_state = job_states.SUSPENDED
            else:
                try:
                    this_job.job_state = _map_status_ps[job_state_string]
                except KeyError:
                    self.logger.warning(
                        "Unrecognized job_state '{}' for job "
                        "id {}".format(job_state_string, this_job.job_id))
                    this_job.job_state = job_states.UNDETERMINED

        try:
            this_job.job_owner = job[2]
        except IndexError:
            self.logger.debug("No 'job_owner' field for job id {}".format(
                this_job.job_id))

        try:
            this_job.wallclock_time_seconds = self._convert_time(job[3])
        except IndexError:
            # May not have started yet
            pass
        except ValueError:
            self.logger.warning("Error parsing 'resources_used.walltime' "
                                "for job id {}".format(this_job.job_id))

        job_list.append(this_job)

    return job_list
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Only the lines of stdout that contain _field_separator are considered,
    and each of them must split in exactly ``len(self._joblist_fields)``
    pieces (14 are unpacked below); other lines are logged and skipped.

    Note: depending on the scheduler configuration, finished jobs may
        either appear here, or not.
        This function will only return one element for each job found
        in the output; missing jobs (for whatever reason) simply
        will not appear here.

    :param retval: exit status of the command.
    :param stdout: standard output of the command.
    :param stderr: standard error of the command.
    :return: a list of JobInfo objects, one for each parsed line.
    :raises SchedulerError: if retval is non-zero.
    """
    num_fields = len(self._joblist_fields)
    time_format = '%b %d %H:%M'

    if retval != 0:
        self.logger.warning("Error in _parse_joblist_output: retval={}; "
                            "stdout={}; stderr={}".format(
                                retval, stdout, stderr))
        raise SchedulerError("Error during parsing joblist output, "
                             "retval={}\n"
                             "stdout={}\nstderr={}".format(
                                 retval, stdout, stderr))

    # Raw data parsed from the output: only lines with the separator,
    # already split in fields. At most num_fields - 1 splits are done, i.e.
    # at most num_fields pieces are produced: in this way, if
    # _field_separator appears in the title (that is the last field), the
    # title is not split and the line still passes the length check below.
    # This assumes that _field_separator never appears in any previous
    # field.
    jobdata_raw = [
        line.split(_field_separator, num_fields - 1)
        for line in stdout.splitlines()
        if _field_separator in line
    ]

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:

        # Each job should have all fields; otherwise it is skipped
        # (nothing is appended to job_list).
        if len(job) != num_fields:
            self.logger.error("Wrong line length in squeue output! '{}'"
                              "".format(job))
            continue

        this_job = JobInfo()
        this_job.job_id = job[0]
        this_job.annotation = job[2]
        job_state_raw = job[1]

        try:
            job_state_string = _map_status_lsf[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job "
                                "id {}".format(job_state_raw,
                                               this_job.job_id))
            job_state_string = job_states.UNDETERMINED

        this_job.job_state = job_state_string

        # The first three fields were already obtained above; the fourth
        # (the executing host) is currently not stored.
        (_, _, _, _executing_host, username, number_nodes,
         number_cpus, allocated_machines, partition,
         finish_time, start_time, percent_complete,
         submission_time, job_name) = job

        this_job.job_owner = username

        try:
            this_job.num_machines = int(number_nodes)
        except ValueError:
            self.logger.warning("The number of allocated nodes is not "
                                "an integer ({}) for job id {}!".format(
                                    number_nodes,
                                    this_job.job_id))

        try:
            this_job.num_mpiprocs = int(number_cpus)
        except ValueError:
            self.logger.warning("The number of allocated cores is not "
                                "an integer ({}) for job id {}!".format(
                                    number_cpus,
                                    this_job.job_id))

        # ALLOCATED NODES HERE
        # string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # It is just stored as a raw string for the moment, and
        # this_job.allocated_machines is left undefined
        if this_job.job_state == job_states.RUNNING:
            this_job.allocated_machines_raw = allocated_machines

        this_job.queue_name = partition

        # The requested and the used time are computed only if the job is
        # RUNNING; otherwise they are not meaningful, and may be not set.
        if this_job.job_state == job_states.RUNNING:
            # Reset for every job, so that a failure below can never leave
            # the value computed for a previous job in place.
            requested_walltime = None
            try:
                psd_finish_time = self._parse_time_string(
                    finish_time, fmt=time_format)
                psd_start_time = self._parse_time_string(
                    start_time, fmt=time_format)
                requested_walltime = psd_finish_time - psd_start_time
                # The year is not part of the format, so both times get the
                # same year: a job started in december and finishing in
                # january would produce a negative time difference. In that
                # case the finish time is moved to the following year.
                # note: this assumes that no job lasts more than 1 year...
                if requested_walltime.total_seconds() < 0:
                    psd_finish_time = psd_finish_time.replace(
                        year=psd_start_time.year + 1)
                    requested_walltime = psd_finish_time - psd_start_time
                this_job.requested_wallclock_time_seconds = (
                    requested_walltime.total_seconds())
            except (TypeError, ValueError):
                # TypeError: one of the two times is not something that can
                # be subtracted (presumably None — TODO confirm against
                # _parse_time_string).
                requested_walltime = None
                self.logger.warning("Error parsing the time limit "
                                    "for job id {}".format(
                                        this_job.job_id))

            # The used time is a fraction of the requested one, so it can
            # be computed only if the latter is known.
            if requested_walltime is not None:
                try:
                    psd_percent_complete = float(
                        percent_complete.strip(' L').strip("%"))
                    this_job.wallclock_time_seconds = (
                        requested_walltime.total_seconds() *
                        psd_percent_complete / 100.)
                except ValueError:
                    self.logger.warning("Error parsing the time used "
                                        "for job id {}".format(
                                            this_job.job_id))

        try:
            psd_submission_time = self._parse_time_string(
                submission_time, fmt=time_format)
        except ValueError:
            self.logger.warning("Error parsing submission time for job "
                                "id {}".format(this_job.job_id))
        else:
            this_job.submission_time = psd_submission_time

        this_job.title = job_name

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info
        # Not really useful now, allocated_machines in this
        # version of the plugin is never set
        if (this_job.allocated_machines is not None and
                this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error("The length of the list of allocated "
                                  "nodes ({}) is different from the "
                                  "expected number of nodes ({})!".format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

        job_list.append(this_job)

    return job_list