def _parse_submit_output(self, retval, stdout, stderr):
    """
    Parse the output of the submit command, as returned by executing the
    command returned by the _get_submit_command method.

    To be implemented by the plugin.

    Return a string with the job ID.
    """
    if retval != 0:
        self.logger.error("Error in _parse_submit_output: retval={}; "
                          "stdout={}; stderr={}".format(
                              retval, stdout, stderr))
        raise SchedulerError("Error during submission, retval={}\n"
                             "stdout={}\nstderr={}".format(
                                 retval, stdout, stderr))

    try:
        transport_string = " for {}".format(self.transport)
    except SchedulerError:
        # self.transport raises if no transport has been set
        transport_string = ""

    if stderr.strip():
        self.logger.warning("in _parse_submit_output{}: "
                            "there was some text in stderr: {}".format(
                                transport_string, stderr))

    try:
        return stdout.strip().split('Job <')[1].split('>')[0]
    except IndexError:
        raise SchedulerParsingError(
            "Cannot parse submission output: {}".format(stdout))
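
# For reference, a minimal sketch of the submission output this parser
# expects (hypothetical values; the exact wording depends on the scheduler
# version):
#
#   stdout = "Job <12345> is submitted to queue <normal>.\n"
#   stdout.strip().split('Job <')[1].split('>')[0]   # -> '12345'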

def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the command
    returned by the _get_joblist_command method (qstat -f).

    Return a list of JobInfo objects, one for each job, with the relevant
    parameters filled in.

    Note: depending on the scheduler configuration, finished jobs may or
    may not appear here. This function will only return one element for
    each job found in the qstat output; jobs that are missing (for
    whatever reason) simply will not appear here.
    """
    # retval should normally be zero; however, I do not raise on a
    # non-zero retval, because if I pass a list of jobs and one of them
    # is not in the queue anymore, qstat returns a non-zero exit status.

    # Issue a warning if there is any stderr output, but strip lines
    # containing "Unknown Job Id", which also appear when asking for a
    # calculation that has finished.
    #
    # I also strip "Job has finished", because this happens for those
    # schedulers configured to leave the job in the output of qstat for
    # some time after job completion.
    filtered_stderr = '\n'.join(
        l for l in stderr.split('\n')
        if "Unknown Job Id" not in l and "Job has finished" not in l)
    if filtered_stderr.strip():
        self.logger.warning("Warning in _parse_joblist_output, non-empty "
                            "(filtered) stderr='{}'".format(filtered_stderr))
        if retval != 0:
            raise SchedulerError(
                "Error during qstat parsing (_parse_joblist_output function)")

    jobdata_raw = []  # will contain the raw data parsed from qstat output

    # Get the raw data and split it in lines
    for line_num, l in enumerate(stdout.split('\n'), start=1):
        # Each new job stanza starts with the string 'Job Id:': create
        # a new item in the jobdata_raw list
        if l.startswith('Job Id:'):
            jobdata_raw.append({'id': l.split(':', 1)[1].strip(),
                                'lines': [],
                                # warning_lines_idx: indices of lines that
                                # start with neither a tab nor a space
                                'warning_lines_idx': []})
        else:
            if l.strip():
                # This is a non-empty line, therefore it is an attribute
                # of the last job found
                if not jobdata_raw:
                    # The list is still empty! This means that I found a
                    # non-empty line before finding the first 'Job Id:'
                    # string: it is an error. (This may happen only
                    # before the first job.)
                    raise SchedulerParsingError(
                        "I did not find the header for the first job")
                else:
                    if l.startswith(' '):
                        # If it starts with a space, it is a new field
                        jobdata_raw[-1]['lines'].append(l)
                    elif l.startswith('\t'):
                        # If a line starts with a TAB, it continues the
                        # previous field: append it to the previous
                        # string, stripping the TAB
                        if not jobdata_raw[-1]['lines']:
                            raise SchedulerParsingError(
                                "Line {} is the first line of the job, "
                                "but it starts with a TAB! ({})".format(
                                    line_num, l))
                        jobdata_raw[-1]['lines'][-1] += l[1:]
                    else:
                        # For some reason, the content of fields such as
                        # 'comment' and 'Variable_List' can contain
                        # newlines. As a workaround, append the line to
                        # the previous one and remember its index, so
                        # that the field can be flagged later.
                        jobdata_raw[-1]['lines'][-1] += "\n{}".format(l)
                        jobdata_raw[-1]['warning_lines_idx'].append(
                            len(jobdata_raw[-1]['lines']) - 1)

    # Create a dictionary for each job and parse the specific fields
    job_list = []
    for job in jobdata_raw:
        this_job = JobInfo()
        this_job.job_id = job['id']

        lines_without_equals_sign = [i for i in job['lines']
                                     if '=' not in i]

        # There are lines without an equals sign: this is bad
        if lines_without_equals_sign:
            # Should I only warn?
self.logger.error("There are lines without equals sign! {}" "".format(lines_without_equals_sign)) raise (SchedulerParsingError("There are lines without equals " "sign.")) raw_data = {i.split('=', 1)[0].strip().lower(): i.split('=', 1)[1].lstrip() for i in job['lines'] if '=' in i} ## I ignore the errors for the time being - this seems to be ## a problem if there are \n in the content of some variables? ## I consider this a workaround... #for line_with_warning in set(job['warning_lines_idx']): # if job['lines'][line_with_warning].split( # '=',1)[0].strip().lower() != "comment": # raise SchedulerParsingError( # "Wrong starting character in one of the lines " # "of job {}, and it's not a comment! ({})" # "".format(this_job.job_id, # job['lines'][line_with_warning])) problematic_fields = [] for line_with_warning in set(job['warning_lines_idx']): problematic_fields.append(job['lines'][line_with_warning].split( '=', 1)[0].strip().lower()) if problematic_fields: # These are the fields that contain unexpected newlines raw_data['warning_fields_with_newlines'] = problematic_fields # I believe that exit_status and terminating_signal cannot be # retrieved from the qstat -f output. # I wrap calls in try-except clauses to avoid errors if a field # is missing try: this_job.title = raw_data['job_name'] except KeyError: self.logger.debug("No 'job_name' field for job id " "{}".format(this_job.job_id)) try: this_job.annotation = raw_data['comment'] except KeyError: # Many jobs do not have a comment; I do not complain about it. pass #self.logger.debug("No 'comment' field for job id {}".format( # this_job.job_id)) try: job_state_string = raw_data['job_state'] try: this_job.job_state = self._map_status[job_state_string] except KeyError: self.logger.warning("Unrecognized job_state '{}' for job " "id {}".format(job_state_string, this_job.job_id)) this_job.job_state = job_states.UNDETERMINED except KeyError: self.logger.debug("No 'job_state' field for job id {}".format( this_job.job_id)) this_job.job_state = job_states.UNDETERMINED try: this_job.job_substate = raw_data['substate'] except KeyError: self.logger.debug("No 'substate' field for job id {}".format( this_job.job_id)) try: exec_hosts = raw_data['exec_host'].split('+') except KeyError: # No exec_host information found (it may be ok, if the job # is not running) pass else: # parse each host; syntax, from the man page: # hosta/J1+hostb/J2*P+... # where J1 and J2 are an index of the job # on the named host and P is the number of # processors allocated from that host to this job. # P does not appear if it is 1. try: exec_host_list = [] for exec_host in exec_hosts: node = MachineInfo() node.name, data = exec_host.split('/') data = data.split('*') if len(data) == 1: node.jobIndex = int(data[0]) node.num_cpus = 1 elif len(data) == 2: node.jobIndex = int(data[0]) node.num_cpus = int(data[1]) else: raise ValueError("Wrong number of pieces: {} " "instead of 1 or 2 in exec_hosts: " "{}".format(len(data), exec_hosts)) exec_host_list.append(node) this_job.allocated_machines = exec_host_list except Exception as e: self.logger.debug("Problem parsing the node names, I " "got Exception {} with message {}; " "exec_hosts was {}".format( str(type(e)), e.message, exec_hosts)) try: # I strip the part after the @: is this always ok? 
            this_job.job_owner = raw_data['job_owner'].split('@')[0]
        except KeyError:
            self.logger.debug("No 'job_owner' field for job id "
                              "{}".format(this_job.job_id))

        try:
            this_job.num_cpus = int(raw_data['resource_list.ncpus'])
            # TODO: understand if this is the correct field also for
            # multithreaded (OpenMP) jobs.
        except KeyError:
            self.logger.debug("No 'resource_list.ncpus' field for job id "
                              "{}".format(this_job.job_id))
        except ValueError:
            self.logger.warning("'resource_list.ncpus' is not an integer "
                                "({}) for job id {}!".format(
                                    raw_data['resource_list.ncpus'],
                                    this_job.job_id))

        try:
            this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs'])
            # TODO: understand if this is the correct field also for
            # multithreaded (OpenMP) jobs.
        except KeyError:
            self.logger.debug("No 'resource_list.mpiprocs' field for job "
                              "id {}".format(this_job.job_id))
        except ValueError:
            self.logger.warning("'resource_list.mpiprocs' is not an integer "
                                "({}) for job id {}!".format(
                                    raw_data['resource_list.mpiprocs'],
                                    this_job.job_id))

        try:
            this_job.num_machines = int(raw_data['resource_list.nodect'])
        except KeyError:
            self.logger.debug("No 'resource_list.nodect' field for job id "
                              "{}".format(this_job.job_id))
        except ValueError:
            self.logger.warning("'resource_list.nodect' is not an integer "
                                "({}) for job id {}!".format(
                                    raw_data['resource_list.nodect'],
                                    this_job.job_id))

        # Double check of redundant info
        if (this_job.allocated_machines is not None and
                this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error("The length of the list of allocated "
                                  "nodes ({}) is different from the "
                                  "expected number of nodes ({})!".format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

        try:
            this_job.queue_name = raw_data['queue']
        except KeyError:
            self.logger.debug("No 'queue' field for job id "
                              "{}".format(this_job.job_id))

        try:
            this_job.requested_wallclock_time_seconds = self._convert_time(
                raw_data['resource_list.walltime'])
        except KeyError:
            self.logger.debug("No 'resource_list.walltime' field for "
                              "job id {}".format(this_job.job_id))
        except ValueError:
            self.logger.warning("Error parsing 'resource_list.walltime' "
                                "for job id {}".format(this_job.job_id))

        try:
            this_job.wallclock_time_seconds = self._convert_time(
                raw_data['resources_used.walltime'])
        except KeyError:
            # The job may not have started yet
            pass
        except ValueError:
            self.logger.warning("Error parsing 'resources_used.walltime' "
                                "for job id {}".format(this_job.job_id))

        try:
            this_job.cpu_time = self._convert_time(
                raw_data['resources_used.cput'])
        except KeyError:
            # The job may not have started yet
            pass
        except ValueError:
            self.logger.warning("Error parsing 'resources_used.cput' "
                                "for job id {}".format(this_job.job_id))

        # ctime: the time the job was created
        # mtime: the time the job was last modified, changed state,
        #        or changed locations
        # qtime: the time the job entered the current queue
        # stime: the time the job started execution
        # etime: the time the job became eligible to run, i.e. entered a
        #        queued state while residing in an execution queue
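        # For illustration only (hypothetical values): on a typical
        # Torque/PBSPro setup these fields look like
        #   ctime = Tue Apr  9 15:22:11 2013
        #   stime = Tue Apr  9 15:26:02 2013
        # and are converted to datetime objects by _parse_time_string.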
        try:
            this_job.submission_time = self._parse_time_string(
                raw_data['ctime'])
        except KeyError:
            self.logger.debug("No 'ctime' field for job id "
                              "{}".format(this_job.job_id))
        except ValueError:
            self.logger.warning("Error parsing 'ctime' for job id "
                                "{}".format(this_job.job_id))

        try:
            this_job.dispatch_time = self._parse_time_string(
                raw_data['stime'])
        except KeyError:
            # The job may not have been started yet
            pass
        except ValueError:
            self.logger.warning("Error parsing 'stime' for job id "
                                "{}".format(this_job.job_id))

        # TODO: see if we want to set also finish_time for finished jobs,
        # if there are any

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = raw_data

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
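
# For reference, a shortened, hypothetical `qstat -f` stanza of the form
# the parser above expects; field lines start with spaces, continuation
# lines with a TAB:
#
#   Job Id: 12345.head.cluster
#       Job_Name = test_job
#       Job_Owner = user@head.cluster
#       job_state = R
#       queue = batch
#       Resource_List.nodect = 2
#       Resource_List.ncpus = 16
#       Resource_List.walltime = 24:00:00
#       exec_host = node01/0*8+node02/0*8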

def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the command
    returned by the _get_joblist_command method (qstat with XML output).

    Return a list of JobInfo objects, one for each job.
    """
    import xml.dom.minidom
    import xml.parsers.expat

    if retval != 0:
        self.logger.error("Error in _parse_joblist_output: retval={}; "
                          "stdout={}; stderr={}".format(
                              retval, stdout, stderr))
        raise SchedulerError("Error during joblist retrieval, "
                             "retval={}".format(retval))

    if stderr.strip():
        self.logger.warning("in _parse_joblist_output for {}: "
                            "there was some text in stderr: {}".format(
                                str(self.transport), stderr))

    if stdout:
        try:
            xmldata = xml.dom.minidom.parseString(stdout)
        except xml.parsers.expat.ExpatError:
            self.logger.error("in sge._parse_joblist_output: "
                              "xml parsing of stdout failed: "
                              "{}".format(stdout))
            raise SchedulerParsingError("Error during joblist retrieval: "
                                        "xml parsing of stdout failed")
    else:
        self.logger.error("Error in sge._parse_joblist_output: retval={}; "
                          "stdout={}; stderr={}".format(
                              retval, stdout, stderr))
        raise SchedulerError("Error during joblist retrieval: "
                             "no stdout produced")

    try:
        first_child = xmldata.firstChild
        second_childs = first_child.childNodes
        tag_names_sec = [elem.tagName for elem in second_childs
                         if elem.nodeType == 1]
        if 'queue_info' not in tag_names_sec:
            self.logger.error("Error in sge._parse_joblist_output: "
                              "no queue_info: {}".format(stdout))
            raise SchedulerError
        if 'job_info' not in tag_names_sec:
            self.logger.error("Error in sge._parse_joblist_output: "
                              "no job_info: {}".format(stdout))
            raise SchedulerError
    except SchedulerError:
        self.logger.error("Error in sge._parse_joblist_output: "
                          "stdout={}".format(stdout))
        raise SchedulerError("Error during xml processing of stdout: "
                             "there is no 'job_info' or no 'queue_info' "
                             "element, or there are no jobs!")
    except Exception:
        # If something else goes wrong while accessing firstChild,
        # childNodes, etc.
        self.logger.error("Error in sge._parse_joblist_output: "
                          "stdout={}".format(stdout))
        raise SchedulerError("Error during xml processing of stdout")

    jobs = list(first_child.getElementsByTagName('job_list'))

    joblist = []
    for job in jobs:
        this_job = JobInfo()
        # In case the user needs more information, the xml data for each
        # job is stored:
        this_job.raw_data = job.toxml()

        try:
            job_element = job.getElementsByTagName('JB_job_number').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.job_id = str(element_child.data).strip()
            if not this_job.job_id:
                raise SchedulerError
        except SchedulerError:
            self.logger.error("Error in sge._parse_joblist_output: "
                              "no job id is given, stdout={}".format(stdout))
            raise SchedulerError("Error in sge._parse_joblist_output: "
                                 "no job id is given")
        except IndexError:
            self.logger.error("No 'job_number' given for job index {} in "
                              "the job list, stdout={}".format(
                                  jobs.index(job), stdout))
            raise IndexError("Error in sge._parse_joblist_output: "
                             "no job id is given")

        try:
            job_element = job.getElementsByTagName('state').pop(0)
            element_child = job_element.childNodes.pop(0)
            job_state_string = str(element_child.data).strip()
            try:
                this_job.job_state = _map_status_sge[job_state_string]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    "id {}".format(job_state_string,
                                                   this_job.job_id))
                this_job.job_state = job_states.UNDETERMINED
        except IndexError:
            self.logger.warning("No 'job_state' field for job id {} in "
                                "stdout={}".format(this_job.job_id, stdout))
            this_job.job_state = job_states.UNDETERMINED

        try:
            job_element = job.getElementsByTagName('JB_owner').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.job_owner = str(element_child.data).strip()
        except IndexError:
            self.logger.warning("No 'job_owner' field for job "
                                "id {}".format(this_job.job_id))

        try:
            job_element = job.getElementsByTagName('JB_name').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.title = str(element_child.data).strip()
        except IndexError:
            self.logger.warning("No 'title' field for job "
                                "id {}".format(this_job.job_id))

        try:
            job_element = job.getElementsByTagName('queue_name').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.queue_name = str(element_child.data).strip()
        except IndexError:
            # Only warn if the job should already be in a queue
            if this_job.job_state == job_states.RUNNING:
                self.logger.warning("No 'queue_name' field for job "
                                    "id {}".format(this_job.job_id))

        try:
            job_element = job.getElementsByTagName(
                'JB_submission_time').pop(0)
            element_child = job_element.childNodes.pop(0)
            time_string = str(element_child.data).strip()
            try:
                this_job.submission_time = self._parse_time_string(
                    time_string)
            except ValueError:
                self.logger.warning("Error parsing 'JB_submission_time' "
                                    "for job id {} ('{}')".format(
                                        this_job.job_id, time_string))
        except IndexError:
            # A job may report 'JAT_start_time' instead of
            # 'JB_submission_time'
            try:
                job_element = job.getElementsByTagName(
                    'JAT_start_time').pop(0)
                element_child = job_element.childNodes.pop(0)
                time_string = str(element_child.data).strip()
                try:
                    this_job.dispatch_time = self._parse_time_string(
                        time_string)
                except ValueError:
                    self.logger.warning("Error parsing 'JAT_start_time' "
                                        "for job id {} ('{}')".format(
                                            this_job.job_id, time_string))
            except IndexError:
                self.logger.warning("No 'JB_submission_time' and no "
                                    "'JAT_start_time' field for job "
                                    "id {}".format(this_job.job_id))

        # There is also cpu_usage, mem_usage and io_usage information
        # available for running jobs:
        if this_job.job_state == job_states.RUNNING:
            try:
                job_element = job.getElementsByTagName('slots').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.num_mpiprocs = int(str(element_child.data).strip())
            except IndexError:
                self.logger.warning("No 'slots' field for job "
                                    "id {}".format(this_job.job_id))
            except ValueError:
                self.logger.warning("'slots' is not an integer for job "
                                    "id {}".format(this_job.job_id))

        joblist.append(this_job)

    return joblist
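
# For reference, a shortened, hypothetical `qstat` XML job element of the
# form the parser above expects:
#
#   <job_list state="running">
#     <JB_job_number>1212299</JB_job_number>
#     <JAT_start_time>2013-06-18T12:08:23</JAT_start_time>
#     <JB_owner>user</JB_owner>
#     <JB_name>myjob</JB_name>
#     <state>r</state>
#     <queue_name>all.q@node01</queue_name>
#     <slots>16</slots>
#   </job_list>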