def get_jobs(self, jobs=None, user=None, as_dict=False):
    """
    Overrides original method from DirectScheduler in order to list
    missing processes as DONE.
    """
    job_stats = super().get_jobs(jobs=jobs, user=user, as_dict=as_dict)

    found_jobs = []
    # Get the list of known jobs
    if as_dict:
        found_jobs = job_stats.keys()
    else:
        found_jobs = [j.job_id for j in job_stats]

    # Now check if there are any the user requested but were not found
    not_found_jobs = list(set(jobs) - set(found_jobs)) if jobs else []

    for job_id in not_found_jobs:
        job = JobInfo()
        job.job_id = job_id
        job.job_state = JobState.DONE
        # Owner and wallclock time is unknown
        if as_dict:
            job_stats[job_id] = job
        else:
            job_stats.append(job)

    return job_stats
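# Standalone sketch (not part of the plugin) of the "missing means DONE" bookkeeping
# used above: any requested job id that the parent scheduler no longer reports gets a
# synthetic DONE entry. The job ids below are illustrative.
from aiida.schedulers.datastructures import JobInfo, JobState

requested = ['10', '11', '12']
found_jobs = ['10']                      # ids the parent get_jobs() returned
not_found_jobs = list(set(requested) - set(found_jobs))

done_entries = []
for job_id in not_found_jobs:
    job = JobInfo()
    job.job_id = job_id
    job.job_state = JobState.DONE        # owner and wallclock time stay unknown
    done_entries.append(job)

assert sorted(j.job_id for j in done_entries) == ['11', '12']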
def test_serialization(self):
    """Test the serialization/deserialization of JobInfo classes."""
    from aiida.schedulers.datastructures import JobInfo, JobState
    from datetime import datetime

    dict_serialized_content = {
        'job_id': '12723',
        'title': 'some title',
        'queue_name': 'some_queue',
        'account': 'my_account'
    }

    to_serialize = {'job_state': (JobState.QUEUED, 'job_state'), 'submission_time': (datetime.now(), 'date')}

    job_info = JobInfo()
    for key, val in dict_serialized_content.items():
        setattr(job_info, key, val)

    for key, (val, field_type) in to_serialize.items():
        setattr(job_info, key, val)
        # Also append to the dictionary for easier comparison later
        dict_serialized_content[key] = JobInfo.serialize_field(value=val, field_type=field_type)

    self.assertEqual(job_info.get_dict(), dict_serialized_content)
    # Full loop via JSON, moving data from job_info to job_info2;
    # we check that the content is fully preserved
    job_info2 = JobInfo.load_from_serialized(job_info.serialize())
    self.assertEqual(job_info2.get_dict(), dict_serialized_content)

    # Check that fields are properly re-serialized with the correct type
    self.assertEqual(job_info2.job_state, to_serialize['job_state'][0])
    # Check that fields are properly re-serialized with the correct type
    self.assertEqual(job_info2.submission_time, to_serialize['submission_time'][0])
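# A minimal, self-contained sketch (not part of the test suite) of the same round trip
# exercised above: build a JobInfo, serialize it to a JSON string, and load it back via
# the classmethod. It assumes only the aiida.schedulers.datastructures API already used
# in this section.
from datetime import datetime

from aiida.schedulers.datastructures import JobInfo, JobState

job_info = JobInfo()
job_info.job_id = '12723'
job_info.job_state = JobState.QUEUED
job_info.submission_time = datetime.now()

serialized = job_info.serialize()                    # JSON string with typed fields
restored = JobInfo.load_from_serialized(serialized)  # classmethod returns a new JobInfo

assert restored.job_id == job_info.job_id
assert restored.job_state == job_info.job_state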
def get_last_job_info(self):
    """Return the last information asked to the scheduler about the status of the job.

    :return: a `JobInfo` object (that closely resembles a dictionary) or None.
    """
    from aiida.schedulers.datastructures import JobInfo

    last_job_info_serialized = self.get_attribute(self.SCHEUDLER_LAST_JOB_INFO_KEY, None)

    if last_job_info_serialized is not None:
        # `load_from_serialized` is a classmethod that returns a new JobInfo,
        # so assign its return value instead of mutating an empty instance.
        job_info = JobInfo.load_from_serialized(last_job_info_serialized)
    else:
        job_info = None

    return job_info
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters filled in.
    """
    if stderr.strip():
        self.logger.warning("Stderr when parsing joblist: {}".format(stderr.strip()))

    job_list = [job.split() for job in stdout.split('\n') if job]
    job_infos = []
    for job_id, status in job_list:
        job = JobInfo()
        job.job_id = job_id
        job.job_state = _MAP_STATUS_YASCHEDULER[status]
        job_infos.append(job)
    return job_infos
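# Illustrative only: a tiny driver for the parser above, using a made-up two-column
# stdout ("<job_id> <status>") and a hypothetical status map. The real
# _MAP_STATUS_YASCHEDULER lives in the yascheduler plugin and may differ.
from aiida.schedulers.datastructures import JobState

_MAP_STATUS_YASCHEDULER = {'QUEUED': JobState.QUEUED, 'RUNNING': JobState.RUNNING}  # assumed mapping

sample_stdout = "101 QUEUED\n102 RUNNING\n"
for job_id, status in (line.split() for line in sample_stdout.split('\n') if line):
    print(job_id, _MAP_STATUS_YASCHEDULER[status])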
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters filled in.

    Note: depending on the scheduler configuration, finished jobs
    may either appear here, or not.
    This function will only return one element for each job found
    in the squeue output; missing jobs (for whatever reason) simply
    will not appear here.
    """
    num_fields = len(self.fields)

    # I don't raise because if I pass a list of jobs,
    # I get a non-zero status
    # if one of the job is not in the list anymore
    # retval should be zero
    # if retval != 0:
    #     self.logger.warning("Error in _parse_joblist_output: retval={}; "
    #                         "stdout={}; stderr={}".format(retval, stdout, stderr))

    # issue a warning if there is any stderr output and
    # there is no line containing "Invalid job id specified", that happens
    # when I ask for specific calculations, and they are all finished
    if stderr.strip() and 'Invalid job id specified' not in stderr:
        self.logger.warning("Warning in _parse_joblist_output, non-empty stderr='{}'".format(stderr.strip()))
        if retval != 0:
            raise SchedulerError('Error during squeue parsing (_parse_joblist_output function)')

    # will contain raw data parsed from output: only lines with the
    # separator, and already split in fields
    # I put num_fields, because in this way
    # if the symbol _field_separator appears in the title (that is
    # the last field), I don't split the title.
    # This assumes that _field_separator never
    # appears in any previous field.
    jobdata_raw = [l.split(_FIELD_SEPARATOR, num_fields) for l in stdout.splitlines() if _FIELD_SEPARATOR in l]

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)}

        this_job = JobInfo()
        try:
            this_job.job_id = thisjob_dict['job_id']
            this_job.annotation = thisjob_dict['annotation']
            job_state_raw = thisjob_dict['state_raw']
        except KeyError:
            # I skip this calculation if I couldn't find this basic info
            # (I don't append anything to job_list before continuing)
            self.logger.error("Wrong line length in squeue output! '{}'".format(job))
            continue

        try:
            job_state_string = _MAP_STATUS_SLURM[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job "
                                'id {}'.format(job_state_raw, this_job.job_id))
            job_state_string = JobState.UNDETERMINED

        # QUEUED_HELD states are not specific states in SLURM;
        # they are instead set with state QUEUED, and then the
        # annotation tells if the job is held.
        # I check for 'Dependency', 'JobHeldUser',
        # 'JobHeldAdmin', 'BeginTime'.
        # Other states should not bring the job in QUEUED_HELD, I believe
        # (the man page of slurm seems to be incomplete, for instance
        # JobHeld* are not reported there; I also checked at the source code
        # of slurm 2.6 on github (https://github.com/SchedMD/slurm),
        # file slurm/src/common/slurm_protocol_defs.c,
        # and these seem all the states to be taken into account for the
        # QUEUED_HELD status).
        # There are actually a few others, like possible
        # failures, or partition-related reasons, but for the moment I
        # leave them in the QUEUED state.
        if (job_state_string == JobState.QUEUED and
                this_job.annotation in ['Dependency', 'JobHeldUser', 'JobHeldAdmin', 'BeginTime']):
            job_state_string = JobState.QUEUED_HELD

        this_job.job_state = job_state_string

        ####
        # Up to here, I just made sure that there were at least three
        # fields, to set the most important fields for a job.
        # I now check if the length is equal to the number of fields
        if len(job) < num_fields:
            # I store this job only with the information
            # gathered up to now, and continue to the next job
            # Also print a warning
            self.logger.warning('Wrong line length in squeue output! '
                                "Skipping optional fields. Line: '{}'".format(jobdata_raw))
            # I append this job before continuing
            job_list.append(this_job)
            continue

        # TODO: store executing_host?

        this_job.job_owner = thisjob_dict['username']

        try:
            this_job.num_machines = int(thisjob_dict['number_nodes'])
        except ValueError:
            self.logger.warning('The number of allocated nodes is not '
                                'an integer ({}) for job id {}!'.format(thisjob_dict['number_nodes'], this_job.job_id))

        try:
            this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
        except ValueError:
            self.logger.warning('The number of allocated cores is not '
                                'an integer ({}) for job id {}!'.format(thisjob_dict['number_cpus'], this_job.job_id))

        # ALLOCATED NODES HERE
        # string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # I just store it as a raw string for the moment, and I leave
        # this_job.allocated_machines undefined
        if this_job.job_state == JobState.RUNNING:
            this_job.allocated_machines_raw = thisjob_dict['allocated_machines']

        this_job.queue_name = thisjob_dict['partition']

        try:
            this_job.requested_wallclock_time_seconds = (self._convert_time(thisjob_dict['time_limit']))
        except ValueError:
            self.logger.warning('Error parsing the time limit for job id {}'.format(this_job.job_id))

        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in my test, it is set to zero)
        if this_job.job_state == JobState.RUNNING:
            try:
                this_job.wallclock_time_seconds = (self._convert_time(thisjob_dict['time_used']))
            except ValueError:
                self.logger.warning('Error parsing time_used for job id {}'.format(this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(thisjob_dict['dispatch_time'])
            except ValueError:
                self.logger.warning('Error parsing dispatch_time for job id {}'.format(this_job.job_id))

        try:
            this_job.submission_time = self._parse_time_string(thisjob_dict['submission_time'])
        except ValueError:
            self.logger.warning('Error parsing submission_time for job id {}'.format(this_job.job_id))

        this_job.title = thisjob_dict['job_name']

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info
        # Not really useful now, allocated_machines in this
        # version of the plugin is never set
        if (this_job.allocated_machines is not None and this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines), this_job.num_machines))

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
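# Sketch of the QUEUED_HELD convention described in the comments above: SLURM reports
# held jobs as pending (mapped to QUEUED), and only the "reason" annotation reveals the
# hold. The helper name and annotation values below mirror the parser; the example
# inputs are illustrative.
from aiida.schedulers.datastructures import JobState

HOLD_ANNOTATIONS = ['Dependency', 'JobHeldUser', 'JobHeldAdmin', 'BeginTime']

def effective_state(job_state, annotation):
    """Map a QUEUED job with a hold-type annotation to QUEUED_HELD."""
    if job_state == JobState.QUEUED and annotation in HOLD_ANNOTATIONS:
        return JobState.QUEUED_HELD
    return job_state

assert effective_state(JobState.QUEUED, 'JobHeldUser') == JobState.QUEUED_HELD
assert effective_state(JobState.QUEUED, 'Resources') == JobState.QUEUED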
def _parse_joblist_output(self, retval, stdout, stderr):
    # pylint: disable=too-many-statements,too-many-branches
    if retval != 0:
        self.logger.error(f'Error in _parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}')
        raise SchedulerError(f'Error during joblist retrieval, retval={retval}')

    if stderr.strip():
        self.logger.warning(
            f'in _parse_joblist_output for {str(self.transport)}: there was some text in stderr: {stderr}'
        )

    if stdout:
        try:
            xmldata = xml.dom.minidom.parseString(stdout)
        except xml.parsers.expat.ExpatError:
            self.logger.error(f'in sge._parse_joblist_output: xml parsing of stdout failed: {stdout}')
            raise SchedulerParsingError('Error during joblist retrieval, xml parsing of stdout failed')
    else:
        self.logger.error(f'Error in sge._parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}')
        raise SchedulerError('Error during joblist retrieval, no stdout produced')

    try:
        first_child = xmldata.firstChild
        second_childs = first_child.childNodes
        tag_names_sec = [elem.tagName for elem in second_childs if elem.nodeType == 1]
        if 'queue_info' not in tag_names_sec:
            self.logger.error(f'Error in sge._parse_joblist_output: no queue_info: {stdout}')
            raise SchedulerError
        if 'job_info' not in tag_names_sec:
            self.logger.error(f'Error in sge._parse_joblist_output: no job_info: {stdout}')
            raise SchedulerError
    except SchedulerError:
        self.logger.error(f'Error in sge._parse_joblist_output: stdout={stdout}')
        raise SchedulerError('Error during xml processing of stdout: '
                             "there is no 'job_info' or no 'queue_info' "
                             'element, or there are no jobs!')
    # If something weird happens while firstChild, pop, etc:
    except Exception:
        self.logger.error(f'Error in sge._parse_joblist_output: stdout={stdout}')
        raise SchedulerError('Error during xml processing of stdout')

    jobs = list(first_child.getElementsByTagName('job_list'))
    # jobs = [i for i in jobinfo.getElementsByTagName('job_list')]
    # print [i[0].childNodes[0].data for i in job_numbers if i]
    joblist = []
    for job in jobs:
        this_job = JobInfo()

        # In case the user needs more information the xml-data for
        # each job is stored:
        this_job.raw_data = job.toxml()

        try:
            job_element = job.getElementsByTagName('JB_job_number').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.job_id = str(element_child.data).strip()
            if not this_job.job_id:
                raise SchedulerError
        except SchedulerError:
            self.logger.error(f'Error in sge._parse_joblist_output: no job id is given, stdout={stdout}')
            raise SchedulerError('Error in sge._parse_joblist_output: no job id is given')
        except IndexError:
            self.logger.error("No 'job_number' given for job index {} in "
                              'job list, stdout={}'.format(jobs.index(job), stdout))
            raise IndexError('Error in sge._parse_joblist_output: no job id is given')

        try:
            job_element = job.getElementsByTagName('state').pop(0)
            element_child = job_element.childNodes.pop(0)
            job_state_string = str(element_child.data).strip()
            try:
                this_job.job_state = _MAP_STATUS_SGE[job_state_string]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    'id {}'.format(job_state_string, this_job.job_id))
                this_job.job_state = JobState.UNDETERMINED
        except IndexError:
            self.logger.warning("No 'job_state' field for job id {} in "
                                'stdout={}'.format(this_job.job_id, stdout))
            this_job.job_state = JobState.UNDETERMINED

        try:
            job_element = job.getElementsByTagName('JB_owner').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.job_owner = str(element_child.data).strip()
        except IndexError:
            self.logger.warning(f"No 'job_owner' field for job id {this_job.job_id}")

        try:
            job_element = job.getElementsByTagName('JB_name').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.title = str(element_child.data).strip()
        except IndexError:
            self.logger.warning(f"No 'title' field for job id {this_job.job_id}")

        try:
            job_element = job.getElementsByTagName('queue_name').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.queue_name = str(element_child.data).strip()
        except IndexError:
            if this_job.job_state == JobState.RUNNING:
                self.logger.warning(f"No 'queue_name' field for job id {this_job.job_id}")

        try:
            job_element = job.getElementsByTagName('JB_submission_time').pop(0)
            element_child = job_element.childNodes.pop(0)
            time_string = str(element_child.data).strip()
            try:
                this_job.submission_time = self._parse_time_string(time_string)
            except ValueError:
                self.logger.warning(
                    f"Error parsing 'JB_submission_time' for job id {this_job.job_id} ('{time_string}')"
                )
        except IndexError:
            try:
                job_element = job.getElementsByTagName('JAT_start_time').pop(0)
                element_child = job_element.childNodes.pop(0)
                time_string = str(element_child.data).strip()
                try:
                    this_job.dispatch_time = self._parse_time_string(time_string)
                except ValueError:
                    self.logger.warning(
                        f"Error parsing 'JAT_start_time' for job id {this_job.job_id} ('{time_string}')"
                    )
            except IndexError:
                self.logger.warning("No 'JB_submission_time' and no "
                                    "'JAT_start_time' field for job "
                                    'id {}'.format(this_job.job_id))

        # There is also cpu_usage, mem_usage, io_usage information available:
        if this_job.job_state == JobState.RUNNING:
            try:
                job_element = job.getElementsByTagName('slots').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.num_mpiprocs = str(element_child.data).strip()
            except IndexError:
                self.logger.warning(f"No 'slots' field for job id {this_job.job_id}")

        joblist.append(this_job)
        # self.logger.debug("joblist final: {}".format(joblist))
    return joblist
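# Minimal sketch of the XML navigation used above, run on a hand-written fragment shaped
# like `qstat -xml` output. The fragment and its values are illustrative; only the tag
# names (job_list, JB_job_number, state, queue_info, job_info) come from the parser above.
import xml.dom.minidom

sample = """<job_info>
  <queue_info>
    <job_list state="running">
      <JB_job_number>1212299</JB_job_number>
      <state>r</state>
    </job_list>
  </queue_info>
  <job_info></job_info>
</job_info>"""

xmldata = xml.dom.minidom.parseString(sample)
for job in xmldata.firstChild.getElementsByTagName('job_list'):
    job_id = job.getElementsByTagName('JB_job_number')[0].childNodes[0].data.strip()
    state = job.getElementsByTagName('state')[0].childNodes[0].data.strip()
    print(job_id, state)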
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters filled in.

    Note: depending on the scheduler configuration, finished jobs
    may either appear here, or not.
    This function will only return one element for each job found
    in the joblist output; missing jobs (for whatever reason) simply
    will not appear here.
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    num_fields = len(self._joblist_fields)

    if retval != 0:
        self.logger.warning('Error in _parse_joblist_output: retval={}; '
                            'stdout={}; stderr={}'.format(retval, stdout, stderr))
        raise SchedulerError('Error during parsing joblist output, '
                             'retval={}\nstdout={}\nstderr={}'.format(retval, stdout, stderr))

    # will contain raw data parsed from output: only lines with the
    # separator, and already split in fields
    # I put num_fields, because in this way
    # if the symbol _field_separator appears in the title (that is
    # the last field), I don't split the title.
    # This assumes that _field_separator never
    # appears in any previous field.
    jobdata_raw = [l.split(_FIELD_SEPARATOR, num_fields) for l in stdout.splitlines() if _FIELD_SEPARATOR in l]

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        # Each job should have all fields.
        if len(job) != num_fields:
            # I skip this calculation
            # (I don't append anything to job_list before continuing)
            self.logger.error("Wrong line length in squeue output! '{}'".format(job))
            continue

        this_job = JobInfo()
        this_job.job_id = job[0]
        this_job.annotation = job[2]
        job_state_raw = job[1]

        try:
            job_state_string = _MAP_STATUS_LSF[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job "
                                'id {}'.format(job_state_raw, this_job.job_id))
            job_state_string = JobState.UNDETERMINED

        this_job.job_state = job_state_string

        # I get the remaining fields
        # The first three were already obtained
        # I know that the length is exactly num_fields because
        # I used split(_field_separator, num_fields) before
        # when creating 'job'
        # (_, _, _, executing_host, username, number_nodes,
        #  number_cpus, allocated_machines, partition,
        #  time_limit, time_used, dispatch_time, job_name) = job
        (_, _, _, _, username, number_nodes, number_cpus, allocated_machines, partition,
         finish_time, start_time, percent_complete, submission_time, job_name) = job

        this_job.job_owner = username

        try:
            this_job.num_machines = int(number_nodes)
        except ValueError:
            self.logger.warning('The number of allocated nodes is not '
                                'an integer ({}) for job id {}!'.format(number_nodes, this_job.job_id))

        try:
            this_job.num_mpiprocs = int(number_cpus)
        except ValueError:
            self.logger.warning('The number of allocated cores is not '
                                'an integer ({}) for job id {}!'.format(number_cpus, this_job.job_id))

        # ALLOCATED NODES HERE
        # string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # I just store it as a raw string for the moment, and I leave
        # this_job.allocated_machines undefined
        if this_job.job_state == JobState.RUNNING:
            this_job.allocated_machines_raw = allocated_machines

        this_job.queue_name = partition

        psd_finish_time = self._parse_time_string(finish_time, fmt='%b %d %H:%M')
        psd_start_time = self._parse_time_string(start_time, fmt='%b %d %H:%M')
        psd_submission_time = self._parse_time_string(submission_time, fmt='%b %d %H:%M')

        # Now get the time in seconds which has been used
        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in my test, it is set to zero)
        if this_job.job_state == JobState.RUNNING:
            try:
                requested_walltime = psd_finish_time - psd_start_time

                # fix of a weird bug. Since the year is not parsed, it is assumed
                # to always be 1900. Therefore, a job submitted
                # in December and finishing in January would produce negative time differences
                if requested_walltime.total_seconds() < 0:
                    import datetime
                    old_month = psd_finish_time.month
                    old_day = psd_finish_time.day
                    old_hour = psd_finish_time.hour
                    old_minute = psd_finish_time.minute
                    new_year = psd_start_time.year + 1

                    # note: we assume that no job will last more than 1 year...
                    psd_finish_time = datetime.datetime(
                        year=new_year, month=old_month, day=old_day, hour=old_hour, minute=old_minute)
                    requested_walltime = psd_finish_time - psd_start_time

                this_job.requested_wallclock_time_seconds = requested_walltime.total_seconds()  # pylint: disable=invalid-name
            except (TypeError, ValueError):
                self.logger.warning('Error parsing the time limit for job id {}'.format(this_job.job_id))

            try:
                psd_percent_complete = float(percent_complete.strip(' L').strip('%'))
                this_job.wallclock_time_seconds = requested_walltime.total_seconds() * psd_percent_complete / 100.
            except ValueError:
                self.logger.warning('Error parsing the time used for job id {}'.format(this_job.job_id))

        try:
            this_job.submission_time = psd_submission_time
        except ValueError:
            self.logger.warning('Error parsing submission time for job id {}'.format(this_job.job_id))

        this_job.title = job_name

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info
        # Not really useful now, allocated_machines in this
        # version of the plugin is never set
        if (this_job.allocated_machines is not None and this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines), this_job.num_machines))

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
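# Standalone illustration of the year-rollover fix described above: the parsed times
# carry no year, so strptime defaults to 1900 and a job that starts in December and
# finishes in January yields a negative timedelta. Bumping the finish time by one year
# (assuming no job runs longer than a year) restores a positive wall time. The dates
# below are illustrative.
from datetime import datetime

fmt = '%b %d %H:%M'
psd_start_time = datetime.strptime('Dec 31 23:00', fmt)   # parsed with the default year 1900
psd_finish_time = datetime.strptime('Jan 01 01:00', fmt)  # also year 1900

requested_walltime = psd_finish_time - psd_start_time
if requested_walltime.total_seconds() < 0:
    psd_finish_time = psd_finish_time.replace(year=psd_start_time.year + 1)
    requested_walltime = psd_finish_time - psd_start_time

assert requested_walltime.total_seconds() == 2 * 3600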
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command (qstat -f).

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters filled in.

    Note: depending on the scheduler configuration, finished jobs
    may either appear here, or not.
    This function will only return one element for each job found
    in the qstat output; missing jobs (for whatever reason) simply
    will not appear here.
    """
    # I don't raise because if I pass a list of jobs, I get a non-zero status
    # if one of the job is not in the list anymore
    # retval should be zero
    # if retval != 0:
    #     _LOGGER.warning("Error in _parse_joblist_output: retval={}; "
    #                     "stdout={}; stderr={}".format(retval, stdout, stderr))

    # issue a warning if there is any stderr output
    # but I strip lines containing "Unknown Job Id", that happens
    # also when I ask for a calculation that has finished
    #
    # I also strip for "Job has finished" because this happens for
    # those schedulers configured to leave the job in the output
    # of qstat for some time after job completion.
    filtered_stderr = '\n'.join(l for l in stderr.split('\n')
                                if 'Unknown Job Id' not in l and 'Job has finished' not in l)
    if filtered_stderr.strip():
        _LOGGER.warning('Warning in _parse_joblist_output, non-empty '
                        "(filtered) stderr='{}'".format(filtered_stderr))
        if retval != 0:
            raise SchedulerError('Error during qstat parsing, retval={}\n'
                                 'stdout={}\nstderr={}'.format(retval, stdout, stderr))

    jobdata_raw = []  # will contain raw data parsed from qstat output
    # Get raw data and split in lines
    for line_num, line in enumerate(stdout.split('\n'), start=1):
        # Each new job stanza starts with the string 'Job Id:': I
        # create a new item in the jobdata_raw list
        if line.startswith('Job Id:'):
            jobdata_raw.append({'id': line.split(':', 1)[1].strip(), 'lines': [], 'warning_lines_idx': []})
            # warning_lines_idx: lines that do not start either with
            # tab or space
        else:
            if line.strip():
                # This is a non-empty line, therefore it is an attribute
                # of the last job found
                if not jobdata_raw:
                    # The list is still empty! (This means that I found a
                    # non-empty line, before finding the first 'Job Id:'
                    # string: it is an error. However this may happen
                    # only before the first job.
                    raise SchedulerParsingError('I did not find the header for the first job')
                    # _LOGGER.warning("I found some text before the "
                    #                 "first job: {}".format(line))
                else:
                    if line.startswith(' '):
                        # If it starts with a space, it is a new field
                        jobdata_raw[-1]['lines'].append(line)
                    elif line.startswith('\t'):
                        # If a line starts with a TAB,
                        # I append to the previous string
                        # stripping the TAB
                        if not jobdata_raw[-1]['lines']:
                            raise SchedulerParsingError(
                                'Line {} is the first line of the job, but it '
                                'starts with a TAB! ({})'.format(line_num, line))
                        jobdata_raw[-1]['lines'][-1] += line[1:]
                    else:
                        # raise SchedulerParsingError(
                        #     "Wrong starting character at line {}! ({})"
                        #     "".format(line_num, line))
                        ## For some reasons, the output of 'comment' and
                        ## 'Variable_List', for instance, can have
                        ## newlines if they are included... I do a
                        ## workaround
                        jobdata_raw[-1]['lines'][-1] += '\n{}'.format(line)
                        jobdata_raw[-1]['warning_lines_idx'].append(len(jobdata_raw[-1]['lines']) - 1)

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        this_job = JobInfo()
        this_job.job_id = job['id']

        lines_without_equals_sign = [i for i in job['lines'] if '=' not in i]

        # There are lines without equals sign: this is bad
        if lines_without_equals_sign:
            # Should I only warn?
            _LOGGER.error('There are lines without equals sign! {}'.format(lines_without_equals_sign))
            raise SchedulerParsingError('There are lines without equals sign.')

        raw_data = {i.split('=', 1)[0].strip().lower(): i.split('=', 1)[1].lstrip()
                    for i in job['lines'] if '=' in i}

        ## I ignore the errors for the time being - this seems to be
        ## a problem if there are \n in the content of some variables?
        ## I consider this a workaround...
        # for line_with_warning in set(job['warning_lines_idx']):
        #     if job['lines'][line_with_warning].split(
        #             '=', 1)[0].strip().lower() != "comment":
        #         raise SchedulerParsingError(
        #             "Wrong starting character in one of the lines "
        #             "of job {}, and it's not a comment! ({})"
        #             "".format(this_job.job_id,
        #                       job['lines'][line_with_warning]))
        problematic_fields = []
        for line_with_warning in set(job['warning_lines_idx']):
            problematic_fields.append(job['lines'][line_with_warning].split('=', 1)[0].strip().lower())
        if problematic_fields:
            # These are the fields that contain unexpected newlines
            raw_data['warning_fields_with_newlines'] = problematic_fields

        # I believe that exit_status and terminating_signal cannot be
        # retrieved from the qstat -f output.

        # I wrap calls in try-except clauses to avoid errors if a field
        # is missing
        try:
            this_job.title = raw_data['job_name']
        except KeyError:
            _LOGGER.debug("No 'job_name' field for job id {}".format(this_job.job_id))

        try:
            this_job.annotation = raw_data['comment']
        except KeyError:
            # Many jobs do not have a comment; I do not complain about it.
            pass
            # _LOGGER.debug("No 'comment' field for job id {}".format(
            #     this_job.job_id))

        try:
            job_state_string = raw_data['job_state']
            try:
                this_job.job_state = self._map_status[job_state_string]
            except KeyError:
                _LOGGER.warning("Unrecognized job_state '{}' for job "
                                'id {}'.format(job_state_string, this_job.job_id))
                this_job.job_state = JobState.UNDETERMINED
        except KeyError:
            _LOGGER.debug("No 'job_state' field for job id {}".format(this_job.job_id))
            this_job.job_state = JobState.UNDETERMINED

        try:
            this_job.job_substate = raw_data['substate']
        except KeyError:
            _LOGGER.debug("No 'substate' field for job id {}".format(this_job.job_id))

        try:
            exec_hosts = raw_data['exec_host'].split('+')
        except KeyError:
            # No exec_host information found (it may be ok, if the job
            # is not running)
            pass
        else:
            # parse each host; syntax, from the man page:
            # hosta/J1+hostb/J2*P+...
            # where J1 and J2 are an index of the job
            # on the named host and P is the number of
            # processors allocated from that host to this job.
            # P does not appear if it is 1.
            try:
                exec_host_list = []
                for exec_host in exec_hosts:
                    node = MachineInfo()
                    node.name, data = exec_host.split('/')
                    data = data.split('*')
                    if len(data) == 1:
                        node.jobIndex = int(data[0])
                        node.num_cpus = 1
                    elif len(data) == 2:
                        node.jobIndex = int(data[0])
                        node.num_cpus = int(data[1])
                    else:
                        raise ValueError('Wrong number of pieces: {} '
                                         'instead of 1 or 2 in exec_hosts: '
                                         '{}'.format(len(data), exec_hosts))
                    exec_host_list.append(node)
                this_job.allocated_machines = exec_host_list
            except Exception as exc:
                _LOGGER.debug('Problem parsing the node names, I '
                              'got Exception {} with message {}; '
                              'exec_hosts was {}'.format(str(type(exc)), exc, exec_hosts))

        try:
            # I strip the part after the @: is this always ok?
            this_job.job_owner = raw_data['job_owner'].split('@')[0]
        except KeyError:
            _LOGGER.debug("No 'job_owner' field for job id {}".format(this_job.job_id))

        try:
            this_job.num_cpus = int(raw_data['resource_list.ncpus'])
            # TODO: understand if this is the correct field also for
            #       multithreaded (OpenMP) jobs.
        except KeyError:
            _LOGGER.debug("No 'resource_list.ncpus' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("'resource_list.ncpus' is not an integer "
                            '({}) for job id {}!'.format(raw_data['resource_list.ncpus'], this_job.job_id))

        try:
            this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs'])
            # TODO: understand if this is the correct field also for
            #       multithreaded (OpenMP) jobs.
        except KeyError:
            _LOGGER.debug("No 'resource_list.mpiprocs' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("'resource_list.mpiprocs' is not an integer "
                            '({}) for job id {}!'.format(raw_data['resource_list.mpiprocs'], this_job.job_id))

        try:
            this_job.num_machines = int(raw_data['resource_list.nodect'])
        except KeyError:
            _LOGGER.debug("No 'resource_list.nodect' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("'resource_list.nodect' is not an integer "
                            '({}) for job id {}!'.format(raw_data['resource_list.nodect'], this_job.job_id))

        # Double check of redundant info
        if (this_job.allocated_machines is not None and this_job.num_machines is not None):
            if len(set(machine.name for machine in this_job.allocated_machines)) != this_job.num_machines:
                _LOGGER.error('The length of the list of allocated '
                              'nodes ({}) is different from the '
                              'expected number of nodes ({})!'.format(
                                  len(this_job.allocated_machines), this_job.num_machines))

        try:
            this_job.queue_name = raw_data['queue']
        except KeyError:
            _LOGGER.debug("No 'queue' field for job id {}".format(this_job.job_id))

        try:
            # Note: kept the snake_case JobInfo field used elsewhere in this section.
            this_job.requested_wallclock_time_seconds = (self._convert_time(raw_data['resource_list.walltime']))
        except KeyError:
            _LOGGER.debug("No 'resource_list.walltime' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("Error parsing 'resource_list.walltime' for job id {}".format(this_job.job_id))

        try:
            this_job.wallclock_time_seconds = (self._convert_time(raw_data['resources_used.walltime']))
        except KeyError:
            # May not have started yet
            pass
        except ValueError:
            _LOGGER.warning("Error parsing 'resources_used.walltime' for job id {}".format(this_job.job_id))

        try:
            this_job.cpu_time = (self._convert_time(raw_data['resources_used.cput']))
        except KeyError:
            # May not have started yet
            pass
        except ValueError:
            _LOGGER.warning("Error parsing 'resources_used.cput' for job id {}".format(this_job.job_id))

        #
        # ctime: The time that the job was created
        # mtime: The time that the job was last modified, changed state,
        #        or changed locations.
        # qtime: The time that the job entered the current queue
        # stime: The time when the job started execution.
        # etime: The time that the job became eligible to run, i.e. in a
        #        queued state while residing in an execution queue.

        try:
            this_job.submission_time = self._parse_time_string(raw_data['ctime'])
        except KeyError:
            _LOGGER.debug("No 'ctime' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("Error parsing 'ctime' for job id {}".format(this_job.job_id))

        try:
            this_job.dispatch_time = self._parse_time_string(raw_data['stime'])
        except KeyError:
            # The job may not have been started yet
            pass
        except ValueError:
            _LOGGER.warning("Error parsing 'stime' for job id {}".format(this_job.job_id))

        # TODO: see if we want to set also finish_time for finished jobs,
        #       if there are any

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = raw_data

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
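# Standalone sketch of the exec_host parsing done above, using the syntax quoted from
# the qstat man page in the comments: hosta/J1+hostb/J2*P+..., where P (the number of
# processors) is omitted when it is 1. The helper name and the input string are
# illustrative.
def parse_exec_host(exec_host_string):
    """Return a list of (host name, job index, num_cpus) tuples."""
    machines = []
    for exec_host in exec_host_string.split('+'):
        name, data = exec_host.split('/')
        pieces = data.split('*')
        if len(pieces) == 1:
            machines.append((name, int(pieces[0]), 1))
        elif len(pieces) == 2:
            machines.append((name, int(pieces[0]), int(pieces[1])))
        else:
            raise ValueError('Wrong number of pieces in exec_host: {}'.format(exec_host))
    return machines

assert parse_exec_host('node01/0*16+node02/1') == [('node01', 0, 16), ('node02', 1, 1)]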
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command (`ps`).

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters filled in.

    .. note:: depending on the scheduler configuration, finished jobs
        may either appear here, or not.
        This function will only return one element for each job found
        in the output of `ps`; missing jobs (for whatever reason) simply
        will not appear here.
    """
    import re

    filtered_stderr = '\n'.join(l for l in stderr.split('\n'))
    if filtered_stderr.strip():
        self.logger.warning('Warning in _parse_joblist_output, non-empty '
                            "(filtered) stderr='{}'".format(filtered_stderr))
        if retval != 0:
            raise SchedulerError('Error during direct execution parsing (_parse_joblist_output function)')

    # Create dictionary and parse specific fields
    job_list = []
    for line in stdout.split('\n'):
        if re.search(r'^\s*PID', line) or line == '':
            # Skip the header if present
            continue
        line = re.sub(r'^\s+', '', line)
        job = re.split(r'\s+', line)
        this_job = JobInfo()
        this_job.job_id = job[0]

        if len(job) < 3:
            raise SchedulerError('Unexpected output from the scheduler, '
                                 "not enough fields in line '{}'".format(line))

        try:
            job_state_string = job[1][0]  # I just check the first character
        except IndexError:
            self.logger.debug("No 'job_state' field for job id {}".format(this_job.job_id))
            this_job.job_state = JobState.UNDETERMINED
        else:
            try:
                this_job.job_state = _MAP_STATUS_PS[job_state_string]
            except KeyError:
                self.logger.warning("Unrecognized job_state '{}' for job "
                                    'id {}'.format(job_state_string, this_job.job_id))
                this_job.job_state = JobState.UNDETERMINED

        try:
            # I strip the part after the @: is this always ok?
            this_job.job_owner = job[2]
        except KeyError:
            self.logger.debug("No 'job_owner' field for job id {}".format(this_job.job_id))

        try:
            this_job.wallclock_time_seconds = self._convert_time(job[3])
        except KeyError:
            # May not have started yet
            pass
        except ValueError:
            self.logger.warning("Error parsing 'resources_used.walltime' for job id {}".format(this_job.job_id))

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
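# Minimal sketch of the line splitting used by the parser above, run on an illustrative
# `ps`-style output with a PID/STAT/USER/TIME header: the header is skipped and the
# remaining columns are split on whitespace. The sample data is made up.
import re

sample_stdout = """  PID STAT USER     TIME
  321 R    aiida    00:01:30
  458 S    aiida    00:00:02"""

for line in sample_stdout.split('\n'):
    if re.search(r'^\s*PID', line) or line == '':
        continue
    pid, stat, user, time_used = re.split(r'\s+', re.sub(r'^\s+', '', line))
    print(pid, stat, user, time_used)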
def get_jobs(self, jobs=None, user=None, as_dict=False):
    """
    Return the list of currently active jobs
    """
    computer_id = self.transport._machine  # Host name is used as the identifier
    lpad = self.lpad
    query = {
        "spec._aiida_job_info.computer_id": computer_id,  # Limit to this machine
        # Ignore completed and archived jobs
        "state": {
            "$not": {
                "$in": ["COMPLETED", "ARCHIVED"]
            }
        }
    }

    # Limit to the specific fw_ids
    if jobs:
        # Convert to integer keys
        jobs = [int(job_id) for job_id in jobs]
        query['fw_id'] = {'$in': jobs}

    fw_ids = lpad.get_fw_ids(query)
    joblist = []
    for fid in fw_ids:
        # Get the information of the fireworks in the dict format
        # this is more robust than getting Fireworks objects
        try:
            fw_dict = lpad.get_fw_dict_by_id(fid)
        except ValueError:
            raise SchedulerError(f"No FW found for id: {fid}")

        spec = fw_dict.get("spec", {})

        this_job = JobInfo()
        this_job.job_id = str(fid)
        try:
            this_job.job_state = _MAP_STATUS_FW[fw_dict['state']]
        except KeyError:
            # A missing or unrecognized state is a KeyError, not an IndexError
            this_job.job_state = JobState.UNDETERMINED

        this_job.title = fw_dict.get('name')

        # Category or categories are mapped to queue_name attribute
        category = spec.get('category')
        if isinstance(category, str):
            this_job.queue_name = category
        elif isinstance(category, (tuple, list)):
            this_job.queue_name = ":".join(category)

        # The created_on is mapped to the submission time
        try:
            this_job.submission_time = datetime.strptime(fw_dict['created_on'], "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            pass
        # NOTE: add information about the dispatch time by looking into the launches

        joblist.append(this_job)

    if as_dict:
        jobdict = {job.job_id: job for job in joblist}
        if None in jobdict:
            raise SchedulerError('Found at least one job without jobid')
        return jobdict

    return joblist
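# Sketch of the LaunchPad query assembled above, shown standalone: restrict to this
# computer's host name, exclude COMPLETED/ARCHIVED fireworks, and optionally narrow to
# specific fw_ids. The host name and job ids are illustrative placeholders.
computer_id = 'my-cluster.example.com'   # assumed host name
jobs = ['101', '102']

query = {
    "spec._aiida_job_info.computer_id": computer_id,
    "state": {"$not": {"$in": ["COMPLETED", "ARCHIVED"]}},
}
if jobs:
    query['fw_id'] = {'$in': [int(job_id) for job_id in jobs]}

print(query)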