def get_jobs(self, jobs=None, user=None, as_dict=False):
    """
    Overrides original method from DirectScheduler in order to list
    missing processes as DONE.
    """
    job_stats = super().get_jobs(jobs=jobs, user=user, as_dict=as_dict)

    found_jobs = []
    # Get the list of known jobs
    if as_dict:
        found_jobs = job_stats.keys()
    else:
        found_jobs = [j.job_id for j in job_stats]

    # Now check if there are any the user requested but were not found
    not_found_jobs = list(set(jobs) - set(found_jobs)) if jobs else []

    for job_id in not_found_jobs:
        job = JobInfo()
        job.job_id = job_id
        job.job_state = JobState.DONE
        # Owner and wallclock time are unknown
        if as_dict:
            job_stats[job_id] = job
        else:
            job_stats.append(job)

    return job_stats
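# --- Hedged usage sketch (not part of the plugin above) ---
# Illustrates the missing-job logic of get_jobs(): ids that were requested but
# do not show up in the scheduler output are reported back as DONE.
# The ids and the pre-filled dictionary are invented for illustration.
from aiida.schedulers.datastructures import JobInfo, JobState

requested_ids = ['10', '11', '12']
scheduler_answer = {'10': JobInfo(), '11': JobInfo()}  # hypothetical known jobs

for missing_id in set(requested_ids) - set(scheduler_answer):
    done_job = JobInfo()
    done_job.job_id = missing_id
    done_job.job_state = JobState.DONE
    scheduler_answer[missing_id] = done_job

assert scheduler_answer['12'].job_state == JobState.DONE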
def get_last_job_info(self):
    """Return the last information asked to the scheduler about the status of the job.

    :return: a `JobInfo` object (that closely resembles a dictionary) or None.
    """
    from aiida.schedulers.datastructures import JobInfo

    last_job_info_serialized = self.get_attribute(self.SCHEDULER_LAST_JOB_INFO_KEY, None)

    if last_job_info_serialized is not None:
        job_info = JobInfo()
        job_info.load_from_serialized(last_job_info_serialized)
    else:
        job_info = None

    return job_info
def test_serialization(self):
    """Test the serialization/deserialization of JobInfo classes."""
    from aiida.schedulers.datastructures import JobInfo, JobState
    from datetime import datetime

    dict_serialized_content = {
        'job_id': '12723',
        'title': 'some title',
        'queue_name': 'some_queue',
        'account': 'my_account'
    }

    to_serialize = {'job_state': (JobState.QUEUED, 'job_state'), 'submission_time': (datetime.now(), 'date')}

    job_info = JobInfo()

    for key, val in dict_serialized_content.items():
        setattr(job_info, key, val)

    for key, (val, field_type) in to_serialize.items():
        setattr(job_info, key, val)
        # Also append to the dictionary for easier comparison later
        dict_serialized_content[key] = JobInfo.serialize_field(value=val, field_type=field_type)

    self.assertEqual(job_info.get_dict(), dict_serialized_content)

    # Full loop via JSON, moving data from job_info to job_info2;
    # we check that the content is fully preserved
    job_info2 = JobInfo.load_from_serialized(job_info.serialize())
    self.assertEqual(job_info2.get_dict(), dict_serialized_content)

    # Check that fields are properly re-serialized with the correct type
    self.assertEqual(job_info2.job_state, to_serialize['job_state'][0])
    self.assertEqual(job_info2.submission_time, to_serialize['submission_time'][0])
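# --- Minimal round-trip sketch of the serialization exercised by the test above ---
# Uses only methods that the test itself calls (serialize, load_from_serialized);
# the field values are invented.
from aiida.schedulers.datastructures import JobInfo, JobState

job = JobInfo()
job.job_id = '42'
job.job_state = JobState.RUNNING

as_json = job.serialize()                        # JSON string
restored = JobInfo.load_from_serialized(as_json)
assert restored.job_state == JobState.RUNNING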
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters implemented.
    """
    if stderr.strip():
        self.logger.warning("Stderr when parsing joblist: {}".format(stderr.strip()))

    job_list = [job.split() for job in stdout.split('\n') if job]
    job_infos = []
    for job_id, status in job_list:
        job = JobInfo()
        job.job_id = job_id
        job.job_state = _MAP_STATUS_YASCHEDULER[status]
        job_infos.append(job)
    return job_infos
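# --- Hedged illustration of the two-column output the parser above expects ---
# One line per job: job id and raw scheduler status separated by whitespace.
# The status labels below are assumptions, not necessarily real yascheduler states.
sample_stdout = '101 RUNNING\n102 QUEUED\n'
parsed = [line.split() for line in sample_stdout.split('\n') if line]
assert parsed == [['101', 'RUNNING'], ['102', 'QUEUED']]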
def get_last_job_info(self) -> Optional['JobInfo']:
    """Return the last information asked to the scheduler about the status of the job.

    The last job info is updated on every poll of the scheduler, except for the final poll when the job drops
    from the scheduler's job queue. For completed jobs, the last job info therefore contains the
    "second-to-last" job info that still shows the job as running. Please use
    :meth:`~aiida.orm.nodes.process.calculation.calcjob.CalcJobNode.get_detailed_job_info` instead.

    :return: a `JobInfo` object (that closely resembles a dictionary) or None.
    """
    from aiida.schedulers.datastructures import JobInfo

    last_job_info_dictserialized = self.get_attribute(self.SCHEDULER_LAST_JOB_INFO_KEY, None)

    if last_job_info_dictserialized is not None:
        job_info = JobInfo.load_from_dict(last_job_info_dictserialized)
    else:
        job_info = None

    return job_info
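# --- Hypothetical usage sketch for get_last_job_info() on a stored CalcJobNode ---
# The pk is a placeholder; the returned snapshot may be None if the job was never polled.
from aiida.orm import load_node

node = load_node(1234)  # placeholder pk of a CalcJobNode
job_info = node.get_last_job_info()
if job_info is not None:
    print(job_info.job_id, job_info.job_state)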
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters implemented.

    Note: depending on the scheduler configuration, finished jobs
        may either appear here, or not.
        This function will only return one element for each job found
        in the qstat output; missing jobs (for whatever reason) simply
        will not appear here.
    """
    num_fields = len(self.fields)

    # I don't raise because if I pass a list of jobs,
    # I get a non-zero status
    # if one of the job is not in the list anymore
    # retval should be zero
    # if retval != 0:
    #     self.logger.warning("Error in _parse_joblist_output: retval={}; "
    #                         "stdout={}; stderr={}".format(retval, stdout, stderr))

    # issue a warning if there is any stderr output and
    # there is no line containing "Invalid job id specified", that happens
    # when I ask for specific calculations, and they are all finished
    if stderr.strip() and 'Invalid job id specified' not in stderr:
        self.logger.warning("Warning in _parse_joblist_output, non-empty stderr='{}'".format(stderr.strip()))
        if retval != 0:
            raise SchedulerError('Error during squeue parsing (_parse_joblist_output function)')

    # will contain raw data parsed from output: only lines with the
    # separator, and already split in fields
    # I put num_fields, because in this way
    # if the symbol _field_separator appears in the title (that is
    # the last field), I don't split the title.
    # This assumes that _field_separator never
    # appears in any previous field.
    jobdata_raw = [l.split(_FIELD_SEPARATOR, num_fields) for l in stdout.splitlines() if _FIELD_SEPARATOR in l]

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)}

        this_job = JobInfo()
        try:
            this_job.job_id = thisjob_dict['job_id']
            this_job.annotation = thisjob_dict['annotation']
            job_state_raw = thisjob_dict['state_raw']
        except KeyError:
            # I skip this calculation if I couldn't find this basic info
            # (I don't append anything to job_list before continuing)
            self.logger.error("Wrong line length in squeue output! '{}'".format(job))
            continue

        try:
            job_state_string = _MAP_STATUS_SLURM[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job id {}".format(job_state_raw, this_job.job_id))
            job_state_string = JobState.UNDETERMINED

        # QUEUED_HELD states are not specific states in SLURM;
        # they are instead set with state QUEUED, and then the
        # annotation tells if the job is held.
        # I check for 'Dependency', 'JobHeldUser',
        # 'JobHeldAdmin', 'BeginTime'.
        # Other states should not bring the job in QUEUED_HELD, I believe
        # (the man page of slurm seems to be incomplete, for instance
        # JobHeld* are not reported there; I also checked at the source code
        # of slurm 2.6 on github (https://github.com/SchedMD/slurm),
        # file slurm/src/common/slurm_protocol_defs.c,
        # and these seem all the states to be taken into account for the
        # QUEUED_HELD status).
        # There are actually a few others, like possible
        # failures, or partition-related reasons, but for the moment I
        # leave them in the QUEUED state.
        if (job_state_string == JobState.QUEUED and
                this_job.annotation in ['Dependency', 'JobHeldUser', 'JobHeldAdmin', 'BeginTime']):
            job_state_string = JobState.QUEUED_HELD

        this_job.job_state = job_state_string

        ####
        # Up to here, I just made sure that there were at least three
        # fields, to set the most important fields for a job.
        # I now check if the length is equal to the number of fields
        if len(job) < num_fields:
            # I store this job only with the information
            # gathered up to now, and continue to the next job
            # Also print a warning
            self.logger.warning('Wrong line length in squeue output! '
                                "Skipping optional fields. Line: '{}'".format(jobdata_raw))
            # I append this job before continuing
            job_list.append(this_job)
            continue

        # TODO: store executing_host?

        this_job.job_owner = thisjob_dict['username']

        try:
            this_job.num_machines = int(thisjob_dict['number_nodes'])
        except ValueError:
            self.logger.warning('The number of allocated nodes is not '
                                'an integer ({}) for job id {}!'.format(thisjob_dict['number_nodes'],
                                                                        this_job.job_id))

        try:
            this_job.num_mpiprocs = int(thisjob_dict['number_cpus'])
        except ValueError:
            self.logger.warning('The number of allocated cores is not '
                                'an integer ({}) for job id {}!'.format(thisjob_dict['number_cpus'],
                                                                        this_job.job_id))

        # ALLOCATED NODES HERE
        # string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # I just store it as a raw string for the moment, and I leave
        # this_job.allocated_machines undefined
        if this_job.job_state == JobState.RUNNING:
            this_job.allocated_machines_raw = thisjob_dict['allocated_machines']

        this_job.queue_name = thisjob_dict['partition']

        try:
            this_job.requested_wallclock_time_seconds = (self._convert_time(thisjob_dict['time_limit']))
        except ValueError:
            self.logger.warning('Error parsing the time limit for job id {}'.format(this_job.job_id))

        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in my test, it is set to zero)
        if this_job.job_state == JobState.RUNNING:
            try:
                this_job.wallclock_time_seconds = (self._convert_time(thisjob_dict['time_used']))
            except ValueError:
                self.logger.warning('Error parsing time_used for job id {}'.format(this_job.job_id))

            try:
                this_job.dispatch_time = self._parse_time_string(thisjob_dict['dispatch_time'])
            except ValueError:
                self.logger.warning('Error parsing dispatch_time for job id {}'.format(this_job.job_id))

        try:
            this_job.submission_time = self._parse_time_string(thisjob_dict['submission_time'])
        except ValueError:
            self.logger.warning('Error parsing submission_time for job id {}'.format(this_job.job_id))

        this_job.title = thisjob_dict['job_name']

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info
        # Not really useful now, allocated_machines in this
        # version of the plugin is never set
        if (this_job.allocated_machines is not None and this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines), this_job.num_machines))

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
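# --- Sketch of the split strategy used by the SLURM parser above ---
# Splitting at most num_fields times keeps any separator characters that appear
# in the last field (the job title) intact. The separator value is a placeholder,
# not necessarily the real _FIELD_SEPARATOR.
SEP = '^^^'
num_fields = 3
line = SEP.join(['123', 'R', 'Priority', 'title with ^^^ inside'])
fields = line.split(SEP, num_fields)
assert fields == ['123', 'R', 'Priority', 'title with ^^^ inside']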
def _parse_joblist_output(self, retval, stdout, stderr):
    # pylint: disable=too-many-statements,too-many-branches
    if retval != 0:
        self.logger.error(f'Error in _parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}')
        raise SchedulerError(f'Error during joblist retrieval, retval={retval}')

    if stderr.strip():
        self.logger.warning(
            f'in _parse_joblist_output for {str(self.transport)}: there was some text in stderr: {stderr}'
        )

    if stdout:
        try:
            xmldata = xml.dom.minidom.parseString(stdout)
        except xml.parsers.expat.ExpatError:
            self.logger.error(f'in sge._parse_joblist_output: xml parsing of stdout failed: {stdout}')
            raise SchedulerParsingError('Error during joblist retrieval, xml parsing of stdout failed')
    else:
        self.logger.error(f'Error in sge._parse_joblist_output: retval={retval}; stdout={stdout}; stderr={stderr}')
        raise SchedulerError('Error during joblist retrieval, no stdout produced')

    try:
        first_child = xmldata.firstChild
        second_childs = first_child.childNodes
        tag_names_sec = [elem.tagName for elem in second_childs if elem.nodeType == 1]
        if 'queue_info' not in tag_names_sec:
            self.logger.error(f'Error in sge._parse_joblist_output: no queue_info: {stdout}')
            raise SchedulerError
        if 'job_info' not in tag_names_sec:
            self.logger.error(f'Error in sge._parse_joblist_output: no job_info: {stdout}')
            raise SchedulerError
    except SchedulerError:
        self.logger.error(f'Error in sge._parse_joblist_output: stdout={stdout}')
        raise SchedulerError('Error during xml processing of stdout: '
                             "there is no 'job_info' or no 'queue_info' element, or there are no jobs!")
    # If something weird happens while firstChild, pop, etc:
    except Exception:
        self.logger.error(f'Error in sge._parse_joblist_output: stdout={stdout}')
        raise SchedulerError('Error during xml processing of stdout')

    jobs = list(first_child.getElementsByTagName('job_list'))
    # jobs = [i for i in jobinfo.getElementsByTagName('job_list')]
    # print [i[0].childNodes[0].data for i in job_numbers if i]
    joblist = []
    for job in jobs:
        this_job = JobInfo()
        # In case the user needs more information the xml-data for
        # each job is stored:
        this_job.raw_data = job.toxml()

        try:
            job_element = job.getElementsByTagName('JB_job_number').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.job_id = str(element_child.data).strip()
            if not this_job.job_id:
                raise SchedulerError
        except SchedulerError:
            self.logger.error(f'Error in sge._parse_joblist_output: no job id is given, stdout={stdout}')
            raise SchedulerError('Error in sge._parse_joblist_output: no job id is given')
        except IndexError:
            self.logger.error(
                "No 'job_number' given for job index {} in job list, stdout={}".format(jobs.index(job), stdout)
            )
            raise IndexError('Error in sge._parse_joblist_output: no job id is given')

        try:
            job_element = job.getElementsByTagName('state').pop(0)
            element_child = job_element.childNodes.pop(0)
            job_state_string = str(element_child.data).strip()
            try:
                this_job.job_state = _MAP_STATUS_SGE[job_state_string]
            except KeyError:
                self.logger.warning(
                    "Unrecognized job_state '{}' for job id {}".format(job_state_string, this_job.job_id)
                )
                this_job.job_state = JobState.UNDETERMINED
        except IndexError:
            self.logger.warning("No 'job_state' field for job id {} in stdout={}".format(this_job.job_id, stdout))
            this_job.job_state = JobState.UNDETERMINED

        try:
            job_element = job.getElementsByTagName('JB_owner').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.job_owner = str(element_child.data).strip()
        except IndexError:
            self.logger.warning(f"No 'job_owner' field for job id {this_job.job_id}")

        try:
            job_element = job.getElementsByTagName('JB_name').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.title = str(element_child.data).strip()
        except IndexError:
            self.logger.warning(f"No 'title' field for job id {this_job.job_id}")

        try:
            job_element = job.getElementsByTagName('queue_name').pop(0)
            element_child = job_element.childNodes.pop(0)
            this_job.queue_name = str(element_child.data).strip()
        except IndexError:
            if this_job.job_state == JobState.RUNNING:
                self.logger.warning(f"No 'queue_name' field for job id {this_job.job_id}")

        try:
            job_element = job.getElementsByTagName('JB_submission_time').pop(0)
            element_child = job_element.childNodes.pop(0)
            time_string = str(element_child.data).strip()
            try:
                this_job.submission_time = self._parse_time_string(time_string)
            except ValueError:
                self.logger.warning(
                    f"Error parsing 'JB_submission_time' for job id {this_job.job_id} ('{time_string}')"
                )
        except IndexError:
            try:
                job_element = job.getElementsByTagName('JAT_start_time').pop(0)
                element_child = job_element.childNodes.pop(0)
                time_string = str(element_child.data).strip()
                try:
                    this_job.dispatch_time = self._parse_time_string(time_string)
                except ValueError:
                    self.logger.warning(
                        f"Error parsing 'JAT_start_time' for job id {this_job.job_id} ('{time_string}')"
                    )
            except IndexError:
                self.logger.warning(
                    "No 'JB_submission_time' and no 'JAT_start_time' field for job id {}".format(this_job.job_id)
                )

        # There is also cpu_usage, mem_usage, io_usage information available:
        if this_job.job_state == JobState.RUNNING:
            try:
                job_element = job.getElementsByTagName('slots').pop(0)
                element_child = job_element.childNodes.pop(0)
                this_job.num_mpiprocs = str(element_child.data).strip()
            except IndexError:
                self.logger.warning(f"No 'slots' field for job id {this_job.job_id}")

        joblist.append(this_job)

    # self.logger.debug("joblist final: {}".format(joblist))
    return joblist
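# --- Minimal sketch of the DOM navigation used by the SGE parser above ---
# The XML fragment is invented; element names follow the ones the parser reads.
import xml.dom.minidom

fragment = (
    '<job_info><queue_info><job_list>'
    '<JB_job_number>77</JB_job_number><state>r</state>'
    '</job_list></queue_info><job_info/></job_info>'
)
dom = xml.dom.minidom.parseString(fragment)
job = dom.getElementsByTagName('job_list')[0]
job_id = job.getElementsByTagName('JB_job_number')[0].childNodes[0].data
state = job.getElementsByTagName('state')[0].childNodes[0].data
assert (job_id, state) == ('77', 'r')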
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command,
    that is here implemented as a list of lines, one for each
    job, with _field_separator as separator. The order is described
    in the _get_joblist_command function.

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters implemented.

    Note: depending on the scheduler configuration, finished jobs
        may either appear here, or not.
        This function will only return one element for each job found
        in the qstat output; missing jobs (for whatever reason) simply
        will not appear here.
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    num_fields = len(self._joblist_fields)

    if retval != 0:
        self.logger.warning('Error in _parse_joblist_output: retval={}; '
                            'stdout={}; stderr={}'.format(retval, stdout, stderr))
        raise SchedulerError('Error during parsing joblist output, '
                             'retval={}\nstdout={}\nstderr={}'.format(retval, stdout, stderr))

    # will contain raw data parsed from output: only lines with the
    # separator, and already split in fields
    # I put num_fields, because in this way
    # if the symbol _field_separator appears in the title (that is
    # the last field), I don't split the title.
    # This assumes that _field_separator never
    # appears in any previous field.
    jobdata_raw = [l.split(_FIELD_SEPARATOR, num_fields) for l in stdout.splitlines() if _FIELD_SEPARATOR in l]

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        # Each job should have all fields.
        if len(job) != num_fields:
            # I skip this calculation
            # (I don't append anything to job_list before continuing)
            self.logger.error("Wrong line length in squeue output! '{}'".format(job))
            continue

        this_job = JobInfo()
        this_job.job_id = job[0]
        this_job.annotation = job[2]
        job_state_raw = job[1]

        try:
            job_state_string = _MAP_STATUS_LSF[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job id {}".format(job_state_raw, this_job.job_id))
            job_state_string = JobState.UNDETERMINED

        this_job.job_state = job_state_string

        # I get the remaining fields
        # The first three were already obtained
        # I know that the length is exactly num_fields because
        # I used split(_field_separator, num_fields) before
        # when creating 'job'
        # (_, _, _, executing_host, username, number_nodes,
        #  number_cpus, allocated_machines, partition,
        #  time_limit, time_used, dispatch_time, job_name) = job
        (_, _, _, _, username, number_nodes, number_cpus, allocated_machines, partition, finish_time, start_time,
         percent_complete, submission_time, job_name) = job

        this_job.job_owner = username

        try:
            this_job.num_machines = int(number_nodes)
        except ValueError:
            self.logger.warning('The number of allocated nodes is not '
                                'an integer ({}) for job id {}!'.format(number_nodes, this_job.job_id))

        try:
            this_job.num_mpiprocs = int(number_cpus)
        except ValueError:
            self.logger.warning('The number of allocated cores is not '
                                'an integer ({}) for job id {}!'.format(number_cpus, this_job.job_id))

        # ALLOCATED NODES HERE
        # string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # I just store it as a raw string for the moment, and I leave
        # this_job.allocated_machines undefined
        if this_job.job_state == JobState.RUNNING:
            this_job.allocated_machines_raw = allocated_machines

        this_job.queue_name = partition

        psd_finish_time = self._parse_time_string(finish_time, fmt='%b %d %H:%M')
        psd_start_time = self._parse_time_string(start_time, fmt='%b %d %H:%M')
        psd_submission_time = self._parse_time_string(submission_time, fmt='%b %d %H:%M')

        # Now get the time in seconds which has been used
        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in my test, it is set to zero)
        if this_job.job_state == JobState.RUNNING:
            try:
                requested_walltime = psd_finish_time - psd_start_time
                # fix of a weird bug. Since the year is not parsed, it is assumed
                # to always be 1900. Therefore, a job submitted
                # in December and finishing in January would produce negative time differences
                if requested_walltime.total_seconds() < 0:
                    import datetime
                    old_month = psd_finish_time.month
                    old_day = psd_finish_time.day
                    old_hour = psd_finish_time.hour
                    old_minute = psd_finish_time.minute
                    new_year = psd_start_time.year + 1
                    # note: we assume that no job will last more than 1 year...
                    psd_finish_time = datetime.datetime(
                        year=new_year, month=old_month, day=old_day, hour=old_hour, minute=old_minute
                    )
                    requested_walltime = psd_finish_time - psd_start_time

                this_job.requested_wallclock_time_seconds = requested_walltime.total_seconds()  # pylint: disable=invalid-name
            except (TypeError, ValueError):
                self.logger.warning('Error parsing the time limit for job id {}'.format(this_job.job_id))

            try:
                psd_percent_complete = float(percent_complete.strip(' L').strip('%'))
                this_job.wallclock_time_seconds = requested_walltime.total_seconds() * psd_percent_complete / 100.
            except ValueError:
                self.logger.warning('Error parsing the time used for job id {}'.format(this_job.job_id))

        try:
            this_job.submission_time = psd_submission_time
        except ValueError:
            self.logger.warning('Error parsing submission time for job id {}'.format(this_job.job_id))

        this_job.title = job_name

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info
        # Not really useful now, allocated_machines in this
        # version of the plugin is never set
        if (this_job.allocated_machines is not None and this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error('The length of the list of allocated '
                                  'nodes ({}) is different from the '
                                  'expected number of nodes ({})!'.format(
                                      len(this_job.allocated_machines), this_job.num_machines))

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
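# --- Sketch of the year-wraparound correction used by the LSF parser above ---
# Times parsed with '%b %d %H:%M' default to year 1900, so a December start with
# a January finish yields a negative delta until the finish year is bumped by one.
from datetime import datetime

start = datetime.strptime('Dec 31 23:00', '%b %d %H:%M')
finish = datetime.strptime('Jan 1 01:00', '%b %d %H:%M')
delta = finish - start
if delta.total_seconds() < 0:
    finish = finish.replace(year=start.year + 1)
    delta = finish - start
assert delta.total_seconds() == 2 * 3600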
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command (qstat -f).

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters implemented.

    Note: depending on the scheduler configuration, finished jobs
        may either appear here, or not.
        This function will only return one element for each job found
        in the qstat output; missing jobs (for whatever reason) simply
        will not appear here.
    """
    # I don't raise because if I pass a list of jobs, I get a non-zero status
    # if one of the job is not in the list anymore
    # retval should be zero
    # if retval != 0:
    #     _LOGGER.warning("Error in _parse_joblist_output: retval={}; "
    #                     "stdout={}; stderr={}".format(retval, stdout, stderr))

    # issue a warning if there is any stderr output
    # but I strip lines containing "Unknown Job Id", that happens
    # also when I ask for a calculation that has finished
    #
    # I also strip for "Job has finished" because this happens for
    # those schedulers configured to leave the job in the output
    # of qstat for some time after job completion.
    filtered_stderr = '\n'.join(
        l for l in stderr.split('\n') if 'Unknown Job Id' not in l and 'Job has finished' not in l)
    if filtered_stderr.strip():
        _LOGGER.warning('Warning in _parse_joblist_output, non-empty '
                        "(filtered) stderr='{}'".format(filtered_stderr))
        if retval != 0:
            raise SchedulerError('Error during qstat parsing, retval={}\n'
                                 'stdout={}\nstderr={}'.format(retval, stdout, stderr))

    jobdata_raw = []  # will contain raw data parsed from qstat output
    # Get raw data and split in lines
    for line_num, line in enumerate(stdout.split('\n'), start=1):
        # Each new job stanza starts with the string 'Job Id:': I
        # create a new item in the jobdata_raw list
        if line.startswith('Job Id:'):
            jobdata_raw.append({'id': line.split(':', 1)[1].strip(), 'lines': [], 'warning_lines_idx': []})
            # warning_lines_idx: lines that do not start either with
            # tab or space
        else:
            if line.strip():
                # This is a non-empty line, therefore it is an attribute
                # of the last job found
                if not jobdata_raw:
                    # The list is still empty! (This means that I found a
                    # non-empty line, before finding the first 'Job Id:'
                    # string: it is an error. However this may happen
                    # only before the first job.
                    raise SchedulerParsingError('I did not find the header for the first job')
                    # _LOGGER.warning("I found some text before the "
                    #                 "first job: {}".format(l))
                else:
                    if line.startswith(' '):
                        # If it starts with a space, it is a new field
                        jobdata_raw[-1]['lines'].append(line)
                    elif line.startswith('\t'):
                        # If a line starts with a TAB,
                        # I append to the previous string
                        # stripping the TAB
                        if not jobdata_raw[-1]['lines']:
                            raise SchedulerParsingError(
                                'Line {} is the first line of the job, but it '
                                'starts with a TAB! ({})'.format(line_num, line))
                        jobdata_raw[-1]['lines'][-1] += line[1:]
                    else:
                        # raise SchedulerParsingError(
                        #     "Wrong starting character at line {}! ({})"
                        #     "".format(line_num, l))
                        ## For some reasons, the output of 'comment' and
                        ## 'Variable_List', for instance, can have
                        ## newlines if they are included... I do a
                        ## workaround
                        jobdata_raw[-1]['lines'][-1] += '\n{}'.format(line)
                        jobdata_raw[-1]['warning_lines_idx'].append(len(jobdata_raw[-1]['lines']) - 1)

    # Create dictionary and parse specific fields
    job_list = []
    for job in jobdata_raw:
        this_job = JobInfo()
        this_job.job_id = job['id']

        lines_without_equals_sign = [i for i in job['lines'] if '=' not in i]

        # There are lines without equals sign: this is bad
        if lines_without_equals_sign:
            # Should I only warn?
            _LOGGER.error('There are lines without equals sign! {}'.format(lines_without_equals_sign))
            raise SchedulerParsingError('There are lines without equals sign.')

        raw_data = {
            i.split('=', 1)[0].strip().lower(): i.split('=', 1)[1].lstrip() for i in job['lines'] if '=' in i
        }

        ## I ignore the errors for the time being - this seems to be
        ## a problem if there are \n in the content of some variables?
        ## I consider this a workaround...
        # for line_with_warning in set(job['warning_lines_idx']):
        #     if job['lines'][line_with_warning].split(
        #             '=', 1)[0].strip().lower() != "comment":
        #         raise SchedulerParsingError(
        #             "Wrong starting character in one of the lines "
        #             "of job {}, and it's not a comment! ({})"
        #             "".format(this_job.job_id,
        #                       job['lines'][line_with_warning]))
        problematic_fields = []
        for line_with_warning in set(job['warning_lines_idx']):
            problematic_fields.append(job['lines'][line_with_warning].split('=', 1)[0].strip().lower())
        if problematic_fields:
            # These are the fields that contain unexpected newlines
            raw_data['warning_fields_with_newlines'] = problematic_fields

        # I believe that exit_status and terminating_signal cannot be
        # retrieved from the qstat -f output.

        # I wrap calls in try-except clauses to avoid errors if a field
        # is missing
        try:
            this_job.title = raw_data['job_name']
        except KeyError:
            _LOGGER.debug("No 'job_name' field for job id {}".format(this_job.job_id))

        try:
            this_job.annotation = raw_data['comment']
        except KeyError:
            # Many jobs do not have a comment; I do not complain about it.
            pass
            # _LOGGER.debug("No 'comment' field for job id {}".format(this_job.job_id))

        try:
            job_state_string = raw_data['job_state']
            try:
                this_job.job_state = self._map_status[job_state_string]
            except KeyError:
                _LOGGER.warning(
                    "Unrecognized job_state '{}' for job id {}".format(job_state_string, this_job.job_id)
                )
                this_job.job_state = JobState.UNDETERMINED
        except KeyError:
            _LOGGER.debug("No 'job_state' field for job id {}".format(this_job.job_id))
            this_job.job_state = JobState.UNDETERMINED

        try:
            this_job.job_substate = raw_data['substate']
        except KeyError:
            _LOGGER.debug("No 'substate' field for job id {}".format(this_job.job_id))

        try:
            exec_hosts = raw_data['exec_host'].split('+')
        except KeyError:
            # No exec_host information found (it may be ok, if the job
            # is not running)
            pass
        else:
            # parse each host; syntax, from the man page:
            # hosta/J1+hostb/J2*P+...
            # where J1 and J2 are an index of the job
            # on the named host and P is the number of
            # processors allocated from that host to this job.
            # P does not appear if it is 1.
            try:
                exec_host_list = []
                for exec_host in exec_hosts:
                    node = MachineInfo()
                    node.name, data = exec_host.split('/')
                    data = data.split('*')
                    if len(data) == 1:
                        node.jobIndex = int(data[0])
                        node.num_cpus = 1
                    elif len(data) == 2:
                        node.jobIndex = int(data[0])
                        node.num_cpus = int(data[1])
                    else:
                        raise ValueError('Wrong number of pieces: {} '
                                         'instead of 1 or 2 in exec_hosts: '
                                         '{}'.format(len(data), exec_hosts))
                    exec_host_list.append(node)
                this_job.allocated_machines = exec_host_list
            except Exception as exc:
                _LOGGER.debug('Problem parsing the node names, I '
                              'got Exception {} with message {}; '
                              'exec_hosts was {}'.format(str(type(exc)), exc, exec_hosts))

        try:
            # I strip the part after the @: is this always ok?
            this_job.job_owner = raw_data['job_owner'].split('@')[0]
        except KeyError:
            _LOGGER.debug("No 'job_owner' field for job id {}".format(this_job.job_id))

        try:
            this_job.num_cpus = int(raw_data['resource_list.ncpus'])
            # TODO: understand if this is the correct field also for
            # multithreaded (OpenMP) jobs.
        except KeyError:
            _LOGGER.debug("No 'resource_list.ncpus' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("'resource_list.ncpus' is not an integer "
                            '({}) for job id {}!'.format(raw_data['resource_list.ncpus'], this_job.job_id))

        try:
            this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs'])
            # TODO: understand if this is the correct field also for
            # multithreaded (OpenMP) jobs.
        except KeyError:
            _LOGGER.debug("No 'resource_list.mpiprocs' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("'resource_list.mpiprocs' is not an integer "
                            '({}) for job id {}!'.format(raw_data['resource_list.mpiprocs'], this_job.job_id))

        try:
            this_job.num_machines = int(raw_data['resource_list.nodect'])
        except KeyError:
            _LOGGER.debug("No 'resource_list.nodect' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("'resource_list.nodect' is not an integer "
                            '({}) for job id {}!'.format(raw_data['resource_list.nodect'], this_job.job_id))

        # Double check of redundant info
        if (this_job.allocated_machines is not None and this_job.num_machines is not None):
            if len(set(machine.name for machine in this_job.allocated_machines)) != this_job.num_machines:
                _LOGGER.error('The length of the list of allocated '
                              'nodes ({}) is different from the '
                              'expected number of nodes ({})!'.format(
                                  len(this_job.allocated_machines), this_job.num_machines))

        try:
            this_job.queue_name = raw_data['queue']
        except KeyError:
            _LOGGER.debug("No 'queue' field for job id {}".format(this_job.job_id))

        try:
            this_job.requested_wallclock_time_seconds = (self._convert_time(raw_data['resource_list.walltime']))
        except KeyError:
            _LOGGER.debug("No 'resource_list.walltime' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("Error parsing 'resource_list.walltime' for job id {}".format(this_job.job_id))

        try:
            this_job.wallclock_time_seconds = (self._convert_time(raw_data['resources_used.walltime']))
        except KeyError:
            # May not have started yet
            pass
        except ValueError:
            _LOGGER.warning("Error parsing 'resources_used.walltime' for job id {}".format(this_job.job_id))

        try:
            this_job.cpu_time = (self._convert_time(raw_data['resources_used.cput']))
        except KeyError:
            # May not have started yet
            pass
        except ValueError:
            _LOGGER.warning("Error parsing 'resources_used.cput' for job id {}".format(this_job.job_id))

        # ctime: The time that the job was created
        # mtime: The time that the job was last modified, changed state,
        #        or changed locations.
        # qtime: The time that the job entered the current queue
        # stime: The time when the job started execution.
        # etime: The time that the job became eligible to run, i.e. in a
        #        queued state while residing in an execution queue.

        try:
            this_job.submission_time = self._parse_time_string(raw_data['ctime'])
        except KeyError:
            _LOGGER.debug("No 'ctime' field for job id {}".format(this_job.job_id))
        except ValueError:
            _LOGGER.warning("Error parsing 'ctime' for job id {}".format(this_job.job_id))

        try:
            this_job.dispatch_time = self._parse_time_string(raw_data['stime'])
        except KeyError:
            # The job may not have been started yet
            pass
        except ValueError:
            _LOGGER.warning("Error parsing 'stime' for job id {}".format(this_job.job_id))

        # TODO: see if we want to set also finish_time for finished jobs,
        # if there are any

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = raw_data

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
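# --- Sketch of the exec_host syntax handled by the PBS parser above ---
# Each chunk is 'host/J' or 'host/J*P', where J is the job index on that host and
# P the number of processors (omitted when it is 1). The host names are invented.
exec_host = 'node01/0*4+node02/1'
parsed = []
for chunk in exec_host.split('+'):
    name, data = chunk.split('/')
    pieces = data.split('*')
    num_cpus = int(pieces[1]) if len(pieces) == 2 else 1
    parsed.append((name, int(pieces[0]), num_cpus))
assert parsed == [('node01', 0, 4), ('node02', 1, 1)]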
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command (qstat -f).

    Return a list of JobInfo objects, one for each job,
    with the relevant parameters implemented.

    .. note:: depending on the scheduler configuration, finished jobs
        may either appear here, or not.
        This function will only return one element for each job found
        in the qstat output; missing jobs (for whatever reason) simply
        will not appear here.
    """
    import re

    filtered_stderr = '\n'.join(l for l in stderr.split('\n'))
    if filtered_stderr.strip():
        self.logger.warning(
            'Warning in _parse_joblist_output, non-empty '
            "(filtered) stderr='{}'".format(filtered_stderr)
        )
        if retval != 0:
            raise SchedulerError('Error during direct execution parsing (_parse_joblist_output function)')

    # Create dictionary and parse specific fields
    job_list = []
    for line in stdout.split('\n'):
        if re.search(r'^\s*PID', line) or line == '':
            # Skip the header if present
            continue
        line = re.sub(r'^\s+', '', line)
        job = re.split(r'\s+', line)
        this_job = JobInfo()
        this_job.job_id = job[0]

        if len(job) < 3:
            raise SchedulerError(
                'Unexpected output from the scheduler, '
                "not enough fields in line '{}'".format(line)
            )

        try:
            job_state_string = job[1][0]  # I just check the first character
        except IndexError:
            self.logger.debug("No 'job_state' field for job id {}".format(this_job.job_id))
            this_job.job_state = JobState.UNDETERMINED
        else:
            try:
                this_job.job_state = _MAP_STATUS_PS[job_state_string]
            except KeyError:
                self.logger.warning(
                    "Unrecognized job_state '{}' for job id {}".format(job_state_string, this_job.job_id)
                )
                this_job.job_state = JobState.UNDETERMINED

        try:
            this_job.job_owner = job[2]
        except IndexError:
            self.logger.debug("No 'job_owner' field for job id {}".format(this_job.job_id))

        try:
            this_job.wallclock_time_seconds = self._convert_time(job[3])
        except IndexError:
            # May not have started yet
            pass
        except ValueError:
            self.logger.warning('Error parsing the time used for job id {}'.format(this_job.job_id))

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list
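# --- Hedged sketch of the line handling in the direct (ps-based) parser above ---
# The column layout (pid, state, user, time) is an assumption based on the fields
# the parser reads; real ps output depends on the flags used.
import re

line = ' 4321 S    alice    01:23:45'
fields = re.split(r'\s+', re.sub(r'^\s+', '', line))
pid, state_char, owner = fields[0], fields[1][0], fields[2]
assert (pid, state_char, owner) == ('4321', 'S', 'alice')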
def get_jobs(self, jobs=None, user=None, as_dict=False):
    """
    Return the list of currently active jobs.
    """
    computer_id = self.transport._machine  # Host name is used as the identifier
    lpad = self.lpad
    query = {
        "spec._aiida_job_info.computer_id": computer_id,  # Limit to this machine
        # Ignore completed and archived jobs
        "state": {
            "$not": {
                "$in": ["COMPLETED", "ARCHIVED"]
            }
        }
    }

    # Limit to the specific fw_ids
    if jobs:
        # Convert to integer keys
        jobs = [int(job_id) for job_id in jobs]
        query['fw_id'] = {'$in': jobs}

    fw_ids = lpad.get_fw_ids(query)
    joblist = []
    for fid in fw_ids:
        # Get the information of the fireworks in the dict format;
        # this is more robust than getting Firework objects
        try:
            fw_dict = lpad.get_fw_dict_by_id(fid)
        except ValueError:
            raise SchedulerError(f"No FW found for id: {fid}")

        spec = fw_dict.get("spec", {})
        this_job = JobInfo()
        this_job.job_id = str(fid)
        try:
            this_job.job_state = _MAP_STATUS_FW[fw_dict['state']]
        except KeyError:
            this_job.job_state = JobState.UNDETERMINED

        this_job.title = fw_dict.get('name')

        # Category or categories are mapped to the queue_name attribute
        category = spec.get('category')
        if isinstance(category, str):
            this_job.queue_name = category
        elif isinstance(category, (tuple, list)):
            this_job.queue_name = ":".join(category)

        # The created_on field is mapped to the submission time
        try:
            this_job.submission_time = datetime.strptime(fw_dict['created_on'], "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            pass
        # NOTE: add information about the dispatch time by looking into the launches

        joblist.append(this_job)

    if as_dict:
        jobdict = {job.job_id: job for job in joblist}
        if None in jobdict:
            raise SchedulerError('Found at least one job without jobid')
        return jobdict

    return joblist
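# --- Sketch of the category-to-queue_name mapping used above ---
# A single category string is kept as-is; a list of categories is joined with ':'.
# The category values are invented for illustration.
category = ['gpu', 'short']
queue_name = category if isinstance(category, str) else ':'.join(category)
assert queue_name == 'gpu:short'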