def __do_acct(self, job, cmd, parse):
    """
    Run `cmd`, feed its standard output to `parse`, and update
    `job` with the resulting accounting information.

    Return the new job state (always ``TERMINATING`` on success);
    raise `AuxiliaryCommandError` if `cmd` exits with a non-zero
    status.
    """
    exit_code, stdout, stderr = self.transport.execute_command(cmd)
    # bail out early if the accounting command itself failed
    if exit_code != 0:
        raise gc3libs.exceptions.AuxiliaryCommandError(
            "Failed running accounting command `%s`:"
            " exit code: %d, stderr: '%s'"
            % (cmd, exit_code, stderr),
            do_log=True)
    acct = parse(stdout)
    job.update(acct)
    if 'exitcode' in acct:
        if 'signal' in acct:
            # job was killed by a signal
            job.returncode = (acct['signal'], acct['exitcode'])
        else:
            # XXX: we're assuming the batch system executes the
            # job through a shell, and collects the shell exit
            # code -- IOW, a job is never exec()'d directly from
            # the batch system daemon.  I'm not sure this is
            # actually true in all cases.
            job.returncode = Run.shellexit_to_returncode(
                int(acct['exitcode']))
    job.state = Run.State.TERMINATING
    return job.state
def _parse_acct_output(self, stdout, stderr):
    """
    Parse the output of SGE's `qacct` command and return a
    dictionary of accounting information.

    Known keys are converted according to
    `self._qacct_keyval_mapping`; unknown keys are kept verbatim
    as strings under a ``sge_``-prefixed name.
    """
    acctinfo = {}
    for line in stdout.split("\n"):
        # skip empty and header lines
        line = line.strip()
        if line == "" or "===" in line:
            continue
        # extract key/value pairs from `qacct` output
        key, value = line.split(" ", 1)
        value = value.strip()
        if key == "failed":
            # value may be, e.g., "100 : assumedly after job"
            value = value.split()[0]
        try:
            dest, conv = self._qacct_keyval_mapping[key]
            acctinfo[dest] = conv(value)
        except KeyError:
            # no conversion by default -- keep it a string
            acctinfo["sge_" + key] = value
        except (ValueError, TypeError) as err:
            log.error(
                "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                value, key, err.__class__.__name__, str(err),
            )
            acctinfo[dest] = None
    # fixed assertion message: this function parses `qacct` output, not
    # `tracejob` output (the old text was a copy/paste error from the
    # Torque/PBS parser)
    assert "exitcode" in acctinfo, (
        "Could not extract exit code from `qacct` output")
    acctinfo["termstatus"] = Run.shellexit_to_returncode(
        acctinfo.pop("exitcode"))
    return acctinfo
def _parse_acct_output(self, stdout, stderr):
    """Parse `tracejob` output and return a dictionary of accounting data."""
    acctinfo = {}
    # patterns matching the "queued", "run", and final `tracejob` log lines
    patterns = (
        self._tracejob_queued_re,
        self._tracejob_run_re,
        self._tracejob_last_re,
    )
    for line in stdout.split('\n'):
        for pattern in patterns:
            match = pattern.match(line)
            if not match:
                continue
            # copy every captured group into `acctinfo`, converting
            # values as prescribed by `_tracejob_keyval_mapping`
            for key, value in match.groupdict().items():
                attr, conv = self._tracejob_keyval_mapping[key]
                acctinfo[attr] = conv(value)
            if pattern is self._tracejob_last_re:
                # final record matched -- stop trying further patterns
                # on this line
                break
    assert 'exitcode' in acctinfo, (
        "Could not extract exit code from `tracejob` output")
    acctinfo['termstatus'] = Run.shellexit_to_returncode(
        acctinfo.pop('exitcode'))
    return acctinfo
def _parse_acct_output(self, stdout, stderr):
    """
    Parse `tracejob` output and return a dictionary of accounting
    data, with values converted as prescribed by
    `self._tracejob_keyval_mapping`.
    """
    acctinfo = {}
    for line in stdout.split('\n'):
        for pattern, carry_on in [
                # regexp                     exit loop?
                # =========================  ==========
                (self._tracejob_queued_re,   True),
                (self._tracejob_run_re,      True),
                (self._tracejob_last_re,     False),
        ]:
            match = pattern.match(line)
            if match:
                # use `.items()` instead of the Python 2-only
                # `.iteritems()`, consistently with the other
                # `_parse_acct_output` implementations in this file
                for key, value in match.groupdict().items():
                    attr, conv = self._tracejob_keyval_mapping[key]
                    acctinfo[attr] = conv(value)
                if carry_on:
                    continue
                else:
                    break
    assert 'exitcode' in acctinfo, (
        "Could not extract exit code from `tracejob` output")
    acctinfo['termstatus'] = Run.shellexit_to_returncode(
        acctinfo.pop('exitcode'))
    return acctinfo
def _parse_acct_output(self, stdout, stderr):
    """
    Parse the output of SGE's `qacct` command and return a
    dictionary of accounting information.

    Known keys are converted according to
    `self._qacct_keyval_mapping`; unknown keys are kept verbatim
    as strings under a ``sge_``-prefixed name.
    """
    acctinfo = {}
    for line in stdout.split('\n'):
        # skip empty and header lines
        line = line.strip()
        if line == '' or '===' in line:
            continue
        # extract key/value pairs from `qacct` output
        key, value = line.split(' ', 1)
        value = value.strip()
        if key == 'failed':
            # value may be, e.g., "100 : assumedly after job"
            value = value.split()[0]
        try:
            dest, conv = self._qacct_keyval_mapping[key]
            acctinfo[dest] = conv(value)
        except KeyError:
            # no conversion by default -- keep it a string
            acctinfo['sge_' + key] = value
        except (ValueError, TypeError) as err:
            log.error(
                "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                value, key, err.__class__.__name__, str(err))
            acctinfo[dest] = None
    # fixed assertion message: this function parses `qacct` output, not
    # `tracejob` output (the old text was a copy/paste error from the
    # Torque/PBS parser)
    assert 'exitcode' in acctinfo, (
        "Could not extract exit code from `qacct` output")
    acctinfo['termstatus'] = Run.shellexit_to_returncode(
        acctinfo.pop('exitcode'))
    return acctinfo
def _parse_stat_output(self, stdout, stderr):
    """
    Parse the output of LSF's ``bjobs`` command and return the
    corresponding GC3Pie run state together with the termination
    status (if any).
    """
    # LSF's `bjobs` can only report info for terminated jobs, if
    # they finished no longer than ``CLEAN_PERIOD`` seconds
    # before; for older jobs it just prints ``Job XXX is not
    # found`` to STDERR.  However, it does the same when passed a
    # non-existent job ID.  We cannot distinguish the two cases
    # here; let's just be optimistic and presume that if a job ID
    # is not found, it must have been terminated since (at least)
    # we have it in our records so it *was* submitted... See
    # issue #513 for details.
    if self._job_not_found_re.match(stderr):
        return self._stat_result(Run.State.TERMINATING, None)
    # LSF `bjobs -l` uses LDIF-style continuation lines, wherein
    # a line is truncated at 79 characters and continues upon the
    # next one; continuation lines start with a fixed amount of
    # whitespace.  However, the amount of whitespace varies with
    # LSF release and possibly other factors, so we need to guess
    # or have users configure it...
    if self._CONTINUATION_LINE_START is None:
        self._CONTINUATION_LINE_START = ' ' \
            * self._guess_continuation_line_prefix_len(stdout)
    # Join continuation lines, so that we can work on a single
    # block of text.
    lines = []
    for line in stdout.split('\n'):
        if len(line) == 0:
            continue
        if line.startswith(self._CONTINUATION_LINE_START):
            # continuation line: glue it onto the previous line,
            # dropping the leading whitespace prefix
            lines[-1] += line[len(self._CONTINUATION_LINE_START):]
        else:
            lines.append(line)
    # now rebuild stdout by joining the reconstructed lines
    stdout = '\n'.join(lines)
    state = Run.State.UNKNOWN
    termstatus = None
    # XXX: this only works if the current status is the first one
    # reported in STDOUT ...
    match = LsfLrms._status_re.search(stdout)
    if match:
        lsf_job_state = match.group('state')
        state = LsfLrms._lsf_state_to_gc3pie_state(lsf_job_state)
        if lsf_job_state == 'DONE':
            # DONE = success
            termstatus = (0, 0)
        elif lsf_job_state == 'EXIT':
            # EXIT = job exited with exit code != 0
            match = LsfLrms._unsuccessful_exit_re.search(stdout)
            if match:
                exit_status = int(match.group('exit_status'))
                termstatus = Run.shellexit_to_returncode(exit_status)
    return self._stat_result(state, termstatus)
def _parse_stat_output(self, stdout, stderr):
    """
    Parse the output of LSF's ``bjobs`` command and return the
    corresponding GC3Pie run state together with the termination
    status (if any).
    """
    # LSF's `bjobs` can only report info for terminated jobs, if
    # they finished no longer than ``CLEAN_PERIOD`` seconds
    # before; for older jobs it just prints ``Job XXX is not
    # found`` to STDERR.  However, it does the same when passed a
    # non-existent job ID.  We cannot distinguish the two cases
    # here; let's just be optimistic and presume that if a job ID
    # is not found, it must have been terminated since (at least)
    # we have it in our records so it *was* submitted... See
    # issue #513 for details.
    if self._job_not_found_re.match(stderr):
        return self._stat_result(Run.State.TERMINATING, None)
    # LSF `bjobs -l` uses LDIF-style continuation lines, wherein
    # a line is truncated at 79 characters and continues upon the
    # next one; continuation lines start with a fixed amount of
    # whitespace.  However, the amount of whitespace varies with
    # LSF release and possibly other factors, so we need to guess
    # or have users configure it...
    if self._CONTINUATION_LINE_START is None:
        self._CONTINUATION_LINE_START = ' ' \
            * self._guess_continuation_line_prefix_len(stdout)
    # Join continuation lines, so that we can work on a single
    # block of text.
    lines = []
    for line in stdout.split('\n'):
        if len(line) == 0:
            continue
        if line.startswith(self._CONTINUATION_LINE_START):
            lines[-1] += line[len(self._CONTINUATION_LINE_START):]
        else:
            lines.append(line)
    # now rebuild stdout by joining the reconstructed lines
    # (idiomatic `'\n'.join(...)` instead of `str.join('\n', ...)`,
    # consistently with the sibling implementation in this file)
    stdout = '\n'.join(lines)
    state = Run.State.UNKNOWN
    termstatus = None
    # XXX: this only works if the current status is the first one
    # reported in STDOUT ...
    match = LsfLrms._status_re.search(stdout)
    if match:
        lsf_job_state = match.group('state')
        state = LsfLrms._lsf_state_to_gc3pie_state(lsf_job_state)
        if lsf_job_state == 'DONE':
            # DONE = success
            termstatus = (0, 0)
        elif lsf_job_state == 'EXIT':
            # EXIT = job exited with exit code != 0
            match = LsfLrms._unsuccessful_exit_re.search(stdout)
            if match:
                exit_status = int(match.group('exit_status'))
                termstatus = Run.shellexit_to_returncode(exit_status)
    return self._stat_result(state, termstatus)
def update_job_state(self, app):
    """
    Query the running status of the local process whose PID is
    stored into `app.execution.lrms_jobid`, and map the POSIX
    process status to GC3Libs `Run.State`.

    Return the (possibly updated) `app.execution.state`.
    """
    self.transport.connect()
    pid = app.execution.lrms_jobid
    # look for the exact PID at the start of a `ps` output line
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps ax | grep -E '^ *%d '" % pid)
    if exit_code == 0:
        log.debug(
            "Process with PID %s found."
            " Checking its running status", pid)
        # Process exists. Check the status
        status = stdout.split()[2]
        if status[0] == 'T':
            # Job stopped
            app.execution.state = Run.State.STOPPED
        elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
            # Job is running. Check manpage of ps both on linux
            # and BSD to know the meaning of these statuses.
            app.execution.state = Run.State.RUNNING
        # NOTE(review): any other status character leaves
        # `app.execution.state` unchanged
    else:
        log.debug(
            "Process with PID %d not found."
            " Checking wrapper file ...", pid)
        app.execution.state = Run.State.TERMINATING
        if pid in self.job_infos:
            # mark the job as finished and return its memory
            # allocation to the resource's available pool
            self.job_infos[pid]['terminated'] = True
            assert (app.requested_memory ==
                    self.job_infos[pid]['requested_memory'])
            if app.requested_memory:
                self.available_memory += app.requested_memory
        # the wrapper script records the process exit code in a
        # well-known file inside the execution directory
        wrapper_filename = posixpath.join(
            app.execution.lrms_execdir,
            ShellcmdLrms.WRAPPER_DIR,
            ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
        try:
            wrapper_file = self.transport.open(wrapper_filename, 'r')
        except Exception as err:
            self._delete_job_resource_file(pid)
            raise gc3libs.exceptions.InvalidValue(
                "Could not open wrapper file '%s' for task '%s': %s"
                % (wrapper_filename, app, err), do_log=True)
        try:
            outcome = self._parse_wrapper_output(wrapper_file)
            app.execution.returncode = \
                Run.shellexit_to_returncode(int(outcome.ReturnCode))
            self._delete_job_resource_file(pid)
        finally:
            # close the wrapper file even if parsing failed
            wrapper_file.close()
    self._get_persisted_resource_state()
    return app.execution.state
def update_job_state(self, app):
    """
    Query the running status of the local process whose PID is
    stored into `app.execution.lrms_jobid`, and map the POSIX
    process status to GC3Libs `Run.State`.

    Return the (possibly updated) `app.execution.state`.
    """
    self.transport.connect()
    pid = app.execution.lrms_jobid
    # anchor the `grep` on the PID column so only the exact PID matches
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps ax | grep -E '^ *%d '" % pid)
    if exit_code == 0:
        log.debug("Process with PID %s found."
                  " Checking its running status", pid)
        # Process exists. Check the status
        status = stdout.split()[2]
        if status[0] == 'T':
            # Job stopped
            app.execution.state = Run.State.STOPPED
        elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
            # Job is running. Check manpage of ps both on linux
            # and BSD to know the meaning of these statuses.
            app.execution.state = Run.State.RUNNING
        # NOTE(review): any other status character leaves
        # `app.execution.state` unchanged
    else:
        log.debug(
            "Process with PID %d not found."
            " Checking wrapper file ...", pid)
        app.execution.state = Run.State.TERMINATING
        if pid in self.job_infos:
            # mark the job as finished and give back its memory
            # allocation to the resource
            self.job_infos[pid]['terminated'] = True
            assert (app.requested_memory ==
                    self.job_infos[pid]['requested_memory'])
            if app.requested_memory:
                self.available_memory += app.requested_memory
        # the wrapper script wrote the process exit code into a
        # well-known file inside the execution directory
        wrapper_filename = posixpath.join(
            app.execution.lrms_execdir,
            ShellcmdLrms.WRAPPER_DIR,
            ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
        try:
            wrapper_file = self.transport.open(wrapper_filename, 'r')
        except Exception as err:
            self._delete_job_resource_file(pid)
            raise gc3libs.exceptions.InvalidValue(
                "Could not open wrapper file '%s' for task '%s': %s"
                % (wrapper_filename, app, err), do_log=True)
        try:
            outcome = self._parse_wrapper_output(wrapper_file)
            app.execution.returncode = \
                Run.shellexit_to_returncode(int(outcome.ReturnCode))
            self._delete_job_resource_file(pid)
        finally:
            # always close the wrapper file, even on parse errors
            wrapper_file.close()
    self._get_persisted_resource_state()
    return app.execution.state
def _parse_secondary_acct_output(self, stdout, stderr):
    """
    Parse `qstat -x -f` output (PBSPro only) and return a
    dictionary of accounting information, with values converted as
    prescribed by `self._pbspro_keyval_mapping`.
    """
    acctinfo = {}
    # FIXME: could be a bit smarter and not use a dumb quadratic
    # complexity algo...
    for line in stdout.split('\n'):
        # BUGFIX: iterate over `.items()` -- iterating the mapping
        # directly yields only its keys, so the `key, (attr, conv)`
        # unpacking would fail (cf. `_qacct_keyval_mapping`, which is
        # used as a dict elsewhere in this file)
        for key, (attr, conv) in self._pbspro_keyval_mapping.items():
            if (key + ' = ') in line:
                # split on the *first* `=` only, so that values
                # containing `=` are not truncated
                value = line.split('=', 1)[1].strip()
                acctinfo[attr] = conv(value)
    assert 'exitcode' in acctinfo, (
        "Could not extract exit code from `qstat -x -f` output")
    acctinfo['termstatus'] = Run.shellexit_to_returncode(
        acctinfo.pop('exitcode'))
    return acctinfo
def _parse_returncode_string(val):
    """
    Convert the string `val` (a POSIX shell exit status) into a
    GC3Pie returncode by way of `Run.shellexit_to_returncode`.
    """
    shell_exit = int(val)
    return Run.shellexit_to_returncode(shell_exit)
def update_job_state(self, app):
    """
    Query the batch system for the status of the job associated to
    `app`, update `app.execution` accordingly, and return the (new)
    job state.

    Falls back from the *stat* command to the primary and then the
    secondary *accounting* command when the job is no longer listed
    by the stat command.

    :raise gc3libs.exceptions.InvalidArgument:
        if `app.execution` has no `lrms_jobid` attribute,
        i.e., the job was never submitted.
    """
    try:
        job = app.execution
        # raises `AttributeError` if the job was never submitted
        job.lrms_jobid
    except AttributeError as ex:
        # `job` has no `lrms_jobid`: object is invalid
        raise gc3libs.exceptions.InvalidArgument(
            "Job object is invalid: %s" % str(ex))
    try:
        self.transport.connect()
        cmd = self._stat_command(job)
        log.debug("Checking remote job status with '%s' ..." % cmd)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code == 0:
            jobstatus = self._parse_stat_output(stdout)
            job.update(jobstatus)
            job.state = jobstatus.get('state', Run.State.UNKNOWN)
            if job.state == Run.State.UNKNOWN:
                log.warning("Unknown batch job status,"
                            " setting GC3Pie job state to `UNKNOWN`")
            if 'exit_status' in jobstatus:
                job.returncode = Run.shellexit_to_returncode(
                    int(jobstatus['exit_status']))
            # SLURM's `squeue` command exits with code 0 if the
            # job ID exists in the database (i.e., a job with that
            # ID has been run) but prints no output.  In this
            # case, we need to continue and examine the accounting
            # command output to get the termination status etc.
            if job.state != Run.State.TERMINATING:
                return job.state
        else:
            log.error("Failed while running the `qstat`/`bjobs` command."
                      " exit code: %d, stderr: '%s'" % (exit_code, stderr))
        # In some batch systems, jobs disappear from qstat
        # output as soon as they are finished.  In these cases,
        # we have to check some *accounting* command to check
        # the exit status.
        cmd = self._acct_command(job)
        if cmd:
            log.debug("Retrieving accounting information using command"
                      " '%s' ..." % cmd)
            try:
                return self.__do_acct(job, cmd, self._parse_acct_output)
            except gc3libs.exceptions.AuxiliaryCommandError:
                # This is used to distinguish between a standard
                # Torque installation and a PBSPro where `tracejob`
                # does not work but if `job_history_enable=True`,
                # then we can actually access information about
                # finished jobs with `qstat -x -f`.
                try:
                    cmd = self._secondary_acct_command(job)
                    if cmd:
                        log.debug(
                            "The primary job accounting command"
                            " returned no information; trying"
                            " with '%s' instead...", cmd)
                        return self.__do_acct(
                            job, cmd, self._parse_secondary_acct_output)
                except (gc3libs.exceptions.AuxiliaryCommandError,
                        NotImplementedError):
                    # ignore error -- there is nothing we can do
                    pass
        # No *stat command and no *acct command returned
        # correctly.
        try:
            # give accounting info a grace period before declaring
            # the job lost
            if (time.time() - job.stat_failed_at) > self.accounting_delay:
                # accounting info should be there, if it's not
                # then job is definitely lost
                log.critical(
                    "Failed executing remote command: '%s';"
                    "exit status %d", cmd, exit_code)
                log.debug(" remote command returned stdout: '%s'", stdout)
                log.debug(" remote command returned stderr: '%s'", stderr)
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing remote command: '%s'; exit status %d"
                    % (cmd, exit_code))
            else:
                # do nothing, let's try later...
                return job.state
        except AttributeError:
            # this is the first time `qstat` fails, record a
            # timestamp and retry later
            job.stat_failed_at = time.time()
    except Exception as ex:
        log.error("Error in querying Batch resource '%s': %s: %s",
                  self.name, ex.__class__.__name__, str(ex))
        raise
    # If we reach this point it means that we don't actually know
    # the current state of the job.
    job.state = Run.State.UNKNOWN
    return job.state
def update_job_state(self, app):
    """
    Query the batch system for the status of the job associated to
    `app`, update `app.execution` accordingly, and return the (new)
    job state.

    When the stat command no longer lists the job, fall back to the
    primary and then the secondary accounting command to recover
    the termination status.

    :raise gc3libs.exceptions.InvalidArgument:
        if `app.execution` has no `lrms_jobid` attribute,
        i.e., the job was never submitted.
    """
    try:
        job = app.execution
        # raises `AttributeError` if the job was never submitted
        job.lrms_jobid
    except AttributeError as ex:
        # `job` has no `lrms_jobid`: object is invalid
        raise gc3libs.exceptions.InvalidArgument(
            "Job object is invalid: %s" % str(ex))
    try:
        self.transport.connect()
        cmd = self._stat_command(job)
        log.debug("Checking remote job status with '%s' ..." % cmd)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code == 0:
            jobstatus = self._parse_stat_output(stdout)
            job.update(jobstatus)
            job.state = jobstatus.get('state', Run.State.UNKNOWN)
            if job.state == Run.State.UNKNOWN:
                log.warning(
                    "Unknown batch job status,"
                    " setting GC3Pie job state to `UNKNOWN`")
            if 'exit_status' in jobstatus:
                job.returncode = Run.shellexit_to_returncode(
                    int(jobstatus['exit_status']))
            # SLURM's `squeue` command exits with code 0 if the
            # job ID exists in the database (i.e., a job with that
            # ID has been run) but prints no output.  In this
            # case, we need to continue and examine the accounting
            # command output to get the termination status etc.
            if job.state != Run.State.TERMINATING:
                return job.state
        else:
            log.error(
                "Failed while running the `qstat`/`bjobs` command."
                " exit code: %d, stderr: '%s'" % (exit_code, stderr))
        # In some batch systems, jobs disappear from qstat
        # output as soon as they are finished.  In these cases,
        # we have to check some *accounting* command to check
        # the exit status.
        cmd = self._acct_command(job)
        if cmd:
            log.debug(
                "Retrieving accounting information using command"
                " '%s' ..." % cmd)
            try:
                return self.__do_acct(job, cmd, self._parse_acct_output)
            except gc3libs.exceptions.AuxiliaryCommandError:
                # This is used to distinguish between a standard
                # Torque installation and a PBSPro where `tracejob`
                # does not work but if `job_history_enable=True`,
                # then we can actually access information about
                # finished jobs with `qstat -x -f`.
                try:
                    cmd = self._secondary_acct_command(job)
                    if cmd:
                        log.debug("The primary job accounting command"
                                  " returned no information; trying"
                                  " with '%s' instead...", cmd)
                        return self.__do_acct(
                            job, cmd, self._parse_secondary_acct_output)
                except (gc3libs.exceptions.AuxiliaryCommandError,
                        NotImplementedError):
                    # ignore error -- there is nothing we can do
                    pass
        # No *stat command and no *acct command returned
        # correctly.
        try:
            # give accounting info a grace period before declaring
            # the job lost
            if (time.time() - job.stat_failed_at) > self.accounting_delay:
                # accounting info should be there, if it's not
                # then job is definitely lost
                log.critical(
                    "Failed executing remote command: '%s';"
                    "exit status %d", cmd, exit_code)
                log.debug(
                    " remote command returned stdout: '%s'", stdout)
                log.debug(
                    " remote command returned stderr: '%s'", stderr)
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing remote command: '%s'; exit status %d"
                    % (cmd, exit_code))
            else:
                # do nothing, let's try later...
                return job.state
        except AttributeError:
            # this is the first time `qstat` fails, record a
            # timestamp and retry later
            job.stat_failed_at = time.time()
    except Exception as ex:
        log.error("Error in querying Batch resource '%s': %s: %s",
                  self.name, ex.__class__.__name__, str(ex))
        raise
    # If we reach this point it means that we don't actually know
    # the current state of the job.
    job.state = Run.State.UNKNOWN
    return job.state