def _cleanup_terminating_task(self, app, pid, termstatus=None):
    app.execution.state = Run.State.TERMINATING
    if termstatus is not None:
        app.execution.returncode = termstatus
    if pid in self.job_infos:
        self.job_infos[pid]['terminated'] = True
        if app.requested_memory is not None:
            assert (app.requested_memory
                    == self.job_infos[pid]['requested_memory'])
            self.available_memory += app.requested_memory
    wrapper_filename = posixpath.join(
        app.execution.lrms_execdir,
        ShellcmdLrms.WRAPPER_DIR,
        ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
    try:
        log.debug(
            "Reading resource utilization from wrapper file `%s`"
            " for task %s ...", wrapper_filename, app)
        with self.transport.open(wrapper_filename, 'r') as wrapper_file:
            outcome = self._parse_wrapper_output(wrapper_file)
            app.execution.update(outcome)
            if termstatus is None:
                app.execution.returncode = outcome.returncode
    except Exception as err:
        msg = ("Could not open wrapper file `{0}` for task `{1}`: {2}"
               .format(wrapper_filename, app, err))
        log.warning(
            "%s -- Termination status and resource utilization"
            " fields will not be set.", msg)
        raise gc3libs.exceptions.InvalidValue(msg)
    finally:
        self._delete_job_resource_file(pid)
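# `_parse_wrapper_output` is not shown in this excerpt.  The following
# is a minimal standalone sketch of what it might do, assuming the
# wrapper writes one `key=value` pair per line (e.g. as produced by GNU
# `time` with a custom format string).  The `Struct` container, the
# `ReturnCode` key name, and the module-level (rather than method)
# definition are illustrative assumptions, not the actual GC3Pie code.
class Struct(dict):
    """A dict whose keys are also readable as attributes."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

def _parse_wrapper_output(wrapper_file):
    """Parse `key=value` lines into a `Struct` (hypothetical format)."""
    outcome = Struct()
    for line in wrapper_file:
        line = line.strip()
        if '=' not in line:
            continue
        key, value = line.split('=', 1)
        if key == 'ReturnCode':  # hypothetical key name
            outcome['returncode'] = int(value)
        else:
            outcome[key] = value
    return outcome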
def free(self, app):
    """
    Delete the temporary directory where a child process has run.

    The temporary directory is removed with all its content,
    recursively.

    If the deletion is successful, the `lrms_execdir` attribute in
    `app.execution` is reset to `None`; subsequent invocations of
    this method on the same application do nothing.
    """
    try:
        if app.execution.lrms_execdir is not None:
            self.transport.connect()
            self.transport.remove_tree(app.execution.lrms_execdir)
            app.execution.lrms_execdir = None
    except Exception as ex:
        log.warning("Could not remove directory '%s': %s: %s",
                    app.execution.lrms_execdir,
                    ex.__class__.__name__, ex)
    try:
        pid = app.execution.lrms_jobid
        self._delete_job_resource_file(pid)
    except AttributeError:
        # `lrms_jobid` not yet assigned: the submit process
        # probably failed earlier; ignore and continue
        pass
def free(self, app):
    job = app.execution
    try:
        self.transport.connect()
        self.transport.remove_tree(job.ssh_remote_folder)
    except Exception as err:
        log.warning("Failed removing remote folder '%s': %s: %s",
                    job.ssh_remote_folder,
                    err.__class__.__name__, err)
    return
def update_job_state(self, app):
    """
    Query the running status of the local process whose PID is
    stored into `app.execution.lrms_jobid`, and map the POSIX
    process status to GC3Libs `Run.State`.
    """
    self.transport.connect()
    pid = app.execution.lrms_jobid
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps ax | grep -E '^ *%d '" % pid)
    if exit_code == 0:
        log.debug("Process with PID %s found."
                  " Checking its running status ...", pid)
        # Process exists; check its status.
        status = stdout.split()[2]
        if status[0] == 'T':
            # Job stopped
            app.execution.state = Run.State.STOPPED
        elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
            # Job is running.  See the `ps` man page on both Linux
            # and BSD for the meaning of these status letters.
            app.execution.state = Run.State.RUNNING
            # if `requested_walltime` is set, enforce it as a
            # running time limit
            if app.requested_walltime is not None:
                exit_code2, stdout2, stderr2 = \
                    self.transport.execute_command(
                        "ps -p %d -o etimes=" % pid)
                if exit_code2 != 0:
                    # job terminated already, do cleanup and return
                    self._cleanup_terminating_task(app, pid)
                    return app.execution.state
                cancel = False
                elapsed = Duration(stdout2.strip() + 'seconds')
                if elapsed > self.max_walltime:
                    log.warning(
                        "Task %s ran for %s, exceeding max_walltime"
                        " %s of resource %s: cancelling it.",
                        app, elapsed.to_timedelta(),
                        self.max_walltime, self.name)
                    cancel = True
                if elapsed > app.requested_walltime:
                    log.warning(
                        "Task %s ran for %s, exceeding own"
                        " `requested_walltime` %s: cancelling it.",
                        app, elapsed.to_timedelta(),
                        app.requested_walltime)
                    cancel = True
                if cancel:
                    self.cancel_job(app)
                    # set signal to SIGTERM in termination status
                    self._cleanup_terminating_task(
                        app, pid, termstatus=(15, -1))
                    return app.execution.state
    else:
        log.debug(
            "Process with PID %d not found,"
            " assuming task %s has finished running.", pid, app)
        self._cleanup_terminating_task(app, pid)
        self._get_persisted_resource_state()
    return app.execution.state
def _parse_stat_output(self, stdout, stderr):
    ge_status_code = stdout.split()[4]
    log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
              ge_status_code)
    if (ge_status_code in ['s', 'S', 'T']
            or ge_status_code.startswith('h')):
        state = Run.State.STOPPED
    elif 'qw' in ge_status_code:
        state = Run.State.SUBMITTED
    elif ('r' in ge_status_code
          or 'R' in ge_status_code
          or 't' in ge_status_code):
        state = Run.State.RUNNING
    elif ge_status_code == 'E':
        # error condition
        state = Run.State.TERMINATING
    else:
        log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                    ge_status_code)
        state = Run.State.UNKNOWN
    # to get the exit status information we'll have to parse
    # `qacct` output, so put ``None`` here
    return self._stat_result(state, None)
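# `_stat_result` is not defined in this excerpt; judging from the call
# above, it is a small record pairing the mapped state with an optional
# termination status.  A hypothetical definition (an assumption, not
# necessarily the actual GC3Pie code) could be a named tuple:
from collections import namedtuple

_stat_result = namedtuple('_stat_result', ['state', 'termstatus'])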
def _parse_acct_output(self, stdout):
    # Antonio: this is an ugly fix, but we have issues with `bacct`
    # on some LSF installations being veeeeery slow, so we have to
    # try and use `bjobs` whenever possible, and fall back to
    # `bacct` if `bjobs` does not work.
    #
    # However, since the user could update the configuration file
    # and put `bacct = bacct`, we also have to ensure that we are
    # calling the correct function to parse the output of the acct
    # command.
    if self._bacct.startswith('bacct'):
        return self.__parse_acct_output_w_bacct(stdout)
    elif self._bacct.startswith('bjobs'):
        return self.__parse_acct_output_w_bjobs(stdout)
    else:
        log.warning(
            "Unknown acct command `%s`. Assuming its output is"
            " compatible with `bacct`", self._bacct)
        return self.__parse_acct_output_w_bacct(stdout)
def _parse_stat_output(self, stdout):
    job_status = stdout.split()[4]
    log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
              job_status)
    jobstatus = dict()
    if job_status in ['s', 'S', 'T'] or job_status.startswith('h'):
        jobstatus['state'] = Run.State.STOPPED
    elif 'qw' in job_status:
        jobstatus['state'] = Run.State.SUBMITTED
    elif 'r' in job_status or 'R' in job_status or 't' in job_status:
        jobstatus['state'] = Run.State.RUNNING
    elif job_status == 'E':
        # error condition
        jobstatus['state'] = Run.State.TERMINATING
    else:
        log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                    job_status)
        jobstatus['state'] = Run.State.UNKNOWN
    return jobstatus
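# A quick illustration of the column parsing above, using a made-up
# `qstat` output row (standard SGE column layout assumed: job-ID,
# prior, name, user, state, submit/start time, queue, slots):
sample_row = "1234 0.55500 myjob alice r 05/01/2017 10:00:00 all.q@n1 1"
assert sample_row.split()[4] == 'r'
# so `_parse_stat_output(sample_row)` would return
# {'state': Run.State.RUNNING}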
def __init__(self, name,
             # these are inherited from the base LRMS class
             architecture, max_cores, max_cores_per_job,
             max_memory_per_core, max_walltime, auth,
             # these are specific to the ARC1 backend
             arc_ldap,
             frontend=None,
             lost_job_timeout=gc3libs.Default.ARC_LOST_JOB_TIMEOUT,
             **extra_args):
    log.warning(
        "The ARC1 backend (used in resource '%s') is deprecated"
        " and will be removed in a future release."
        " Consider changing your configuration.", name)
    # check if the `arc` module has been imported
    if not have_arc_module:
        raise gc3libs.exceptions.LRMSError(
            "Could not import `arc` module, disable ARC1 resources.")
    # init base class
    LRMS.__init__(
        self, name,
        architecture, max_cores, max_cores_per_job,
        max_memory_per_core, max_walltime, auth)
    # ARC1-specific setup
    self.lost_job_timeout = lost_job_timeout
    self.arc_ldap = arc_ldap
    if frontend is None:
        if self.arc_ldap is not None:
            # extract frontend information from the `arc_ldap` entry
            try:
                resource_url = gc3libs.url.Url(arc_ldap)
                self.frontend = resource_url.hostname
            except Exception as err:
                raise gc3libs.exceptions.ConfigurationError(
                    "Configuration error: resource '%s' has no valid"
                    " 'arc_ldap' setting: %s: %s"
                    % (name, err.__class__.__name__, err))
        else:
            self.frontend = None
def _lsf_state_to_gc3pie_state(stat):
    log.debug("Translating LSF's `bjobs` status '%s' to"
              " gc3libs.Run.State ...", stat)
    try:
        return {
            # LSF 'stat' mapping:
            'PEND': Run.State.SUBMITTED,
            'RUN': Run.State.RUNNING,
            'PSUSP': Run.State.STOPPED,
            'USUSP': Run.State.STOPPED,
            'SSUSP': Run.State.STOPPED,
            # DONE = successful termination
            'DONE': Run.State.TERMINATING,
            # EXIT = job was killed / exit forced
            'EXIT': Run.State.TERMINATING,
            # ZOMBI = job "killed" and unreachable
            'ZOMBI': Run.State.TERMINATING,
            'UNKWN': Run.State.UNKNOWN,
        }[stat]
    except KeyError:
        log.warning("Unknown LSF job status '%s', returning `UNKNOWN`",
                    stat)
        return Run.State.UNKNOWN
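# Hypothetical usage of the mapping above (status codes as documented
# in the LSF `bjobs` man page):
assert _lsf_state_to_gc3pie_state('RUN') == Run.State.RUNNING
assert _lsf_state_to_gc3pie_state('PEND') == Run.State.SUBMITTED
# any unlisted status code degrades gracefully:
assert _lsf_state_to_gc3pie_state('WEIRD') == Run.State.UNKNOWN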
def peek(self, app, remote_filename, local_file, offset=0, size=None):
    job = app.execution
    assert 'lrms_jobid' in job, \
        "Missing attribute `lrms_jobid` on `Job` instance" \
        " passed to `Arc1Lrms.peek`."
    controller, j = self._get_job_and_controller(job.lrms_jobid)
    if size is None:
        size = sys.maxint
    # `local_file` could be a file name (string) or a file-like
    # object, as per function docstring; ensure `local_file_name`
    # is the local path
    try:
        local_file_name = local_file.name
    except AttributeError:
        local_file_name = local_file
    source_url = arc.URL(job.lrms_jobid + '/' + remote_filename)
    destination_url = arc.URL(local_file_name)
    # download file
    log.debug("Arc1Lrms.peek(): downloading remote file '%s'"
              " into local file '%s' ...",
              remote_filename, local_file_name)
    if not controller.ARCCopyFile(source_url, destination_url):
        log.warning("Failed downloading '%s' to '%s'",
                    source_url.str(), destination_url.str())
    log.debug("Arc1Lrms.peek(): arc.JobController.ARCCopyFile completed")
def update_job_state(self, app):
    try:
        job = app.execution
        job.lrms_jobid
    except AttributeError as ex:
        # `job` has no `lrms_jobid`: object is invalid
        raise gc3libs.exceptions.InvalidArgument(
            "Job object is invalid: %s" % str(ex))
    try:
        self.transport.connect()
        cmd = self._stat_command(job)
        log.debug("Checking remote job status with '%s' ...", cmd)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code == 0:
            jobstatus = self._parse_stat_output(stdout)
            job.update(jobstatus)
            job.state = jobstatus.get('state', Run.State.UNKNOWN)
            if job.state == Run.State.UNKNOWN:
                log.warning(
                    "Unknown batch job status,"
                    " setting GC3Pie job state to `UNKNOWN`")
            if 'exit_status' in jobstatus:
                job.returncode = Run.shellexit_to_returncode(
                    int(jobstatus['exit_status']))
            # SLURM's `squeue` command exits with code 0 if the
            # job ID exists in the database (i.e., a job with that
            # ID has been run) but prints no output.  In this
            # case, we need to continue and examine the accounting
            # command output to get the termination status etc.
            if job.state != Run.State.TERMINATING:
                return job.state
        else:
            log.error(
                "Failed while running the `qstat`/`bjobs` command;"
                " exit code: %d, stderr: '%s'", exit_code, stderr)
        # In some batch systems, jobs disappear from `qstat`
        # output as soon as they are finished.  In these cases,
        # we have to run some *accounting* command to check
        # the exit status.
        cmd = self._acct_command(job)
        if cmd:
            log.debug(
                "Retrieving accounting information using command"
                " '%s' ...", cmd)
            try:
                return self.__do_acct(job, cmd, self._parse_acct_output)
            except gc3libs.exceptions.AuxiliaryCommandError:
                # This is used to distinguish between a standard
                # Torque installation and a PBSPro one, where
                # `tracejob` does not work but, if
                # `job_history_enable=True`, we can actually access
                # information about finished jobs with `qstat -x -f`.
                try:
                    cmd = self._secondary_acct_command(job)
                    if cmd:
                        log.debug(
                            "The primary job accounting command"
                            " returned no information; trying"
                            " with '%s' instead ...", cmd)
                        return self.__do_acct(
                            job, cmd, self._parse_secondary_acct_output)
                except (gc3libs.exceptions.AuxiliaryCommandError,
                        NotImplementedError):
                    # ignore error -- there is nothing we can do
                    pass
        # Neither the *stat command nor the *acct command
        # returned correctly.
        try:
            if (time.time() - job.stat_failed_at) > self.accounting_delay:
                # accounting info should be there; if it's not,
                # then the job is definitely lost
                log.critical(
                    "Failed executing remote command: '%s';"
                    " exit status %d", cmd, exit_code)
                log.debug(
                    "  remote command returned stdout: '%s'", stdout)
                log.debug(
                    "  remote command returned stderr: '%s'", stderr)
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing remote command: '%s';"
                    " exit status %d" % (cmd, exit_code))
            else:
                # do nothing, let's try later ...
                return job.state
        except AttributeError:
            # this is the first time `qstat` fails; record a
            # timestamp and retry later
            job.stat_failed_at = time.time()
    except Exception as ex:
        log.error("Error in querying Batch resource '%s': %s: %s",
                  self.name, ex.__class__.__name__, str(ex))
        raise
    # If we reach this point it means that we don't actually know
    # the current state of the job.
    job.state = Run.State.UNKNOWN
    return job.state
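# `__do_acct` is not shown in this excerpt.  The following is a minimal
# sketch of what it plausibly does, inferred from the two call sites
# above: run the given accounting command, parse its output with the
# given parser (which returns a dict like `_parse_stat_output` does),
# and update the job accordingly.  The method body and the exception
# message are assumptions, not the actual GC3Pie implementation.
def __do_acct(self, job, cmd, parse):
    exit_code, stdout, stderr = self.transport.execute_command(cmd)
    if exit_code == 0:
        jobstatus = parse(stdout)
        job.update(jobstatus)
        # accounting info is only available for finished jobs
        job.state = jobstatus.get('state', Run.State.TERMINATING)
        if 'exit_status' in jobstatus:
            job.returncode = Run.shellexit_to_returncode(
                int(jobstatus['exit_status']))
        return job.state
    else:
        raise gc3libs.exceptions.AuxiliaryCommandError(
            "Accounting command `%s` failed with exit code %d"
            % (cmd, exit_code))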
def update_job_state(self, app):
    """
    Query the running status of the local process whose PID is
    stored into `app.execution.lrms_jobid`, and map the POSIX
    process status to GC3Libs `Run.State`.
    """
    self.transport.connect()
    pid = app.execution.lrms_jobid
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps ax | grep -E '^ *%d '" % pid)
    if exit_code == 0:
        log.debug(
            "Process with PID %s found."
            " Checking its running status ...", pid)
        # Process exists; check its status.
        status = stdout.split()[2]
        if status[0] == 'T':
            # Job stopped
            app.execution.state = Run.State.STOPPED
        elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
            # Job is running.  See the `ps` man page on both Linux
            # and BSD for the meaning of these status letters.
            app.execution.state = Run.State.RUNNING
            # if `requested_walltime` is set, enforce it as a
            # running time limit
            if app.requested_walltime is not None:
                exit_code2, stdout2, stderr2 = \
                    self.transport.execute_command(
                        "ps -p %d -o etime=" % pid)
                if exit_code2 != 0:
                    # job terminated already, do cleanup and return
                    self._cleanup_terminating_task(app, pid)
                    return app.execution.state
                cancel = False
                elapsed = _parse_time_duration(stdout2.strip())
                if elapsed > self.max_walltime:
                    log.warning(
                        "Task %s ran for %s, exceeding max_walltime"
                        " %s of resource %s: cancelling it.",
                        app, elapsed.to_timedelta(),
                        self.max_walltime, self.name)
                    cancel = True
                if elapsed > app.requested_walltime:
                    log.warning(
                        "Task %s ran for %s, exceeding own"
                        " `requested_walltime` %s: cancelling it.",
                        app, elapsed.to_timedelta(),
                        app.requested_walltime)
                    cancel = True
                if cancel:
                    self.cancel_job(app)
                    # set signal to SIGTERM in termination status
                    self._cleanup_terminating_task(
                        app, pid, termstatus=(15, -1))
                    return app.execution.state
    else:
        log.debug(
            "Process with PID %d not found,"
            " assuming task %s has finished running.", pid, app)
        self._cleanup_terminating_task(app, pid)
        self._get_persisted_resource_state()
    return app.execution.state
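# `_parse_time_duration` is not shown above.  Below is a minimal
# self-contained sketch of what it might do, assuming the
# `[[dd-]hh:]mm:ss` format that `ps -o etime=` prints on Linux and BSD.
# It returns a `datetime.timedelta` for illustration; the actual
# GC3Pie helper presumably returns a comparable
# `gc3libs.quantity.Duration` so that `elapsed > self.max_walltime`
# works as written above.
from datetime import timedelta

def _parse_time_duration(val):
    """Parse `ps -o etime=` output, e.g. '21-18:26:30' or '12:05'."""
    days = 0
    if '-' in val:
        days_part, val = val.split('-', 1)
        days = int(days_part)
    parts = [int(p) for p in val.split(':')]
    while len(parts) < 3:
        # pad missing hours (and minutes) on the left
        parts.insert(0, 0)
    hrs, mins, secs = parts
    return timedelta(days=days, hours=hrs, minutes=mins, seconds=secs)

# example:
#   _parse_time_duration('1-02:03:04')
#   == timedelta(days=1, hours=2, minutes=3, seconds=4)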
def get_results(self, app, download_dir, overwrite=False):
    jobid = app.execution.lrms_jobid
    # XXX: can raise an encoding/decoding error if `download_dir`
    # is not ASCII, but the ARClib bindings don't accept
    # Python `unicode` strings.
    download_dir = str(download_dir)
    c, j = self._get_job_and_controller(jobid)
    # As ARC complains when downloading to an already-existing
    # directory, make a temporary directory for downloading files;
    # then move files to their final destination and delete the
    # temporary location.
    tmp_download_dir = tempfile.mkdtemp(suffix='.d', dir=download_dir)
    log.debug("Downloading %s output into temporary location '%s' ...",
              app, tmp_download_dir)
    # get the list of downloadable files
    download_file_list = c.GetDownloadFiles(j.JobID)
    source_url = arc.URL(j.JobID.str())
    destination_url = arc.URL(tmp_download_dir)
    source_path_prefix = source_url.Path()
    destination_path_prefix = destination_url.Path()
    errors = 0
    for remote_file in download_file_list:
        source_url.ChangePath(
            os.path.join(source_path_prefix, remote_file))
        destination_url.ChangePath(
            os.path.join(destination_path_prefix, remote_file))
        if not c.ARCCopyFile(source_url, destination_url):
            log.warning("Failed downloading '%s' to '%s'",
                        source_url.str(), destination_url.str())
            errors += 1
    if errors > 0:
        # remove temporary download location
        shutil.rmtree(tmp_download_dir, ignore_errors=True)
        raise gc3libs.exceptions.UnrecoverableDataStagingError(
            "Failed downloading remote folder of job '%s' into '%s'."
            " There were %d errors, reported at the WARNING level"
            " in log files." % (jobid, download_dir, errors))
    log.debug("Moving %s output into download location '%s' ...",
              app, download_dir)
    entries = os.listdir(tmp_download_dir)
    if not overwrite:
        # raise an early error before we start mixing files from
        # the old and new download directories
        for entry in entries:
            dst = os.path.join(download_dir, entry)
            if os.path.exists(dst):
                # remove temporary download location
                shutil.rmtree(tmp_download_dir, ignore_errors=True)
                raise gc3libs.exceptions.UnrecoverableDataStagingError(
                    "Entry '%s' in download directory '%s' already"
                    " exists, and no overwriting was requested."
                    % (entry, download_dir))
    # move all entries to the final destination
    for entry in entries:
        src = os.path.join(tmp_download_dir, entry)
        dst = os.path.join(download_dir, entry)
        if os.path.isdir(dst):
            shutil.rmtree(dst)
        os.rename(src, dst)
    # remove temporary download location
    # (XXX: is it correct to ignore errors here?)
    shutil.rmtree(tmp_download_dir, ignore_errors=True)
    app.execution.download_dir = download_dir
    return