def _parse_acct_output(self, stdout, stderr):
    acctinfo = {}
    for line in stdout.split("\n"):
        # skip empty and header lines
        line = line.strip()
        if line == "" or "===" in line:
            continue
        # extract key/value pairs from `qacct` output
        key, value = line.split(" ", 1)
        value = value.strip()
        if key == "failed":
            # value may be, e.g., "100 : assumedly after job"
            value = value.split()[0]
        try:
            dest, conv = self._qacct_keyval_mapping[key]
            acctinfo[dest] = conv(value)
        except KeyError:
            # no conversion by default -- keep it a string
            acctinfo["sge_" + key] = value
        except (ValueError, TypeError) as err:
            log.error(
                "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                value, key, err.__class__.__name__, str(err),
            )
            acctinfo[dest] = None
    assert "exitcode" in acctinfo, \
        "Could not extract exit code from `qacct` output"
    acctinfo["termstatus"] = Run.shellexit_to_returncode(
        acctinfo.pop("exitcode"))
    return acctinfo
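
# Illustrative, self-contained rework of the parsing loop above (not part of
# the backend): `QACCT_SAMPLE` and the two-entry mapping are fabricated for
# this example; the real `_qacct_keyval_mapping` covers many more keys and
# converters.
QACCT_SAMPLE = """\
==============================================================
qname        all.q
hostname     compute-0-1
failed       0
exit_status  0
"""

def parse_qacct(output, mapping):
    """Return a dict of accounting info parsed from `qacct -j` output."""
    info = {}
    for line in output.split("\n"):
        line = line.strip()
        if line == "" or "===" in line:
            continue
        key, value = line.split(" ", 1)
        value = value.strip()
        if key == "failed":
            value = value.split()[0]
        try:
            dest, conv = mapping[key]
            info[dest] = conv(value)
        except KeyError:
            # unmapped keys are kept as strings, prefixed with `sge_`
            info["sge_" + key] = value
    return info

# parse_qacct(QACCT_SAMPLE, {'exit_status': ('exitcode', int),
#                            'failed': ('failed', int)})
# -> {'sge_qname': 'all.q', 'sge_hostname': 'compute-0-1',
#     'failed': 0, 'exitcode': 0}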
def peek(self, app, remote_filename, local_file, offset=0, size=None):
    job = app.execution
    assert 'ssh_remote_folder' in job, \
        "Missing attribute `ssh_remote_folder` on `Job` instance" \
        " passed to `PbsLrms.peek`."
    if size is None:
        size = sys.maxsize
    _filename_mapping = generic_filename_mapping(
        job.lrms_jobname, job.lrms_jobid, remote_filename)
    _remote_filename = os.path.join(
        job.ssh_remote_folder, _filename_mapping)
    try:
        self.transport.connect()
        remote_handler = self.transport.open(
            _remote_filename, mode='r', bufsize=-1)
        remote_handler.seek(offset)
        data = remote_handler.read(size)
    except Exception as ex:
        log.error("Could not read remote file '%s': %s: %s",
                  _remote_filename, ex.__class__.__name__, str(ex))
        # re-raise: without `data` there is nothing to write locally
        raise
    try:
        local_file.write(data)
    except (TypeError, AttributeError):
        # `local_file` is a path, not a file-like object
        output_file = open(local_file, 'w+b')
        output_file.write(data)
        output_file.close()
    log.debug('... Done.')
def _parse_acct_output(self, stdout):
    jobstatus = dict()
    for line in stdout.split('\n'):
        # skip empty and header lines
        line = line.strip()
        if line == '' or '===' in line:
            continue
        # extract key/value pairs from `qacct` output
        key, value = line.split(' ', 1)
        value = value.strip()
        if key == 'failed':
            # value may be, e.g., "100 : assumedly after job"
            value = value.split()[0]
        try:
            dest, conv = self._qacct_keyval_mapping[key]
            jobstatus[dest] = conv(value)
        except KeyError:
            # no conversion by default -- keep it a string
            jobstatus['sge_' + key] = value
        except (ValueError, TypeError) as err:
            log.error(
                "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                value, key, err.__class__.__name__, str(err))
            jobstatus[dest] = None
    return jobstatus
def cancel_job(self, app):
    try:
        pid = int(app.execution.lrms_jobid)
    except ValueError:
        raise gc3libs.exceptions.InvalidArgument(
            "Invalid field `lrms_jobid` in Job '%s':"
            " expected a number, got '%s' (%s) instead"
            % (app, app.execution.lrms_jobid,
               type(app.execution.lrms_jobid)))
    self.transport.connect()
    exit_code, stdout, stderr = self.transport.execute_command(
        'kill %d' % pid)
    # XXX: should we check that the process actually died?
    if exit_code != 0:
        # Error killing the process. It may not exist or we don't
        # have permission to kill it.
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            # The PID refers to an existing process, but we
            # couldn't kill it.
            log.error("Could not kill job '%s': %s", pid, stderr)
        else:
            # The PID refers to a non-existing process.
            log.error(
                "Could not kill job '%s'. It refers to non-existent"
                " local process %s.", app, app.execution.lrms_jobid)
    self._delete_job_resource_file(pid)
def free(self, app):
    controller, job = self._get_job_and_controller(
        app.execution.lrms_jobid)
    log.debug("Calling JobController.CleanJob")
    if not controller.CleanJob(job):
        log.error("arc1.JobController.CleanJob returned False"
                  " for ARC job ID '%s'", app.execution.lrms_jobid)
    # XXX: this is necessary, as other components of the ARC library
    # seem to refer to the `job.xml` file; remove the job from it
    log.debug("Removing job '%s' from jobfile '%s'",
              app, gc3libs.Default.ARC_JOBLIST_LOCATION)
    job.RemoveJobsFromFile(
        gc3libs.Default.ARC_JOBLIST_LOCATION, [job.IDFromEndpoint])
def get_resource_status(self):
    try:
        self.transport.connect()

        _command = ("%s -U %s" % (self._qstat, self._username))
        log.debug("Running `%s`...", _command)
        exit_code, qstat_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SGE backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, qstat_stdout, stderr))

        _command = ("%s -F -U %s" % (self._qstat, self._username))
        log.debug("Running `%s`...", _command)
        exit_code, qstat_F_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SGE backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, qstat_F_stdout, stderr))

        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(qstat_stdout, self._username)
        slots = compute_nr_of_slots(qstat_F_stdout)
        self.free_slots = int(slots['global']['available'])
        self.used_quota = -1

        log.info(
            "Updated resource '%s' status:"
            " free slots: %d,"
            " own running jobs: %d,"
            " own queued jobs: %d,"
            " total queued jobs: %d",
            self.name,
            self.free_slots,
            self.user_run,
            self.user_queued,
            self.queued,
        )
        return self

    except Exception as ex:
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex))
        raise
def terminated(self):
    # full path to output file on local filesystem
    output_file = join(self.output_dir, self.output_file_name)
    # if the output file is not there, log an error and exit
    if not exists(output_file):
        log.error("Expected output file `%s` from %s does not exist!",
                  output_file, self)
        return
    # ensure destination directory exists
    if not exists('pictures'):
        os.mkdir('pictures')
    # the trailing slash ensures `shutil.move` raises an error if
    # the destination exists but is not a directory
    move(output_file, 'pictures/')
def get_resource_status(self):
    self.updated = False
    try:
        self.transport.connect()

        _command = ("%s --noheader -o '%%i^%%T^%%u^%%U^%%r^%%R'"
                    % self._squeue)
        log.debug("Running `%s`...", _command)
        exitcode, stdout, stderr = self.transport.execute_command(_command)
        if exitcode != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SLURM backend failed executing '%s':"
                " exit code: %d; stdout: '%s', stderr: '%s'"
                % (_command, exitcode, stdout, stderr))

        log.debug("Computing updated values for total/available slots ...")
        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(stdout, self._username)
        self.total_run = total_running
        self.free_slots = -1
        self.used_quota = -1

        log.info(
            "Updated resource '%s' status:"
            " free slots: %d,"
            " total running: %d,"
            " own running jobs: %d,"
            " own queued jobs: %d,"
            " total queued jobs: %d",
            self.name,
            self.free_slots,
            self.total_run,
            self.user_run,
            self.user_queued,
            self.queued,
        )
        return self

    except Exception as ex:
        # self.transport.close()
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex), exc_info=True)
        raise
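
# Hedged sketch of the `count_jobs` helper consumed above. It assumes the
# caret-separated record layout requested via
# `squeue --noheader -o '%i^%T^%u^%U^%r^%R'` (jobid, state, user, uid,
# reason, nodelist); the actual helper shipped with the SLURM backend may
# differ in detail.
def count_jobs(squeue_output, username):
    """
    Return `(total_running, total_queued, own_running, own_queued)`
    computed from `squeue` output.
    """
    total_running = total_queued = own_running = own_queued = 0
    for line in squeue_output.split('\n'):
        if not line.strip():
            continue
        jobid, state, user, uid, reason, nodelist = line.split('^')
        # `%T` prints long state names ("RUNNING"); accept the compact
        # `%t` form ("R") as well, just in case
        if state in ('RUNNING', 'R', 'COMPLETING', 'CG'):
            total_running += 1
            if user == username:
                own_running += 1
        else:
            total_queued += 1
            if user == username:
                own_queued += 1
    return (total_running, total_queued, own_running, own_queued)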
def _read_job_resource_file(self, pid):
    """
    Get resource information on the job with process ID `pid`.
    Raise an error if the resource file does not exist or
    cannot be read.
    """
    self.transport.connect()
    log.debug("Reading resource file for pid %s", pid)
    jobinfo = None
    fname = posixpath.join(self.resource_dir, str(pid))
    with self.transport.open(fname, 'rb') as fp:
        try:
            jobinfo = pickle.load(fp)
        except Exception as ex:
            log.error("Unable to read remote resource file %s: %s",
                      fname, ex)
            raise
    return jobinfo
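
# The reader above has a matching writer, `_update_job_resource_file`,
# invoked at the end of `submit_job` below. A minimal sketch of it, assuming
# the same one-pickle-file-per-PID layout under `self.resource_dir` and the
# same module-level `pickle`/`posixpath`/`log` helpers; the actual
# implementation may differ.
def _update_job_resource_file(self, pid, resources):
    """Write `resources` to the resource file for job `pid`."""
    self.transport.connect()
    fname = posixpath.join(self.resource_dir, str(pid))
    log.debug("Updating resource file for pid %s", pid)
    with self.transport.open(fname, 'wb') as fp:
        pickle.dump(resources, fp, protocol=pickle.HIGHEST_PROTOCOL)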
def get_resource_status(self):
    self.updated = False
    try:
        self.transport.connect()

        _command = ('%s -a' % self._qstat)
        log.debug("Running `%s`...", _command)
        exit_code, qstat_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "PBS backend failed executing '%s':"
                " exit code: %d; stdout: '%s', stderr: '%s'"
                % (_command, exit_code, qstat_stdout, stderr))

        log.debug("Computing updated values for total/available slots ...")
        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(qstat_stdout, self._username)
        self.total_run = total_running
        self.free_slots = -1
        self.used_quota = -1

        log.info(
            "Updated resource '%s' status:"
            " free slots: %d,"
            " total running: %d,"
            " own running jobs: %d,"
            " own queued jobs: %d,"
            " total queued jobs: %d",
            self.name,
            self.free_slots,
            self.total_run,
            self.user_run,
            self.user_queued,
            self.queued,
        )
        return self

    except Exception as ex:
        # self.transport.close()
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex), exc_info=True)
        raise
def cancel_job(self, app):
    job = app.execution
    try:
        self.transport.connect()
        cmd = self._cancel_command(job.lrms_jobid)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code != 0:
            # XXX: it is possible that `qdel` fails because the job
            # has already completed, so `cancel_job` should be
            # tolerant of such errors
            log.error(
                "Failed executing remote command '%s'; exit status %d",
                cmd, exit_code)
            log.debug("  remote command returned STDOUT '%s'", stdout)
            log.debug("  remote command returned STDERR '%s'", stderr)
            if exit_code == 127:
                # command was not executed, time to signal an exception
                raise gc3libs.exceptions.LRMSError(
                    "Cannot execute remote command '%s'"
                    " -- See DEBUG level log for details"
                    % (cmd,))
        return job
    except:
        log.critical("Failure cancelling job")
        raise
def cancel_job(self, app):
    try:
        pid = int(app.execution.lrms_jobid)
    except ValueError:
        raise gc3libs.exceptions.InvalidArgument(
            "Invalid field `lrms_jobid` in Job '%s':"
            " expected a number, got '%s' (%s) instead"
            % (app, app.execution.lrms_jobid,
               type(app.execution.lrms_jobid)))

    self.transport.connect()
    # Kill all the processes belonging to the same session as the
    # pid we actually started.
    # On Linux, kill '$(ps -o pid= -g $(ps -o sess= -p %d))' would
    # be enough, but on MacOSX it doesn't work.
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps -p %d -o sess=" % pid)
    if exit_code != 0 or not stdout.strip():
        # No PID found. We cannot recover the session group of the
        # process, so we cannot kill any remaining orphan process.
        log.error("Unable to find job '%s': no pid found.", pid)
    else:
        exit_code, stdout, stderr = self.transport.execute_command(
            'kill $(ps -ax -o sess=,pid= | egrep "^[ \t]*%s[ \t]")'
            % stdout.strip())
        # XXX: should we check that the process actually died?
        if exit_code != 0:
            # Error killing the process. It may not exist or we don't
            # have permission to kill it.
            exit_code, stdout, stderr = self.transport.execute_command(
                "ps ax | grep -E '^ *%d '" % pid)
            if exit_code == 0:
                # The PID refers to an existing process, but we
                # couldn't kill it.
                log.error("Could not kill job '%s': %s", pid, stderr)
            else:
                # The PID refers to a non-existing process.
                log.error(
                    "Could not kill job '%s'. It refers to non-existent"
                    " local process %s.", app, app.execution.lrms_jobid)
    self._delete_job_resource_file(pid)
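
# Standalone, non-destructive sketch of the session-group lookup performed
# above (POSIX only): it reproduces the two `ps` invocations but merely
# returns the PIDs instead of killing them. The function name is
# illustrative, not part of the backend.
import subprocess

def pids_in_same_session(pid):
    """Return the PIDs sharing the session of `pid` (empty list if unknown)."""
    sess = subprocess.run(
        ['ps', '-p', str(pid), '-o', 'sess='],
        capture_output=True, text=True).stdout.strip()
    if not sess:
        return []
    out = subprocess.run(
        ['ps', '-ax', '-o', 'sess=,pid='],
        capture_output=True, text=True).stdout
    return [int(p) for s, p in
            (line.split() for line in out.splitlines() if line.strip())
            if s == sess]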
def update_job_state(self, app):
    try:
        job = app.execution
        job.lrms_jobid
    except AttributeError as ex:
        # `job` has no `lrms_jobid`: object is invalid
        raise gc3libs.exceptions.InvalidArgument(
            "Job object is invalid: %s" % str(ex))
    try:
        self.transport.connect()
        cmd = self._stat_command(job)
        log.debug("Checking remote job status with '%s' ...", cmd)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code == 0:
            jobstatus = self._parse_stat_output(stdout)
            job.update(jobstatus)
            job.state = jobstatus.get('state', Run.State.UNKNOWN)
            if job.state == Run.State.UNKNOWN:
                log.warning(
                    "Unknown batch job status,"
                    " setting GC3Pie job state to `UNKNOWN`")
            if 'exit_status' in jobstatus:
                job.returncode = Run.shellexit_to_returncode(
                    int(jobstatus['exit_status']))
            # SLURM's `squeue` command exits with code 0 if the
            # job ID exists in the database (i.e., a job with that
            # ID has been run) but prints no output.  In this
            # case, we need to continue and examine the accounting
            # command output to get the termination status etc.
            if job.state != Run.State.TERMINATING:
                return job.state
        else:
            log.error(
                "Failed while running the `qstat`/`bjobs` command;"
                " exit code: %d, stderr: '%s'", exit_code, stderr)

        # In some batch systems, jobs disappear from the `qstat`
        # output as soon as they are finished. In these cases, we
        # have to check some *accounting* command to get the exit
        # status.
        cmd = self._acct_command(job)
        if cmd:
            log.debug(
                "Retrieving accounting information using command"
                " '%s' ...", cmd)
            try:
                return self.__do_acct(job, cmd, self._parse_acct_output)
            except gc3libs.exceptions.AuxiliaryCommandError:
                # This is used to distinguish between a standard
                # Torque installation and a PBSPro where `tracejob`
                # does not work but if `job_history_enable=True`,
                # then we can actually access information about
                # finished jobs with `qstat -x -f`.
                try:
                    cmd = self._secondary_acct_command(job)
                    if cmd:
                        log.debug("The primary job accounting command"
                                  " returned no information; trying"
                                  " with '%s' instead...", cmd)
                        return self.__do_acct(
                            job, cmd, self._parse_secondary_acct_output)
                except (gc3libs.exceptions.AuxiliaryCommandError,
                        NotImplementedError):
                    # ignore error -- there is nothing we can do
                    pass

        # Neither the *stat command nor the *acct command
        # returned correctly.
        try:
            if (time.time() - job.stat_failed_at) > self.accounting_delay:
                # accounting info should be there; if it's not,
                # the job is definitely lost
                log.critical(
                    "Failed executing remote command: '%s';"
                    " exit status %d", cmd, exit_code)
                log.debug(
                    "  remote command returned stdout: '%s'", stdout)
                log.debug(
                    "  remote command returned stderr: '%s'", stderr)
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing remote command: '%s'; exit status %d"
                    % (cmd, exit_code))
            else:
                # do nothing, let's try later...
                return job.state
        except AttributeError:
            # this is the first time `qstat` fails, record a
            # timestamp and retry later
            job.stat_failed_at = time.time()

    except Exception as ex:
        log.error("Error in querying Batch resource '%s': %s: %s",
                  self.name, ex.__class__.__name__, str(ex))
        raise

    # If we reach this point it means that we don't actually know
    # the current state of the job.
    job.state = Run.State.UNKNOWN
    return job.state
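
# `__do_acct` is called above but not shown in this excerpt. A plausible
# minimal sketch of it, inferred from how the accounting fallback is used:
# run the accounting command, parse its output with the supplied parser,
# fold the result into the job, and signal failure via
# `AuxiliaryCommandError`. The actual implementation may differ.
def __do_acct(self, job, cmd, parse):
    exit_code, stdout, stderr = self.transport.execute_command(cmd)
    if exit_code == 0:
        jobstatus = parse(stdout)
        job.update(jobstatus)
        return job.state
    else:
        raise gc3libs.exceptions.AuxiliaryCommandError(
            "Accounting command `%s` failed with exit code %d"
            % (cmd, exit_code))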
def _gather_machine_specs(self):
    """
    Gather information about this machine and, if `self.override`
    is true, also update the value of the `max_cores` and
    `max_memory_per_core` attributes.

    This method works with both Linux and MacOSX.
    """
    self.transport.connect()

    # expand env variables in the `resource_dir` setting
    exit_code, stdout, stderr = self.transport.execute_command(
        'echo %s' % sh_quote_unsafe(self.cfg_resourcedir))
    self.resource_dir = stdout.strip()

    # XXX: it is actually necessary to create the folder
    # as a separate step
    if not self.transport.exists(self.resource_dir):
        try:
            log.info("Creating resource file directory: '%s' ...",
                     self.resource_dir)
            self.transport.makedirs(self.resource_dir)
        except Exception as ex:
            log.error("Failed creating resource directory '%s':"
                      " %s: %s", self.resource_dir, type(ex), str(ex))
            # cannot continue
            raise

    exit_code, stdout, stderr = self.transport.execute_command('uname -m')
    arch = gc3libs.config._parse_architecture(stdout)
    if arch != self.architecture:
        raise gc3libs.exceptions.ConfigurationError(
            "Invalid architecture: configuration file says `%s` but"
            " it actually is `%s`"
            % (str.join(', ', self.architecture),
               str.join(', ', arch)))

    exit_code, stdout, stderr = self.transport.execute_command('uname -s')
    self.running_kernel = stdout.strip()

    # ensure `time_cmd` points to a valid value
    self.time_cmd = self._locate_gnu_time()
    if not self.time_cmd:
        raise gc3libs.exceptions.ConfigurationError(
            "Unable to find GNU `time` installed on your system."
            " Please, install GNU time and set the `time_cmd`"
            " configuration option in gc3pie.conf.")

    if not self.override:
        # Ignore other values.
        return

    if self.running_kernel == 'Linux':
        exit_code, stdout, stderr = self.transport.execute_command('nproc')
        max_cores = int(stdout)

        # get the amount of total memory from /proc/meminfo
        with self.transport.open('/proc/meminfo', 'r') as fd:
            for line in fd:
                if line.startswith('MemTotal'):
                    self.total_memory = int(line.split()[1]) * Memory.KiB
                    break

    elif self.running_kernel == 'Darwin':
        exit_code, stdout, stderr = self.transport.execute_command(
            'sysctl hw.ncpu')
        max_cores = int(stdout.split(':')[-1])

        exit_code, stdout, stderr = self.transport.execute_command(
            'sysctl hw.memsize')
        self.total_memory = int(stdout.split(':')[1]) * Memory.B

    if max_cores != self.max_cores:
        log.info(
            "Mismatch of value `max_cores` on resource '%s':"
            " configuration file says `max_cores=%d` while it's"
            " actually `%d`. Updating current value.",
            self.name, self.max_cores, max_cores)
        self.max_cores = max_cores

    if self.total_memory != self.max_memory_per_core:
        log.info(
            "Mismatch of value `max_memory_per_core` on resource %s:"
            " configuration file says `max_memory_per_core=%s` while"
            " it's actually `%s`. Updating current value.",
            self.name, self.max_memory_per_core,
            self.total_memory.to_str('%g%s', unit=Memory.MB))
        self.max_memory_per_core = self.total_memory

    self.available_memory = self.total_memory
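
# Standalone illustration of the `/proc/meminfo` probe above (Linux only).
# `Memory.KiB` from the gc3libs quantity machinery is replaced by a plain
# factor of 1024 so the example runs without gc3libs installed.
def total_memory_bytes(meminfo_path='/proc/meminfo'):
    """Return total physical memory in bytes, or None if not found."""
    with open(meminfo_path) as fd:
        for line in fd:
            if line.startswith('MemTotal'):
                # e.g. "MemTotal:       16326820 kB"
                return int(line.split()[1]) * 1024
    return None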
def submit_job(self, app):
    """
    Run an `Application` instance as a local process.

    :see: `LRMS.submit_job`
    """
    # Update current resource usage to check how many jobs are
    # running there. Please note that, for consistency with other
    # backends, this updated information is not kept!
    try:
        self.transport.connect()
    except gc3libs.exceptions.TransportError as ex:
        raise gc3libs.exceptions.LRMSSubmitError(
            "Unable to access shellcmd resource at %s: %s"
            % (self.frontend, str(ex)))
    job_infos = self._get_persisted_resource_state()
    free_slots = self.max_cores - self._compute_used_cores(job_infos)
    available_memory = (self.total_memory
                        - self._compute_used_memory(job_infos))
    if self.free_slots == 0 or free_slots == 0:
        # XXX: we shouldn't check for self.free_slots!
        raise gc3libs.exceptions.LRMSSubmitError(
            "Resource %s already running maximum allowed number of jobs"
            " (%s). Increase 'max_cores' to raise."
            % (self.name, self.max_cores))

    if app.requested_memory and \
            (available_memory < app.requested_memory or
             self.available_memory < app.requested_memory):
        raise gc3libs.exceptions.LRMSSubmitError(
            "Resource %s does not have enough available memory:"
            " %s requested, but only %s available."
            % (self.name,
               app.requested_memory.to_str('%g%s', unit=Memory.MB),
               available_memory.to_str('%g%s', unit=Memory.MB)))

    log.debug("Executing local command '%s' ...",
              str.join(" ", app.arguments))

    # Check if spooldir is a valid directory
    if not self.spooldir:
        ex, stdout, stderr = self.transport.execute_command(
            'cd "$TMPDIR" && pwd')
        if ex != 0 or stdout.strip() == '' or not stdout[0] == '/':
            log.debug(
                "Unable to recover a valid absolute path for spooldir."
                " Using `/var/tmp`.")
            self.spooldir = '/var/tmp'
        else:
            self.spooldir = stdout.strip()

    # determine execution directory
    exit_code, stdout, stderr = self.transport.execute_command(
        "mktemp -d %s " % posixpath.join(
            self.spooldir, 'gc3libs.XXXXXX'))
    if exit_code != 0:
        log.error("Error creating temporary directory on host %s: %s",
                  self.frontend, stderr)
        log.debug('Freeing resources used by failed application')
        self.free(app)
        raise gc3libs.exceptions.LRMSSubmitError(
            "Error creating temporary directory on host %s: %s"
            % (self.frontend, stderr))
    execdir = stdout.strip()
    app.execution.lrms_execdir = execdir

    # Copy input files to remote dir
    for local_path, remote_path in app.inputs.items():
        if local_path.scheme != 'file':
            continue
        remote_path = posixpath.join(execdir, remote_path)
        remote_parent = os.path.dirname(remote_path)
        try:
            if (remote_parent not in ['', '.']
                    and not self.transport.exists(remote_parent)):
                log.debug("Making remote directory '%s'", remote_parent)
                self.transport.makedirs(remote_parent)
            log.debug("Transferring file '%s' to '%s'",
                      local_path.path, remote_path)
            self.transport.put(local_path.path, remote_path)
            # preserve execute permission on input files
            if os.access(local_path.path, os.X_OK):
                self.transport.chmod(remote_path, 0o755)
        except:
            log.critical(
                "Copying input file '%s' to remote host '%s' failed",
                local_path.path, self.frontend)
            log.debug('Cleaning up failed application')
            self.free(app)
            raise

    # try to ensure that a local executable really has
    # execute permissions, but ignore failures (might be a
    # link to a file we do not own)
    if app.arguments[0].startswith('./'):
        try:
            self.transport.chmod(
                posixpath.join(execdir, app.arguments[0][2:]), 0o755)
            # os.chmod(app.arguments[0], 0755)
        except:
            log.error("Failed setting execution flag on remote file '%s'",
                      posixpath.join(execdir, app.arguments[0]))

    # set up redirection
    redirection_arguments = ''
    if app.stdin is not None:
        # stdin = open(app.stdin, 'r')
        redirection_arguments += " <%s" % app.stdin

    if app.stdout is not None:
        redirection_arguments += " >%s" % app.stdout
        stdout_dir = os.path.dirname(app.stdout)
        if stdout_dir:
            self.transport.makedirs(posixpath.join(execdir, stdout_dir))

    if app.join:
        redirection_arguments += " 2>&1"
    else:
        if app.stderr is not None:
            redirection_arguments += " 2>%s" % app.stderr
            stderr_dir = os.path.dirname(app.stderr)
            if stderr_dir:
                self.transport.makedirs(
                    posixpath.join(execdir, stderr_dir))

    # set up environment
    env_commands = []
    for k, v in app.environment.items():
        env_commands.append(
            "export {k}={v};"
            .format(k=sh_quote_safe(k), v=sh_quote_unsafe(v)))

    # Create the directory in which pid, output and wrapper script
    # files will be stored
    wrapper_dir = posixpath.join(execdir, ShellcmdLrms.WRAPPER_DIR)
    if not self.transport.isdir(wrapper_dir):
        try:
            self.transport.makedirs(wrapper_dir)
        except:
            log.error("Failed creating remote folder '%s'", wrapper_dir)
            self.free(app)
            raise

    # Set up scripts to download/upload the swift/http files
    downloadfiles = []
    uploadfiles = []
    wrapper_downloader_filename = posixpath.join(
        wrapper_dir, ShellcmdLrms.WRAPPER_DOWNLOADER)
    for url, outfile in app.inputs.items():
        if url.scheme in ['swift', 'swifts', 'swt', 'swts',
                          'http', 'https']:
            downloadfiles.append(
                "python '%s' download '%s' '%s'"
                % (wrapper_downloader_filename, str(url), outfile))
    for infile, url in app.outputs.items():
        if url.scheme in ['swift', 'swt', 'swifts', 'swts']:
            uploadfiles.append(
                "python '%s' upload '%s' '%s'"
                % (wrapper_downloader_filename, str(url), infile))
    if downloadfiles or uploadfiles:
        # Also copy the downloader.
        with open(resource_filename(Requirement.parse("gc3pie"),
                                    "gc3libs/etc/downloader.py")) as fd:
            wrapper_downloader = self.transport.open(
                wrapper_downloader_filename, 'w')
            wrapper_downloader.write(fd.read())
            wrapper_downloader.close()

    # Build the paths of the pidfile, wrapper output and wrapper script
    pidfilename = posixpath.join(wrapper_dir, ShellcmdLrms.WRAPPER_PID)
    wrapper_output_filename = posixpath.join(
        wrapper_dir, ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
    wrapper_script_fname = posixpath.join(
        wrapper_dir, ShellcmdLrms.WRAPPER_SCRIPT)

    try:
        # Create the wrapper script
        wrapper_script = self.transport.open(wrapper_script_fname, 'w')
        commands = (r"""#!/bin/sh
echo $$ >{pidfilename}
cd {execdir}
exec {redirections}
{environment}
{downloadfiles}
'{time_cmd}' -o '{wrapper_out}' -f '{fmt}' {command}
rc=$?
{uploadfiles}
rc2=$?
if [ $rc -ne 0 ]; then exit $rc; else exit $rc2; fi
""".format(
            pidfilename=pidfilename,
            execdir=execdir,
            time_cmd=self.time_cmd,
            wrapper_out=wrapper_output_filename,
            fmt=ShellcmdLrms.TIMEFMT,
            redirections=redirection_arguments,
            environment=str.join('\n', env_commands),
            downloadfiles=str.join('\n', downloadfiles),
            uploadfiles=str.join('\n', uploadfiles),
            command=(str.join(' ',
                              (sh_quote_unsafe(arg)
                               for arg in app.arguments))),
        ))
        wrapper_script.write(commands)
        wrapper_script.close()
        # log.info("Wrapper script: <<<%s>>>", commands)
    except gc3libs.exceptions.TransportError:
        log.error("Freeing resources used by failed application")
        self.free(app)
        raise

    try:
        self.transport.chmod(wrapper_script_fname, 0o755)
        # Execute the script in background
        self.transport.execute_command(wrapper_script_fname, detach=True)
    except gc3libs.exceptions.TransportError:
        log.error("Freeing resources used by failed application")
        self.free(app)
        raise

    # Just after the script has been started, the pidfile should be
    # filled in with the correct pid.
    #
    # However, the script may not have been able to write the
    # pidfile yet, so we have to wait a little bit for it...
    pidfile = None
    for retry in gc3libs.utils.ExponentialBackoff():
        try:
            pidfile = self.transport.open(pidfilename, 'r')
            break
        except gc3libs.exceptions.TransportError as ex:
            if '[Errno 2]' in str(ex):  # no such file or directory
                time.sleep(retry)
                continue
            else:
                raise
    if pidfile is None:
        # XXX: probably self.free(app) should go here as well
        raise gc3libs.exceptions.LRMSSubmitError(
            "Unable to get PID file of submitted process from"
            " execution directory `%s`: %s" % (execdir, pidfilename))
    pid = pidfile.read().strip()
    try:
        pid = int(pid)
    except ValueError:
        # XXX: probably self.free(app) should go here as well
        pidfile.close()
        raise gc3libs.exceptions.LRMSSubmitError(
            "Invalid pid `%s` in pidfile %s." % (pid, pidfilename))
    pidfile.close()

    # Update application and current resources
    app.execution.lrms_jobid = pid
    # We don't need to update free_slots since its value is
    # checked at runtime.
    if app.requested_memory:
        self.available_memory -= app.requested_memory
    self.job_infos[pid] = {
        'requested_cores': app.requested_cores,
        'requested_memory': app.requested_memory,
        'execution_dir': execdir,
        'terminated': False,
    }
    self._update_job_resource_file(pid, self.job_infos[pid])
    return app
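
# For orientation, a hypothetical rendering of the wrapper template above.
# The `<WRAPPER_*>` and `<TIMEFMT>` placeholders stand for the
# `ShellcmdLrms` class constants (whose values are defined elsewhere); the
# paths, environment and command are made up, and the blank lines come from
# the empty `{downloadfiles}`/`{uploadfiles}` substitutions:
#
#     #!/bin/sh
#     echo $$ >/var/tmp/gc3libs.Ab12Cd/<WRAPPER_DIR>/<WRAPPER_PID>
#     cd /var/tmp/gc3libs.Ab12Cd
#     exec  >stdout.txt 2>&1
#     export MY_VAR='some value';
#
#     '/usr/bin/time' -o '<WRAPPER_OUTPUT>' -f '<TIMEFMT>' './myscript.sh'
#     rc=$?
#
#     rc2=$?
#     if [ $rc -ne 0 ]; then exit $rc; else exit $rc2; fi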
def get_resource_status(self):
    """
    Get dynamic information out of the LSF subsystem.

    Return `self`, with at least the following dynamic
    information updated:
      total_queued
      free_slots
      user_running
      user_queued
    """
    try:
        self.transport.connect()

        # Run `lshosts -w` to get the list of available nodes and
        # their number of cores, used to compute `self.max_cores`.
        # lshosts output format:
        # ($nodeid,$OStype,$model,$cpuf,$ncpus,$maxmem,$maxswp)
        _command = ('%s -w' % self._lshosts)
        exit_code, stdout, stderr = self.transport.execute_command(
            _command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "LSF backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, stdout, stderr))

        if stdout:
            lhosts_output = stdout.strip().split('\n')
            # Remove Header
            lhosts_output.pop(0)
        else:
            lhosts_output = []

        # compute self.max_cores
        self.max_cores = 0
        for line in lhosts_output:
            # HOST_NAME type model cpuf ncpus maxmem maxswp server RESOURCES  # noqa
            (hostname, h_type, h_model, h_cpuf, h_ncpus) = \
                line.strip().split()[0:5]
            try:
                self.max_cores += int(h_ncpus)
            except ValueError:
                # h_ncpus == '-'
                pass

        # Run `bjobs -u all -w` to get information about all users'
        # jobs, used to compute `self.queued`, `self.user_run` and
        # `self.user_queued`.
        #
        # bjobs output format:
        # JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME  # noqa
        _command = ('%s -u all -w' % self._bjobs)
        log.debug("Running `%s`... ", _command)
        exit_code, stdout, stderr = \
            self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "LSF backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, stdout, stderr))

        if stdout:
            bjobs_output = stdout.strip().split('\n')
            # Remove Header
            bjobs_output.pop(0)
        else:
            bjobs_output = []

        # user running/queued
        used_cores = 0
        self.queued = 0
        self.user_queued = 0
        self.user_run = 0
        queued_statuses = ['PEND', 'PSUSP', 'USUSP',
                           'SSUSP', 'WAIT', 'ZOMBI']
        for line in bjobs_output:
            # JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME  # noqa
            (jobid, user, stat, queue, from_h, exec_h) = \
                line.strip().split()[0:6]
            # to compute the number of cores allocated per each job
            # we use the output format of the EXEC_HOST field,
            # e.g.: 1*cpt178:2*cpt151
            for node in exec_h.split(':'):
                try:
                    # multi core
                    (cores, n_name) = node.split('*')
                except ValueError:
                    # single core
                    cores = 1
                try:
                    cores = int(cores)
                except ValueError:
                    # core count not available, e.g. '-'
                    cores = 0
                used_cores += cores
            if stat in queued_statuses:
                self.queued += 1
            if user == self._username:
                if stat in queued_statuses:
                    self.user_queued += 1
                else:
                    self.user_run += 1

        self.free_slots = self.max_cores - used_cores
        return self

    except Exception as ex:
        # self.transport.close()
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex))
        raise
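
# Standalone sketch of the EXEC_HOST parsing embedded in the loop above:
# given `bjobs`' "1*cpt178:2*cpt151" notation, count the cores a job
# occupies. The function name is illustrative, not part of the backend.
def cores_in_exec_host(exec_host):
    """Return the number of cores listed in a bjobs EXEC_HOST field."""
    used = 0
    for node in exec_host.split(':'):
        try:
            cores, _ = node.split('*')   # multi-core, e.g. "2*cpt151"
        except ValueError:
            cores = 1                    # single core, e.g. "cpt178"
        try:
            used += int(cores)
        except ValueError:
            pass                         # count not available, e.g. '-'
    return used

# e.g. cores_in_exec_host('1*cpt178:2*cpt151') == 3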