def count_jobs(qstat_output, whoami):
    """
    Parse PBS/Torque's ``qstat`` output (as contained in string
    `qstat_output`) and return a quadruple `(R, Q, r, q)` where:

      * `R` is the total number of running jobs in the PBS/Torque cell
        (from any user);
      * `Q` is the total number of queued jobs in the PBS/Torque cell
        (from any user);
      * `r` is the number of running jobs submitted by user `whoami`;
      * `q` is the number of queued jobs submitted by user `whoami`.
    """
    total_running = 0
    total_queued = 0
    own_running = 0
    own_queued = 0
    for line in qstat_output.split('\n'):
        log.info("Output line: %s" % line)
        m = _qstat_line_re.match(line)
        if not m:
            continue
        if m.group('state') in ['R']:
            total_running += 1
            if m.group('username') == whoami:
                own_running += 1
        elif m.group('state') in ['Q']:
            total_queued += 1
            if m.group('username') == whoami:
                own_queued += 1
    log.info("running: %d, queued: %d" % (total_running, total_queued))
    return (total_running, total_queued, own_running, own_queued)
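# --- Illustrative usage (not part of the backend code) ---------------------
# `count_jobs()` relies on a module-level regex `_qstat_line_re` that is not
# shown in this listing.  The regex and sample output below are a
# *hypothetical* sketch matching typical PBS/Torque `qstat` lines such as
#     1234.server   sim1   alice   00:01:12 R short
# and exposing the `username` and `state` named groups the function expects.
import logging
import re

log = logging.getLogger(__name__)

_qstat_line_re = re.compile(
    r'^(?P<jobid>\d+\S*)\s+(?P<jobname>\S+)\s+(?P<username>\S+)'
    r'\s+(?P<time_use>\S+)\s+(?P<state>[A-Z])\s+(?P<queue>\S+)')

_sample_qstat_output = """
1234.server   sim1    alice   00:01:12 R short
1235.server   sim2    bob     00:00:00 Q short
1236.server   sim3    alice   00:00:00 Q long
"""

# Expected result: 1 running job and 2 queued jobs in total; of these,
# `alice` owns 1 running and 1 queued job.
print(count_jobs(_sample_qstat_output, 'alice'))  # -> (1, 2, 1, 1)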
def get_resource_status(self):
    try:
        self.transport.connect()

        _command = ("%s -U %s" % (self._qstat, self._username))
        log.debug("Running `%s`...", _command)
        exit_code, qstat_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SGE backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, qstat_stdout, stderr))

        _command = ("%s -F -U %s" % (self._qstat, self._username))
        log.debug("Running `%s`...", _command)
        exit_code, qstat_F_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SGE backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, qstat_F_stdout, stderr))

        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(qstat_stdout, self._username)
        slots = compute_nr_of_slots(qstat_F_stdout)
        self.free_slots = int(slots['global']['available'])
        self.used_quota = -1

        log.info(
            "Updated resource '%s' status:"
            " free slots: %d,"
            " own running jobs: %d,"
            " own queued jobs: %d,"
            " total queued jobs: %d",
            self.name,
            self.free_slots,
            self.user_run,
            self.user_queued,
            self.queued,
        )
        return self

    except Exception as ex:
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex))
        raise
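# --- Illustrative stand-in (not part of the backend code) ------------------
# `compute_nr_of_slots()` is not shown in this listing; from its use above we
# only know that it parses `qstat -F` output and returns a nested mapping with
# at least slots['global']['available'].  The sketch below is a *hypothetical*
# minimal stand-in, assuming queue summary lines of the form
#     all.q@node01  BIP  0/2/8  0.15  lx-amd64
# where the third column is "reserved/used/total" slots.
def compute_nr_of_slots_sketch(qstat_F_output):
    available = 0
    for line in qstat_F_output.split('\n'):
        fields = line.split()
        if len(fields) < 3 or fields[2].count('/') != 2:
            continue  # not a queue summary line
        reserved, used, total = (int(n) for n in fields[2].split('/'))
        available += max(0, total - used - reserved)
    return {'global': {'available': available}}

# Example: two queue instances with 8 slots each, 2 and 5 in use.
print(compute_nr_of_slots_sketch(
    "all.q@node01  BIP  0/2/8  0.15  lx-amd64\n"
    "all.q@node02  BIP  0/5/8  0.40  lx-amd64"))
# -> {'global': {'available': 9}}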
def get_resource_status(self):
    self.updated = False
    try:
        self.transport.connect()

        _command = ("%s --noheader -o '%%i^%%T^%%u^%%U^%%r^%%R'"
                    % self._squeue)
        log.debug("Running `%s`...", _command)
        exitcode, stdout, stderr = self.transport.execute_command(_command)
        if exitcode != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SLURM backend failed executing '%s':"
                " exit code: %d; stdout: '%s', stderr: '%s'"
                % (_command, exitcode, stdout, stderr))

        log.debug("Computing updated values for total/available slots ...")
        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(stdout, self._username)
        self.total_run = total_running
        self.free_slots = -1
        self.used_quota = -1

        log.info(
            "Updated resource '%s' status:"
            " free slots: %d,"
            " total running: %d,"
            " own running jobs: %d,"
            " own queued jobs: %d,"
            " total queued jobs: %d",
            self.name,
            self.free_slots,
            self.total_run,
            self.user_run,
            self.user_queued,
            self.queued,
        )
        return self

    except Exception as ex:
        # self.transport.close()
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex), exc_info=True)
        raise
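# --- Illustrative note (not part of the backend code) ----------------------
# With `-o '%i^%T^%u^%U^%r^%R'`, `squeue` prints one caret-separated record
# per job: job id, full state name, user name, numeric uid, reason, and
# nodelist/reason, e.g.
#     1042^RUNNING^alice^1000^None^node[01-02]
#     1043^PENDING^bob^1001^Resources^(Resources)
# The `count_jobs` called above is presumably the SLURM module's own helper
# (not the PBS one shown earlier) and splits on '^'; a *hypothetical*
# minimal version could look like this:
def count_slurm_jobs_sketch(squeue_output, whoami):
    total_running = total_queued = own_running = own_queued = 0
    for line in squeue_output.split('\n'):
        fields = line.split('^')
        if len(fields) < 6:
            continue  # not a job record
        state, user = fields[1], fields[2]
        if state == 'RUNNING':
            total_running += 1
            own_running += (user == whoami)
        elif state == 'PENDING':
            total_queued += 1
            own_queued += (user == whoami)
    return (total_running, total_queued, own_running, own_queued)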
def get_results(self, app, download_dir, overwrite=False):
    if app.output_base_url is not None:
        raise gc3libs.exceptions.UnrecoverableDataStagingError(
            "Retrieval of output files to non-local destinations"
            " is not supported (yet).")

    job = app.execution
    try:
        self.transport.connect()
        # Make list of files to copy, in the form of (remote_path,
        # local_path) pairs.  This entails walking the
        # `Application.outputs` list to expand wildcards and
        # directory references.
        stageout = list()
        for remote_relpath, local_url in app.outputs.items():
            local_relpath = local_url.path
            if remote_relpath == gc3libs.ANY_OUTPUT:
                remote_relpath = ''
                local_relpath = ''
            stageout += _make_remote_and_local_path_pair(
                self.transport, job, remote_relpath,
                download_dir, local_relpath)

        # copy back all files, renaming them to adhere to the
        # ArcLRMS convention
        log.debug("Downloading job output into '%s' ...", download_dir)
        for remote_path, local_path in stageout:
            log.debug("Downloading remote file '%s' to local file '%s'",
                      remote_path, local_path)
            if (overwrite
                    or not os.path.exists(local_path)
                    or os.path.isdir(local_path)):
                log.debug("Copying remote '%s' to local '%s'"
                          % (remote_path, local_path))
                # ignore missing files (this is what ARC does too)
                self.transport.get(remote_path, local_path,
                                   ignore_nonexisting=True)
            else:
                log.info("Local file '%s' already exists;"
                         " will not be overwritten!", local_path)
        return  # XXX: should we return list of downloaded files?
    except:
        raise
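# --- Illustrative sketch (not part of the backend code) --------------------
# `_make_remote_and_local_path_pair()` is not shown in this listing.  Judging
# from its use above, it expands a (possibly wildcard) remote path into a list
# of (remote_path, local_path) pairs rooted at `download_dir`.  The purely
# local sketch below illustrates the idea, assuming a flat list of remote
# file names is already available (the real helper queries the transport).
import fnmatch
import os

def make_path_pairs_sketch(remote_names, remote_relpath,
                           download_dir, local_relpath):
    """Pair each matching remote file with a destination under `download_dir`."""
    pattern = remote_relpath or '*'   # '' (ANY_OUTPUT) means "everything"
    pairs = []
    for name in remote_names:
        if fnmatch.fnmatch(name, pattern):
            pairs.append((name,
                          os.path.join(download_dir, local_relpath or name)))
    return pairs

# Example: stage out all '*.dat' files into './results/'
print(make_path_pairs_sketch(['a.dat', 'b.dat', 'run.log'],
                             '*.dat', './results', ''))
# -> [('a.dat', './results/a.dat'), ('b.dat', './results/b.dat')]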
def get_resource_status(self):
    self.updated = False
    try:
        self.transport.connect()

        _command = ('%s -a' % self._qstat)
        log.debug("Running `%s`...", _command)
        exit_code, qstat_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "PBS backend failed executing '%s':"
                " exit code: %d; stdout: '%s', stderr: '%s'"
                % (_command, exit_code, qstat_stdout, stderr))

        log.debug("Computing updated values for total/available slots ...")
        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(qstat_stdout, self._username)
        self.total_run = total_running
        self.free_slots = -1
        self.used_quota = -1

        log.info(
            "Updated resource '%s' status:"
            " free slots: %d,"
            " total running: %d,"
            " own running jobs: %d,"
            " own queued jobs: %d,"
            " total queued jobs: %d",
            self.name,
            self.free_slots,
            self.total_run,
            self.user_run,
            self.user_queued,
            self.queued,
        )
        return self

    except Exception as ex:
        # self.transport.close()
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex), exc_info=True)
        raise
def submit_job(self, app):
    """
    This method will create a remote directory to store job's
    sandbox, and will copy the sandbox in there.
    """
    job = app.execution

    # Create the remote directory.
    try:
        self.transport.connect()

        cmd = "mkdir -p $HOME/.gc3pie_jobs;" \
              " mktemp -d $HOME/.gc3pie_jobs/lrms_job.XXXXXXXXXX"
        log.info("Creating remote temporary folder: command '%s' " % cmd)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code == 0:
            ssh_remote_folder = stdout.split('\n')[0]
        else:
            raise gc3libs.exceptions.LRMSError(
                "Failed executing command '%s' on resource '%s';"
                " exit code: %d, stderr: '%s'."
                % (cmd, self.name, exit_code, stderr))
    except gc3libs.exceptions.TransportError:
        raise
def terminated(self):
    if self.execution.signal != 0:
        log.info("Task %s killed by signal %d",
                 self, self.execution.signal)
    else:
        # self.execution.signal == 0, hence normal termination
        if self.execution.exitcode == 0:
            log.info("Task %s exited successfully!", self)
        else:
            log.info("Task %s exited with error code %d",
                     self, self.execution.exitcode)
def terminated(self):
    err_file_path = os.path.join(self.output_dir, self.stderr)
    with open(err_file_path, 'r') as err_file:
        errors = err_file.read()
    if ('Out of memory' in errors
            or 'exceeds maximum array size' in errors):
        self.execution.exitcode = 11

    # verbosely notify user
    if self.execution.signal != 0:
        log.info("Task %s killed by signal %d",
                 self, self.execution.signal)
    else:
        # self.execution.signal == 0, hence normal termination
        if self.execution.exitcode == 0:
            log.info("Task %s exited successfully!", self)
        else:
            log.info("Task %s exited with error code %d",
                     self, self.execution.exitcode)
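# --- Illustrative generalization (not part of the task code) ---------------
# The `terminated()` hook above hard-codes two error strings and a single
# replacement exit code.  A *hypothetical* generalization keeps a table of
# (pattern, exit code) pairs, so new failure signatures can be added without
# touching the control flow; the names below are illustrative only.
import re

_ERROR_PATTERNS_SKETCH = [
    (re.compile(r'Out of memory'), 11),
    (re.compile(r'exceeds maximum array size'), 11),
    (re.compile(r'Segmentation fault'), 139),
]

def classify_stderr_sketch(errors, default_exitcode=0):
    """Return the exit code of the first matching pattern, or `default_exitcode`."""
    for pattern, exitcode in _ERROR_PATTERNS_SKETCH:
        if pattern.search(errors):
            return exitcode
    return default_exitcode

# Example:
print(classify_stderr_sketch("MATLAB: Out of memory."))  # -> 11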
def _gather_machine_specs(self):
    """
    Gather information about this machine and, if `self.override`
    is true, also update the value of the `max_cores` and
    `max_memory_per_core` attributes.

    This method works with both Linux and MacOSX.
    """
    self.transport.connect()

    # expand env variables in the `resource_dir` setting
    exit_code, stdout, stderr = self.transport.execute_command(
        'echo %s' % sh_quote_unsafe(self.cfg_resourcedir))
    self.resource_dir = stdout.strip()

    # XXX: it is actually necessary to create the folder
    # as a separate step
    if not self.transport.exists(self.resource_dir):
        try:
            log.info("Creating resource file directory: '%s' ...",
                     self.resource_dir)
            self.transport.makedirs(self.resource_dir)
        except Exception as ex:
            log.error("Failed creating resource directory '%s':"
                      " %s: %s", self.resource_dir, type(ex), str(ex))
            # cannot continue
            raise

    exit_code, stdout, stderr = self.transport.execute_command('uname -m')
    arch = gc3libs.config._parse_architecture(stdout)
    if arch != self.architecture:
        raise gc3libs.exceptions.ConfigurationError(
            "Invalid architecture: configuration file says `%s` but "
            "it actually is `%s`"
            % (str.join(', ', self.architecture), str.join(', ', arch)))

    exit_code, stdout, stderr = self.transport.execute_command('uname -s')
    self.running_kernel = stdout.strip()

    # ensure `time_cmd` points to a valid value
    self.time_cmd = self._locate_gnu_time()
    if not self.time_cmd:
        raise gc3libs.exceptions.ConfigurationError(
            "Unable to find GNU `time` installed on your system."
            " Please, install GNU time and set the `time_cmd`"
            " configuration option in gc3pie.conf.")

    if not self.override:
        # Ignore other values.
        return

    if self.running_kernel == 'Linux':
        exit_code, stdout, stderr = self.transport.execute_command('nproc')
        max_cores = int(stdout)

        # get the amount of total memory from /proc/meminfo
        with self.transport.open('/proc/meminfo', 'r') as fd:
            for line in fd:
                if line.startswith('MemTotal'):
                    self.total_memory = int(line.split()[1]) * Memory.KiB
                    break

    elif self.running_kernel == 'Darwin':
        exit_code, stdout, stderr = self.transport.execute_command(
            'sysctl hw.ncpu')
        max_cores = int(stdout.split(':')[-1])

        exit_code, stdout, stderr = self.transport.execute_command(
            'sysctl hw.memsize')
        self.total_memory = int(stdout.split(':')[1]) * Memory.B

    if max_cores != self.max_cores:
        log.info(
            "Mismatch of value `max_cores` on resource '%s':"
            " configuration file says `max_cores=%d` while it's"
            " actually `%d`. Updating current value.",
            self.name, self.max_cores, max_cores)
        self.max_cores = max_cores

    if self.total_memory != self.max_memory_per_core:
        log.info(
            "Mismatch of value `max_memory_per_core` on resource %s:"
            " configuration file says `max_memory_per_core=%s` while it's"
            " actually `%s`. Updating current value.",
            self.name, self.max_memory_per_core,
            self.total_memory.to_str('%g%s', unit=Memory.MB))
        self.max_memory_per_core = self.total_memory

    self.available_memory = self.total_memory
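# --- Illustrative sketch (not part of the backend code) --------------------
# The same Linux probes used above (`nproc` and /proc/meminfo) can be
# exercised locally.  This stand-alone sketch returns plain integers instead
# of gc3libs `Memory` quantities; the function name is illustrative only.
import subprocess

def probe_local_linux_specs_sketch():
    """Return (core_count, total_memory_in_bytes) on a Linux host."""
    cores = int(subprocess.check_output(['nproc']).decode().strip())
    total_memory = None
    with open('/proc/meminfo') as meminfo:
        for line in meminfo:
            if line.startswith('MemTotal'):
                # e.g. "MemTotal:       16337152 kB" -- the value is in KiB
                total_memory = int(line.split()[1]) * 1024
                break
    return cores, total_memory

# Example (Linux only):
# print(probe_local_linux_specs_sketch())  # -> e.g. (8, 16729243648)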
def submit_job(self, app):
    """
    This method will create a remote directory to store job's
    sandbox, and will copy the sandbox in there.
    """
    job = app.execution

    # Create the remote directory.
    try:
        self.transport.connect()

        cmd = "mkdir -p $HOME/.gc3pie_jobs;" \
              " mktemp -d $HOME/.gc3pie_jobs/lrms_job.XXXXXXXXXX"
        log.info("Creating remote temporary folder: command '%s' " % cmd)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code == 0:
            ssh_remote_folder = stdout.split('\n')[0]
        else:
            raise gc3libs.exceptions.LRMSError(
                "Failed executing command '%s' on resource '%s';"
                " exit code: %d, stderr: '%s'."
                % (cmd, self.name, exit_code, stderr))
    except gc3libs.exceptions.TransportError:
        raise
    except:
        raise

    # Copy the input file(s) to remote directory.
    for local_path, remote_path in app.inputs.items():
        remote_path = os.path.join(ssh_remote_folder, remote_path)
        remote_parent = os.path.dirname(remote_path)
        try:
            if remote_parent not in ['', '.']:
                log.debug("Making remote directory '%s'", remote_parent)
                self.transport.makedirs(remote_parent)
            log.debug("Transferring file '%s' to '%s'",
                      local_path.path, remote_path)
            self.transport.put(local_path.path, remote_path)
            # preserve execute permission on input files
            if os.access(local_path.path, os.X_OK):
                self.transport.chmod(remote_path, 0o755)
        except:
            log.critical(
                "Copying input file '%s' to remote cluster '%s' failed",
                local_path.path, self.frontend)
            raise

    if app.arguments[0].startswith('./'):
        gc3libs.log.debug("Making remote path '%s' executable.",
                          app.arguments[0])
        self.transport.chmod(
            os.path.join(ssh_remote_folder, app.arguments[0]), 0o755)

    # if STDOUT/STDERR should be saved in a directory, ensure it
    # exists (see Issue 495 for details)
    for dest in (app.stdout, app.stderr):
        if dest:
            destdir = os.path.dirname(dest)
            if destdir:
                self.transport.makedirs(
                    posixpath.join(ssh_remote_folder, destdir))

    try:
        sub_cmd, aux_script = self._submit_command(app)
        if aux_script != '':
            # create temporary script name
            script_filename = ('./script.%s.sh' % uuid.uuid4())
            # save script to a temporary file and submit that one instead
            local_script_file = tempfile.NamedTemporaryFile(mode='w')
            local_script_file.write('#!/bin/sh\n')

            # Add preamble file
            prologue = self.get_prologue_script(app)
            if prologue:
                local_script_file.write(prologue)

            local_script_file.write(aux_script)

            # Add epilogue files
            epilogue = self.get_epilogue_script(app)
            if epilogue:
                local_script_file.write(epilogue)

            local_script_file.flush()
            # upload script to remote location
            self.transport.put(
                local_script_file.name,
                os.path.join(ssh_remote_folder, script_filename))
            # set execution mode on remote script
            self.transport.chmod(
                os.path.join(ssh_remote_folder, script_filename), 0o755)
            # cleanup
            local_script_file.close()
            if os.path.exists(local_script_file.name):
                os.unlink(local_script_file.name)
        else:
            # we still need a script name even if there is no
            # script to submit
            script_filename = ''

        # Submit it
        exit_code, stdout, stderr = self.transport.execute_command(
            "/bin/sh -c %s" % sh_quote_safe(
                'cd %s && %s %s'
                % (ssh_remote_folder, sub_cmd, script_filename)))

        if exit_code != 0:
            raise gc3libs.exceptions.LRMSError(
                "Failed executing command 'cd %s && %s %s' on resource"
                " '%s'; exit code: %d, stderr: '%s'."
                % (ssh_remote_folder, sub_cmd, script_filename,
                   self.name, exit_code, stderr))

        jobid = self._parse_submit_output(stdout)
        log.debug('Job submitted with jobid: %s', jobid)

        job.execution_target = self.frontend

        job.lrms_jobid = jobid
        job.lrms_jobname = jobid
        try:
            if app.jobname:
                job.lrms_jobname = app.jobname
        except:
            pass

        if 'stdout' in app:
            job.stdout_filename = app.stdout
        else:
            job.stdout_filename = '%s.o%s' % (job.lrms_jobname, jobid)
        if app.join:
            job.stderr_filename = job.stdout_filename
        else:
            if 'stderr' in app:
                job.stderr_filename = app.stderr
            else:
                job.stderr_filename = '%s.e%s' % (job.lrms_jobname, jobid)

        job.history.append('Submitted to %s @ %s, got jobid %s'
                           % (self._batchsys_name, self.name, jobid))
        job.history.append(
            "Submission command output:\n"
            " === stdout ===\n%s"
            " === stderr ===\n%s"
            " === end ===\n"
            % (stdout, stderr), 'pbs', 'qsub')
        job.ssh_remote_folder = ssh_remote_folder
        return job
    except:
        log.critical(
            "Failure submitting job to resource '%s' - "
            "see log file for errors", self.name)
        raise
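# --- Illustrative sketch (not part of the backend code) --------------------
# `_parse_submit_output()` is implemented by each concrete backend and is not
# shown here; its job is to pull the batch-system job id out of the submission
# command's stdout.  The stand-alone sketch below is an *assumption* about
# what such parsing typically looks like for a few common batch systems; the
# regexes are illustrative, not gc3pie's own.
import re

_SUBMIT_OUTPUT_RES_SKETCH = [
    re.compile(r'Your job (?P<jobid>\d+)'),             # SGE qsub
    re.compile(r'Submitted batch job (?P<jobid>\d+)'),  # SLURM sbatch
    re.compile(r'^(?P<jobid>\d+\.\S+)'),                # PBS/Torque qsub
]

def parse_submit_output_sketch(stdout):
    """Return the job id found in `stdout`, or raise `ValueError`."""
    for regex in _SUBMIT_OUTPUT_RES_SKETCH:
        match = regex.search(stdout.strip())
        if match:
            return match.group('jobid')
    raise ValueError("Could not find a job id in: %r" % stdout)

# Examples:
print(parse_submit_output_sketch('Your job 1234 ("sim") has been submitted'))
# -> '1234'
print(parse_submit_output_sketch('1234.pbsserver.example.org'))
# -> '1234.pbsserver.example.org'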